Skip to content

WIP/REF: BlockManager.setitem_blockwise #39302

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
46acf44
TST: split coercion tests
jbrockmendel Jan 14, 2021
5ca4ab2
REF: implement BlockManager.setitem2
jbrockmendel Jan 14, 2021
77e09e2
checkpoint tests passing
jbrockmendel Jan 14, 2021
1c1be1d
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 15, 2021
46126ab
checkpoint passing
jbrockmendel Jan 15, 2021
da2dda3
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 15, 2021
2624c2e
port test from ref-setitem-blockwise
jbrockmendel Jan 15, 2021
51b90a3
cleanup
jbrockmendel Jan 16, 2021
3fe6d45
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 16, 2021
08d57ba
checkpoint passing
jbrockmendel Jan 16, 2021
19124fe
checkpoint passing
jbrockmendel Jan 16, 2021
35915d4
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 16, 2021
96f5664
rename
jbrockmendel Jan 16, 2021
c1e0b0f
REF: avoid going through iloc
jbrockmendel Jan 16, 2021
83f5545
port from other PRs
jbrockmendel Jan 17, 2021
da87965
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 17, 2021
0513231
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 18, 2021
8718444
cleanup
jbrockmendel Jan 18, 2021
86782b3
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 19, 2021
53e4628
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 19, 2021
d723835
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 19, 2021
2dd1ce4
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 19, 2021
19a5553
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jan 20, 2021
604f796
cleanup
jbrockmendel Jan 20, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
pandas_dtype,
)
from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna
from pandas.core.dtypes.generic import ABCDataFrame

from pandas.core import nanops, ops
from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts
Expand Down Expand Up @@ -613,6 +614,11 @@ def _validate_listlike(self, value, allow_object: bool = False):
# We treat empty list as our own dtype.
return type(self)._from_sequence([], dtype=self.dtype)

if isinstance(value, ABCDataFrame) and value.shape[1] == 1:
# FIXME: kludge
res = self._validate_listlike(value._ixs(0, axis=1), allow_object=allow_object)
return res.reshape(-1, 1)

if hasattr(value, "dtype") and value.dtype == object:
# `array` below won't do inference if value is an Index or Series.
# so do so here. in the Index case, inferred_type may be cached.
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,14 +506,27 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool:
return array_equivalent(left, right, dtype_equal=True)


def infer_fill_value(val):
def infer_fill_value(val, length: int):
"""
infer the fill value for the nan/NaT from the provided
scalar/ndarray/list-like if we are a NaT, return the correct dtyped
element to provide proper block construction
"""
if not is_list_like(val):
val = [val]

if is_extension_array_dtype(val):
# We cannot use dtype._na_value bc pd.NA/pd.NaT do not preserve dtype
if len(val) == length:
# TODO: in this case see if we can avoid making a copy later on
return val
if length == 0:
return val[:0].copy()

dtype = val.dtype
cls = dtype.construct_array_type()
return cls._from_sequence([dtype._na_value], dtype=dtype).repeat(length)

val = np.array(val, copy=False)
if needs_i8_conversion(val.dtype):
return np.array("NaT", dtype=val.dtype)
Expand Down
30 changes: 23 additions & 7 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3209,6 +3209,7 @@ def _setitem_slice(self, key: slice, value):
def _setitem_array(self, key, value):
# also raises Exception if object array with NA values
if com.is_bool_indexer(key):
# bool indexer is indexing along rows
if len(key) != len(self.index):
raise ValueError(
f"Item wrong length {len(key)} instead of {len(self.index)}!"
Expand All @@ -3218,18 +3219,33 @@ def _setitem_array(self, key, value):
self._check_setitem_copy()
self.iloc[indexer] = value
else:
if isinstance(value, DataFrame):
if isinstance(value, DataFrame): # 7 test_string_array tests fail if this block is disabled
if len(value.columns) != len(key):
raise ValueError("Columns must be same length as key")
for k1, k2 in zip(key, value.columns):
self[k1] = value[k2]

elif not is_list_like(value):
for col in key:
self[col] = value

elif isinstance(value, np.ndarray) and value.ndim == 2:
if value.shape[-1] != len(key):
raise ValueError("Columns must be same length as key")

for i, col in enumerate(key):
self[col] = value[:, i]

elif np.ndim(value) > 1:
# list of lists
value = DataFrame(value).values
return self._setitem_array(key, value)

else:
self.loc._ensure_listlike_indexer(key, axis=1, value=value)
indexer = self.loc._get_listlike_indexer(
key, axis=1, raise_missing=False
)[1]
self._check_setitem_copy()
self.iloc[:, indexer] = value
if len(value) != len(key):
raise ValueError("Columns must be same length as key")
for i, col in enumerate(key):
self[col] = value[i]

def _setitem_frame(self, key, value):
# support boolean setting with DataFrame input, e.g.
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,21 @@ def validate_indices(indices: np.ndarray, n: int) -> None:
# Indexer Conversion


def ensure_iterable_indexer(ncols: int, column_indexer):
    """
    Ensure that our column indexer is something that can be iterated over.

    Parameters
    ----------
    ncols : int
        Number of columns in the object being indexed. Only consulted when
        ``column_indexer`` is a slice.
    column_indexer : int, slice, np.ndarray or other iterable
        Positional indexer along the columns axis.

    Returns
    -------
    list, np.ndarray, or the object passed in
        * integer -> one-element list containing that integer
        * slice -> ndarray of the positions the slice selects out of
          ``range(ncols)``
        * boolean ndarray -> ndarray of the positions where the mask is True
        * anything else -> returned unchanged
    """
    if is_integer(column_indexer):
        # A single position: wrap it so callers can loop over it.
        return [column_indexer]

    if isinstance(column_indexer, slice):
        # Materialize the slice against the actual number of columns so that
        # open-ended and negative-step slices resolve to concrete positions.
        return np.arange(ncols)[column_indexer]

    if isinstance(column_indexer, np.ndarray) and is_bool_dtype(column_indexer.dtype):
        # Boolean mask -> integer positions. The positions are derived from
        # the mask's own length; ncols is not checked against it here.
        return np.arange(len(column_indexer))[column_indexer]

    # Not an integer, slice or boolean ndarray: hand it back untouched.
    return column_indexer


def maybe_convert_indices(indices, n: int):
"""
Attempt to convert indices into valid, positive indices.
Expand Down
75 changes: 45 additions & 30 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_extension_array_dtype,
is_hashable,
is_integer,
is_iterator,
Expand Down Expand Up @@ -1595,7 +1596,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
# We are setting an entire column
self.obj[key] = value
else:
self.obj[key] = infer_fill_value(value)
self.obj[key] = infer_fill_value(value, len(self.obj))

new_indexer = convert_from_missing_indexer_tuple(
indexer, self.obj.axes
Expand Down Expand Up @@ -1674,7 +1675,14 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str):

elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi):
# We are setting multiple rows in a single column.
self._setitem_single_column(ilocs[0], value, pi)
if len(value) == len(self.obj):
# Setting entire column, so swapping out
# GH#??? we may want to change this behavior
self.obj._iset_item(ilocs[0], value)
else:
obj = type(self.obj)(value)
self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), obj)
self.obj._clear_item_cache()

elif len(ilocs) == 1 and 0 != lplane_indexer != len(value):
# We are trying to set N values into M entries of a single
Expand All @@ -1696,17 +1704,37 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str):
pass

elif len(ilocs) == len(value):
# We are setting multiple columns in a single row.
for loc, v in zip(ilocs, value):
self._setitem_single_column(loc, v, pi)
# We are setting multiple columns using a single row of values, which we broadcast
if is_extension_array_dtype(value): # TODO: not hit
val = DataFrame.from_arrays(
[value], index=[0], columns=range(len(value))
)
elif isinstance(value, np.ndarray):
val = np.atleast_2d(value)
else:
# avoid numpy casting which can take e.g. ["b", 2] -> ["b", "2"]
val = type(self.obj)([value])
if lplane_indexer != 1:
# broadcast to length of pi
# TODO: EA compat for broadcast_to
arrs = list(val._iter_column_arrays())
arrs = [np.broadcast_to(x, lplane_indexer) for x in arrs]
val = type(self.obj)._from_arrays(
arrs, index=range(lplane_indexer), columns=range(len(arrs))
)
self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), val)
self.obj._clear_item_cache()

elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0:
# This is a setitem-with-expansion, see
# test_loc_setitem_empty_append_expands_rows_mixed_dtype
# e.g. df = DataFrame(columns=["x", "y"])
# df["x"] = df["x"].astype(np.int64)
# df.loc[:, "x"] = [1, 2, 3]
self._setitem_single_column(ilocs[0], value, pi)

# Setting entire column, so swapping out
# GH#??? we may want to change this behavior
self.obj._iset_item(ilocs[0], value)

else:
raise ValueError(
Expand All @@ -1717,8 +1745,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str):
else:

# scalar value
for loc in ilocs:
self._setitem_single_column(loc, value, pi)
self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), value)
self.obj._clear_item_cache()

def _setitem_with_indexer_2d_value(self, indexer, value):
# We get here with np.ndim(value) == 2, excluding DataFrame,
Expand All @@ -1734,14 +1762,14 @@ def _setitem_with_indexer_2d_value(self, indexer, value):
"Must have equal len keys and value when setting with an ndarray"
)

for i, loc in enumerate(ilocs):
# setting with a list, re-coerces
self._setitem_single_column(loc, value[:, i].tolist(), pi)
# wrap in DataFrame to coerce where appropriate
obj = type(self.obj)(value.tolist())
self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), obj)
self.obj._clear_item_cache()

def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str):
ilocs = self._ensure_iterable_column_indexer(indexer[1])

sub_indexer = list(indexer)
pi = indexer[0]

multiindex_indexer = isinstance(self.obj.columns, ABCMultiIndex)
Expand All @@ -1750,26 +1778,14 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str

# We do not want to align the value in case of iloc GH#37728
if name == "iloc":
for i, loc in enumerate(ilocs):
val = value.iloc[:, i]
self._setitem_single_column(loc, val, pi)
self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), value)
self.obj._clear_item_cache()

elif not unique_cols and value.columns.equals(self.obj.columns):
# We assume we are already aligned, see
# test_iloc_setitem_frame_duplicate_columns_multiple_blocks
for loc in ilocs:
item = self.obj.columns[loc]
if item in value:
sub_indexer[1] = item
val = self._align_series(
tuple(sub_indexer),
value.iloc[:, loc],
multiindex_indexer,
)
else:
val = np.nan

self._setitem_single_column(loc, val, pi)
self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), value)
self.obj._clear_item_cache()

elif not unique_cols:
raise ValueError("Setting with non-unique columns is not allowed.")
Expand All @@ -1778,9 +1794,8 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str
for loc in ilocs:
item = self.obj.columns[loc]
if item in value:
sub_indexer[1] = item
val = self._align_series(
tuple(sub_indexer), value[item], multiindex_indexer
(pi, item), value[item], multiindex_indexer
)
else:
val = np.nan
Expand Down
89 changes: 72 additions & 17 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
)
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.construction import ensure_wrapped_if_datetimelike, extract_array
from pandas.core.indexers import (
check_setitem_lengths,
is_empty_indexer,
Expand Down Expand Up @@ -901,10 +901,60 @@ def setitem(self, indexer, value):
if self.is_numeric:
value = np.nan

# coerce if block dtype can store value
values = self.values

# FIXME: avoid getting here with DataFrame value; ambiguous casting
if is_extension_array_dtype(getattr(value, "dtype", None)):
# We need to be careful not to allow through strings that
# can be parsed to EADtypes
is_ea_value = True
arr_value = value
else:
is_ea_value = False
arr_value = np.array(value)

# TODO: why the ndim restriction here?
if (
self.dtype == object
and arr_value.dtype.kind in ["m", "M"]
and arr_value.size > 0
and self.ndim == 2
):
# get Timestamp/Timedelta, numpy would cast to ints (yikes!)
# FIXME: np.asarray(dta, dtype=object), dta.to_numpy(object)
# both have the same wrong numpy behavior
arr_value = ensure_wrapped_if_datetimelike(arr_value)
arr_value = np.asarray(arr_value.astype(object))
value = arr_value

if transpose:
values = values.T

# length checking
check_setitem_lengths(indexer, value, values)
exact_match = is_exact_shape_match(values, arr_value)

if not self._can_hold_element(value):
# current dtype cannot store value, coerce to common dtype

is_full = exact_match or (
isinstance(indexer, tuple)
and len(indexer) == self.ndim
and com.is_null_slice(indexer[0])
)
if is_full:
# test_loc_setitem_consistency,
# test_loc_setitem_consistency_dt64_to_float
value2 = lib.item_from_zerodim(value)
if lib.is_scalar(value2):
# TODO: de-duplicate with similar in setitem_single_block
value2 = np.full(self.shape, arr_value)
return self.make_block(value2)
elif arr_value.shape == self.shape[::-1]:
return self.make_block(arr_value.T)
else:
assert False # just checking we never get here

# TODO: can we just use coerce_to_target_dtype for all this
if hasattr(value, "dtype"):
dtype = value.dtype
Expand All @@ -930,21 +980,6 @@ def setitem(self, indexer, value):
return self

# value must be storable at this moment
if is_extension_array_dtype(getattr(value, "dtype", None)):
# We need to be careful not to allow through strings that
# can be parsed to EADtypes
is_ea_value = True
arr_value = value
else:
is_ea_value = False
arr_value = np.array(value)

if transpose:
values = values.T

# length checking
check_setitem_lengths(indexer, value, values)
exact_match = is_exact_shape_match(values, arr_value)
if is_empty_indexer(indexer, arr_value):
# GH#8669 empty indexers
pass
Expand Down Expand Up @@ -1666,6 +1701,26 @@ def setitem(self, indexer, value):
# we are always 1-D
indexer = indexer[0]

if isinstance(indexer, np.ndarray) and self.ndim == indexer.ndim == 2:
# possibly constructed with maybe_convert_ix

indexer = indexer.squeeze()
indexer = np.atleast_1d(indexer)

if (
isinstance(value, (np.ndarray, ExtensionArray))
and value.ndim == self.ndim == 2
):
# TODO: test for this
value = value.T
if value.shape[0] != 1:
raise ValueError
value = value[0]
elif isinstance(value, ABCDataFrame) and self.ndim == 2:
if value.shape[1] != 1:
raise ValueError
value = value._ixs(0, axis=1)._values

check_setitem_lengths(indexer, value, self.values)
self.values[indexer] = value
return self
Expand Down
Loading