-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF: setitem_with_indexer DataFrame always go through split_path #40380
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 81 commits
2c761eb
d207a12
f0dae4d
27bd89d
a5c1f5e
15f5265
73302c0
ceeeb78
6b304b8
fe9be7e
1789fca
6400be3
a1cdd19
870ae37
c32a427
d9d8beb
8a283d9
9718f6e
9f6d8e7
dd53abc
f6b09b4
c107e77
ae3ae1b
c631079
4cfe863
03e6eb9
d280f60
9db5537
4054aea
bd76479
7ea792f
20f6a16
b9daceb
6117e8e
94b33a8
8f71f27
17049e4
fdbf6f3
f8363f9
2c6da5d
2d0cf9e
e1ed083
df9d87f
aeb687c
481ece1
95f34c8
be54f15
6896e86
b183084
fa2006b
1af8686
301d582
1d24f71
d8c7c4c
f0037e6
094e47d
6791a97
d01951d
01ff673
8e8e711
18d72ec
ba1d17c
1fca2d5
1382985
10ab24d
8e5a414
cdfc811
d761df6
b4d0211
53be486
05107a2
b87fc55
3279159
9f7e274
a4aed44
e1c30fa
e062dce
9a73ef6
4a9919b
8761ffa
99e3316
be22a99
86d3e71
d49512b
d821e8b
f142f16
eccfebb
0c430c9
295737e
405b21f
520230c
0ac5516
7a46ab8
b19a787
7bf721b
7db1a5a
ca0f516
3b456f9
1e4b617
1d203a6
4e69970
0a0e712
4ce2cd3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,6 +58,7 @@ | |
Index, | ||
MultiIndex, | ||
) | ||
from pandas.core.internals import ArrayManager | ||
|
||
if TYPE_CHECKING: | ||
from pandas import ( | ||
|
@@ -634,12 +635,12 @@ def __call__(self, axis=None): | |
new_self.axis = axis | ||
return new_self | ||
|
||
def _get_setitem_indexer(self, key): | ||
def _get_setitem_indexer(self, key, value): | ||
""" | ||
Convert a potentially-label-based key into a positional indexer. | ||
""" | ||
if self.name == "loc": | ||
self._ensure_listlike_indexer(key) | ||
self._ensure_listlike_indexer(key, value=value) | ||
|
||
if self.axis is not None: | ||
return self._convert_tuple(key) | ||
|
@@ -677,9 +678,11 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): | |
if self.ndim != 2: | ||
return | ||
|
||
pi = None | ||
if isinstance(key, tuple) and len(key) > 1: | ||
# key may be a tuple if we are .loc | ||
# if length of key is > 1 set key to column part | ||
pi = key[0] | ||
key = key[column_axis] | ||
axis = column_axis | ||
|
||
|
@@ -693,17 +696,26 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): | |
# GH#38148 | ||
keys = self.obj.columns.union(key, sort=False) | ||
|
||
self.obj._mgr = self.obj._mgr.reindex_axis( | ||
keys, axis=0, consolidate=False, only_slice=True | ||
) | ||
if isinstance(value, ABCDataFrame) and com.is_null_slice(pi): | ||
# We are setting obj.loc[:, new_keys] = newframe | ||
# Setting these directly instead of reindexing keeps | ||
# us from converting integer dtypes to floats | ||
new_keys = keys.difference(self.obj.columns) | ||
self.obj[new_keys] = value[new_keys] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it does, but only when the keys are already present in frame.columns. In this case we are only setting for non-present keys |
||
|
||
else: | ||
|
||
self.obj._mgr = self.obj._mgr.reindex_axis( | ||
keys, axis=0, consolidate=False, only_slice=True | ||
) | ||
|
||
def __setitem__(self, key, value): | ||
if isinstance(key, tuple): | ||
key = tuple(list(x) if is_iterator(x) else x for x in key) | ||
key = tuple(com.apply_if_callable(x, self.obj) for x in key) | ||
else: | ||
key = com.apply_if_callable(key, self.obj) | ||
indexer = self._get_setitem_indexer(key) | ||
indexer = self._get_setitem_indexer(key, value) | ||
self._has_valid_setitem_indexer(key) | ||
|
||
iloc = self if self.name == "iloc" else self.obj.iloc | ||
|
@@ -1238,6 +1250,8 @@ def _convert_to_indexer(self, key, axis: int): | |
key = list(key) | ||
|
||
if com.is_bool_indexer(key): | ||
# TODO: in this case should we do a .take on the value here? | ||
# test_loc_setitem_all_false_boolean_two_blocks | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @phofl could use some extra eyeballs when convenient. I've got this down to one failing test where we have an all-False mask indexer. we need to do something like
here, but not sure if this is the right place to do that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm no, I think we have to handle this case later.
would not go through there but raises the same error currently because the indexer is empty too. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we do not have a test for this case, we should add one I think. This passes on master right now |
||
key = check_bool_indexer(labels, key) | ||
(inds,) = key.nonzero() | ||
return inds | ||
|
@@ -1490,7 +1504,7 @@ def _convert_to_indexer(self, key, axis: int): | |
""" | ||
return key | ||
|
||
def _get_setitem_indexer(self, key): | ||
def _get_setitem_indexer(self, key, value): | ||
# GH#32257 Fall through to let numpy do validation | ||
if is_iterator(key): | ||
return list(key) | ||
|
@@ -1512,32 +1526,6 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): | |
""" | ||
info_axis = self.obj._info_axis_number | ||
|
||
# maybe partial set | ||
take_split_path = not self.obj._mgr.is_single_block | ||
|
||
# if there is only one block/type, still have to take split path | ||
# unless the block is one-dimensional or it can hold the value | ||
if ( | ||
not take_split_path | ||
and getattr(self.obj._mgr, "blocks", False) | ||
and self.ndim > 1 | ||
): | ||
# in case of dict, keys are indices | ||
val = list(value.values()) if isinstance(value, dict) else value | ||
blk = self.obj._mgr.blocks[0] | ||
take_split_path = not blk._can_hold_element(val) | ||
|
||
# if we have any multi-indexes that have non-trivial slices | ||
# (not null slices) then we must take the split path, xref | ||
# GH 10360, GH 27841 | ||
if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): | ||
for i, ax in zip(indexer, self.obj.axes): | ||
if isinstance(ax, MultiIndex) and not ( | ||
is_integer(i) or com.is_null_slice(i) | ||
): | ||
take_split_path = True | ||
break | ||
|
||
if isinstance(indexer, tuple): | ||
nindexer = [] | ||
for i, idx in enumerate(indexer): | ||
|
@@ -1631,7 +1619,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): | |
return | ||
|
||
# align and set the values | ||
if take_split_path: | ||
if self.ndim > 1: | ||
# We have to operate column-wise | ||
self._setitem_with_indexer_split_path(indexer, value, name) | ||
else: | ||
|
@@ -1644,23 +1632,63 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): | |
# Above we only set take_split_path to True for 2D cases | ||
assert self.ndim == 2 | ||
|
||
orig = indexer | ||
if not isinstance(indexer, tuple): | ||
indexer = _tuplify(self.ndim, indexer) | ||
if len(indexer) > self.ndim: | ||
raise IndexError("too many indices for array") | ||
if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: | ||
raise ValueError(r"Cannot set values with ndim > 2") | ||
|
||
if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): | ||
from pandas import Series | ||
|
||
value = self._align_series(indexer, Series(value)) | ||
|
||
info_idx = indexer[1] | ||
pi = indexer[0] | ||
if ( | ||
isinstance(pi, ABCDataFrame) | ||
and orig is pi | ||
and hasattr(self.obj._mgr, "blocks") | ||
and len(self.obj._mgr.blocks) == 1 | ||
): | ||
# FIXME: kludge | ||
return self._setitem_single_block(orig, value, name) | ||
|
||
if ( | ||
com.is_null_slice(info_idx) | ||
and is_scalar(value) | ||
and not isinstance(pi, ABCDataFrame) | ||
and not isinstance(self.obj._mgr, ArrayManager) | ||
): | ||
# We can go directly through BlockManager.setitem without worrying | ||
# about alignment. | ||
# TODO: do we need to do some kind of copy_with_setting check? | ||
self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) | ||
return | ||
|
||
if is_integer(info_idx) and not isinstance(self.obj._mgr, ArrayManager): | ||
if is_integer(pi): | ||
# We need to watch out for case where we are treating a listlike | ||
# as a scalar, e.g. test_setitem_iloc_scalar_single for JSONArray | ||
|
||
mgr = self.obj._mgr | ||
blkno = mgr.blknos[info_idx] | ||
blkloc = mgr.blklocs[info_idx] | ||
blk = mgr.blocks[blkno] | ||
|
||
if blk._can_hold_element(value): | ||
# NB: we are assuming here that _can_hold_element is accurate | ||
# TODO: do we need to do some kind of copy_with_setting check? | ||
self.obj._check_is_chained_assignment_possible() | ||
blk.setitem_inplace((pi, blkloc), value) | ||
self.obj._maybe_update_cacher(clear=True) | ||
return | ||
|
||
# Ensure we have something we can iterate over | ||
info_axis = indexer[1] | ||
ilocs = self._ensure_iterable_column_indexer(info_axis) | ||
|
||
pi = indexer[0] | ||
lplane_indexer = length_of_indexer(pi, self.obj.index) | ||
# lplane_indexer gives the expected length of obj[indexer[0]] | ||
|
||
|
@@ -1676,7 +1704,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): | |
|
||
elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): | ||
# We are setting multiple rows in a single column. | ||
self._setitem_single_column(ilocs[0], value, pi) | ||
self._setitem_iat_loc(ilocs[0], pi, value) | ||
|
||
elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): | ||
# We are trying to set N values into M entries of a single | ||
|
@@ -1700,7 +1728,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): | |
elif len(ilocs) == len(value): | ||
# We are setting multiple columns in a single row. | ||
for loc, v in zip(ilocs, value): | ||
self._setitem_single_column(loc, v, pi) | ||
self._setitem_iat_loc(loc, pi, v) | ||
|
||
elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: | ||
# This is a setitem-with-expansion, see | ||
|
@@ -1738,6 +1766,7 @@ def _setitem_with_indexer_2d_value(self, indexer, value): | |
|
||
for i, loc in enumerate(ilocs): | ||
# setting with a list, re-coerces | ||
# self._setitem_iat_loc(loc, pi, value[:, i].tolist()) | ||
self._setitem_single_column(loc, value[:, i].tolist(), pi) | ||
|
||
def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): | ||
|
@@ -1754,7 +1783,7 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str | |
if name == "iloc": | ||
for i, loc in enumerate(ilocs): | ||
val = value.iloc[:, i] | ||
self._setitem_single_column(loc, val, pi) | ||
self._setitem_iat_loc(loc, pi, val) | ||
|
||
elif not unique_cols and value.columns.equals(self.obj.columns): | ||
# We assume we are already aligned, see | ||
|
@@ -1771,12 +1800,15 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str | |
else: | ||
val = np.nan | ||
|
||
self._setitem_single_column(loc, val, pi) | ||
self._setitem_iat_loc(loc, pi, val) | ||
|
||
elif not unique_cols: | ||
raise ValueError("Setting with non-unique columns is not allowed.") | ||
|
||
else: | ||
# TODO: not totally clear why we are requiring this | ||
self._align_frame(indexer[0], value) | ||
|
||
for loc in ilocs: | ||
item = self.obj.columns[loc] | ||
if item in value: | ||
|
@@ -1787,7 +1819,7 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str | |
else: | ||
val = np.nan | ||
|
||
self._setitem_single_column(loc, val, pi) | ||
self._setitem_iat_loc(loc, pi, val) | ||
|
||
def _setitem_single_column(self, loc: int, value, plane_indexer): | ||
""" | ||
|
@@ -1829,6 +1861,33 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): | |
# reset the sliced object if unique | ||
self.obj._iset_item(loc, ser) | ||
|
||
def _setitem_iat_loc(self, loc: int, pi, value): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is this method supposed to do? Can you add some docstring / comment? |
||
# TODO: likely a BM method? | ||
if isinstance(self.obj._mgr, ArrayManager): | ||
# TODO: implement this correctly for ArrayManager | ||
return self._setitem_single_column(loc, value, pi) | ||
|
||
mgr = self.obj._mgr | ||
blkno = mgr.blknos[loc] | ||
blkloc = mgr.blklocs[loc] | ||
blk = mgr.blocks[blkno] | ||
assert blk.mgr_locs[blkloc] == loc | ||
|
||
if blk._can_hold_element(value): | ||
# NB: we are assuming here that _can_hold_element is accurate | ||
# TODO: do we need to do some kind of copy_with_setting check? | ||
try: | ||
self.obj._check_is_chained_assignment_possible() | ||
blk.setitem_inplace((pi, blkloc), value) | ||
self.obj._maybe_update_cacher(clear=True) | ||
except ValueError: | ||
if blk.is_extension: | ||
# FIXME: kludge bc _can_hold_element is wrong for EABLock | ||
return self._setitem_single_column(loc, value, pi) | ||
raise | ||
else: | ||
self._setitem_single_column(loc, value, pi) | ||
|
||
def _setitem_single_block(self, indexer, value, name: str): | ||
""" | ||
_setitem_with_indexer for the case when we have a single Block. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of this "workaround", should we have a manager method to set a new column based on a position? (basically what
mgr.iset
does for ArrayManager)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 on this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sure.
this is a fairly low priority, posted in the hopes it would help joris with ArrayManager indexing. #40456 is the most blockery of the things I have going ATM.