Skip to content

Bug in iloc aligned objects #37728

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Nov 19, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,7 @@ Indexing
- Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`)
- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when numeric label was given for object :class:`Index` although label was in :class:`Index` (:issue:`26491`)
- Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from :class:`MultiIndex` (:issue:`27104`)
- Bug in :meth:`DataFrame.iloc` and :meth:`Series.iloc` aligned objects in ``__setitem__`` (:issue:`22046`)

Missing
^^^^^^^
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3111,7 +3111,7 @@ def _setitem_slice(self, key: slice, value):
# operates on labels and we need to operate positional for
# backwards-compat, xref GH#31469
self._check_setitem_copy()
self.iloc._setitem_with_indexer(key, value)
self.iloc._setitem_with_indexer(key, value, self.iloc.name)

def _setitem_array(self, key, value):
# also raises Exception if object array with NA values
Expand All @@ -3123,7 +3123,7 @@ def _setitem_array(self, key, value):
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
self._check_setitem_copy()
self.iloc._setitem_with_indexer(indexer, value)
self.iloc._setitem_with_indexer(indexer, value, self.iloc.name)
else:
if isinstance(value, DataFrame):
if len(value.columns) != len(key):
Expand All @@ -3136,7 +3136,9 @@ def _setitem_array(self, key, value):
key, axis=1, raise_missing=False
)[1]
self._check_setitem_copy()
self.iloc._setitem_with_indexer((slice(None), indexer), value)
self.iloc._setitem_with_indexer(
(slice(None), indexer), value, self.iloc.name
)

def _setitem_frame(self, key, value):
# support boolean setting with DataFrame input, e.g.
Expand Down
37 changes: 21 additions & 16 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,7 @@ def __setitem__(self, key, value):
self._has_valid_setitem_indexer(key)

iloc = self if self.name == "iloc" else self.obj.iloc
iloc._setitem_with_indexer(indexer, value)
iloc._setitem_with_indexer(indexer, value, self.name)

def _validate_key(self, key, axis: int):
"""
Expand Down Expand Up @@ -1525,7 +1525,7 @@ def _get_setitem_indexer(self, key):

# -------------------------------------------------------------------

def _setitem_with_indexer(self, indexer, value):
def _setitem_with_indexer(self, indexer, value, name):
"""
_setitem_with_indexer is for setting values on a Series/DataFrame
using positional indexers.
Expand Down Expand Up @@ -1601,7 +1601,7 @@ def _setitem_with_indexer(self, indexer, value):
new_indexer = convert_from_missing_indexer_tuple(
indexer, self.obj.axes
)
self._setitem_with_indexer(new_indexer, value)
self._setitem_with_indexer(new_indexer, value, name)

return

Expand All @@ -1626,17 +1626,17 @@ def _setitem_with_indexer(self, indexer, value):
indexer, missing = convert_missing_indexer(indexer)

if missing:
self._setitem_with_indexer_missing(indexer, value)
self._setitem_with_indexer_missing(indexer, value, name)
return

# align and set the values
if take_split_path:
# We have to operate column-wise
self._setitem_with_indexer_split_path(indexer, value)
self._setitem_with_indexer_split_path(indexer, value, name)
else:
self._setitem_single_block(indexer, value)
self._setitem_single_block(indexer, value, name)

def _setitem_with_indexer_split_path(self, indexer, value):
def _setitem_with_indexer_split_path(self, indexer, value, name):
"""
Setitem column-wise.
"""
Expand All @@ -1648,7 +1648,7 @@ def _setitem_with_indexer_split_path(self, indexer, value):
if len(indexer) > self.ndim:
raise IndexError("too many indices for array")

if isinstance(value, ABCSeries):
if isinstance(value, ABCSeries) and name != "iloc":
value = self._align_series(indexer, value)

# Ensure we have something we can iterate over
Expand Down Expand Up @@ -1677,7 +1677,7 @@ def _setitem_with_indexer_split_path(self, indexer, value):

# we have an equal len Frame
if isinstance(value, ABCDataFrame):
self._setitem_with_indexer_frame_value(indexer, value)
self._setitem_with_indexer_frame_value(indexer, value, name)

# we have an equal len ndarray/convertible to our ilocs
# hasattr first, to avoid coercing to ndarray without reason.
Expand Down Expand Up @@ -1736,7 +1736,7 @@ def _setitem_with_indexer_2d_value(self, indexer, value):
# setting with a list, re-coerces
self._setitem_single_column(loc, value[:, i].tolist(), plane_indexer)

def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame"):
def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame", name):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you type name here (and above)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

ilocs = self._ensure_iterable_column_indexer(indexer[1])

sub_indexer = list(indexer)
Expand All @@ -1746,7 +1746,12 @@ def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame"):

unique_cols = value.columns.is_unique

if not unique_cols and value.columns.equals(self.obj.columns):
if name == "iloc":
for index, loc in zip(range(len(ilocs)), ilocs):
val = value.iloc[:, index]
self._setitem_single_column(loc, val, plane_indexer)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can this be solved with an appropriate call to _align_frame or _align_series? id really prefer to avoid passing name around. by the time we get to setitem_with_indexer we should always be working with positional indexers (i.e. iloc)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm no not to my knowledge. The issue is, that we want to achieve oposing things between loc and iloc, but the objects do look the same. setitem_with_indexer could be solved with a default value, that is not a problem.

But I can not see a way to do this without passing some kind of flag to the align calls.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looked into this a bit again. The indexer in setitem_with_indexer already is positional, but the value may not be properly aligned for loc, hence we call the align functions there. This is not necessary for the iloc path, but this information is lost there, if we do not pass it through somehow.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it sounds like we should be doing some appropriate alignment within Loc before passing it into iLoc?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be an option, but would result in a bigger rewrite and I am not sure if that would introduce more complexity, because the align_series method is called in various places. Should I give it a try?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it looks like this PR is making it so that we never call align_frame/align_series if setitem_with_indexer was called from iloc, only if it was called from loc. setitem_with_indexer is only called from one place within loc. Can we just do that alignment right before that call?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, we could try, but it is not that simple. For example before calling setitem_with_indexer from loc, value may be a DataFrame with different dtypes. Currently _align_series is called for every column to get a ndarray with the correct dtype. If we call this before calling setitem_with_indexer we would have to call _align_frame or copy the logic, which decides if we align every colum at once or separately. _align_frame returns an object dtype array, which obviously would introduce a lot of dtype errors. That is what I meant with a bigger rewrite here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for explaining, i think i have a better handle on it now.

ive got an upcoming PR to make all 2D cases go through split_path. let's revisit this after that goes through and see if we can make it cleaner

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds great. Just ping me when merged, then I will have a look.


elif not unique_cols and value.columns.equals(self.obj.columns):
# We assume we are already aligned, see
# test_iloc_setitem_frame_duplicate_columns_multiple_blocks
for loc in ilocs:
Expand Down Expand Up @@ -1804,7 +1809,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
# reset the sliced object if unique
self.obj._iset_item(loc, ser)

def _setitem_single_block(self, indexer, value):
def _setitem_single_block(self, indexer, value, name):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

type

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

"""
_setitem_with_indexer for the case when we have a single Block.
"""
Expand Down Expand Up @@ -1833,13 +1838,13 @@ def _setitem_single_block(self, indexer, value):

indexer = maybe_convert_ix(*indexer)

if isinstance(value, (ABCSeries, dict)):
if isinstance(value, (ABCSeries, dict)) and name != "iloc":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we have a dict here dont we still want to wrap it in a Series?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, have no tests getting here with iloc. Will add one

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LMK when these tests are all ready and ill take another look. this one is almost ready

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests are all in

# TODO(EA): ExtensionBlock.setitem this causes issues with
# setting for extensionarrays that store dicts. Need to decide
# if it's worth supporting that.
value = self._align_series(indexer, Series(value))

elif isinstance(value, ABCDataFrame):
elif isinstance(value, ABCDataFrame) and name != "iloc":
value = self._align_frame(indexer, value)

# check for chained assignment
Expand All @@ -1850,7 +1855,7 @@ def _setitem_single_block(self, indexer, value):
self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
self.obj._maybe_update_cacher(clear=True)

def _setitem_with_indexer_missing(self, indexer, value):
def _setitem_with_indexer_missing(self, indexer, value, name):
"""
Insert new row(s) or column(s) into the Series or DataFrame.
"""
Expand All @@ -1871,7 +1876,7 @@ def _setitem_with_indexer_missing(self, indexer, value):
if index.is_unique:
new_indexer = index.get_indexer([new_index[-1]])
if (new_indexer != -1).any():
return self._setitem_with_indexer(new_indexer, value)
return self._setitem_with_indexer(new_indexer, value, name)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

double check me on this, but i think it might be the case that we only get here with name = "loc"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, you are right. Iloc raises when indexer is a dict. That is the only way we can get in there. Removed it


# this preserves dtype of the value
new_values = Series([value])._values
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,14 @@ def test_iloc_setitem_bool_indexer(self, klass):
expected = DataFrame({"flag": ["x", "y", "z"], "value": [2, 3, 4]})
tm.assert_frame_equal(df, expected)

def test_setitem_iloc_pure_position_based(self):
# GH: 22046
df1 = DataFrame({"a2": [11, 12, 13], "b2": [14, 15, 16]})
df2 = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
df2.iloc[:, [1]] = df1.iloc[:, [0]]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are there other indexer/value type combinations that are relevant here? e.g. the value being set here is a DataFrame, would a Series trigger the same bug? what if instead of [1] the indexer was slice(1, 2)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

df1 = DataFrame({"a2": [11, 12, 13], "b2": [14, 15, 16]})
df2 = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
df2.iloc[:, [1]] = df1.iloc[:, 0]

raises

Traceback (most recent call last):
  File "/home/developer/.config/JetBrains/PyCharm2020.2/scratches/scratch_4.py", line 132, in <module>
    df2.iloc[:, [1]] = df1.iloc[:, 0]
  File "/home/developer/PycharmProjects/pandas/pandas/core/indexing.py", line 684, in __setitem__
    iloc._setitem_with_indexer(indexer, value)
  File "/home/developer/PycharmProjects/pandas/pandas/core/indexing.py", line 1637, in _setitem_with_indexer
    self._setitem_single_block(indexer, value)
  File "/home/developer/PycharmProjects/pandas/pandas/core/indexing.py", line 1851, in _setitem_single_block
    self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
  File "/home/developer/PycharmProjects/pandas/pandas/core/internals/managers.py", line 562, in setitem
    return self.apply("setitem", indexer=indexer, value=value)
  File "/home/developer/PycharmProjects/pandas/pandas/core/internals/managers.py", line 427, in apply
    applied = getattr(b, f)(**kwargs)
  File "/home/developer/PycharmProjects/pandas/pandas/core/internals/blocks.py", line 1005, in setitem
    values[indexer] = value
ValueError: shape mismatch: value array of shape (3,) could not be broadcast to indexing result of shape (1,3)

Process finished with exit code 1

Did not check slice previously, this triggered the bug too. Will add test

expected = DataFrame({"a": [1, 2, 3], "b": [11, 12, 13], "c": [7, 8, 9]})
tm.assert_frame_equal(df2, expected)


class TestDataFrameSetItemSlicing:
def test_setitem_slice_position(self):
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/indexing/test_iloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,13 @@ def test_iloc_setitem_empty_frame_raises_with_3d_ndarray(self):
with pytest.raises(ValueError, match=msg):
obj.iloc[nd3] = 0

def test_iloc_assign_series_to_df_cell(self):
# GH 37593
df = DataFrame(columns=["a"], index=[0])
df.iloc[0, 0] = Series([1, 2, 3])
expected = DataFrame({"a": [Series([1, 2, 3])]}, columns=["a"], index=[0])
tm.assert_frame_equal(df, expected)


class TestILocErrors:
# NB: this test should work for _any_ Series we can pass as
Expand Down
27 changes: 16 additions & 11 deletions pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,43 +671,48 @@ def test_float_index_at_iat(self):
def test_rhs_alignment(self):
# GH8258, tests that both rows & columns are aligned to what is
# assigned to. covers both uniform data-type & multi-type cases
def run_tests(df, rhs, right):
def run_tests(df, rhs, right_loc, right_iloc):
# label, index, slice
lbl_one, idx_one, slice_one = list("bcd"), [1, 2, 3], slice(1, 4)
lbl_two, idx_two, slice_two = ["joe", "jolie"], [1, 2], slice(1, 3)

left = df.copy()
left.loc[lbl_one, lbl_two] = rhs
tm.assert_frame_equal(left, right)
tm.assert_frame_equal(left, right_loc)

left = df.copy()
left.iloc[idx_one, idx_two] = rhs
tm.assert_frame_equal(left, right)
tm.assert_frame_equal(left, right_iloc)

left = df.copy()
left.iloc[slice_one, slice_two] = rhs
tm.assert_frame_equal(left, right)
tm.assert_frame_equal(left, right_iloc)

xs = np.arange(20).reshape(5, 4)
cols = ["jim", "joe", "jolie", "joline"]
df = DataFrame(xs, columns=cols, index=list("abcde"))
df = DataFrame(xs, columns=cols, index=list("abcde"), dtype="int64")

# right hand side; permute the indices and multiplpy by -2
rhs = -2 * df.iloc[3:0:-1, 2:0:-1]

# expected `right` result; just multiply by -2
right = df.copy()
right.iloc[1:4, 1:3] *= -2
right_iloc = df.copy()
right_iloc["joe"] = [1, 14, 10, 6, 17]
right_iloc["jolie"] = [2, 13, 9, 5, 18]
right_iloc.iloc[1:4, 1:3] *= -2
right_loc = df.copy()
right_loc.iloc[1:4, 1:3] *= -2

# run tests with uniform dtypes
run_tests(df, rhs, right)
run_tests(df, rhs, right_loc, right_iloc)

# make frames multi-type & re-run tests
for frame in [df, rhs, right]:
for frame in [df, rhs, right_loc, right_iloc]:
frame["joe"] = frame["joe"].astype("float64")
frame["jolie"] = frame["jolie"].map("@{}".format)

run_tests(df, rhs, right)
right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0]
right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"]
run_tests(df, rhs, right_loc, right_iloc)

def test_str_label_slicing_with_negative_step(self):
SLC = pd.IndexSlice
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,12 @@ def test_setitem_slice_into_readonly_backing_data():
series[1:3] = 1

assert not array.any()


def test_setitem_iloc_pure_position_based():
# GH: 22046
ser1 = Series([1, 2, 3])
ser2 = Series([4, 5, 6], index=[1, 0, 2])
ser1.iloc[1:3] = ser2.iloc[1:3]
expected = Series([1, 5, 6])
tm.assert_series_equal(ser1, expected)