Skip to content

BUG: iloc.__setitem__ with duplicate columns #32477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Mar 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ Indexing
- Bug in :meth:`Series.xs` incorrectly returning ``Timestamp`` instead of ``datetime64`` in some object-dtype cases (:issue:`31630`)
- Bug in :meth:`DataFrame.iat` incorrectly returning ``Timestamp`` instead of ``datetime`` in some object-dtype cases (:issue:`32809`)
- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`)
-
- Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`)

Missing
^^^^^^^
Expand Down
14 changes: 14 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2706,6 +2706,20 @@ def _setitem_frame(self, key, value):
self._check_setitem_copy()
self._where(-key, value, inplace=True)

def _iset_item(self, loc: int, value):
self._ensure_valid_index(value)

# technically _sanitize_column expects a label, not a position,
# but the behavior is the same as long as we pass broadcast=False
value = self._sanitize_column(loc, value, broadcast=False)
NDFrame._iset_item(self, loc, value)

# check if we are modifying a copy
# try to set first as we want an invalid
# value exception to occur first
if len(self):
self._check_setitem_copy()

def _set_item(self, key, value):
"""
Add series to DataFrame in specified column.
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3579,6 +3579,10 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries:
result._set_is_copy(self, copy=is_copy)
return result

def _iset_item(self, loc: int, value) -> None:
self._data.iset(loc, value)
self._clear_item_cache()

def _set_item(self, key, value) -> None:
self._data.set(key, value)
self._clear_item_cache()
Expand Down
44 changes: 27 additions & 17 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1615,6 +1615,12 @@ def _setitem_with_indexer(self, indexer, value):
info_idx = [info_idx]
labels = item_labels[info_idx]

# Ensure we have something we can iterate over
ilocs = info_idx
if isinstance(info_idx, slice):
ri = Index(range(len(self.obj.columns)))
ilocs = ri[info_idx]

plane_indexer = indexer[:1]
lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index)
# lplane_indexer gives the expected length of obj[indexer[0]]
Expand All @@ -1632,9 +1638,11 @@ def _setitem_with_indexer(self, indexer, value):
"length than the value"
)

def setter(item, v):
ser = self.obj[item]
pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer
pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer

def isetter(loc, v):
# positional setting on column loc
ser = self.obj._ixs(loc, axis=1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you comment that this is not a positional setter


# perform the equivalent of a setitem on the info axis
# as we have a null slice or a slice with full bounds
Expand All @@ -1654,7 +1662,7 @@ def setter(item, v):
ser._maybe_update_cacher(clear=True)

# reset the sliced object if unique
self.obj[item] = ser
self.obj._iset_item(loc, ser)

# we need an iterable, with a ndim of at least 1
# eg. don't pass through np.array(0)
Expand All @@ -1664,8 +1672,10 @@ def setter(item, v):
if isinstance(value, ABCDataFrame):
sub_indexer = list(indexer)
multiindex_indexer = isinstance(labels, ABCMultiIndex)
# TODO: we are implicitly assuming value.columns is unique

for item in labels:
for loc in ilocs:
item = item_labels[loc]
if item in value:
sub_indexer[info_axis] = item
v = self._align_series(
Expand All @@ -1674,7 +1684,7 @@ def setter(item, v):
else:
v = np.nan

setter(item, v)
isetter(loc, v)

# we have an equal len ndarray/convertible to our labels
# hasattr first, to avoid coercing to ndarray without reason.
Expand All @@ -1685,44 +1695,44 @@ def setter(item, v):
# note that this coerces the dtype if we are mixed
# GH 7551
value = np.array(value, dtype=object)
if len(labels) != value.shape[1]:
if len(ilocs) != value.shape[1]:
raise ValueError(
"Must have equal len keys and value "
"when setting with an ndarray"
)

for i, item in enumerate(labels):

for i, loc in enumerate(ilocs):
# setting with a list, re-coerces
setter(item, value[:, i].tolist())
isetter(loc, value[:, i].tolist())

elif (
len(labels) == 1
and lplane_indexer == len(value)
and not is_scalar(plane_indexer[0])
):
# we have an equal len list/ndarray
setter(labels[0], value)
# We only get here with len(labels) == len(ilocs) == 1
isetter(ilocs[0], value)

elif lplane_indexer == 0 and len(value) == len(self.obj.index):
# We get here in one case via .loc with a all-False mask
pass

else:
# per-label values
if len(labels) != len(value):
if len(ilocs) != len(value):
raise ValueError(
"Must have equal len keys and value "
"when setting with an iterable"
)

for item, v in zip(labels, value):
setter(item, v)
for loc, v in zip(ilocs, value):
isetter(loc, v)
else:

# scalar
for item in labels:
setter(item, value)
# scalar value
for loc in ilocs:
isetter(loc, value)

else:
if isinstance(indexer, tuple):
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1086,7 +1086,10 @@ def value_getitem(placement):
"Shape of new values must be compatible with manager shape"
)

if isinstance(loc, int):
if lib.is_integer(loc):
# We have 6 tests where loc is _not_ an int.
# In this case, get_blkno_placements will yield only one tuple,
# containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))
loc = [loc]

# Accessing public blknos ensures the public versions are initialized
Expand Down Expand Up @@ -1138,7 +1141,7 @@ def value_getitem(placement):
# one item.
new_blocks.extend(
make_block(
values=value.copy(),
values=value,
ndim=self.ndim,
placement=slice(mgr_loc, mgr_loc + 1),
)
Expand Down
30 changes: 29 additions & 1 deletion pandas/tests/indexing/test_iloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,6 @@ def test_iloc_setitem_dups(self):
df = concat([df1, df2], axis=1)

expected = df.fillna(3)
expected["A"] = expected["A"].astype("float64")
inds = np.isnan(df.iloc[:, 0])
mask = inds[inds].index
df.iloc[mask, 0] = df.iloc[mask, 2]
Expand Down Expand Up @@ -694,3 +693,32 @@ def test_series_indexing_zerodim_np_array(self):
s = Series([1, 2])
result = s.iloc[np.array(0)]
assert result == 1


class TestILocSetItemDuplicateColumns:
def test_iloc_setitem_scalar_duplicate_columns(self):
# GH#15686, duplicate columns and mixed dtype
df1 = pd.DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}])
df2 = pd.DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}])
df = pd.concat([df1, df2], axis=1)
df.iloc[0, 0] = -1

assert df.iloc[0, 0] == -1
assert df.iloc[0, 2] == 3
assert df.dtypes.iloc[2] == np.int64

def test_iloc_setitem_list_duplicate_columns(self):
# GH#22036 setting with same-sized list
df = pd.DataFrame([[0, "str", "str2"]], columns=["a", "b", "b"])

df.iloc[:, 2] = ["str3"]

expected = pd.DataFrame([[0, "str", "str3"]], columns=["a", "b", "b"])
tm.assert_frame_equal(df, expected)

def test_iloc_setitem_series_duplicate_columns(self):
df = pd.DataFrame(
np.arange(8, dtype=np.int64).reshape(2, 4), columns=["A", "B", "A", "B"]
)
df.iloc[:, 0] = df.iloc[:, 0].astype(np.float64)
assert df.dtypes.iloc[2] == np.int64