diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index e745bf3f5feed..6c0ad313350ed 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -262,7 +262,7 @@ Indexing - Bug in :meth:`Series.xs` incorrectly returning ``Timestamp`` instead of ``datetime64`` in some object-dtype cases (:issue:`31630`) - Bug in :meth:`DataFrame.iat` incorrectly returning ``Timestamp`` instead of ``datetime`` in some object-dtype cases (:issue:`32809`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) -- +- Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) Missing ^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cd5d81bc70dd9..83d44e546356c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2706,6 +2706,20 @@ def _setitem_frame(self, key, value): self._check_setitem_copy() self._where(-key, value, inplace=True) + def _iset_item(self, loc: int, value): + self._ensure_valid_index(value) + + # technically _sanitize_column expects a label, not a position, + # but the behavior is the same as long as we pass broadcast=False + value = self._sanitize_column(loc, value, broadcast=False) + NDFrame._iset_item(self, loc, value) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() + def _set_item(self, key, value): """ Add series to DataFrame in specified column. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e6c5ac9dbf733..1555de92c6ab6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3579,6 +3579,10 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: result._set_is_copy(self, copy=is_copy) return result + def _iset_item(self, loc: int, value) -> None: + self._data.iset(loc, value) + self._clear_item_cache() + def _set_item(self, key, value) -> None: self._data.set(key, value) self._clear_item_cache() diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9a671c7fc170a..c9362a0527c06 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1615,6 +1615,12 @@ def _setitem_with_indexer(self, indexer, value): info_idx = [info_idx] labels = item_labels[info_idx] + # Ensure we have something we can iterate over + ilocs = info_idx + if isinstance(info_idx, slice): + ri = Index(range(len(self.obj.columns))) + ilocs = ri[info_idx] + plane_indexer = indexer[:1] lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) # lplane_indexer gives the expected length of obj[indexer[0]] @@ -1632,9 +1638,11 @@ def _setitem_with_indexer(self, indexer, value): "length than the value" ) - def setter(item, v): - ser = self.obj[item] - pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer + pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer + + def isetter(loc, v): + # positional setting on column loc + ser = self.obj._ixs(loc, axis=1) # perform the equivalent of a setitem on the info axis # as we have a null slice or a slice with full bounds @@ -1654,7 +1662,7 @@ def setter(item, v): ser._maybe_update_cacher(clear=True) # reset the sliced object if unique - self.obj[item] = ser + self.obj._iset_item(loc, ser) # we need an iterable, with a ndim of at least 1 # eg. don't pass through np.array(0) @@ -1664,8 +1672,10 @@ def setter(item, v): if isinstance(value, ABCDataFrame): sub_indexer = list(indexer) multiindex_indexer = isinstance(labels, ABCMultiIndex) + # TODO: we are implicitly assuming value.columns is unique - for item in labels: + for loc in ilocs: + item = item_labels[loc] if item in value: sub_indexer[info_axis] = item v = self._align_series( @@ -1674,7 +1684,7 @@ def setter(item, v): else: v = np.nan - setter(item, v) + isetter(loc, v) # we have an equal len ndarray/convertible to our labels # hasattr first, to avoid coercing to ndarray without reason. @@ -1685,16 +1695,15 @@ def setter(item, v): # note that this coerces the dtype if we are mixed # GH 7551 value = np.array(value, dtype=object) - if len(labels) != value.shape[1]: + if len(ilocs) != value.shape[1]: raise ValueError( "Must have equal len keys and value " "when setting with an ndarray" ) - for i, item in enumerate(labels): - + for i, loc in enumerate(ilocs): # setting with a list, re-coerces - setter(item, value[:, i].tolist()) + isetter(loc, value[:, i].tolist()) elif ( len(labels) == 1 @@ -1702,7 +1711,8 @@ def setter(item, v): and not is_scalar(plane_indexer[0]) ): # we have an equal len list/ndarray - setter(labels[0], value) + # We only get here with len(labels) == len(ilocs) == 1 + isetter(ilocs[0], value) elif lplane_indexer == 0 and len(value) == len(self.obj.index): # We get here in one case via .loc with a all-False mask @@ -1710,19 +1720,19 @@ def setter(item, v): else: # per-label values - if len(labels) != len(value): + if len(ilocs) != len(value): raise ValueError( "Must have equal len keys and value " "when setting with an iterable" ) - for item, v in zip(labels, value): - setter(item, v) + for loc, v in zip(ilocs, value): + isetter(loc, v) else: - # scalar - for item in labels: - setter(item, value) + # scalar value + for loc in ilocs: + isetter(loc, value) else: if isinstance(indexer, tuple): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 98afc5ac3a0e3..d0a9e5cdcc6ee 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1086,7 +1086,10 @@ def value_getitem(placement): "Shape of new values must be compatible with manager shape" ) - if isinstance(loc, int): + if lib.is_integer(loc): + # We have 6 tests where loc is _not_ an int. + # In this case, get_blkno_placements will yield only one tuple, + # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1))) loc = [loc] # Accessing public blknos ensures the public versions are initialized @@ -1138,7 +1141,7 @@ def value_getitem(placement): # one item. new_blocks.extend( make_block( - values=value.copy(), + values=value, ndim=self.ndim, placement=slice(mgr_loc, mgr_loc + 1), ) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 683d4f2605712..f6b9e9a44ba14 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -349,7 +349,6 @@ def test_iloc_setitem_dups(self): df = concat([df1, df2], axis=1) expected = df.fillna(3) - expected["A"] = expected["A"].astype("float64") inds = np.isnan(df.iloc[:, 0]) mask = inds[inds].index df.iloc[mask, 0] = df.iloc[mask, 2] @@ -694,3 +693,32 @@ def test_series_indexing_zerodim_np_array(self): s = Series([1, 2]) result = s.iloc[np.array(0)] assert result == 1 + + +class TestILocSetItemDuplicateColumns: + def test_iloc_setitem_scalar_duplicate_columns(self): + # GH#15686, duplicate columns and mixed dtype + df1 = pd.DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = pd.DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) + df = pd.concat([df1, df2], axis=1) + df.iloc[0, 0] = -1 + + assert df.iloc[0, 0] == -1 + assert df.iloc[0, 2] == 3 + assert df.dtypes.iloc[2] == np.int64 + + def test_iloc_setitem_list_duplicate_columns(self): + # GH#22036 setting with same-sized list + df = pd.DataFrame([[0, "str", "str2"]], columns=["a", "b", "b"]) + + df.iloc[:, 2] = ["str3"] + + expected = pd.DataFrame([[0, "str", "str3"]], columns=["a", "b", "b"]) + tm.assert_frame_equal(df, expected) + + def test_iloc_setitem_series_duplicate_columns(self): + df = pd.DataFrame( + np.arange(8, dtype=np.int64).reshape(2, 4), columns=["A", "B", "A", "B"] + ) + df.iloc[:, 0] = df.iloc[:, 0].astype(np.float64) + assert df.dtypes.iloc[2] == np.int64