BUG: iloc.__setitem__ with duplicate columns (#32477)

jbrockmendel · web-flow · commit 33f67d98aa74 · 2020-03-10T22:35:21.000-04:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -263,7 +263,7 @@ Indexing
 - Bug in :meth:`Series.xs` incorrectly returning ``Timestamp`` instead of ``datetime64`` in some object-dtype cases (:issue:`31630`)
 - Bug in :meth:`DataFrame.iat` incorrectly returning ``Timestamp`` instead of ``datetime`` in some object-dtype cases (:issue:`32809`)
 - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`)
--
+- Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`)
 
 Missing
 ^^^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2708,6 +2708,20 @@ def _setitem_frame(self, key, value):
         self._check_setitem_copy()
         self._where(-key, value, inplace=True)
 
+    def _iset_item(self, loc: int, value):
+        self._ensure_valid_index(value)
+
+        # technically _sanitize_column expects a label, not a position,
+        #  but the behavior is the same as long as we pass broadcast=False
+        value = self._sanitize_column(loc, value, broadcast=False)
+        NDFrame._iset_item(self, loc, value)
+
+        # check if we are modifying a copy
+        # try to set first as we want an invalid
+        # value exception to occur first
+        if len(self):
+            self._check_setitem_copy()
+
     def _set_item(self, key, value):
         """
         Add series to DataFrame in specified column.
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3579,6 +3579,10 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries:
         result._set_is_copy(self, copy=is_copy)
         return result
 
+    def _iset_item(self, loc: int, value) -> None:
+        self._data.iset(loc, value)
+        self._clear_item_cache()
+
     def _set_item(self, key, value) -> None:
         self._data.set(key, value)
         self._clear_item_cache()
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1615,6 +1615,12 @@ def _setitem_with_indexer(self, indexer, value):
                 info_idx = [info_idx]
             labels = item_labels[info_idx]
 
+            # Ensure we have something we can iterate over
+            ilocs = info_idx
+            if isinstance(info_idx, slice):
+                ri = Index(range(len(self.obj.columns)))
+                ilocs = ri[info_idx]
+
             plane_indexer = indexer[:1]
             lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index)
             # lplane_indexer gives the expected length of obj[indexer[0]]
@@ -1632,9 +1638,11 @@ def _setitem_with_indexer(self, indexer, value):
                         "length than the value"
                     )
 
-            def setter(item, v):
-                ser = self.obj[item]
-                pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer
+            pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer
+
+            def isetter(loc, v):
+                # positional setting on column loc
+                ser = self.obj._ixs(loc, axis=1)
 
                 # perform the equivalent of a setitem on the info axis
                 # as we have a null slice or a slice with full bounds
@@ -1654,7 +1662,7 @@ def setter(item, v):
                     ser._maybe_update_cacher(clear=True)
 
                 # reset the sliced object if unique
-                self.obj[item] = ser
+                self.obj._iset_item(loc, ser)
 
             # we need an iterable, with a ndim of at least 1
             # eg. don't pass through np.array(0)
@@ -1664,8 +1672,10 @@ def setter(item, v):
                 if isinstance(value, ABCDataFrame):
                     sub_indexer = list(indexer)
                     multiindex_indexer = isinstance(labels, ABCMultiIndex)
+                    # TODO: we are implicitly assuming value.columns is unique
 
-                    for item in labels:
+                    for loc in ilocs:
+                        item = item_labels[loc]
                         if item in value:
                             sub_indexer[info_axis] = item
                             v = self._align_series(
@@ -1674,7 +1684,7 @@ def setter(item, v):
                         else:
                             v = np.nan
 
-                        setter(item, v)
+                        isetter(loc, v)
 
                 # we have an equal len ndarray/convertible to our labels
                 # hasattr first, to avoid coercing to ndarray without reason.
@@ -1685,44 +1695,44 @@ def setter(item, v):
                     # note that this coerces the dtype if we are mixed
                     # GH 7551
                     value = np.array(value, dtype=object)
-                    if len(labels) != value.shape[1]:
+                    if len(ilocs) != value.shape[1]:
                         raise ValueError(
                             "Must have equal len keys and value "
                             "when setting with an ndarray"
                         )
 
-                    for i, item in enumerate(labels):
-
+                    for i, loc in enumerate(ilocs):
                         # setting with a list, re-coerces
-                        setter(item, value[:, i].tolist())
+                        isetter(loc, value[:, i].tolist())
 
                 elif (
                     len(labels) == 1
                     and lplane_indexer == len(value)
                     and not is_scalar(plane_indexer[0])
                 ):
                     # we have an equal len list/ndarray
-                    setter(labels[0], value)
+                    # We only get here with len(labels) == len(ilocs) == 1
+                    isetter(ilocs[0], value)
 
                 elif lplane_indexer == 0 and len(value) == len(self.obj.index):
                     # We get here in one case via .loc with a all-False mask
                     pass
 
                 else:
                     # per-label values
-                    if len(labels) != len(value):
+                    if len(ilocs) != len(value):
                         raise ValueError(
                             "Must have equal len keys and value "
                             "when setting with an iterable"
                         )
 
-                    for item, v in zip(labels, value):
-                        setter(item, v)
+                    for loc, v in zip(ilocs, value):
+                        isetter(loc, v)
             else:
 
-                # scalar
-                for item in labels:
-                    setter(item, value)
+                # scalar value
+                for loc in ilocs:
+                    isetter(loc, value)
 
         else:
             if isinstance(indexer, tuple):
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1096,7 +1096,10 @@ def value_getitem(placement):
                     "Shape of new values must be compatible with manager shape"
                 )
 
-        if isinstance(loc, int):
+        if lib.is_integer(loc):
+            # We have 6 tests where loc is _not_ an int.
+            # In this case, get_blkno_placements will yield only one tuple,
+            #  containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))
             loc = [loc]
 
         # Accessing public blknos ensures the public versions are initialized
@@ -1148,7 +1151,7 @@ def value_getitem(placement):
                 # one item.
                 new_blocks.extend(
                     make_block(
-                        values=value.copy(),
+                        values=value,
                         ndim=self.ndim,
                         placement=slice(mgr_loc, mgr_loc + 1),
                     )
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
@@ -349,7 +349,6 @@ def test_iloc_setitem_dups(self):
         df = concat([df1, df2], axis=1)
 
         expected = df.fillna(3)
-        expected["A"] = expected["A"].astype("float64")
         inds = np.isnan(df.iloc[:, 0])
         mask = inds[inds].index
         df.iloc[mask, 0] = df.iloc[mask, 2]
@@ -694,3 +693,32 @@ def test_series_indexing_zerodim_np_array(self):
         s = Series([1, 2])
         result = s.iloc[np.array(0)]
         assert result == 1
+
+
+class TestILocSetItemDuplicateColumns:
+    def test_iloc_setitem_scalar_duplicate_columns(self):
+        # GH#15686, duplicate columns and mixed dtype
+        df1 = pd.DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}])
+        df2 = pd.DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}])
+        df = pd.concat([df1, df2], axis=1)
+        df.iloc[0, 0] = -1
+
+        assert df.iloc[0, 0] == -1
+        assert df.iloc[0, 2] == 3
+        assert df.dtypes.iloc[2] == np.int64
+
+    def test_iloc_setitem_list_duplicate_columns(self):
+        # GH#22036 setting with same-sized list
+        df = pd.DataFrame([[0, "str", "str2"]], columns=["a", "b", "b"])
+
+        df.iloc[:, 2] = ["str3"]
+
+        expected = pd.DataFrame([[0, "str", "str3"]], columns=["a", "b", "b"])
+        tm.assert_frame_equal(df, expected)
+
+    def test_iloc_setitem_series_duplicate_columns(self):
+        df = pd.DataFrame(
+            np.arange(8, dtype=np.int64).reshape(2, 4), columns=["A", "B", "A", "B"]
+        )
+        df.iloc[:, 0] = df.iloc[:, 0].astype(np.float64)
+        assert df.dtypes.iloc[2] == np.int64