From 433098475b2914cfe518807adfd7614691a762d4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 28 Feb 2020 13:24:50 -0800 Subject: [PATCH 1/9] Fix iloc setitem with duplicate columns for scalar case --- pandas/core/frame.py | 12 +++++++ pandas/core/generic.py | 4 +++ pandas/core/indexing.py | 52 +++++++++++++++++++++--------- pandas/core/internals/managers.py | 14 ++++++++ pandas/tests/indexing/test_iloc.py | 19 +++++++++++ 5 files changed, 86 insertions(+), 15 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f304fadbab871..8bb6a7e8c8621 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2708,6 +2708,18 @@ def _setitem_frame(self, key, value): self._check_setitem_copy() self._where(-key, value, inplace=True) + def _iset_item(self, loc: int, value): + self._ensure_valid_index(value) + value = self._sanitize_column(loc, value) + # FIXME: sanitize_column isnt for iloc + NDFrame._iset_item(self, loc, value) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() + def _set_item(self, key, value): """ Add series to DataFrame in specified column. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 25770c2c6470c..3558f1569098e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3490,6 +3490,10 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: result._set_is_copy(self, copy=is_copy) return result + def _iset_item(self, loc: int, value) -> None: + self._data.iset(loc, value) + self._clear_item_cache() + def _set_item(self, key, value) -> None: self._data.set(key, value) self._clear_item_cache() diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 3ab180bafd156..aa24f49adef73 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1633,14 +1633,12 @@ def _setitem_with_indexer(self, indexer, value): info_idx = [info_idx] labels = item_labels[info_idx] + plane_indexer = indexer[:1] + lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) + # lplane_indexer gives the expected length of obj[indexer[0]] + if len(labels) == 1: # We can operate on a single column - item = labels[0] - idx = indexer[0] - - plane_indexer = tuple([idx]) - lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) - # lplane_indexer gives the expected length of obj[idx] # require that we are setting the right number of values that # we are indexing @@ -1652,14 +1650,33 @@ def _setitem_with_indexer(self, indexer, value): "length than the value" ) - # non-mi - else: - plane_indexer = indexer[:1] - lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) + pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer + + def isetter(loc, v): + ser = self.obj._ixs(loc, axis=1) + + # perform the equivalent of a setitem on the info axis + # as we have a null slice or a slice with full bounds + # which means essentially reassign to the columns of a + # multi-dim object + # GH6149 (null slice), GH10408 (full bounds) + if isinstance(pi, tuple) and all( + com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) + for idx in pi + ): + ser = v + else: + # set the item, possibly having a dtype change + ser._consolidate_inplace() + ser = ser.copy() + ser._data = ser._data.setitem(indexer=pi, value=v) + ser._maybe_update_cacher(clear=True) + + # reset the sliced object if unique + self.obj._iset_item(loc, ser) def setter(item, v): ser = self.obj[item] - pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer # perform the equivalent of a setitem on the info axis # as we have a null slice or a slice with full bounds @@ -1740,13 +1757,16 @@ def setter(item, v): setter(item, v) else: - # scalar - for item in labels: - setter(item, value) + # scalar value + if isinstance(info_idx, slice): + # TODO: wrong place for this + ri = Index(range(len(self.obj.columns))) + info_idx = ri[info_idx] + for loc in info_idx: + isetter(loc, value) else: if isinstance(indexer, tuple): - indexer = maybe_convert_ix(*indexer) # if we are setting on the info axis ONLY # set using those methods to avoid block-splitting @@ -1764,6 +1784,8 @@ def setter(item, v): self.obj[item_labels[indexer[info_axis]]] = value return + indexer = maybe_convert_ix(*indexer) + if isinstance(value, (ABCSeries, dict)): # TODO(EA): ExtensionBlock.setitem this causes issues with # setting for extensionarrays that store dicts. Need to decide diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e397167e4881f..49c9de39eb438 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -995,6 +995,20 @@ def delete(self, item): self._shape = None self._rebuild_blknos_and_blklocs() + def iset(self, loc: int, value): + if loc >= len(self.items) or self.ndim != 2: + raise NotImplementedError + + items = self.items + try: + # In order to re-use `set`, we temporarily patch our items + # TODO: This really should defer the other way around + self.axes[0] = Index(range(len(items))) + + self.set(loc, value) + finally: + self.axes[0] = items + def set(self, item, value): """ Set new item in-place. Does not consolidate. Adds new Block if not diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 683d4f2605712..e9ef244353b0d 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -694,3 +694,22 @@ def test_series_indexing_zerodim_np_array(self): s = Series([1, 2]) result = s.iloc[np.array(0)] assert result == 1 + + +class TestILocSetItemDuplicateColumns: + def test_iloc_setitem_scalar_duplicate_columns(self): + # GH#15686, duplicate columns and mixed dtype + df1 = pd.DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = pd.DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) + df = pd.concat([df1, df2], axis=1) + df.iloc[0, 0] = -1 + + assert df.iloc[0, 0] == -1 + assert df.iloc[0, 2] == 3 + assert df.dtypes.iloc[2] == np.int64 + + @pytest.mark.xfail(reason="only scalars fixed so far") + def test_iloc_setitem_series_duplicate_columns(self): + df = pd.DataFrame(np.arange(8).reshape(2, 4), columns=["A", "B", "A", "B"]) + df.iloc[:, 0] = df.iloc[:, 0].astype(np.float64) + assert df.dtypes.iloc[2] == np.int64 # Still broken From 620c5b9c4cf91d5cedd5b451e084b0a4a029ced9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 28 Feb 2020 15:19:36 -0800 Subject: [PATCH 2/9] Fix setitem in more cases --- pandas/core/indexing.py | 39 +++++++++++++++--------------- pandas/tests/indexing/test_iloc.py | 13 +++++++--- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index aa24f49adef73..96aa1e6d6d581 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1633,6 +1633,12 @@ def _setitem_with_indexer(self, indexer, value): info_idx = [info_idx] labels = item_labels[info_idx] + # Ensure we have something we can iterate over + ilocs = info_idx + if isinstance(info_idx, slice): + ri = Index(range(len(self.obj.columns))) + ilocs = ri[info_idx] + plane_indexer = indexer[:1] lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) # lplane_indexer gives the expected length of obj[indexer[0]] @@ -1727,42 +1733,38 @@ def setter(item, v): # note that this coerces the dtype if we are mixed # GH 7551 value = np.array(value, dtype=object) - if len(labels) != value.shape[1]: + if len(ilocs) != value.shape[1]: raise ValueError( "Must have equal len keys and value " "when setting with an ndarray" ) - for i, item in enumerate(labels): - - # setting with a list, recoerces - setter(item, value[:, i].tolist()) + for i, loc in enumerate(ilocs): + # setting with a list, re-coerces + isetter(loc, value[:, i].tolist()) # we have an equal len list/ndarray elif _can_do_equal_len( - labels, value, plane_indexer, lplane_indexer, self.obj + labels, value, plane_indexer, lplane_indexer, self.obj.index ): - setter(labels[0], value) + # We only get here with len(labels) == len(ilocs) == 1 + isetter(ilocs[0], value) # per label values else: - if len(labels) != len(value): + if len(ilocs) != len(value): raise ValueError( "Must have equal len keys and value " "when setting with an iterable" ) - for item, v in zip(labels, value): - setter(item, v) + for loc, v in zip(ilocs, value): + isetter(loc, v) else: # scalar value - if isinstance(info_idx, slice): - # TODO: wrong place for this - ri = Index(range(len(self.obj.columns))) - info_idx = ri[info_idx] - for loc in info_idx: + for loc in ilocs: isetter(loc, value) else: @@ -2301,7 +2303,7 @@ def _maybe_numeric_slice(df, slice_, include_bool=False): return slice_ -def _can_do_equal_len(labels, value, plane_indexer, lplane_indexer, obj) -> bool: +def _can_do_equal_len(labels, value, plane_indexer, lplane_indexer, index) -> bool: """ Returns ------- @@ -2311,12 +2313,11 @@ def _can_do_equal_len(labels, value, plane_indexer, lplane_indexer, obj) -> bool if not len(labels) == 1 or not np.iterable(value) or is_scalar(plane_indexer[0]): return False - item = labels[0] - index = obj[item].index - values_len = len(value) # equal len list/ndarray if len(index) == values_len: + # FIXME: this looks wrong; the only test that breaks if we do this + # has an all-False boolean mask here return True elif lplane_indexer == values_len: return True diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index e9ef244353b0d..d5164044c4952 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -349,7 +349,6 @@ def test_iloc_setitem_dups(self): df = concat([df1, df2], axis=1) expected = df.fillna(3) - expected["A"] = expected["A"].astype("float64") inds = np.isnan(df.iloc[:, 0]) mask = inds[inds].index df.iloc[mask, 0] = df.iloc[mask, 2] @@ -708,8 +707,16 @@ def test_iloc_setitem_scalar_duplicate_columns(self): assert df.iloc[0, 2] == 3 assert df.dtypes.iloc[2] == np.int64 - @pytest.mark.xfail(reason="only scalars fixed so far") + def test_iloc_setitem_list_duplicate_columns(self): + # GH#22036 setting with same-sized list + df = pd.DataFrame([[0, "str", "str2"]], columns=["a", "b", "b"]) + + df.iloc[:, 2] = ["str3"] + + expected = pd.DataFrame([[0, "str", "str3"]], columns=["a", "b", "b"]) + tm.assert_frame_equal(df, expected) + def test_iloc_setitem_series_duplicate_columns(self): df = pd.DataFrame(np.arange(8).reshape(2, 4), columns=["A", "B", "A", "B"]) df.iloc[:, 0] = df.iloc[:, 0].astype(np.float64) - assert df.dtypes.iloc[2] == np.int64 # Still broken + assert df.dtypes.iloc[2] == np.int64 From 08f9df3dacc2ec5087f9757181a78968703996cb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 28 Feb 2020 16:06:53 -0800 Subject: [PATCH 3/9] BUG: fix setitem_with_indexer --- pandas/core/indexing.py | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 96aa1e6d6d581..ab320e3f1b78d 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1681,29 +1681,6 @@ def isetter(loc, v): # reset the sliced object if unique self.obj._iset_item(loc, ser) - def setter(item, v): - ser = self.obj[item] - - # perform the equivalent of a setitem on the info axis - # as we have a null slice or a slice with full bounds - # which means essentially reassign to the columns of a - # multi-dim object - # GH6149 (null slice), GH10408 (full bounds) - if isinstance(pi, tuple) and all( - com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) - for idx in pi - ): - ser = v - else: - # set the item, possibly having a dtype change - ser._consolidate_inplace() - ser = ser.copy() - ser._data = ser._data.setitem(indexer=pi, value=v) - ser._maybe_update_cacher(clear=True) - - # reset the sliced object if unique - self.obj[item] = ser - # we need an iterable, with a ndim of at least 1 # eg. don't pass through np.array(0) if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: @@ -1712,8 +1689,10 @@ def setter(item, v): if isinstance(value, ABCDataFrame): sub_indexer = list(indexer) multiindex_indexer = isinstance(labels, ABCMultiIndex) + # TODO: we are implicitly assuming value.columns is unique - for item in labels: + for loc in ilocs: + item = item_labels[loc] if item in value: sub_indexer[info_axis] = item v = self._align_series( @@ -1722,7 +1701,7 @@ def setter(item, v): else: v = np.nan - setter(item, v) + isetter(loc, v) # we have an equal len ndarray/convertible to our labels # hasattr first, to avoid coercing to ndarray without reason. From 5b4711daf33d2019a8abc6f7a596df6de3d2993a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 28 Feb 2020 19:35:28 -0800 Subject: [PATCH 4/9] comment, no copy needed --- pandas/core/internals/managers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 49c9de39eb438..146c934e1b2a6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1050,6 +1050,9 @@ def value_getitem(placement): return if isinstance(loc, int): + # We have 6 tests where loc is _not_ an int. + # In this case, get_blkno_placements will yield only one tuple, + # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1))) loc = [loc] blknos = self._blknos[loc] @@ -1100,7 +1103,7 @@ def value_getitem(placement): # one item. new_blocks.extend( make_block( - values=value.copy(), + values=value, ndim=self.ndim, placement=slice(mgr_loc, mgr_loc + 1), ) From 906304765e8e11d020d0b651817c8ad036ba81cd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Mar 2020 10:15:06 -0800 Subject: [PATCH 5/9] update check --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index dffe4bb97734a..cb74ea1387018 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1052,7 +1052,7 @@ def value_getitem(placement): "Shape of new values must be compatible with manager shape" ) - if isinstance(loc, int): + if lib.is_integer(loc): # We have 6 tests where loc is _not_ an int. # In this case, get_blkno_placements will yield only one tuple, # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1))) From 280418ef7aee2c76dcc1a0559db8a47d72dd79f6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Mar 2020 16:31:05 -0800 Subject: [PATCH 6/9] comment --- pandas/core/frame.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f6c4f860c6dc5..bac8fdb8acf89 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2708,8 +2708,10 @@ def _setitem_frame(self, key, value): def _iset_item(self, loc: int, value): self._ensure_valid_index(value) - value = self._sanitize_column(loc, value) - # FIXME: sanitize_column isnt for iloc + + # passing loc to sanitize_column is a misnomer, but harmless + # with broadcast=False as long as value is never a DataFrame + value = self._sanitize_column(loc, value, broadcast=False) NDFrame._iset_item(self, loc, value) # check if we are modifying a copy From 4a847e509d4ff075266c8b0d5ca031f513f7da06 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Mar 2020 16:35:31 -0800 Subject: [PATCH 7/9] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 44deab25db695..c7e679ae9c0a1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -260,7 +260,7 @@ Indexing - Bug in :meth:`Series.xs` incorrectly returning ``Timestamp`` instead of ``datetime64`` in some object-dtype cases (:issue:`31630`) - Bug in :meth:`DataFrame.iat` incorrectly returning ``Timestamp`` instead of ``datetime`` in some object-dtype cases (:issue:`32809`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) -- +- Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) Missing ^^^^^^^ From 540d3eab42a193e71db6ce0a9862beb0ea5112da Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 6 Mar 2020 10:30:40 -0800 Subject: [PATCH 8/9] fix test on 32bit platform --- pandas/tests/indexing/test_iloc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index d5164044c4952..f6b9e9a44ba14 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -717,6 +717,8 @@ def test_iloc_setitem_list_duplicate_columns(self): tm.assert_frame_equal(df, expected) def test_iloc_setitem_series_duplicate_columns(self): - df = pd.DataFrame(np.arange(8).reshape(2, 4), columns=["A", "B", "A", "B"]) + df = pd.DataFrame( + np.arange(8, dtype=np.int64).reshape(2, 4), columns=["A", "B", "A", "B"] + ) df.iloc[:, 0] = df.iloc[:, 0].astype(np.float64) assert df.dtypes.iloc[2] == np.int64 From 3a17ba8b5ab148cee97e97d0f02ef27a24934af2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Mar 2020 10:09:38 -0700 Subject: [PATCH 9/9] comments --- pandas/core/frame.py | 4 ++-- pandas/core/indexing.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4ea0a64f8995e..83d44e546356c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2709,8 +2709,8 @@ def _setitem_frame(self, key, value): def _iset_item(self, loc: int, value): self._ensure_valid_index(value) - # passing loc to sanitize_column is a misnomer, but harmless - # with broadcast=False as long as value is never a DataFrame + # technically _sanitize_column expects a label, not a position, + # but the behavior is the same as long as we pass broadcast=False value = self._sanitize_column(loc, value, broadcast=False) NDFrame._iset_item(self, loc, value) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f67f301959a5a..c9362a0527c06 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1641,6 +1641,7 @@ def _setitem_with_indexer(self, indexer, value): pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer def isetter(loc, v): + # positional setting on column loc ser = self.obj._ixs(loc, axis=1) # perform the equivalent of a setitem on the info axis