From 2c761eb1f007a87bd2404c68f0c4849cbfb3cef9 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 27 Dec 2020 16:00:19 -0800 Subject: [PATCH 01/32] REF: setitem_with_indexer always use same path for 2D --- pandas/core/indexing.py | 58 +++++++++++++++++-------------- pandas/core/internals/blocks.py | 20 +++++++++++ pandas/tests/indexing/test_loc.py | 2 +- 3 files changed, 53 insertions(+), 27 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 693b09336fefc..138cd83c2016a 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1538,29 +1538,6 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): """ info_axis = self.obj._info_axis_number - # maybe partial set - take_split_path = not self.obj._mgr.is_single_block - - # if there is only one block/type, still have to take split path - # unless the block is one-dimensional or it can hold the value - if not take_split_path and self.obj._mgr.blocks: - if self.ndim > 1: - # in case of dict, keys are indices - val = list(value.values()) if isinstance(value, dict) else value - blk = self.obj._mgr.blocks[0] - take_split_path = not blk._can_hold_element(val) - - # if we have any multi-indexes that have non-trivial slices - # (not null slices) then we must take the split path, xref - # GH 10360, GH 27841 - if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): - for i, ax in zip(indexer, self.obj.axes): - if isinstance(ax, ABCMultiIndex) and not ( - is_integer(i) or com.is_null_slice(i) - ): - take_split_path = True - break - if isinstance(indexer, tuple): nindexer = [] for i, idx in enumerate(indexer): @@ -1629,7 +1606,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): return # align and set the values - if take_split_path: + if self.ndim > 1: # We have to operate column-wise self._setitem_with_indexer_split_path(indexer, value, name) else: @@ -1648,17 +1625,43 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): raise IndexError("too many indices for array") if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: raise ValueError(r"Cannot set values with ndim > 2") - if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): from pandas import Series value = self._align_series(indexer, Series(value)) + info_idx = indexer[1] + pi = indexer[0] + + if com.is_null_slice(info_idx) and is_scalar(value): + # We can go directly through BlockManager.setitem without worrying + # about alignment. + # TODO: do we need to do some kind of copy_with_setting check? + self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) + return + + if is_integer(info_idx): + if is_integer(pi): + # We need to watch out for case where we are treating a listlike + # as a scalar, e.g. test_setitem_iloc_scalar_single for JSONArray + + mgr = self.obj._mgr + blkno = mgr.blknos[info_idx] + blkloc = mgr.blklocs[info_idx] + blk = mgr.blocks[blkno] + + if blk._can_hold_element(value): + # NB: we are assuming here that _can_hold_element is accurate + # TODO: do we need to do some kind of copy_with_setting check? + self.obj._check_is_chained_assignment_possible() + blk.setitem_inplace((pi, blkloc), value) + self.obj._maybe_update_cacher(clear=True) + return + # Ensure we have something we can iterate over info_axis = indexer[1] ilocs = self._ensure_iterable_column_indexer(info_axis) - pi = indexer[0] lplane_indexer = length_of_indexer(pi, self.obj.index) # lplane_indexer gives the expected length of obj[indexer[0]] @@ -1775,6 +1778,9 @@ def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame", name: s raise ValueError("Setting with non-unique columns is not allowed.") else: + # TODO: not totally clear why we are requiring this + self._align_frame(indexer[0], value) + for loc in ilocs: item = self.obj.columns[loc] if item in value: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ea1b8259eeadd..b51c06152c979 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -991,6 +991,26 @@ def setitem(self, indexer, value): block = self.make_block(values) return block + def setitem_inplace(self, indexer, value): + """ + setitem but only inplace. + + Notes + ----- + Assumes self is 2D and that indexer is a 2-tuple. + """ + if lib.is_scalar(value) and not self.is_extension: + # Convert timedelta/datetime to timedelta64/datetime64 + value = convert_scalar_for_putitemlike(value, self.dtype) + + pi = indexer[0] + if self.is_extension: + # TODO(EA2D): not needed with 2D EAs + self.values[pi] = value + else: + blkloc = indexer[1] + self.values[blkloc, pi] = value + def _putmask_simple(self, mask: np.ndarray, value: Any): """ Like putmask but diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 89315b16937b1..204806b9f2b5e 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -995,7 +995,7 @@ def test_loc_setitem_empty_append_raises(self): with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "cannot copy sequence with size 2 to array axis with dimension 0" + msg = "Must have equal len keys and value when setting with an iterable" with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data From 27bd89d3053a67d361d00e9118f71960217abe68 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 8 Jan 2021 14:21:08 -0800 Subject: [PATCH 02/32] fixed xfail --- pandas/tests/indexing/test_loc.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d9003a3c84e44..830e175962531 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -7,7 +7,6 @@ import numpy as np import pytest -from pandas.compat.numpy import is_numpy_dev import pandas.util._test_decorators as td import pandas as pd @@ -986,7 +985,6 @@ def test_loc_setitem_empty_append_single_value(self): df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(is_numpy_dev, reason="gh-35481") def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe From a5c1f5ed6f82a7b606c326fb9b1b5614d9c9d813 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 26 Jan 2021 20:13:17 -0800 Subject: [PATCH 03/32] REF: DataFrame._setitem_array dont use iloc.__setitem__ --- pandas/core/frame.py | 73 +++++++++++++++++---- pandas/tests/frame/indexing/test_setitem.py | 4 +- pandas/tests/indexing/test_indexing.py | 8 ++- 3 files changed, 71 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c7585b21abe99..e15ff6efc9442 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3211,9 +3211,10 @@ def _setitem_slice(self, key: slice, value): self._check_setitem_copy() self.iloc[key] = value - def _setitem_array(self, key, value): + def _setitem_array(self, key: Sequence[Hashable], value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): + # bool indexer is indexing along rows if len(key) != len(self.index): raise ValueError( f"Item wrong length {len(key)} instead of {len(self.index)}!" @@ -3222,19 +3223,67 @@ def _setitem_array(self, key, value): indexer = key.nonzero()[0] self._check_setitem_copy() self.iloc[indexer] = value + + elif isinstance(key, np.ndarray) and key.ndim != 1: + # FIXME: kludge just to get the right exception message + # in test_setitem_ndarray_3d + raise ValueError( + f"Buffer has wrong number of dimensions (expected 1, got {key.ndim})" + ) + + elif not isinstance(value, (np.ndarray, DataFrame)) and np.ndim(value) > 1: + # list of lists + value = DataFrame(value) + return self._setitem_array(key, value) + + elif not self.columns.is_unique and not all( + is_hashable(x) and x in self.columns for x in key + ): + raise NotImplementedError + else: - if isinstance(value, DataFrame) and self.columns.is_unique: - if len(value.columns) != len(key): - raise ValueError("Columns must be same length as key") - for k1, k2 in zip(key, value.columns): - self[k1] = value[k2] + # Note: unlike self.iloc[:, indexer] = value, this will + # never try to overwrite values inplace + + def igetitem(val, i: int): + if isinstance(val, np.ndarray) and val.ndim == 2: + return val[:, i] + elif isinstance(val, DataFrame): + return val.iloc[:, i] + elif not is_list_like(val): + return val + else: + return val[i] + + if self.columns.is_unique: + key_len = len(key) + else: + indexer, missing = self.columns.get_indexer_non_unique(key) + if len(missing) == 0: + key_len = len(indexer) + else: + key_len = np.shape(value)[-1] + + if not is_list_like(value): + vlen = key_len + else: + vlen = np.shape(value)[-1] + + if vlen != key_len: + # TODO: is this only if all are contained? + raise ValueError("Columns must be same length as key") + + if self.columns.is_unique: + # easy case + for i, col in enumerate(key): + self[col] = igetitem(value, i) + else: - self.loc._ensure_listlike_indexer(key, axis=1, value=value) - indexer = self.loc._get_listlike_indexer( - key, axis=1, raise_missing=False - )[1] - self._check_setitem_copy() - self.iloc[:, indexer] = value + mask = self.columns.isin(key) + ilocs = mask.nonzero()[0] + for i, iloc in enumerate(ilocs): + # TODO: sure this is never-inplace? + self._iset_item(iloc, igetitem(value, i)) def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 4f8ac49cb17ec..b54421d76c446 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -361,9 +361,11 @@ def test_setitem_frame_duplicate_columns(self): [np.nan, 1, 2, np.nan, 4, 5], [np.nan, 1, 2, np.nan, 4, 5], ], - columns=cols, dtype="object", ) + expected[2] = expected[2].astype(np.int64) + expected[5] = expected[5].astype(np.int64) + expected.columns = cols tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("cols", [["a", "b", "c"], ["a", "a", "a"]]) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index f67341ab176d7..596b437860747 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -99,7 +99,13 @@ def test_setitem_ndarray_3d(self, index, frame_or_series, indexer): ) else: err = ValueError - msg = r"Buffer has wrong number of dimensions \(expected 1, got 3\)|" + msg = "|".join( + [ + r"Buffer has wrong number of dimensions \(expected 1, got 3\)", + "Cannot set values with ndim > 1", + "Index data must be 1-dimensional", + ] + ) with pytest.raises(err, match=msg): idxr[nd3] = 0 From 1789fcad0a1fdc44f3098eb2ece4c5083a4e9e1d Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Feb 2021 17:08:22 -0800 Subject: [PATCH 04/32] mypy fixup --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 70aa4fb13ac35..ae77384357df5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3218,7 +3218,7 @@ def _setitem_slice(self, key: slice, value): self._check_setitem_copy() self.iloc[key] = value - def _setitem_array(self, key: Sequence[Hashable], value): + def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): # bool indexer is indexing along rows From a1cdd1931ff24d4a54777e6312957c282b6cda24 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Feb 2021 19:43:07 -0800 Subject: [PATCH 05/32] mypy fixup --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 09c8c46168e24..f2176496fa637 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3285,7 +3285,7 @@ def igetitem(obj, i: int): # Using self.iloc[:, i] = ... may set values inplace, which # by convention we do not do in __setitem__ try: - self.columns = range(len(self.columns)) + self.columns = Index(range(len(self.columns))) for i, iloc in enumerate(ilocs): self[iloc] = igetitem(value, i) finally: From c32a427dfbfad2c3bbb99b4ec873606dc143c1e9 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Feb 2021 10:02:13 -0800 Subject: [PATCH 06/32] 32bit compat --- pandas/tests/frame/indexing/test_setitem.py | 2 ++ pandas/tests/reshape/test_pivot.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index e48a40f7ac110..d9827cd937ce6 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -322,6 +322,8 @@ def test_setitem_complete_column_with_array(self): "d": [1, 1, 1], } ) + expected["c"] = expected["c"].astype(np.intp) + expected["d"] = expected["d"].astype(np.intp) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"]) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f9b2a02920841..bc5f00d8048ae 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -969,7 +969,7 @@ def test_margins_dtype(self): expected = DataFrame( {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi ).rename_axis("C", axis=1) - expected["All"] = expected["dull"] + expected["shiny"] + expected["All"] = (expected["dull"] + expected["shiny"]).astype(np.intp) result = df.pivot_table( values="D", From 8a283d94a5e73233c993494d18c608490613c763 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Feb 2021 13:16:59 -0800 Subject: [PATCH 07/32] troubleshoot windows builds --- pandas/tests/frame/indexing/test_setitem.py | 6 ++++-- pandas/tests/reshape/test_pivot.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index d9827cd937ce6..d30a848564882 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -322,8 +322,10 @@ def test_setitem_complete_column_with_array(self): "d": [1, 1, 1], } ) - expected["c"] = expected["c"].astype(np.intp) - expected["d"] = expected["d"].astype(np.intp) + expected["c"] = expected["c"].astype(arr.dtype) + expected["d"] = expected["d"].astype(arr.dtype) + assert expected["c"].dtype == arr.dtype + assert expected["d"].dtype == arr.dtype tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"]) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index bc5f00d8048ae..6ffe4a458016d 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -969,7 +969,9 @@ def test_margins_dtype(self): expected = DataFrame( {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi ).rename_axis("C", axis=1) - expected["All"] = (expected["dull"] + expected["shiny"]).astype(np.intp) + expected["All"] = expected["dull"] + expected["shiny"] + expected["All"] = expected["All"].astype(np.intp) + assert expected.dtypes["All"] == np.intp result = df.pivot_table( values="D", From 9f6d8e7af3d0ee66256929994bb0e21d9188f7fd Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Feb 2021 18:19:40 -0800 Subject: [PATCH 08/32] troubleshoot 32bit --- pandas/tests/reshape/test_pivot.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 6ffe4a458016d..301163517223b 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -962,7 +962,7 @@ def test_margins_dtype(self): # GH 17013 df = self.data.copy() - df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3) + df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3).astype("i8") mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) @@ -970,8 +970,6 @@ def test_margins_dtype(self): {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi ).rename_axis("C", axis=1) expected["All"] = expected["dull"] + expected["shiny"] - expected["All"] = expected["All"].astype(np.intp) - assert expected.dtypes["All"] == np.intp result = df.pivot_table( values="D", From c107e772dc73978c10e926c73b1e99affc1b21af Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 7 Feb 2021 12:51:30 -0800 Subject: [PATCH 09/32] TST: missed raising cases --- pandas/core/frame.py | 6 +++--- pandas/tests/frame/indexing/test_setitem.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f7d8ebecb595b..06c402ee39e5c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3245,9 +3245,9 @@ def _setitem_array(self, key, value): def _iset_not_inplace(self, key, value): def igetitem(obj, i: int): - if isinstance(obj, DataFrame): - return obj.iloc[:, i] - elif isinstance(obj, np.ndarray): + # Note: we catch DataFrame obj before getting here, but + # hypothetically would return obj.iloc[:, i] + if isinstance(obj, np.ndarray): return obj[..., i] else: return obj[i] diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index d30a848564882..e706133eb6b1e 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -372,6 +372,17 @@ def test_setitem_frame_duplicate_columns(self): expected.columns = cols tm.assert_frame_equal(df, expected) + def test_setitem_frame_duplicate_columns_size_mismatch(self): + # GH#39510 + cols = ["A", "B", "C"] * 2 + df = DataFrame(index=range(3), columns=cols) + with pytest.raises(ValueError, match="Columns must be same length as key"): + df[["A"]] = (0, 3, 5) + + df2 = df.iloc[:, :3] # unique columns + with pytest.raises(ValueError, match="Columns must be same length as key"): + df2[["A"]] = (0, 3, 5) + @pytest.mark.parametrize("cols", [["a", "b", "c"], ["a", "a", "a"]]) def test_setitem_df_wrong_column_number(self, cols): # GH#38604 From 4cfe863358368acfaab2ab4e419ccaef0ffcc4d5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Feb 2021 14:06:20 -0800 Subject: [PATCH 10/32] port phofl tests --- pandas/tests/frame/indexing/test_setitem.py | 39 +++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index cfc49d982cc0f..3af345d2704f2 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -784,3 +784,42 @@ def test_setitem_clear_caches(self): assert df["z"] is not foo tm.assert_series_equal(df["z"], expected) + + def test_setitem_duplicate_columns_not_inplace(self): + # GH#39510 + cols = ["A", "B"] * 2 + df = DataFrame(0.0, index=[0], columns=cols) + df_copy = df.copy() + df_view = df[:] + df["B"] = (2, 5) + + expected = DataFrame([[0.0, 2, 0.0, 5]], columns=cols) + tm.assert_frame_equal(df_view, df_copy) + tm.assert_frame_equal(df, expected) + + @pytest.mark.xfail(reason="Setitem with same dtype still changing inplace") + @pytest.mark.parametrize("value", [1, np.array([[1], [1]]), [[1], [1]]]) + def test_setitem_same_dtype_not_inplace(self, value): + # GH#39510 + cols = ["A", "B"] + df = DataFrame(0, index=[0, 1], columns=cols) + df_copy = df.copy() + df_view = df[:] + df[["B"]] = value + + expected = DataFrame([[0, 1], [0, 1]], columns=cols) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df_view, df_copy) + + @pytest.mark.parametrize("value", [1.0, np.array([[1.0], [1.0]]), [[1.0], [1.0]]]) + def test_setitem_listlike_key_scalar_value_not_inplace(self, value): + # GH#39510 + cols = ["A", "B"] + df = DataFrame(0, index=[0, 1], columns=cols) + df_copy = df.copy() + df_view = df[:] + df[["B"]] = value + + expected = DataFrame([[0, 1.0], [0, 1.0]], columns=cols) + tm.assert_frame_equal(df_view, df_copy) + tm.assert_frame_equal(df, expected) From 7ea792f267b5e610ee00ff6d3bdcf39795af689b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 07:15:23 -0800 Subject: [PATCH 11/32] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f7204ceb9d412..f0dfcb3788581 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -449,6 +449,7 @@ Indexing - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) +- Bug in :meth:`DataFrame.__setitem__` in some cases overwriting existing values instead of inserting a new array (:issue:`39510`) Missing ^^^^^^^ From 20f6a161899cbcf1f7d3be23b55eabb427a310cc Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 15:19:09 -0800 Subject: [PATCH 12/32] clarify whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f0dfcb3788581..32860b9319459 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -449,7 +449,7 @@ Indexing - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) -- Bug in :meth:`DataFrame.__setitem__` in some cases overwriting existing values instead of inserting a new array (:issue:`39510`) +- Bug in :meth:`DataFrame.__setitem__` when setting multiple columns incorrectly overwriting existing data rather than inserting new arrays (:issue:`39510`) Missing ^^^^^^^ From 2d0cf9e8a6d4201855dc092618d5e35cc38c617c Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 12:40:01 -0800 Subject: [PATCH 13/32] comment --- pandas/core/frame.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e5b44ede283a7..bdafcc53b4032 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3516,6 +3516,12 @@ def _setitem_array(self, key, value): self._iset_not_inplace(key, value) def _iset_not_inplace(self, key, value): + # GH#39510 when setting with df[key] = obj with a list-like key and + # list-like value, we iterate over those listlikes and set columns + # one at a time. This is different from dispatching to + # `self.loc[:, key]= value` because loc.__setitem__ may overwrite + # data inplace, whereas this will insert new arrays. + def igetitem(obj, i: int): # Note: we catch DataFrame obj before getting here, but # hypothetically would return obj.iloc[:, i] From e1ed083207a59067fcdbd017a74d0b5a1dbda632 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 12:51:40 -0800 Subject: [PATCH 14/32] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 36 +++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9e05e1c80a1d1..d539512edeb64 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -223,6 +223,41 @@ In pandas 1.3.0, ``df`` continues to share data with ``values`` np.shares_memory(df["A"], values) +.. _whatsnew_130.notable_bug_fixes.setitem_never_inplace: + +Never Operate Inplace When Setting ``frame[keys] = values`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When setting multiple columns using ``frame[keys] = values`` new arrays will +replace pre-existing arrays for these keys, which will *not* be over-written +(:issue:`39510`). As a result, the columns will retain the dtype(s) of ``values``, +never casting to the dtypes of the existing arrays. + +.. ipython:: python + + df = pd.DataFrame(range(3), columns=["A"], dtype="float64") + df[["A"]] = 5 + +In the old behavior, ``5`` was cast to ``float64`` and inserted into the existing +array backing ``df``: + +*pandas 1.2.x* + +.. code-block:: ipython + + In [1]: df.dtypes + Out[1]: + A float64 + +In the new behavior, we get a new array, and retain an integer-dtyped ``5``: + +*pandas 1.3.0* + +.. ipython:: python + + df.dtypes + + .. _whatsnew_130.notable_bug_fixes.setitem_with_bool_casting: Consistent Casting With Setting Into Boolean Series @@ -495,7 +530,6 @@ Indexing - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) -- Bug in :meth:`DataFrame.__setitem__` when setting multiple columns incorrectly overwriting existing data rather than inserting new arrays (:issue:`39510`) Missing ^^^^^^^ From df9d87fe8a172eebeb45434522f46d412534b88c Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 17:41:16 -0800 Subject: [PATCH 15/32] checkpoint passing --- pandas/core/frame.py | 17 +++++++- pandas/core/indexers.py | 2 + pandas/core/indexing.py | 54 +++++++++++++++++++++++--- pandas/core/internals/blocks.py | 7 ---- pandas/core/internals/managers.py | 1 + pandas/tests/extension/test_numpy.py | 20 ---------- pandas/tests/indexing/test_iloc.py | 17 ++------ pandas/tests/indexing/test_indexing.py | 35 ++++++++++++----- pandas/tests/indexing/test_loc.py | 4 +- pandas/tests/indexing/test_partial.py | 4 +- 10 files changed, 102 insertions(+), 59 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6d9f6ea74ac5a..ffa7e363f0cb0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4950,12 +4950,27 @@ def _replace_columnwise( target, value = mapping[ax[i]] newobj = ser.replace(target, value, regex=regex) - res.iloc[:, i] = newobj + res._isetitem(i, newobj) if inplace: return return res.__finalize__(self) + def _isetitem(self, loc: int, value): + cols = self.columns + if cols.is_unique: + col = cols[loc] + self[col] = value + return + + # Otherwise we temporarily pin unique columns and call __setitem__ + newcols = Index(range(len(cols))) + try: + self.columns = newcols + self[loc] = value + finally: + self.columns = cols + @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift( self, diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 86d6b772fe2e4..6b190a9f85e75 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -339,6 +339,8 @@ def length_of_indexer(indexer, target=None) -> int: # GH#25774 return indexer.sum() return len(indexer) + elif isinstance(indexer, range): + return (indexer.stop - indexer.start) // indexer.step elif not is_list_like_indexer(indexer): return 1 raise AssertionError("cannot find the length of the indexer") diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4bfa93fb3dd61..f516dc4886361 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1676,6 +1676,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): # Above we only set take_split_path to True for 2D cases assert self.ndim == 2 + orig = indexer if not isinstance(indexer, tuple): indexer = _tuplify(self.ndim, indexer) if len(indexer) > self.ndim: @@ -1689,8 +1690,20 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): info_idx = indexer[1] pi = indexer[0] + if ( + isinstance(pi, ABCDataFrame) + and orig is pi + and hasattr(self.obj._mgr, "blocks") + and len(self.obj._mgr.blocks) == 1 + ): + # FIXME: kludge + return self._setitem_single_block(orig, value, name) - if com.is_null_slice(info_idx) and is_scalar(value): + if ( + com.is_null_slice(info_idx) + and is_scalar(value) + and not isinstance(pi, ABCDataFrame) + ): # We can go directly through BlockManager.setitem without worrying # about alignment. # TODO: do we need to do some kind of copy_with_setting check? @@ -1734,7 +1747,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): # We are setting multiple rows in a single column. - self._setitem_single_column(ilocs[0], value, pi) + self._setitem_iat_loc(ilocs[0], pi, value) + # self._setitem_single_column(ilocs[0], value, pi) elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): # We are trying to set N values into M entries of a single @@ -1758,7 +1772,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): elif len(ilocs) == len(value): # We are setting multiple columns in a single row. for loc, v in zip(ilocs, value): - self._setitem_single_column(loc, v, pi) + self._setitem_iat_loc(loc, pi, v) + # self._setitem_single_column(loc, v, pi) elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: # This is a setitem-with-expansion, see @@ -1796,6 +1811,7 @@ def _setitem_with_indexer_2d_value(self, indexer, value): for i, loc in enumerate(ilocs): # setting with a list, re-coerces + # self._setitem_iat_loc(loc, pi, value[:, i].tolist()) self._setitem_single_column(loc, value[:, i].tolist(), pi) def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): @@ -1812,7 +1828,8 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str if name == "iloc": for i, loc in enumerate(ilocs): val = value.iloc[:, i] - self._setitem_single_column(loc, val, pi) + self._setitem_iat_loc(loc, pi, val) + # self._setitem_single_column(loc, val, pi) elif not unique_cols and value.columns.equals(self.obj.columns): # We assume we are already aligned, see @@ -1829,7 +1846,8 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str else: val = np.nan - self._setitem_single_column(loc, val, pi) + self._setitem_iat_loc(loc, pi, val) + # self._setitem_single_column(loc, val, pi) elif not unique_cols: raise ValueError("Setting with non-unique columns is not allowed.") @@ -1848,7 +1866,8 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str else: val = np.nan - self._setitem_single_column(loc, val, pi) + self._setitem_iat_loc(loc, pi, val) + # self._setitem_single_column(loc, val, pi) def _setitem_single_column(self, loc: int, value, plane_indexer): """ @@ -1882,6 +1901,29 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # reset the sliced object if unique self.obj._iset_item(loc, ser) + def _setitem_iat_loc(self, loc: int, pi, value): + # TODO: likely a BM method? + mgr = self.obj._mgr + blkno = mgr.blknos[loc] + blkloc = mgr.blklocs[loc] + blk = mgr.blocks[blkno] + assert blk.mgr_locs[blkloc] == loc + + if blk._can_hold_element(value): + # NB: we are assuming here that _can_hold_element is accurate + # TODO: do we need to do some kind of copy_with_setting check? + try: + self.obj._check_is_chained_assignment_possible() + blk.setitem_inplace((pi, blkloc), value) + self.obj._maybe_update_cacher(clear=True) + except ValueError: + if blk.is_extension: + # FIXME: kludge bc _can_hold_element is wrong for EABLock + return self._setitem_single_column(loc, value, pi) + raise + else: + self._setitem_single_column(loc, value, pi) + def _setitem_single_block(self, indexer, value, name: str): """ _setitem_with_indexer for the case when we have a single Block. diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ff5efd9b78805..7a2929324d5f0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1008,22 +1008,15 @@ def setitem(self, indexer, value): values[indexer] = value elif exact_match and is_categorical_dtype(arr_value.dtype): - # GH25495 - If the current dtype is not categorical, - # we need to create a new categorical block values[indexer] = value elif exact_match and is_ea_value: - # GH#32395 if we're going to replace the values entirely, just - # substitute in the new array if not self.is_object and isinstance(value, (IntegerArray, FloatingArray)): values[indexer] = value.to_numpy(value.dtype.numpy_dtype) else: values[indexer] = np.asarray(value) - # if we are an exact match (ex-broadcasting), - # then use the resultant dtype elif exact_match: - # We are setting _all_ of the array's values, so can cast to new dtype values[indexer] = value elif is_ea_value: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b656c9e83e1a8..49464fe3b4128 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1464,6 +1464,7 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True """ Take items along any axis. """ + # TODO: should these be np.intp? indexer = ( np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") if isinstance(indexer, slice) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 718ef087e47d3..3e91757ece9fe 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -350,26 +350,6 @@ def test_setitem_sequence_broadcasts(self, data, box_in_series): # length than the value super().test_setitem_sequence_broadcasts(data, box_in_series) - @skip_nested - def test_setitem_loc_scalar_mixed(self, data): - # AssertionError - super().test_setitem_loc_scalar_mixed(data) - - @skip_nested - def test_setitem_loc_scalar_multiple_homogoneous(self, data): - # AssertionError - super().test_setitem_loc_scalar_multiple_homogoneous(data) - - @skip_nested - def test_setitem_iloc_scalar_mixed(self, data): - # AssertionError - super().test_setitem_iloc_scalar_mixed(data) - - @skip_nested - def test_setitem_iloc_scalar_multiple_homogoneous(self, data): - # AssertionError - super().test_setitem_iloc_scalar_multiple_homogoneous(data) - @skip_nested @pytest.mark.parametrize("setter", ["loc", None]) def test_setitem_mask_broadcast(self, data, setter): diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index ad5f54174952d..2944a1f6ac05f 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -74,24 +74,14 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key): orig_vals = df.values indexer(df)[key, 0] = cat - overwrite = isinstance(key, slice) and key == slice(None) - - if overwrite: - # TODO: GH#39986 this probably shouldn't behave differently - expected = DataFrame({0: cat}) - assert not np.shares_memory(df.values, orig_vals) - else: - expected = DataFrame({0: cat}).astype(object) - assert np.shares_memory(df.values, orig_vals) + expected = DataFrame({0: cat.astype(object)}) + assert np.shares_memory(df.values, orig_vals) tm.assert_frame_equal(df, expected) # check we dont have a view on cat (may be undesired GH#39986) df.iloc[0, 0] = "gamma" - if overwrite: - assert cat[0] != "gamma" - else: - assert cat[0] != "gamma" + assert cat[0] != "gamma" @pytest.mark.parametrize("box", [pd_array, Series]) def test_iloc_setitem_ea_inplace(self, frame_or_series, box): @@ -824,7 +814,6 @@ def test_series_indexing_zerodim_np_array(self): result = s.iloc[np.array(0)] assert result == 1 - @pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/33457") def test_iloc_setitem_categorical_updates_inplace(self): # Mixed dtype ensures we go through take_split_path in setitem_with_indexer cat = Categorical(["A", "B", "C"]) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 67484c9a5c13f..99aa7110cb912 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -529,6 +529,9 @@ def test_astype_assignment(self): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + # original (object) array can hold new values, so setting is inplace + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() @@ -536,6 +539,9 @@ def test_astype_assignment(self): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + # original (object) array can hold new values, so setting is inplace + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -544,6 +550,8 @@ def test_astype_assignment(self): expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + # df["A"] can hold the RHS, so the assignment is inplace, remains object + expected["A"] = expected["A"].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() @@ -551,19 +559,28 @@ def test_astype_assignment(self): expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + # original (object) array can hold new values, so setting is inplace + expected["B"] = expected["B"].astype(object) + expected["C"] = expected["C"].astype(object) tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): # full replacements / no nans - df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) - df.iloc[:, 0] = df["A"].astype(np.int64) - expected = DataFrame({"A": [1, 2, 3, 4]}) - tm.assert_frame_equal(df, expected) - - df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) - df.loc[:, "A"] = df["A"].astype(np.int64) - expected = DataFrame({"A": [1, 2, 3, 4]}) - tm.assert_frame_equal(df, expected) + # the new values can all be held by the existing array, so the assignment + # is in-place + orig = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) + value = orig.astype(np.int64) + # expected = DataFrame({"A": [1, 2, 3, 4]}) + + df = orig.copy() + df.iloc[ + :, 0 + ] = value # <- not yet, bc value is a DataFrame; would work with value["A"] + tm.assert_frame_equal(df, orig) + + df = orig.copy() + df.loc[:, "A"] = value + tm.assert_frame_equal(df, orig) @pytest.mark.parametrize("indexer", [tm.getitem, tm.loc]) def test_index_type_coercion(self, indexer): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index f23e5a86d38a2..5a59a36696e41 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -781,6 +781,7 @@ def test_loc_coercion(self): result = df.iloc[[1]] tm.assert_series_equal(result.dtypes, expected) + def test_loc_coercion2(self): # 12045 import datetime @@ -795,6 +796,7 @@ def test_loc_coercion(self): result = df.iloc[[1]] tm.assert_series_equal(result.dtypes, expected) + def test_loc_coercion3(self): # 11594 df = DataFrame({"text": ["some words"] + [None] * 9}) expected = df.dtypes @@ -1208,7 +1210,7 @@ def test_loc_setitem_single_row_categorical(self): df.loc[:, "Alpha"] = categories result = df["Alpha"] - expected = Series(categories, index=df.index, name="Alpha") + expected = Series(categories, index=df.index, name="Alpha").astype(object) tm.assert_series_equal(result, expected) def test_loc_setitem_datetime_coercion(self): diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 468e4cad742df..b4904961f7ac5 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -100,7 +100,8 @@ def test_partial_setting(self): tm.assert_frame_equal(df, expected) # mixed dtype frame, overwrite - expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0, 2, 4])})) + # float64 can hold df.loc[:, "A"], so setting is inplace + expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0.0, 2.0, 4.0])})) df = df_orig.copy() df["B"] = df["B"].astype(np.float64) df.loc[:, "B"] = df.loc[:, "A"] @@ -120,6 +121,7 @@ def test_partial_setting(self): df.loc[:, "C"] = df.loc[:, "A"] tm.assert_frame_equal(df, expected) + def test_partial_setting2(self): # GH 8473 dates = date_range("1/1/2000", periods=8) df_orig = DataFrame( From be54f150f5b2e2abf4838ce8fce2672013484861 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Mar 2021 09:03:51 -0800 Subject: [PATCH 16/32] Fix dtype in loc-setitem-with-expansion --- pandas/core/indexing.py | 30 ++++++++++++++++++------------ pandas/tests/indexing/test_loc.py | 10 +++++----- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f516dc4886361..a9798a1e1d558 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -632,12 +632,12 @@ def __call__(self, axis=None): new_self.axis = axis return new_self - def _get_setitem_indexer(self, key): + def _get_setitem_indexer(self, key, value): """ Convert a potentially-label-based key into a positional indexer. """ if self.name == "loc": - self._ensure_listlike_indexer(key) + self._ensure_listlike_indexer(key, value=value) if self.axis is not None: return self._convert_tuple(key, is_setter=True) @@ -684,9 +684,11 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): if self.ndim != 2: return + pi = None if isinstance(key, tuple) and len(key) > 1: # key may be a tuple if we are .loc # if length of key is > 1 set key to column part + pi = key[0] key = key[column_axis] axis = column_axis @@ -700,16 +702,25 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): # GH#38148 keys = self.obj.columns.union(key, sort=False) - self.obj._mgr = self.obj._mgr.reindex_axis( - keys, axis=0, copy=False, consolidate=False, only_slice=True - ) + if isinstance(value, ABCDataFrame) and com.is_null_slice(pi): + # We are setting obj.loc[:, new_keys] = newframe + # Setting these directly instead of reindexing keeps + # us from converting integer dtypes to floats + new_keys = keys.difference(self.obj.columns) + self.obj[new_keys] = value[new_keys] + + else: + + self.obj._mgr = self.obj._mgr.reindex_axis( + keys, axis=0, copy=False, consolidate=False, only_slice=True + ) def __setitem__(self, key, value): if isinstance(key, tuple): key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: key = com.apply_if_callable(key, self.obj) - indexer = self._get_setitem_indexer(key) + indexer = self._get_setitem_indexer(key, value) self._has_valid_setitem_indexer(key) iloc = self if self.name == "iloc" else self.obj.iloc @@ -1565,7 +1576,7 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): """ return key - def _get_setitem_indexer(self, key): + def _get_setitem_indexer(self, key, value): # GH#32257 Fall through to let numpy do validation return key @@ -1748,7 +1759,6 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): # We are setting multiple rows in a single column. self._setitem_iat_loc(ilocs[0], pi, value) - # self._setitem_single_column(ilocs[0], value, pi) elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): # We are trying to set N values into M entries of a single @@ -1773,7 +1783,6 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): # We are setting multiple columns in a single row. for loc, v in zip(ilocs, value): self._setitem_iat_loc(loc, pi, v) - # self._setitem_single_column(loc, v, pi) elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: # This is a setitem-with-expansion, see @@ -1829,7 +1838,6 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str for i, loc in enumerate(ilocs): val = value.iloc[:, i] self._setitem_iat_loc(loc, pi, val) - # self._setitem_single_column(loc, val, pi) elif not unique_cols and value.columns.equals(self.obj.columns): # We assume we are already aligned, see @@ -1847,7 +1855,6 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str val = np.nan self._setitem_iat_loc(loc, pi, val) - # self._setitem_single_column(loc, val, pi) elif not unique_cols: raise ValueError("Setting with non-unique columns is not allowed.") @@ -1867,7 +1874,6 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str val = np.nan self._setitem_iat_loc(loc, pi, val) - # self._setitem_single_column(loc, val, pi) def _setitem_single_column(self, loc: int, value, plane_indexer): """ diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 746599828157c..7bc54765f695b 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -281,14 +281,14 @@ def test_loc_setitem_slice(self): def test_loc_setitem_dtype(self): # GH31340 df = DataFrame({"id": ["A"], "a": [1.2], "b": [0.0], "c": [-2.5]}) + orig = df.copy() cols = ["a", "b", "c"] + # the float32 data (for columns "b" and "c") can be held in the existing + # float64 columns losslessly, so we keep the original underlying arrays + # and our dtypes are not changed. df.loc[:, cols] = df.loc[:, cols].astype("float32") - expected = DataFrame( - {"id": ["A"], "a": [1.2], "b": [0.0], "c": [-2.5]}, dtype="float32" - ) # id is inferred as object - - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, orig) def test_getitem_label_list_with_missing(self): s = Series(range(3), index=["a", "b", "c"]) From fa2006b6f3439f87770e2741708e22dde18fe14d Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 15:38:05 -0800 Subject: [PATCH 17/32] ArrayManager compat --- pandas/core/indexing.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index a9798a1e1d558..c1ed9d54c1cb9 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -56,6 +56,7 @@ length_of_indexer, ) from pandas.core.indexes.api import Index +from pandas.core.internals import ArrayManager if TYPE_CHECKING: from pandas import ( @@ -1909,6 +1910,10 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): def _setitem_iat_loc(self, loc: int, pi, value): # TODO: likely a BM method? + if isinstance(self.obj._mgr, ArrayManager): + # TODO: implement this correctly for ArrayManager + return self._setitem_single_column(loc, value, pi) + mgr = self.obj._mgr blkno = mgr.blknos[loc] blkloc = mgr.blklocs[loc] From 1af868610dfb3c6d313c7fa567bab71aa20f5bca Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 18:57:30 -0800 Subject: [PATCH 18/32] xfail only for BM --- pandas/tests/frame/indexing/test_setitem.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index eb2d48dfc9d2a..e8cdcfcaafa86 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -926,10 +926,15 @@ def test_setitem_duplicate_columns_not_inplace(self): tm.assert_frame_equal(df_view, df_copy) tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(reason="Setitem with same dtype still changing inplace") @pytest.mark.parametrize("value", [1, np.array([[1], [1]]), [[1], [1]]]) - def test_setitem_same_dtype_not_inplace(self, value): + def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, request): # GH#39510 + if not using_array_manager: + mark = pytest.mark.xfail( + reason="Setitem with same dtype still changing inplace" + ) + request.node.add_marker(mark) + cols = ["A", "B"] df = DataFrame(0, index=[0, 1], columns=cols) df_copy = df.copy() From 1d24f719ea33979b93701bc896bff5ccf1ec6d56 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 19:18:39 -0800 Subject: [PATCH 19/32] mypy fixup --- pandas/core/internals/blocks.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ca2fb2d4e6f52..86f6fde4d411b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1015,17 +1015,18 @@ def setitem_inplace(self, indexer, value): ----- Assumes self is 2D and that indexer is a 2-tuple. """ - if lib.is_scalar(value) and not self.is_extension: + if lib.is_scalar(value) and isinstance(self.dtype, np.dtype): # Convert timedelta/datetime to timedelta64/datetime64 value = convert_scalar_for_putitemlike(value, self.dtype) pi = indexer[0] - if self.is_extension: - # TODO(EA2D): not needed with 2D EAs - self.values[pi] = value - else: + values = self.values + if isinstance(values, np.ndarray): blkloc = indexer[1] - self.values[blkloc, pi] = value + values[blkloc, pi] = value + else: + # TODO(EA2D): special case not needed with 2D EAs + values[pi] = value def putmask(self, mask, new) -> List[Block]: """ From d8c7c4c09c4caf9c178582184fdc1f0dd65c1ea4 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 21:38:11 -0800 Subject: [PATCH 20/32] special case ArrayManager --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c1ed9d54c1cb9..5665857c73736 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1722,7 +1722,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) return - if is_integer(info_idx): + if is_integer(info_idx) and not isinstance(self.obj._mgr, ArrayManager): if is_integer(pi): # We need to watch out for case where we are treating a listlike # as a scalar, e.g. test_setitem_iloc_scalar_single for JSONArray From 6791a978f4eeaa07b61ac8d151268273e98641e8 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Mar 2021 20:38:33 -0700 Subject: [PATCH 21/32] ArrayManager compat --- pandas/core/indexing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 5665857c73736..47713d51674d6 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1715,6 +1715,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): com.is_null_slice(info_idx) and is_scalar(value) and not isinstance(pi, ABCDataFrame) + and not isinstance(self.obj._mgr, ArrayManager) ): # We can go directly through BlockManager.setitem without worrying # about alignment. From 8e8e7114717da9a94b8a6d67c196e02ca8e0bb59 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Mar 2021 19:20:50 -0700 Subject: [PATCH 22/32] arraymanager compat for tests --- pandas/tests/indexing/test_indexing.py | 43 +++++++++++++++++--------- pandas/tests/indexing/test_loc.py | 26 +++++++++++----- pandas/tests/indexing/test_partial.py | 10 ++++-- 3 files changed, 54 insertions(+), 25 deletions(-) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 92389e65435a9..fef7a66223be2 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -527,7 +527,7 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="'2011'"): df.loc["2011", 0] - def test_astype_assignment(self): + def test_astype_assignment(self, using_array_manager): # GH4312 (iloc) df_orig = DataFrame( @@ -539,9 +539,11 @@ def test_astype_assignment(self): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - # original (object) array can hold new values, so setting is inplace - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + if not using_array_manager: + # TODO(ArrayManager): get behaviors to match + # original (object) array can hold new values, so setting is inplace + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() @@ -549,9 +551,11 @@ def test_astype_assignment(self): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - # original (object) array can hold new values, so setting is inplace - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + if not using_array_manager: + # TODO(ArrayManager): get behaviors to match + # original (object) array can hold new values, so setting is inplace + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -560,8 +564,10 @@ def test_astype_assignment(self): expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - # df["A"] can hold the RHS, so the assignment is inplace, remains object - expected["A"] = expected["A"].astype(object) + if not using_array_manager: + # TODO(ArrayManager): get behaviors to match + # df["A"] can hold the RHS, so the assignment is inplace, remains object + expected["A"] = expected["A"].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() @@ -569,12 +575,14 @@ def test_astype_assignment(self): expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - # original (object) array can hold new values, so setting is inplace - expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) + if not using_array_manager: + # TODO(ArrayManager): get behaviors to match + # original (object) array can hold new values, so setting is inplace + expected["B"] = expected["B"].astype(object) + expected["C"] = expected["C"].astype(object) tm.assert_frame_equal(df, expected) - def test_astype_assignment_full_replacements(self): + def test_astype_assignment_full_replacements(self, using_array_manager): # full replacements / no nans # the new values can all be held by the existing array, so the assignment # is in-place @@ -586,11 +594,16 @@ def test_astype_assignment_full_replacements(self): df.iloc[ :, 0 ] = value # <- not yet, bc value is a DataFrame; would work with value["A"] - tm.assert_frame_equal(df, orig) + if using_array_manager: + # TODO(ArrayManager): get behaviors to match + expected = DataFrame({"A": [1, 2, 3, 4]}) + else: + expected = orig + tm.assert_frame_equal(df, expected) df = orig.copy() df.loc[:, "A"] = value - tm.assert_frame_equal(df, orig) + tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("indexer", [tm.getitem, tm.loc]) def test_index_type_coercion(self, indexer): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 97243b3f7f698..08cfda5a96a30 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -277,17 +277,26 @@ def test_loc_setitem_slice(self): expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") tm.assert_frame_equal(df2, expected) - def test_loc_setitem_dtype(self): + def test_loc_setitem_dtype(self, using_array_manager): # GH31340 df = DataFrame({"id": ["A"], "a": [1.2], "b": [0.0], "c": [-2.5]}) orig = df.copy() cols = ["a", "b", "c"] - # the float32 data (for columns "b" and "c") can be held in the existing - # float64 columns losslessly, so we keep the original underlying arrays - # and our dtypes are not changed. + df.loc[:, cols] = df.loc[:, cols].astype("float32") - tm.assert_frame_equal(df, orig) + if using_array_manager: + # TODO(ArrayManager): get behaviors to match + expected = DataFrame( + {"id": ["A"], "a": [1.2], "b": [0.0], "c": [-2.5]}, dtype="float32" + ) # id is inferred as object + else: + # the float32 data (for columns "b" and "c") can be held in the existing + # float64 columns losslessly, so we keep the original underlying arrays + # and our dtypes are not changed. + expected = orig + + tm.assert_frame_equal(df, expected) def test_getitem_label_list_with_missing(self): s = Series(range(3), index=["a", "b", "c"]) @@ -1231,14 +1240,17 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) - def test_loc_setitem_single_row_categorical(self): + def test_loc_setitem_single_row_categorical(self, using_array_manager): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) categories = Categorical(df["Alpha"], categories=["a", "b", "c"]) df.loc[:, "Alpha"] = categories result = df["Alpha"] - expected = Series(categories, index=df.index, name="Alpha").astype(object) + expected = Series(categories, index=df.index, name="Alpha") + if not using_array_manager: + # TODO(ArrayManager): get behavior to match + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_loc_setitem_datetime_coercion(self): diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index fd6f1a1b9c293..06795454d81fe 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -23,7 +23,7 @@ class TestPartialSetting: - def test_partial_setting(self): + def test_partial_setting(self, using_array_manager): # GH2578, allow ix and friends to partially set @@ -102,8 +102,12 @@ def test_partial_setting(self): tm.assert_frame_equal(df, expected) # mixed dtype frame, overwrite - # float64 can hold df.loc[:, "A"], so setting is inplace - expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0.0, 2.0, 4.0])})) + if using_array_manager: + # TODO(ArrayManager): get behavior to match + expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0, 2, 4])})) + else: + # float64 can hold df.loc[:, "A"], so setting is inplace + expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0.0, 2.0, 4.0])})) df = df_orig.copy() df["B"] = df["B"].astype(np.float64) df.loc[:, "B"] = df.loc[:, "A"] From 10ab24d1f1d92b6dfc21430f642a01d01dffa762 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 2 Apr 2021 10:30:46 -0700 Subject: [PATCH 23/32] merge master --- pandas/core/internals/blocks.py | 4 +++- pandas/tests/indexing/test_iloc.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 793cd26d0d9e2..32d36d2c33a5d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1012,7 +1012,8 @@ def setitem_inplace(self, indexer, value): pi = indexer[0] values = self.values - if isinstance(values, np.ndarray): + if isinstance(values.dtype, np.dtype): + # includes DaetimeArray, TimedeltaArray blkloc = indexer[1] values[blkloc, pi] = value else: @@ -1953,6 +1954,7 @@ def convert( copy=copy, ) res_values = ensure_block_shape(res_values, self.ndim) + res_values = ensure_wrapped_if_datetimelike(res_values) return [self.make_block(res_values)] diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index edd737d221ab7..e5049fcadad81 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -863,7 +863,7 @@ def test_iloc_setitem_categorical_updates_inplace( ) request.node.add_marker(mark) cat = Categorical(["A", "B", "C"]) - df = DataFrame({1: cat, 2: [1, 2, 3]}) + df = DataFrame({1: cat, 2: [1, 2, 3]}, copy=False) # This should modify our original values in-place df.iloc[:, 0] = cat[::-1] From 9a73ef637d69e38747ca2e8670b96a656443be65 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 30 Jul 2021 09:55:55 -0700 Subject: [PATCH 24/32] update incorrect tests --- pandas/tests/indexing/test_iloc.py | 2 +- pandas/tests/indexing/test_loc.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 3eef0b9f1035c..62106858292ad 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -1219,7 +1219,7 @@ def test_iloc_setitem_dtypes_duplicate_columns( df = DataFrame([[init_value, "str", "str2"]], columns=["a", "b", "b"]) df.iloc[:, 0] = df.iloc[:, 0].astype(dtypes) expected_df = DataFrame( - [[expected_value, "str", "str2"]], columns=["a", "b", "b"] + [[expected_value, "str", "str2"]], columns=["a", "b", "b"], dtype=object ) tm.assert_frame_equal(df, expected_df) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 80bfa9a2d6722..3ccf1cdaa19e1 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -326,14 +326,14 @@ def test_loc_setitem_dtype(self, using_array_manager): # float64 columns losslessly, so we keep the original underlying arrays # and our dtypes are not changed. expected = orig - #expected = DataFrame( + # expected = DataFrame( # { # "id": ["A"], # "a": np.array([1.2], dtype="float32"), # "b": np.array([0.0], dtype="float32"), # "c": np.array([-2.5], dtype="float32"), # } - #) # id is inferred as object + # ) # id is inferred as object tm.assert_frame_equal(df, expected) @@ -655,7 +655,8 @@ def test_loc_setitem_frame_with_reindex_mixed(self): df = DataFrame(index=[3, 5, 4], columns=["A", "B"], dtype=float) df["B"] = "string" df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") - ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") + # setting int64 array into float64 column successfully casts so is inplace + ser = Series([2.0, 3.0, 1.0], index=[3, 5, 4], dtype="float64") expected = DataFrame({"A": ser}) expected["B"] = "string" tm.assert_frame_equal(df, expected) @@ -665,7 +666,8 @@ def test_loc_setitem_frame_with_inverted_slice(self): df = DataFrame(index=[1, 2, 3], columns=["A", "B"], dtype=float) df["B"] = "string" df.loc[slice(3, 0, -1), "A"] = np.array([1, 2, 3], dtype="int64") - expected = DataFrame({"A": [3, 2, 1], "B": "string"}, index=[1, 2, 3]) + # setting int64 array into float64 column successfully casts so is inplace + expected = DataFrame({"A": [3.0, 2.0, 1.0], "B": "string"}, index=[1, 2, 3]) tm.assert_frame_equal(df, expected) # TODO(ArrayManager) "split" path overwrites column and therefore don't take From 8761ffaa4b5d9b36ca08c209a1388660a5f42497 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 30 Jul 2021 18:58:49 -0700 Subject: [PATCH 25/32] docstring, comment --- pandas/core/frame.py | 7 +++++++ pandas/core/indexing.py | 2 ++ 2 files changed, 9 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e01fba93264fc..5425aeeb06ca6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5252,6 +5252,13 @@ def _replace_columnwise( return res.__finalize__(self) def _isetitem(self, loc: int, value): + """ + Set a new array in our the given column position. + + Notes + ----- + Replaces the existing array, does not write into it. + """ cols = self.columns if cols.is_unique: col = cols[loc] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c54153341b19f..24f3c7de65c21 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1252,6 +1252,8 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): key = list(key) if com.is_bool_indexer(key): + # TODO: in this case should we do a .take on the value here? + # test_loc_setitem_all_false_boolean_two_blocks key = check_bool_indexer(labels, key) (inds,) = key.nonzero() return inds From d821e8b236c0d4745c5d6a1e3da10c3b59d9303d Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Sep 2021 20:32:58 -0700 Subject: [PATCH 26/32] REF: implement mask_setitem_value --- pandas/core/indexing.py | 28 ++++++++++++++++++++++++++-- pandas/core/internals/blocks.py | 3 +++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index bbb3cb3391dfa..074c9ca7b483c 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1239,8 +1239,7 @@ def _convert_to_indexer(self, key, axis: int): if com.is_bool_indexer(key): key = check_bool_indexer(labels, key) - (inds,) = key.nonzero() - return inds + return key else: return self._get_listlike_indexer(key, axis)[1] else: @@ -1804,6 +1803,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): The indexer we use for setitem along axis=0. """ pi = plane_indexer + pi, value = mask_setitem_value(pi, value, len(self.obj)) ser = self.obj._ixs(loc, axis=1) @@ -2395,3 +2395,27 @@ def need_slice(obj: slice) -> bool: or obj.stop is not None or (obj.step is not None and obj.step != 1) ) + + +def mask_setitem_value(indexer, value, length: int): + """ + Convert a boolean indexer to a positional indexer, masking `value` if necessary. + """ + if com.is_bool_indexer(indexer): + indexer = np.asarray(indexer).nonzero()[0] + if is_list_like(value) and len(value) == length: + if not is_array_like(value): + value = [value[n] for n in indexer] + else: + value = value[indexer] + + elif isinstance(indexer, tuple): + indexer = list(indexer) + for i, key in enumerate(indexer): + if com.is_bool_indexer(key): + new_key = np.asarray(key).nonzero()[0] + indexer[i] = new_key + # TODO: sometimes need to do take on the value? + + indexer = tuple(indexer) + return indexer, value diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index da7ffbf08c34b..1d6639b062e1d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -112,6 +112,7 @@ is_exact_shape_match, is_scalar_indexer, ) +from pandas.core.indexing import mask_setitem_value import pandas.core.missing as missing if TYPE_CHECKING: @@ -936,6 +937,8 @@ def setitem(self, indexer, value): if transpose: values = values.T + indexer, value = mask_setitem_value(indexer, value, len(values)) + # length checking check_setitem_lengths(indexer, value, values) exact_match = is_exact_shape_match(values, arr_value) From 0ac5516f53f9e100f99eaac145b6322d6b6b50f0 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 18 Oct 2021 20:30:51 -0700 Subject: [PATCH 27/32] fix last tesst --- pandas/core/indexing.py | 27 ++++++++++++++++++++++----- pandas/core/internals/blocks.py | 5 ++++- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0fbc6845ea1e0..411f1e14048a9 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -11,6 +11,7 @@ from pandas._libs.indexing import NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim +from pandas._typing import Shape from pandas.errors import ( AbstractMethodError, InvalidIndexError, @@ -56,7 +57,6 @@ Index, MultiIndex, ) -from pandas.core.internals import ArrayManager if TYPE_CHECKING: from pandas import ( @@ -1688,6 +1688,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): # FIXME: kludge return self._setitem_single_block(orig, value, name) + from pandas.core.internals import ArrayManager + if ( com.is_null_slice(info_idx) and is_scalar(value) @@ -1844,8 +1846,14 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str else: # TODO: not totally clear why we are requiring this + # Need so that we raise in test_multiindex_setitem self._align_frame(indexer[0], value) + if com.is_bool_indexer(indexer[0]) and indexer[0].sum() == len(value): + # TODO: better place for this? + pi = indexer[0].nonzero()[0] + sub_indexer[0] = pi + for loc in ilocs: item = self.obj.columns[loc] if item in value: @@ -1869,7 +1877,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): The indexer we use for setitem along axis=0. """ pi = plane_indexer - pi, value = mask_setitem_value(pi, value, len(self.obj)) + pi, value = mask_setitem_value(pi, value, (len(self.obj),)) ser = self.obj._ixs(loc, axis=1) @@ -1901,6 +1909,8 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): def _setitem_iat_loc(self, loc: int, pi, value): # TODO: likely a BM method? + from pandas.core.internals import ArrayManager + if isinstance(self.obj._mgr, ArrayManager): # TODO: implement this correctly for ArrayManager return self._setitem_single_column(loc, value, pi) @@ -2494,13 +2504,13 @@ def need_slice(obj: slice) -> bool: ) -def mask_setitem_value(indexer, value, length: int): +def mask_setitem_value(indexer, value, shape: Shape): """ Convert a boolean indexer to a positional indexer, masking `value` if necessary. """ if com.is_bool_indexer(indexer): indexer = np.asarray(indexer).nonzero()[0] - if is_list_like(value) and len(value) == length: + if is_list_like(value) and len(value) == shape[0]: if not is_array_like(value): value = [value[n] for n in indexer] else: @@ -2512,7 +2522,14 @@ def mask_setitem_value(indexer, value, length: int): if com.is_bool_indexer(key): new_key = np.asarray(key).nonzero()[0] indexer[i] = new_key - # TODO: sometimes need to do take on the value? + + if is_list_like(value) and len(value) == shape[i]: + # FIXME: assuming value.ndim == 1 here? + # FIXME: assuming non-i tuple member is scalar? + if not is_array_like(value): + value = [value[n] for n in new_key] + else: + value = value[new_key] indexer = tuple(indexer) return indexer, value diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c1f4d74b78905..f8fbe80795782 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -933,7 +933,7 @@ def setitem(self, indexer, value): if transpose: values = values.T - indexer, value = mask_setitem_value(indexer, value, len(values)) + indexer, value = mask_setitem_value(indexer, value, values.shape) # length checking check_setitem_lengths(indexer, value, values) @@ -975,6 +975,9 @@ def setitem_inplace(self, indexer, value) -> None: pi = indexer[0] values = self.values + + indexer, value = mask_setitem_value(indexer, value, values.shape) + if not isinstance(self, ExtensionBlock): # includes DatetimeArray, TimedeltaArray blkloc = indexer[1] From 7db1a5a20e6934b081999bd311209e1eadd746ae Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Nov 2021 09:11:56 -0700 Subject: [PATCH 28/32] ArrayManager cases --- pandas/tests/indexing/test_iloc.py | 6 +++++- pandas/tests/indexing/test_loc.py | 11 +++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 64ae2fed98e63..92b70cefb90c4 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -1222,9 +1222,13 @@ def test_iloc_setitem_series_duplicate_columns(self): [("int64", "0", 0), ("float", "1.2", 1.2)], ) def test_iloc_setitem_dtypes_duplicate_columns( - self, dtypes, init_value, expected_value + self, dtypes, init_value, expected_value, using_array_manager, request ): # GH#22035 + if using_array_manager: + mark = pytest.mark.xfail(reason="incorrectly retains int64/float dtype") + request.node.add_marker(mark) + df = DataFrame([[init_value, "str", "str2"]], columns=["a", "b", "b"]) df.iloc[:, 0] = df.iloc[:, 0].astype(dtypes) expected_df = DataFrame( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index a4185064e5cea..8dd6d5bbbaaf5 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -657,8 +657,12 @@ def test_loc_setitem_frame_with_reindex(self, using_array_manager): expected = DataFrame({"A": ser}) tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_reindex_mixed(self): + def test_loc_setitem_frame_with_reindex_mixed(self, using_array_manager, request): # GH#40480 + if using_array_manager: + mark = pytest.mark.xfail(reason="df.A stays int64") + request.node.add_marker(mark) + df = DataFrame(index=[3, 5, 4], columns=["A", "B"], dtype=float) df["B"] = "string" df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") @@ -668,8 +672,11 @@ def test_loc_setitem_frame_with_reindex_mixed(self): expected["B"] = "string" tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_inverted_slice(self): + def test_loc_setitem_frame_with_inverted_slice(self, using_array_manager, request): # GH#40480 + if using_array_manager: + mark = pytest.mark.xfail(reason="df.A stays int64") + request.node.add_marker(mark) df = DataFrame(index=[1, 2, 3], columns=["A", "B"], dtype=float) df["B"] = "string" df.loc[slice(3, 0, -1), "A"] = np.array([1, 2, 3], dtype="int64") From 3b456f9d6440bd5b369f17e4e8ac08d63af3cb6d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 7 Nov 2021 15:25:00 -0800 Subject: [PATCH 29/32] remove _isetitem --- pandas/core/frame.py | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a42a16441dfb3..36f860f1a3be3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5378,34 +5378,14 @@ def _replace_columnwise( target, value = mapping[ax[i]] newobj = ser.replace(target, value, regex=regex) - res._isetitem(i, newobj) + # If we had unique columns, we could just do + # res[res.columns[i]] = newobj + res._iset_item_mgr(i, newobj._values) if inplace: return return res.__finalize__(self) - def _isetitem(self, loc: int, value): - """ - Set a new array in our the given column position. - - Notes - ----- - Replaces the existing array, does not write into it. - """ - cols = self.columns - if cols.is_unique: - col = cols[loc] - self[col] = value - return - - # Otherwise we temporarily pin unique columns and call __setitem__ - newcols = Index(range(len(cols))) - try: - self.columns = newcols - self[loc] = value - finally: - self.columns = cols - @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift( self, From 4e69970568cbf142be6eaee737074c2abd4b04e0 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 29 Nov 2021 15:28:59 -0800 Subject: [PATCH 30/32] fixup missing fixture --- pandas/tests/indexing/test_partial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 7aef553366c8d..826392c75965c 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -266,7 +266,7 @@ def test_partial_setting(self, using_array_manager): with pytest.raises(IndexError, match=msg): s.iat[3] = 5.0 - def test_partial_setting_frame(self): + def test_partial_setting_frame(self, using_array_manager): df_orig = DataFrame( np.arange(6).reshape(3, 2), columns=["A", "B"], dtype="int64" ) From 0a0e71257223566107d486f315ae215c6aed76a1 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 29 Nov 2021 18:12:05 -0800 Subject: [PATCH 31/32] un-xfail for ArrayManager --- pandas/tests/indexing/multiindex/test_setitem.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 2a12d690ff0bd..da2c1a9586de4 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -121,9 +121,6 @@ def test_setitem_multiindex3(self): expected=copy, ) - # TODO(ArrayManager) df.loc["bar"] *= 2 doesn't raise an error but results in - # all NaNs -> doesn't work in the "split" path (also for BlockManager actually) - @td.skip_array_manager_not_yet_implemented def test_multiindex_setitem(self): # GH 3738 From 4ce2cd332adb5b811b8dfcc3ac3a10fdaa2dde20 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 29 Nov 2021 20:09:25 -0800 Subject: [PATCH 32/32] fix incorrect test --- pandas/tests/io/sas/test_sas7bdat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 5477559262cb8..fa9b0bb572000 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -33,7 +33,7 @@ def setup_method(self, datapath): for k in range(df.shape[1]): col = df.iloc[:, k] if col.dtype == np.int64: - df.iloc[:, k] = df.iloc[:, k].astype(np.float64) + df[df.columns[k]] = df.iloc[:, k].astype(np.float64) self.data.append(df) @pytest.mark.slow