From a5c1f5ed6f82a7b606c326fb9b1b5614d9c9d813 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 26 Jan 2021 20:13:17 -0800 Subject: [PATCH 01/13] REF: DataFrame._setitem_array dont use iloc.__setitem__ --- pandas/core/frame.py | 73 +++++++++++++++++---- pandas/tests/frame/indexing/test_setitem.py | 4 +- pandas/tests/indexing/test_indexing.py | 8 ++- 3 files changed, 71 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c7585b21abe99..e15ff6efc9442 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3211,9 +3211,10 @@ def _setitem_slice(self, key: slice, value): self._check_setitem_copy() self.iloc[key] = value - def _setitem_array(self, key, value): + def _setitem_array(self, key: Sequence[Hashable], value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): + # bool indexer is indexing along rows if len(key) != len(self.index): raise ValueError( f"Item wrong length {len(key)} instead of {len(self.index)}!" @@ -3222,19 +3223,67 @@ def _setitem_array(self, key, value): indexer = key.nonzero()[0] self._check_setitem_copy() self.iloc[indexer] = value + + elif isinstance(key, np.ndarray) and key.ndim != 1: + # FIXME: kludge just to get the right exception message + # in test_setitem_ndarray_3d + raise ValueError( + f"Buffer has wrong number of dimensions (expected 1, got {key.ndim})" + ) + + elif not isinstance(value, (np.ndarray, DataFrame)) and np.ndim(value) > 1: + # list of lists + value = DataFrame(value) + return self._setitem_array(key, value) + + elif not self.columns.is_unique and not all( + is_hashable(x) and x in self.columns for x in key + ): + raise NotImplementedError + else: - if isinstance(value, DataFrame) and self.columns.is_unique: - if len(value.columns) != len(key): - raise ValueError("Columns must be same length as key") - for k1, k2 in zip(key, value.columns): - self[k1] = value[k2] + # Note: unlike self.iloc[:, indexer] = value, this will + # never try to overwrite values inplace + + def igetitem(val, i: int): + if isinstance(val, np.ndarray) and val.ndim == 2: + return val[:, i] + elif isinstance(val, DataFrame): + return val.iloc[:, i] + elif not is_list_like(val): + return val + else: + return val[i] + + if self.columns.is_unique: + key_len = len(key) + else: + indexer, missing = self.columns.get_indexer_non_unique(key) + if len(missing) == 0: + key_len = len(indexer) + else: + key_len = np.shape(value)[-1] + + if not is_list_like(value): + vlen = key_len + else: + vlen = np.shape(value)[-1] + + if vlen != key_len: + # TODO: is this only if all are contained? + raise ValueError("Columns must be same length as key") + + if self.columns.is_unique: + # easy case + for i, col in enumerate(key): + self[col] = igetitem(value, i) + else: - self.loc._ensure_listlike_indexer(key, axis=1, value=value) - indexer = self.loc._get_listlike_indexer( - key, axis=1, raise_missing=False - )[1] - self._check_setitem_copy() - self.iloc[:, indexer] = value + mask = self.columns.isin(key) + ilocs = mask.nonzero()[0] + for i, iloc in enumerate(ilocs): + # TODO: sure this is never-inplace? + self._iset_item(iloc, igetitem(value, i)) def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 4f8ac49cb17ec..b54421d76c446 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -361,9 +361,11 @@ def test_setitem_frame_duplicate_columns(self): [np.nan, 1, 2, np.nan, 4, 5], [np.nan, 1, 2, np.nan, 4, 5], ], - columns=cols, dtype="object", ) + expected[2] = expected[2].astype(np.int64) + expected[5] = expected[5].astype(np.int64) + expected.columns = cols tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("cols", [["a", "b", "c"], ["a", "a", "a"]]) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index f67341ab176d7..596b437860747 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -99,7 +99,13 @@ def test_setitem_ndarray_3d(self, index, frame_or_series, indexer): ) else: err = ValueError - msg = r"Buffer has wrong number of dimensions \(expected 1, got 3\)|" + msg = "|".join( + [ + r"Buffer has wrong number of dimensions \(expected 1, got 3\)", + "Cannot set values with ndim > 1", + "Index data must be 1-dimensional", + ] + ) with pytest.raises(err, match=msg): idxr[nd3] = 0 From 1789fcad0a1fdc44f3098eb2ece4c5083a4e9e1d Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Feb 2021 17:08:22 -0800 Subject: [PATCH 02/13] mypy fixup --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 70aa4fb13ac35..ae77384357df5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3218,7 +3218,7 @@ def _setitem_slice(self, key: slice, value): self._check_setitem_copy() self.iloc[key] = value - def _setitem_array(self, key: Sequence[Hashable], value): + def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): # bool indexer is indexing along rows From a1cdd1931ff24d4a54777e6312957c282b6cda24 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Feb 2021 19:43:07 -0800 Subject: [PATCH 03/13] mypy fixup --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 09c8c46168e24..f2176496fa637 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3285,7 +3285,7 @@ def igetitem(obj, i: int): # Using self.iloc[:, i] = ... may set values inplace, which # by convention we do not do in __setitem__ try: - self.columns = range(len(self.columns)) + self.columns = Index(range(len(self.columns))) for i, iloc in enumerate(ilocs): self[iloc] = igetitem(value, i) finally: From c32a427dfbfad2c3bbb99b4ec873606dc143c1e9 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Feb 2021 10:02:13 -0800 Subject: [PATCH 04/13] 32bit compat --- pandas/tests/frame/indexing/test_setitem.py | 2 ++ pandas/tests/reshape/test_pivot.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index e48a40f7ac110..d9827cd937ce6 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -322,6 +322,8 @@ def test_setitem_complete_column_with_array(self): "d": [1, 1, 1], } ) + expected["c"] = expected["c"].astype(np.intp) + expected["d"] = expected["d"].astype(np.intp) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"]) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f9b2a02920841..bc5f00d8048ae 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -969,7 +969,7 @@ def test_margins_dtype(self): expected = DataFrame( {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi ).rename_axis("C", axis=1) - expected["All"] = expected["dull"] + expected["shiny"] + expected["All"] = (expected["dull"] + expected["shiny"]).astype(np.intp) result = df.pivot_table( values="D", From 8a283d94a5e73233c993494d18c608490613c763 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Feb 2021 13:16:59 -0800 Subject: [PATCH 05/13] troubleshoot windows builds --- pandas/tests/frame/indexing/test_setitem.py | 6 ++++-- pandas/tests/reshape/test_pivot.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index d9827cd937ce6..d30a848564882 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -322,8 +322,10 @@ def test_setitem_complete_column_with_array(self): "d": [1, 1, 1], } ) - expected["c"] = expected["c"].astype(np.intp) - expected["d"] = expected["d"].astype(np.intp) + expected["c"] = expected["c"].astype(arr.dtype) + expected["d"] = expected["d"].astype(arr.dtype) + assert expected["c"].dtype == arr.dtype + assert expected["d"].dtype == arr.dtype tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"]) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index bc5f00d8048ae..6ffe4a458016d 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -969,7 +969,9 @@ def test_margins_dtype(self): expected = DataFrame( {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi ).rename_axis("C", axis=1) - expected["All"] = (expected["dull"] + expected["shiny"]).astype(np.intp) + expected["All"] = expected["dull"] + expected["shiny"] + expected["All"] = expected["All"].astype(np.intp) + assert expected.dtypes["All"] == np.intp result = df.pivot_table( values="D", From 9f6d8e7af3d0ee66256929994bb0e21d9188f7fd Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Feb 2021 18:19:40 -0800 Subject: [PATCH 06/13] troubleshoot 32bit --- pandas/tests/reshape/test_pivot.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 6ffe4a458016d..301163517223b 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -962,7 +962,7 @@ def test_margins_dtype(self): # GH 17013 df = self.data.copy() - df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3) + df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3).astype("i8") mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) @@ -970,8 +970,6 @@ def test_margins_dtype(self): {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi ).rename_axis("C", axis=1) expected["All"] = expected["dull"] + expected["shiny"] - expected["All"] = expected["All"].astype(np.intp) - assert expected.dtypes["All"] == np.intp result = df.pivot_table( values="D", From c107e772dc73978c10e926c73b1e99affc1b21af Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 7 Feb 2021 12:51:30 -0800 Subject: [PATCH 07/13] TST: missed raising cases --- pandas/core/frame.py | 6 +++--- pandas/tests/frame/indexing/test_setitem.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f7d8ebecb595b..06c402ee39e5c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3245,9 +3245,9 @@ def _setitem_array(self, key, value): def _iset_not_inplace(self, key, value): def igetitem(obj, i: int): - if isinstance(obj, DataFrame): - return obj.iloc[:, i] - elif isinstance(obj, np.ndarray): + # Note: we catch DataFrame obj before getting here, but + # hypothetically would return obj.iloc[:, i] + if isinstance(obj, np.ndarray): return obj[..., i] else: return obj[i] diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index d30a848564882..e706133eb6b1e 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -372,6 +372,17 @@ def test_setitem_frame_duplicate_columns(self): expected.columns = cols tm.assert_frame_equal(df, expected) + def test_setitem_frame_duplicate_columns_size_mismatch(self): + # GH#39510 + cols = ["A", "B", "C"] * 2 + df = DataFrame(index=range(3), columns=cols) + with pytest.raises(ValueError, match="Columns must be same length as key"): + df[["A"]] = (0, 3, 5) + + df2 = df.iloc[:, :3] # unique columns + with pytest.raises(ValueError, match="Columns must be same length as key"): + df2[["A"]] = (0, 3, 5) + @pytest.mark.parametrize("cols", [["a", "b", "c"], ["a", "a", "a"]]) def test_setitem_df_wrong_column_number(self, cols): # GH#38604 From 4cfe863358368acfaab2ab4e419ccaef0ffcc4d5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Feb 2021 14:06:20 -0800 Subject: [PATCH 08/13] port phofl tests --- pandas/tests/frame/indexing/test_setitem.py | 39 +++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index cfc49d982cc0f..3af345d2704f2 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -784,3 +784,42 @@ def test_setitem_clear_caches(self): assert df["z"] is not foo tm.assert_series_equal(df["z"], expected) + + def test_setitem_duplicate_columns_not_inplace(self): + # GH#39510 + cols = ["A", "B"] * 2 + df = DataFrame(0.0, index=[0], columns=cols) + df_copy = df.copy() + df_view = df[:] + df["B"] = (2, 5) + + expected = DataFrame([[0.0, 2, 0.0, 5]], columns=cols) + tm.assert_frame_equal(df_view, df_copy) + tm.assert_frame_equal(df, expected) + + @pytest.mark.xfail(reason="Setitem with same dtype still changing inplace") + @pytest.mark.parametrize("value", [1, np.array([[1], [1]]), [[1], [1]]]) + def test_setitem_same_dtype_not_inplace(self, value): + # GH#39510 + cols = ["A", "B"] + df = DataFrame(0, index=[0, 1], columns=cols) + df_copy = df.copy() + df_view = df[:] + df[["B"]] = value + + expected = DataFrame([[0, 1], [0, 1]], columns=cols) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df_view, df_copy) + + @pytest.mark.parametrize("value", [1.0, np.array([[1.0], [1.0]]), [[1.0], [1.0]]]) + def test_setitem_listlike_key_scalar_value_not_inplace(self, value): + # GH#39510 + cols = ["A", "B"] + df = DataFrame(0, index=[0, 1], columns=cols) + df_copy = df.copy() + df_view = df[:] + df[["B"]] = value + + expected = DataFrame([[0, 1.0], [0, 1.0]], columns=cols) + tm.assert_frame_equal(df_view, df_copy) + tm.assert_frame_equal(df, expected) From 7ea792f267b5e610ee00ff6d3bdcf39795af689b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 07:15:23 -0800 Subject: [PATCH 09/13] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f7204ceb9d412..f0dfcb3788581 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -449,6 +449,7 @@ Indexing - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) +- Bug in :meth:`DataFrame.__setitem__` in some cases overwriting existing values instead of inserting a new array (:issue:`39510`) Missing ^^^^^^^ From 20f6a161899cbcf1f7d3be23b55eabb427a310cc Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 15:19:09 -0800 Subject: [PATCH 10/13] clarify whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f0dfcb3788581..32860b9319459 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -449,7 +449,7 @@ Indexing - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) -- Bug in :meth:`DataFrame.__setitem__` in some cases overwriting existing values instead of inserting a new array (:issue:`39510`) +- Bug in :meth:`DataFrame.__setitem__` when setting multiple columns incorrectly overwriting existing data rather than inserting new arrays (:issue:`39510`) Missing ^^^^^^^ From 2d0cf9e8a6d4201855dc092618d5e35cc38c617c Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 12:40:01 -0800 Subject: [PATCH 11/13] comment --- pandas/core/frame.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e5b44ede283a7..bdafcc53b4032 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3516,6 +3516,12 @@ def _setitem_array(self, key, value): self._iset_not_inplace(key, value) def _iset_not_inplace(self, key, value): + # GH#39510 when setting with df[key] = obj with a list-like key and + # list-like value, we iterate over those listlikes and set columns + # one at a time. This is different from dispatching to + # `self.loc[:, key]= value` because loc.__setitem__ may overwrite + # data inplace, whereas this will insert new arrays. + def igetitem(obj, i: int): # Note: we catch DataFrame obj before getting here, but # hypothetically would return obj.iloc[:, i] From e1ed083207a59067fcdbd017a74d0b5a1dbda632 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 12:51:40 -0800 Subject: [PATCH 12/13] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 36 +++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9e05e1c80a1d1..d539512edeb64 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -223,6 +223,41 @@ In pandas 1.3.0, ``df`` continues to share data with ``values`` np.shares_memory(df["A"], values) +.. _whatsnew_130.notable_bug_fixes.setitem_never_inplace: + +Never Operate Inplace When Setting ``frame[keys] = values`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When setting multiple columns using ``frame[keys] = values`` new arrays will +replace pre-existing arrays for these keys, which will *not* be over-written +(:issue:`39510`). As a result, the columns will retain the dtype(s) of ``values``, +never casting to the dtypes of the existing arrays. + +.. ipython:: python + + df = pd.DataFrame(range(3), columns=["A"], dtype="float64") + df[["A"]] = 5 + +In the old behavior, ``5`` was cast to ``float64`` and inserted into the existing +array backing ``df``: + +*pandas 1.2.x* + +.. code-block:: ipython + + In [1]: df.dtypes + Out[1]: + A float64 + +In the new behavior, we get a new array, and retain an integer-dtyped ``5``: + +*pandas 1.3.0* + +.. ipython:: python + + df.dtypes + + .. _whatsnew_130.notable_bug_fixes.setitem_with_bool_casting: Consistent Casting With Setting Into Boolean Series @@ -495,7 +530,6 @@ Indexing - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) -- Bug in :meth:`DataFrame.__setitem__` when setting multiple columns incorrectly overwriting existing data rather than inserting new arrays (:issue:`39510`) Missing ^^^^^^^ From 1af868610dfb3c6d313c7fa567bab71aa20f5bca Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 18:57:30 -0800 Subject: [PATCH 13/13] xfail only for BM --- pandas/tests/frame/indexing/test_setitem.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index eb2d48dfc9d2a..e8cdcfcaafa86 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -926,10 +926,15 @@ def test_setitem_duplicate_columns_not_inplace(self): tm.assert_frame_equal(df_view, df_copy) tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(reason="Setitem with same dtype still changing inplace") @pytest.mark.parametrize("value", [1, np.array([[1], [1]]), [[1], [1]]]) - def test_setitem_same_dtype_not_inplace(self, value): + def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, request): # GH#39510 + if not using_array_manager: + mark = pytest.mark.xfail( + reason="Setitem with same dtype still changing inplace" + ) + request.node.add_marker(mark) + cols = ["A", "B"] df = DataFrame(0, index=[0, 1], columns=cols) df_copy = df.copy()