diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index fad734a0e39ad..272d9f8399d81 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -225,6 +225,41 @@ In pandas 1.3.0, ``df`` continues to share data with ``values`` np.shares_memory(df["A"], values) +.. _whatsnew_130.notable_bug_fixes.setitem_never_inplace: + +Never Operate Inplace When Setting ``frame[keys] = values`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When setting multiple columns using ``frame[keys] = values``, new arrays will +replace the pre-existing arrays for these keys; the old arrays are *not* +overwritten in place (:issue:`39510`). As a result, the columns will retain the +dtype(s) of ``values``, never casting to the dtypes of the existing arrays. + +.. ipython:: python + + df = pd.DataFrame(range(3), columns=["A"], dtype="float64") + df[["A"]] = 5 + +In the old behavior, ``5`` was cast to ``float64`` and inserted into the existing +array backing ``df``: + +*pandas 1.2.x* + +.. code-block:: ipython + + In [1]: df.dtypes + Out[1]: + A float64 + +In the new behavior, we get a new array, and retain an integer-dtyped ``5``: + +*pandas 1.3.0* + +.. ipython:: python + + df.dtypes + + .. _whatsnew_130.notable_bug_fixes.setitem_with_bool_casting: Consistent Casting With Setting Into Boolean Series diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f36f300c529cf..9fdd979ce8eca 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3548,6 +3548,7 @@ def _setitem_slice(self, key: slice, value): def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): + # bool indexer is indexing along rows if len(key) != len(self.index): raise ValueError( f"Item wrong length {len(key)} instead of {len(self.index)}!" 
def _iset_not_inplace(self, key, value):
    """
    Set the columns selected by ``key`` from ``value``, one column at a time.

    GH#39510: when setting with ``df[key] = obj`` with a list-like key and
    list-like value, we iterate over those listlikes and set columns one at
    a time.  This is different from dispatching to
    ``self.loc[:, key] = value`` because ``loc.__setitem__`` may overwrite
    data inplace, whereas this will insert new arrays.

    Parameters
    ----------
    key : list-like
        Column labels to set.
    value : list-like
        Values to set.  The ``i``-th entry along the last axis is assigned
        to the ``i``-th targeted column.

    Raises
    ------
    ValueError
        If the size of the last axis of ``value`` differs from the number
        of columns being set.
    NotImplementedError
        If ``self.columns`` is not unique and ``key`` contains labels that
        are not present in ``self.columns``.
    """

    def igetitem(obj, i: int):
        # Note: we catch DataFrame obj before getting here, but
        #  hypothetically would return obj.iloc[:, i]
        if isinstance(obj, np.ndarray):
            # index along the last axis so a 2D array yields its i-th column
            return obj[..., i]
        return obj[i]

    if self.columns.is_unique:
        if np.shape(value)[-1] != len(key):
            raise ValueError("Columns must be same length as key")

        for i, col in enumerate(key):
            self[col] = igetitem(value, i)
        return

    # Non-unique columns: a single label may select several positions, so
    #  work with positional locations instead of labels.
    ilocs = self.columns.get_indexer_non_unique(key)[0]
    if (ilocs < 0).any():
        # key entries not in self.columns
        raise NotImplementedError(
            "Setting with a list-like key containing labels that are not "
            "present in non-unique columns is not supported"
        )

    if np.shape(value)[-1] != len(ilocs):
        raise ValueError("Columns must be same length as key")

    # internal sanity check on the dimensionality of ``value``
    assert np.ndim(value) <= 2

    orig_columns = self.columns

    # Using self.iloc[:, i] = ... may set values inplace, which
    #  by convention we do not do in __setitem__.  Instead, temporarily
    #  relabel the columns with their positions so that each position can
    #  be set through ``self[iloc] = ...``.
    try:
        self.columns = Index(range(len(self.columns)))
        for i, iloc in enumerate(ilocs):
            self[iloc] = igetitem(value, i)
    finally:
        # always restore the original labels, even if a set failed
        self.columns = orig_columns
def test_setitem_duplicate_columns_not_inplace(self):
    # GH#39510 setting a duplicated label must not write into the old arrays
    labels = ["A", "B"] * 2
    frame = DataFrame(0.0, index=[0], columns=labels)
    snapshot = frame.copy()
    view = frame[:]

    frame["B"] = (2, 5)

    # the view taken before the set still matches the pre-set snapshot
    tm.assert_frame_equal(view, snapshot)
    tm.assert_frame_equal(frame, DataFrame([[0.0, 2, 0.0, 5]], columns=labels))


@pytest.mark.parametrize("value", [1, np.array([[1], [1]]), [[1], [1]]])
def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, request):
    # GH#39510
    if not using_array_manager:
        request.node.add_marker(
            pytest.mark.xfail(
                reason="Setitem with same dtype still changing inplace"
            )
        )

    labels = ["A", "B"]
    frame = DataFrame(0, index=[0, 1], columns=labels)
    snapshot = frame.copy()
    view = frame[:]

    frame[["B"]] = value

    tm.assert_frame_equal(frame, DataFrame([[0, 1], [0, 1]], columns=labels))
    # the view taken before the set should be left untouched
    tm.assert_frame_equal(view, snapshot)


@pytest.mark.parametrize("value", [1.0, np.array([[1.0], [1.0]]), [[1.0], [1.0]]])
def test_setitem_listlike_key_scalar_value_not_inplace(self, value):
    # GH#39510
    labels = ["A", "B"]
    frame = DataFrame(0, index=[0, 1], columns=labels)
    snapshot = frame.copy()
    view = frame[:]

    frame[["B"]] = value

    # the view taken before the set still matches the pre-set snapshot
    tm.assert_frame_equal(view, snapshot)
    tm.assert_frame_equal(
        frame, DataFrame([[0, 1.0], [0, 1.0]], columns=labels)
    )
wrong number of dimensions \(expected 1, got 3\)|" + msg = "|".join( + [ + r"Buffer has wrong number of dimensions \(expected 1, got 3\)", + "Cannot set values with ndim > 1", + "Index data must be 1-dimensional", + "Array conditional must be same shape as self", + ] + ) with pytest.raises(err, match=msg): idxr[nd3] = 0 diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index e345f4f4b5f7f..c50886ba43019 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -966,7 +966,7 @@ def test_margins_dtype(self): # GH 17013 df = self.data.copy() - df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3) + df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3).astype("i8") mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] mi = MultiIndex.from_tuples(mi_val, names=("A", "B"))