Skip to content

Commit fefd999

Browse files
authored
REF/API: DataFrame.__setitem__ never operate in-place (#39510)
1 parent 203f901 commit fefd999

File tree

5 files changed

+173
-9
lines changed

5 files changed

+173
-9
lines changed

doc/source/whatsnew/v1.3.0.rst

+35
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,41 @@ In pandas 1.3.0, ``df`` continues to share data with ``values``
225225
np.shares_memory(df["A"], values)
226226
227227
228+
.. _whatsnew_130.notable_bug_fixes.setitem_never_inplace:
229+
230+
Never Operate Inplace When Setting ``frame[keys] = values``
231+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
232+
233+
When setting multiple columns using ``frame[keys] = values``, new arrays will
234+
replace the pre-existing arrays for these keys; those arrays will *not* be overwritten
235+
(:issue:`39510`). As a result, the columns will retain the dtype(s) of ``values``,
236+
never casting to the dtypes of the existing arrays.
237+
238+
.. ipython:: python
239+
240+
df = pd.DataFrame(range(3), columns=["A"], dtype="float64")
241+
df[["A"]] = 5
242+
243+
In the old behavior, ``5`` was cast to ``float64`` and inserted into the existing
244+
array backing ``df``:
245+
246+
*pandas 1.2.x*
247+
248+
.. code-block:: ipython
249+
250+
In [1]: df.dtypes
251+
Out[1]:
252+
A float64
253+
254+
In the new behavior, we get a new array, and retain an integer-dtyped ``5``:
255+
256+
*pandas 1.3.0*
257+
258+
.. ipython:: python
259+
260+
df.dtypes
261+
262+
228263
.. _whatsnew_130.notable_bug_fixes.setitem_with_bool_casting:
229264

230265
Consistent Casting With Setting Into Boolean Series

pandas/core/frame.py

+61-6
Original file line numberDiff line numberDiff line change
@@ -3548,6 +3548,7 @@ def _setitem_slice(self, key: slice, value):
35483548
def _setitem_array(self, key, value):
35493549
# also raises Exception if object array with NA values
35503550
if com.is_bool_indexer(key):
3551+
# bool indexer is indexing along rows
35513552
if len(key) != len(self.index):
35523553
raise ValueError(
35533554
f"Item wrong length {len(key)} instead of {len(self.index)}!"
@@ -3559,18 +3560,72 @@ def _setitem_array(self, key, value):
35593560
# GH#39931 reindex since iloc does not align
35603561
value = value.reindex(self.index.take(indexer))
35613562
self.iloc[indexer] = value
3563+
35623564
else:
35633565
if isinstance(value, DataFrame):
35643566
check_key_length(self.columns, key, value)
35653567
for k1, k2 in zip(key, value.columns):
35663568
self[k1] = value[k2]
3569+
3570+
elif not is_list_like(value):
3571+
for col in key:
3572+
self[col] = value
3573+
3574+
elif isinstance(value, np.ndarray) and value.ndim == 2:
3575+
self._iset_not_inplace(key, value)
3576+
3577+
elif np.ndim(value) > 1:
3578+
# list of lists
3579+
value = DataFrame(value).values
3580+
return self._setitem_array(key, value)
3581+
35673582
else:
3568-
self.loc._ensure_listlike_indexer(key, axis=1, value=value)
3569-
indexer = self.loc._get_listlike_indexer(
3570-
key, axis=1, raise_missing=False
3571-
)[1]
3572-
self._check_setitem_copy()
3573-
self.iloc[:, indexer] = value
3583+
self._iset_not_inplace(key, value)
3584+
3585+
def _iset_not_inplace(self, key, value):
3586+
# GH#39510 when setting with df[key] = obj with a list-like key and
3587+
# list-like value, we iterate over those listlikes and set columns
3588+
# one at a time. This is different from dispatching to
3589+
# `self.loc[:, key] = value` because loc.__setitem__ may overwrite
3590+
# data inplace, whereas this will insert new arrays.
3591+
3592+
def igetitem(obj, i: int):
3593+
# Note: we catch DataFrame obj before getting here, but
3594+
# hypothetically would return obj.iloc[:, i]
3595+
if isinstance(obj, np.ndarray):
3596+
return obj[..., i]
3597+
else:
3598+
return obj[i]
3599+
3600+
if self.columns.is_unique:
3601+
if np.shape(value)[-1] != len(key):
3602+
raise ValueError("Columns must be same length as key")
3603+
3604+
for i, col in enumerate(key):
3605+
self[col] = igetitem(value, i)
3606+
3607+
else:
3608+
3609+
ilocs = self.columns.get_indexer_non_unique(key)[0]
3610+
if (ilocs < 0).any():
3611+
# key entries not in self.columns
3612+
raise NotImplementedError
3613+
3614+
if np.shape(value)[-1] != len(ilocs):
3615+
raise ValueError("Columns must be same length as key")
3616+
3617+
assert np.ndim(value) <= 2
3618+
3619+
orig_columns = self.columns
3620+
3621+
# Using self.iloc[:, i] = ... may set values inplace, which
3622+
# by convention we do not do in __setitem__
3623+
try:
3624+
self.columns = Index(range(len(self.columns)))
3625+
for i, iloc in enumerate(ilocs):
3626+
self[iloc] = igetitem(value, i)
3627+
finally:
3628+
self.columns = orig_columns
35743629

35753630
def _setitem_frame(self, key, value):
35763631
# support boolean setting with DataFrame input, e.g.

pandas/tests/frame/indexing/test_setitem.py

+68-1
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,10 @@ def test_setitem_complete_column_with_array(self):
342342
"d": [1, 1, 1],
343343
}
344344
)
345+
expected["c"] = expected["c"].astype(arr.dtype)
346+
expected["d"] = expected["d"].astype(arr.dtype)
347+
assert expected["c"].dtype == arr.dtype
348+
assert expected["d"].dtype == arr.dtype
345349
tm.assert_frame_equal(df, expected)
346350

347351
@pytest.mark.parametrize("dtype", ["f8", "i8", "u8"])
@@ -381,16 +385,35 @@ def test_setitem_frame_duplicate_columns(self, using_array_manager):
381385
[np.nan, 1, 2, np.nan, 4, 5],
382386
[np.nan, 1, 2, np.nan, 4, 5],
383387
],
384-
columns=cols,
385388
dtype="object",
386389
)
390+
387391
if using_array_manager:
388392
# setitem replaces column so changes dtype
393+
394+
expected.columns = cols
389395
expected["C"] = expected["C"].astype("int64")
390396
# TODO(ArrayManager) .loc still overwrites
391397
expected["B"] = expected["B"].astype("int64")
398+
else:
399+
# set these with unique columns to be extra-unambiguous
400+
expected[2] = expected[2].astype(np.int64)
401+
expected[5] = expected[5].astype(np.int64)
402+
expected.columns = cols
403+
392404
tm.assert_frame_equal(df, expected)
393405

406+
def test_setitem_frame_duplicate_columns_size_mismatch(self):
407+
# GH#39510
408+
cols = ["A", "B", "C"] * 2
409+
df = DataFrame(index=range(3), columns=cols)
410+
with pytest.raises(ValueError, match="Columns must be same length as key"):
411+
df[["A"]] = (0, 3, 5)
412+
413+
df2 = df.iloc[:, :3] # unique columns
414+
with pytest.raises(ValueError, match="Columns must be same length as key"):
415+
df2[["A"]] = (0, 3, 5)
416+
394417
@pytest.mark.parametrize("cols", [["a", "b", "c"], ["a", "a", "a"]])
395418
def test_setitem_df_wrong_column_number(self, cols):
396419
# GH#38604
@@ -890,3 +913,47 @@ def test_setitem_clear_caches(self):
890913

891914
assert df["z"] is not foo
892915
tm.assert_series_equal(df["z"], expected)
916+
917+
def test_setitem_duplicate_columns_not_inplace(self):
918+
# GH#39510
919+
cols = ["A", "B"] * 2
920+
df = DataFrame(0.0, index=[0], columns=cols)
921+
df_copy = df.copy()
922+
df_view = df[:]
923+
df["B"] = (2, 5)
924+
925+
expected = DataFrame([[0.0, 2, 0.0, 5]], columns=cols)
926+
tm.assert_frame_equal(df_view, df_copy)
927+
tm.assert_frame_equal(df, expected)
928+
929+
@pytest.mark.parametrize("value", [1, np.array([[1], [1]]), [[1], [1]]])
930+
def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, request):
931+
# GH#39510
932+
if not using_array_manager:
933+
mark = pytest.mark.xfail(
934+
reason="Setitem with same dtype still changing inplace"
935+
)
936+
request.node.add_marker(mark)
937+
938+
cols = ["A", "B"]
939+
df = DataFrame(0, index=[0, 1], columns=cols)
940+
df_copy = df.copy()
941+
df_view = df[:]
942+
df[["B"]] = value
943+
944+
expected = DataFrame([[0, 1], [0, 1]], columns=cols)
945+
tm.assert_frame_equal(df, expected)
946+
tm.assert_frame_equal(df_view, df_copy)
947+
948+
@pytest.mark.parametrize("value", [1.0, np.array([[1.0], [1.0]]), [[1.0], [1.0]]])
949+
def test_setitem_listlike_key_scalar_value_not_inplace(self, value):
950+
# GH#39510
951+
cols = ["A", "B"]
952+
df = DataFrame(0, index=[0, 1], columns=cols)
953+
df_copy = df.copy()
954+
df_view = df[:]
955+
df[["B"]] = value
956+
957+
expected = DataFrame([[0, 1.0], [0, 1.0]], columns=cols)
958+
tm.assert_frame_equal(df_view, df_copy)
959+
tm.assert_frame_equal(df, expected)

pandas/tests/indexing/test_indexing.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,14 @@ def test_setitem_ndarray_3d(self, index, frame_or_series, indexer_sli):
115115
)
116116
else:
117117
err = ValueError
118-
msg = r"Buffer has wrong number of dimensions \(expected 1, got 3\)|"
118+
msg = "|".join(
119+
[
120+
r"Buffer has wrong number of dimensions \(expected 1, got 3\)",
121+
"Cannot set values with ndim > 1",
122+
"Index data must be 1-dimensional",
123+
"Array conditional must be same shape as self",
124+
]
125+
)
119126

120127
with pytest.raises(err, match=msg):
121128
idxr[nd3] = 0

pandas/tests/reshape/test_pivot.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -966,7 +966,7 @@ def test_margins_dtype(self):
966966
# GH 17013
967967

968968
df = self.data.copy()
969-
df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3)
969+
df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3).astype("i8")
970970

971971
mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")]
972972
mi = MultiIndex.from_tuples(mi_val, names=("A", "B"))

0 commit comments

Comments
 (0)