Skip to content

Commit 03dd698

Browse files
authored
BUG: DataFrame.__setitem__ sometimes operating inplace (#43406)
1 parent 33fffdb commit 03dd698

File tree

14 files changed

+115
-63
lines changed

14 files changed

+115
-63
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,7 @@ Indexing
531531
- Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`)
532532
- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)
533533
- Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`)
534+
- Bug in :meth:`DataFrame.__setitem__` incorrectly writing into an existing column's array rather than setting a new array when the new dtype and the old dtype match (:issue:`43406`)
534535
- Bug in :meth:`Series.__setitem__` with object dtype when setting an array with matching size and dtype='datetime64[ns]' or dtype='timedelta64[ns]' incorrectly converting the datetime/timedeltas to integers (:issue:`43868`)
535536
- Bug in :meth:`DataFrame.sort_index` where ``ignore_index=True`` was not being respected when the index was already sorted (:issue:`43591`)
536537
- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`)

pandas/core/frame.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -3840,9 +3840,11 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None:
38403840
arraylike = _reindex_for_setitem(value, self.index)
38413841
self._set_item_mgr(key, arraylike)
38423842

3843-
def _iset_item_mgr(self, loc: int | slice | np.ndarray, value) -> None:
3843+
def _iset_item_mgr(
3844+
self, loc: int | slice | np.ndarray, value, inplace: bool = False
3845+
) -> None:
38443846
# when called from _set_item_mgr loc can be anything returned from get_loc
3845-
self._mgr.iset(loc, value)
3847+
self._mgr.iset(loc, value, inplace=inplace)
38463848
self._clear_item_cache()
38473849

38483850
def _set_item_mgr(self, key, value: ArrayLike) -> None:
@@ -3860,9 +3862,9 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None:
38603862
if len(self):
38613863
self._check_setitem_copy()
38623864

3863-
def _iset_item(self, loc: int, value) -> None:
3865+
def _iset_item(self, loc: int, value, inplace: bool = False) -> None:
38643866
arraylike = self._sanitize_column(value)
3865-
self._iset_item_mgr(loc, arraylike)
3867+
self._iset_item_mgr(loc, arraylike, inplace=inplace)
38663868

38673869
# check if we are modifying a copy
38683870
# try to set first as we want an invalid
@@ -3994,13 +3996,13 @@ def _reset_cacher(self) -> None:
39943996
# no-op for DataFrame
39953997
pass
39963998

3997-
def _maybe_cache_changed(self, item, value: Series) -> None:
3999+
def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
39984000
"""
39994001
The object has called back to us saying maybe it has changed.
40004002
"""
40014003
loc = self._info_axis.get_loc(item)
40024004
arraylike = value._values
4003-
self._mgr.iset(loc, arraylike)
4005+
self._mgr.iset(loc, arraylike, inplace=inplace)
40044006

40054007
# ----------------------------------------------------------------------
40064008
# Unsorted

pandas/core/generic.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -3531,7 +3531,10 @@ def _reset_cacher(self) -> None:
35313531
raise AbstractMethodError(self)
35323532

35333533
def _maybe_update_cacher(
3534-
self, clear: bool_t = False, verify_is_copy: bool_t = True
3534+
self,
3535+
clear: bool_t = False,
3536+
verify_is_copy: bool_t = True,
3537+
inplace: bool_t = False,
35353538
) -> None:
35363539
"""
35373540
See if we need to update our parent cacher if clear, then clear our

pandas/core/indexing.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -1862,10 +1862,10 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
18621862
# set the item, possibly having a dtype change
18631863
ser = ser.copy()
18641864
ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value)
1865-
ser._maybe_update_cacher(clear=True)
1865+
ser._maybe_update_cacher(clear=True, inplace=True)
18661866

18671867
# reset the sliced object if unique
1868-
self.obj._iset_item(loc, ser)
1868+
self.obj._iset_item(loc, ser, inplace=True)
18691869

18701870
def _setitem_single_block(self, indexer, value, name: str):
18711871
"""
@@ -1890,9 +1890,10 @@ def _setitem_single_block(self, indexer, value, name: str):
18901890
if i != info_axis
18911891
)
18921892
):
1893-
selected_item_labels = item_labels[indexer[info_axis]]
1894-
if len(item_labels.get_indexer_for([selected_item_labels])) == 1:
1895-
self.obj[selected_item_labels] = value
1893+
col = item_labels[indexer[info_axis]]
1894+
if len(item_labels.get_indexer_for([col])) == 1:
1895+
loc = item_labels.get_loc(col)
1896+
self.obj._iset_item(loc, value, inplace=True)
18961897
return
18971898

18981899
indexer = maybe_convert_ix(*indexer)
@@ -1910,7 +1911,7 @@ def _setitem_single_block(self, indexer, value, name: str):
19101911

19111912
# actually do the set
19121913
self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
1913-
self.obj._maybe_update_cacher(clear=True)
1914+
self.obj._maybe_update_cacher(clear=True, inplace=True)
19141915

19151916
def _setitem_with_indexer_missing(self, indexer, value):
19161917
"""

pandas/core/internals/array_manager.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,9 @@ def column_arrays(self) -> list[ArrayLike]:
792792
"""
793793
return self.arrays
794794

795-
def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
795+
def iset(
796+
self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
797+
):
796798
"""
797799
Set new column(s).
798800
@@ -804,6 +806,8 @@ def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
804806
loc : integer, slice or boolean mask
805807
Positional location (already bounds checked)
806808
value : np.ndarray or ExtensionArray
809+
inplace : bool, default False
810+
Whether overwrite existing array as opposed to replacing it.
807811
"""
808812
# single column -> single integer index
809813
if lib.is_integer(loc):

pandas/core/internals/managers.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1032,7 +1032,9 @@ def column_arrays(self) -> list[np.ndarray]:
10321032
# expected "List[ndarray[Any, Any]]")
10331033
return result # type: ignore[return-value]
10341034

1035-
def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
1035+
def iset(
1036+
self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
1037+
):
10361038
"""
10371039
Set new item in-place. Does not consolidate. Adds new Block if not
10381040
contained in the current set of items
@@ -1087,7 +1089,7 @@ def value_getitem(placement):
10871089
for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True):
10881090
blk = self.blocks[blkno]
10891091
blk_locs = blklocs[val_locs.indexer]
1090-
if blk.should_store(value):
1092+
if inplace and blk.should_store(value):
10911093
blk.set_inplace(blk_locs, value_getitem(val_locs))
10921094
else:
10931095
unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])

pandas/core/series.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1226,7 +1226,7 @@ def _check_is_chained_assignment_possible(self) -> bool:
12261226
return super()._check_is_chained_assignment_possible()
12271227

12281228
def _maybe_update_cacher(
1229-
self, clear: bool = False, verify_is_copy: bool = True
1229+
self, clear: bool = False, verify_is_copy: bool = True, inplace: bool = False
12301230
) -> None:
12311231
"""
12321232
See NDFrame._maybe_update_cacher.__doc__
@@ -1244,13 +1244,15 @@ def _maybe_update_cacher(
12441244
# GH#42530 self.name must be in ref.columns
12451245
# to ensure column still in dataframe
12461246
# otherwise, either self or ref has swapped in new arrays
1247-
ref._maybe_cache_changed(cacher[0], self)
1247+
ref._maybe_cache_changed(cacher[0], self, inplace=inplace)
12481248
else:
12491249
# GH#33675 we have swapped in a new array, so parent
12501250
# reference to self is now invalid
12511251
ref._item_cache.pop(cacher[0], None)
12521252

1253-
super()._maybe_update_cacher(clear=clear, verify_is_copy=verify_is_copy)
1253+
super()._maybe_update_cacher(
1254+
clear=clear, verify_is_copy=verify_is_copy, inplace=inplace
1255+
)
12541256

12551257
# ----------------------------------------------------------------------
12561258
# Unsorted

pandas/tests/frame/indexing/test_indexing.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -537,16 +537,19 @@ def test_getitem_setitem_integer_slice_keyerrors(self):
537537

538538
@td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view
539539
def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame):
540+
540541
sliced = float_string_frame.iloc[:, -3:]
541542
assert sliced["D"].dtype == np.float64
542543

543544
# get view with single block
544545
# setting it triggers setting with copy
545546
sliced = float_frame.iloc[:, -3:]
546547

548+
assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values)
549+
547550
msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
548551
with pytest.raises(com.SettingWithCopyError, match=msg):
549-
sliced["C"] = 4.0
552+
sliced.loc[:, "C"] = 4.0
550553

551554
assert (float_frame["C"] == 4).all()
552555

@@ -1002,9 +1005,11 @@ def test_iloc_row_slice_view(self, using_array_manager):
10021005
# setting it makes it raise/warn
10031006
subset = df.iloc[slice(4, 8)]
10041007

1008+
assert np.shares_memory(df[2], subset[2])
1009+
10051010
msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
10061011
with pytest.raises(com.SettingWithCopyError, match=msg):
1007-
subset[2] = 0.0
1012+
subset.loc[:, 2] = 0.0
10081013

10091014
exp_col = original[2].copy()
10101015
# TODO(ArrayManager) verify it is expected that the original didn't change
@@ -1041,10 +1046,13 @@ def test_iloc_col_slice_view(self, using_array_manager):
10411046

10421047
if not using_array_manager:
10431048
# verify slice is view
1049+
1050+
assert np.shares_memory(df[8]._values, subset[8]._values)
1051+
10441052
# and that we are setting a copy
10451053
msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
10461054
with pytest.raises(com.SettingWithCopyError, match=msg):
1047-
subset[8] = 0.0
1055+
subset.loc[:, 8] = 0.0
10481056

10491057
assert (df[8] == 0).all()
10501058
else:

pandas/tests/frame/indexing/test_setitem.py

+31-6
Original file line numberDiff line numberDiff line change
@@ -1028,12 +1028,6 @@ def test_setitem_duplicate_columns_not_inplace(self):
10281028
)
10291029
def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, request):
10301030
# GH#39510
1031-
if not using_array_manager:
1032-
mark = pytest.mark.xfail(
1033-
reason="Setitem with same dtype still changing inplace"
1034-
)
1035-
request.node.add_marker(mark)
1036-
10371031
cols = ["A", "B"]
10381032
df = DataFrame(0, index=[0, 1], columns=cols)
10391033
df_copy = df.copy()
@@ -1056,3 +1050,34 @@ def test_setitem_listlike_key_scalar_value_not_inplace(self, value):
10561050
expected = DataFrame([[0, 1.0], [0, 1.0]], columns=cols)
10571051
tm.assert_frame_equal(df_view, df_copy)
10581052
tm.assert_frame_equal(df, expected)
1053+
1054+
@pytest.mark.parametrize(
1055+
"indexer",
1056+
[
1057+
"a",
1058+
["a"],
1059+
pytest.param(
1060+
[True, False],
1061+
marks=pytest.mark.xfail(
1062+
reason="Boolean indexer incorrectly setting inplace",
1063+
strict=False, # passing on some builds, no obvious pattern
1064+
),
1065+
),
1066+
],
1067+
)
1068+
@pytest.mark.parametrize(
1069+
"value, set_value",
1070+
[
1071+
(1, 5),
1072+
(1.0, 5.0),
1073+
(Timestamp("2020-12-31"), Timestamp("2021-12-31")),
1074+
("a", "b"),
1075+
],
1076+
)
1077+
def test_setitem_not_operating_inplace(self, value, set_value, indexer):
1078+
# GH#43406
1079+
df = DataFrame({"a": value}, index=[0, 1])
1080+
expected = df.copy()
1081+
view = df[:]
1082+
df[indexer] = set_value
1083+
tm.assert_frame_equal(view, expected)

pandas/tests/frame/methods/test_rename.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,10 @@ def test_rename_multiindex(self):
173173
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) setitem copy/view
174174
def test_rename_nocopy(self, float_frame):
175175
renamed = float_frame.rename(columns={"C": "foo"}, copy=False)
176-
renamed["foo"] = 1.0
176+
177+
assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values)
178+
179+
renamed.loc[:, "foo"] = 1.0
177180
assert (float_frame["C"] == 1.0).all()
178181

179182
def test_rename_inplace(self, float_frame):

pandas/tests/indexing/test_iloc.py

+12-8
Original file line numberDiff line numberDiff line change
@@ -824,20 +824,24 @@ def test_iloc_empty_list_indexer_is_ok(self):
824824
df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
825825
)
826826

827-
def test_identity_slice_returns_new_object(self, using_array_manager):
827+
def test_identity_slice_returns_new_object(self, using_array_manager, request):
828828
# GH13873
829+
if using_array_manager:
830+
mark = pytest.mark.xfail(
831+
reason="setting with .loc[:, 'a'] does not alter inplace"
832+
)
833+
request.node.add_marker(mark)
834+
829835
original_df = DataFrame({"a": [1, 2, 3]})
830836
sliced_df = original_df.iloc[:]
831837
assert sliced_df is not original_df
832838

833839
# should be a shallow copy
834-
original_df["a"] = [4, 4, 4]
835-
if using_array_manager:
836-
# TODO(ArrayManager) verify it is expected that the original didn't change
837-
# setitem is replacing full column, so doesn't update "viewing" dataframe
838-
assert not (sliced_df["a"] == 4).all()
839-
else:
840-
assert (sliced_df["a"] == 4).all()
840+
assert np.shares_memory(original_df["a"], sliced_df["a"])
841+
842+
# Setting using .loc[:, "a"] sets inplace so alters both sliced and orig
843+
original_df.loc[:, "a"] = [4, 4, 4]
844+
assert (sliced_df["a"] == 4).all()
841845

842846
original_series = Series([1, 2, 3, 4, 5, 6])
843847
sliced_series = original_series.iloc[:]

pandas/tests/indexing/test_loc.py

+12-8
Original file line numberDiff line numberDiff line change
@@ -1002,21 +1002,25 @@ def test_loc_empty_list_indexer_is_ok(self):
10021002
df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
10031003
)
10041004

1005-
def test_identity_slice_returns_new_object(self, using_array_manager):
1005+
def test_identity_slice_returns_new_object(self, using_array_manager, request):
10061006
# GH13873
1007+
if using_array_manager:
1008+
mark = pytest.mark.xfail(
1009+
reason="setting with .loc[:, 'a'] does not alter inplace"
1010+
)
1011+
request.node.add_marker(mark)
1012+
10071013
original_df = DataFrame({"a": [1, 2, 3]})
10081014
sliced_df = original_df.loc[:]
10091015
assert sliced_df is not original_df
10101016
assert original_df[:] is not original_df
10111017

10121018
# should be a shallow copy
1013-
original_df["a"] = [4, 4, 4]
1014-
if using_array_manager:
1015-
# TODO(ArrayManager) verify it is expected that the original didn't change
1016-
# setitem is replacing full column, so doesn't update "viewing" dataframe
1017-
assert not (sliced_df["a"] == 4).all()
1018-
else:
1019-
assert (sliced_df["a"] == 4).all()
1019+
assert np.shares_memory(original_df["a"]._values, sliced_df["a"]._values)
1020+
1021+
# Setting using .loc[:, "a"] sets inplace so alters both sliced and orig
1022+
original_df.loc[:, "a"] = [4, 4, 4]
1023+
assert (sliced_df["a"] == 4).all()
10201024

10211025
# These should not return copies
10221026
assert original_df is original_df.loc[:, :]

pandas/tests/internals/test_internals.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -750,7 +750,11 @@ def test_get_numeric_data(self):
750750
)
751751

752752
# Check sharing
753-
numeric.iset(numeric.items.get_loc("float"), np.array([100.0, 200.0, 300.0]))
753+
numeric.iset(
754+
numeric.items.get_loc("float"),
755+
np.array([100.0, 200.0, 300.0]),
756+
inplace=True,
757+
)
754758
tm.assert_almost_equal(
755759
mgr.iget(mgr.items.get_loc("float")).internal_values(),
756760
np.array([100.0, 200.0, 300.0]),
@@ -759,7 +763,9 @@ def test_get_numeric_data(self):
759763
numeric2 = mgr.get_numeric_data(copy=True)
760764
tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"]))
761765
numeric2.iset(
762-
numeric2.items.get_loc("float"), np.array([1000.0, 2000.0, 3000.0])
766+
numeric2.items.get_loc("float"),
767+
np.array([1000.0, 2000.0, 3000.0]),
768+
inplace=True,
763769
)
764770
tm.assert_almost_equal(
765771
mgr.iget(mgr.items.get_loc("float")).internal_values(),
@@ -781,7 +787,7 @@ def test_get_bool_data(self):
781787
bools.iget(bools.items.get_loc("bool")).internal_values(),
782788
)
783789

784-
bools.iset(0, np.array([True, False, True]))
790+
bools.iset(0, np.array([True, False, True]), inplace=True)
785791
tm.assert_numpy_array_equal(
786792
mgr.iget(mgr.items.get_loc("bool")).internal_values(),
787793
np.array([True, False, True]),

pandas/tests/reshape/merge/test_merge.py

+2-15
Original file line numberDiff line numberDiff line change
@@ -308,21 +308,8 @@ def test_merge_nocopy(self, using_array_manager):
308308

309309
merged = merge(left, right, left_index=True, right_index=True, copy=False)
310310

311-
if using_array_manager:
312-
# With ArrayManager, setting a column doesn't change the values inplace
313-
# and thus does not propagate the changes to the original left/right
314-
# dataframes -> need to check that no copy was made in a different way
315-
# TODO(ArrayManager) we should be able to simplify this with a .loc
316-
# setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10
317-
# but this currently replaces the array (_setitem_with_indexer_split_path)
318-
assert merged._mgr.arrays[0] is left._mgr.arrays[0]
319-
assert merged._mgr.arrays[2] is right._mgr.arrays[0]
320-
else:
321-
merged["a"] = 6
322-
assert (left["a"] == 6).all()
323-
324-
merged["d"] = "peekaboo"
325-
assert (right["d"] == "peekaboo").all()
311+
assert np.shares_memory(merged["a"]._values, left["a"]._values)
312+
assert np.shares_memory(merged["d"]._values, right["d"]._values)
326313

327314
def test_intelligently_handle_join_key(self):
328315
# #733, be a bit more 1337 about not returning unconsolidated DataFrame

0 commit comments

Comments
 (0)