Skip to content

Commit de9815c

Browse files
CoW warning mode: setting values into single column of DataFrame (#56020)
1 parent 0255ab3 commit de9815c

27 files changed

+171
-98
lines changed

pandas/core/computation/eval.py

+3
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,9 @@ def eval(
389389
# to use a non-numeric indexer
390390
try:
391391
with warnings.catch_warnings(record=True):
392+
warnings.filterwarnings(
393+
"always", "Setting a value on a view", FutureWarning
394+
)
392395
# TODO: Filter the warnings we actually care about here.
393396
if inplace and isinstance(target, NDFrame):
394397
target.loc[:, assigner] = ret

pandas/core/frame.py

+1
Original file line numberDiff line numberDiff line change
@@ -4854,6 +4854,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
48544854

48554855
inplace = validate_bool_kwarg(inplace, "inplace")
48564856
kwargs["level"] = kwargs.pop("level", 0) + 1
4857+
# TODO(CoW) those index/column resolvers create unnecessary refs to `self`
48574858
index_resolvers = self._get_index_resolvers()
48584859
column_resolvers = self._get_cleaned_column_resolvers()
48594860
resolvers = column_resolvers, index_resolvers

pandas/core/internals/blocks.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1792,13 +1792,14 @@ def round(self, decimals: int, using_cow: bool = False) -> Self:
17921792
# has no attribute "round"
17931793
values = self.values.round(decimals) # type: ignore[union-attr]
17941794
if values is self.values:
1795-
refs = self.refs
17961795
if not using_cow:
17971796
# Normally would need to do this before, but
17981797
# numpy only returns same array when round operation
17991798
# is no-op
18001799
# https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636
18011800
values = values.copy()
1801+
else:
1802+
refs = self.refs
18021803
return self.make_block_same_class(values, refs=refs)
18031804

18041805
# ---------------------------------------------------------------------

pandas/core/internals/managers.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1342,7 +1342,13 @@ def column_setitem(
13421342
This is a method on the BlockManager level, to avoid creating an
13431343
intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
13441344
"""
1345-
if using_copy_on_write() and not self._has_no_reference(loc):
1345+
if warn_copy_on_write() and not self._has_no_reference(loc):
1346+
warnings.warn(
1347+
COW_WARNING_GENERAL_MSG,
1348+
FutureWarning,
1349+
stacklevel=find_stack_level(),
1350+
)
1351+
elif using_copy_on_write() and not self._has_no_reference(loc):
13461352
blkno = self.blknos[loc]
13471353
# Split blocks to only copy the column we want to modify
13481354
blk_loc = self.blklocs[loc]

pandas/tests/computation/test_eval.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -1164,7 +1164,9 @@ def test_assignment_single_assign_new(self):
11641164
df.eval("c = a + b", inplace=True)
11651165
tm.assert_frame_equal(df, expected)
11661166

1167-
def test_assignment_single_assign_local_overlap(self):
1167+
# TODO(CoW-warn) this should not warn (DataFrame.eval creates refs to self)
1168+
@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
1169+
def test_assignment_single_assign_local_overlap(self, warn_copy_on_write):
11681170
df = DataFrame(
11691171
np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
11701172
)
@@ -1218,6 +1220,8 @@ def test_column_in(self):
12181220
tm.assert_series_equal(result, expected, check_names=False)
12191221

12201222
@pytest.mark.xfail(reason="Unknown: Omitted test_ in name prior.")
1223+
# TODO(CoW-warn) this should not warn (DataFrame.eval creates refs to self)
1224+
@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
12211225
def test_assignment_not_inplace(self):
12221226
# see gh-9297
12231227
df = DataFrame(
@@ -1897,13 +1901,14 @@ def test_eval_no_support_column_name(request, column):
18971901
tm.assert_frame_equal(result, expected)
18981902

18991903

1900-
def test_set_inplace(using_copy_on_write):
1904+
def test_set_inplace(using_copy_on_write, warn_copy_on_write):
19011905
# https://github.com/pandas-dev/pandas/issues/47449
19021906
# Ensure we don't only update the DataFrame inplace, but also the actual
19031907
# column values, such that references to this column also get updated
19041908
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
19051909
result_view = df[:]
19061910
ser = df["A"]
1911+
# with tm.assert_cow_warning(warn_copy_on_write):
19071912
df.eval("A = B + C", inplace=True)
19081913
expected = DataFrame({"A": [11, 13, 15], "B": [4, 5, 6], "C": [7, 8, 9]})
19091914
tm.assert_frame_equal(df, expected)

pandas/tests/copy_view/test_constructors.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -242,8 +242,8 @@ def test_dataframe_from_dict_of_series(
242242
assert np.shares_memory(get_array(result, "a"), get_array(s1))
243243

244244
# mutating the new dataframe doesn't mutate original
245-
# TODO(CoW-warn) this should also warn
246-
result.iloc[0, 0] = 10
245+
with tm.assert_cow_warning(warn_copy_on_write):
246+
result.iloc[0, 0] = 10
247247
if using_copy_on_write:
248248
assert not np.shares_memory(get_array(result, "a"), get_array(s1))
249249
tm.assert_series_equal(s1, s1_orig)

pandas/tests/copy_view/test_core_functionalities.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -28,27 +28,32 @@ def test_setitem_dont_track_unnecessary_references(using_copy_on_write):
2828
assert np.shares_memory(arr, get_array(df, "a"))
2929

3030

31-
def test_setitem_with_view_copies(using_copy_on_write):
31+
def test_setitem_with_view_copies(using_copy_on_write, warn_copy_on_write):
3232
df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
3333
view = df[:]
3434
expected = df.copy()
3535

3636
df["b"] = 100
3737
arr = get_array(df, "a")
38-
df.iloc[0, 0] = 100 # Check that we correctly track reference
38+
with tm.assert_cow_warning(warn_copy_on_write):
39+
df.iloc[0, 0] = 100 # Check that we correctly track reference
3940
if using_copy_on_write:
4041
assert not np.shares_memory(arr, get_array(df, "a"))
4142
tm.assert_frame_equal(view, expected)
4243

4344

44-
def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write, request):
45+
def test_setitem_with_view_invalidated_does_not_copy(
46+
using_copy_on_write, warn_copy_on_write, request
47+
):
4548
df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
4649
view = df[:]
4750

4851
df["b"] = 100
4952
arr = get_array(df, "a")
5053
view = None # noqa: F841
51-
df.iloc[0, 0] = 100
54+
# TODO(CoW-warn) false positive?
55+
with tm.assert_cow_warning(warn_copy_on_write):
56+
df.iloc[0, 0] = 100
5257
if using_copy_on_write:
5358
# Setitem split the block. Since the old block shared data with view
5459
# all the new blocks are referencing view and each other. When view

pandas/tests/copy_view/test_indexing.py

+33-22
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def test_subset_column_selection_modify_parent(backend, using_copy_on_write):
101101
tm.assert_frame_equal(subset, expected)
102102

103103

104-
def test_subset_row_slice(backend, using_copy_on_write):
104+
def test_subset_row_slice(backend, using_copy_on_write, warn_copy_on_write):
105105
# Case: taking a subset of the rows of a DataFrame using a slice
106106
# + afterwards modifying the subset
107107
_, DataFrame, _ = backend
@@ -121,7 +121,8 @@ def test_subset_row_slice(backend, using_copy_on_write):
121121
# INFO this no longer raises a warning since pandas 1.4
122122
# with pd.option_context("chained_assignment", "warn"):
123123
# with tm.assert_produces_warning(SettingWithCopyWarning):
124-
subset.iloc[0, 0] = 0
124+
with tm.assert_cow_warning(warn_copy_on_write):
125+
subset.iloc[0, 0] = 0
125126

126127
subset._mgr._verify_integrity()
127128

@@ -334,8 +335,11 @@ def test_subset_set_with_row_indexer(
334335
pytest.skip("setitem with labels selects on columns")
335336

336337
# TODO(CoW-warn) should warn
337-
if using_copy_on_write or warn_copy_on_write:
338+
if using_copy_on_write:
338339
indexer_si(subset)[indexer] = 0
340+
elif warn_copy_on_write:
341+
with tm.assert_cow_warning():
342+
indexer_si(subset)[indexer] = 0
339343
else:
340344
# INFO iloc no longer raises warning since pandas 1.4
341345
warn = SettingWithCopyWarning if indexer_si is tm.setitem else None
@@ -419,7 +423,7 @@ def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write):
419423
"dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
420424
)
421425
def test_subset_set_column_with_loc(
422-
backend, using_copy_on_write, using_array_manager, dtype
426+
backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype
423427
):
424428
# Case: setting a single column with loc on a viewing subset
425429
# -> subset.loc[:, col] = value
@@ -432,6 +436,9 @@ def test_subset_set_column_with_loc(
432436

433437
if using_copy_on_write:
434438
subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
439+
elif warn_copy_on_write:
440+
with tm.assert_cow_warning():
441+
subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
435442
else:
436443
with pd.option_context("chained_assignment", "warn"):
437444
with tm.assert_produces_warning(
@@ -455,7 +462,9 @@ def test_subset_set_column_with_loc(
455462
tm.assert_frame_equal(df, df_orig)
456463

457464

458-
def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_manager):
465+
def test_subset_set_column_with_loc2(
466+
backend, using_copy_on_write, warn_copy_on_write, using_array_manager
467+
):
459468
# Case: setting a single column with loc on a viewing subset
460469
# -> subset.loc[:, col] = value
461470
# separate test for case of DataFrame of a single column -> takes a separate
@@ -467,6 +476,9 @@ def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_m
467476

468477
if using_copy_on_write:
469478
subset.loc[:, "a"] = 0
479+
elif warn_copy_on_write:
480+
with tm.assert_cow_warning():
481+
subset.loc[:, "a"] = 0
470482
else:
471483
with pd.option_context("chained_assignment", "warn"):
472484
with tm.assert_produces_warning(
@@ -528,7 +540,9 @@ def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dt
528540
[slice("a", "b"), np.array([True, True, False]), ["a", "b"]],
529541
ids=["slice", "mask", "array"],
530542
)
531-
def test_subset_set_with_column_indexer(backend, indexer, using_copy_on_write):
543+
def test_subset_set_with_column_indexer(
544+
backend, indexer, using_copy_on_write, warn_copy_on_write
545+
):
532546
# Case: setting multiple columns with a column indexer on a viewing subset
533547
# -> subset.loc[:, [col1, col2]] = value
534548
_, DataFrame, _ = backend
@@ -538,6 +552,9 @@ def test_subset_set_with_column_indexer(backend, indexer, using_copy_on_write):
538552

539553
if using_copy_on_write:
540554
subset.loc[:, indexer] = 0
555+
elif warn_copy_on_write:
556+
with tm.assert_cow_warning():
557+
subset.loc[:, indexer] = 0
541558
else:
542559
with pd.option_context("chained_assignment", "warn"):
543560
# As of 2.0, this setitem attempts (successfully) to set values
@@ -659,10 +676,7 @@ def test_subset_chained_getitem_column(
659676
# modify parent -> don't modify subset
660677
subset = df[:]["a"][0:2]
661678
df._clear_item_cache()
662-
# TODO(CoW-warn) should also warn for mixed block and nullable dtypes
663-
with tm.assert_cow_warning(
664-
warn_copy_on_write and dtype == "int64" and dtype_backend == "numpy"
665-
):
679+
with tm.assert_cow_warning(warn_copy_on_write):
666680
df.iloc[0, 0] = 0
667681
expected = Series([1, 2], name="a")
668682
if using_copy_on_write:
@@ -765,8 +779,7 @@ def test_null_slice(backend, method, using_copy_on_write, warn_copy_on_write):
765779
assert df2 is not df
766780

767781
# and those trigger CoW when mutated
768-
# TODO(CoW-warn) should also warn for nullable dtypes
769-
with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"):
782+
with tm.assert_cow_warning(warn_copy_on_write):
770783
df2.iloc[0, 0] = 0
771784
if using_copy_on_write:
772785
tm.assert_frame_equal(df, df_orig)
@@ -884,10 +897,10 @@ def test_series_subset_set_with_indexer(
884897
# del operator
885898

886899

887-
def test_del_frame(backend, using_copy_on_write):
900+
def test_del_frame(backend, using_copy_on_write, warn_copy_on_write):
888901
# Case: deleting a column with `del` on a viewing child dataframe should
889902
# not modify parent + update the references
890-
_, DataFrame, _ = backend
903+
dtype_backend, DataFrame, _ = backend
891904
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
892905
df_orig = df.copy()
893906
df2 = df[:]
@@ -901,11 +914,14 @@ def test_del_frame(backend, using_copy_on_write):
901914
tm.assert_frame_equal(df2, df_orig[["a", "c"]])
902915
df2._mgr._verify_integrity()
903916

904-
df.loc[0, "b"] = 200
917+
# TODO(CoW-warn) false positive, this should not warn?
918+
with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"):
919+
df.loc[0, "b"] = 200
905920
assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
906921
df_orig = df.copy()
907922

908-
df2.loc[0, "a"] = 100
923+
with tm.assert_cow_warning(warn_copy_on_write):
924+
df2.loc[0, "a"] = 100
909925
if using_copy_on_write:
910926
# modifying child after deleting a column still doesn't update parent
911927
tm.assert_frame_equal(df, df_orig)
@@ -1109,7 +1125,6 @@ def test_set_value_copy_only_necessary_column(
11091125
):
11101126
# When setting inplace, only copy column that is modified instead of the whole
11111127
# block (by splitting the block)
1112-
single_block = isinstance(col[0], int)
11131128
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col})
11141129
df_orig = df.copy()
11151130
view = df[:]
@@ -1120,11 +1135,7 @@ def test_set_value_copy_only_necessary_column(
11201135
):
11211136
indexer_func(df)[indexer] = val
11221137
else:
1123-
# TODO(CoW-warn) should also warn in the other cases
1124-
with tm.assert_cow_warning(
1125-
warn_copy_on_write
1126-
and not (indexer[0] == slice(None) or (not single_block and val == 100))
1127-
):
1138+
with tm.assert_cow_warning(warn_copy_on_write):
11281139
indexer_func(df)[indexer] = val
11291140

11301141
if using_copy_on_write:

pandas/tests/copy_view/test_interp_fillna.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -324,11 +324,12 @@ def test_fillna_ea_noop_shares_memory(
324324

325325

326326
def test_fillna_inplace_ea_noop_shares_memory(
327-
using_copy_on_write, any_numeric_ea_and_arrow_dtype
327+
using_copy_on_write, warn_copy_on_write, any_numeric_ea_and_arrow_dtype
328328
):
329329
df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype)
330330
df_orig = df.copy()
331331
view = df[:]
332+
# TODO(CoW-warn)
332333
df.fillna(100, inplace=True)
333334

334335
if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
@@ -342,7 +343,9 @@ def test_fillna_inplace_ea_noop_shares_memory(
342343
assert not df._mgr._has_no_reference(1)
343344
assert not view._mgr._has_no_reference(1)
344345

345-
df.iloc[0, 1] = 100
346+
# TODO(CoW-warn) should this warn for ArrowDtype?
347+
with tm.assert_cow_warning(warn_copy_on_write):
348+
df.iloc[0, 1] = 100
346349
if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
347350
tm.assert_frame_equal(df_orig, view)
348351
else:

0 commit comments

Comments
 (0)