Skip to content

CoW warning mode: setting values into single column of DataFrame #56020

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pandas/core/computation/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,9 @@ def eval(
# to use a non-numeric indexer
try:
with warnings.catch_warnings(record=True):
warnings.filterwarnings(
"always", "Setting a value on a view", FutureWarning
)
# TODO: Filter the warnings we actually care about here.
if inplace and isinstance(target, NDFrame):
target.loc[:, assigner] = ret
Expand Down
1 change: 1 addition & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4857,6 +4857,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:

inplace = validate_bool_kwarg(inplace, "inplace")
kwargs["level"] = kwargs.pop("level", 0) + 1
# TODO(CoW) those index/column resolvers create unnecessary refs to `self`
index_resolvers = self._get_index_resolvers()
column_resolvers = self._get_cleaned_column_resolvers()
resolvers = column_resolvers, index_resolvers
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1737,13 +1737,14 @@ def round(self, decimals: int, using_cow: bool = False) -> Self:
# has no attribute "round"
values = self.values.round(decimals) # type: ignore[union-attr]
if values is self.values:
refs = self.refs
if not using_cow:
# Normally would need to do this before, but
# numpy only returns same array when round operation
# is no-op
# https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636
values = values.copy()
else:
refs = self.refs
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We were incorrectly adding forwarding refs always, even when we did make a copy. For CoW this doesn't cause anything wrong, but for the warning mode this gave false positive warnings.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah sorry for that, I didn't care much about non CoW when adding all of those

return self.make_block_same_class(values, refs=refs)

# ---------------------------------------------------------------------
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1332,7 +1332,13 @@ def column_setitem(
This is a method on the BlockManager level, to avoid creating an
intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
"""
if using_copy_on_write() and not self._has_no_reference(loc):
if warn_copy_on_write() and not self._has_no_reference(loc):
warnings.warn(
COW_WARNING_GENERAL_MSG,
FutureWarning,
stacklevel=find_stack_level(),
)
elif using_copy_on_write() and not self._has_no_reference(loc):
blkno = self.blknos[loc]
# Split blocks to only copy the column we want to modify
blk_loc = self.blklocs[loc]
Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/computation/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1164,7 +1164,9 @@ def test_assignment_single_assign_new(self):
df.eval("c = a + b", inplace=True)
tm.assert_frame_equal(df, expected)

def test_assignment_single_assign_local_overlap(self):
# TODO(CoW-warn) this should not warn (DataFrame.eval creates refs to self)
@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
def test_assignment_single_assign_local_overlap(self, warn_copy_on_write):
df = DataFrame(
np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
)
Expand Down Expand Up @@ -1218,6 +1220,8 @@ def test_column_in(self):
tm.assert_series_equal(result, expected, check_names=False)

@pytest.mark.xfail(reason="Unknown: Omitted test_ in name prior.")
# TODO(CoW-warn) this should not warn (DataFrame.eval creates refs to self)
@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
def test_assignment_not_inplace(self):
# see gh-9297
df = DataFrame(
Expand Down Expand Up @@ -1897,13 +1901,14 @@ def test_eval_no_support_column_name(request, column):
tm.assert_frame_equal(result, expected)


def test_set_inplace(using_copy_on_write):
def test_set_inplace(using_copy_on_write, warn_copy_on_write):
# https://github.com/pandas-dev/pandas/issues/47449
# Ensure we don't only update the DataFrame inplace, but also the actual
# column values, such that references to this column also get updated
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
result_view = df[:]
ser = df["A"]
# with tm.assert_cow_warning(warn_copy_on_write):
df.eval("A = B + C", inplace=True)
expected = DataFrame({"A": [11, 13, 15], "B": [4, 5, 6], "C": [7, 8, 9]})
tm.assert_frame_equal(df, expected)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,8 @@ def test_dataframe_from_dict_of_series(
assert np.shares_memory(get_array(result, "a"), get_array(s1))

# mutating the new dataframe doesn't mutate original
# TODO(CoW-warn) this should also warn
result.iloc[0, 0] = 10
with tm.assert_cow_warning(warn_copy_on_write):
result.iloc[0, 0] = 10
if using_copy_on_write:
assert not np.shares_memory(get_array(result, "a"), get_array(s1))
tm.assert_series_equal(s1, s1_orig)
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/copy_view/test_core_functionalities.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,27 +28,32 @@ def test_setitem_dont_track_unnecessary_references(using_copy_on_write):
assert np.shares_memory(arr, get_array(df, "a"))


def test_setitem_with_view_copies(using_copy_on_write):
def test_setitem_with_view_copies(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
view = df[:]
expected = df.copy()

df["b"] = 100
arr = get_array(df, "a")
df.iloc[0, 0] = 100 # Check that we correctly track reference
with tm.assert_cow_warning(warn_copy_on_write):
df.iloc[0, 0] = 100 # Check that we correctly track reference
if using_copy_on_write:
assert not np.shares_memory(arr, get_array(df, "a"))
tm.assert_frame_equal(view, expected)


def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write, request):
def test_setitem_with_view_invalidated_does_not_copy(
using_copy_on_write, warn_copy_on_write, request
):
df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
view = df[:]

df["b"] = 100
arr = get_array(df, "a")
view = None # noqa: F841
df.iloc[0, 0] = 100
# TODO(CoW-warn) false positive?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, comment explaining this is 3 lines below

with tm.assert_cow_warning(warn_copy_on_write):
df.iloc[0, 0] = 100
if using_copy_on_write:
# Setitem split the block. Since the old block shared data with view
# all the new blocks are referencing view and each other. When view
Expand Down
55 changes: 33 additions & 22 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def test_subset_column_selection_modify_parent(backend, using_copy_on_write):
tm.assert_frame_equal(subset, expected)


def test_subset_row_slice(backend, using_copy_on_write):
def test_subset_row_slice(backend, using_copy_on_write, warn_copy_on_write):
# Case: taking a subset of the rows of a DataFrame using a slice
# + afterwards modifying the subset
_, DataFrame, _ = backend
Expand All @@ -121,7 +121,8 @@ def test_subset_row_slice(backend, using_copy_on_write):
# INFO this no longer raise warning since pandas 1.4
# with pd.option_context("chained_assignment", "warn"):
# with tm.assert_produces_warning(SettingWithCopyWarning):
subset.iloc[0, 0] = 0
with tm.assert_cow_warning(warn_copy_on_write):
subset.iloc[0, 0] = 0

subset._mgr._verify_integrity()

Expand Down Expand Up @@ -334,8 +335,11 @@ def test_subset_set_with_row_indexer(
pytest.skip("setitem with labels selects on columns")

# TODO(CoW-warn) should warn
if using_copy_on_write or warn_copy_on_write:
if using_copy_on_write:
indexer_si(subset)[indexer] = 0
elif warn_copy_on_write:
with tm.assert_cow_warning():
indexer_si(subset)[indexer] = 0
else:
# INFO iloc no longer raises warning since pandas 1.4
warn = SettingWithCopyWarning if indexer_si is tm.setitem else None
Expand Down Expand Up @@ -419,7 +423,7 @@ def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write):
"dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_set_column_with_loc(
backend, using_copy_on_write, using_array_manager, dtype
backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype
):
# Case: setting a single column with loc on a viewing subset
# -> subset.loc[:, col] = value
Expand All @@ -432,6 +436,9 @@ def test_subset_set_column_with_loc(

if using_copy_on_write:
subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
elif warn_copy_on_write:
with tm.assert_cow_warning():
subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
else:
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(
Expand All @@ -455,7 +462,9 @@ def test_subset_set_column_with_loc(
tm.assert_frame_equal(df, df_orig)


def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_manager):
def test_subset_set_column_with_loc2(
backend, using_copy_on_write, warn_copy_on_write, using_array_manager
):
# Case: setting a single column with loc on a viewing subset
# -> subset.loc[:, col] = value
# separate test for case of DataFrame of a single column -> takes a separate
Expand All @@ -467,6 +476,9 @@ def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_m

if using_copy_on_write:
subset.loc[:, "a"] = 0
elif warn_copy_on_write:
with tm.assert_cow_warning():
subset.loc[:, "a"] = 0
else:
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(
Expand Down Expand Up @@ -528,7 +540,9 @@ def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dt
[slice("a", "b"), np.array([True, True, False]), ["a", "b"]],
ids=["slice", "mask", "array"],
)
def test_subset_set_with_column_indexer(backend, indexer, using_copy_on_write):
def test_subset_set_with_column_indexer(
backend, indexer, using_copy_on_write, warn_copy_on_write
):
# Case: setting multiple columns with a column indexer on a viewing subset
# -> subset.loc[:, [col1, col2]] = value
_, DataFrame, _ = backend
Expand All @@ -538,6 +552,9 @@ def test_subset_set_with_column_indexer(backend, indexer, using_copy_on_write):

if using_copy_on_write:
subset.loc[:, indexer] = 0
elif warn_copy_on_write:
with tm.assert_cow_warning():
subset.loc[:, indexer] = 0
else:
with pd.option_context("chained_assignment", "warn"):
# As of 2.0, this setitem attempts (successfully) to set values
Expand Down Expand Up @@ -659,10 +676,7 @@ def test_subset_chained_getitem_column(
# modify parent -> don't modify subset
subset = df[:]["a"][0:2]
df._clear_item_cache()
# TODO(CoW-warn) should also warn for mixed block and nullable dtypes
with tm.assert_cow_warning(
warn_copy_on_write and dtype == "int64" and dtype_backend == "numpy"
):
with tm.assert_cow_warning(warn_copy_on_write):
df.iloc[0, 0] = 0
expected = Series([1, 2], name="a")
if using_copy_on_write:
Expand Down Expand Up @@ -765,8 +779,7 @@ def test_null_slice(backend, method, using_copy_on_write, warn_copy_on_write):
assert df2 is not df

# and those trigger CoW when mutated
# TODO(CoW-warn) should also warn for nullable dtypes
with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"):
with tm.assert_cow_warning(warn_copy_on_write):
df2.iloc[0, 0] = 0
if using_copy_on_write:
tm.assert_frame_equal(df, df_orig)
Expand Down Expand Up @@ -884,10 +897,10 @@ def test_series_subset_set_with_indexer(
# del operator


def test_del_frame(backend, using_copy_on_write):
def test_del_frame(backend, using_copy_on_write, warn_copy_on_write):
# Case: deleting a column with `del` on a viewing child dataframe should
# not modify parent + update the references
_, DataFrame, _ = backend
dtype_backend, DataFrame, _ = backend
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_orig = df.copy()
df2 = df[:]
Expand All @@ -901,11 +914,14 @@ def test_del_frame(backend, using_copy_on_write):
tm.assert_frame_equal(df2, df_orig[["a", "c"]])
df2._mgr._verify_integrity()

df.loc[0, "b"] = 200
# TODO(CoW-warn) false positive, this should not warn?
with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"):
df.loc[0, "b"] = 200
assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
df_orig = df.copy()

df2.loc[0, "a"] = 100
with tm.assert_cow_warning(warn_copy_on_write):
df2.loc[0, "a"] = 100
if using_copy_on_write:
# modifying child after deleting a column still doesn't update parent
tm.assert_frame_equal(df, df_orig)
Expand Down Expand Up @@ -1109,7 +1125,6 @@ def test_set_value_copy_only_necessary_column(
):
# When setting inplace, only copy column that is modified instead of the whole
# block (by splitting the block)
single_block = isinstance(col[0], int)
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col})
df_orig = df.copy()
view = df[:]
Expand All @@ -1120,11 +1135,7 @@ def test_set_value_copy_only_necessary_column(
):
indexer_func(df)[indexer] = val
else:
# TODO(CoW-warn) should also warn in the other cases
with tm.assert_cow_warning(
warn_copy_on_write
and not (indexer[0] == slice(None) or (not single_block and val == 100))
):
with tm.assert_cow_warning(warn_copy_on_write):
indexer_func(df)[indexer] = val

if using_copy_on_write:
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/copy_view/test_interp_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,11 +324,12 @@ def test_fillna_ea_noop_shares_memory(


def test_fillna_inplace_ea_noop_shares_memory(
using_copy_on_write, any_numeric_ea_and_arrow_dtype
using_copy_on_write, warn_copy_on_write, any_numeric_ea_and_arrow_dtype
):
df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype)
df_orig = df.copy()
view = df[:]
# TODO(CoW-warn)
df.fillna(100, inplace=True)

if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
Expand All @@ -342,7 +343,9 @@ def test_fillna_inplace_ea_noop_shares_memory(
assert not df._mgr._has_no_reference(1)
assert not view._mgr._has_no_reference(1)

df.iloc[0, 1] = 100
# TODO(CoW-warn) should this warn for ArrowDtype?
with tm.assert_cow_warning(warn_copy_on_write):
df.iloc[0, 1] = 100
if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
tm.assert_frame_equal(df_orig, view)
else:
Expand Down
Loading