Skip to content

CoW warning mode: add warning for single block setitem #55838

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from pandas._config import (
get_option,
using_copy_on_write,
warn_copy_on_write,
)
from pandas._config.config import _get_option

Expand Down Expand Up @@ -4538,7 +4539,7 @@ def _clear_item_cache(self) -> None:

def _get_item_cache(self, item: Hashable) -> Series:
"""Return the cached item, item represents a label indexer."""
if using_copy_on_write():
if using_copy_on_write() or warn_copy_on_write():
loc = self.columns.get_loc(item)
return self._ixs(loc, axis=1)
Comment on lines 4540 to 4544
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I needed to make a decision here about what to do with the item cache in the warning mode.

In the end, the current version of the PR updates the warning mode to also not populate and use the item cache, like for Cow mode.

This will cause some behaviour changes when enabling the warning mode. But it also avoids a bunch of otherwise false-positive warnings. So it's a trade-off.

Examples that would otherwise give false positives:

df =  ..
df["col"].mean()
# now mutating `df` would trigger CoW / raise warning because the df["col"] series is cached

or whenever we access columns under the hood in a method, like df2 = df.astype({..}), even when df2 fully consists of copied data because there was an actual cast, df will have referenced because of the item cache.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine by me


Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -12392,7 +12392,7 @@ def _inplace_method(self, other, op) -> Self:
"""
warn = True
if not PYPY and warn_copy_on_write():
if sys.getrefcount(self) <= 5:
if sys.getrefcount(self) <= 4:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did you arrive at the conclusion to change this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From testing in practice, but my assumption is that it is the consequence of removing the item cache, which means one reference less (so this change sounds logical in hindsight)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yeah that makes a lot of sense

Thx

# we are probably in an inplace setitem context (e.g. df['a'] += 1)
warn = False

Expand Down
7 changes: 5 additions & 2 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import lib
from pandas._libs.tslibs import OutOfBoundsDatetime
Expand Down Expand Up @@ -966,7 +969,7 @@ def is_in_axis(key) -> bool:
def is_in_obj(gpr) -> bool:
if not hasattr(gpr, "name"):
return False
if using_copy_on_write():
if using_copy_on_write() or warn_copy_on_write():
# For the CoW case, we check the references to determine if the
# series is part of the object
try:
Expand Down
28 changes: 25 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@
from pandas.api.extensions import ExtensionArray


COW_WARNING_GENERAL_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
You are mutating a Series or DataFrame object, and currently this mutation will
also have effect on other Series or DataFrame objects that share data with this
object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
will never modify another.
"""


COW_WARNING_SETITEM_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
Currently, the mutation will also have effect on the object that shares data
Expand Down Expand Up @@ -387,7 +396,14 @@ def setitem(self, indexer, value) -> Self:
if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
raise ValueError(f"Cannot set values with ndim > {self.ndim}")

if using_copy_on_write() and not self._has_no_reference(0):
if warn_copy_on_write() and not self._has_no_reference(0):
warnings.warn(
COW_WARNING_GENERAL_MSG,
FutureWarning,
stacklevel=find_stack_level(),
)

elif using_copy_on_write() and not self._has_no_reference(0):
# this method is only called if there is a single block -> hardcoded 0
# Split blocks to only copy the columns we want to modify
if self.ndim == 2 and isinstance(indexer, tuple):
Expand Down Expand Up @@ -1951,9 +1967,15 @@ def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self:
return type(self)(blk.copy(deep=False), self.index)
array = blk.values[indexer]

if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "b":
# boolean indexing always gives a copy with numpy
refs = None
else:
# TODO(CoW) in theory only need to track reference if new_array is a view
refs = blk.refs

bp = BlockPlacement(slice(0, len(array)))
# TODO(CoW) in theory only need to track reference if new_array is a view
block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs)
block = type(blk)(array, placement=bp, ndim=1, refs=refs)

new_idx = self.index[indexer]
return type(self)(block, new_idx)
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1453,7 +1453,7 @@ def test_apply_dtype(col):
tm.assert_series_equal(result, expected)


def test_apply_mutating(using_array_manager, using_copy_on_write):
def test_apply_mutating(using_array_manager, using_copy_on_write, warn_copy_on_write):
# GH#35462 case where applied func pins a new BlockManager to a row
df = DataFrame({"a": range(100), "b": range(100, 200)})
df_orig = df.copy()
Expand All @@ -1467,7 +1467,8 @@ def func(row):
expected = df.copy()
expected["a"] += 1

result = df.apply(func, axis=1)
with tm.assert_cow_warning(warn_copy_on_write):
result = df.apply(func, axis=1)

tm.assert_frame_equal(result, expected)
if using_copy_on_write or using_array_manager:
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/copy_view/index/test_datetimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
"ignore:Setting a value on a view:FutureWarning"
)


@pytest.mark.parametrize(
"cons",
Expand Down
30 changes: 18 additions & 12 deletions pandas/tests/copy_view/index/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ def index_view(index_data=[1, 2]):
return idx, view


def test_set_index_update_column(using_copy_on_write):
def test_set_index_update_column(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1})
df = df.set_index("a", drop=False)
expected = df.index.copy(deep=True)
df.iloc[0, 0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a false positive to me, shouldn't set_index copy the data in non-Cow?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't set_index copy the data in non-Cow?

You would assume so, but no .. (that's kind of a horrible bug I would say):

In [15]: df = DataFrame({"a": [1, 2], "b": 1})

In [16]: df = df.set_index("a", drop=False)

In [17]: df
Out[17]: 
   a  b
a      
1  1  1
2  2  1

In [18]: df.loc[1, 'a'] = 100

In [19]: df
Out[19]: 
       a  b
a          
100  100  1
2      2  1

See #42934 (it seems I actually have an old draft PR for this)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yikes this is terrible and unexpected.

df.iloc[0, 0] = 100
if using_copy_on_write:
tm.assert_index_equal(df.index, expected)
else:
Expand All @@ -39,49 +40,53 @@ def test_set_index_drop_update_column(using_copy_on_write):
tm.assert_index_equal(df.index, expected)


def test_set_index_series(using_copy_on_write):
def test_set_index_series(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1.5})
ser = Series([10, 11])
df = df.set_index(ser)
expected = df.index.copy(deep=True)
ser.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 100
if using_copy_on_write:
tm.assert_index_equal(df.index, expected)
else:
tm.assert_index_equal(df.index, Index([100, 11]))


def test_assign_index_as_series(using_copy_on_write):
def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1.5})
ser = Series([10, 11])
df.index = ser
expected = df.index.copy(deep=True)
ser.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 100
if using_copy_on_write:
tm.assert_index_equal(df.index, expected)
else:
tm.assert_index_equal(df.index, Index([100, 11]))


def test_assign_index_as_index(using_copy_on_write):
def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1.5})
ser = Series([10, 11])
rhs_index = Index(ser)
df.index = rhs_index
rhs_index = None # overwrite to clear reference
expected = df.index.copy(deep=True)
ser.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 100
if using_copy_on_write:
tm.assert_index_equal(df.index, expected)
else:
tm.assert_index_equal(df.index, Index([100, 11]))


def test_index_from_series(using_copy_on_write):
def test_index_from_series(using_copy_on_write, warn_copy_on_write):
ser = Series([1, 2])
idx = Index(ser)
expected = idx.copy(deep=True)
ser.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 100
if using_copy_on_write:
tm.assert_index_equal(idx, expected)
else:
Expand All @@ -96,12 +101,13 @@ def test_index_from_series_copy(using_copy_on_write):
assert np.shares_memory(get_array(ser), arr)


def test_index_from_index(using_copy_on_write):
def test_index_from_index(using_copy_on_write, warn_copy_on_write):
ser = Series([1, 2])
idx = Index(ser)
idx = Index(idx)
expected = idx.copy(deep=True)
ser.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 100
if using_copy_on_write:
tm.assert_index_equal(idx, expected)
else:
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/copy_view/index/test_periodindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
"ignore:Setting a value on a view:FutureWarning"
)


@pytest.mark.parametrize(
"cons",
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/copy_view/index/test_timedeltaindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
"ignore:Setting a value on a view:FutureWarning"
)


@pytest.mark.parametrize(
"cons",
Expand Down
40 changes: 27 additions & 13 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@


@pytest.mark.parametrize("dtype", [None, "int64"])
def test_series_from_series(dtype, using_copy_on_write):
def test_series_from_series(dtype, using_copy_on_write, warn_copy_on_write):
# Case: constructing a Series from another Series object follows CoW rules:
# a new object is returned and thus mutations are not propagated
ser = Series([1, 2, 3], name="name")
Expand All @@ -43,7 +43,8 @@ def test_series_from_series(dtype, using_copy_on_write):
assert not np.shares_memory(get_array(ser), get_array(result))
else:
# mutating shallow copy does mutate original
result.iloc[0] = 0
with tm.assert_cow_warning(warn_copy_on_write):
result.iloc[0] = 0
assert ser.iloc[0] == 0
# and still shares memory
assert np.shares_memory(get_array(ser), get_array(result))
Expand All @@ -57,11 +58,12 @@ def test_series_from_series(dtype, using_copy_on_write):
assert result.iloc[0] == 1
else:
# mutating original does mutate shallow copy
ser.iloc[0] = 0
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 0
assert result.iloc[0] == 0


def test_series_from_series_with_reindex(using_copy_on_write):
def test_series_from_series_with_reindex(using_copy_on_write, warn_copy_on_write):
# Case: constructing a Series from another Series with specifying an index
# that potentially requires a reindex of the values
ser = Series([1, 2, 3], name="name")
Expand All @@ -76,7 +78,8 @@ def test_series_from_series_with_reindex(using_copy_on_write):
]:
result = Series(ser, index=index)
assert np.shares_memory(ser.values, result.values)
result.iloc[0] = 0
with tm.assert_cow_warning(warn_copy_on_write):
result.iloc[0] = 0
if using_copy_on_write:
assert ser.iloc[0] == 1
else:
Expand Down Expand Up @@ -153,6 +156,7 @@ def test_series_from_index_different_dtypes(using_copy_on_write):
assert ser._mgr._has_no_reference(0)


@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
@pytest.mark.parametrize("fastpath", [False, True])
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
Expand Down Expand Up @@ -186,7 +190,9 @@ def test_series_from_block_manager_different_dtype(using_copy_on_write):

@pytest.mark.parametrize("use_mgr", [True, False])
@pytest.mark.parametrize("columns", [None, ["a"]])
def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, use_mgr):
def test_dataframe_constructor_mgr_or_df(
using_copy_on_write, warn_copy_on_write, columns, use_mgr
):
df = DataFrame({"a": [1, 2, 3]})
df_orig = df.copy()

Expand All @@ -201,7 +207,8 @@ def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, use_mgr):
new_df = DataFrame(data)

assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
new_df.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write and not use_mgr):
new_df.iloc[0] = 100

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
Expand All @@ -215,7 +222,7 @@ def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, use_mgr):
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]])
def test_dataframe_from_dict_of_series(
request, using_copy_on_write, columns, index, dtype
request, using_copy_on_write, warn_copy_on_write, columns, index, dtype
):
# Case: constructing a DataFrame from Series objects with copy=False
# has to do a lazy following CoW rules
Expand All @@ -235,6 +242,7 @@ def test_dataframe_from_dict_of_series(
assert np.shares_memory(get_array(result, "a"), get_array(s1))

# mutating the new dataframe doesn't mutate original
# TODO(CoW-warn) this should also warn
result.iloc[0, 0] = 10
if using_copy_on_write:
assert not np.shares_memory(get_array(result, "a"), get_array(s1))
Expand All @@ -248,7 +256,8 @@ def test_dataframe_from_dict_of_series(
result = DataFrame(
{"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False
)
s1.iloc[0] = 10
with tm.assert_cow_warning(warn_copy_on_write):
s1.iloc[0] = 10
if using_copy_on_write:
assert not np.shares_memory(get_array(result, "a"), get_array(s1))
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -278,15 +287,19 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype):
@pytest.mark.parametrize(
"data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)]
)
def test_dataframe_from_series_or_index(using_copy_on_write, data, dtype, cons):
def test_dataframe_from_series_or_index(
using_copy_on_write, warn_copy_on_write, data, dtype, cons
):
obj = cons(data, dtype=dtype)
obj_orig = obj.copy()
df = DataFrame(obj, dtype=dtype)
assert np.shares_memory(get_array(obj), get_array(df, 0))
if using_copy_on_write:
assert not df._mgr._has_no_reference(0)

df.iloc[0, 0] = data[-1]
# TODO(CoW-warn) should not warn for an index?
with tm.assert_cow_warning(warn_copy_on_write):
df.iloc[0, 0] = data[-1]
if using_copy_on_write:
tm.assert_equal(obj, obj_orig)

Expand Down Expand Up @@ -341,15 +354,16 @@ def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager):
assert np.shares_memory(get_array(df, 0), arr)


def test_dataframe_from_records_with_dataframe(using_copy_on_write):
def test_dataframe_from_records_with_dataframe(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2, 3]})
df_orig = df.copy()
with tm.assert_produces_warning(FutureWarning):
df2 = DataFrame.from_records(df)
if using_copy_on_write:
assert not df._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
df2.iloc[0, 0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
df2.iloc[0, 0] = 100
if using_copy_on_write:
tm.assert_frame_equal(df, df_orig)
else:
Expand Down
Loading