Skip to content

CoW: Warn for cases that go through putmask #56168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion pandas/core/internals/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import (
algos as libalgos,
Expand Down Expand Up @@ -49,6 +52,11 @@
)


class _AlreadyWarned:
def __init__(self):
self.warned_already = False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a small comment here about why such a class is needed (in contrast to a simple boolean argument)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added



class DataManager(PandasObject):
# TODO share more methods/attributes

Expand Down Expand Up @@ -203,12 +211,17 @@ def putmask(self, mask, new, align: bool = True) -> Self:
align_keys = ["mask"]
new = extract_array(new, extract_numpy=True)

already_warned = None
if warn_copy_on_write():
already_warned = _AlreadyWarned()

return self.apply_with_block(
"putmask",
align_keys=align_keys,
mask=mask,
new=new,
using_cow=using_copy_on_write(),
already_warned=already_warned,
)

@final
Expand Down
58 changes: 56 additions & 2 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pandas._config import (
get_option,
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import (
Expand Down Expand Up @@ -136,6 +137,29 @@
_dtype_obj = np.dtype("object")


COW_WARNING_GENERAL_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
You are mutating a Series or DataFrame object, and currently this mutation will
also have effect on other Series or DataFrame objects that share data with this
object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
will never modify another.
"""


COW_WARNING_SETITEM_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
Currently, the mutation will also have effect on the object that shares data
with this object. For example, when setting a value in a Series that was
extracted from a column of a DataFrame, that DataFrame will also be updated:

ser = df["col"]
ser[0] = 0 <--- in pandas 2, this also updates `df`

In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never
modify another, and thus in the example above, `df` will not be changed.
"""


def maybe_split(meth: F) -> F:
"""
If we have a multi-column block, split and operate block-wise. Otherwise
Expand Down Expand Up @@ -1355,7 +1379,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block:
values[indexer] = casted
return self

def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
def putmask(
self, mask, new, using_cow: bool = False, already_warned=None
) -> list[Block]:
"""
putmask the data to the block; it is possible that we may create a
new dtype of block
Expand Down Expand Up @@ -1388,6 +1414,19 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
return [self.copy(deep=False)]
return [self]

if (
warn_copy_on_write()
and already_warned is not None
and not already_warned.warned_already
):
if self.refs.has_reference():
warnings.warn(
COW_WARNING_SETITEM_MSG,
FutureWarning,
stacklevel=find_stack_level(),
)
already_warned.warned_already = True

try:
casted = np_can_hold_element(values.dtype, new)

Expand Down Expand Up @@ -2020,7 +2059,9 @@ def where(
return [nb]

@final
def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
def putmask(
self, mask, new, using_cow: bool = False, already_warned=None
) -> list[Block]:
"""
See Block.putmask.__doc__
"""
Expand All @@ -2038,6 +2079,19 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
return [self.copy(deep=False)]
return [self]

if (
warn_copy_on_write()
and already_warned is not None
and not already_warned.warned_already
):
if self.refs.has_reference():
warnings.warn(
COW_WARNING_SETITEM_MSG,
FutureWarning,
stacklevel=find_stack_level(),
)
already_warned.warned_already = True

self = self._maybe_copy(using_cow, inplace=True)
values = self.values
if values.ndim == 2:
Expand Down
25 changes: 2 additions & 23 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@
interleaved_dtype,
)
from pandas.core.internals.blocks import (
COW_WARNING_GENERAL_MSG,
COW_WARNING_SETITEM_MSG,
Block,
NumpyBlock,
ensure_block_shape,
Expand Down Expand Up @@ -100,29 +102,6 @@
from pandas.api.extensions import ExtensionArray


COW_WARNING_GENERAL_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
You are mutating a Series or DataFrame object, and currently this mutation will
also have effect on other Series or DataFrame objects that share data with this
object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
will never modify another.
"""


COW_WARNING_SETITEM_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
Currently, the mutation will also have effect on the object that shares data
with this object. For example, when setting a value in a Series that was
extracted from a column of a DataFrame, that DataFrame will also be updated:

ser = df["col"]
ser[0] = 0 <--- in pandas 2, this also updates `df`

In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never
modify another, and thus in the example above, `df` will not be changed.
"""


class BaseBlockManager(DataManager):
"""
Core internal data structure to implement DataFrame, Series, etc.
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/copy_view/test_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,16 @@
from pandas.tests.copy_view.util import get_array


def test_clip_inplace_reference(using_copy_on_write):
def test_clip_inplace_reference(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1.5, 2, 3]})
df_copy = df.copy()
arr_a = get_array(df, "a")
view = df[:]
df.clip(lower=2, inplace=True)
if warn_copy_on_write:
with tm.assert_cow_warning():
df.clip(lower=2, inplace=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this ideally give a more specific warning about an inplace method?
Because right now this gives the general setting on a view warning

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure we can change the message

else:
df.clip(lower=2, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), arr_a)
Expand Down
19 changes: 5 additions & 14 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,10 +367,11 @@ def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write):

mask = subset > 3

# TODO(CoW-warn) should warn -> mask is a DataFrame, which ends up going through
# DataFrame._where(..., inplace=True)
if using_copy_on_write or warn_copy_on_write:
if using_copy_on_write:
subset[mask] = 0
elif warn_copy_on_write:
with tm.assert_cow_warning():
subset[mask] = 0
else:
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(SettingWithCopyWarning):
Expand Down Expand Up @@ -867,18 +868,8 @@ def test_series_subset_set_with_indexer(
and indexer.dtype.kind == "i"
):
warn = FutureWarning
is_mask = (
indexer_si is tm.setitem
and isinstance(indexer, np.ndarray)
and indexer.dtype.kind == "b"
)
if warn_copy_on_write:
# TODO(CoW-warn) should also warn for setting with mask
# -> Series.__setitem__ with boolean mask ends up using Series._set_values
# or Series._where depending on value being set
with tm.assert_cow_warning(
not is_mask, raise_on_extra_warnings=warn is not None
):
with tm.assert_cow_warning(raise_on_extra_warnings=warn is not None):
indexer_si(subset)[indexer] = 0
else:
with tm.assert_produces_warning(warn, match=msg):
Expand Down
25 changes: 18 additions & 7 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1407,11 +1407,12 @@ def test_items(using_copy_on_write, warn_copy_on_write):


@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_putmask(using_copy_on_write, dtype):
def test_putmask(using_copy_on_write, dtype, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype)
view = df[:]
df_orig = df.copy()
df[df == df] = 5
with tm.assert_cow_warning(warn_copy_on_write):
df[df == df] = 5

if using_copy_on_write:
assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
Expand Down Expand Up @@ -1445,15 +1446,21 @@ def test_putmask_aligns_rhs_no_reference(using_copy_on_write, dtype):
@pytest.mark.parametrize(
"val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)]
)
def test_putmask_dont_copy_some_blocks(using_copy_on_write, val, exp, warn):
def test_putmask_dont_copy_some_blocks(
using_copy_on_write, val, exp, warn, warn_copy_on_write
):
df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5})
view = df[:]
df_orig = df.copy()
indexer = DataFrame(
[[True, False, False], [True, False, False]], columns=list("abc")
)
with tm.assert_produces_warning(warn, match="incompatible dtype"):
df[indexer] = val
if warn_copy_on_write:
with tm.assert_cow_warning():
df[indexer] = val
else:
with tm.assert_produces_warning(warn, match="incompatible dtype"):
df[indexer] = val

if using_copy_on_write:
assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
Expand Down Expand Up @@ -1785,13 +1792,17 @@ def test_update_frame(using_copy_on_write, warn_copy_on_write):
tm.assert_frame_equal(view, expected)


def test_update_series(using_copy_on_write):
def test_update_series(using_copy_on_write, warn_copy_on_write):
ser1 = Series([1.0, 2.0, 3.0])
ser2 = Series([100.0], index=[1])
ser1_orig = ser1.copy()
view = ser1[:]

ser1.update(ser2)
if warn_copy_on_write:
with tm.assert_cow_warning():
ser1.update(ser2)
else:
ser1.update(ser2)

expected = Series([1.0, 100.0, 3.0])
tm.assert_series_equal(ser1, expected)
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,14 +279,18 @@ def test_replace_categorical(using_copy_on_write, val):


@pytest.mark.parametrize("method", ["where", "mask"])
def test_masking_inplace(using_copy_on_write, method):
def test_masking_inplace(using_copy_on_write, method, warn_copy_on_write):
df = DataFrame({"a": [1.5, 2, 3]})
df_orig = df.copy()
arr_a = get_array(df, "a")
view = df[:]

method = getattr(df, method)
method(df["a"] > 1.6, -1, inplace=True)
if warn_copy_on_write:
with tm.assert_cow_warning():
method(df["a"] > 1.6, -1, inplace=True)
else:
method(df["a"] > 1.6, -1, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), arr_a)
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,7 +717,7 @@ def test_replace_value_is_none(self, datetime_frame):
datetime_frame.iloc[0, 0] = orig_value
datetime_frame.iloc[1, 0] = orig2

def test_replace_for_new_dtypes(self, datetime_frame):
def test_replace_for_new_dtypes(self, datetime_frame, warn_copy_on_write):
# dtypes
tsframe = datetime_frame.copy().astype(np.float32)
tsframe.loc[tsframe.index[:5], "A"] = np.nan
Expand All @@ -732,7 +732,11 @@ def test_replace_for_new_dtypes(self, datetime_frame):
tsframe.loc[tsframe.index[:5], "B"] = -1e8

b = tsframe["B"]
b[b == -1e8] = np.nan
if warn_copy_on_write:
with tm.assert_cow_warning():
b[b == -1e8] = np.nan
else:
b[b == -1e8] = np.nan
tsframe["B"] = b
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
Expand Down
16 changes: 6 additions & 10 deletions pandas/tests/indexing/test_chaining_and_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,8 @@ def test_setitem_chained_setfault(self, using_copy_on_write, warn_copy_on_write)
df.response[mask] = "none"
tm.assert_frame_equal(df, DataFrame({"response": data}))
else:
# TODO(CoW-warn) should warn
# with tm.assert_cow_warning(warn_copy_on_write):
df.response[mask] = "none"
with tm.assert_cow_warning(warn_copy_on_write):
df.response[mask] = "none"
tm.assert_frame_equal(df, DataFrame({"response": mdata}))

recarray = np.rec.fromarrays([data], names=["response"])
Expand All @@ -177,9 +176,8 @@ def test_setitem_chained_setfault(self, using_copy_on_write, warn_copy_on_write)
df.response[mask] = "none"
tm.assert_frame_equal(df, DataFrame({"response": data}))
else:
# TODO(CoW-warn) should warn
# with tm.assert_cow_warning(warn_copy_on_write):
df.response[mask] = "none"
with tm.assert_cow_warning(warn_copy_on_write):
df.response[mask] = "none"
tm.assert_frame_equal(df, DataFrame({"response": mdata}))

df = DataFrame({"response": data, "response1": data})
Expand All @@ -190,9 +188,8 @@ def test_setitem_chained_setfault(self, using_copy_on_write, warn_copy_on_write)
df.response[mask] = "none"
tm.assert_frame_equal(df, df_original)
else:
# TODO(CoW-warn) should warn
# with tm.assert_cow_warning(warn_copy_on_write):
df.response[mask] = "none"
with tm.assert_cow_warning(warn_copy_on_write):
df.response[mask] = "none"
tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data}))

# GH 6056
Expand All @@ -203,7 +200,6 @@ def test_setitem_chained_setfault(self, using_copy_on_write, warn_copy_on_write)
df["A"].iloc[0] = np.nan
expected = DataFrame({"A": ["foo", "bar", "bah", "foo", "bar"]})
else:
# TODO(CoW-warn) custom warning message
with tm.assert_cow_warning(warn_copy_on_write):
df["A"].iloc[0] = np.nan
expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]})
Expand Down