Skip to content

CoW: Warn for cases that go through putmask #56168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10496,6 +10496,7 @@ def _where(
inplace: bool_t = False,
axis: Axis | None = None,
level=None,
warn: bool_t = True,
):
"""
Equivalent to public method `where`, except that `other` is not
Expand Down Expand Up @@ -10626,7 +10627,7 @@ def _where(
# we may have different type blocks come out of putmask, so
# reconstruct the block manager

new_data = self._mgr.putmask(mask=cond, new=other, align=align)
new_data = self._mgr.putmask(mask=cond, new=other, align=align, warn=warn)
result = self._constructor_from_mgr(new_data, axes=new_data.axes)
return self._update_inplace(result)

Expand Down
24 changes: 22 additions & 2 deletions pandas/core/internals/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import (
algos as libalgos,
Expand Down Expand Up @@ -49,6 +52,16 @@
)


class _AlreadyWarned:
def __init__(self):
# This class is used on the manager level to the block level to
# ensure that we warn only once. The block method can update the
# warned_already option without returning a value to keep the
# interface consistent. This is only a temporary solution for
# CoW warnings.
self.warned_already = False


class DataManager(PandasObject):
# TODO share more methods/attributes

Expand Down Expand Up @@ -196,19 +209,26 @@ def where(self, other, cond, align: bool) -> Self:
)

@final
def putmask(self, mask, new, align: bool = True) -> Self:
def putmask(self, mask, new, align: bool = True, warn: bool = True) -> Self:
if align:
align_keys = ["new", "mask"]
else:
align_keys = ["mask"]
new = extract_array(new, extract_numpy=True)

already_warned = None
if warn_copy_on_write():
already_warned = _AlreadyWarned()
if not warn:
already_warned.warned_already = True

return self.apply_with_block(
"putmask",
align_keys=align_keys,
mask=mask,
new=new,
using_cow=using_copy_on_write(),
already_warned=already_warned,
)

@final
Expand Down
58 changes: 56 additions & 2 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pandas._config import (
get_option,
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import (
Expand Down Expand Up @@ -136,6 +137,29 @@
_dtype_obj = np.dtype("object")


COW_WARNING_GENERAL_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
You are mutating a Series or DataFrame object, and currently this mutation will
also have effect on other Series or DataFrame objects that share data with this
object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
will never modify another.
"""


COW_WARNING_SETITEM_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
Currently, the mutation will also have effect on the object that shares data
with this object. For example, when setting a value in a Series that was
extracted from a column of a DataFrame, that DataFrame will also be updated:

ser = df["col"]
ser[0] = 0 <--- in pandas 2, this also updates `df`

In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never
modify another, and thus in the example above, `df` will not be changed.
"""


def maybe_split(meth: F) -> F:
"""
If we have a multi-column block, split and operate block-wise. Otherwise
Expand Down Expand Up @@ -1355,7 +1379,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block:
values[indexer] = casted
return self

def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
def putmask(
self, mask, new, using_cow: bool = False, already_warned=None
) -> list[Block]:
"""
putmask the data to the block; it is possible that we may create a
new dtype of block
Expand Down Expand Up @@ -1388,6 +1414,19 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
return [self.copy(deep=False)]
return [self]

if (
warn_copy_on_write()
and already_warned is not None
and not already_warned.warned_already
):
if self.refs.has_reference():
warnings.warn(
COW_WARNING_GENERAL_MSG,
FutureWarning,
stacklevel=find_stack_level(),
)
already_warned.warned_already = True

try:
casted = np_can_hold_element(values.dtype, new)

Expand Down Expand Up @@ -2020,7 +2059,9 @@ def where(
return [nb]

@final
def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
def putmask(
self, mask, new, using_cow: bool = False, already_warned=None
) -> list[Block]:
"""
See Block.putmask.__doc__
"""
Expand All @@ -2038,6 +2079,19 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
return [self.copy(deep=False)]
return [self]

if (
warn_copy_on_write()
and already_warned is not None
and not already_warned.warned_already
):
if self.refs.has_reference():
warnings.warn(
COW_WARNING_GENERAL_MSG,
FutureWarning,
stacklevel=find_stack_level(),
)
already_warned.warned_already = True

self = self._maybe_copy(using_cow, inplace=True)
values = self.values
if values.ndim == 2:
Expand Down
25 changes: 2 additions & 23 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@
interleaved_dtype,
)
from pandas.core.internals.blocks import (
COW_WARNING_GENERAL_MSG,
COW_WARNING_SETITEM_MSG,
Block,
NumpyBlock,
ensure_block_shape,
Expand Down Expand Up @@ -100,29 +102,6 @@
from pandas.api.extensions import ExtensionArray


COW_WARNING_GENERAL_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
You are mutating a Series or DataFrame object, and currently this mutation will
also have effect on other Series or DataFrame objects that share data with this
object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
will never modify another.
"""


COW_WARNING_SETITEM_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
Currently, the mutation will also have effect on the object that shares data
with this object. For example, when setting a value in a Series that was
extracted from a column of a DataFrame, that DataFrame will also be updated:

ser = df["col"]
ser[0] = 0 <--- in pandas 2, this also updates `df`

In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never
modify another, and thus in the example above, `df` will not be changed.
"""


class BaseBlockManager(DataManager):
"""
Core internal data structure to implement DataFrame, Series, etc.
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1320,7 +1320,7 @@ def __setitem__(self, key, value) -> None:
# otherwise with listlike other we interpret series[mask] = other
# as series[mask] = other[mask]
try:
self._where(~key, value, inplace=True)
self._where(~key, value, inplace=True, warn=warn)
except InvalidIndexError:
# test_where_dups
self.iloc[key] = value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write):
@pytest.mark.parametrize(
"indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
)
def test_series_setitem(indexer, using_copy_on_write):
def test_series_setitem(indexer, using_copy_on_write, warn_copy_on_write):
# ensure we only get a single warning for those typical cases of chained
# assignment
df = DataFrame({"a": [1, 2, 3], "b": 1})
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/copy_view/test_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,16 @@
from pandas.tests.copy_view.util import get_array


def test_clip_inplace_reference(using_copy_on_write):
def test_clip_inplace_reference(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1.5, 2, 3]})
df_copy = df.copy()
arr_a = get_array(df, "a")
view = df[:]
df.clip(lower=2, inplace=True)
if warn_copy_on_write:
with tm.assert_cow_warning():
df.clip(lower=2, inplace=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this ideally give a more specific warning about an inplace method?
Because right now this gives the general setting on a view warning

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure we can change the message

else:
df.clip(lower=2, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), arr_a)
Expand Down
19 changes: 5 additions & 14 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,10 +367,11 @@ def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write):

mask = subset > 3

# TODO(CoW-warn) should warn -> mask is a DataFrame, which ends up going through
# DataFrame._where(..., inplace=True)
if using_copy_on_write or warn_copy_on_write:
if using_copy_on_write:
subset[mask] = 0
elif warn_copy_on_write:
with tm.assert_cow_warning():
subset[mask] = 0
else:
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(SettingWithCopyWarning):
Expand Down Expand Up @@ -867,18 +868,8 @@ def test_series_subset_set_with_indexer(
and indexer.dtype.kind == "i"
):
warn = FutureWarning
is_mask = (
indexer_si is tm.setitem
and isinstance(indexer, np.ndarray)
and indexer.dtype.kind == "b"
)
if warn_copy_on_write:
# TODO(CoW-warn) should also warn for setting with mask
# -> Series.__setitem__ with boolean mask ends up using Series._set_values
# or Series._where depending on value being set
with tm.assert_cow_warning(
not is_mask, raise_on_extra_warnings=warn is not None
):
with tm.assert_cow_warning(raise_on_extra_warnings=warn is not None):
indexer_si(subset)[indexer] = 0
else:
with tm.assert_produces_warning(warn, match=msg):
Expand Down
25 changes: 18 additions & 7 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1407,11 +1407,12 @@ def test_items(using_copy_on_write, warn_copy_on_write):


@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_putmask(using_copy_on_write, dtype):
def test_putmask(using_copy_on_write, dtype, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype)
view = df[:]
df_orig = df.copy()
df[df == df] = 5
with tm.assert_cow_warning(warn_copy_on_write):
df[df == df] = 5

if using_copy_on_write:
assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
Expand Down Expand Up @@ -1445,15 +1446,21 @@ def test_putmask_aligns_rhs_no_reference(using_copy_on_write, dtype):
@pytest.mark.parametrize(
"val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)]
)
def test_putmask_dont_copy_some_blocks(using_copy_on_write, val, exp, warn):
def test_putmask_dont_copy_some_blocks(
using_copy_on_write, val, exp, warn, warn_copy_on_write
):
df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5})
view = df[:]
df_orig = df.copy()
indexer = DataFrame(
[[True, False, False], [True, False, False]], columns=list("abc")
)
with tm.assert_produces_warning(warn, match="incompatible dtype"):
df[indexer] = val
if warn_copy_on_write:
with tm.assert_cow_warning():
df[indexer] = val
else:
with tm.assert_produces_warning(warn, match="incompatible dtype"):
df[indexer] = val

if using_copy_on_write:
assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
Expand Down Expand Up @@ -1796,13 +1803,17 @@ def test_update_frame(using_copy_on_write, warn_copy_on_write):
tm.assert_frame_equal(view, expected)


def test_update_series(using_copy_on_write):
def test_update_series(using_copy_on_write, warn_copy_on_write):
ser1 = Series([1.0, 2.0, 3.0])
ser2 = Series([100.0], index=[1])
ser1_orig = ser1.copy()
view = ser1[:]

ser1.update(ser2)
if warn_copy_on_write:
with tm.assert_cow_warning():
ser1.update(ser2)
else:
ser1.update(ser2)

expected = Series([1.0, 100.0, 3.0])
tm.assert_series_equal(ser1, expected)
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,14 +279,18 @@ def test_replace_categorical(using_copy_on_write, val):


@pytest.mark.parametrize("method", ["where", "mask"])
def test_masking_inplace(using_copy_on_write, method):
def test_masking_inplace(using_copy_on_write, method, warn_copy_on_write):
df = DataFrame({"a": [1.5, 2, 3]})
df_orig = df.copy()
arr_a = get_array(df, "a")
view = df[:]

method = getattr(df, method)
method(df["a"] > 1.6, -1, inplace=True)
if warn_copy_on_write:
with tm.assert_cow_warning():
method(df["a"] > 1.6, -1, inplace=True)
else:
method(df["a"] > 1.6, -1, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), arr_a)
Expand Down
6 changes: 1 addition & 5 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,11 +729,7 @@ def test_replace_for_new_dtypes(self, datetime_frame):

tsframe.loc[tsframe.index[:5], "A"] = np.nan
tsframe.loc[tsframe.index[-5:], "A"] = np.nan
tsframe.loc[tsframe.index[:5], "B"] = -1e8

b = tsframe["B"]
b[b == -1e8] = np.nan
tsframe["B"] = b
tsframe.loc[tsframe.index[:5], "B"] = np.nan
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
# TODO: what is this even testing?
Expand Down