Skip to content

Commit ae76ad4

Browse files
CoW: Warn for cases that go through putmask (#56168)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 5165102 commit ae76ad4

File tree

11 files changed

+120
-60
lines changed

11 files changed

+120
-60
lines changed

pandas/core/generic.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -10578,6 +10578,7 @@ def _where(
1057810578
inplace: bool_t = False,
1057910579
axis: Axis | None = None,
1058010580
level=None,
10581+
warn: bool_t = True,
1058110582
):
1058210583
"""
1058310584
Equivalent to public method `where`, except that `other` is not
@@ -10708,7 +10709,7 @@ def _where(
1070810709
# we may have different type blocks come out of putmask, so
1070910710
# reconstruct the block manager
1071010711

10711-
new_data = self._mgr.putmask(mask=cond, new=other, align=align)
10712+
new_data = self._mgr.putmask(mask=cond, new=other, align=align, warn=warn)
1071210713
result = self._constructor_from_mgr(new_data, axes=new_data.axes)
1071310714
return self._update_inplace(result)
1071410715

pandas/core/internals/base.py

+22-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414

1515
import numpy as np
1616

17-
from pandas._config import using_copy_on_write
17+
from pandas._config import (
18+
using_copy_on_write,
19+
warn_copy_on_write,
20+
)
1821

1922
from pandas._libs import (
2023
algos as libalgos,
@@ -49,6 +52,16 @@
4952
)
5053

5154

55+
class _AlreadyWarned:
56+
def __init__(self):
57+
# An instance of this class is passed from the manager level to the block level to
58+
# ensure that we warn only once. The block method can update the
59+
# warned_already attribute in place, without returning a value, to keep the
60+
# interface consistent. This is only a temporary solution for
61+
# CoW warnings.
62+
self.warned_already = False
63+
64+
5265
class DataManager(PandasObject):
5366
# TODO share more methods/attributes
5467

@@ -196,19 +209,26 @@ def where(self, other, cond, align: bool) -> Self:
196209
)
197210

198211
@final
199-
def putmask(self, mask, new, align: bool = True) -> Self:
212+
def putmask(self, mask, new, align: bool = True, warn: bool = True) -> Self:
200213
if align:
201214
align_keys = ["new", "mask"]
202215
else:
203216
align_keys = ["mask"]
204217
new = extract_array(new, extract_numpy=True)
205218

219+
already_warned = None
220+
if warn_copy_on_write():
221+
already_warned = _AlreadyWarned()
222+
if not warn:
223+
already_warned.warned_already = True
224+
206225
return self.apply_with_block(
207226
"putmask",
208227
align_keys=align_keys,
209228
mask=mask,
210229
new=new,
211230
using_cow=using_copy_on_write(),
231+
already_warned=already_warned,
212232
)
213233

214234
@final

pandas/core/internals/blocks.py

+56-2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from pandas._config import (
1919
get_option,
2020
using_copy_on_write,
21+
warn_copy_on_write,
2122
)
2223

2324
from pandas._libs import (
@@ -136,6 +137,29 @@
136137
_dtype_obj = np.dtype("object")
137138

138139

140+
COW_WARNING_GENERAL_MSG = """\
141+
Setting a value on a view: behaviour will change in pandas 3.0.
142+
You are mutating a Series or DataFrame object, and currently this mutation will
143+
also have effect on other Series or DataFrame objects that share data with this
144+
object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
145+
will never modify another.
146+
"""
147+
148+
149+
COW_WARNING_SETITEM_MSG = """\
150+
Setting a value on a view: behaviour will change in pandas 3.0.
151+
Currently, the mutation will also have effect on the object that shares data
152+
with this object. For example, when setting a value in a Series that was
153+
extracted from a column of a DataFrame, that DataFrame will also be updated:
154+
155+
ser = df["col"]
156+
ser[0] = 0 <--- in pandas 2, this also updates `df`
157+
158+
In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never
159+
modify another, and thus in the example above, `df` will not be changed.
160+
"""
161+
162+
139163
def maybe_split(meth: F) -> F:
140164
"""
141165
If we have a multi-column block, split and operate block-wise. Otherwise
@@ -1355,7 +1379,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block:
13551379
values[indexer] = casted
13561380
return self
13571381

1358-
def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
1382+
def putmask(
1383+
self, mask, new, using_cow: bool = False, already_warned=None
1384+
) -> list[Block]:
13591385
"""
13601386
putmask the data to the block; it is possible that we may create a
13611387
new dtype of block
@@ -1388,6 +1414,19 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
13881414
return [self.copy(deep=False)]
13891415
return [self]
13901416

1417+
if (
1418+
warn_copy_on_write()
1419+
and already_warned is not None
1420+
and not already_warned.warned_already
1421+
):
1422+
if self.refs.has_reference():
1423+
warnings.warn(
1424+
COW_WARNING_GENERAL_MSG,
1425+
FutureWarning,
1426+
stacklevel=find_stack_level(),
1427+
)
1428+
already_warned.warned_already = True
1429+
13911430
try:
13921431
casted = np_can_hold_element(values.dtype, new)
13931432

@@ -2020,7 +2059,9 @@ def where(
20202059
return [nb]
20212060

20222061
@final
2023-
def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
2062+
def putmask(
2063+
self, mask, new, using_cow: bool = False, already_warned=None
2064+
) -> list[Block]:
20242065
"""
20252066
See Block.putmask.__doc__
20262067
"""
@@ -2038,6 +2079,19 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
20382079
return [self.copy(deep=False)]
20392080
return [self]
20402081

2082+
if (
2083+
warn_copy_on_write()
2084+
and already_warned is not None
2085+
and not already_warned.warned_already
2086+
):
2087+
if self.refs.has_reference():
2088+
warnings.warn(
2089+
COW_WARNING_GENERAL_MSG,
2090+
FutureWarning,
2091+
stacklevel=find_stack_level(),
2092+
)
2093+
already_warned.warned_already = True
2094+
20412095
self = self._maybe_copy(using_cow, inplace=True)
20422096
values = self.values
20432097
if values.ndim == 2:

pandas/core/internals/managers.py

+2-23
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@
7272
interleaved_dtype,
7373
)
7474
from pandas.core.internals.blocks import (
75+
COW_WARNING_GENERAL_MSG,
76+
COW_WARNING_SETITEM_MSG,
7577
Block,
7678
NumpyBlock,
7779
ensure_block_shape,
@@ -100,29 +102,6 @@
100102
from pandas.api.extensions import ExtensionArray
101103

102104

103-
COW_WARNING_GENERAL_MSG = """\
104-
Setting a value on a view: behaviour will change in pandas 3.0.
105-
You are mutating a Series or DataFrame object, and currently this mutation will
106-
also have effect on other Series or DataFrame objects that share data with this
107-
object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
108-
will never modify another.
109-
"""
110-
111-
112-
COW_WARNING_SETITEM_MSG = """\
113-
Setting a value on a view: behaviour will change in pandas 3.0.
114-
Currently, the mutation will also have effect on the object that shares data
115-
with this object. For example, when setting a value in a Series that was
116-
extracted from a column of a DataFrame, that DataFrame will also be updated:
117-
118-
ser = df["col"]
119-
ser[0] = 0 <--- in pandas 2, this also updates `df`
120-
121-
In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never
122-
modify another, and thus in the example above, `df` will not be changed.
123-
"""
124-
125-
126105
class BaseBlockManager(DataManager):
127106
"""
128107
Core internal data structure to implement DataFrame, Series, etc.

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1320,7 +1320,7 @@ def __setitem__(self, key, value) -> None:
13201320
# otherwise with listlike other we interpret series[mask] = other
13211321
# as series[mask] = other[mask]
13221322
try:
1323-
self._where(~key, value, inplace=True)
1323+
self._where(~key, value, inplace=True, warn=warn)
13241324
except InvalidIndexError:
13251325
# test_where_dups
13261326
self.iloc[key] = value

pandas/tests/copy_view/test_chained_assignment_deprecation.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write):
7676
@pytest.mark.parametrize(
7777
"indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
7878
)
79-
def test_series_setitem(indexer, using_copy_on_write):
79+
def test_series_setitem(indexer, using_copy_on_write, warn_copy_on_write):
8080
# ensure we only get a single warning for those typical cases of chained
8181
# assignment
8282
df = DataFrame({"a": [1, 2, 3], "b": 1})

pandas/tests/copy_view/test_clip.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,16 @@
88
from pandas.tests.copy_view.util import get_array
99

1010

11-
def test_clip_inplace_reference(using_copy_on_write):
11+
def test_clip_inplace_reference(using_copy_on_write, warn_copy_on_write):
1212
df = DataFrame({"a": [1.5, 2, 3]})
1313
df_copy = df.copy()
1414
arr_a = get_array(df, "a")
1515
view = df[:]
16-
df.clip(lower=2, inplace=True)
16+
if warn_copy_on_write:
17+
with tm.assert_cow_warning():
18+
df.clip(lower=2, inplace=True)
19+
else:
20+
df.clip(lower=2, inplace=True)
1721

1822
if using_copy_on_write:
1923
assert not np.shares_memory(get_array(df, "a"), arr_a)

pandas/tests/copy_view/test_indexing.py

+5-14
Original file line numberDiff line numberDiff line change
@@ -367,10 +367,11 @@ def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write):
367367

368368
mask = subset > 3
369369

370-
# TODO(CoW-warn) should warn -> mask is a DataFrame, which ends up going through
371-
# DataFrame._where(..., inplace=True)
372-
if using_copy_on_write or warn_copy_on_write:
370+
if using_copy_on_write:
373371
subset[mask] = 0
372+
elif warn_copy_on_write:
373+
with tm.assert_cow_warning():
374+
subset[mask] = 0
374375
else:
375376
with pd.option_context("chained_assignment", "warn"):
376377
with tm.assert_produces_warning(SettingWithCopyWarning):
@@ -867,18 +868,8 @@ def test_series_subset_set_with_indexer(
867868
and indexer.dtype.kind == "i"
868869
):
869870
warn = FutureWarning
870-
is_mask = (
871-
indexer_si is tm.setitem
872-
and isinstance(indexer, np.ndarray)
873-
and indexer.dtype.kind == "b"
874-
)
875871
if warn_copy_on_write:
876-
# TODO(CoW-warn) should also warn for setting with mask
877-
# -> Series.__setitem__ with boolean mask ends up using Series._set_values
878-
# or Series._where depending on value being set
879-
with tm.assert_cow_warning(
880-
not is_mask, raise_on_extra_warnings=warn is not None
881-
):
872+
with tm.assert_cow_warning(raise_on_extra_warnings=warn is not None):
882873
indexer_si(subset)[indexer] = 0
883874
else:
884875
with tm.assert_produces_warning(warn, match=msg):

pandas/tests/copy_view/test_methods.py

+18-7
Original file line numberDiff line numberDiff line change
@@ -1407,11 +1407,12 @@ def test_items(using_copy_on_write, warn_copy_on_write):
14071407

14081408

14091409
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
1410-
def test_putmask(using_copy_on_write, dtype):
1410+
def test_putmask(using_copy_on_write, dtype, warn_copy_on_write):
14111411
df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype)
14121412
view = df[:]
14131413
df_orig = df.copy()
1414-
df[df == df] = 5
1414+
with tm.assert_cow_warning(warn_copy_on_write):
1415+
df[df == df] = 5
14151416

14161417
if using_copy_on_write:
14171418
assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
@@ -1445,15 +1446,21 @@ def test_putmask_aligns_rhs_no_reference(using_copy_on_write, dtype):
14451446
@pytest.mark.parametrize(
14461447
"val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)]
14471448
)
1448-
def test_putmask_dont_copy_some_blocks(using_copy_on_write, val, exp, warn):
1449+
def test_putmask_dont_copy_some_blocks(
1450+
using_copy_on_write, val, exp, warn, warn_copy_on_write
1451+
):
14491452
df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5})
14501453
view = df[:]
14511454
df_orig = df.copy()
14521455
indexer = DataFrame(
14531456
[[True, False, False], [True, False, False]], columns=list("abc")
14541457
)
1455-
with tm.assert_produces_warning(warn, match="incompatible dtype"):
1456-
df[indexer] = val
1458+
if warn_copy_on_write:
1459+
with tm.assert_cow_warning():
1460+
df[indexer] = val
1461+
else:
1462+
with tm.assert_produces_warning(warn, match="incompatible dtype"):
1463+
df[indexer] = val
14571464

14581465
if using_copy_on_write:
14591466
assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
@@ -1796,13 +1803,17 @@ def test_update_frame(using_copy_on_write, warn_copy_on_write):
17961803
tm.assert_frame_equal(view, expected)
17971804

17981805

1799-
def test_update_series(using_copy_on_write):
1806+
def test_update_series(using_copy_on_write, warn_copy_on_write):
18001807
ser1 = Series([1.0, 2.0, 3.0])
18011808
ser2 = Series([100.0], index=[1])
18021809
ser1_orig = ser1.copy()
18031810
view = ser1[:]
18041811

1805-
ser1.update(ser2)
1812+
if warn_copy_on_write:
1813+
with tm.assert_cow_warning():
1814+
ser1.update(ser2)
1815+
else:
1816+
ser1.update(ser2)
18061817

18071818
expected = Series([1.0, 100.0, 3.0])
18081819
tm.assert_series_equal(ser1, expected)

pandas/tests/copy_view/test_replace.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -279,14 +279,18 @@ def test_replace_categorical(using_copy_on_write, val):
279279

280280

281281
@pytest.mark.parametrize("method", ["where", "mask"])
282-
def test_masking_inplace(using_copy_on_write, method):
282+
def test_masking_inplace(using_copy_on_write, method, warn_copy_on_write):
283283
df = DataFrame({"a": [1.5, 2, 3]})
284284
df_orig = df.copy()
285285
arr_a = get_array(df, "a")
286286
view = df[:]
287287

288288
method = getattr(df, method)
289-
method(df["a"] > 1.6, -1, inplace=True)
289+
if warn_copy_on_write:
290+
with tm.assert_cow_warning():
291+
method(df["a"] > 1.6, -1, inplace=True)
292+
else:
293+
method(df["a"] > 1.6, -1, inplace=True)
290294

291295
if using_copy_on_write:
292296
assert not np.shares_memory(get_array(df, "a"), arr_a)

pandas/tests/frame/methods/test_replace.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -729,11 +729,7 @@ def test_replace_for_new_dtypes(self, datetime_frame):
729729

730730
tsframe.loc[tsframe.index[:5], "A"] = np.nan
731731
tsframe.loc[tsframe.index[-5:], "A"] = np.nan
732-
tsframe.loc[tsframe.index[:5], "B"] = -1e8
733-
734-
b = tsframe["B"]
735-
b[b == -1e8] = np.nan
736-
tsframe["B"] = b
732+
tsframe.loc[tsframe.index[:5], "B"] = np.nan
737733
msg = "DataFrame.fillna with 'method' is deprecated"
738734
with tm.assert_produces_warning(FutureWarning, match=msg):
739735
# TODO: what is this even testing?

0 commit comments

Comments
 (0)