Skip to content

Commit 3a6fd1e

Browse files
Backport PR pandas-dev#50918 on branch 2.0.x (ENH: Optimize replace to avoid copying when not necessary) (pandas-dev#51652)
Backport PR pandas-dev#50918: ENH: Optimize replace to avoid copying when not necessary

Co-authored-by: Patrick Hoefler <[email protected]>
1 parent 8906d4a commit 3a6fd1e

File tree

4 files changed

+205
-55
lines changed

4 files changed

+205
-55
lines changed

pandas/core/internals/blocks.py

+30 -6
Original file line number | Diff line number | Diff line change
@@ -552,6 +552,7 @@ def replace(
552552
inplace: bool = False,
553553
# mask may be pre-computed if we're called from replace_list
554554
mask: npt.NDArray[np.bool_] | None = None,
555+
using_cow: bool = False,
555556
) -> list[Block]:
556557
"""
557558
replace the to_replace value with value, possible to create new
@@ -566,7 +567,12 @@ def replace(
566567
if isinstance(values, Categorical):
567568
# TODO: avoid special-casing
568569
# GH49404
569-
blk = self if inplace else self.copy()
570+
if using_cow and (self.refs.has_reference() or not inplace):
571+
blk = self.copy()
572+
elif using_cow:
573+
blk = self.copy(deep=False)
574+
else:
575+
blk = self if inplace else self.copy()
570576
values = cast(Categorical, blk.values)
571577
values._replace(to_replace=to_replace, value=value, inplace=True)
572578
return [blk]
@@ -576,22 +582,36 @@ def replace(
576582
# replacing it is a no-op.
577583
# Note: If to_replace were a list, NDFrame.replace would call
578584
# replace_list instead of replace.
579-
return [self] if inplace else [self.copy()]
585+
if using_cow:
586+
return [self.copy(deep=False)]
587+
else:
588+
return [self] if inplace else [self.copy()]
580589

581590
if mask is None:
582591
mask = missing.mask_missing(values, to_replace)
583592
if not mask.any():
584593
# Note: we get here with test_replace_extension_other incorrectly
585594
# bc _can_hold_element is incorrect.
586-
return [self] if inplace else [self.copy()]
595+
if using_cow:
596+
return [self.copy(deep=False)]
597+
else:
598+
return [self] if inplace else [self.copy()]
587599

588600
elif self._can_hold_element(value):
589-
blk = self if inplace else self.copy()
601+
# TODO(CoW): Maybe split here as well into columns where mask has True
602+
# and rest?
603+
if using_cow:
604+
if inplace:
605+
blk = self.copy(deep=self.refs.has_reference())
606+
else:
607+
blk = self.copy()
608+
else:
609+
blk = self if inplace else self.copy()
590610
putmask_inplace(blk.values, mask, value)
591611
if not (self.is_object and value is None):
592612
# if the user *explicitly* gave None, we keep None, otherwise
593613
# may downcast to NaN
594-
blocks = blk.convert(copy=False)
614+
blocks = blk.convert(copy=False, using_cow=using_cow)
595615
else:
596616
blocks = [blk]
597617
return blocks
@@ -619,6 +639,7 @@ def replace(
619639
value=value,
620640
inplace=True,
621641
mask=mask[i : i + 1],
642+
using_cow=using_cow,
622643
)
623644
)
624645
return blocks
@@ -797,7 +818,10 @@ def _replace_coerce(
797818
return [nb]
798819
return [self] if inplace else [self.copy()]
799820
return self.replace(
800-
to_replace=to_replace, value=value, inplace=inplace, mask=mask
821+
to_replace=to_replace,
822+
value=value,
823+
inplace=inplace,
824+
mask=mask,
801825
)
802826

803827
# ---------------------------------------------------------------------

pandas/core/internals/managers.py

+5 -1
Original file line number | Diff line number | Diff line change
@@ -459,7 +459,11 @@ def replace(self: T, to_replace, value, inplace: bool) -> T:
459459
assert not is_list_like(to_replace)
460460
assert not is_list_like(value)
461461
return self.apply(
462-
"replace", to_replace=to_replace, value=value, inplace=inplace
462+
"replace",
463+
to_replace=to_replace,
464+
value=value,
465+
inplace=inplace,
466+
using_cow=using_copy_on_write(),
463467
)
464468

465469
def replace_regex(self, **kwargs):

pandas/tests/copy_view/test_methods.py

-38
Original file line number | Diff line number | Diff line change
@@ -1210,44 +1210,6 @@ def test_items(using_copy_on_write):
12101210
assert df.loc[0, name] == 0
12111211

12121212

1213-
@pytest.mark.parametrize(
1214-
"replace_kwargs",
1215-
[
1216-
{"to_replace": {"a": 1, "b": 4}, "value": -1},
1217-
# Test CoW splits blocks to avoid copying unchanged columns
1218-
{"to_replace": {"a": 1}, "value": -1},
1219-
{"to_replace": {"b": 4}, "value": -1},
1220-
{"to_replace": {"b": {4: 1}}},
1221-
# TODO: Add these in a further optimization
1222-
# We would need to see which columns got replaced in the mask
1223-
# which could be expensive
1224-
# {"to_replace": {"b": 1}},
1225-
# 1
1226-
],
1227-
)
1228-
def test_replace(using_copy_on_write, replace_kwargs):
1229-
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
1230-
df_orig = df.copy()
1231-
1232-
df_replaced = df.replace(**replace_kwargs)
1233-
1234-
if using_copy_on_write:
1235-
if (df_replaced["b"] == df["b"]).all():
1236-
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
1237-
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
1238-
1239-
# mutating squeezed df triggers a copy-on-write for that column/block
1240-
df_replaced.loc[0, "c"] = -1
1241-
if using_copy_on_write:
1242-
assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
1243-
1244-
if "a" in replace_kwargs["to_replace"]:
1245-
arr = get_array(df_replaced, "a")
1246-
df_replaced.loc[0, "a"] = 100
1247-
assert np.shares_memory(get_array(df_replaced, "a"), arr)
1248-
tm.assert_frame_equal(df, df_orig)
1249-
1250-
12511213
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
12521214
def test_putmask(using_copy_on_write, dtype):
12531215
df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype)

pandas/tests/copy_view/test_replace.py

+170 -10
Original file line number | Diff line number | Diff line change
@@ -9,34 +9,194 @@
99
from pandas.tests.copy_view.util import get_array
1010

1111

12-
def test_replace_categorical_inplace_reference(using_copy_on_write):
13-
df = DataFrame({"a": Categorical([1, 2, 3])})
12+
@pytest.mark.parametrize(
13+
"replace_kwargs",
14+
[
15+
{"to_replace": {"a": 1, "b": 4}, "value": -1},
16+
# Test CoW splits blocks to avoid copying unchanged columns
17+
{"to_replace": {"a": 1}, "value": -1},
18+
{"to_replace": {"b": 4}, "value": -1},
19+
{"to_replace": {"b": {4: 1}}},
20+
# TODO: Add these in a further optimization
21+
# We would need to see which columns got replaced in the mask
22+
# which could be expensive
23+
# {"to_replace": {"b": 1}},
24+
# 1
25+
],
26+
)
27+
def test_replace(using_copy_on_write, replace_kwargs):
28+
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
29+
df_orig = df.copy()
30+
31+
df_replaced = df.replace(**replace_kwargs)
32+
33+
if using_copy_on_write:
34+
if (df_replaced["b"] == df["b"]).all():
35+
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
36+
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
37+
38+
# mutating squeezed df triggers a copy-on-write for that column/block
39+
df_replaced.loc[0, "c"] = -1
40+
if using_copy_on_write:
41+
assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
42+
43+
if "a" in replace_kwargs["to_replace"]:
44+
arr = get_array(df_replaced, "a")
45+
df_replaced.loc[0, "a"] = 100
46+
assert np.shares_memory(get_array(df_replaced, "a"), arr)
47+
tm.assert_frame_equal(df, df_orig)
48+
49+
50+
def test_replace_mask_all_false_second_block(using_copy_on_write):
51+
df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
52+
df_orig = df.copy()
53+
54+
df2 = df.replace(to_replace=1.5, value=55.5)
55+
56+
if using_copy_on_write:
57+
# TODO: Block splitting would allow us to avoid copying b
58+
assert np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
59+
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
60+
61+
else:
62+
assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
63+
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
64+
65+
df2.loc[0, "c"] = 1
66+
tm.assert_frame_equal(df, df_orig) # Original is unchanged
67+
68+
if using_copy_on_write:
69+
assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
70+
# TODO: This should split and not copy the whole block
71+
# assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))
72+
73+
74+
def test_replace_coerce_single_column(using_copy_on_write, using_array_manager):
75+
df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
1476
df_orig = df.copy()
77+
78+
df2 = df.replace(to_replace=1.5, value="a")
79+
80+
if using_copy_on_write:
81+
assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
82+
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
83+
84+
elif not using_array_manager:
85+
assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
86+
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
87+
88+
if using_copy_on_write:
89+
df2.loc[0, "b"] = 0.5
90+
tm.assert_frame_equal(df, df_orig) # Original is unchanged
91+
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
92+
93+
94+
def test_replace_to_replace_wrong_dtype(using_copy_on_write):
95+
df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
96+
df_orig = df.copy()
97+
98+
df2 = df.replace(to_replace="xxx", value=1.5)
99+
100+
if using_copy_on_write:
101+
assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
102+
assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
103+
104+
else:
105+
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
106+
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
107+
108+
df2.loc[0, "b"] = 0.5
109+
tm.assert_frame_equal(df, df_orig) # Original is unchanged
110+
111+
if using_copy_on_write:
112+
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
113+
114+
115+
def test_replace_inplace(using_copy_on_write):
116+
df = DataFrame({"a": [1.5, 2, 3]})
117+
arr_a = get_array(df, "a")
118+
df.replace(to_replace=1.5, value=15.5, inplace=True)
119+
120+
assert np.shares_memory(get_array(df, "a"), arr_a)
121+
if using_copy_on_write:
122+
assert df._mgr._has_no_reference(0)
123+
124+
125+
@pytest.mark.parametrize("to_replace", [1.5, [1.5]])
126+
def test_replace_inplace_reference(using_copy_on_write, to_replace):
127+
df = DataFrame({"a": [1.5, 2, 3]})
15128
arr_a = get_array(df, "a")
16129
view = df[:]
17-
df.replace(to_replace=[1], value=2, inplace=True)
130+
df.replace(to_replace=to_replace, value=15.5, inplace=True)
18131

19132
if using_copy_on_write:
20-
assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
133+
assert not np.shares_memory(get_array(df, "a"), arr_a)
21134
assert df._mgr._has_no_reference(0)
22135
assert view._mgr._has_no_reference(0)
23-
tm.assert_frame_equal(view, df_orig)
24136
else:
25-
assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
137+
assert np.shares_memory(get_array(df, "a"), arr_a)
26138

27139

28-
def test_replace_inplace_reference(using_copy_on_write):
140+
@pytest.mark.parametrize("to_replace", ["a", 100.5])
141+
def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace):
29142
df = DataFrame({"a": [1.5, 2, 3]})
30143
arr_a = get_array(df, "a")
31144
view = df[:]
32-
df.replace(to_replace=[1.5], value=15.5, inplace=True)
145+
df.replace(to_replace=to_replace, value=15.5, inplace=True)
33146

147+
assert np.shares_memory(get_array(df, "a"), arr_a)
34148
if using_copy_on_write:
35-
assert not np.shares_memory(get_array(df, "a"), arr_a)
149+
assert not df._mgr._has_no_reference(0)
150+
assert not view._mgr._has_no_reference(0)
151+
152+
153+
@pytest.mark.parametrize("to_replace", [1, [1]])
154+
@pytest.mark.parametrize("val", [1, 1.5])
155+
def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace):
156+
df = DataFrame({"a": Categorical([1, 2, 3])})
157+
df_orig = df.copy()
158+
arr_a = get_array(df, "a")
159+
view = df[:]
160+
df.replace(to_replace=to_replace, value=val, inplace=True)
161+
162+
if using_copy_on_write:
163+
assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
36164
assert df._mgr._has_no_reference(0)
37165
assert view._mgr._has_no_reference(0)
166+
tm.assert_frame_equal(view, df_orig)
38167
else:
39-
assert np.shares_memory(get_array(df, "a"), arr_a)
168+
assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
169+
170+
171+
@pytest.mark.parametrize("val", [1, 1.5])
172+
def test_replace_categorical_inplace(using_copy_on_write, val):
173+
df = DataFrame({"a": Categorical([1, 2, 3])})
174+
arr_a = get_array(df, "a")
175+
df.replace(to_replace=1, value=val, inplace=True)
176+
177+
assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
178+
if using_copy_on_write:
179+
assert df._mgr._has_no_reference(0)
180+
181+
expected = DataFrame({"a": Categorical([val, 2, 3])})
182+
tm.assert_frame_equal(df, expected)
183+
184+
185+
@pytest.mark.parametrize("val", [1, 1.5])
186+
def test_replace_categorical(using_copy_on_write, val):
187+
df = DataFrame({"a": Categorical([1, 2, 3])})
188+
df_orig = df.copy()
189+
df2 = df.replace(to_replace=1, value=val)
190+
191+
if using_copy_on_write:
192+
assert df._mgr._has_no_reference(0)
193+
assert df2._mgr._has_no_reference(0)
194+
assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes)
195+
tm.assert_frame_equal(df, df_orig)
196+
197+
arr_a = get_array(df2, "a").codes
198+
df2.iloc[0, 0] = 2.0
199+
assert np.shares_memory(get_array(df2, "a").codes, arr_a)
40200

41201

42202
@pytest.mark.parametrize("method", ["where", "mask"])

0 commit comments

Comments
 (0)