Skip to content

Commit b0c0d8a

Browse files
phofl and lithomas1 authored
ENH: Add lazy copy to replace (#50746)
* WIP: ENH: CoW support for replace
* Fix bugs, should work for 1-D
* Refactor to support more than one loc
* Don't access refs if None
* Flesh out tests, fix issues
* fix tests
* fixes
* Simplify block splitting logic
* Clear ref
* Adjust test
* Fix test
* Small fix
* Fix 32 bit test
* Adjust test
* Update pandas/tests/copy_view/test_methods.py

Co-authored-by: Thomas Li <[email protected]>
Co-authored-by: Thomas Li <[email protected]>
1 parent 579e070 commit b0c0d8a

File tree

5 files changed

+108
-15
lines changed

5 files changed

+108
-15
lines changed

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5555,7 +5555,7 @@ def _replace_columnwise(
55555555
DataFrame or None
55565556
"""
55575557
# Operate column-wise
5558-
res = self if inplace else self.copy()
5558+
res = self if inplace else self.copy(deep=None)
55595559
ax = self.columns
55605560

55615561
for i, ax_value in enumerate(ax):

pandas/core/generic.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -7046,6 +7046,7 @@ def replace(
70467046
to_replace = [to_replace]
70477047

70487048
if isinstance(to_replace, (tuple, list)):
7049+
# TODO: Consider copy-on-write for non-replaced columns here
70497050
if isinstance(self, ABCDataFrame):
70507051
from pandas import Series
70517052

@@ -7105,7 +7106,7 @@ def replace(
71057106
if not self.size:
71067107
if inplace:
71077108
return None
7108-
return self.copy()
7109+
return self.copy(deep=None)
71097110

71107111
if is_dict_like(to_replace):
71117112
if is_dict_like(value): # {'A' : NA} -> {'A' : 0}

pandas/core/internals/managers.py

+20-13
Original file line numberDiff line numberDiff line change
@@ -1220,36 +1220,43 @@ def value_getitem(placement):
12201220
if inplace and blk.should_store(value):
12211221
# Updating inplace -> check if we need to do Copy-on-Write
12221222
if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
1223-
blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True)
1223+
nbs_tup = tuple(blk.delete(blk_locs))
1224+
first_nb = new_block_2d(
1225+
value_getitem(val_locs), BlockPlacement(blk.mgr_locs[blk_locs])
1226+
)
1227+
if self.refs is not None:
1228+
self.refs.extend([self.refs[blkno_l]] * len(nbs_tup))
12241229
self._clear_reference_block(blkno_l)
12251230
else:
12261231
blk.set_inplace(blk_locs, value_getitem(val_locs))
1232+
continue
12271233
else:
12281234
unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
12291235
unfit_val_locs.append(val_locs)
12301236

12311237
# If all block items are unfit, schedule the block for removal.
12321238
if len(val_locs) == len(blk.mgr_locs):
12331239
removed_blknos.append(blkno_l)
1240+
continue
12341241
else:
12351242
nbs = blk.delete(blk_locs)
12361243
# Add first block where old block was and remaining blocks at
12371244
# the end to avoid updating all block numbers
12381245
first_nb = nbs[0]
12391246
nbs_tup = tuple(nbs[1:])
1240-
nr_blocks = len(self.blocks)
1241-
blocks_tup = (
1242-
self.blocks[:blkno_l]
1243-
+ (first_nb,)
1244-
+ self.blocks[blkno_l + 1 :]
1245-
+ nbs_tup
1246-
)
1247-
self.blocks = blocks_tup
1248-
self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))
1247+
nr_blocks = len(self.blocks)
1248+
blocks_tup = (
1249+
self.blocks[:blkno_l]
1250+
+ (first_nb,)
1251+
+ self.blocks[blkno_l + 1 :]
1252+
+ nbs_tup
1253+
)
1254+
self.blocks = blocks_tup
1255+
self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))
12491256

1250-
for i, nb in enumerate(nbs_tup):
1251-
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
1252-
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
1257+
for i, nb in enumerate(nbs_tup):
1258+
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
1259+
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
12531260

12541261
if len(removed_blknos):
12551262
# Remove blocks & update blknos and refs accordingly

pandas/tests/copy_view/test_internals.py

+47
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pandas as pd
77
from pandas import DataFrame
8+
import pandas._testing as tm
89
from pandas.tests.copy_view.util import get_array
910

1011

@@ -93,3 +94,49 @@ def test_switch_options():
9394
subset.iloc[0, 0] = 0
9495
# df updated with CoW disabled
9596
assert df.iloc[0, 0] == 0
97+
98+
99+
@td.skip_array_manager_invalid_test
100+
@pytest.mark.parametrize("dtype", [np.intp, np.int8])
101+
@pytest.mark.parametrize(
102+
"locs, arr",
103+
[
104+
([0], np.array([-1, -2, -3])),
105+
([1], np.array([-1, -2, -3])),
106+
([5], np.array([-1, -2, -3])),
107+
([0, 1], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
108+
([0, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
109+
([0, 1, 2], np.array([[-1, -2, -3], [-4, -5, -6], [-4, -5, -6]]).T),
110+
([1, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
111+
([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
112+
([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
113+
],
114+
)
115+
def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
116+
# Nothing currently calls iset with
117+
# more than 1 loc with inplace=True (only happens with inplace=False)
118+
# but ensure that it works
119+
df = DataFrame(
120+
{
121+
"a": [1, 2, 3],
122+
"b": [4, 5, 6],
123+
"c": [7, 8, 9],
124+
"d": [10, 11, 12],
125+
"e": [13, 14, 15],
126+
"f": ["a", "b", "c"],
127+
},
128+
)
129+
arr = arr.astype(dtype)
130+
df_orig = df.copy()
131+
df2 = df.copy(deep=None) # Trigger a CoW (if enabled, otherwise makes copy)
132+
df2._mgr.iset(locs, arr, inplace=True)
133+
134+
tm.assert_frame_equal(df, df_orig)
135+
136+
if using_copy_on_write:
137+
for i, col in enumerate(df.columns):
138+
if i not in locs:
139+
assert np.shares_memory(get_array(df, col), get_array(df2, col))
140+
else:
141+
for col in df.columns:
142+
assert not np.shares_memory(get_array(df, col), get_array(df2, col))

pandas/tests/copy_view/test_methods.py

+38
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,44 @@ def test_squeeze(using_copy_on_write):
974974
assert df.loc[0, "a"] == 0
975975

976976

977+
@pytest.mark.parametrize(
978+
"replace_kwargs",
979+
[
980+
{"to_replace": {"a": 1, "b": 4}, "value": -1},
981+
# Test CoW splits blocks to avoid copying unchanged columns
982+
{"to_replace": {"a": 1}, "value": -1},
983+
{"to_replace": {"b": 4}, "value": -1},
984+
{"to_replace": {"b": {4: 1}}},
985+
# TODO: Add these in a further optimization
986+
# We would need to see which columns got replaced in the mask
987+
# which could be expensive
988+
# {"to_replace": {"b": 1}},
989+
# 1
990+
],
991+
)
992+
def test_replace(using_copy_on_write, replace_kwargs):
993+
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
994+
df_orig = df.copy()
995+
996+
df_replaced = df.replace(**replace_kwargs)
997+
998+
if using_copy_on_write:
999+
if (df_replaced["b"] == df["b"]).all():
1000+
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
1001+
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
1002+
1003+
# mutating replaced df triggers a copy-on-write for that column/block
1004+
df_replaced.loc[0, "c"] = -1
1005+
if using_copy_on_write:
1006+
assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
1007+
1008+
if "a" in replace_kwargs["to_replace"]:
1009+
arr = get_array(df_replaced, "a")
1010+
df_replaced.loc[0, "a"] = 100
1011+
assert np.shares_memory(get_array(df_replaced, "a"), arr)
1012+
tm.assert_frame_equal(df, df_orig)
1013+
1014+
9771015
def test_putmask(using_copy_on_write):
9781016
df = DataFrame({"a": [1, 2], "b": 1, "c": 2})
9791017
view = df[:]

0 commit comments

Comments
 (0)