Skip to content

ENH: Add lazy copy to replace #50746

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jan 17, 2023
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5530,7 +5530,7 @@ def _replace_columnwise(
DataFrame or None
"""
# Operate column-wise
res = self if inplace else self.copy()
res = self if inplace else self.copy(deep=None)
ax = self.columns

for i, ax_value in enumerate(ax):
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7046,6 +7046,7 @@ def replace(
to_replace = [to_replace]

if isinstance(to_replace, (tuple, list)):
# TODO: Consider copy-on-write for non-replaced columns here
if isinstance(self, ABCDataFrame):
from pandas import Series

Expand Down Expand Up @@ -7105,7 +7106,7 @@ def replace(
if not self.size:
if inplace:
return None
return self.copy()
return self.copy(deep=None)

if is_dict_like(to_replace):
if is_dict_like(value): # {'A' : NA} -> {'A' : 0}
Expand Down
33 changes: 20 additions & 13 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1220,36 +1220,43 @@ def value_getitem(placement):
if inplace and blk.should_store(value):
# Updating inplace -> check if we need to do Copy-on-Write
if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True)
nbs_tup = tuple(blk.delete(blk_locs))
first_nb = new_block_2d(
value_getitem(val_locs), BlockPlacement(blk.mgr_locs[blk_locs])
)
if self.refs is not None:
self.refs.extend([self.refs[blkno_l]] * len(nbs_tup))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to clear the refs at self.refs[blkno_l] now, since first_nb should not have a reference block?

Copy link
Member Author

@phofl phofl Jan 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thx for catching, must have removed the line accidentally

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adjusted one of the tests to cover this

self._clear_reference_block(blkno_l)
else:
blk.set_inplace(blk_locs, value_getitem(val_locs))
continue
else:
unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
unfit_val_locs.append(val_locs)

# If all block items are unfit, schedule the block for removal.
if len(val_locs) == len(blk.mgr_locs):
removed_blknos.append(blkno_l)
continue
else:
nbs = blk.delete(blk_locs)
# Add first block where old block was and remaining blocks at
# the end to avoid updating all block numbers
first_nb = nbs[0]
nbs_tup = tuple(nbs[1:])
nr_blocks = len(self.blocks)
blocks_tup = (
self.blocks[:blkno_l]
+ (first_nb,)
+ self.blocks[blkno_l + 1 :]
+ nbs_tup
)
self.blocks = blocks_tup
self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))
nr_blocks = len(self.blocks)
blocks_tup = (
self.blocks[:blkno_l]
+ (first_nb,)
+ self.blocks[blkno_l + 1 :]
+ nbs_tup
)
self.blocks = blocks_tup
self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

for i, nb in enumerate(nbs_tup):
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
for i, nb in enumerate(nbs_tup):
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks

if len(removed_blknos):
# Remove blocks & update blknos and refs accordingly
Expand Down
46 changes: 46 additions & 0 deletions pandas/tests/copy_view/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array


Expand Down Expand Up @@ -93,3 +94,48 @@ def test_switch_options():
subset.iloc[0, 0] = 0
# df updated with CoW disabled
assert df.iloc[0, 0] == 0


@td.skip_array_manager_invalid_test
@pytest.mark.parametrize(
"locs, arr",
[
([0], np.array([-1, -2, -3], dtype=np.intp)),
([1], np.array([-1, -2, -3], dtype=np.intp)),
([5], np.array([-1, -2, -3], dtype=np.intp)),
([0, 1], np.array([-1, -2, -3], dtype=np.intp)),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @lithomas1 I don't think that this is valid or can be reached in this way. I think the dimension of your values has to be the same as your indexer. This raises if blk.should_store_value is False, because it does not get broadcast.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right. Even for the non-CoW case, the iset operation doesn't fail (strangely), but printing the DataFrame afterwards does.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, you get a block that has dimension 2 according to the block placement, but the underlying array has only one column. It doesn't really error when created, but as soon as you access it you'll get into trouble :)

([0, 2], np.array([-1, -2, -3], dtype=np.intp)),
([0, 1, 2], np.array([-1, -2, -3], dtype=np.intp)),
([1, 2], np.array([-1, -2, -3], dtype=np.intp)),
([1, 3], np.array([-1, -2, -3], dtype=np.intp)),
([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]], dtype=np.intp).T),
],
)
def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr):
    """Check that ``BlockManager.iset`` with ``inplace=True`` splits blocks so
    that only the columns in ``locs`` are rewritten, leaving untouched columns
    shared with the parent under copy-on-write.
    """
    # Nothing currently calls iset with
    # more than 1 loc with inplace=True (only happens with inplace=False)
    # but ensure that it works
    df = DataFrame(
        {
            "a": [1, 2, 3],
            "b": [4, 5, 6],
            "c": [7, 8, 9],
            "d": [10, 11, 12],
            "e": [13, 14, 15],
        },
        dtype=np.intp,
    )
    # Mixed in an object column so the frame holds more than one block.
    df["f"] = ["a", "b", "c"]
    df_orig = df.copy()
    df2 = df.copy(deep=None)  # Trigger a CoW (if enabled, otherwise makes copy)
    df2._mgr.iset(locs, arr, inplace=True)

    # Writing into the (lazy) copy must never modify the original frame.
    tm.assert_frame_equal(df, df_orig)

    if using_copy_on_write:
        # Columns not targeted by iset should still share memory with df —
        # i.e. the block was split instead of copied wholesale.
        for i, col in enumerate(df.columns):
            if i not in locs:
                assert np.shares_memory(get_array(df, col), get_array(df2, col))
    else:
        # Without CoW, copy(deep=None) made an eager copy: nothing is shared.
        for col in df.columns:
            assert not np.shares_memory(get_array(df, col), get_array(df2, col))
38 changes: 38 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,44 @@ def test_squeeze(using_copy_on_write):
assert df.loc[0, "a"] == 0


@pytest.mark.parametrize(
"replace_kwargs",
[
{"to_replace": {"a": 1, "b": 4}, "value": -1},
# Test CoW splits blocks to avoid copying unchanged columns
{"to_replace": {"a": 1}, "value": -1},
{"to_replace": {"b": 4}, "value": -1},
{"to_replace": {"b": {4: 1}}},
# TODO: Add these in a further optimization
# We would need to see which columns got replaced in the mask
# which could be expensive
# {"to_replace": {"b": 1}},
# 1
],
)
def test_replace(using_copy_on_write, replace_kwargs):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
df_orig = df.copy()

df_replaced = df.replace(**replace_kwargs)

if using_copy_on_write:
if (df_replaced["b"] == df["b"]).all():
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))

# mutating squeezed df triggers a copy-on-write for that column/block
df_replaced.loc[0, "c"] = -1
if using_copy_on_write:
assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "a"))

if "a" in replace_kwargs["to_replace"]:
arr = get_array(df_replaced, "a")
df_replaced.loc[0, "a"] = 100
assert np.shares_memory(get_array(df_replaced, "a"), arr)
tm.assert_frame_equal(df, df_orig)


def test_putmask(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1, "c": 2})
view = df[:]
Expand Down