Skip to content

Backport PR #51658 on branch 2.0.x (ENH: Improve replace lazy copy handling) #51679

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 22 additions & 5 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,8 @@ def replace_list(
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
]
if not len(pairs):
if using_cow:
return [self.copy(deep=False)]
# shortcut, nothing to replace
return [self] if inplace else [self.copy()]

Expand All @@ -734,8 +736,9 @@ def replace_list(
masks = [extract_bool_array(x) for x in masks]

if using_cow and inplace:
# TODO(CoW): Optimize
rb = [self.copy()]
# Don't set up refs here, otherwise we will think that we have
# references when we check again later
rb = [self]
else:
rb = [self if inplace else self.copy()]
for i, (src, dest) in enumerate(pairs):
Expand All @@ -762,10 +765,16 @@ def replace_list(
mask=m, # type: ignore[arg-type]
inplace=inplace,
regex=regex,
using_cow=using_cow,
)
if convert and blk.is_object and not all(x is None for x in dest_list):
# GH#44498 avoid unwanted cast-back
result = extend_blocks([b.convert(copy=True) for b in result])
result = extend_blocks(
[
b.convert(copy=True and not using_cow, using_cow=using_cow)
for b in result
]
)
new_rb.extend(result)
rb = new_rb
return rb
Expand All @@ -778,6 +787,7 @@ def _replace_coerce(
mask: npt.NDArray[np.bool_],
inplace: bool = True,
regex: bool = False,
using_cow: bool = False,
) -> list[Block]:
"""
Replace value corresponding to the given boolean array with another
Expand Down Expand Up @@ -811,17 +821,24 @@ def _replace_coerce(
if value is None:
# gh-45601, gh-45836, gh-46634
if mask.any():
nb = self.astype(np.dtype(object), copy=False)
if nb is self and not inplace:
has_ref = self.refs.has_reference()
nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow)
if (nb is self or using_cow) and not inplace:
nb = nb.copy()
elif inplace and has_ref and nb.refs.has_reference():
# no copy in astype and we had refs before
nb = nb.copy()
putmask_inplace(nb.values, mask, value)
return [nb]
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]
return self.replace(
to_replace=to_replace,
value=value,
inplace=inplace,
mask=mask,
using_cow=using_cow,
)

# ---------------------------------------------------------------------
Expand Down
71 changes: 70 additions & 1 deletion pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def test_replace_to_replace_wrong_dtype(using_copy_on_write):
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))


def test_replace_inplace(using_copy_on_write):
@pytest.mark.parametrize("to_replace", [1.5, [1.5], []])
def test_replace_inplace(using_copy_on_write, to_replace):
df = DataFrame({"a": [1.5, 2, 3]})
arr_a = get_array(df, "a")
df.replace(to_replace=1.5, value=15.5, inplace=True)
Expand Down Expand Up @@ -216,3 +217,71 @@ def test_masking_inplace(using_copy_on_write, method):
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(get_array(df, "a"), arr_a)


def test_replace_empty_list(using_copy_on_write):
    """Check memory sharing and reference state after ``replace([], [])``.

    When ``using_copy_on_write`` is true, the returned frame must share its
    "a" array with the source frame and ``_has_no_reference(0)`` must be
    false on the source. Otherwise the returned frame must not share memory.
    A second call whose result is not kept must leave the source array in
    place and, when ``using_copy_on_write`` is true, both frames must still
    report ``_has_no_reference(0)`` as false.
    """
    df = DataFrame({"a": [1, 2]})

    result = df.replace([], [])
    shares = np.shares_memory(get_array(result, "a"), get_array(df, "a"))
    if not using_copy_on_write:
        assert not shares
    else:
        assert shares
        assert not df._mgr._has_no_reference(0)

    values_before = get_array(df, "a")
    # The return value of this second call is not kept.
    df.replace([], [])
    if using_copy_on_write:
        assert np.shares_memory(get_array(df, "a"), values_before)
        assert not df._mgr._has_no_reference(0)
        assert not result._mgr._has_no_reference(0)


@pytest.mark.parametrize("value", ["d", None])
def test_replace_object_list_inplace(using_copy_on_write, value):
    """Check in-place list replace on a string column for ``"d"`` and ``None``.

    The original array must still be shared when ``using_copy_on_write`` is
    true or ``value`` is ``None``; in the remaining case it must not be
    shared. When ``using_copy_on_write`` is true, ``_has_no_reference(0)``
    must be true afterwards.
    """
    df = DataFrame({"a": ["a", "b", "c"]})
    values_before = get_array(df, "a")

    df.replace(["c"], value, inplace=True)

    if not using_copy_on_write and value is not None:
        # This could be inplace
        assert not np.shares_memory(values_before, get_array(df, "a"))
    else:
        assert np.shares_memory(values_before, get_array(df, "a"))

    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)


def test_replace_list_multiple_elements_inplace(using_copy_on_write):
    """Check in-place replace of several values (``[1, 2]`` -> ``4``).

    When ``using_copy_on_write`` is false, the frame must keep sharing
    memory with the array captured before the call. When it is true, the
    test currently expects no sharing and ``_has_no_reference(0)`` to be
    true.
    """
    df = DataFrame({"a": [1, 2, 3]})
    values_before = get_array(df, "a")

    df.replace([1, 2], 4, inplace=True)

    if not using_copy_on_write:
        assert np.shares_memory(values_before, get_array(df, "a"))
        return

    # TODO(CoW): This should share memory
    assert not np.shares_memory(values_before, get_array(df, "a"))
    assert df._mgr._has_no_reference(0)


def test_replace_list_none(using_copy_on_write):
    """Check that a non-inplace list replace with ``value=None`` copies.

    The source frame must be unchanged after the call, and the returned
    frame must not share its "a" array with the source. The
    ``using_copy_on_write`` argument is accepted but not read here.
    """
    df = DataFrame({"a": ["a", "b", "c"]})
    expected = df.copy()

    result = df.replace(["b"], value=None)

    # The source frame must be left untouched.
    tm.assert_frame_equal(df, expected)

    source_values = get_array(df, "a")
    result_values = get_array(result, "a")
    assert not np.shares_memory(source_values, result_values)


def test_replace_list_none_inplace_refs(using_copy_on_write):
    """Check in-place list replace with ``value=None`` while ``df[:]`` is alive.

    When ``using_copy_on_write`` is true, the frame must report
    ``_has_no_reference(0)``, must no longer share memory with the array
    captured before the call, and the ``df[:]`` object must still equal the
    pre-call copy. Otherwise the frame must keep sharing the captured array.
    """
    df = DataFrame({"a": ["a", "b", "c"]})
    values_before = get_array(df, "a")
    expected = df.copy()
    # Keep a second object derived from df alive across the replace call.
    view = df[:]

    df.replace(["a"], value=None, inplace=True)

    if not using_copy_on_write:
        assert np.shares_memory(values_before, get_array(df, "a"))
        return

    assert df._mgr._has_no_reference(0)
    assert not np.shares_memory(values_before, get_array(df, "a"))
    tm.assert_frame_equal(expected, view)