Skip to content

ENH: Improve replace lazy copy handling #51658

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 27, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 22 additions & 5 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,8 @@ def replace_list(
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
]
if not len(pairs):
if using_cow:
return [self.copy(deep=False)]
# shortcut, nothing to replace
return [self] if inplace else [self.copy()]

Expand All @@ -734,8 +736,9 @@ def replace_list(
masks = [extract_bool_array(x) for x in masks]

if using_cow and inplace:
# TODO(CoW): Optimize
rb = [self.copy()]
# Don't set up refs here, otherwise we will think that we have
# references when we check again later
rb = [self]
else:
rb = [self if inplace else self.copy()]
for i, (src, dest) in enumerate(pairs):
Expand All @@ -762,10 +765,16 @@ def replace_list(
mask=m, # type: ignore[arg-type]
inplace=inplace,
regex=regex,
using_cow=using_cow,
)
if convert and blk.is_object and not all(x is None for x in dest_list):
# GH#44498 avoid unwanted cast-back
result = extend_blocks([b.convert(copy=True) for b in result])
result = extend_blocks(
[
b.convert(copy=True and not using_cow, using_cow=using_cow)
for b in result
]
)
new_rb.extend(result)
rb = new_rb
return rb
Expand All @@ -778,6 +787,7 @@ def _replace_coerce(
mask: npt.NDArray[np.bool_],
inplace: bool = True,
regex: bool = False,
using_cow: bool = False,
) -> list[Block]:
"""
Replace value corresponding to the given boolean array with another
Expand Down Expand Up @@ -811,17 +821,24 @@ def _replace_coerce(
if value is None:
# gh-45601, gh-45836, gh-46634
if mask.any():
nb = self.astype(np.dtype(object), copy=False)
if nb is self and not inplace:
has_ref = self.refs.has_reference()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC checking self.refs.has_reference here vs a few lines down can have different results in the case where self.dtype == object? does that make a difference?

if it doesnt make a difference, i think we can refactor this to share some code with some other CoW-handling

nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow)
Copy link
Member

@lithomas1 lithomas1 Feb 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

copy=False doesn't do anything here right since astyping (to object) will always make a copy?

if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
# Explicit copy, or required since NumPy can't view from / to object.
return arr.astype(dtype, copy=True)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No this can create a view, would have to step through the code paths but discovered a bug when this is a view

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Haven't looked into this too closely, but I guess this would be a view when going from object -> object.

Happy to let optimizing this be a follow up.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah this is the case where I was able to produce the bug. Run time wise this is more or less optimal right now, the actual checks are performed within astype anyway

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was referring to the nb.copy() below. I'm a little worried that that might double copy if astype already copied.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah this can still happen, but Astype should take care of returning a block without references if a copy already happened (check is not perfect yet though)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for clarifying.

if (nb is self or using_cow) and not inplace:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the nb is self here ever possible?

nb = nb.copy()
elif inplace and has_ref and nb.refs.has_reference():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC nb.refs.has_reference should only be True if we are already object dtype?

# no copy in astype and we had refs before
nb = nb.copy()
putmask_inplace(nb.values, mask, value)
return [nb]
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]
return self.replace(
to_replace=to_replace,
value=value,
inplace=inplace,
mask=mask,
using_cow=using_cow,
)

# ---------------------------------------------------------------------
Expand Down
76 changes: 71 additions & 5 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def test_replace_to_replace_wrong_dtype(using_copy_on_write):
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))


def test_replace_inplace(using_copy_on_write):
@pytest.mark.parametrize("to_replace", [1.5, [1.5], []])
def test_replace_inplace(using_copy_on_write, to_replace):
df = DataFrame({"a": [1.5, 2, 3]})
arr_a = get_array(df, "a")
df.replace(to_replace=1.5, value=15.5, inplace=True)
Expand All @@ -122,17 +123,22 @@ def test_replace_inplace(using_copy_on_write):
assert df._mgr._has_no_reference(0)


@pytest.mark.parametrize("to_replace", [1.5, [1.5]])
@pytest.mark.parametrize("to_replace", [1.5, [1.5], []])
def test_replace_inplace_reference(using_copy_on_write, to_replace):
df = DataFrame({"a": [1.5, 2, 3]})
arr_a = get_array(df, "a")
view = df[:]
df.replace(to_replace=to_replace, value=15.5, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), arr_a)
assert df._mgr._has_no_reference(0)
assert view._mgr._has_no_reference(0)
if isinstance(to_replace, list) and len(to_replace) == 0:
Copy link
Member

@lithomas1 lithomas1 Feb 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be better to parametrize test_replace_emptylist for inplace=True/False

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added it to another test where it added less complexity

assert np.shares_memory(get_array(df, "a"), arr_a)
assert not df._mgr._has_no_reference(0)
assert not view._mgr._has_no_reference(0)
else:
assert not np.shares_memory(get_array(df, "a"), arr_a)
assert df._mgr._has_no_reference(0)
assert view._mgr._has_no_reference(0)
else:
assert np.shares_memory(get_array(df, "a"), arr_a)

Expand Down Expand Up @@ -216,3 +222,63 @@ def test_masking_inplace(using_copy_on_write, method):
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(get_array(df, "a"), arr_a)


def test_replace_empty_list(using_copy_on_write):
df = DataFrame({"a": [1, 2]})

df2 = df.replace([], [])
if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not df._mgr._has_no_reference(0)
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))


@pytest.mark.parametrize("value", ["d", None])
def test_replace_object_list_inplace(using_copy_on_write, value):
df = DataFrame({"a": ["a", "b", "c"]})
arr = get_array(df, "a")
df.replace(["c"], value, inplace=True)
if using_copy_on_write or value is None:
assert np.shares_memory(arr, get_array(df, "a"))
assert df._mgr._has_no_reference(0)
else:
# This could be inplace
assert not np.shares_memory(arr, get_array(df, "a"))


def test_replace_list_multiple_elements_inplace(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3]})
arr = get_array(df, "a")
df.replace([1, 2], 4, inplace=True)
if using_copy_on_write:
# TODO(CoW): This should share memory
assert not np.shares_memory(arr, get_array(df, "a"))
assert df._mgr._has_no_reference(0)
else:
assert np.shares_memory(arr, get_array(df, "a"))


def test_replace_list_none(using_copy_on_write):
df = DataFrame({"a": ["a", "b", "c"]})

df_orig = df.copy()
df2 = df.replace(["b"], value=None)
tm.assert_frame_equal(df, df_orig)

assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))


def test_replace_list_none_inplace_refs(using_copy_on_write):
df = DataFrame({"a": ["a", "b", "c"]})
arr = get_array(df, "a")
df_orig = df.copy()
view = df[:]
df.replace(["a"], value=None, inplace=True)
if using_copy_on_write:
assert df._mgr._has_no_reference(0)
assert not np.shares_memory(arr, get_array(df, "a"))
tm.assert_frame_equal(df_orig, view)
else:
assert np.shares_memory(arr, get_array(df, "a"))