From e1820849fda7afa2e951b31ff91bea68abaa2799 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 27 Feb 2023 00:50:12 +0100 Subject: [PATCH 1/5] ENH: Improve replace lazy copy handling --- pandas/core/internals/blocks.py | 27 ++++++-- pandas/tests/copy_view/test_replace.py | 89 ++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 35a7855b8240f..7a24649fc60af 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -714,6 +714,8 @@ def replace_list( (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) ] if not len(pairs): + if using_cow: + return [self.copy(deep=False)] # shortcut, nothing to replace return [self] if inplace else [self.copy()] @@ -734,8 +736,9 @@ def replace_list( masks = [extract_bool_array(x) for x in masks] if using_cow and inplace: - # TODO(CoW): Optimize - rb = [self.copy()] + # Don't set up refs here, otherwise we will think that we have + # references when we check again later + rb = [self] else: rb = [self if inplace else self.copy()] for i, (src, dest) in enumerate(pairs): @@ -762,10 +765,16 @@ def replace_list( mask=m, # type: ignore[arg-type] inplace=inplace, regex=regex, + using_cow=using_cow, ) if convert and blk.is_object and not all(x is None for x in dest_list): # GH#44498 avoid unwanted cast-back - result = extend_blocks([b.convert(copy=True) for b in result]) + result = extend_blocks( + [ + b.convert(copy=True and not using_cow, using_cow=using_cow) + for b in result + ] + ) new_rb.extend(result) rb = new_rb return rb @@ -778,6 +787,7 @@ def _replace_coerce( mask: npt.NDArray[np.bool_], inplace: bool = True, regex: bool = False, + using_cow: bool = False, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -811,17 +821,24 @@ def _replace_coerce( if value is None: # gh-45601, gh-45836, gh-46634 if mask.any(): - nb = self.astype(np.dtype(object), copy=False) - if nb is self and not inplace: + has_ref = self.refs.has_reference() + nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow) + if (nb is self or using_cow) and not inplace: + nb = nb.copy() + elif inplace and has_ref and nb.refs.has_reference(): + # no copy in astype and we had refs before nb = nb.copy() putmask_inplace(nb.values, mask, value) return [nb] + if using_cow: + return [self.copy(deep=False)] return [self] if inplace else [self.copy()] return self.replace( to_replace=to_replace, value=value, inplace=inplace, mask=mask, + using_cow=using_cow, ) # --------------------------------------------------------------------- diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 7cd197541ac33..01d3336b0ce01 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -216,3 +216,92 @@ def test_masking_inplace(using_copy_on_write, method): tm.assert_frame_equal(view, df_orig) else: assert np.shares_memory(get_array(df, "a"), arr_a) + + +def test_replace_empty_list(using_copy_on_write): + df = DataFrame({"a": [1, 2]}) + arr = get_array(df, "a") + df.replace([], [], inplace=True) + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + df2 = df.replace([], []) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not df._mgr._has_no_reference(0) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + +def test_replace_list_inplace(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + arr = get_array(df, "a") + df.replace([1], 4, inplace=True) + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +def test_replace_object_list_inplace(using_copy_on_write): + df = DataFrame({"a": ["a", "b", "c"]}) + arr = get_array(df, "a") + df.replace(["c"], "d", inplace=True) + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +def test_replace_list_multiple_elements_inplace(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + arr = get_array(df, "a") + df.replace([1, 2], 4, inplace=True) + if using_copy_on_write: + # TODO(CoW): This should share memory + assert not np.shares_memory(arr, get_array(df, "a")) + assert df._mgr._has_no_reference(0) + else: + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_replace_list_inplace_refs(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + view = df[:] + arr = get_array(df, "a") + df.replace([1, 2], 4, inplace=True) + if using_copy_on_write: + assert not np.shares_memory(arr, get_array(df, "a")) + assert df._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_replace_list_none(using_copy_on_write): + df = DataFrame({"a": ["a", "b", "c"]}) + arr = get_array(df, "a") + df.replace(["a"], value=None, inplace=True) + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + df_orig = df.copy() + df2 = df.replace(["b"], value=None) + tm.assert_frame_equal(df, df_orig) + + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + +def test_replace_list_none_inplace_refs(using_copy_on_write): + df = DataFrame({"a": ["a", "b", "c"]}) + arr = get_array(df, "a") + df_orig = df.copy() + view = df[:] + df.replace(["a"], value=None, inplace=True) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + else: + assert np.shares_memory(arr, get_array(df, "a")) From 1fcea8a76e18c31a9e2d0623799b0d8393b053d7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 27 Feb 2023 01:26:28 +0100 Subject: [PATCH 2/5] Refactor tests --- pandas/tests/copy_view/test_replace.py | 61 ++++++++------------------ 1 file changed, 19 insertions(+), 42 deletions(-) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 01d3336b0ce01..251d4a8ac5b18 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -112,7 +112,8 @@ def test_replace_to_replace_wrong_dtype(using_copy_on_write): assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) -def test_replace_inplace(using_copy_on_write): +@pytest.mark.parametrize("to_replace", [1.5, [1.5], []]) +def test_replace_inplace(using_copy_on_write, to_replace): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") df.replace(to_replace=1.5, value=15.5, inplace=True) @@ -122,7 +123,7 @@ def test_replace_inplace(using_copy_on_write): assert df._mgr._has_no_reference(0) -@pytest.mark.parametrize("to_replace", [1.5, [1.5]]) +@pytest.mark.parametrize("to_replace", [1.5, [1.5], []]) def test_replace_inplace_reference(using_copy_on_write, to_replace): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") @@ -130,9 +131,14 @@ def test_replace_inplace_reference(using_copy_on_write, to_replace): df.replace(to_replace=to_replace, value=15.5, inplace=True) if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), arr_a) - assert df._mgr._has_no_reference(0) - assert view._mgr._has_no_reference(0) + if isinstance(to_replace, list) and len(to_replace) == 0: + assert np.shares_memory(get_array(df, "a"), arr_a) + assert not df._mgr._has_no_reference(0) + assert not view._mgr._has_no_reference(0) + else: + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) else: assert np.shares_memory(get_array(df, "a"), arr_a) @@ -220,11 +226,6 @@ def test_masking_inplace(using_copy_on_write, method): def test_replace_empty_list(using_copy_on_write): df = DataFrame({"a": [1, 2]}) - arr = get_array(df, "a") - df.replace([], [], inplace=True) - assert np.shares_memory(arr, get_array(df, "a")) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) df2 = df.replace([], []) if using_copy_on_write: @@ -234,22 +235,17 @@ def test_replace_empty_list(using_copy_on_write): assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) -def test_replace_list_inplace(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3]}) - arr = get_array(df, "a") - df.replace([1], 4, inplace=True) - assert np.shares_memory(arr, get_array(df, "a")) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) - - -def test_replace_object_list_inplace(using_copy_on_write): +@pytest.mark.parametrize("value", ["d", None]) +def test_replace_object_list_inplace(using_copy_on_write, value): df = DataFrame({"a": ["a", "b", "c"]}) arr = get_array(df, "a") - df.replace(["c"], "d", inplace=True) - assert np.shares_memory(arr, get_array(df, "a")) - if using_copy_on_write: + df.replace(["c"], value, inplace=True) + if using_copy_on_write or value is None: + assert np.shares_memory(arr, get_array(df, "a")) assert df._mgr._has_no_reference(0) + else: + # This could be inplace + assert not np.shares_memory(arr, get_array(df, "a")) def test_replace_list_multiple_elements_inplace(using_copy_on_write): @@ -264,27 +260,8 @@ def test_replace_list_multiple_elements_inplace(using_copy_on_write): assert np.shares_memory(arr, get_array(df, "a")) -def test_replace_list_inplace_refs(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3]}) - df_orig = df.copy() - view = df[:] - arr = get_array(df, "a") - df.replace([1, 2], 4, inplace=True) - if using_copy_on_write: - assert not np.shares_memory(arr, get_array(df, "a")) - assert df._mgr._has_no_reference(0) - tm.assert_frame_equal(view, df_orig) - else: - assert np.shares_memory(arr, get_array(df, "a")) - - def test_replace_list_none(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"]}) - arr = get_array(df, "a") - df.replace(["a"], value=None, inplace=True) - assert np.shares_memory(arr, get_array(df, "a")) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) df_orig = df.copy() df2 = df.replace(["b"], value=None) From d37c5e27a95e962e37b1f4d5bb2896d4a0ab832b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 27 Feb 2023 20:23:23 +0100 Subject: [PATCH 3/5] Modify test --- pandas/tests/copy_view/test_replace.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 251d4a8ac5b18..4cdbe6c4b99e3 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -123,7 +123,7 @@ def test_replace_inplace(using_copy_on_write, to_replace): assert df._mgr._has_no_reference(0) -@pytest.mark.parametrize("to_replace", [1.5, [1.5], []]) +@pytest.mark.parametrize("to_replace", [1.5, [1.5]]) def test_replace_inplace_reference(using_copy_on_write, to_replace): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") @@ -131,14 +131,9 @@ def test_replace_inplace_reference(using_copy_on_write, to_replace): df.replace(to_replace=to_replace, value=15.5, inplace=True) if using_copy_on_write: - if isinstance(to_replace, list) and len(to_replace) == 0: - assert np.shares_memory(get_array(df, "a"), arr_a) - assert not df._mgr._has_no_reference(0) - assert not view._mgr._has_no_reference(0) - else: - assert not np.shares_memory(get_array(df, "a"), arr_a) - assert df._mgr._has_no_reference(0) - assert view._mgr._has_no_reference(0) + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) else: assert np.shares_memory(get_array(df, "a"), arr_a) @@ -234,6 +229,13 @@ def test_replace_empty_list(using_copy_on_write): else: assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + arr_a = get_array(df, "a") + df.replace([], []) + if using_copy_on_write: + assert np.shares_memory(get_array(df, "a"), arr_a) + assert not df._mgr._has_no_reference(0) + assert not df2._mgr._has_no_reference(0) + @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(using_copy_on_write, value): From cafa3949969904d7ad81e2a7245c8e6801b69012 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 27 Feb 2023 20:24:32 +0100 Subject: [PATCH 4/5] Fix array manager --- pandas/tests/copy_view/test_replace.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 4cdbe6c4b99e3..ec26c57e3434a 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -244,6 +244,7 @@ def test_replace_object_list_inplace(using_copy_on_write, value): df.replace(["c"], value, inplace=True) if using_copy_on_write or value is None: assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: assert df._mgr._has_no_reference(0) else: # This could be inplace From 7ba2db79ad1f3648fad22c306e54f0b443ad6fb3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 27 Feb 2023 21:10:48 +0100 Subject: [PATCH 5/5] Fix --- pandas/tests/copy_view/test_replace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index ec26c57e3434a..13991ac8b81cb 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -244,11 +244,11 @@ def test_replace_object_list_inplace(using_copy_on_write, value): df.replace(["c"], value, inplace=True) if using_copy_on_write or value is None: assert np.shares_memory(arr, get_array(df, "a")) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) else: # This could be inplace assert not np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) def test_replace_list_multiple_elements_inplace(using_copy_on_write):