From 4d782417e7e078dd73878ce88acab4959a160e51 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 27 Feb 2023 01:15:25 +0100 Subject: [PATCH 1/3] ENH: Add CoW mechanism to replace_regex --- pandas/core/internals/blocks.py | 21 ++++++++++-- pandas/core/internals/managers.py | 2 +- pandas/tests/copy_view/test_replace.py | 45 ++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 35a7855b8240f..6c685a09a539e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -651,6 +651,7 @@ def _replace_regex( value, inplace: bool = False, mask=None, + using_cow: bool = False, ) -> list[Block]: """ Replace elements by the given value. @@ -665,6 +666,8 @@ def _replace_regex( Perform inplace modification. mask : array-like of bool, optional True indicate corresponding element is ignored. + using_cow: bool, default False + Specifying if copy on write is enabled. Returns ------- @@ -673,15 +676,27 @@ def _replace_regex( if not self._can_hold_element(to_replace): # i.e. only ObjectBlock, but could in principle include a # String ExtensionBlock + if using_cow: + return [self.copy(deep=False)] return [self] if inplace else [self.copy()] rx = re.compile(to_replace) - new_values = self.values if inplace else self.values.copy() + if using_cow: + if inplace and not self.refs.has_reference(): + refs = self.refs + new_values = self.values + else: + refs = None + new_values = self.values.copy() + else: + refs = None + new_values = self.values if inplace else self.values.copy() + replace_regex(new_values, rx, value, mask) - block = self.make_block(new_values) - return block.convert(copy=False) + block = self.make_block(new_values, refs=refs) + return block.convert(copy=False, using_cow=using_cow) @final def replace_list( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9ecb77ad782b4..39772795dde27 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -467,7 +467,7 @@ def replace(self: T, to_replace, value, inplace: bool) -> T: ) def replace_regex(self, **kwargs): - return self.apply("_replace_regex", **kwargs) + return self.apply("_replace_regex", **kwargs, using_cow=using_copy_on_write()) def replace_list( self: T, diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 7cd197541ac33..55de07b7ead40 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -47,6 +47,51 @@ def test_replace(using_copy_on_write, replace_kwargs): tm.assert_frame_equal(df, df_orig) +def test_replace_regex_inplace_refs(using_copy_on_write): + df = DataFrame({"a": ["aaa", "bbb"]}) + df_orig = df.copy() + view = df[:] + arr = get_array(df, "a") + df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True) + if using_copy_on_write: + assert not np.shares_memory(arr, get_array(df, "a")) + assert df._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_replace_regex_inplace(using_copy_on_write): + df = DataFrame({"a": ["aaa", "bbb"]}) + arr = get_array(df, "a") + df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr, get_array(df, "a")) + + df_orig = df.copy() + df2 = df.replace(to_replace=r"^a.$", value="new", regex=True) + tm.assert_frame_equal(df_orig, df) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + +def test_replace_regex_inplace_no_op(using_copy_on_write): + df = DataFrame({"a": [1, 2]}) + arr = get_array(df, "a") + df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr, get_array(df, "a")) + + df_orig = df.copy() + df2 = df.replace(to_replace=r"^x.$", value="new", regex=True) + tm.assert_frame_equal(df_orig, df) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + def test_replace_mask_all_false_second_block(using_copy_on_write): df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2}) df_orig = df.copy() From b7ceb67ca7942da72d9b751c5d491218aeda083b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 1 Mar 2023 01:43:08 +0100 Subject: [PATCH 2/3] Fix test --- pandas/tests/copy_view/test_replace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index f3690a34141de..867f1b59fcad6 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -64,13 +64,13 @@ def test_replace_regex_inplace_refs(using_copy_on_write): def test_replace_regex_inplace(using_copy_on_write): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") - df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True) + df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) if using_copy_on_write: assert df._mgr._has_no_reference(0) assert np.shares_memory(arr, get_array(df, "a")) df_orig = df.copy() - df2 = df.replace(to_replace=r"^a.$", value="new", regex=True) + df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True) tm.assert_frame_equal(df_orig, df) assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) From 3e996f789ad4f6920697aa96033c8a275abf923e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 1 Mar 2023 01:43:37 +0100 Subject: [PATCH 3/3] Fix test --- pandas/tests/copy_view/test_replace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 867f1b59fcad6..c0b5498b839f5 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -52,7 +52,7 @@ def test_replace_regex_inplace_refs(using_copy_on_write): df_orig = df.copy() view = df[:] arr = get_array(df, "a") - df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True) + df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) if using_copy_on_write: assert not np.shares_memory(arr, get_array(df, "a")) assert df._mgr._has_no_reference(0)