From 1993b3f542faa6a7e9d42f3d59784e89eafcebfc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 9 Jul 2023 15:30:41 +0200 Subject: [PATCH 1/3] CoW: Avoid unnecessary copies for columnwise replace --- pandas/core/frame.py | 10 ++++++---- pandas/core/internals/managers.py | 2 +- pandas/tests/copy_view/test_replace.py | 14 ++++++++++++-- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6fdd6cb2a639e..583417466c270 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4158,11 +4158,13 @@ def _set_item_mgr( if len(self): self._check_setitem_copy() - def _iset_item(self, loc: int, value: Series) -> None: + def _iset_item(self, loc: int, value: Series, inplace=True) -> None: # We are only called from _replace_columnwise which guarantees that # no reindex is necessary - # TODO(CoW): Optimize to avoid copy here, but have ton track refs - self._iset_item_mgr(loc, value._values.copy(), inplace=True) + if using_copy_on_write(): + self._iset_item_mgr(loc, value._values, inplace=inplace, refs=value._references) + else: + self._iset_item_mgr(loc, value._values.copy(), inplace=True) # check if we are modifying a copy # try to set first as we want an invalid @@ -5480,7 +5482,7 @@ def _replace_columnwise( target, value = mapping[ax_value] newobj = ser.replace(target, value, regex=regex) - res._iset_item(i, newobj) + res._iset_item(i, newobj, inplace=inplace) if inplace: return diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0d24f742b97c7..ee914081d7944 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1099,7 +1099,7 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs)) + self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs), refs=refs) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) continue diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index dfb1caa4b2ffd..83fe5c12b1c4f 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -364,12 +364,22 @@ def test_replace_list_none_inplace_refs(using_copy_on_write): assert np.shares_memory(arr, get_array(df, "a")) +def test_replace_columnwise_no_op_inplace(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + view = df[:] + df_orig = df.copy() + df.replace({"a": 10}, 100, inplace=True) + if using_copy_on_write: + assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) + df.iloc[0, 0] = 100 + tm.assert_frame_equal(view, df_orig) + + def test_replace_columnwise_no_op(using_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) df_orig = df.copy() df2 = df.replace({"a": 10}, 100) if using_copy_on_write: - # TODO(CoW): This should share memory - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = 100 tm.assert_frame_equal(df, df_orig) From 344a816f251be942bd82832e8dd6ea8591bf31b6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 13 Jul 2023 15:17:00 -0500 Subject: [PATCH 2/3] Run pre-commit --- pandas/core/frame.py | 4 +++- pandas/core/internals/managers.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 583417466c270..4a47389eb2ca7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4162,7 +4162,9 @@ def _iset_item(self, loc: int, value: Series, inplace=True) -> None: # We are only called from _replace_columnwise which guarantees that # no reindex is necessary if using_copy_on_write(): - self._iset_item_mgr(loc, value._values, inplace=inplace, refs=value._references) + self._iset_item_mgr( + loc, value._values, inplace=inplace, refs=value._references + ) else: self._iset_item_mgr(loc, value._values.copy(), inplace=True) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ee914081d7944..93badf2178c3d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1099,7 +1099,9 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs), refs=refs) + self._iset_split_block( + blkno_l, blk_locs, value_getitem(val_locs), refs=refs + ) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) continue From ff9fed26b33328d990db92a510be1863ece82ca9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 13 Jul 2023 18:13:10 -0500 Subject: [PATCH 3/3] Fix --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4a47389eb2ca7..5cf0e9d9f2796 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4158,7 +4158,7 @@ def _set_item_mgr( if len(self): self._check_setitem_copy() - def _iset_item(self, loc: int, value: Series, inplace=True) -> None: + def _iset_item(self, loc: int, value: Series, inplace: bool = True) -> None: # We are only called from _replace_columnwise which guarantees that # no reindex is necessary if using_copy_on_write():