ENH: Add lazy copy to replace (#50746)

phofl · lithomas1 · web-flow · commit b0c0d8a8237f · 2023-01-16T20:13:43.000-08:00
* WIP: ENH: CoW support for replace

* Fix bugs, should work for 1-D

* Refactor to support more than one loc

* Don't access refs if None

* Flesh out tests, fix issues

* fix tests

* fixes

* Simplify block splitting logic

* Clear ref

* Adjust test

* Fix test

* Small fix

* Fix 32 bit test

* Adjust test

* Update pandas/tests/copy_view/test_methods.py

Co-authored-by: Thomas Li &lt;47963215+lithomas1@users.noreply.github.com&gt;

Co-authored-by: Thomas Li &lt;47963215+lithomas1@users.noreply.github.com&gt;
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5555,7 +5555,7 @@ def _replace_columnwise(
         DataFrame or None
         """
         # Operate column-wise
-        res = self if inplace else self.copy()
+        res = self if inplace else self.copy(deep=None)
         ax = self.columns
 
         for i, ax_value in enumerate(ax):
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -7046,6 +7046,7 @@ def replace(
                 to_replace = [to_replace]
 
             if isinstance(to_replace, (tuple, list)):
+                # TODO: Consider copy-on-write for non-replaced columns's here
                 if isinstance(self, ABCDataFrame):
                     from pandas import Series
 
@@ -7105,7 +7106,7 @@ def replace(
             if not self.size:
                 if inplace:
                     return None
-                return self.copy()
+                return self.copy(deep=None)
 
             if is_dict_like(to_replace):
                 if is_dict_like(value):  # {'A' : NA} -> {'A' : 0}
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1220,36 +1220,43 @@ def value_getitem(placement):
             if inplace and blk.should_store(value):
                 # Updating inplace -> check if we need to do Copy-on-Write
                 if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
-                    blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True)
+                    nbs_tup = tuple(blk.delete(blk_locs))
+                    first_nb = new_block_2d(
+                        value_getitem(val_locs), BlockPlacement(blk.mgr_locs[blk_locs])
+                    )
+                    if self.refs is not None:
+                        self.refs.extend([self.refs[blkno_l]] * len(nbs_tup))
                     self._clear_reference_block(blkno_l)
                 else:
                     blk.set_inplace(blk_locs, value_getitem(val_locs))
+                    continue
             else:
                 unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
                 unfit_val_locs.append(val_locs)
 
                 # If all block items are unfit, schedule the block for removal.
                 if len(val_locs) == len(blk.mgr_locs):
                     removed_blknos.append(blkno_l)
+                    continue
                 else:
                     nbs = blk.delete(blk_locs)
                     # Add first block where old block was and remaining blocks at
                     # the end to avoid updating all block numbers
                     first_nb = nbs[0]
                     nbs_tup = tuple(nbs[1:])
-                    nr_blocks = len(self.blocks)
-                    blocks_tup = (
-                        self.blocks[:blkno_l]
-                        + (first_nb,)
-                        + self.blocks[blkno_l + 1 :]
-                        + nbs_tup
-                    )
-                    self.blocks = blocks_tup
-                    self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))
+            nr_blocks = len(self.blocks)
+            blocks_tup = (
+                self.blocks[:blkno_l]
+                + (first_nb,)
+                + self.blocks[blkno_l + 1 :]
+                + nbs_tup
+            )
+            self.blocks = blocks_tup
+            self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))
 
-                    for i, nb in enumerate(nbs_tup):
-                        self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
-                        self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
+            for i, nb in enumerate(nbs_tup):
+                self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
+                self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
 
         if len(removed_blknos):
             # Remove blocks & update blknos and refs accordingly
diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py
@@ -5,6 +5,7 @@
 
 import pandas as pd
 from pandas import DataFrame
+import pandas._testing as tm
 from pandas.tests.copy_view.util import get_array
 
 
@@ -93,3 +94,49 @@ def test_switch_options():
         subset.iloc[0, 0] = 0
         # df updated with CoW disabled
         assert df.iloc[0, 0] == 0
+
+
+@td.skip_array_manager_invalid_test
+@pytest.mark.parametrize("dtype", [np.intp, np.int8])
+@pytest.mark.parametrize(
+    "locs, arr",
+    [
+        ([0], np.array([-1, -2, -3])),
+        ([1], np.array([-1, -2, -3])),
+        ([5], np.array([-1, -2, -3])),
+        ([0, 1], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
+        ([0, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
+        ([0, 1, 2], np.array([[-1, -2, -3], [-4, -5, -6], [-4, -5, -6]]).T),
+        ([1, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
+        ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
+        ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
+    ],
+)
+def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
+    # Nothing currently calls iset with
+    # more than 1 loc with inplace=True (only happens with inplace=False)
+    # but ensure that it works
+    df = DataFrame(
+        {
+            "a": [1, 2, 3],
+            "b": [4, 5, 6],
+            "c": [7, 8, 9],
+            "d": [10, 11, 12],
+            "e": [13, 14, 15],
+            "f": ["a", "b", "c"],
+        },
+    )
+    arr = arr.astype(dtype)
+    df_orig = df.copy()
+    df2 = df.copy(deep=None)  # Trigger a CoW (if enabled, otherwise makes copy)
+    df2._mgr.iset(locs, arr, inplace=True)
+
+    tm.assert_frame_equal(df, df_orig)
+
+    if using_copy_on_write:
+        for i, col in enumerate(df.columns):
+            if i not in locs:
+                assert np.shares_memory(get_array(df, col), get_array(df2, col))
+    else:
+        for col in df.columns:
+            assert not np.shares_memory(get_array(df, col), get_array(df2, col))
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
@@ -974,6 +974,44 @@ def test_squeeze(using_copy_on_write):
         assert df.loc[0, "a"] == 0
 
 
+@pytest.mark.parametrize(
+    "replace_kwargs",
+    [
+        {"to_replace": {"a": 1, "b": 4}, "value": -1},
+        # Test CoW splits blocks to avoid copying unchanged columns
+        {"to_replace": {"a": 1}, "value": -1},
+        {"to_replace": {"b": 4}, "value": -1},
+        {"to_replace": {"b": {4: 1}}},
+        # TODO: Add these in a further optimization
+        # We would need to see which columns got replaced in the mask
+        # which could be expensive
+        # {"to_replace": {"b": 1}},
+        # 1
+    ],
+)
+def test_replace(using_copy_on_write, replace_kwargs):
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
+    df_orig = df.copy()
+
+    df_replaced = df.replace(**replace_kwargs)
+
+    if using_copy_on_write:
+        if (df_replaced["b"] == df["b"]).all():
+            assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
+        assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
+
+    # mutating squeezed df triggers a copy-on-write for that column/block
+    df_replaced.loc[0, "c"] = -1
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
+
+    if "a" in replace_kwargs["to_replace"]:
+        arr = get_array(df_replaced, "a")
+        df_replaced.loc[0, "a"] = 100
+        assert np.shares_memory(get_array(df_replaced, "a"), arr)
+    tm.assert_frame_equal(df, df_orig)
+
+
 def test_putmask(using_copy_on_write):
     df = DataFrame({"a": [1, 2], "b": 1, "c": 2})
     view = df[:]