ENH: Avoid copying unnecessary columns in setitem by splitting blocks for CoW (#51031)

phofl · web-flow · commit c74d05711db5 · 2023-02-10T18:55:09.000+01:00
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1635,6 +1635,9 @@ def delete(self, loc) -> list[Block]:
             values = self.values.delete(loc)
             mgr_locs = self._mgr_locs.delete(loc)
             return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
+        elif self.values.ndim == 1:
+            # We get here through to_stata
+            return []
         return super().delete(loc)
 
     @cache_readonly
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1246,7 +1246,10 @@ def value_getitem(placement):
             self._known_consolidated = False
 
     def _iset_split_block(
-        self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None
+        self,
+        blkno_l: int,
+        blk_locs: np.ndarray | list[int],
+        value: ArrayLike | None = None,
     ) -> None:
         """Removes columns from a block by splitting the block.
 
@@ -1267,12 +1270,8 @@ def _iset_split_block(
 
         nbs_tup = tuple(blk.delete(blk_locs))
         if value is not None:
-            # Argument 1 to "BlockPlacement" has incompatible type "BlockPlacement";
-            # expected "Union[int, slice, ndarray[Any, Any]]"
-            first_nb = new_block_2d(
-                value,
-                BlockPlacement(blk.mgr_locs[blk_locs]),  # type: ignore[arg-type]
-            )
+            locs = blk.mgr_locs.as_array[blk_locs]
+            first_nb = new_block_2d(value, BlockPlacement(locs))
         else:
             first_nb = nbs_tup[0]
             nbs_tup = tuple(nbs_tup[1:])
@@ -1283,6 +1282,10 @@ def _iset_split_block(
         )
         self.blocks = blocks_tup
 
+        if not nbs_tup and value is not None:
+            # No need to update anything if split did not happen
+            return
+
         self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))
 
         for i, nb in enumerate(nbs_tup):
@@ -1326,11 +1329,18 @@ def column_setitem(
         intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
         """
         if using_copy_on_write() and not self._has_no_reference(loc):
-            # otherwise perform Copy-on-Write and clear the reference
             blkno = self.blknos[loc]
-            blocks = list(self.blocks)
-            blocks[blkno] = blocks[blkno].copy()
-            self.blocks = tuple(blocks)
+            # Split blocks to only copy the column we want to modify
+            blk_loc = self.blklocs[loc]
+            # Copy our values
+            values = self.blocks[blkno].values
+            if values.ndim == 1:
+                values = values.copy()
+            else:
+                # Use [blk_loc] as indexer to keep ndim=2, this already results in a
+                # copy
+                values = values[[blk_loc]]
+            self._iset_split_block(blkno, [blk_loc], values)
 
         # this manager is only created temporarily to mutate the values in place
         # so don't track references, otherwise the `setitem` would perform CoW again
diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py
@@ -883,3 +883,39 @@ def test_dataframe_add_column_from_series():
     df.loc[2, "new"] = 100
     expected_s = Series([0, 11, 12])
     tm.assert_series_equal(s, expected_s)
+
+
+@pytest.mark.parametrize("val", [100, "a"])
+@pytest.mark.parametrize(
+    "indexer_func, indexer",
+    [
+        (tm.loc, (0, "a")),
+        (tm.iloc, (0, 0)),
+        (tm.loc, ([0], "a")),
+        (tm.iloc, ([0], 0)),
+        (tm.loc, (slice(None), "a")),
+        (tm.iloc, (slice(None), 0)),
+    ],
+)
+def test_set_value_copy_only_necessary_column(
+    using_copy_on_write, indexer_func, indexer, val
+):
+    # When setting inplace, only copy column that is modified instead of the whole
+    # block (by splitting the block)
+    # TODO multi-block only for now
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    view = df[:]
+
+    indexer_func(df)[indexer] = val
+
+    if using_copy_on_write:
+        assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
+        tm.assert_frame_equal(view, df_orig)
+    else:
+        assert np.shares_memory(get_array(df, "c"), get_array(view, "c"))
+        if val == "a":
+            assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
+        else:
+            assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))