pandas-dev · lithomas1 · Jan 20, 2023 · jbrockmendel · Jan 20, 2023
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -31,6 +31,7 @@
     overload,
 )
 import warnings
+import weakref
 
 import numpy as np
 from numpy import ma
@@ -4043,12 +4044,12 @@ def _iset_item_mgr(
         self._mgr.iset(loc, value, inplace=inplace)
         self._clear_item_cache()
 
-    def _set_item_mgr(self, key, value: ArrayLike) -> None:
+    def _set_item_mgr(self, key, value: ArrayLike, refs=[]) -> None:
         try:
             loc = self._info_axis.get_loc(key)
         except KeyError:
             # This item wasn't present, just insert at end
-            self._mgr.insert(len(self._info_axis), key, value)
+            self._mgr.insert(len(self._info_axis), key, value, ref=refs)
         else:
             self._iset_item_mgr(loc, value)
 
@@ -4078,7 +4079,17 @@ def _set_item(self, key, value) -> None:
         Series/TimeSeries will be conformed to the DataFrames index to
         ensure homogeneity.
         """
-        value = self._sanitize_column(value)
+        orig_value = None
+        refs = None
+        copy = True
+        if using_copy_on_write() and isinstance(value, Series):
+            refs = []
+            block = value._mgr.blocks[0]
+            refs.append(weakref.ref(block))
+            orig_value = value
+            copy = False
+
+        value = self._sanitize_column(value, copy=copy)
 
         if (
             key in self.columns
@@ -4091,7 +4102,19 @@ def _set_item(self, key, value) -> None:
                 if isinstance(existing_piece, DataFrame):
                     value = np.tile(value, (len(existing_piece.columns), 1)).T
 
-        self._set_item_mgr(key, value)
+        self._set_item_mgr(key, value, refs=refs)
+
+        # Also make a ref back to the DF in the Series so modifying the Series
+        # doesn't change DF (triggers a CoW)
+        # TODO: Make sure the weakref is not dead?
+        if orig_value is not None and not orig_value._mgr.refs:
+            # If the series already has refs (e.g. another DF contains it as a column),
+            # then modifying it will already trigger
+            # a CoW, so we are good
+            loc = self._info_axis.get_loc(key)
+            blkno = self._mgr.blknos[loc]
+            blk = self._mgr.blocks[blkno]
+            orig_value._mgr.refs = [weakref.ref(blk)]
 
     def _set_value(
         self, index: IndexLabel, col, value: Scalar, takeable: bool = False
@@ -4795,14 +4818,17 @@ def assign(self, **kwargs) -> DataFrame:
             data[k] = com.apply_if_callable(v, data)
         return data
 
-    def _sanitize_column(self, value) -> ArrayLike:
+    def _sanitize_column(self, value, copy=False) -> ArrayLike:
         """
         Ensures new columns (which go into the BlockManager as new blocks) are
-        always copied and converted into an array.
+        converted into an array.
 
         Parameters
         ----------
         value : scalar, Series, or array-like
+        copy : bool, default False
+            Whether to copy new columns. You would want to turn this off if CoW
+            is on, and instead set refs appropriately.
 
         Returns
         -------
@@ -4815,11 +4841,12 @@ def _sanitize_column(self, value) -> ArrayLike:
         if isinstance(value, DataFrame):
             return _reindex_for_setitem(value, self.index)
         elif is_dict_like(value):
-            return _reindex_for_setitem(Series(value), self.index)
+            return _reindex_for_setitem(Series(value), self.index, copy=copy)
 
         if is_list_like(value):
             com.require_length_match(value, self.index)
-        return sanitize_array(value, self.index, copy=True, allow_2d=True)
+
+        return sanitize_array(value, self.index, copy=copy, allow_2d=True)
 
     @property
     def _series(self):
@@ -11517,11 +11544,16 @@ def _from_nested_dict(data) -> collections.defaultdict:
     return new_data
 
 
-def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
+def _reindex_for_setitem(
+    value: DataFrame | Series, index: Index, copy=True
+) -> ArrayLike:
     # reindex if necessary
 
     if value.index.equals(index) or not len(index):
-        return value._values.copy()
+        ret_values = value._values
+        if copy:
+            ret_values = ret_values.copy()
+        return ret_values
 
     # GH#4107
     try:

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1375,7 +1375,7 @@ def column_setitem(
             new_mgr = col_mgr.setitem((idx,), value)
             self.iset(loc, new_mgr._block.values, inplace=True)
 
-    def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
+    def insert(self, loc: int, item: Hashable, value: ArrayLike, ref=None) -> None:
         """
         Insert item at selected position.
 
@@ -1384,6 +1384,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
         loc : int
         item : hashable
         value : np.ndarray or ExtensionArray
+        ref: weakref.ref or None
+            A weakref pointing to the Block that owns the value or None
         """
         # insert to the axis; this could possibly raise a TypeError
         new_axis = self.items.insert(loc, item)
@@ -1410,9 +1412,13 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
 
         self.axes[0] = new_axis
         self.blocks += (block,)
-        # TODO(CoW) do we always "own" the passed `value`?
-        if self.refs is not None:
-            self.refs += [None]
+
+        if ref:
+            if self.refs is not None:
+                self.refs.append(ref)
+            else:
+                self.refs = [None] * (len(self.blocks) - 1)
+                self.refs += ref
 
         self._known_consolidated = False
 

diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py
@@ -29,22 +29,29 @@ def test_set_column_with_array():
 def test_set_column_with_series(using_copy_on_write):
     # Case: setting a series as a new column (df[col] = s) copies that data
     df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
-    ser = Series([1, 2, 3])
+    ser = Series([1, 2, 3], name="c")
+    ser_orig = ser.copy()
 
     df["c"] = ser
 
     if using_copy_on_write:
-        # TODO(CoW) with CoW we can delay the copy
-        # assert np.shares_memory(df["c"].values, ser.values)
-        assert not np.shares_memory(df["c"].values, ser.values)
+        assert np.shares_memory(df["c"].values, ser.values)
     else:
         # the series data is copied
         assert not np.shares_memory(df["c"].values, ser.values)
 
     # and modifying the series does not modify the DataFrame
     ser.iloc[0] = 0
     assert ser.iloc[0] == 0
-    tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
+    tm.assert_series_equal(df["c"], ser_orig)
+
+    # Update ser_orig now, so we can check if ser changed
+    ser_orig.iloc[0] = 0
+
+    # and modifying the DataFrame doesn't modify the Series
+    df.loc[0, "c"] = 10
+    assert df.loc[0, "c"] == 10
+    tm.assert_series_equal(ser, ser_orig)
 
 
 def test_set_column_with_index(using_copy_on_write):
@@ -79,12 +86,12 @@ def test_set_columns_with_dataframe(using_copy_on_write):
     df[["c", "d"]] = df2
 
     if using_copy_on_write:
-        # TODO(CoW) with CoW we can delay the copy
-        # assert np.shares_memory(df["c"].values, df2["c"].values)
-        assert not np.shares_memory(df["c"].values, df2["c"].values)
+        assert np.shares_memory(df["c"].values, df2["c"].values)
+        assert np.shares_memory(df["d"].values, df2["d"].values)
     else:
         # the data is copied
         assert not np.shares_memory(df["c"].values, df2["c"].values)
+        assert not np.shares_memory(df["d"].values, df2["d"].values)
 
     # and modifying the set DataFrame does not modify the original DataFrame
     df2.iloc[0, 0] = 0