diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eb0eb34dbefc4..012b6577690ba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -31,6 +31,7 @@ overload, ) import warnings +import weakref import numpy as np from numpy import ma @@ -4043,12 +4044,12 @@ def _iset_item_mgr( self._mgr.iset(loc, value, inplace=inplace) self._clear_item_cache() - def _set_item_mgr(self, key, value: ArrayLike) -> None: + def _set_item_mgr(self, key, value: ArrayLike, refs=[]) -> None: try: loc = self._info_axis.get_loc(key) except KeyError: # This item wasn't present, just insert at end - self._mgr.insert(len(self._info_axis), key, value) + self._mgr.insert(len(self._info_axis), key, value, ref=refs) else: self._iset_item_mgr(loc, value) @@ -4078,7 +4079,17 @@ def _set_item(self, key, value) -> None: Series/TimeSeries will be conformed to the DataFrames index to ensure homogeneity. """ - value = self._sanitize_column(value) + orig_value = None + refs = None + copy = True + if using_copy_on_write() and isinstance(value, Series): + refs = [] + block = value._mgr.blocks[0] + refs.append(weakref.ref(block)) + orig_value = value + copy = False + + value = self._sanitize_column(value, copy=copy) if ( key in self.columns @@ -4091,7 +4102,19 @@ def _set_item(self, key, value) -> None: if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)).T - self._set_item_mgr(key, value) + self._set_item_mgr(key, value, refs=refs) + + # Also make a ref back to the DF in the Series so modifying the Series + # doesn't change DF (triggers a CoW) + # TODO: Make sure the weakref is not dead? + if orig_value is not None and not orig_value._mgr.refs: + # If the series already has refs (e.g. another DF contains it as a column), + # then modifying it will already trigger + # a CoW, so we are good + loc = self._info_axis.get_loc(key) + blkno = self._mgr.blknos[loc] + blk = self._mgr.blocks[blkno] + orig_value._mgr.refs = [weakref.ref(blk)] def _set_value( self, index: IndexLabel, col, value: Scalar, takeable: bool = False @@ -4795,14 +4818,17 @@ def assign(self, **kwargs) -> DataFrame: data[k] = com.apply_if_callable(v, data) return data - def _sanitize_column(self, value) -> ArrayLike: + def _sanitize_column(self, value, copy=False) -> ArrayLike: """ Ensures new columns (which go into the BlockManager as new blocks) are - always copied and converted into an array. + converted into an array. Parameters ---------- value : scalar, Series, or array-like + copy : bool, default False + Whether to copy new columns. You would want to turn this off if CoW + is on, and instead set refs appropriately. Returns ------- @@ -4815,11 +4841,12 @@ def _sanitize_column(self, value) -> ArrayLike: if isinstance(value, DataFrame): return _reindex_for_setitem(value, self.index) elif is_dict_like(value): - return _reindex_for_setitem(Series(value), self.index) + return _reindex_for_setitem(Series(value), self.index, copy=copy) if is_list_like(value): com.require_length_match(value, self.index) - return sanitize_array(value, self.index, copy=True, allow_2d=True) + + return sanitize_array(value, self.index, copy=copy, allow_2d=True) @property def _series(self): @@ -11517,11 +11544,16 @@ def _from_nested_dict(data) -> collections.defaultdict: return new_data -def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: +def _reindex_for_setitem( + value: DataFrame | Series, index: Index, copy=True +) -> ArrayLike: # reindex if necessary if value.index.equals(index) or not len(index): - return value._values.copy() + ret_values = value._values + if copy: + ret_values = ret_values.copy() + return ret_values # GH#4107 try: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1a0aba0778da5..b5807ffb5c8cd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1375,7 +1375,7 @@ def column_setitem( new_mgr = col_mgr.setitem((idx,), value) self.iset(loc, new_mgr._block.values, inplace=True) - def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: + def insert(self, loc: int, item: Hashable, value: ArrayLike, ref=None) -> None: """ Insert item at selected position. @@ -1384,6 +1384,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: loc : int item : hashable value : np.ndarray or ExtensionArray + ref: weakref.ref or None + A weakref pointing to the Block that owns the value or None """ # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) @@ -1410,9 +1412,13 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: self.axes[0] = new_axis self.blocks += (block,) - # TODO(CoW) do we always "own" the passed `value`? - if self.refs is not None: - self.refs += [None] + + if ref: + if self.refs is not None: + self.refs.append(ref) + else: + self.refs = [None] * (len(self.blocks) - 1) + self.refs += ref self._known_consolidated = False diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 9e0d350dde0de..a85089c637fe5 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -29,14 +29,13 @@ def test_set_column_with_array(): def test_set_column_with_series(using_copy_on_write): # Case: setting a series as a new column (df[col] = s) copies that data df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - ser = Series([1, 2, 3]) + ser = Series([1, 2, 3], name="c") + ser_orig = ser.copy() df["c"] = ser if using_copy_on_write: - # TODO(CoW) with CoW we can delay the copy - # assert np.shares_memory(df["c"].values, ser.values) - assert not np.shares_memory(df["c"].values, ser.values) + assert np.shares_memory(df["c"].values, ser.values) else: # the series data is copied assert not np.shares_memory(df["c"].values, ser.values) @@ -44,7 +43,15 @@ def test_set_column_with_series(using_copy_on_write): # and modifying the series does not modify the DataFrame ser.iloc[0] = 0 assert ser.iloc[0] == 0 - tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) + tm.assert_series_equal(df["c"], ser_orig) + + # Update ser_orig now, so we can check if ser changed + ser_orig.iloc[0] = 0 + + # and modifying the DataFrame doesn't modify the Series + df.loc[0, "c"] = 10 + assert df.loc[0, "c"] == 10 + tm.assert_series_equal(ser, ser_orig) def test_set_column_with_index(using_copy_on_write): @@ -79,12 +86,12 @@ def test_set_columns_with_dataframe(using_copy_on_write): df[["c", "d"]] = df2 if using_copy_on_write: - # TODO(CoW) with CoW we can delay the copy - # assert np.shares_memory(df["c"].values, df2["c"].values) - assert not np.shares_memory(df["c"].values, df2["c"].values) + assert np.shares_memory(df["c"].values, df2["c"].values) + assert np.shares_memory(df["d"].values, df2["d"].values) else: # the data is copied assert not np.shares_memory(df["c"].values, df2["c"].values) + assert not np.shares_memory(df["d"].values, df2["d"].values) # and modifying the set DataFrame does not modify the original DataFrame df2.iloc[0, 0] = 0