Skip to content

ENH: Use circular weakref to delay copy in setitem #50902

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 42 additions & 10 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
overload,
)
import warnings
import weakref

import numpy as np
from numpy import ma
Expand Down Expand Up @@ -4043,12 +4044,12 @@ def _iset_item_mgr(
self._mgr.iset(loc, value, inplace=inplace)
self._clear_item_cache()

def _set_item_mgr(self, key, value: ArrayLike) -> None:
def _set_item_mgr(self, key, value: ArrayLike, refs=[]) -> None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we avoid a mutable default

try:
loc = self._info_axis.get_loc(key)
except KeyError:
# This item wasn't present, just insert at end
self._mgr.insert(len(self._info_axis), key, value)
self._mgr.insert(len(self._info_axis), key, value, ref=refs)
else:
self._iset_item_mgr(loc, value)

Expand Down Expand Up @@ -4078,7 +4079,17 @@ def _set_item(self, key, value) -> None:
Series/TimeSeries will be conformed to the DataFrames index to
ensure homogeneity.
"""
value = self._sanitize_column(value)
orig_value = None
refs = None
copy = True
if using_copy_on_write() and isinstance(value, Series):
refs = []
block = value._mgr.blocks[0]
refs.append(weakref.ref(block))
orig_value = value
copy = False

value = self._sanitize_column(value, copy=copy)

if (
key in self.columns
Expand All @@ -4091,7 +4102,19 @@ def _set_item(self, key, value) -> None:
if isinstance(existing_piece, DataFrame):
value = np.tile(value, (len(existing_piece.columns), 1)).T

self._set_item_mgr(key, value)
self._set_item_mgr(key, value, refs=refs)

# Also make a ref back to the DF in the Series so modifying the Series
# doesn't change DF (triggers a CoW)
# TODO: Make sure the weakref is not dead?
if orig_value is not None and not orig_value._mgr.refs:
# If the series already has refs (e.g. another DF contains it as a column),
# then modifying it will already trigger
# a CoW, so we are good
loc = self._info_axis.get_loc(key)
blkno = self._mgr.blknos[loc]
blk = self._mgr.blocks[blkno]
orig_value._mgr.refs = [weakref.ref(blk)]

def _set_value(
self, index: IndexLabel, col, value: Scalar, takeable: bool = False
Expand Down Expand Up @@ -4795,14 +4818,17 @@ def assign(self, **kwargs) -> DataFrame:
data[k] = com.apply_if_callable(v, data)
return data

def _sanitize_column(self, value) -> ArrayLike:
def _sanitize_column(self, value, copy=False) -> ArrayLike:
"""
Ensures new columns (which go into the BlockManager as new blocks) are
always copied and converted into an array.
converted into an array.

Parameters
----------
value : scalar, Series, or array-like
copy : bool, default False
Whether to copy new columns. You would want to turn this off if CoW
is on, and instead set refs appropriately.

Returns
-------
Expand All @@ -4815,11 +4841,12 @@ def _sanitize_column(self, value) -> ArrayLike:
if isinstance(value, DataFrame):
return _reindex_for_setitem(value, self.index)
elif is_dict_like(value):
return _reindex_for_setitem(Series(value), self.index)
return _reindex_for_setitem(Series(value), self.index, copy=copy)

if is_list_like(value):
com.require_length_match(value, self.index)
return sanitize_array(value, self.index, copy=True, allow_2d=True)

return sanitize_array(value, self.index, copy=copy, allow_2d=True)

@property
def _series(self):
Expand Down Expand Up @@ -11517,11 +11544,16 @@ def _from_nested_dict(data) -> collections.defaultdict:
return new_data


def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
def _reindex_for_setitem(
value: DataFrame | Series, index: Index, copy=True
) -> ArrayLike:
# reindex if necessary

if value.index.equals(index) or not len(index):
return value._values.copy()
ret_values = value._values
if copy:
ret_values = ret_values.copy()
return ret_values

# GH#4107
try:
Expand Down
14 changes: 10 additions & 4 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1375,7 +1375,7 @@ def column_setitem(
new_mgr = col_mgr.setitem((idx,), value)
self.iset(loc, new_mgr._block.values, inplace=True)

def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
def insert(self, loc: int, item: Hashable, value: ArrayLike, ref=None) -> None:
"""
Insert item at selected position.

Expand All @@ -1384,6 +1384,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
loc : int
item : hashable
value : np.ndarray or ExtensionArray
ref: weakref.ref or None
A weakref pointing to the Block that owns the value or None
"""
# insert to the axis; this could possibly raise a TypeError
new_axis = self.items.insert(loc, item)
Expand All @@ -1410,9 +1412,13 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:

self.axes[0] = new_axis
self.blocks += (block,)
# TODO(CoW) do we always "own" the passed `value`?
if self.refs is not None:
self.refs += [None]

if ref:
if self.refs is not None:
self.refs.append(ref)
else:
self.refs = [None] * (len(self.blocks) - 1)
self.refs += ref

self._known_consolidated = False

Expand Down
23 changes: 15 additions & 8 deletions pandas/tests/copy_view/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,22 +29,29 @@ def test_set_column_with_array():
def test_set_column_with_series(using_copy_on_write):
# Case: setting a series as a new column (df[col] = s) copies that data
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
ser = Series([1, 2, 3])
ser = Series([1, 2, 3], name="c")
ser_orig = ser.copy()

df["c"] = ser

if using_copy_on_write:
# TODO(CoW) with CoW we can delay the copy
# assert np.shares_memory(df["c"].values, ser.values)
assert not np.shares_memory(df["c"].values, ser.values)
assert np.shares_memory(df["c"].values, ser.values)
else:
# the series data is copied
assert not np.shares_memory(df["c"].values, ser.values)

# and modifying the series does not modify the DataFrame
ser.iloc[0] = 0
assert ser.iloc[0] == 0
tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
tm.assert_series_equal(df["c"], ser_orig)

# Update ser_orig now, so we can check if ser changed
ser_orig.iloc[0] = 0

# and modifying the DataFrame doesn't modify the Series
df.loc[0, "c"] = 10
assert df.loc[0, "c"] == 10
tm.assert_series_equal(ser, ser_orig)


def test_set_column_with_index(using_copy_on_write):
Expand Down Expand Up @@ -79,12 +86,12 @@ def test_set_columns_with_dataframe(using_copy_on_write):
df[["c", "d"]] = df2

if using_copy_on_write:
# TODO(CoW) with CoW we can delay the copy
# assert np.shares_memory(df["c"].values, df2["c"].values)
assert not np.shares_memory(df["c"].values, df2["c"].values)
assert np.shares_memory(df["c"].values, df2["c"].values)
assert np.shares_memory(df["d"].values, df2["d"].values)
else:
# the data is copied
assert not np.shares_memory(df["c"].values, df2["c"].values)
assert not np.shares_memory(df["d"].values, df2["d"].values)

# and modifying the set DataFrame does not modify the original DataFrame
df2.iloc[0, 0] = 0
Expand Down