Skip to content

Commit aa5a01f

Browse files
phofl authored and Yi Wei committed
CoW: Delay copy when setting Series into DataFrame (pandas-dev#51698)
1 parent a6aeca9 commit aa5a01f

File tree

5 files changed: +139 -49 lines changed

pandas/core/frame.py

+36-19
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@
201201
if TYPE_CHECKING:
202202
import datetime
203203

204+
from pandas._libs.internals import BlockValuesRefs
204205
from pandas._typing import (
205206
AggFuncType,
206207
AnyAll,
@@ -3923,11 +3924,11 @@ def isetitem(self, loc, value) -> None:
39233924
)
39243925

39253926
for i, idx in enumerate(loc):
3926-
arraylike = self._sanitize_column(value.iloc[:, i])
3927+
arraylike, _ = self._sanitize_column(value.iloc[:, i])
39273928
self._iset_item_mgr(idx, arraylike, inplace=False)
39283929
return
39293930

3930-
arraylike = self._sanitize_column(value)
3931+
arraylike, _ = self._sanitize_column(value)
39313932
self._iset_item_mgr(loc, arraylike, inplace=False)
39323933

39333934
def __setitem__(self, key, value):
@@ -4098,7 +4099,7 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None:
40984099
return
40994100

41004101
# now align rows
4101-
arraylike = _reindex_for_setitem(value, self.index)
4102+
arraylike, _ = _reindex_for_setitem(value, self.index)
41024103
self._set_item_mgr(key, arraylike)
41034104
return
41044105

@@ -4111,20 +4112,26 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None:
41114112
self[key] = value[value.columns[0]]
41124113

41134114
def _iset_item_mgr(
4114-
self, loc: int | slice | np.ndarray, value, inplace: bool = False
4115+
self,
4116+
loc: int | slice | np.ndarray,
4117+
value,
4118+
inplace: bool = False,
4119+
refs: BlockValuesRefs | None = None,
41154120
) -> None:
41164121
# when called from _set_item_mgr loc can be anything returned from get_loc
4117-
self._mgr.iset(loc, value, inplace=inplace)
4122+
self._mgr.iset(loc, value, inplace=inplace, refs=refs)
41184123
self._clear_item_cache()
41194124

4120-
def _set_item_mgr(self, key, value: ArrayLike) -> None:
4125+
def _set_item_mgr(
4126+
self, key, value: ArrayLike, refs: BlockValuesRefs | None = None
4127+
) -> None:
41214128
try:
41224129
loc = self._info_axis.get_loc(key)
41234130
except KeyError:
41244131
# This item wasn't present, just insert at end
4125-
self._mgr.insert(len(self._info_axis), key, value)
4132+
self._mgr.insert(len(self._info_axis), key, value, refs)
41264133
else:
4127-
self._iset_item_mgr(loc, value)
4134+
self._iset_item_mgr(loc, value, refs=refs)
41284135

41294136
# check if we are modifying a copy
41304137
# try to set first as we want an invalid
@@ -4154,7 +4161,7 @@ def _set_item(self, key, value) -> None:
41544161
Series/TimeSeries will be conformed to the DataFrames index to
41554162
ensure homogeneity.
41564163
"""
4157-
value = self._sanitize_column(value)
4164+
value, refs = self._sanitize_column(value, using_cow=using_copy_on_write())
41584165

41594166
if (
41604167
key in self.columns
@@ -4166,8 +4173,9 @@ def _set_item(self, key, value) -> None:
41664173
existing_piece = self[key]
41674174
if isinstance(existing_piece, DataFrame):
41684175
value = np.tile(value, (len(existing_piece.columns), 1)).T
4176+
refs = None
41694177

4170-
self._set_item_mgr(key, value)
4178+
self._set_item_mgr(key, value, refs)
41714179

41724180
def _set_value(
41734181
self, index: IndexLabel, col, value: Scalar, takeable: bool = False
@@ -4795,7 +4803,7 @@ def insert(
47954803
elif isinstance(value, DataFrame):
47964804
value = value.iloc[:, 0]
47974805

4798-
value = self._sanitize_column(value)
4806+
value, _ = self._sanitize_column(value)
47994807
self._mgr.insert(loc, column, value)
48004808

48014809
def assign(self, **kwargs) -> DataFrame:
@@ -4866,29 +4874,34 @@ def assign(self, **kwargs) -> DataFrame:
48664874
data[k] = com.apply_if_callable(v, data)
48674875
return data
48684876

4869-
def _sanitize_column(self, value) -> ArrayLike:
4877+
def _sanitize_column(
4878+
self, value, using_cow: bool = False
4879+
) -> tuple[ArrayLike, BlockValuesRefs | None]:
48704880
"""
48714881
Ensures new columns (which go into the BlockManager as new blocks) are
4872-
always copied and converted into an array.
4882+
always copied (or a reference is being tracked to them under CoW)
4883+
and converted into an array.
48734884
48744885
Parameters
48754886
----------
48764887
value : scalar, Series, or array-like
48774888
48784889
Returns
48794890
-------
4880-
numpy.ndarray or ExtensionArray
4891+
tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs
48814892
"""
48824893
self._ensure_valid_index(value)
48834894

48844895
# Using a DataFrame would mean coercing values to one dtype
48854896
assert not isinstance(value, DataFrame)
48864897
if is_dict_like(value):
4887-
return _reindex_for_setitem(Series(value), self.index)
4898+
if not isinstance(value, Series):
4899+
value = Series(value)
4900+
return _reindex_for_setitem(value, self.index, using_cow=using_cow)
48884901

48894902
if is_list_like(value):
48904903
com.require_length_match(value, self.index)
4891-
return sanitize_array(value, self.index, copy=True, allow_2d=True)
4904+
return sanitize_array(value, self.index, copy=True, allow_2d=True), None
48924905

48934906
@property
48944907
def _series(self):
@@ -11899,11 +11912,15 @@ def _from_nested_dict(data) -> collections.defaultdict:
1189911912
return new_data
1190011913

1190111914

11902-
def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
11915+
def _reindex_for_setitem(
11916+
value: DataFrame | Series, index: Index, using_cow: bool = False
11917+
) -> tuple[ArrayLike, BlockValuesRefs | None]:
1190311918
# reindex if necessary
1190411919

1190511920
if value.index.equals(index) or not len(index):
11906-
return value._values.copy()
11921+
if using_cow and isinstance(value, Series):
11922+
return value._values, value._references
11923+
return value._values.copy(), None
1190711924

1190811925
# GH#4107
1190911926
try:
@@ -11917,4 +11934,4 @@ def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
1191711934
raise TypeError(
1191811935
"incompatible index of inserted column with frame index"
1191911936
) from err
11920-
return reindexed_value
11937+
return reindexed_value, None

pandas/core/internals/array_manager.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -798,7 +798,11 @@ def column_arrays(self) -> list[ArrayLike]:
798798
return [np.asarray(arr) for arr in self.arrays]
799799

800800
def iset(
801-
self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
801+
self,
802+
loc: int | slice | np.ndarray,
803+
value: ArrayLike,
804+
inplace: bool = False,
805+
refs=None,
802806
) -> None:
803807
"""
804808
Set new column(s).
@@ -874,7 +878,7 @@ def column_setitem(
874878
# update existing ArrayManager in-place
875879
self.arrays[loc] = new_mgr.arrays[0]
876880

877-
def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
881+
def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None:
878882
"""
879883
Insert item at selected position.
880884

pandas/core/internals/managers.py

+23-7
Original file line numberDiff line numberDiff line change
@@ -1129,7 +1129,11 @@ def column_arrays(self) -> list[np.ndarray]:
11291129
return result # type: ignore[return-value]
11301130

11311131
def iset(
1132-
self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
1132+
self,
1133+
loc: int | slice | np.ndarray,
1134+
value: ArrayLike,
1135+
inplace: bool = False,
1136+
refs: BlockValuesRefs | None = None,
11331137
):
11341138
"""
11351139
Set new item in-place. Does not consolidate. Adds new Block if not
@@ -1170,6 +1174,7 @@ def iset(
11701174
inplace=inplace,
11711175
blkno=blkno,
11721176
blk=blk,
1177+
refs=refs,
11731178
)
11741179

11751180
# error: Incompatible types in assignment (expression has type
@@ -1215,7 +1220,7 @@ def value_getitem(placement):
12151220
continue
12161221
else:
12171222
# Defer setting the new values to enable consolidation
1218-
self._iset_split_block(blkno_l, blk_locs)
1223+
self._iset_split_block(blkno_l, blk_locs, refs=refs)
12191224

12201225
if len(removed_blknos):
12211226
# Remove blocks & update blknos accordingly
@@ -1245,6 +1250,7 @@ def value_getitem(placement):
12451250
new_block_2d(
12461251
values=value,
12471252
placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)),
1253+
refs=refs,
12481254
)
12491255
for mgr_loc in unfit_idxr
12501256
)
@@ -1260,6 +1266,7 @@ def value_getitem(placement):
12601266
new_block_2d(
12611267
values=value_getitem(unfit_val_items),
12621268
placement=BlockPlacement(unfit_idxr),
1269+
refs=refs,
12631270
)
12641271
)
12651272

@@ -1276,6 +1283,7 @@ def _iset_split_block(
12761283
blkno_l: int,
12771284
blk_locs: np.ndarray | list[int],
12781285
value: ArrayLike | None = None,
1286+
refs: BlockValuesRefs | None = None,
12791287
) -> None:
12801288
"""Removes columns from a block by splitting the block.
12811289
@@ -1288,6 +1296,7 @@ def _iset_split_block(
12881296
blkno_l: The block number to operate on, relevant for updating the manager
12891297
blk_locs: The locations of our block that should be deleted.
12901298
value: The value to set as a replacement.
1299+
refs: The reference tracking object of the value to set.
12911300
"""
12921301
blk = self.blocks[blkno_l]
12931302

@@ -1297,7 +1306,7 @@ def _iset_split_block(
12971306
nbs_tup = tuple(blk.delete(blk_locs))
12981307
if value is not None:
12991308
locs = blk.mgr_locs.as_array[blk_locs]
1300-
first_nb = new_block_2d(value, BlockPlacement(locs))
1309+
first_nb = new_block_2d(value, BlockPlacement(locs), refs=refs)
13011310
else:
13021311
first_nb = nbs_tup[0]
13031312
nbs_tup = tuple(nbs_tup[1:])
@@ -1319,7 +1328,13 @@ def _iset_split_block(
13191328
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
13201329

13211330
def _iset_single(
1322-
self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block
1331+
self,
1332+
loc: int,
1333+
value: ArrayLike,
1334+
inplace: bool,
1335+
blkno: int,
1336+
blk: Block,
1337+
refs: BlockValuesRefs | None = None,
13231338
) -> None:
13241339
"""
13251340
Fastpath for iset when we are only setting a single position and
@@ -1339,7 +1354,7 @@ def _iset_single(
13391354
blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
13401355
return
13411356

1342-
nb = new_block_2d(value, placement=blk._mgr_locs)
1357+
nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs)
13431358
old_blocks = self.blocks
13441359
new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
13451360
self.blocks = new_blocks
@@ -1377,7 +1392,7 @@ def column_setitem(
13771392
new_mgr = col_mgr.setitem((idx,), value)
13781393
self.iset(loc, new_mgr._block.values, inplace=True)
13791394

1380-
def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
1395+
def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None:
13811396
"""
13821397
Insert item at selected position.
13831398
@@ -1386,6 +1401,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
13861401
loc : int
13871402
item : hashable
13881403
value : np.ndarray or ExtensionArray
1404+
refs : The reference tracking object of the value to set.
13891405
"""
13901406
# insert to the axis; this could possibly raise a TypeError
13911407
new_axis = self.items.insert(loc, item)
@@ -1401,7 +1417,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
14011417

14021418
bp = BlockPlacement(slice(loc, loc + 1))
14031419
# TODO(CoW) do we always "own" the passed `value`?
1404-
block = new_block_2d(values=value, placement=bp)
1420+
block = new_block_2d(values=value, placement=bp, refs=refs)
14051421

14061422
if not len(self.blocks):
14071423
# Fastpath

pandas/tests/copy_view/test_indexing.py

+6-10
Original file line numberDiff line numberDiff line change
@@ -977,28 +977,24 @@ def test_column_as_series_no_item_cache(
977977
# TODO add tests for other indexing methods on the Series
978978

979979

980-
def test_dataframe_add_column_from_series(backend):
980+
def test_dataframe_add_column_from_series(backend, using_copy_on_write):
981981
# Case: adding a new column to a DataFrame from an existing column/series
982-
# -> always already takes a copy on assignment
983-
# (no change in behaviour here)
984-
# TODO can we achieve the same behaviour with Copy-on-Write?
982+
# -> delays copy under CoW
985983
_, DataFrame, Series = backend
986984
df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
987985

988986
s = Series([10, 11, 12])
989987
df["new"] = s
990-
assert not np.shares_memory(get_array(df, "new"), s.values)
988+
if using_copy_on_write:
989+
assert np.shares_memory(get_array(df, "new"), get_array(s))
990+
else:
991+
assert not np.shares_memory(get_array(df, "new"), get_array(s))
991992

992993
# editing series -> doesn't modify column in frame
993994
s[0] = 0
994995
expected = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]})
995996
tm.assert_frame_equal(df, expected)
996997

997-
# editing column in frame -> doesn't modify series
998-
df.loc[2, "new"] = 100
999-
expected_s = Series([0, 11, 12])
1000-
tm.assert_series_equal(s, expected_s)
1001-
1002998

1003999
@pytest.mark.parametrize("val", [100, "a"])
10041000
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)