Skip to content

Commit 5bdd17c

Browse files
phofl authored and topper-123 committed
CoW: Delay copy when setting Series into DataFrame (pandas-dev#51698)
1 parent b8eadbd commit 5bdd17c

File tree

5 files changed

+139
-49
lines changed

5 files changed

+139
-49
lines changed

pandas/core/frame.py

+36-19
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@
204204
if TYPE_CHECKING:
205205
import datetime
206206

207+
from pandas._libs.internals import BlockValuesRefs
207208
from pandas._typing import (
208209
AggFuncType,
209210
AnyAll,
@@ -3939,11 +3940,11 @@ def isetitem(self, loc, value) -> None:
39393940
)
39403941

39413942
for i, idx in enumerate(loc):
3942-
arraylike = self._sanitize_column(value.iloc[:, i])
3943+
arraylike, _ = self._sanitize_column(value.iloc[:, i])
39433944
self._iset_item_mgr(idx, arraylike, inplace=False)
39443945
return
39453946

3946-
arraylike = self._sanitize_column(value)
3947+
arraylike, _ = self._sanitize_column(value)
39473948
self._iset_item_mgr(loc, arraylike, inplace=False)
39483949

39493950
def __setitem__(self, key, value):
@@ -4114,7 +4115,7 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None:
41144115
return
41154116

41164117
# now align rows
4117-
arraylike = _reindex_for_setitem(value, self.index)
4118+
arraylike, _ = _reindex_for_setitem(value, self.index)
41184119
self._set_item_mgr(key, arraylike)
41194120
return
41204121

@@ -4127,20 +4128,26 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None:
41274128
self[key] = value[value.columns[0]]
41284129

41294130
def _iset_item_mgr(
4130-
self, loc: int | slice | np.ndarray, value, inplace: bool = False
4131+
self,
4132+
loc: int | slice | np.ndarray,
4133+
value,
4134+
inplace: bool = False,
4135+
refs: BlockValuesRefs | None = None,
41314136
) -> None:
41324137
# when called from _set_item_mgr loc can be anything returned from get_loc
4133-
self._mgr.iset(loc, value, inplace=inplace)
4138+
self._mgr.iset(loc, value, inplace=inplace, refs=refs)
41344139
self._clear_item_cache()
41354140

4136-
def _set_item_mgr(self, key, value: ArrayLike) -> None:
4141+
def _set_item_mgr(
4142+
self, key, value: ArrayLike, refs: BlockValuesRefs | None = None
4143+
) -> None:
41374144
try:
41384145
loc = self._info_axis.get_loc(key)
41394146
except KeyError:
41404147
# This item wasn't present, just insert at end
4141-
self._mgr.insert(len(self._info_axis), key, value)
4148+
self._mgr.insert(len(self._info_axis), key, value, refs)
41424149
else:
4143-
self._iset_item_mgr(loc, value)
4150+
self._iset_item_mgr(loc, value, refs=refs)
41444151

41454152
# check if we are modifying a copy
41464153
# try to set first as we want an invalid
@@ -4170,7 +4177,7 @@ def _set_item(self, key, value) -> None:
41704177
Series/TimeSeries will be conformed to the DataFrames index to
41714178
ensure homogeneity.
41724179
"""
4173-
value = self._sanitize_column(value)
4180+
value, refs = self._sanitize_column(value, using_cow=using_copy_on_write())
41744181

41754182
if (
41764183
key in self.columns
@@ -4182,8 +4189,9 @@ def _set_item(self, key, value) -> None:
41824189
existing_piece = self[key]
41834190
if isinstance(existing_piece, DataFrame):
41844191
value = np.tile(value, (len(existing_piece.columns), 1)).T
4192+
refs = None
41854193

4186-
self._set_item_mgr(key, value)
4194+
self._set_item_mgr(key, value, refs)
41874195

41884196
def _set_value(
41894197
self, index: IndexLabel, col, value: Scalar, takeable: bool = False
@@ -4811,7 +4819,7 @@ def insert(
48114819
elif isinstance(value, DataFrame):
48124820
value = value.iloc[:, 0]
48134821

4814-
value = self._sanitize_column(value)
4822+
value, _ = self._sanitize_column(value)
48154823
self._mgr.insert(loc, column, value)
48164824

48174825
def assign(self, **kwargs) -> DataFrame:
@@ -4882,29 +4890,34 @@ def assign(self, **kwargs) -> DataFrame:
48824890
data[k] = com.apply_if_callable(v, data)
48834891
return data
48844892

4885-
def _sanitize_column(self, value) -> ArrayLike:
4893+
def _sanitize_column(
4894+
self, value, using_cow: bool = False
4895+
) -> tuple[ArrayLike, BlockValuesRefs | None]:
48864896
"""
48874897
Ensures new columns (which go into the BlockManager as new blocks) are
4888-
always copied and converted into an array.
4898+
always copied (or a reference is being tracked to them under CoW)
4899+
and converted into an array.
48894900
48904901
Parameters
48914902
----------
48924903
value : scalar, Series, or array-like
48934904
48944905
Returns
48954906
-------
4896-
numpy.ndarray or ExtensionArray
4907+
tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs
48974908
"""
48984909
self._ensure_valid_index(value)
48994910

49004911
# Using a DataFrame would mean coercing values to one dtype
49014912
assert not isinstance(value, DataFrame)
49024913
if is_dict_like(value):
4903-
return _reindex_for_setitem(Series(value), self.index)
4914+
if not isinstance(value, Series):
4915+
value = Series(value)
4916+
return _reindex_for_setitem(value, self.index, using_cow=using_cow)
49044917

49054918
if is_list_like(value):
49064919
com.require_length_match(value, self.index)
4907-
return sanitize_array(value, self.index, copy=True, allow_2d=True)
4920+
return sanitize_array(value, self.index, copy=True, allow_2d=True), None
49084921

49094922
@property
49104923
def _series(self):
@@ -11915,11 +11928,15 @@ def _from_nested_dict(data) -> collections.defaultdict:
1191511928
return new_data
1191611929

1191711930

11918-
def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
11931+
def _reindex_for_setitem(
11932+
value: DataFrame | Series, index: Index, using_cow: bool = False
11933+
) -> tuple[ArrayLike, BlockValuesRefs | None]:
1191911934
# reindex if necessary
1192011935

1192111936
if value.index.equals(index) or not len(index):
11922-
return value._values.copy()
11937+
if using_cow and isinstance(value, Series):
11938+
return value._values, value._references
11939+
return value._values.copy(), None
1192311940

1192411941
# GH#4107
1192511942
try:
@@ -11933,4 +11950,4 @@ def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
1193311950
raise TypeError(
1193411951
"incompatible index of inserted column with frame index"
1193511952
) from err
11936-
return reindexed_value
11953+
return reindexed_value, None

pandas/core/internals/array_manager.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -798,7 +798,11 @@ def column_arrays(self) -> list[ArrayLike]:
798798
return [np.asarray(arr) for arr in self.arrays]
799799

800800
def iset(
801-
self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
801+
self,
802+
loc: int | slice | np.ndarray,
803+
value: ArrayLike,
804+
inplace: bool = False,
805+
refs=None,
802806
) -> None:
803807
"""
804808
Set new column(s).
@@ -874,7 +878,7 @@ def column_setitem(
874878
# update existing ArrayManager in-place
875879
self.arrays[loc] = new_mgr.arrays[0]
876880

877-
def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
881+
def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None:
878882
"""
879883
Insert item at selected position.
880884

pandas/core/internals/managers.py

+23-7
Original file line numberDiff line numberDiff line change
@@ -1129,7 +1129,11 @@ def column_arrays(self) -> list[np.ndarray]:
11291129
return result # type: ignore[return-value]
11301130

11311131
def iset(
1132-
self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
1132+
self,
1133+
loc: int | slice | np.ndarray,
1134+
value: ArrayLike,
1135+
inplace: bool = False,
1136+
refs: BlockValuesRefs | None = None,
11331137
):
11341138
"""
11351139
Set new item in-place. Does not consolidate. Adds new Block if not
@@ -1170,6 +1174,7 @@ def iset(
11701174
inplace=inplace,
11711175
blkno=blkno,
11721176
blk=blk,
1177+
refs=refs,
11731178
)
11741179

11751180
# error: Incompatible types in assignment (expression has type
@@ -1215,7 +1220,7 @@ def value_getitem(placement):
12151220
continue
12161221
else:
12171222
# Defer setting the new values to enable consolidation
1218-
self._iset_split_block(blkno_l, blk_locs)
1223+
self._iset_split_block(blkno_l, blk_locs, refs=refs)
12191224

12201225
if len(removed_blknos):
12211226
# Remove blocks & update blknos accordingly
@@ -1245,6 +1250,7 @@ def value_getitem(placement):
12451250
new_block_2d(
12461251
values=value,
12471252
placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)),
1253+
refs=refs,
12481254
)
12491255
for mgr_loc in unfit_idxr
12501256
)
@@ -1260,6 +1266,7 @@ def value_getitem(placement):
12601266
new_block_2d(
12611267
values=value_getitem(unfit_val_items),
12621268
placement=BlockPlacement(unfit_idxr),
1269+
refs=refs,
12631270
)
12641271
)
12651272

@@ -1276,6 +1283,7 @@ def _iset_split_block(
12761283
blkno_l: int,
12771284
blk_locs: np.ndarray | list[int],
12781285
value: ArrayLike | None = None,
1286+
refs: BlockValuesRefs | None = None,
12791287
) -> None:
12801288
"""Removes columns from a block by splitting the block.
12811289
@@ -1288,6 +1296,7 @@ def _iset_split_block(
12881296
blkno_l: The block number to operate on, relevant for updating the manager
12891297
blk_locs: The locations of our block that should be deleted.
12901298
value: The value to set as a replacement.
1299+
refs: The reference tracking object of the value to set.
12911300
"""
12921301
blk = self.blocks[blkno_l]
12931302

@@ -1297,7 +1306,7 @@ def _iset_split_block(
12971306
nbs_tup = tuple(blk.delete(blk_locs))
12981307
if value is not None:
12991308
locs = blk.mgr_locs.as_array[blk_locs]
1300-
first_nb = new_block_2d(value, BlockPlacement(locs))
1309+
first_nb = new_block_2d(value, BlockPlacement(locs), refs=refs)
13011310
else:
13021311
first_nb = nbs_tup[0]
13031312
nbs_tup = tuple(nbs_tup[1:])
@@ -1319,7 +1328,13 @@ def _iset_split_block(
13191328
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
13201329

13211330
def _iset_single(
1322-
self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block
1331+
self,
1332+
loc: int,
1333+
value: ArrayLike,
1334+
inplace: bool,
1335+
blkno: int,
1336+
blk: Block,
1337+
refs: BlockValuesRefs | None = None,
13231338
) -> None:
13241339
"""
13251340
Fastpath for iset when we are only setting a single position and
@@ -1339,7 +1354,7 @@ def _iset_single(
13391354
blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
13401355
return
13411356

1342-
nb = new_block_2d(value, placement=blk._mgr_locs)
1357+
nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs)
13431358
old_blocks = self.blocks
13441359
new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
13451360
self.blocks = new_blocks
@@ -1377,7 +1392,7 @@ def column_setitem(
13771392
new_mgr = col_mgr.setitem((idx,), value)
13781393
self.iset(loc, new_mgr._block.values, inplace=True)
13791394

1380-
def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
1395+
def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None:
13811396
"""
13821397
Insert item at selected position.
13831398
@@ -1386,6 +1401,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
13861401
loc : int
13871402
item : hashable
13881403
value : np.ndarray or ExtensionArray
1404+
refs : The reference tracking object of the value to set.
13891405
"""
13901406
# insert to the axis; this could possibly raise a TypeError
13911407
new_axis = self.items.insert(loc, item)
@@ -1401,7 +1417,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
14011417

14021418
bp = BlockPlacement(slice(loc, loc + 1))
14031419
# TODO(CoW) do we always "own" the passed `value`?
1404-
block = new_block_2d(values=value, placement=bp)
1420+
block = new_block_2d(values=value, placement=bp, refs=refs)
14051421

14061422
if not len(self.blocks):
14071423
# Fastpath

pandas/tests/copy_view/test_indexing.py

+6-10
Original file line numberDiff line numberDiff line change
@@ -977,28 +977,24 @@ def test_column_as_series_no_item_cache(
977977
# TODO add tests for other indexing methods on the Series
978978

979979

980-
def test_dataframe_add_column_from_series(backend):
980+
def test_dataframe_add_column_from_series(backend, using_copy_on_write):
981981
# Case: adding a new column to a DataFrame from an existing column/series
982-
# -> always already takes a copy on assignment
983-
# (no change in behaviour here)
984-
# TODO can we achieve the same behaviour with Copy-on-Write?
982+
# -> delays copy under CoW
985983
_, DataFrame, Series = backend
986984
df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
987985

988986
s = Series([10, 11, 12])
989987
df["new"] = s
990-
assert not np.shares_memory(get_array(df, "new"), s.values)
988+
if using_copy_on_write:
989+
assert np.shares_memory(get_array(df, "new"), get_array(s))
990+
else:
991+
assert not np.shares_memory(get_array(df, "new"), get_array(s))
991992

992993
# editing series -> doesn't modify column in frame
993994
s[0] = 0
994995
expected = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]})
995996
tm.assert_frame_equal(df, expected)
996997

997-
# editing column in frame -> doesn't modify series
998-
df.loc[2, "new"] = 100
999-
expected_s = Series([0, 11, 12])
1000-
tm.assert_series_equal(s, expected_s)
1001-
1002998

1003999
@pytest.mark.parametrize("val", [100, "a"])
10041000
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)