From b5d4f8452db50fc9a38056900554e73bfd0ba8f6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 26 Feb 2023 01:35:03 +0100 Subject: [PATCH 01/14] CoW: PoC for avoiding copy when setting values for rhs --- pandas/core/frame.py | 41 +++++++++++++++++++------------ pandas/core/internals/managers.py | 4 +-- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 49416cc2d53c0..96ee183e98282 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -47,6 +47,7 @@ properties, ) from pandas._libs.hashtable import duplicated +from pandas._libs.internals import BlockValuesRefs from pandas._libs.lib import ( NoDefault, is_range_indexer, @@ -3896,11 +3897,11 @@ def isetitem(self, loc, value) -> None: loc = [loc] for i, idx in enumerate(loc): - arraylike = self._sanitize_column(value.iloc[:, i]) + arraylike, _ = self._sanitize_column(value.iloc[:, i]) self._iset_item_mgr(idx, arraylike, inplace=False) return - arraylike = self._sanitize_column(value) + arraylike, _ = self._sanitize_column(value) self._iset_item_mgr(loc, arraylike, inplace=False) def __setitem__(self, key, value): @@ -4069,7 +4070,7 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: return # now align rows - arraylike = _reindex_for_setitem(value, self.index) + arraylike, _ = _reindex_for_setitem(value, self.index) self._set_item_mgr(key, arraylike) return @@ -4088,14 +4089,14 @@ def _iset_item_mgr( self._mgr.iset(loc, value, inplace=inplace) self._clear_item_cache() - def _set_item_mgr(self, key, value: ArrayLike) -> None: + def _set_item_mgr(self, key, value: ArrayLike, refs=None) -> None: try: loc = self._info_axis.get_loc(key) except KeyError: # This item wasn't present, just insert at end - self._mgr.insert(len(self._info_axis), key, value) + self._mgr.insert(len(self._info_axis), key, value, refs) else: - self._iset_item_mgr(loc, value) + self._iset_item_mgr(loc, value, refs) # check if we are modifying a copy # try to set first as we want an invalid @@ -4104,7 +4105,7 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None: self._check_setitem_copy() def _iset_item(self, loc: int, value) -> None: - arraylike = self._sanitize_column(value) + arraylike, _ = self._sanitize_column(value) self._iset_item_mgr(loc, arraylike, inplace=True) # check if we are modifying a copy @@ -4123,7 +4124,7 @@ def _set_item(self, key, value) -> None: Series/TimeSeries will be conformed to the DataFrames index to ensure homogeneity. """ - value = self._sanitize_column(value) + value, refs = self._sanitize_column(value) if ( key in self.columns @@ -4136,7 +4137,7 @@ def _set_item(self, key, value) -> None: if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)).T - self._set_item_mgr(key, value) + self._set_item_mgr(key, value, refs) def _set_value( self, index: IndexLabel, col, value: Scalar, takeable: bool = False @@ -4755,7 +4756,7 @@ def insert( if not isinstance(loc, int): raise TypeError("loc must be int") - value = self._sanitize_column(value) + value, _ = self._sanitize_column(value) self._mgr.insert(loc, column, value) def assign(self, **kwargs) -> DataFrame: @@ -4826,7 +4827,7 @@ def assign(self, **kwargs) -> DataFrame: data[k] = com.apply_if_callable(v, data) return data - def _sanitize_column(self, value) -> ArrayLike: + def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: """ Ensures new columns (which go into the BlockManager as new blocks) are always copied and converted into an array. @@ -4846,11 +4847,15 @@ def _sanitize_column(self, value) -> ArrayLike: if isinstance(value, DataFrame): return _reindex_for_setitem(value, self.index) elif is_dict_like(value): - return _reindex_for_setitem(Series(value), self.index) + if not isinstance(value, Series): + value = Series(value) + return _reindex_for_setitem( + value, self.index, using_cow=using_copy_on_write() + ) if is_list_like(value): com.require_length_match(value, self.index) - return sanitize_array(value, self.index, copy=True, allow_2d=True) + return sanitize_array(value, self.index, copy=True, allow_2d=True), None @property def _series(self): @@ -11566,11 +11571,15 @@ def _from_nested_dict(data) -> collections.defaultdict: return new_data -def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: +def _reindex_for_setitem( + value: DataFrame | Series, index: Index, using_cow: bool = False +) -> tuple[ArrayLike, BlockValuesRefs | None]: # reindex if necessary if value.index.equals(index) or not len(index): - return value._values.copy() + if using_cow and isinstance(value, Series): + return value._values, value._mgr.blocks[0].refs + return value._values.copy(), None # GH#4107 try: @@ -11584,4 +11593,4 @@ def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: raise TypeError( "incompatible index of inserted column with frame index" ) from err - return reindexed_value + return reindexed_value, None diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cb32b3bbc6cc7..3959639204d97 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1358,7 +1358,7 @@ def column_setitem( new_mgr = col_mgr.setitem((idx,), value) self.iset(loc, new_mgr._block.values, inplace=True) - def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: + def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: """ Insert item at selected position. @@ -1382,7 +1382,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: bp = BlockPlacement(slice(loc, loc + 1)) # TODO(CoW) do we always "own" the passed `value`? - block = new_block_2d(values=value, placement=bp) + block = new_block_2d(values=value, placement=bp, refs=refs) if not len(self.blocks): # Fastpath From e3a50ddebdcf4366f99d0cd08fe85aa2eb1cf2bd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 28 Feb 2023 23:40:55 +0100 Subject: [PATCH 02/14] Update tests --- pandas/core/frame.py | 16 ++++++------ pandas/core/internals/blocks.py | 13 ++++++++-- pandas/core/internals/managers.py | 28 ++++++++++++++++----- pandas/tests/copy_view/test_indexing.py | 7 ++++-- pandas/tests/copy_view/test_setitem.py | 8 ++---- pandas/tests/frame/indexing/test_setitem.py | 2 ++ 6 files changed, 50 insertions(+), 24 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eb18bccfbd99c..34b03d29a9518 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4085,10 +4085,10 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: self[key] = value[value.columns[0]] def _iset_item_mgr( - self, loc: int | slice | np.ndarray, value, inplace: bool = False + self, loc: int | slice | np.ndarray, value, inplace: bool = False, refs=None ) -> None: # when called from _set_item_mgr loc can be anything returned from get_loc - self._mgr.iset(loc, value, inplace=inplace) + self._mgr.iset(loc, value, inplace=inplace, refs=refs) self._clear_item_cache() def _set_item_mgr(self, key, value: ArrayLike, refs=None) -> None: @@ -4098,7 +4098,7 @@ def _set_item_mgr(self, key, value: ArrayLike, refs=None) -> None: # This item wasn't present, just insert at end self._mgr.insert(len(self._info_axis), key, value, refs) else: - self._iset_item_mgr(loc, value, refs) + self._iset_item_mgr(loc, value, refs=refs) # check if we are modifying a copy # try to set first as we want an invalid @@ -4126,7 +4126,7 @@ def _set_item(self, key, value) -> None: Series/TimeSeries will be conformed to the DataFrames index to ensure homogeneity. """ - value, refs = self._sanitize_column(value) + value, refs = self._sanitize_column(value, using_cow=using_copy_on_write()) if ( key in self.columns @@ -4837,7 +4837,9 @@ def assign(self, **kwargs) -> DataFrame: data[k] = com.apply_if_callable(v, data) return data - def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: + def _sanitize_column( + self, value, using_cow: bool = False + ) -> tuple[ArrayLike, BlockValuesRefs | None]: """ Ensures new columns (which go into the BlockManager as new blocks) are always copied and converted into an array. @@ -4857,9 +4859,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_dict_like(value): if not isinstance(value, Series): value = Series(value) - return _reindex_for_setitem( - value, self.index, using_cow=using_copy_on_write() - ) + return _reindex_for_setitem(value, self.index, using_cow=using_cow) if is_list_like(value): com.require_length_match(value, self.index) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 32b159983308a..9015f14d15d25 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -889,7 +889,9 @@ def _slice( return self.values[slicer] - def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: + def set_inplace( + self, locs, values: ArrayLike, copy: bool = False, refs=None + ) -> None: """ Modify block values in-place with new item value. @@ -903,6 +905,8 @@ def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: Caller is responsible for checking values.dtype == self.dtype. """ + if refs is not None: + values = values.copy() if copy: self.values = self.values.copy() self.values[locs] = values @@ -1853,9 +1857,14 @@ def iget(self, i: int | tuple[int, int] | tuple[slice, int]): raise IndexError(f"{self} only contains one item") return self.values - def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: + def set_inplace( + self, locs, values: ArrayLike, copy: bool = False, refs=None + ) -> None: # When an ndarray, we should have locs.tolist() == [0] # When a BlockPlacement we should have list(locs) == [0] + if refs is not None: + self = self.make_block(values, refs=refs) + return if copy: self.values = self.values.copy() self.values[:] = values diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 31908c8f5e61a..93b00ccf426d8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1114,7 +1114,11 @@ def column_arrays(self) -> list[np.ndarray]: return result # type: ignore[return-value] def iset( - self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False + self, + loc: int | slice | np.ndarray, + value: ArrayLike, + inplace: bool = False, + refs=None, ): """ Set new item in-place. Does not consolidate. Adds new Block if not @@ -1155,6 +1159,7 @@ def iset( inplace=inplace, blkno=blkno, blk=blk, + refs=refs, ) # error: Incompatible types in assignment (expression has type @@ -1186,7 +1191,9 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs)) + self._iset_split_block( + blkno_l, blk_locs, value_getitem(val_locs), refs=refs + ) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) continue @@ -1230,6 +1237,7 @@ def value_getitem(placement): new_block_2d( values=value, placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)), + refs=refs, ) for mgr_loc in unfit_idxr ) @@ -1245,6 +1253,7 @@ def value_getitem(placement): new_block_2d( values=value_getitem(unfit_val_items), placement=BlockPlacement(unfit_idxr), + refs=refs, ) ) @@ -1261,6 +1270,7 @@ def _iset_split_block( blkno_l: int, blk_locs: np.ndarray | list[int], value: ArrayLike | None = None, + refs=None, ) -> None: """Removes columns from a block by splitting the block. @@ -1282,7 +1292,7 @@ def _iset_split_block( nbs_tup = tuple(blk.delete(blk_locs)) if value is not None: locs = blk.mgr_locs.as_array[blk_locs] - first_nb = new_block_2d(value, BlockPlacement(locs)) + first_nb = new_block_2d(value, BlockPlacement(locs), refs=refs) else: first_nb = nbs_tup[0] nbs_tup = tuple(nbs_tup[1:]) @@ -1304,7 +1314,13 @@ def _iset_split_block( self._blknos[nb.mgr_locs.indexer] = i + nr_blocks def _iset_single( - self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block + self, + loc: int, + value: ArrayLike, + inplace: bool, + blkno: int, + blk: Block, + refs=None, ) -> None: """ Fastpath for iset when we are only setting a single position and @@ -1321,10 +1337,10 @@ def _iset_single( # perform Copy-on-Write and clear the reference copy = True iloc = self.blklocs[loc] - blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) + blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy, refs=refs) return - nb = new_block_2d(value, placement=blk._mgr_locs) + nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs) old_blocks = self.blocks new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] self.blocks = new_blocks diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index ca2954f0d390e..2579f5c2f6627 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -977,7 +977,7 @@ def test_column_as_series_no_item_cache( # TODO add tests for other indexing methods on the Series -def test_dataframe_add_column_from_series(backend): +def test_dataframe_add_column_from_series(backend, using_copy_on_write): # Case: adding a new column to a DataFrame from an existing column/series # -> always already takes a copy on assignment # (no change in behaviour here) @@ -987,7 +987,10 @@ def test_dataframe_add_column_from_series(backend): s = Series([10, 11, 12]) df["new"] = s - assert not np.shares_memory(get_array(df, "new"), s.values) + if using_copy_on_write: + assert np.shares_memory(get_array(df, "new"), get_array(s)) + else: + assert not np.shares_memory(get_array(df, "new"), get_array(s)) # editing series -> doesn't modify column in frame s[0] = 0 diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 9e0d350dde0de..8b894f371c78d 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -34,9 +34,7 @@ def test_set_column_with_series(using_copy_on_write): df["c"] = ser if using_copy_on_write: - # TODO(CoW) with CoW we can delay the copy - # assert np.shares_memory(df["c"].values, ser.values) - assert not np.shares_memory(df["c"].values, ser.values) + assert np.shares_memory(df["c"].values, ser.values) else: # the series data is copied assert not np.shares_memory(df["c"].values, ser.values) @@ -79,9 +77,7 @@ def test_set_columns_with_dataframe(using_copy_on_write): df[["c", "d"]] = df2 if using_copy_on_write: - # TODO(CoW) with CoW we can delay the copy - # assert np.shares_memory(df["c"].values, df2["c"].values) - assert not np.shares_memory(df["c"].values, df2["c"].values) + assert np.shares_memory(df["c"].values, df2["c"].values) else: # the data is copied assert not np.shares_memory(df["c"].values, df2["c"].values) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index c20db86904d06..8c520310663e4 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -514,8 +514,10 @@ def test_setitem_intervals(self): tm.assert_series_equal(df["C"], df["E"], check_names=False) def test_setitem_categorical(self): + pd.options.mode.copy_on_write = True # GH#35369 df = DataFrame({"h": Series(list("mn")).astype("category")}) + # view = df[:] df.h = df.h.cat.reorder_categories(["n", "m"]) expected = DataFrame( {"h": Categorical(["m", "n"]).reorder_categories(["n", "m"])} From 290facdda4e46a79251f195732c7515c4a457237 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 1 Mar 2023 00:50:01 +0100 Subject: [PATCH 03/14] Fix array manager --- pandas/core/internals/array_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 87e549ae023ef..b86aa48fc15da 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -872,7 +872,7 @@ def column_setitem( # update existing ArrayManager in-place self.arrays[loc] = new_mgr.arrays[0] - def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: + def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: """ Insert item at selected position. From d75f46adc6d941670016d6716ff81d7b522cda9c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 15:32:45 +0100 Subject: [PATCH 04/14] Add type hint --- pandas/core/internals/managers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 63e0803da5f44..9d00ef10bb2b0 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -22,7 +22,10 @@ internals as libinternals, lib, ) -from pandas._libs.internals import BlockPlacement +from pandas._libs.internals import ( + BlockPlacement, + BlockValuesRefs, +) from pandas.errors import PerformanceWarning from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level @@ -1129,7 +1132,7 @@ def iset( loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False, - refs=None, + refs: BlockValuesRefs | None = None, ): """ Set new item in-place. Does not consolidate. Adds new Block if not From c61dffbe55104177108909bf3e03feba0b38e9fa Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 15:38:23 +0100 Subject: [PATCH 05/14] Remove unnecessary --- pandas/core/frame.py | 2 +- pandas/core/internals/blocks.py | 6 +----- pandas/core/internals/managers.py | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 976a311be28f0..aa97521bde780 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12043,7 +12043,7 @@ def _reindex_for_setitem( if value.index.equals(index) or not len(index): if using_cow and isinstance(value, Series): - return value._values, value._mgr.blocks[0].refs + return value._values, value._references return value._values.copy(), None # GH#4107 diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3276e68657dd6..f139fb78d483a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -905,9 +905,7 @@ def _slice( return self.values[slicer] - def set_inplace( - self, locs, values: ArrayLike, copy: bool = False, refs=None - ) -> None: + def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: """ Modify block values in-place with new item value. @@ -921,8 +919,6 @@ def set_inplace( Caller is responsible for checking values.dtype == self.dtype. """ - if refs is not None: - values = values.copy() if copy: self.values = self.values.copy() self.values[locs] = values diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9d00ef10bb2b0..8d3bf9c0e1c82 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1351,7 +1351,7 @@ def _iset_single( # perform Copy-on-Write and clear the reference copy = True iloc = self.blklocs[loc] - blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy, refs=refs) + blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) return nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs) From 7bdff6bb2c785f224a99b81d1e4f83da2492ecab Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 15:42:33 +0100 Subject: [PATCH 06/14] Fix typing --- pandas/core/frame.py | 10 ++++++++-- pandas/core/internals/blocks.py | 7 +------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aa97521bde780..5a32bf02933bc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4131,13 +4131,19 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: self[key] = value[value.columns[0]] def _iset_item_mgr( - self, loc: int | slice | np.ndarray, value, inplace: bool = False, refs=None + self, + loc: int | slice | np.ndarray, + value, + inplace: bool = False, + refs: BlockValuesRefs | None = None, ) -> None: # when called from _set_item_mgr loc can be anything returned from get_loc self._mgr.iset(loc, value, inplace=inplace, refs=refs) self._clear_item_cache() - def _set_item_mgr(self, key, value: ArrayLike, refs=None) -> None: + def _set_item_mgr( + self, key, value: ArrayLike, refs: BlockValuesRefs | None = None + ) -> None: try: loc = self._info_axis.get_loc(key) except KeyError: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f139fb78d483a..cb336d2f718a6 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1902,14 +1902,9 @@ def iget(self, i: int | tuple[int, int] | tuple[slice, int]): raise IndexError(f"{self} only contains one item") return self.values - def set_inplace( - self, locs, values: ArrayLike, copy: bool = False, refs=None - ) -> None: + def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: # When an ndarray, we should have locs.tolist() == [0] # When a BlockPlacement we should have list(locs) == [0] - if refs is not None: - self = self.make_block(values, refs=refs) - return if copy: self.values = self.values.copy() self.values[:] = values From 90777f52313b4ede83ea7e8558b695b50161d872 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 15:55:43 +0100 Subject: [PATCH 07/14] Add tests --- pandas/conftest.py | 1 + pandas/core/frame.py | 1 + pandas/core/internals/managers.py | 9 +++--- pandas/tests/copy_view/test_indexing.py | 31 +++++++++++++++++++++ pandas/tests/frame/indexing/test_setitem.py | 2 -- 5 files changed, 37 insertions(+), 7 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 95bb2078d151c..fcf74bbf796dd 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1921,6 +1921,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5a32bf02933bc..668ff2bb97fc6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4192,6 +4192,7 @@ def _set_item(self, key, value) -> None: existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)).T + refs = None self._set_item_mgr(key, value, refs) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8d3bf9c0e1c82..ba875c8f27f0e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1205,9 +1205,7 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - self._iset_split_block( - blkno_l, blk_locs, value_getitem(val_locs), refs=refs - ) + self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs)) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) continue @@ -1221,7 +1219,7 @@ def value_getitem(placement): continue else: # Defer setting the new values to enable consolidation - self._iset_split_block(blkno_l, blk_locs) + self._iset_split_block(blkno_l, blk_locs, refs=refs) if len(removed_blknos): # Remove blocks & update blknos accordingly @@ -1284,7 +1282,7 @@ def _iset_split_block( blkno_l: int, blk_locs: np.ndarray | list[int], value: ArrayLike | None = None, - refs=None, + refs: BlockValuesRefs | None = None, ) -> None: """Removes columns from a block by splitting the block. @@ -1297,6 +1295,7 @@ def _iset_split_block( blkno_l: The block number to operate on, relevant for updating the manager blk_locs: The locations of our block that should be deleted. value: The value to set as a replacement. + refs: The reference tracking object of the value to set. """ blk = self.blocks[blkno_l] diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 365490c957e49..2beb51cbe1359 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1052,3 +1052,34 @@ def test_getitem_midx_slice(using_copy_on_write, using_array_manager): if using_copy_on_write: new_df.iloc[0, 0] = 100 tm.assert_frame_equal(df_orig, df) + + +def test_setitem_series_no_copy(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + df["b"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "b")) + + df.iloc[0, 1] = 100 + tm.assert_series_equal(rhs, rhs_orig) + + df["a"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "a")) + + df.iloc[0, 0] = 100 + tm.assert_series_equal(rhs, rhs_orig) + + +def test_setitem_series_no_copy_split_block(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + df["b"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "b")) + + df.iloc[0, 1] = 100 + tm.assert_series_equal(rhs, rhs_orig) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 612a4a0a0c1b2..998ac9c8395ce 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -514,10 +514,8 @@ def test_setitem_intervals(self): tm.assert_series_equal(df["C"], df["E"], check_names=False) def test_setitem_categorical(self): - pd.options.mode.copy_on_write = True # GH#35369 df = DataFrame({"h": Series(list("mn")).astype("category")}) - # view = df[:] df.h = df.h.cat.reorder_categories(["n", "m"]) expected = DataFrame( {"h": Categorical(["m", "n"]).reorder_categories(["n", "m"])} From 46256fe331f6fa86cabf2874cc215ebf6a4dc625 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 15:56:29 +0100 Subject: [PATCH 08/14] Add tests --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index fcf74bbf796dd..95bb2078d151c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1921,7 +1921,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" From 2bc836b0589cfc91fc3246572f5e6ab17c076fc0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 15:58:41 +0100 Subject: [PATCH 09/14] Move tests --- pandas/core/internals/managers.py | 2 +- pandas/tests/copy_view/test_indexing.py | 31 ------------------------ pandas/tests/copy_view/test_setitem.py | 32 +++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ba875c8f27f0e..a77720cfdcb40 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1333,7 +1333,7 @@ def _iset_single( inplace: bool, blkno: int, blk: Block, - refs=None, + refs: BlockValuesRefs | None = None, ) -> None: """ Fastpath for iset when we are only setting a single position and diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 2beb51cbe1359..365490c957e49 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1052,34 +1052,3 @@ def test_getitem_midx_slice(using_copy_on_write, using_array_manager): if using_copy_on_write: new_df.iloc[0, 0] = 100 tm.assert_frame_equal(df_orig, df) - - -def test_setitem_series_no_copy(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3]}) - rhs = Series([4, 5, 6]) - rhs_orig = rhs.copy() - df["b"] = rhs - if using_copy_on_write: - assert np.shares_memory(get_array(rhs), get_array(df, "b")) - - df.iloc[0, 1] = 100 - tm.assert_series_equal(rhs, rhs_orig) - - df["a"] = rhs - if using_copy_on_write: - assert np.shares_memory(get_array(rhs), get_array(df, "a")) - - df.iloc[0, 0] = 100 - tm.assert_series_equal(rhs, rhs_orig) - - -def test_setitem_series_no_copy_split_block(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3], "b": 1}) - rhs = Series([4, 5, 6]) - rhs_orig = rhs.copy() - df["b"] = rhs - if using_copy_on_write: - assert np.shares_memory(get_array(rhs), get_array(df, "b")) - - df.iloc[0, 1] = 100 - tm.assert_series_equal(rhs, rhs_orig) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 8b894f371c78d..5f0d23d92a02b 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -7,6 +7,7 @@ Series, ) import pandas._testing as tm +from pandas.tests.copy_view.util import get_array # ----------------------------------------------------------------------------- # Copy/view behaviour for the values that are set in a DataFrame @@ -85,3 +86,34 @@ def test_set_columns_with_dataframe(using_copy_on_write): # and modifying the set DataFrame does not modify the original DataFrame df2.iloc[0, 0] = 0 tm.assert_series_equal(df["c"], Series([7, 8, 9], name="c")) + + +def test_setitem_series_no_copy(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + df["b"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "b")) + + df.iloc[0, 1] = 100 + tm.assert_series_equal(rhs, rhs_orig) + + df["a"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "a")) + + df.iloc[0, 0] = 100 + tm.assert_series_equal(rhs, rhs_orig) + + +def test_setitem_series_no_copy_split_block(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + df["b"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "b")) + + df.iloc[0, 1] = 100 + tm.assert_series_equal(rhs, rhs_orig) From 14d77da22380b525f5137f0bab8a3259984f4759 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 16:17:20 +0100 Subject: [PATCH 10/14] Add midx test --- pandas/tests/copy_view/test_setitem.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 5f0d23d92a02b..0991850580b55 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -3,6 +3,7 @@ from pandas import ( DataFrame, Index, + MultiIndex, RangeIndex, Series, ) @@ -117,3 +118,15 @@ def test_setitem_series_no_copy_split_block(using_copy_on_write): df.iloc[0, 1] = 100 tm.assert_series_equal(rhs, rhs_orig) + + +def test_setitem_series_column_midx_broadcasting(using_copy_on_write): + df = DataFrame( + [[1, 2, 3], [3, 4, 5]], + columns=MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 3]]), + ) + rhs = Series([10, 11]) + df["a"] = rhs + assert not np.shares_memory(get_array(rhs), df._get_column_array(0)) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) From fcb0bb44029969f912b9221d376df22add854c94 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 16:55:01 +0100 Subject: [PATCH 11/14] Fix missing arg --- pandas/core/internals/array_manager.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 5242fa6913592..b3a6b9bb23a78 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -798,7 +798,11 @@ def column_arrays(self) -> list[ArrayLike]: return [np.asarray(arr) for arr in self.arrays] def iset( - self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False + self, + loc: int | slice | np.ndarray, + value: ArrayLike, + inplace: bool = False, + refs=None, ) -> None: """ Set new column(s). From d6ed1cc62e5c874a7ede2d33b998cc2ab7e051d5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 22:44:21 +0100 Subject: [PATCH 12/14] Remove --- pandas/tests/copy_view/test_indexing.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 365490c957e49..63f0ffe8542e2 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -979,9 +979,7 @@ def test_column_as_series_no_item_cache( def test_dataframe_add_column_from_series(backend, using_copy_on_write): # Case: adding a new column to a DataFrame from an existing column/series - # -> always already takes a copy on assignment - # (no change in behaviour here) - # TODO can we achieve the same behaviour with Copy-on-Write? + # -> delays copy under CoW _, DataFrame, Series = backend df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) @@ -997,11 +995,6 @@ def test_dataframe_add_column_from_series(backend, using_copy_on_write): expected = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) tm.assert_frame_equal(df, expected) - # editing column in frame -> doesn't modify series - df.loc[2, "new"] = 100 - expected_s = Series([0, 11, 12]) - tm.assert_series_equal(s, expected_s) - @pytest.mark.parametrize("val", [100, "a"]) @pytest.mark.parametrize( From 1997e3fb67f3eef9f10cf1b8279f2ea24f3f2fec Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 May 2023 12:52:39 +0200 Subject: [PATCH 13/14] add some more comments to the tests --- pandas/tests/copy_view/test_setitem.py | 30 ++++++++++++++++++++------ 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 0991850580b55..2a99a00e249fa 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -22,7 +22,7 @@ def test_set_column_with_array(): df["c"] = arr # the array data is copied - assert not np.shares_memory(df["c"].values, arr) + assert not np.shares_memory(get_array(df, "c"), arr) # and thus modifying the array does not modify the DataFrame arr[0] = 0 tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) @@ -30,16 +30,17 @@ def test_set_column_with_array(): def test_set_column_with_series(using_copy_on_write): # Case: setting a series as a new column (df[col] = s) copies that data + # (with delayed copy with CoW) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) ser = Series([1, 2, 3]) df["c"] = ser if using_copy_on_write: - assert np.shares_memory(df["c"].values, ser.values) + assert np.shares_memory(get_array(df, "c"), get_array(ser)) else: # the series data is copied - assert not np.shares_memory(df["c"].values, ser.values) + assert not np.shares_memory(get_array(df, "c"), get_array(ser)) # and modifying the series does not modify the DataFrame ser.iloc[0] = 0 @@ -55,7 +56,7 @@ def test_set_column_with_index(using_copy_on_write): df["c"] = idx # the index data is copied - assert not np.shares_memory(df["c"].values, idx.values) + assert not np.shares_memory(get_array(df, "c"), idx.values) # and thus modifying the index does not modify the DataFrame idx.values[0] = 0 @@ -66,23 +67,24 @@ def test_set_column_with_index(using_copy_on_write): df["d"] = idx - assert not np.shares_memory(df["d"].values, arr) + assert not np.shares_memory(get_array(df, "d"), arr) arr[0] = 0 tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d")) def test_set_columns_with_dataframe(using_copy_on_write): # Case: setting a DataFrame as new columns copies that data + # (with delayed copy with CoW) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df2 = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}) df[["c", "d"]] = df2 if using_copy_on_write: - assert np.shares_memory(df["c"].values, df2["c"].values) + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) else: # the data is copied - assert not np.shares_memory(df["c"].values, df2["c"].values) + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) # and modifying the set DataFrame does not modify the original DataFrame df2.iloc[0, 0] = 0 @@ -90,9 +92,12 @@ def test_set_columns_with_dataframe(using_copy_on_write): def test_setitem_series_no_copy(using_copy_on_write): + # Case: setting a Series as column into a DataFrame can delay copying that data df = DataFrame({"a": [1, 2, 3]}) rhs = Series([4, 5, 6]) rhs_orig = rhs.copy() + + # adding a new column df["b"] = rhs if using_copy_on_write: assert np.shares_memory(get_array(rhs), get_array(df, "b")) @@ -100,6 +105,13 @@ def test_setitem_series_no_copy(using_copy_on_write): df.iloc[0, 1] = 100 tm.assert_series_equal(rhs, rhs_orig) + +def test_setitem_series_no_copy_single_block(using_copy_on_write): + # Overwriting an existing column that is a single block + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + df["a"] = rhs if using_copy_on_write: assert np.shares_memory(get_array(rhs), get_array(df, "a")) @@ -109,9 +121,11 @@ def test_setitem_series_no_copy(using_copy_on_write): def test_setitem_series_no_copy_split_block(using_copy_on_write): + # Overwriting an existing column that is part of a larger block df = DataFrame({"a": [1, 2, 3], "b": 1}) rhs = Series([4, 5, 6]) rhs_orig = rhs.copy() + df["b"] = rhs if using_copy_on_write: assert np.shares_memory(get_array(rhs), get_array(df, "b")) @@ -121,6 +135,8 @@ def test_setitem_series_no_copy_split_block(using_copy_on_write): def test_setitem_series_column_midx_broadcasting(using_copy_on_write): + # Setting a Series to multiple columns will repeat the data + # (currently copying the data eagerly) df = DataFrame( [[1, 2, 3], [3, 4, 5]], columns=MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 3]]), From 0e3d3f7d1537dbae9e608fdb3771cd6a4ea875d1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 May 2023 13:14:39 +0200 Subject: [PATCH 14/14] update some docstrings --- pandas/core/frame.py | 5 +++-- pandas/core/internals/managers.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e934bdbfeb4df..b6c80a51cec16 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4879,7 +4879,8 @@ def _sanitize_column( ) -> tuple[ArrayLike, BlockValuesRefs | None]: """ Ensures new columns (which go into the BlockManager as new blocks) are - always copied and converted into an array. + always copied (or a reference is being tracked to them under CoW) + and converted into an array. Parameters ---------- @@ -4887,7 +4888,7 @@ def _sanitize_column( Returns ------- - numpy.ndarray or ExtensionArray + tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs """ self._ensure_valid_index(value) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 57378fda92ff2..397f9d5b1bbe6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1401,6 +1401,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: loc : int item : hashable value : np.ndarray or ExtensionArray + refs : The reference tracking object of the value to set. """ # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item)