From b0812c22e56a0b83a1b2ff1650be239c318bf6cd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 20 Jan 2023 15:39:56 +0100 Subject: [PATCH 01/10] CLN: Refactor block splitting into its own method --- pandas/core/internals/managers.py | 59 ++++++++++++++---------- pandas/tests/internals/test_internals.py | 18 ++++++++ 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1a0aba0778da5..23d68e6986c94 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1220,13 +1220,9 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - nbs_tup = tuple(blk.delete(blk_locs)) - first_nb = new_block_2d( - value_getitem(val_locs), BlockPlacement(blk.mgr_locs[blk_locs]) + self._iset_split_block( + blkno_l, blk, blk_locs, value_getitem(val_locs) ) - if self.refs is not None: - self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) - self._clear_reference_block(blkno_l) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) continue @@ -1239,24 +1235,7 @@ def value_getitem(placement): removed_blknos.append(blkno_l) continue else: - nbs = blk.delete(blk_locs) - # Add first block where old block was and remaining blocks at - # the end to avoid updating all block numbers - first_nb = nbs[0] - nbs_tup = tuple(nbs[1:]) - nr_blocks = len(self.blocks) - blocks_tup = ( - self.blocks[:blkno_l] - + (first_nb,) - + self.blocks[blkno_l + 1 :] - + nbs_tup - ) - self.blocks = blocks_tup - self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) - - for i, nb in enumerate(nbs_tup): - self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) - self._blknos[nb.mgr_locs.indexer] = i + nr_blocks + self._iset_split_block(blkno_l, blk, blk_locs) if len(removed_blknos): # Remove blocks & update blknos and refs accordingly @@ -1320,6 +1299,38 @@ def value_getitem(placement): # Newly created block's dtype may already be present. self._known_consolidated = False + def _iset_split_block( + self, blkno_l: int, blk: Block, blk_locs, value: ArrayLike | None = None + ): + if self._blklocs is None: + self._rebuild_blknos_and_blklocs() + + nbs_tup = tuple(blk.delete(blk_locs)) + if value is not None: + first_nb = new_block_2d(value, BlockPlacement(blk.mgr_locs[blk_locs])) + else: + first_nb = nbs_tup[0] + nbs_tup = tuple(nbs_tup[1:]) + + if self.refs is not None: + self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) + + if value is not None: + # Only clear if we set new values + self._clear_reference_block(blkno_l) + + nr_blocks = len(self.blocks) + blocks_tup = ( + self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup + ) + self.blocks = blocks_tup + + self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) + + for i, nb in enumerate(nbs_tup): + self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + self._blknos[nb.mgr_locs.indexer] = i + nr_blocks + def _iset_single( self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block ) -> None: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index c253c4bd1ca53..8efe40abe7df6 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -883,6 +883,24 @@ def test_validate_bool_args(self, value): with pytest.raises(ValueError, match=msg): bm1.replace_list([1], [2], inplace=value) + def test_iset_split_block(self): + bm = create_mgr("a,b,c: i8; d: f8") + bm._iset_split_block(0, bm.blocks[0], np.array([0])) + tm.assert_numpy_array_equal(bm.blklocs, np.array([0, 0, 1, 0])) + # First indexer currently does not have a block associated with it in case + tm.assert_numpy_array_equal(bm.blknos, np.array([0, 0, 0, 1])) + assert len(bm.blocks) == 2 + + def test_iset_split_block_values(self): + bm = create_mgr("a,b,c: i8; d: f8") + bm._iset_split_block( + 0, bm.blocks[0], np.array([0]), np.array([list(range(10))]) + ) + tm.assert_numpy_array_equal(bm.blklocs, np.array([0, 0, 1, 0])) + # First indexer currently does not have a block associated with it in case + tm.assert_numpy_array_equal(bm.blknos, np.array([0, 2, 2, 1])) + assert len(bm.blocks) == 3 + def _as_array(mgr): if mgr.ndim == 1: From 0ee8783b61b7dccadc828ff979a97ef5ee865eb3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 20 Jan 2023 15:47:37 +0100 Subject: [PATCH 02/10] Add docs --- pandas/core/internals/managers.py | 23 ++++++++++++++++++----- pandas/tests/internals/test_internals.py | 6 ++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 23d68e6986c94..1a66eb812d0a7 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1220,9 +1220,7 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - self._iset_split_block( - blkno_l, blk, blk_locs, value_getitem(val_locs) - ) + self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs)) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) continue @@ -1235,7 +1233,8 @@ def value_getitem(placement): removed_blknos.append(blkno_l) continue else: - self._iset_split_block(blkno_l, blk, blk_locs) + # Defer setting the new values to enable consolidation + self._iset_split_block(blkno_l, blk_locs) if len(removed_blknos): # Remove blocks & update blknos and refs accordingly @@ -1300,8 +1299,22 @@ def value_getitem(placement): self._known_consolidated = False def _iset_split_block( - self, blkno_l: int, blk: Block, blk_locs, value: ArrayLike | None = None + self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None ): + """Removes columns from a block by splitting the block. + + Avoids copying the whole block through slicing and updates the manager + after determinint the new block structure. Optionally adds a new block, + otherwise has to be done by the caller. + + Parameters + ---------- + blkno_l: The block number to operate on, relevant for updating the manager + blk_locs: The locations of our block that should be deleted. + value: The value to set as a replacement. + """ + blk = self.blocks[blkno_l] + if self._blklocs is None: self._rebuild_blknos_and_blklocs() diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 8efe40abe7df6..9e784869ae0b4 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -885,7 +885,7 @@ def test_validate_bool_args(self, value): def test_iset_split_block(self): bm = create_mgr("a,b,c: i8; d: f8") - bm._iset_split_block(0, bm.blocks[0], np.array([0])) + bm._iset_split_block(0, np.array([0])) tm.assert_numpy_array_equal(bm.blklocs, np.array([0, 0, 1, 0])) # First indexer currently does not have a block associated with it in case tm.assert_numpy_array_equal(bm.blknos, np.array([0, 0, 0, 1])) @@ -893,9 +893,7 @@ def test_iset_split_block(self): def test_iset_split_block_values(self): bm = create_mgr("a,b,c: i8; d: f8") - bm._iset_split_block( - 0, bm.blocks[0], np.array([0]), np.array([list(range(10))]) - ) + bm._iset_split_block(0, np.array([0]), np.array([list(range(10))])) tm.assert_numpy_array_equal(bm.blklocs, np.array([0, 0, 1, 0])) # First indexer currently does not have a block associated with it in case tm.assert_numpy_array_equal(bm.blknos, np.array([0, 2, 2, 1])) From 39446555a95abd7e056aefdb71911dc83d60d821 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 20 Jan 2023 20:31:51 +0000 Subject: [PATCH 03/10] ENH: Avoid copying unnecessary columns in indexing for CoW --- pandas/core/internals/blocks.py | 2 ++ pandas/core/internals/managers.py | 29 ++++++++++++++++------ pandas/tests/copy_view/test_indexing.py | 32 +++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4bb4882574228..08573267e7a4f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1328,6 +1328,8 @@ def delete(self, loc) -> list[Block]: values = np.delete(values, loc) mgr_locs = self._mgr_locs.delete(loc) return [type(self)(values, placement=mgr_locs, ndim=self.ndim)] + elif self.values.ndim == 1: + return [] if np.max(loc) >= self.values.shape[0]: raise IndexError diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1a66eb812d0a7..ea737d7c0bbdd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1325,8 +1325,15 @@ def _iset_split_block( first_nb = nbs_tup[0] nbs_tup = tuple(nbs_tup[1:]) - if self.refs is not None: - self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) + if using_copy_on_write(): + if self.refs is not None: + self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) + else: + # This can happen if we have inplace=True + new_ref = weakref.ref(self.blocks[blkno_l]) + self.refs = [None] * len(self.blocks) + [new_ref] * len(nbs_tup) + if value is None: + self.refs[blkno_l] = new_ref if value is not None: # Only clear if we set new values @@ -1338,6 +1345,10 @@ def _iset_split_block( ) self.blocks = blocks_tup + if not nbs_tup and value is not None: + # No need to update anything if split did not happen + return + self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) for i, nb in enumerate(nbs_tup): @@ -1383,12 +1394,16 @@ def column_setitem( intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) """ if using_copy_on_write() and not self._has_no_reference(loc): - # otherwise perform Copy-on-Write and clear the reference blkno = self.blknos[loc] - blocks = list(self.blocks) - blocks[blkno] = blocks[blkno].copy() - self.blocks = tuple(blocks) - self._clear_reference_block(blkno) + # Split blocks to only copy the column we want to modify + blk_loc = self.blklocs[loc] + # Copy our values + values = self.blocks[blkno].values + if values.ndim == 1: + values = values.copy() + else: + values = values[[blk_loc]] + self._iset_split_block(blkno, [blk_loc], values) # this manager is only created temporarily to mutate the values in place # so don't track references, otherwise the `setitem` would perform CoW again diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index df6b83518eaff..bfa8ac62eff63 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -883,3 +883,35 @@ def test_dataframe_add_column_from_series(): df.loc[2, "new"] = 100 expected_s = Series([0, 11, 12]) tm.assert_series_equal(s, expected_s) + + +@pytest.mark.parametrize("val", [100, "a"]) +@pytest.mark.parametrize( + "method", + [ + lambda df: df.loc[0, "a"], + lambda df: df.iloc[0, 0], + lambda df: df.loc[[0], "a"], + lambda df: df.iloc[[0], 0], + lambda df: df.loc[:, "a"], + lambda df: df.iloc[:, 0], + ], +) +def test_set_value_loc_copy_only_necessary_column(using_copy_on_write, method, val): + # Only copy column that is modified, multi-block only for now + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1.5}) + df_orig = df.copy() + view = df[:] + + df.loc[0, "a"] = val + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "c"), get_array(view, "c")) + if val == "a": + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + else: + assert np.shares_memory(get_array(df, "a"), get_array(view, "a")) From 33eb278f4c3be0b54987e05694b960565edad0b5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 27 Jan 2023 20:22:30 -0500 Subject: [PATCH 04/10] Fix mypy --- pandas/core/internals/managers.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9f84bcb06c82c..0fdd534f991f9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1299,7 +1299,10 @@ def value_getitem(placement): self._known_consolidated = False def _iset_split_block( - self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None + self, + blkno_l: int, + blk_locs: np.ndarray | list[int], + value: ArrayLike | None = None, ) -> None: """Removes columns from a block by splitting the block. @@ -1320,12 +1323,8 @@ def _iset_split_block( nbs_tup = tuple(blk.delete(blk_locs)) if value is not None: - # error: No overload variant of "__getitem__" of "BlockPlacement" matches - # argument type "ndarray[Any, Any]" [call-overload] - first_nb = new_block_2d( - value, - BlockPlacement(blk.mgr_locs[blk_locs]), # type: ignore[call-overload] - ) + locs = blk.mgr_locs.as_array[blk_locs] + first_nb = new_block_2d(value, BlockPlacement(locs)) else: first_nb = nbs_tup[0] nbs_tup = tuple(nbs_tup[1:]) From 331232e609a63f730cccfe68e0fcf09b0814fc83 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 10:45:30 +0100 Subject: [PATCH 05/10] Merge new ref tracking --- pandas/core/internals/managers.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 14578634445c7..e831b3d85ba3c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1265,21 +1265,6 @@ def _iset_split_block( first_nb = nbs_tup[0] nbs_tup = tuple(nbs_tup[1:]) - if using_copy_on_write(): - if self.refs is not None: - self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) - else: - pass - # This can happen if we have inplace=True - # new_ref = weakref.ref(self.blocks[blkno_l]) - # self.refs = [None] * len(self.blocks) + [new_ref] * len(nbs_tup) - # if value is None: - # self.refs[blkno_l] = new_ref - - if value is not None: - # Only clear if we set new values - self._clear_reference_block(blkno_l) - nr_blocks = len(self.blocks) blocks_tup = ( self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup From cae0170d7f147c47a6199437350101699011e606 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 16:38:47 +0100 Subject: [PATCH 06/10] Remove elif --- pandas/core/internals/blocks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5ad0fc226807c..15abc143cd081 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1335,8 +1335,6 @@ def delete(self, loc) -> list[Block]: values = np.delete(values, loc) mgr_locs = self._mgr_locs.delete(loc) return [type(self)(values, placement=mgr_locs, ndim=self.ndim)] - elif self.values.ndim == 1: - return [] if np.max(loc) >= self.values.shape[0]: raise IndexError From 26f132240434eb7659e2297b12ce88429a245235 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 12:15:53 +0100 Subject: [PATCH 07/10] Fix --- pandas/core/internals/blocks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e66011acb978b..1f55a132d665b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1605,6 +1605,9 @@ def delete(self, loc) -> list[Block]: values = self.values.delete(loc) mgr_locs = self._mgr_locs.delete(loc) return [type(self)(values, placement=mgr_locs, ndim=self.ndim)] + elif self.values.ndim == 1: + # We get here through to_stata + return [] return super().delete(loc) @cache_readonly From 697ddb80b73d9efa7a4edc6434cb8c41c2344bbb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 12:21:17 +0100 Subject: [PATCH 08/10] Clean up --- pandas/core/internals/managers.py | 1 + pandas/tests/copy_view/test_indexing.py | 24 ++++++++++++++---------- pandas/tests/indexing/test_iloc.py | 2 ++ 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6426dc3d35f0d..36a3cb5a3f3e6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1341,6 +1341,7 @@ def column_setitem( if values.ndim == 1: values = values.copy() else: + # Use [blk_loc] as indexer to keep ndim=2 values = values[[blk_loc]] self._iset_split_block(blkno, [blk_loc], values) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index bfa8ac62eff63..24ecbb21b54ff 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -887,23 +887,27 @@ def test_dataframe_add_column_from_series(): @pytest.mark.parametrize("val", [100, "a"]) @pytest.mark.parametrize( - "method", + "indexer_func, indexer", [ - lambda df: df.loc[0, "a"], - lambda df: df.iloc[0, 0], - lambda df: df.loc[[0], "a"], - lambda df: df.iloc[[0], 0], - lambda df: df.loc[:, "a"], - lambda df: df.iloc[:, 0], + (tm.loc, (0, "a")), + (tm.iloc, (0, 0)), + (tm.loc, ([0], "a")), + (tm.iloc, ([0], 0)), + (tm.loc, (slice(None), "a")), + (tm.iloc, (slice(None), 0)), ], ) -def test_set_value_loc_copy_only_necessary_column(using_copy_on_write, method, val): - # Only copy column that is modified, multi-block only for now +def test_set_value_copy_only_necessary_column( + using_copy_on_write, indexer_func, indexer, val +): + # When setting inplace, only copy column that is modified instead of the whole + # block (by splitting the block) + # TODO multi-block only for now df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1.5}) df_orig = df.copy() view = df[:] - df.loc[0, "a"] = val + indexer_func(df)[indexer] = val if using_copy_on_write: assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 27dd16172b992..55b96bac6609b 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -13,6 +13,7 @@ from pandas.errors import IndexingError import pandas.util._test_decorators as td +import pandas as pd from pandas import ( NA, Categorical, @@ -486,6 +487,7 @@ def test_iloc_setitem_pandas_object(self): tm.assert_series_equal(s, expected) def test_iloc_setitem_dups(self): + pd.options.mode.copy_on_write = True # GH 6766 # iloc with a mask aligning from another iloc From 6b6eb63c07f2529aced1a2041f2af05adc54ac5e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 12:22:03 +0100 Subject: [PATCH 09/10] Clean --- pandas/tests/copy_view/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 24ecbb21b54ff..a673d8b37a008 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -903,7 +903,7 @@ def test_set_value_copy_only_necessary_column( # When setting inplace, only copy column that is modified instead of the whole # block (by splitting the block) # TODO multi-block only for now - df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1.5}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() view = df[:] From fb623cce8d0a937ab7cca7b45640074eb6671ded Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 16:26:46 +0100 Subject: [PATCH 10/10] Fix --- pandas/core/internals/managers.py | 3 ++- pandas/tests/indexing/test_iloc.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 36a3cb5a3f3e6..f003e5eb6a052 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1341,7 +1341,8 @@ def column_setitem( if values.ndim == 1: values = values.copy() else: - # Use [blk_loc] as indexer to keep ndim=2 + # Use [blk_loc] as indexer to keep ndim=2, this already results in a + # copy values = values[[blk_loc]] self._iset_split_block(blkno, [blk_loc], values) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 55b96bac6609b..27dd16172b992 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -13,7 +13,6 @@ from pandas.errors import IndexingError import pandas.util._test_decorators as td -import pandas as pd from pandas import ( NA, Categorical, @@ -487,7 +486,6 @@ def test_iloc_setitem_pandas_object(self): tm.assert_series_equal(s, expected) def test_iloc_setitem_dups(self): - pd.options.mode.copy_on_write = True # GH 6766 # iloc with a mask aligning from another iloc