diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1a0aba0778da5..71d8b20f18457 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1220,13 +1220,7 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - nbs_tup = tuple(blk.delete(blk_locs)) - first_nb = new_block_2d( - value_getitem(val_locs), BlockPlacement(blk.mgr_locs[blk_locs]) - ) - if self.refs is not None: - self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) - self._clear_reference_block(blkno_l) + self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs)) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) continue @@ -1239,24 +1233,8 @@ def value_getitem(placement): removed_blknos.append(blkno_l) continue else: - nbs = blk.delete(blk_locs) - # Add first block where old block was and remaining blocks at - # the end to avoid updating all block numbers - first_nb = nbs[0] - nbs_tup = tuple(nbs[1:]) - nr_blocks = len(self.blocks) - blocks_tup = ( - self.blocks[:blkno_l] - + (first_nb,) - + self.blocks[blkno_l + 1 :] - + nbs_tup - ) - self.blocks = blocks_tup - self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) - - for i, nb in enumerate(nbs_tup): - self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) - self._blknos[nb.mgr_locs.indexer] = i + nr_blocks + # Defer setting the new values to enable consolidation + self._iset_split_block(blkno_l, blk_locs) if len(removed_blknos): # Remove blocks & update blknos and refs accordingly @@ -1320,6 +1298,57 @@ def value_getitem(placement): # Newly created block's dtype may already be present. self._known_consolidated = False + def _iset_split_block( + self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None + ) -> None: + """Removes columns from a block by splitting the block. 
+ + Avoids copying the whole block through slicing and updates the manager + after determining the new block structure. Optionally adds a new block + holding ``value``; otherwise adding it has to be done by the caller. + + Parameters + ---------- + blkno_l: The block number to operate on, relevant for updating the manager + blk_locs: The locations of our block that should be deleted. + value: The value to set as a replacement; if None, no new block is created. + """ + blk = self.blocks[blkno_l] + + if self._blklocs is None: + self._rebuild_blknos_and_blklocs() + + nbs_tup = tuple(blk.delete(blk_locs)) + if value is not None: + # error: No overload variant of "__getitem__" of "BlockPlacement" matches + # argument type "ndarray[Any, Any]" [call-overload] + first_nb = new_block_2d( + value, + BlockPlacement(blk.mgr_locs[blk_locs]), # type: ignore[call-overload] + ) + else: + first_nb = nbs_tup[0] + nbs_tup = tuple(nbs_tup[1:]) + + if self.refs is not None: + self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) + + if value is not None: + # Only clear if we set new values + self._clear_reference_block(blkno_l) + + nr_blocks = len(self.blocks) + blocks_tup = ( + self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup + ) + self.blocks = blocks_tup + + self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) + + for i, nb in enumerate(nbs_tup): + self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + self._blknos[nb.mgr_locs.indexer] = i + nr_blocks + def _iset_single( self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block ) -> None: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index c253c4bd1ca53..d5702a545c0d8 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -9,6 +9,7 @@ import pytest from pandas._libs.internals import BlockPlacement +from pandas.compat import IS64 import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_scalar @@ -883,6 +884,30 @@ 
def test_validate_bool_args(self, value): with pytest.raises(ValueError, match=msg): bm1.replace_list([1], [2], inplace=value) + def test_iset_split_block(self): + bm = create_mgr("a,b,c: i8; d: f8") + bm._iset_split_block(0, np.array([0])) + tm.assert_numpy_array_equal( + bm.blklocs, np.array([0, 0, 1, 0], dtype="int64" if IS64 else "int32") + ) + # First indexer has no block associated with it since no value was passed + tm.assert_numpy_array_equal( + bm.blknos, np.array([0, 0, 0, 1], dtype="int64" if IS64 else "int32") + ) + assert len(bm.blocks) == 2 + + def test_iset_split_block_values(self): + bm = create_mgr("a,b,c: i8; d: f8") + bm._iset_split_block(0, np.array([0]), np.array([list(range(10))])) + tm.assert_numpy_array_equal( + bm.blklocs, np.array([0, 0, 1, 0], dtype="int64" if IS64 else "int32") + ) + # First indexer points at the new value block; the rest moved to the end + tm.assert_numpy_array_equal( + bm.blknos, np.array([0, 2, 2, 1], dtype="int64" if IS64 else "int32") + ) + assert len(bm.blocks) == 3 + def _as_array(mgr): if mgr.ndim == 1: