Skip to content

CLN: Refactor block splitting into its own method #50900

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jan 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 54 additions & 25 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1220,13 +1220,7 @@ def value_getitem(placement):
if inplace and blk.should_store(value):
# Updating inplace -> check if we need to do Copy-on-Write
if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
nbs_tup = tuple(blk.delete(blk_locs))
first_nb = new_block_2d(
value_getitem(val_locs), BlockPlacement(blk.mgr_locs[blk_locs])
)
if self.refs is not None:
self.refs.extend([self.refs[blkno_l]] * len(nbs_tup))
self._clear_reference_block(blkno_l)
self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs))
else:
blk.set_inplace(blk_locs, value_getitem(val_locs))
continue
Expand All @@ -1239,24 +1233,8 @@ def value_getitem(placement):
removed_blknos.append(blkno_l)
continue
else:
nbs = blk.delete(blk_locs)
# Add first block where old block was and remaining blocks at
# the end to avoid updating all block numbers
first_nb = nbs[0]
nbs_tup = tuple(nbs[1:])
nr_blocks = len(self.blocks)
blocks_tup = (
self.blocks[:blkno_l]
+ (first_nb,)
+ self.blocks[blkno_l + 1 :]
+ nbs_tup
)
self.blocks = blocks_tup
self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

for i, nb in enumerate(nbs_tup):
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
# Defer setting the new values to enable consolidation
self._iset_split_block(blkno_l, blk_locs)

if len(removed_blknos):
# Remove blocks & update blknos and refs accordingly
Expand Down Expand Up @@ -1320,6 +1298,57 @@ def value_getitem(placement):
# Newly created block's dtype may already be present.
self._known_consolidated = False

def _iset_split_block(
    self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None
) -> None:
    """
    Drop columns from one block by splitting it, then update the manager.

    The block at ``blkno_l`` is split with ``Block.delete``. One block is
    written back into slot ``blkno_l`` and every other piece is appended to
    the end of ``self.blocks``, so that the block numbers of all other
    existing blocks stay valid.

    Parameters
    ----------
    blkno_l : int
        Number of the block to split; also the slot that receives the
        first resulting block.
    blk_locs : np.ndarray
        Locations inside that block which should be removed from it.
    value : ArrayLike or None, default None
        Replacement values for the removed locations. When given, a new
        block holding ``value`` is placed in slot ``blkno_l`` and all
        pieces of the old block are appended. When omitted, the first
        piece of the old block stays in slot ``blkno_l`` and the caller
        is responsible for adding a block for the removed locations.
    """
    old_block = self.blocks[blkno_l]

    if self._blklocs is None:
        self._rebuild_blknos_and_blklocs()

    pieces = tuple(old_block.delete(blk_locs))
    if value is None:
        # No replacement: the first leftover piece keeps the old slot.
        slot_block = pieces[0]
        appended = pieces[1:]
    else:
        # error: No overload variant of "__getitem__" of "BlockPlacement" matches
        # argument type "ndarray[Any, Any]"  [call-overload]
        removed_placement = BlockPlacement(
            old_block.mgr_locs[blk_locs]  # type: ignore[call-overload]
        )
        slot_block = new_block_2d(value, removed_placement)
        appended = pieces

    if self.refs is not None:
        # Every appended piece inherits the reference of the block it came
        # from; this must happen before that reference is cleared below.
        self.refs.extend([self.refs[blkno_l]] * len(appended))

    if value is not None:
        # Only clear if we set new values
        self._clear_reference_block(blkno_l)

    # Captured before reassigning ``self.blocks``: appended pieces are
    # numbered starting from the old block count.
    first_new_blkno = len(self.blocks)
    self.blocks = (
        self.blocks[:blkno_l]
        + (slot_block,)
        + self.blocks[blkno_l + 1 :]
        + appended
    )

    # ``slot_block`` reuses number ``blkno_l``, so only its in-block
    # positions need refreshing.
    self._blklocs[slot_block.mgr_locs.indexer] = np.arange(len(slot_block))

    for new_blkno, piece in enumerate(appended, start=first_new_blkno):
        piece_indexer = piece.mgr_locs.indexer
        self._blklocs[piece_indexer] = np.arange(len(piece))
        self._blknos[piece_indexer] = new_blkno

def _iset_single(
self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block
) -> None:
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pytest

from pandas._libs.internals import BlockPlacement
from pandas.compat import IS64
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_scalar
Expand Down Expand Up @@ -883,6 +884,30 @@ def test_validate_bool_args(self, value):
with pytest.raises(ValueError, match=msg):
bm1.replace_list([1], [2], inplace=value)

def test_iset_split_block(self):
    # Split the first location out of the i8 block without supplying a
    # replacement value.
    index_dtype = "int64" if IS64 else "int32"
    mgr = create_mgr("a,b,c: i8; d: f8")

    mgr._iset_split_block(0, np.array([0]))

    expected_blklocs = np.array([0, 0, 1, 0], dtype=index_dtype)
    tm.assert_numpy_array_equal(mgr.blklocs, expected_blklocs)

    # No replacement block was set, so the first position currently has no
    # block associated with it; its blknos entry is left at the old value.
    expected_blknos = np.array([0, 0, 0, 1], dtype=index_dtype)
    tm.assert_numpy_array_equal(mgr.blknos, expected_blknos)

    assert len(mgr.blocks) == 2

def test_iset_split_block_values(self):
    # Split the first location out of the i8 block and supply replacement
    # values for it in the same call.
    index_dtype = "int64" if IS64 else "int32"
    mgr = create_mgr("a,b,c: i8; d: f8")
    replacement = np.array([list(range(10))])

    mgr._iset_split_block(0, np.array([0]), replacement)

    expected_blklocs = np.array([0, 0, 1, 0], dtype=index_dtype)
    tm.assert_numpy_array_equal(mgr.blklocs, expected_blklocs)

    # The replacement takes over block number 0, while the two remaining
    # i8 positions are renumbered to the block appended at the end (2).
    expected_blknos = np.array([0, 2, 2, 1], dtype=index_dtype)
    tm.assert_numpy_array_equal(mgr.blknos, expected_blknos)

    assert len(mgr.blocks) == 3


def _as_array(mgr):
if mgr.ndim == 1:
Expand Down