Skip to content

Commit 2096725

Browse files
authored
CLN: Refactor block splitting into its own method (pandas-dev#50900)
1 parent 22de62c commit 2096725

File tree

2 files changed

+79
-25
lines changed

2 files changed

+79
-25
lines changed

pandas/core/internals/managers.py

Lines changed: 54 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1220,13 +1220,7 @@ def value_getitem(placement):
12201220
if inplace and blk.should_store(value):
12211221
# Updating inplace -> check if we need to do Copy-on-Write
12221222
if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
1223-
nbs_tup = tuple(blk.delete(blk_locs))
1224-
first_nb = new_block_2d(
1225-
value_getitem(val_locs), BlockPlacement(blk.mgr_locs[blk_locs])
1226-
)
1227-
if self.refs is not None:
1228-
self.refs.extend([self.refs[blkno_l]] * len(nbs_tup))
1229-
self._clear_reference_block(blkno_l)
1223+
self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs))
12301224
else:
12311225
blk.set_inplace(blk_locs, value_getitem(val_locs))
12321226
continue
@@ -1239,24 +1233,8 @@ def value_getitem(placement):
12391233
removed_blknos.append(blkno_l)
12401234
continue
12411235
else:
1242-
nbs = blk.delete(blk_locs)
1243-
# Add first block where old block was and remaining blocks at
1244-
# the end to avoid updating all block numbers
1245-
first_nb = nbs[0]
1246-
nbs_tup = tuple(nbs[1:])
1247-
nr_blocks = len(self.blocks)
1248-
blocks_tup = (
1249-
self.blocks[:blkno_l]
1250-
+ (first_nb,)
1251-
+ self.blocks[blkno_l + 1 :]
1252-
+ nbs_tup
1253-
)
1254-
self.blocks = blocks_tup
1255-
self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))
1256-
1257-
for i, nb in enumerate(nbs_tup):
1258-
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
1259-
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
1236+
# Defer setting the new values to enable consolidation
1237+
self._iset_split_block(blkno_l, blk_locs)
12601238

12611239
if len(removed_blknos):
12621240
# Remove blocks & update blknos and refs accordingly
@@ -1320,6 +1298,57 @@ def value_getitem(placement):
13201298
# Newly created block's dtype may already be present.
13211299
self._known_consolidated = False
13221300

1301+
def _iset_split_block(
    self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None
) -> None:
    """Drop columns from one block by splitting it, then update the manager.

    The block is split via ``Block.delete`` instead of being copied as a
    whole. Once the new block layout is known, ``self.blocks``,
    ``self._blklocs`` and ``self._blknos`` (and ``self.refs``, when present)
    are updated to match. If ``value`` is given, a new block holding it is
    placed where the old block was; otherwise the caller is responsible for
    setting the new values afterwards.

    Parameters
    ----------
    blkno_l: The block number to operate on, relevant for updating the manager
    blk_locs: The locations of our block that should be deleted.
    value: The value to set as a replacement.
    """
    old_blk = self.blocks[blkno_l]

    # The lookup arrays are written to below, so they must exist first.
    if self._blklocs is None:
        self._rebuild_blknos_and_blklocs()

    leftovers = tuple(old_blk.delete(blk_locs))
    if value is not None:
        # The replacement block takes over the slot of the old block and
        # every leftover piece is appended at the end.
        # error: No overload variant of "__getitem__" of "BlockPlacement" matches
        # argument type "ndarray[Any, Any]"  [call-overload]
        placement = BlockPlacement(
            old_blk.mgr_locs[blk_locs]  # type: ignore[call-overload]
        )
        head = new_block_2d(value, placement)
        tail = leftovers
    else:
        # No replacement: the first leftover piece takes over the old slot.
        head, tail = leftovers[0], leftovers[1:]

    # Appended pieces share the reference entry of the block they came from.
    # This has to happen before the entry is (possibly) cleared below.
    if self.refs is not None:
        self.refs.extend([self.refs[blkno_l]] * len(tail))

    if value is not None:
        # Only clear if we set new values
        self._clear_reference_block(blkno_l)

    # Number of blocks before appending; appended pieces are numbered from here.
    nr_blocks = len(self.blocks)
    self.blocks = (
        *self.blocks[:blkno_l],
        head,
        *self.blocks[blkno_l + 1 :],
        *tail,
    )

    # ``head`` keeps block number ``blkno_l``; only its intra-block
    # positions need to be refreshed.
    self._blklocs[head.mgr_locs.indexer] = np.arange(len(head))

    for new_blkno, piece in enumerate(tail, start=nr_blocks):
        piece_cols = piece.mgr_locs.indexer
        self._blklocs[piece_cols] = np.arange(len(piece))
        self._blknos[piece_cols] = new_blkno
1351+
13231352
def _iset_single(
13241353
self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block
13251354
) -> None:

pandas/tests/internals/test_internals.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import pytest
1010

1111
from pandas._libs.internals import BlockPlacement
12+
from pandas.compat import IS64
1213
import pandas.util._test_decorators as td
1314

1415
from pandas.core.dtypes.common import is_scalar
@@ -883,6 +884,30 @@ def test_validate_bool_args(self, value):
883884
with pytest.raises(ValueError, match=msg):
884885
bm1.replace_list([1], [2], inplace=value)
885886

887+
def test_iset_split_block(self):
    """Split block 0 at location 0 without passing a replacement value."""
    mgr = create_mgr("a,b,c: i8; d: f8")
    mgr._iset_split_block(0, np.array([0]))

    index_dtype = "int64" if IS64 else "int32"

    expected_blklocs = np.array([0, 0, 1, 0], dtype=index_dtype)
    tm.assert_numpy_array_equal(mgr.blklocs, expected_blklocs)

    # No value was passed, so the first column has no block associated with
    # it yet; its entry still carries the old block number.
    expected_blknos = np.array([0, 0, 0, 1], dtype=index_dtype)
    tm.assert_numpy_array_equal(mgr.blknos, expected_blknos)

    assert len(mgr.blocks) == 2
898+
899+
def test_iset_split_block_values(self):
    """Split block 0 at location 0 and pass a replacement value."""
    mgr = create_mgr("a,b,c: i8; d: f8")
    replacement = np.array([list(range(10))])
    mgr._iset_split_block(0, np.array([0]), replacement)

    index_dtype = "int64" if IS64 else "int32"

    expected_blklocs = np.array([0, 0, 1, 0], dtype=index_dtype)
    tm.assert_numpy_array_equal(mgr.blklocs, expected_blklocs)

    # The replacement block sits in the old block's slot (0); the columns
    # that were kept end up in a block appended at the end (2).
    expected_blknos = np.array([0, 2, 2, 1], dtype=index_dtype)
    tm.assert_numpy_array_equal(mgr.blknos, expected_blknos)

    assert len(mgr.blocks) == 3
910+
886911

887912
def _as_array(mgr):
888913
if mgr.ndim == 1:

0 commit comments

Comments
 (0)