Skip to content

Commit e7e3676

Browse files
authored
PERF: Split blocks in blk.delete (#50148)
* PERF: Split blocks in blk.delete
* Add asv
* Add whatsnew
* Add comment
* Fix and adjust tests
* Fix mypy
* Remove comment
* Fix formatting
* Refactor
* Fix test
* Fix mypy
* Fixup
1 parent b513776 commit e7e3676

File tree

6 files changed

+102
-29
lines changed

6 files changed

+102
-29
lines changed

asv_bench/benchmarks/indexing.py

+13
Original file line number | Diff line number | Diff line change
@@ -476,6 +476,19 @@ def time_assign_list_of_columns_concat(self):
476476
concat([self.df, df], axis=1)
477477

478478

479+
class Setitem:
480+
def setup(self):
481+
N = 500_000
482+
cols = 500
483+
self.df = DataFrame(np.random.rand(N, cols))
484+
485+
def time_setitem(self):
486+
self.df[100] = 100
487+
488+
def time_setitem_list(self):
489+
self.df[[100, 200, 300]] = 100
490+
491+
479492
class ChainIndexing:
480493

481494
params = [None, "warn"]

doc/source/whatsnew/v2.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -785,6 +785,7 @@ Performance improvements
785785
- Performance improvement when parsing strings to :class:`BooleanDtype` (:issue:`50613`)
786786
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
787787
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
788+
- Performance improvement in :meth:`DataFrame.__setitem__` (:issue:`46267`)
788789
- Performance improvement in ``var`` and ``std`` for nullable dtypes (:issue:`48379`).
789790
- Performance improvement when iterating over pyarrow and nullable dtypes (:issue:`49825`, :issue:`49851`)
790791
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)

pandas/core/internals/blocks.py

+47-13
Original file line number | Diff line number | Diff line change
@@ -1315,11 +1315,48 @@ def quantile(
13151315
# ---------------------------------------------------------------------
13161316
# Abstract Methods Overridden By EABackedBlock and NumpyBlock
13171317

1318-
def delete(self, loc) -> Block:
1319-
"""
1320-
Return a new Block with the given loc(s) deleted.
1318+
def delete(self, loc) -> list[Block]:
1319+
"""Deletes the locs from the block.
1320+
1321+
We split the block to avoid copying the underlying data. We create new
1322+
blocks for every connected segment of the initial block that is not deleted.
1323+
The new blocks point to the initial array.
13211324
"""
1322-
raise AbstractMethodError(self)
1325+
if not is_list_like(loc):
1326+
loc = [loc]
1327+
1328+
if self.ndim == 1:
1329+
values = cast(np.ndarray, self.values)
1330+
values = np.delete(values, loc)
1331+
mgr_locs = self._mgr_locs.delete(loc)
1332+
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
1333+
1334+
if np.max(loc) >= self.values.shape[0]:
1335+
raise IndexError
1336+
1337+
# Add one out-of-bounds indexer as maximum to collect
1338+
# all columns after our last indexer if any
1339+
loc = np.concatenate([loc, [self.values.shape[0]]])
1340+
mgr_locs_arr = self._mgr_locs.as_array
1341+
new_blocks: list[Block] = []
1342+
1343+
previous_loc = -1
1344+
for idx in loc:
1345+
1346+
if idx == previous_loc + 1:
1347+
# There is no column between current and last idx
1348+
pass
1349+
else:
1350+
# No overload variant of "__getitem__" of "ExtensionArray" matches
1351+
# argument type "Tuple[slice, slice]"
1352+
values = self.values[previous_loc + 1 : idx, :] # type: ignore[call-overload] # noqa
1353+
locs = mgr_locs_arr[previous_loc + 1 : idx]
1354+
nb = type(self)(values, placement=BlockPlacement(locs), ndim=self.ndim)
1355+
new_blocks.append(nb)
1356+
1357+
previous_loc = idx
1358+
1359+
return new_blocks
13231360

13241361
@property
13251362
def is_view(self) -> bool:
@@ -1532,11 +1569,13 @@ def putmask(self, mask, new) -> list[Block]:
15321569

15331570
return [self]
15341571

1535-
def delete(self, loc) -> Block:
1572+
def delete(self, loc) -> list[Block]:
15361573
# This will be unnecessary if/when __array_function__ is implemented
1537-
values = self.values.delete(loc)
1538-
mgr_locs = self._mgr_locs.delete(loc)
1539-
return type(self)(values, placement=mgr_locs, ndim=self.ndim)
1574+
if self.ndim == 1:
1575+
values = self.values.delete(loc)
1576+
mgr_locs = self._mgr_locs.delete(loc)
1577+
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
1578+
return super().delete(loc)
15401579

15411580
@cache_readonly
15421581
def array_values(self) -> ExtensionArray:
@@ -1851,11 +1890,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
18511890
def values_for_json(self) -> np.ndarray:
18521891
return self.values
18531892

1854-
def delete(self, loc) -> Block:
1855-
values = np.delete(self.values, loc, 0)
1856-
mgr_locs = self._mgr_locs.delete(loc)
1857-
return type(self)(values, placement=mgr_locs, ndim=self.ndim)
1858-
18591893

18601894
class NumericBlock(NumpyBlock):
18611895
__slots__ = ()

pandas/core/internals/managers.py

+16-6
Original file line number | Diff line number | Diff line change
@@ -1235,14 +1235,24 @@ def value_getitem(placement):
12351235
if len(val_locs) == len(blk.mgr_locs):
12361236
removed_blknos.append(blkno_l)
12371237
else:
1238-
nb = blk.delete(blk_locs)
1238+
nbs = blk.delete(blk_locs)
1239+
# Add first block where old block was and remaining blocks at
1240+
# the end to avoid updating all block numbers
1241+
first_nb = nbs[0]
1242+
nbs_tup = tuple(nbs[1:])
1243+
nr_blocks = len(self.blocks)
12391244
blocks_tup = (
1240-
self.blocks[:blkno_l] + (nb,) + self.blocks[blkno_l + 1 :]
1245+
self.blocks[:blkno_l]
1246+
+ (first_nb,)
1247+
+ self.blocks[blkno_l + 1 :]
1248+
+ nbs_tup
12411249
)
12421250
self.blocks = blocks_tup
1243-
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
1244-
# blk.delete gives a copy, so we can remove a possible reference
1245-
self._clear_reference_block(blkno_l)
1251+
self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))
1252+
1253+
for i, nb in enumerate(nbs_tup):
1254+
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
1255+
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
12461256

12471257
if len(removed_blknos):
12481258
# Remove blocks & update blknos and refs accordingly
@@ -2034,7 +2044,7 @@ def idelete(self, indexer) -> SingleBlockManager:
20342044
20352045
Ensures that self.blocks doesn't become empty.
20362046
"""
2037-
nb = self._block.delete(indexer)
2047+
nb = self._block.delete(indexer)[0]
20382048
self.blocks = (nb,)
20392049
self.axes[0] = self.axes[0].delete(indexer)
20402050
self._cache.clear()

pandas/tests/frame/test_stack_unstack.py

+9-2
Original file line number | Diff line number | Diff line change
@@ -1325,8 +1325,15 @@ def test_unstack_non_slice_like_blocks(using_array_manager):
13251325
# Case where the mgr_locs of a DataFrame's underlying blocks are not slice-like
13261326

13271327
mi = MultiIndex.from_product([range(5), ["A", "B", "C"]])
1328-
df = DataFrame(np.random.randn(15, 4), index=mi)
1329-
df[1] = df[1].astype(np.int64)
1328+
df = DataFrame(
1329+
{
1330+
0: np.random.randn(15),
1331+
1: np.random.randn(15).astype(np.int64),
1332+
2: np.random.randn(15),
1333+
3: np.random.randn(15),
1334+
},
1335+
index=mi,
1336+
)
13301337
if not using_array_manager:
13311338
assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks)
13321339

pandas/tests/internals/test_internals.py

+16-8
Original file line number | Diff line number | Diff line change
@@ -285,7 +285,7 @@ def test_copy(self, fblock):
285285
def test_delete(self, fblock):
286286
newb = fblock.copy()
287287
locs = newb.mgr_locs
288-
nb = newb.delete(0)
288+
nb = newb.delete(0)[0]
289289
assert newb.mgr_locs is locs
290290

291291
assert nb is not newb
@@ -299,21 +299,25 @@ def test_delete(self, fblock):
299299
newb = fblock.copy()
300300
locs = newb.mgr_locs
301301
nb = newb.delete(1)
302+
assert len(nb) == 2
302303
assert newb.mgr_locs is locs
303304

304305
tm.assert_numpy_array_equal(
305-
nb.mgr_locs.as_array, np.array([0, 4], dtype=np.intp)
306+
nb[0].mgr_locs.as_array, np.array([0], dtype=np.intp)
307+
)
308+
tm.assert_numpy_array_equal(
309+
nb[1].mgr_locs.as_array, np.array([4], dtype=np.intp)
306310
)
307311
assert not (newb.values[1] == 2).all()
308-
assert (nb.values[1] == 2).all()
312+
assert (nb[1].values[0] == 2).all()
309313

310314
newb = fblock.copy()
311-
locs = newb.mgr_locs
312315
nb = newb.delete(2)
316+
assert len(nb) == 1
313317
tm.assert_numpy_array_equal(
314-
nb.mgr_locs.as_array, np.array([0, 2], dtype=np.intp)
318+
nb[0].mgr_locs.as_array, np.array([0, 2], dtype=np.intp)
315319
)
316-
assert (nb.values[1] == 1).all()
320+
assert (nb[0].values[1] == 1).all()
317321

318322
newb = fblock.copy()
319323

@@ -328,14 +332,18 @@ def test_delete_datetimelike(self):
328332
assert isinstance(blk.values, TimedeltaArray)
329333

330334
nb = blk.delete(1)
331-
assert isinstance(nb.values, TimedeltaArray)
335+
assert len(nb) == 2
336+
assert isinstance(nb[0].values, TimedeltaArray)
337+
assert isinstance(nb[1].values, TimedeltaArray)
332338

333339
df = DataFrame(arr.view("M8[ns]"))
334340
blk = df._mgr.blocks[0]
335341
assert isinstance(blk.values, DatetimeArray)
336342

337343
nb = blk.delete([1, 3])
338-
assert isinstance(nb.values, DatetimeArray)
344+
assert len(nb) == 2
345+
assert isinstance(nb[0].values, DatetimeArray)
346+
assert isinstance(nb[1].values, DatetimeArray)
339347

340348
def test_split(self):
341349
# GH#37799

0 commit comments

Comments
 (0)