Skip to content

PERF: Split blocks in blk.delete #50148

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jan 9, 2023
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,19 @@ def time_assign_list_of_columns_concat(self):
concat([self.df, df], axis=1)


class Setitem:
def setup(self):
N = 500_000
cols = 500
self.df = DataFrame(np.random.rand(N, cols))

def time_setitem(self):
self.df[100] = 100

def time_setitem_list(self):
self.df[[100, 200, 300]] = 100


class ChainIndexing:

params = [None, "warn"]
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,7 @@ Performance improvements
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
- Performance improvement in :meth:`DataFrame.__setitem__` (:issue:`46267`)
- Performance improvement in ``var`` and ``std`` for nullable dtypes (:issue:`48379`).
- Performance improvement when iterating over pyarrow and nullable dtypes (:issue:`49825`, :issue:`49851`)
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
Expand Down
60 changes: 47 additions & 13 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1315,11 +1315,48 @@ def quantile(
# ---------------------------------------------------------------------
# Abstract Methods Overridden By EABackedBlock and NumpyBlock

def delete(self, loc) -> Block:
"""
Return a new Block with the given loc(s) deleted.
def delete(self, loc) -> list[Block]:
"""Deletes the locs from the block.

We split the block to avoid copying the underlying data. We create new
blocks for every connected segment of the initial block that is not deleted.
The new blocks point to the initial array.
"""
raise AbstractMethodError(self)
if not is_list_like(loc):
loc = [loc]

if self.ndim == 1:
values = cast(np.ndarray, self.values)
values = np.delete(values, loc)
mgr_locs = self._mgr_locs.delete(loc)
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]

if np.max(loc) >= self.values.shape[0]:
raise IndexError

# Add one out-of-bounds indexer as maximum to collect
# all columns after our last indexer if any
loc = np.concatenate([loc, [self.values.shape[0]]])
mgr_locs_arr = self._mgr_locs.as_array
new_blocks: list[Block] = []

previous_loc = -1
for idx in loc:

if idx == previous_loc + 1:
# There is no column between current and last idx
pass
else:
# No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[slice, slice]"
values = self.values[previous_loc + 1 : idx, :] # type: ignore[call-overload] # noqa
locs = mgr_locs_arr[previous_loc + 1 : idx]
nb = type(self)(values, placement=BlockPlacement(locs), ndim=self.ndim)
new_blocks.append(nb)

previous_loc = idx

return new_blocks

@property
def is_view(self) -> bool:
Expand Down Expand Up @@ -1532,11 +1569,13 @@ def putmask(self, mask, new) -> list[Block]:

return [self]

def delete(self, loc) -> Block:
def delete(self, loc) -> list[Block]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for 2D EAs we probably want to do the same thing as we do for NumpyBlock?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any 2D EAs? I agree in general, but thought all of them were 1D

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see NDArrayBackedExtensionBlock

# This will be unnecessary if/when __array_function__ is implemented
values = self.values.delete(loc)
mgr_locs = self._mgr_locs.delete(loc)
return type(self)(values, placement=mgr_locs, ndim=self.ndim)
if self.ndim == 1:
values = self.values.delete(loc)
mgr_locs = self._mgr_locs.delete(loc)
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
return super().delete(loc)

@cache_readonly
def array_values(self) -> ExtensionArray:
Expand Down Expand Up @@ -1851,11 +1890,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
def values_for_json(self) -> np.ndarray:
return self.values

def delete(self, loc) -> Block:
values = np.delete(self.values, loc, 0)
mgr_locs = self._mgr_locs.delete(loc)
return type(self)(values, placement=mgr_locs, ndim=self.ndim)


class NumericBlock(NumpyBlock):
__slots__ = ()
Expand Down
22 changes: 16 additions & 6 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1235,14 +1235,24 @@ def value_getitem(placement):
if len(val_locs) == len(blk.mgr_locs):
removed_blknos.append(blkno_l)
else:
nb = blk.delete(blk_locs)
nbs = blk.delete(blk_locs)
# Add first block where old block was and remaining blocks at
# the end to avoid updating all block numbers
first_nb = nbs[0]
nbs_tup = tuple(nbs[1:])
nr_blocks = len(self.blocks)
blocks_tup = (
self.blocks[:blkno_l] + (nb,) + self.blocks[blkno_l + 1 :]
self.blocks[:blkno_l]
+ (first_nb,)
+ self.blocks[blkno_l + 1 :]
+ nbs_tup
)
self.blocks = blocks_tup
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
# blk.delete gives a copy, so we can remove a possible reference
self._clear_reference_block(blkno_l)
self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

for i, nb in enumerate(nbs_tup):
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks

if len(removed_blknos):
# Remove blocks & update blknos and refs accordingly
Expand Down Expand Up @@ -2034,7 +2044,7 @@ def idelete(self, indexer) -> SingleBlockManager:

Ensures that self.blocks doesn't become empty.
"""
nb = self._block.delete(indexer)
nb = self._block.delete(indexer)[0]
self.blocks = (nb,)
self.axes[0] = self.axes[0].delete(indexer)
self._cache.clear()
Expand Down
11 changes: 9 additions & 2 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1325,8 +1325,15 @@ def test_unstack_non_slice_like_blocks(using_array_manager):
# Case where the mgr_locs of a DataFrame's underlying blocks are not slice-like

mi = MultiIndex.from_product([range(5), ["A", "B", "C"]])
df = DataFrame(np.random.randn(15, 4), index=mi)
df[1] = df[1].astype(np.int64)
df = DataFrame(
{
0: np.random.randn(15),
1: np.random.randn(15).astype(np.int64),
2: np.random.randn(15),
3: np.random.randn(15),
},
index=mi,
)
if not using_array_manager:
assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks)

Expand Down
24 changes: 16 additions & 8 deletions pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def test_copy(self, fblock):
def test_delete(self, fblock):
newb = fblock.copy()
locs = newb.mgr_locs
nb = newb.delete(0)
nb = newb.delete(0)[0]
assert newb.mgr_locs is locs

assert nb is not newb
Expand All @@ -299,21 +299,25 @@ def test_delete(self, fblock):
newb = fblock.copy()
locs = newb.mgr_locs
nb = newb.delete(1)
assert len(nb) == 2
assert newb.mgr_locs is locs

tm.assert_numpy_array_equal(
nb.mgr_locs.as_array, np.array([0, 4], dtype=np.intp)
nb[0].mgr_locs.as_array, np.array([0], dtype=np.intp)
)
tm.assert_numpy_array_equal(
nb[1].mgr_locs.as_array, np.array([4], dtype=np.intp)
)
assert not (newb.values[1] == 2).all()
assert (nb.values[1] == 2).all()
assert (nb[1].values[0] == 2).all()

newb = fblock.copy()
locs = newb.mgr_locs
nb = newb.delete(2)
assert len(nb) == 1
tm.assert_numpy_array_equal(
nb.mgr_locs.as_array, np.array([0, 2], dtype=np.intp)
nb[0].mgr_locs.as_array, np.array([0, 2], dtype=np.intp)
)
assert (nb.values[1] == 1).all()
assert (nb[0].values[1] == 1).all()

newb = fblock.copy()

Expand All @@ -328,14 +332,18 @@ def test_delete_datetimelike(self):
assert isinstance(blk.values, TimedeltaArray)

nb = blk.delete(1)
assert isinstance(nb.values, TimedeltaArray)
assert len(nb) == 2
assert isinstance(nb[0].values, TimedeltaArray)
assert isinstance(nb[1].values, TimedeltaArray)

df = DataFrame(arr.view("M8[ns]"))
blk = df._mgr.blocks[0]
assert isinstance(blk.values, DatetimeArray)

nb = blk.delete([1, 3])
assert isinstance(nb.values, DatetimeArray)
assert len(nb) == 2
assert isinstance(nb[0].values, DatetimeArray)
assert isinstance(nb[1].values, DatetimeArray)

def test_split(self):
# GH#37799
Expand Down