Skip to content

Commit 203f901

Browse files
authored
PERF: repeated slicing along index in groupby (#40353)
1 parent b8b3fbf commit 203f901

File tree

7 files changed

+103
-26
lines changed

pandas/core/arrays/sparse/array.py

+5
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,11 @@ def value_counts(self, dropna: bool = True):
817817
def __getitem__(self, key):
818818

819819
if isinstance(key, tuple):
820+
if len(key) > 1:
821+
if key[0] is Ellipsis:
822+
key = key[1:]
823+
elif key[-1] is Ellipsis:
824+
key = key[:-1]
820825
if len(key) > 1:
821826
raise IndexError("too many indices for array.")
822827
key = key[0]

pandas/core/arrays/string_arrow.py

+9
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,15 @@ def __getitem__(self, item: Any) -> Any:
354354
"Only integers, slices and integer or "
355355
"boolean arrays are valid indices."
356356
)
357+
elif isinstance(item, tuple):
358+
# possibly unpack arr[..., n] to arr[n]
359+
if len(item) == 1:
360+
item = item[0]
361+
elif len(item) == 2:
362+
if item[0] is Ellipsis:
363+
item = item[1]
364+
elif item[1] is Ellipsis:
365+
item = item[0]
357366

358367
# We are not an array indexer, so maybe e.g. a slice or integer
359368
# indexer. We dispatch to pyarrow.

pandas/core/internals/blocks.py

+29 -6
Original file line numberDiff line numberDiff line change
@@ -309,18 +309,41 @@ def _slice(self, slicer):
309309
return self.values[slicer]
310310

311311
@final
312-
def getitem_block(self, slicer, new_mgr_locs=None) -> Block:
312+
def getitem_block(self, slicer) -> Block:
313313
"""
314314
Perform __getitem__-like, return result as block.
315315
316316
Only supports slices that preserve dimensionality.
317317
"""
318-
if new_mgr_locs is None:
319-
axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer
320-
new_mgr_locs = self._mgr_locs[axis0_slicer]
321-
elif not isinstance(new_mgr_locs, BlockPlacement):
322-
new_mgr_locs = BlockPlacement(new_mgr_locs)
318+
axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer
319+
new_mgr_locs = self._mgr_locs[axis0_slicer]
320+
321+
new_values = self._slice(slicer)
322+
323+
if new_values.ndim != self.values.ndim:
324+
raise ValueError("Only same dim slicing is allowed")
325+
326+
return type(self)._simple_new(new_values, new_mgr_locs, self.ndim)
323327

328+
@final
329+
def getitem_block_index(self, slicer: slice) -> Block:
330+
"""
331+
Perform __getitem__-like specialized to slicing along index.
332+
333+
Assumes self.ndim == 2
334+
"""
335+
# error: Invalid index type "Tuple[ellipsis, slice]" for
336+
# "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]"
337+
new_values = self.values[..., slicer] # type: ignore[index]
338+
return type(self)._simple_new(new_values, self._mgr_locs, ndim=self.ndim)
339+
340+
@final
341+
def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block:
342+
"""
343+
Perform __getitem__-like, return result as block.
344+
345+
Only supports slices that preserve dimensionality.
346+
"""
324347
new_values = self._slice(slicer)
325348

326349
if new_values.ndim != self.values.ndim:

pandas/core/internals/managers.py

+11 -6
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
internals as libinternals,
2424
lib,
2525
)
26+
from pandas._libs.internals import BlockPlacement
2627
from pandas._typing import (
2728
ArrayLike,
2829
Dtype,
@@ -801,8 +802,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
801802
if axis == 0:
802803
new_blocks = self._slice_take_blocks_ax0(slobj)
803804
elif axis == 1:
804-
slicer = (slice(None), slobj)
805-
new_blocks = [blk.getitem_block(slicer) for blk in self.blocks]
805+
new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks]
806806
else:
807807
raise IndexError("Requested axis not found in manager")
808808

@@ -1396,7 +1396,8 @@ def _slice_take_blocks_ax0(
13961396
# TODO(EA2D): special casing unnecessary with 2D EAs
13971397
if sllen == 0:
13981398
return []
1399-
return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
1399+
bp = BlockPlacement(slice(0, sllen))
1400+
return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)]
14001401
elif not allow_fill or self.ndim == 1:
14011402
if allow_fill and fill_value is None:
14021403
fill_value = blk.fill_value
@@ -1405,7 +1406,9 @@ def _slice_take_blocks_ax0(
14051406
# GH#33597 slice instead of take, so we get
14061407
# views instead of copies
14071408
blocks = [
1408-
blk.getitem_block(slice(ml, ml + 1), new_mgr_locs=i)
1409+
blk.getitem_block_columns(
1410+
slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i)
1411+
)
14091412
for i, ml in enumerate(slobj)
14101413
]
14111414
# We have
@@ -1465,13 +1468,15 @@ def _slice_take_blocks_ax0(
14651468
taker = lib.maybe_indices_to_slice(taker, max_len)
14661469

14671470
if isinstance(taker, slice):
1468-
nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs)
1471+
nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
14691472
blocks.append(nb)
14701473
elif only_slice:
14711474
# GH#33597 slice instead of take, so we get
14721475
# views instead of copies
14731476
for i, ml in zip(taker, mgr_locs):
1474-
nb = blk.getitem_block(slice(i, i + 1), new_mgr_locs=ml)
1477+
slc = slice(i, i + 1)
1478+
bp = BlockPlacement(ml)
1479+
nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
14751480
# We have np.shares_memory(nb.values, blk.values)
14761481
blocks.append(nb)
14771482
else:

pandas/tests/extension/base/getitem.py

+20
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,26 @@ def test_getitem_slice(self, data):
245245
result = data[slice(1)] # scalar
246246
assert isinstance(result, type(data))
247247

248+
def test_getitem_ellipsis_and_slice(self, data):
249+
# GH#40353 this is called from getitem_block_index
250+
result = data[..., :]
251+
self.assert_extension_array_equal(result, data)
252+
253+
result = data[:, ...]
254+
self.assert_extension_array_equal(result, data)
255+
256+
result = data[..., :3]
257+
self.assert_extension_array_equal(result, data[:3])
258+
259+
result = data[:3, ...]
260+
self.assert_extension_array_equal(result, data[:3])
261+
262+
result = data[..., ::2]
263+
self.assert_extension_array_equal(result, data[::2])
264+
265+
result = data[::2, ...]
266+
self.assert_extension_array_equal(result, data[::2])
267+
248268
def test_get(self, data):
249269
# GH 20882
250270
s = pd.Series(data, index=[2 * i for i in range(len(data))])

pandas/tests/extension/json/array.py

+10
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,16 @@ def _from_factorized(cls, values, original):
8383
return cls([UserDict(x) for x in values if x != ()])
8484

8585
def __getitem__(self, item):
86+
if isinstance(item, tuple):
87+
if len(item) > 1:
88+
if item[0] is Ellipsis:
89+
item = item[1:]
90+
elif item[-1] is Ellipsis:
91+
item = item[:-1]
92+
if len(item) > 1:
93+
raise IndexError("too many indices for array.")
94+
item = item[0]
95+
8696
if isinstance(item, numbers.Integral):
8797
return self.data[item]
8898
elif isinstance(item, slice) and item == slice(None):

pandas/tests/internals/test_internals.py

+19 -14
Original file line numberDiff line numberDiff line change
@@ -848,22 +848,27 @@ def assert_slice_ok(mgr, axis, slobj):
848848
assert_slice_ok(mgr, ax, slice(1, 4))
849849
assert_slice_ok(mgr, ax, slice(3, 0, -2))
850850

851-
# boolean mask
852-
assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_))
853-
assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_))
854-
assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_))
855-
856-
if mgr.shape[ax] >= 3:
857-
assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0)
858-
assert_slice_ok(mgr, ax, np.array([True, True, False], dtype=np.bool_))
851+
if mgr.ndim < 2:
852+
# 2D only support slice objects
853+
854+
# boolean mask
855+
assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_))
856+
assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_))
857+
assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_))
858+
859+
if mgr.shape[ax] >= 3:
860+
assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0)
861+
assert_slice_ok(
862+
mgr, ax, np.array([True, True, False], dtype=np.bool_)
863+
)
859864

860-
# fancy indexer
861-
assert_slice_ok(mgr, ax, [])
862-
assert_slice_ok(mgr, ax, list(range(mgr.shape[ax])))
865+
# fancy indexer
866+
assert_slice_ok(mgr, ax, [])
867+
assert_slice_ok(mgr, ax, list(range(mgr.shape[ax])))
863868

864-
if mgr.shape[ax] >= 3:
865-
assert_slice_ok(mgr, ax, [0, 1, 2])
866-
assert_slice_ok(mgr, ax, [-1, -2, -3])
869+
if mgr.shape[ax] >= 3:
870+
assert_slice_ok(mgr, ax, [0, 1, 2])
871+
assert_slice_ok(mgr, ax, [-1, -2, -3])
867872

868873
@pytest.mark.parametrize("mgr", MANAGERS)
869874
def test_take(self, mgr):

0 commit comments

Comments
 (0)