Skip to content

PERF: repeated slicing along index in groupby #40353

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Mar 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
60a8649
CLN: BlockManager.get_slice require only slice arg
jbrockmendel Mar 6, 2021
91436a5
Merge branch 'master' into ref-slice
jbrockmendel Mar 6, 2021
b17ad65
mypy fixup
jbrockmendel Mar 6, 2021
9fa658f
Merge branch 'master' into ref-slice
jbrockmendel Mar 6, 2021
5afce04
PERF: implement Index._getitem_slice
jbrockmendel Mar 6, 2021
36a8530
PERF: implement getitem_block_columns
jbrockmendel Mar 8, 2021
c528119
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 9, 2021
dc5f975
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 10, 2021
14ee965
PERF: repeated slicing along index in groupby
jbrockmendel Mar 10, 2021
f6655ad
mypy fixup
jbrockmendel Mar 10, 2021
21fe008
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 11, 2021
e4eae87
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 11, 2021
ce2dec6
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 11, 2021
8ff5167
mypy fixup
jbrockmendel Mar 11, 2021
2ade2bb
revert
jbrockmendel Mar 12, 2021
c594fd0
comment typo fixup
jbrockmendel Mar 12, 2021
0cefeb2
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 12, 2021
e9d0a92
type:ignore
jbrockmendel Mar 12, 2021
9a4ccc1
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 12, 2021
14497df
TST: EA[..., slc]
jbrockmendel Mar 12, 2021
9a3969f
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 14, 2021
23cf40d
revert get_slice_index
jbrockmendel Mar 14, 2021
12266aa
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 14, 2021
d13ad92
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 15, 2021
e36b7b0
TST: arr[foo, ...]
jbrockmendel Mar 15, 2021
bc0a110
Merge branch 'master' into cln-getitem_block
jbrockmendel Mar 16, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,11 @@ def value_counts(self, dropna: bool = True):
def __getitem__(self, key):

if isinstance(key, tuple):
if len(key) > 1:
if key[0] is Ellipsis:
key = key[1:]
elif key[-1] is Ellipsis:
key = key[:-1]
if len(key) > 1:
raise IndexError("too many indices for array.")
key = key[0]
Expand Down
9 changes: 9 additions & 0 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,15 @@ def __getitem__(self, item: Any) -> Any:
"Only integers, slices and integer or "
"boolean arrays are valid indices."
)
elif isinstance(item, tuple):
# possibly unpack arr[..., n] to arr[n]
if len(item) == 1:
item = item[0]
elif len(item) == 2:
if item[0] is Ellipsis:
item = item[1]
elif item[1] is Ellipsis:
item = item[0]

# We are not an array indexer, so maybe e.g. a slice or integer
# indexer. We dispatch to pyarrow.
Expand Down
35 changes: 29 additions & 6 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,18 +309,41 @@ def _slice(self, slicer):
return self.values[slicer]

@final
def getitem_block(self, slicer, new_mgr_locs=None) -> Block:
def getitem_block(self, slicer) -> Block:
"""
Perform __getitem__-like, return result as block.

Only supports slices that preserve dimensionality.
"""
if new_mgr_locs is None:
axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer
new_mgr_locs = self._mgr_locs[axis0_slicer]
elif not isinstance(new_mgr_locs, BlockPlacement):
new_mgr_locs = BlockPlacement(new_mgr_locs)
axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer
new_mgr_locs = self._mgr_locs[axis0_slicer]

new_values = self._slice(slicer)

if new_values.ndim != self.values.ndim:
raise ValueError("Only same dim slicing is allowed")

return type(self)._simple_new(new_values, new_mgr_locs, self.ndim)

@final
def getitem_block_index(self, slicer: slice) -> Block:
"""
Perform __getitem__-like specialized to slicing along index.

Assumes self.ndim == 2

In contrast to getitem_block, no new block placement is computed and no
dimensionality check is performed: self.values is indexed with
``[..., slicer]`` and self._mgr_locs is passed through unchanged.

Parameters
----------
slicer : slice
    Slice applied after the Ellipsis, i.e. to the last axis of
    self.values.

Returns
-------
Block
    New block created through ``type(self)._simple_new`` with the sliced
    values, the existing self._mgr_locs and the same ndim as self.
"""
# The leading Ellipsis leaves every axis before the last one untouched, so
# this works without building a ``(slice(None), slicer)`` tuple first.
# error: Invalid index type "Tuple[ellipsis, slice]" for
# "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]"
new_values = self.values[..., slicer] # type: ignore[index]
# The placement is reused as-is; only the values object is new.
return type(self)._simple_new(new_values, self._mgr_locs, ndim=self.ndim)

@final
def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block:
"""
Perform __getitem__-like, return result as block.

Only supports slices that preserve dimensionality.
"""
new_values = self._slice(slicer)

if new_values.ndim != self.values.ndim:
Expand Down
17 changes: 11 additions & 6 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
internals as libinternals,
lib,
)
from pandas._libs.internals import BlockPlacement
from pandas._typing import (
ArrayLike,
Dtype,
Expand Down Expand Up @@ -801,8 +802,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
if axis == 0:
new_blocks = self._slice_take_blocks_ax0(slobj)
elif axis == 1:
slicer = (slice(None), slobj)
new_blocks = [blk.getitem_block(slicer) for blk in self.blocks]
new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks]
else:
raise IndexError("Requested axis not found in manager")

Expand Down Expand Up @@ -1396,7 +1396,8 @@ def _slice_take_blocks_ax0(
# TODO(EA2D): special casing unnecessary with 2D EAs
if sllen == 0:
return []
return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
bp = BlockPlacement(slice(0, sllen))
return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)]
elif not allow_fill or self.ndim == 1:
if allow_fill and fill_value is None:
fill_value = blk.fill_value
Expand All @@ -1405,7 +1406,9 @@ def _slice_take_blocks_ax0(
# GH#33597 slice instead of take, so we get
# views instead of copies
blocks = [
blk.getitem_block(slice(ml, ml + 1), new_mgr_locs=i)
blk.getitem_block_columns(
slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i)
)
for i, ml in enumerate(slobj)
]
# We have
Expand Down Expand Up @@ -1465,13 +1468,15 @@ def _slice_take_blocks_ax0(
taker = lib.maybe_indices_to_slice(taker, max_len)

if isinstance(taker, slice):
nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs)
nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
blocks.append(nb)
elif only_slice:
# GH#33597 slice instead of take, so we get
# views instead of copies
for i, ml in zip(taker, mgr_locs):
nb = blk.getitem_block(slice(i, i + 1), new_mgr_locs=ml)
slc = slice(i, i + 1)
bp = BlockPlacement(ml)
nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
# We have np.shares_memory(nb.values, blk.values)
blocks.append(nb)
else:
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/extension/base/getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,26 @@ def test_getitem_slice(self, data):
result = data[slice(1)] # scalar
assert isinstance(result, type(data))

def test_getitem_ellipsis_and_slice(self, data):
# GH#40353 this is called from getitem_block_index
result = data[..., :]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also test data[:, ...] ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good idea, will update

self.assert_extension_array_equal(result, data)

result = data[:, ...]
self.assert_extension_array_equal(result, data)

result = data[..., :3]
self.assert_extension_array_equal(result, data[:3])

result = data[:3, ...]
self.assert_extension_array_equal(result, data[:3])

result = data[..., ::2]
self.assert_extension_array_equal(result, data[::2])

result = data[::2, ...]
self.assert_extension_array_equal(result, data[::2])

def test_get(self, data):
# GH 20882
s = pd.Series(data, index=[2 * i for i in range(len(data))])
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/json/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,16 @@ def _from_factorized(cls, values, original):
return cls([UserDict(x) for x in values if x != ()])

def __getitem__(self, item):
if isinstance(item, tuple):
if len(item) > 1:
if item[0] is Ellipsis:
item = item[1:]
elif item[-1] is Ellipsis:
item = item[:-1]
if len(item) > 1:
raise IndexError("too many indices for array.")
item = item[0]

if isinstance(item, numbers.Integral):
return self.data[item]
elif isinstance(item, slice) and item == slice(None):
Expand Down
33 changes: 19 additions & 14 deletions pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -848,22 +848,27 @@ def assert_slice_ok(mgr, axis, slobj):
assert_slice_ok(mgr, ax, slice(1, 4))
assert_slice_ok(mgr, ax, slice(3, 0, -2))

# boolean mask
assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_))
assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_))
assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_))

if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0)
assert_slice_ok(mgr, ax, np.array([True, True, False], dtype=np.bool_))
if mgr.ndim < 2:
# 2D only support slice objects

# boolean mask
assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_))
assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_))
assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_))

if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0)
assert_slice_ok(
mgr, ax, np.array([True, True, False], dtype=np.bool_)
)

# fancy indexer
assert_slice_ok(mgr, ax, [])
assert_slice_ok(mgr, ax, list(range(mgr.shape[ax])))
# fancy indexer
assert_slice_ok(mgr, ax, [])
assert_slice_ok(mgr, ax, list(range(mgr.shape[ax])))

if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, [0, 1, 2])
assert_slice_ok(mgr, ax, [-1, -2, -3])
if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, [0, 1, 2])
assert_slice_ok(mgr, ax, [-1, -2, -3])

@pytest.mark.parametrize("mgr", MANAGERS)
def test_take(self, mgr):
Expand Down