
PERF: repeated slicing along index in groupby #40353


Merged: 26 commits, merged on Mar 16, 2021. The diff below shows changes from 20 of the 26 commits.

Commits (26)
60a8649 CLN: BlockManager.get_slice require only slice arg (jbrockmendel, Mar 6, 2021)
91436a5 Merge branch 'master' into ref-slice (jbrockmendel, Mar 6, 2021)
b17ad65 mypy fixup (jbrockmendel, Mar 6, 2021)
9fa658f Merge branch 'master' into ref-slice (jbrockmendel, Mar 6, 2021)
5afce04 PERF: implement Index._getitem_slice (jbrockmendel, Mar 6, 2021)
36a8530 PERF: implement getitem_block_columns (jbrockmendel, Mar 8, 2021)
c528119 Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 9, 2021)
dc5f975 Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 10, 2021)
14ee965 PERF: repeated slicing along index in groupby (jbrockmendel, Mar 10, 2021)
f6655ad mypy fixup (jbrockmendel, Mar 10, 2021)
21fe008 Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 11, 2021)
e4eae87 Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 11, 2021)
ce2dec6 Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 11, 2021)
8ff5167 mypy fixup (jbrockmendel, Mar 11, 2021)
2ade2bb revert (jbrockmendel, Mar 12, 2021)
c594fd0 comment typo fixup (jbrockmendel, Mar 12, 2021)
0cefeb2 Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 12, 2021)
e9d0a92 type:ignore (jbrockmendel, Mar 12, 2021)
9a4ccc1 Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 12, 2021)
14497df TST: EA[..., slc] (jbrockmendel, Mar 12, 2021)
9a3969f Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 14, 2021)
23cf40d recert get_slice_index (jbrockmendel, Mar 14, 2021)
12266aa Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 14, 2021)
d13ad92 Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 15, 2021)
e36b7b0 TST: arr[foo, ...] (jbrockmendel, Mar 15, 2021)
bc0a110 Merge branch 'master' into cln-getitem_block (jbrockmendel, Mar 16, 2021)
2 changes: 2 additions & 0 deletions pandas/core/arrays/sparse/array.py
@@ -817,6 +817,8 @@ def value_counts(self, dropna: bool = True):
def __getitem__(self, key):

if isinstance(key, tuple):
if len(key) and key[0] is Ellipsis:
key = key[1:]
if len(key) > 1:
raise IndexError("too many indices for array.")
key = key[0]
9 changes: 9 additions & 0 deletions pandas/core/arrays/string_arrow.py
@@ -354,6 +354,15 @@ def __getitem__(self, item: Any) -> Any:
"Only integers, slices and integer or "
"boolean arrays are valid indices."
)
elif isinstance(item, tuple):
# possibly unpack arr[..., n] to arr[n]
if len(item) == 1:
item = item[0]
elif len(item) == 2:
if item[0] is Ellipsis:
item = item[1]
elif item[1] is Ellipsis:
item = item[0]

# We are not an array indexer, so maybe e.g. a slice or integer
# indexer. We dispatch to pyarrow.
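Both ExtensionArray changes above follow the same pattern: unpack a tuple key whose leading element is Ellipsis, so these 1D arrays accept the arr[..., slc] keys produced by the new block-slicing fastpath below. A minimal standalone sketch of that unpacking logic (an illustration, not the pandas source; the helper name is made up):

def unpack_ellipsis_key(key):
    # Mirror of the tuple handling added to SparseArray.__getitem__ above:
    # drop a leading Ellipsis, then require at most one remaining index,
    # so arr[..., slc] ends up behaving like arr[slc] for a 1D array.
    if isinstance(key, tuple):
        if len(key) and key[0] is Ellipsis:
            key = key[1:]
        if len(key) > 1:
            raise IndexError("too many indices for array.")
        key = key[0]
    return key


assert unpack_ellipsis_key((Ellipsis, slice(2, 5))) == slice(2, 5)
assert unpack_ellipsis_key(7) == 7  # non-tuple keys pass through unchanged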
40 changes: 29 additions & 11 deletions pandas/core/groupby/ops.py
@@ -1012,19 +1012,27 @@ def __iter__(self):

starts, ends = lib.generate_slices(self.slabels, self.ngroups)

for i, (start, end) in enumerate(zip(starts, ends)):
yield i, self._chop(sdata, slice(start, end))
if self.axis == 0:
Contributor (review comment): can make this more readable by

    method = self._chop_index if self.axis == 0 else self._chop_column
    for i ......
        yield i, method(....)

for i, (start, end) in enumerate(zip(starts, ends)):
yield i, self._chop_index(sdata, slice(start, end))

else:
for i, (start, end) in enumerate(zip(starts, ends)):
yield i, self._chop_columns(sdata, slice(start, end))

@cache_readonly
def sorted_data(self) -> FrameOrSeries:
return self.data.take(self.sort_idx, axis=self.axis)

def _chop(self, sdata, slice_obj: slice) -> NDFrame:
def _chop_columns(self, sdata, slice_obj: slice) -> NDFrame:
raise AbstractMethodError(self)

def _chop_index(self, sdata, slice_obj: slice) -> NDFrame:
raise AbstractMethodError(self)


class SeriesSplitter(DataSplitter):
def _chop(self, sdata: Series, slice_obj: slice) -> Series:
def _chop_index(self, sdata: Series, slice_obj: slice) -> Series:
# fastpath equivalent to `sdata.iloc[slice_obj]`
mgr = sdata._mgr.get_slice(slice_obj)
# __finalize__ not called here, must be applied by caller if applicable
@@ -1043,13 +1051,23 @@ def fast_apply(self, f: F, sdata: FrameOrSeries, names):
starts, ends = lib.generate_slices(self.slabels, self.ngroups)
return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)

def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
# Fastpath equivalent to:
# if self.axis == 0:
# return sdata.iloc[slice_obj]
# else:
# return sdata.iloc[:, slice_obj]
mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis)
def _chop_index(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
"""
Fastpath equivalent to `sdata.iloc[slice_obj]`
"""
mgr = sdata._mgr.get_slice_index(slice_obj)
# __finalize__ not called here, must be applied by caller if applicable

# fastpath equivalent to `return sdata._constructor(mgr)`
obj = type(sdata)._from_mgr(mgr)
object.__setattr__(obj, "_flags", sdata._flags)
return obj

def _chop_columns(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
"""
Fastpath equivalent to `sdata.iloc[:, slice_obj]`
"""
mgr = sdata._mgr.get_slice(slice_obj, axis=0)
# __finalize__ not called here, must be applied by caller if applicable

# fastpath equivalent to `return sdata._constructor(mgr)`
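As a side note on the reviewer's readability suggestion in the __iter__ hunk above (pick the chop method once instead of branching inside the loop), here is a self-contained toy sketch of that shape. It is illustrative only: the class and data are made up, and the merged diff keeps the explicit if/else with the real methods _chop_index and _chop_columns.

class SplitterSketch:
    # Toy stand-in for DataSplitter, not pandas code: demonstrates selecting
    # the chop method once, then yielding (index, chunk) pairs per group.
    def __init__(self, data, bounds, axis=0):
        self.data = data      # a list of rows
        self.bounds = bounds  # list of (start, end) positions per group
        self.axis = axis

    def _chop_index(self, sdata, slc):
        return sdata[slc]

    def _chop_columns(self, sdata, slc):
        return [row[slc] for row in sdata]

    def __iter__(self):
        chop = self._chop_index if self.axis == 0 else self._chop_columns
        for i, (start, end) in enumerate(self.bounds):
            yield i, chop(self.data, slice(start, end))


frame = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
print(list(SplitterSketch(frame, [(0, 2), (2, 4)])))
# [(0, [[1, 2, 3], [4, 5, 6]]), (1, [[7, 8, 9], [10, 11, 12]])]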
9 changes: 9 additions & 0 deletions pandas/core/internals/base.py
@@ -122,6 +122,15 @@ def apply(
def isna(self: T, func) -> T:
return self.apply("apply", func=func)

def get_slice(self: T, slobj: slice, axis: int = 0) -> T:
raise AbstractMethodError(self)

def get_slice_index(self: T, slobj: slice) -> T:
"""
get_slice specialized to axis=self.ndim-1
"""
return self.get_slice(slobj, axis=self.ndim - 1)


class SingleDataManager(DataManager):
ndim = 1
35 changes: 29 additions & 6 deletions pandas/core/internals/blocks.py
@@ -322,18 +322,41 @@ def _slice(self, slicer):
return self.values[slicer]

@final
def getitem_block(self, slicer, new_mgr_locs=None) -> Block:
def getitem_block(self, slicer) -> Block:
"""
Perform __getitem__-like, return result as block.

Only supports slices that preserve dimensionality.
"""
if new_mgr_locs is None:
axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer
new_mgr_locs = self.mgr_locs[axis0_slicer]
elif not isinstance(new_mgr_locs, BlockPlacement):
new_mgr_locs = BlockPlacement(new_mgr_locs)
axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer
new_mgr_locs = self.mgr_locs[axis0_slicer]

new_values = self._slice(slicer)

if new_values.ndim != self.values.ndim:
raise ValueError("Only same dim slicing is allowed")

return type(self)._simple_new(new_values, new_mgr_locs, self.ndim)

@final
def getitem_block_index(self, slicer: slice) -> Block:
"""
Perform __getitem__-like specialized to slicing along index.

Assumes self.ndim == 2
"""
# error: Invalid index type "Tuple[ellipsis, slice]" for
# "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]"
new_values = self.values[..., slicer] # type: ignore[index]
return type(self)._simple_new(new_values, self._mgr_locs, ndim=self.ndim)

@final
def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block:
"""
Perform __getitem__-like, return result as block.

Only supports slices that preserve dimensionality.
"""
new_values = self._slice(slicer)

if new_values.ndim != self.values.ndim:
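For context on getitem_block_index (an illustration, not part of the diff): a Block stores its values as a 2D array with the DataFrame's index along the last axis, and an Ellipsis key slices that trailing axis directly, returning a view. That is what lets the fastpath skip building a (slice(None), slobj) tuple and reuse self._mgr_locs unchanged, and it is also why the ExtensionArray __getitem__ methods earlier in this PR had to learn to accept values[..., slicer].

import numpy as np

# 2D block values: axis 0 = the block's columns, axis 1 = the DataFrame index
values = np.arange(12).reshape(3, 4)
slobj = slice(1, 3)

# Ellipsis indexing slices the trailing (index) axis...
assert np.array_equal(values[..., slobj], values[:, slobj])
# ...and returns a view, so repeated per-group slicing does not copy data
assert np.shares_memory(values[..., slobj], values)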
23 changes: 19 additions & 4 deletions pandas/core/internals/managers.py
@@ -23,6 +23,7 @@
internals as libinternals,
lib,
)
from pandas._libs.internals import BlockPlacement
from pandas._typing import (
ArrayLike,
Dtype,
@@ -809,6 +810,15 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:

return type(self)._simple_new(tuple(new_blocks), new_axes)

def get_slice_index(self, slobj: slice) -> BlockManager:
Contributor (review comment):
can you add a get_slice_columns and change get_slice to just dispatch? (to share code)

Member Author (reply):
The only code-sharing I see available would be to change L819-L819 to return self.get_slice_index(slobj), which only saves 1 line.

Flip side: one of the things I'm looking at is making another attempt at getting axes out of BlockManager (and ArrayManager), which would marginally widen the scope for sharing.

Member (reply):
What does this specialized version avoid? Isn't it only the if axis == 0: / elif axis == 1 checks?
(which I assume are quite fast checks, although if done repeatedly they might indeed still be worth avoiding)

The other difference is that get_slice does

            slicer = (slice(None), slobj)
            new_blocks = [blk.getitem_block(slicer) for blk in self.blocks]

while this new function does:

new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks]

but get_slice could do that as well?

Member (reply):
Can you answer this?

Member Author (reply):
> Isn't it only the if axis == 0: / elif axis == 1 checks? (which I assume are quite fast checks, although if done repeatedly they might indeed still be worth avoiding)

Yes. The code I'm profiling to optimize is (#40171 (comment))

import numpy as np
import pandas as pd

N = 10 ** 4
labels = np.random.randint(0, 2000, size=N)
labels2 = np.random.randint(0, 3, size=N)
df = pd.DataFrame(
    {
        "key": labels,
        "key2": labels2,
        "value1": np.random.randn(N),
        "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
    }
)

%prun -s cumtime for i in range(100): df.groupby("key").apply(lambda x: 1)  # on master as of this writing

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    4.170    4.170 {built-in method builtins.exec}
        1    0.001    0.001    4.170    4.170 <string>:1(<module>)
      100    0.001    0.000    4.156    0.042 groupby.py:911(apply)
      100    0.004    0.000    4.150    0.042 groupby.py:960(_python_apply_general)
      100    0.497    0.005    4.084    0.041 ops.py:263(apply)
   199000    0.176    0.000    3.223    0.000 ops.py:1005(__iter__)
   198900    0.248    0.000    2.986    0.000 ops.py:1046(_chop)
   198900    0.339    0.000    2.522    0.000 managers.py:796(get_slice)
   198900    0.188    0.000    1.631    0.000 managers.py:803(<listcomp>)
   596700    0.823    0.000    1.443    0.000 blocks.py:324(getitem_block)
   198900    0.135    0.000    0.409    0.000 base.py:4638(_getitem_slice)

we're still spending 8% of the runtime in get_slice itself, which is why I think avoiding re-doing these checks is worthwhile.

> but get_slice could do that as well?

sure, could update get_slice to also use getitem_block_index (I think the first draft of this PR started before get_slice reliably received a slice object)

Member (reply):
> we're still spending 8% of the runtime in get_slice itself, which is why I think avoiding re-doing these checks is worthwhile.

And did that number reduce by using get_slice_index instead?

> sure, could update get_slice to also use getitem_block_index

Yes, let's at least make the code consistent there

Member (reply):
Using this branch, timing mgr.get_slice_index(slobj) vs mgr.get_slice(slobj, axis=1) (but making them both use getitem_block_index, and removing the assert), I only get a tiny difference (less than 5%, and that's on something that only takes 8% of the total runtime).

Member Author (reply):
I'm seeing 17-22% gains with just getitem_block_index and not the extra BM/AM methods, so will revert those for this PR.

# get_slice specialized to axis = 1 and ndim == 2
new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks]

axes = self.axes
new_axes = [axes[0], axes[1]._getitem_slice(slobj)]

return type(self)._simple_new(tuple(new_blocks), new_axes)

@property
def nblocks(self) -> int:
return len(self.blocks)
@@ -1391,7 +1401,8 @@ def _slice_take_blocks_ax0(
if sllen == 0:
return []
# TODO: tests all have isinstance(slobj, slice), other possibilities?
return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
bp = BlockPlacement(slice(0, sllen))
return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)]
elif not allow_fill or self.ndim == 1:
if allow_fill and fill_value is None:
fill_value = blk.fill_value
@@ -1400,7 +1411,9 @@
# GH#33597 slice instead of take, so we get
# views instead of copies
blocks = [
blk.getitem_block(slice(ml, ml + 1), new_mgr_locs=i)
blk.getitem_block_columns(
slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i)
)
for i, ml in enumerate(slobj)
]
# We have
@@ -1460,13 +1473,15 @@
taker = lib.maybe_indices_to_slice(taker, max_len)

if isinstance(taker, slice):
nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs)
nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
blocks.append(nb)
elif only_slice:
# GH#33597 slice instead of take, so we get
# views instead of copies
for i, ml in zip(taker, mgr_locs):
nb = blk.getitem_block(slice(i, i + 1), new_mgr_locs=ml)
slc = slice(i, i + 1)
bp = BlockPlacement(ml)
nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
# We have np.shares_memory(nb.values, blk.values)
blocks.append(nb)
else:
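For completeness, the benchmark discussed in the review thread above can be re-run as a self-contained script (a sketch; absolute timings, and whether the quoted 17-22% improvement reproduces, will depend on the machine and the pandas build):

import numpy as np
import pandas as pd
from timeit import timeit

N = 10 ** 4
labels = np.random.randint(0, 2000, size=N)
labels2 = np.random.randint(0, 3, size=N)
df = pd.DataFrame(
    {
        "key": labels,
        "key2": labels2,
        "value1": np.random.randn(N),
        "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
    }
)

# ~2000 groups per apply, so each call exercises ~2000 _chop_index slices
print(timeit(lambda: df.groupby("key").apply(lambda x: 1), number=100))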
11 changes: 11 additions & 0 deletions pandas/tests/extension/base/getitem.py
@@ -245,6 +245,17 @@ def test_getitem_slice(self, data):
result = data[slice(1)] # scalar
assert isinstance(result, type(data))

def test_getitem_ellipsis_and_slice(self, data):
# GH#40353 this is called from getitem_block_index
result = data[..., :]
Member (review comment):
Also test data[:, ...] ?

Member Author (reply):
good idea, will update

self.assert_extension_array_equal(result, data)

result = data[..., :3]
self.assert_extension_array_equal(result, data[:3])

result = data[..., ::2]
self.assert_extension_array_equal(result, data[::2])

def test_get(self, data):
# GH 20882
s = pd.Series(data, index=[2 * i for i in range(len(data))])
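What the new base test exercises, shown against a concrete extension array (an illustration; the test suite runs this for every registered ExtensionArray, and a nullable integer array is just a convenient example):

import pandas as pd

arr = pd.array([1, 2, 3, 4], dtype="Int64")

# an Ellipsis followed by a slice must behave like the slice alone
assert arr[..., :].equals(arr)
assert arr[..., :3].equals(arr[:3])
assert arr[..., ::2].equals(arr[::2])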
7 changes: 7 additions & 0 deletions pandas/tests/extension/json/array.py
@@ -83,6 +83,13 @@ def _from_factorized(cls, values, original):
return cls([UserDict(x) for x in values if x != ()])

def __getitem__(self, item):
if isinstance(item, tuple):
if len(item) and item[0] is Ellipsis:
item = item[1:]
if len(item) > 1:
raise IndexError("too many indices for array.")
item = item[0]

if isinstance(item, numbers.Integral):
return self.data[item]
elif isinstance(item, slice) and item == slice(None):
33 changes: 19 additions & 14 deletions pandas/tests/internals/test_internals.py
@@ -848,22 +848,27 @@ def assert_slice_ok(mgr, axis, slobj):
assert_slice_ok(mgr, ax, slice(1, 4))
assert_slice_ok(mgr, ax, slice(3, 0, -2))

# boolean mask
assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_))
assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_))
assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_))

if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0)
assert_slice_ok(mgr, ax, np.array([True, True, False], dtype=np.bool_))
if mgr.ndim < 2:
# 2D only support slice objects

# boolean mask
assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_))
assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_))
assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_))

if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0)
assert_slice_ok(
mgr, ax, np.array([True, True, False], dtype=np.bool_)
)

# fancy indexer
assert_slice_ok(mgr, ax, [])
assert_slice_ok(mgr, ax, list(range(mgr.shape[ax])))
# fancy indexer
assert_slice_ok(mgr, ax, [])
assert_slice_ok(mgr, ax, list(range(mgr.shape[ax])))

if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, [0, 1, 2])
assert_slice_ok(mgr, ax, [-1, -2, -3])
if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, [0, 1, 2])
assert_slice_ok(mgr, ax, [-1, -2, -3])

@pytest.mark.parametrize("mgr", MANAGERS)
def test_take(self, mgr):