-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF: repeated slicing along index in groupby #40353
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
60a8649
91436a5
b17ad65
9fa658f
5afce04
36a8530
c528119
dc5f975
14ee965
f6655ad
21fe008
e4eae87
ce2dec6
8ff5167
2ade2bb
c594fd0
0cefeb2
e9d0a92
9a4ccc1
14497df
9a3969f
23cf40d
12266aa
d13ad92
e36b7b0
bc0a110
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
internals as libinternals, | ||
lib, | ||
) | ||
from pandas._libs.internals import BlockPlacement | ||
from pandas._typing import ( | ||
ArrayLike, | ||
Dtype, | ||
|
@@ -809,6 +810,15 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: | |
|
||
return type(self)._simple_new(tuple(new_blocks), new_axes) | ||
|
||
def get_slice_index(self, slobj: slice) -> BlockManager: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a get_slice_columns and change get_slice to just dispatch? (to share code) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the only code-sharing i see available would be to change L819-L819 to Flip side: one of the things im looking at is making another attempt at getting There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does this specialized version avoid? Isn't it only the The other difference is that
while this new function does:
but There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you answer this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes. The code I'm profiling to optimize is (#40171 (comment))
we're still spending 8% of the runtime in get_slice itself, which is why i think avoiding re-doing these checks is worthwhile.
sure, could update get_slice to also use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
And did that number reduce by using get_slice_index instead?
Yes, let's at least make the code consistent there There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using this branch, timing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. im seeing 17-22% gains with just getitem_block_index and not the extra BM/AM methods, so will revert those for this PR |
||
# get_slice specialized to axis = 1 and ndim == 2 | ||
new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks] | ||
|
||
axes = self.axes | ||
new_axes = [axes[0], axes[1]._getitem_slice(slobj)] | ||
|
||
return type(self)._simple_new(tuple(new_blocks), new_axes) | ||
|
||
@property | ||
def nblocks(self) -> int: | ||
return len(self.blocks) | ||
|
@@ -1391,7 +1401,8 @@ def _slice_take_blocks_ax0( | |
if sllen == 0: | ||
return [] | ||
# TODO: tests all have isinstance(slobj, slice), other possibilities? | ||
return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] | ||
bp = BlockPlacement(slice(0, sllen)) | ||
return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] | ||
elif not allow_fill or self.ndim == 1: | ||
if allow_fill and fill_value is None: | ||
fill_value = blk.fill_value | ||
|
@@ -1400,7 +1411,9 @@ def _slice_take_blocks_ax0( | |
# GH#33597 slice instead of take, so we get | ||
# views instead of copies | ||
blocks = [ | ||
blk.getitem_block(slice(ml, ml + 1), new_mgr_locs=i) | ||
blk.getitem_block_columns( | ||
slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i) | ||
) | ||
for i, ml in enumerate(slobj) | ||
] | ||
# We have | ||
|
@@ -1460,13 +1473,15 @@ def _slice_take_blocks_ax0( | |
taker = lib.maybe_indices_to_slice(taker, max_len) | ||
|
||
if isinstance(taker, slice): | ||
nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs) | ||
nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) | ||
blocks.append(nb) | ||
elif only_slice: | ||
# GH#33597 slice instead of take, so we get | ||
# views instead of copies | ||
for i, ml in zip(taker, mgr_locs): | ||
nb = blk.getitem_block(slice(i, i + 1), new_mgr_locs=ml) | ||
slc = slice(i, i + 1) | ||
bp = BlockPlacement(ml) | ||
nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) | ||
# We have np.shares_memory(nb.values, blk.values) | ||
blocks.append(nb) | ||
else: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -245,6 +245,17 @@ def test_getitem_slice(self, data): | |
result = data[slice(1)] # scalar | ||
assert isinstance(result, type(data)) | ||
|
||
def test_getitem_ellipsis_and_slice(self, data): | ||
# GH#40353 this is called from getitem_block_index | ||
result = data[..., :] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also test There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good idea, will update |
||
self.assert_extension_array_equal(result, data) | ||
|
||
result = data[..., :3] | ||
self.assert_extension_array_equal(result, data[:3]) | ||
|
||
result = data[..., ::2] | ||
self.assert_extension_array_equal(result, data[::2]) | ||
|
||
def test_get(self, data): | ||
# GH 20882 | ||
s = pd.Series(data, index=[2 * i for i in range(len(data))]) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can make this more readable by