REF: _cython_agg_blocks #31752
Changes from all commits: f728465, 3fe1f3c, f7345ea, d8eeda3
@@ -1022,9 +1022,32 @@ def _cython_agg_blocks(
        agg_blocks: List[Block] = []
        new_items: List[np.ndarray] = []
        deleted_items: List[np.ndarray] = []
        # Some object-dtype blocks might be split into List[Block[T], Block[U]]
        split_items: List[np.ndarray] = []
        split_frames: List[DataFrame] = []

        def _recast_result(result, values):
            # see if we can cast the block back to the original dtype
            assert not isinstance(result, DataFrame)
            assert result is not no_result

            result = maybe_downcast_numeric(result, values.dtype)

            if not isinstance(values, np.ndarray) and isinstance(result, np.ndarray):
                # e.g. block.values was an IntegerArray
                # (1, N) case can occur if block.values was Categorical
                # and result is ndarray[object]
                assert result.ndim == 1 or result.shape[0] == 1
                try:
                    # Cast back if feasible
                    result = type(values)._from_sequence(
                        result.ravel(), dtype=values.dtype
                    )
                except ValueError:
                    # reshape to be valid for non-Extension Block
                    result = result.reshape(1, -1)

            elif isinstance(result, np.ndarray) and result.ndim == 1:
                result = result.reshape(1, -1)

            return result

        no_result = object()
        for block in data.blocks:
@@ -1048,6 +1071,7 @@ def _cython_agg_blocks(
                    continue

                # call our grouper again with only this block
                # TODO: will this mess up if we have duplicate columns?
                obj = self.obj[data.items[locs]]
                if obj.shape[1] == 1:
                    # Avoid call to self.values that can occur in DataFrame
@@ -1063,58 +1087,32 @@ def _cython_agg_blocks(
                    deleted_items.append(locs)
                    continue
                else:
                    if isinstance(result, Series):
Review thread on this line:
- Hmm why is this required here?
- we have test cases that get here with a Series and without this the […]
- Hmm that's unfortunate. Don't want to go too far down the typing road on this one but maybe should be using […]
- hmm, […]
- it looks like since […]
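(For context on the thread above: when the block holds a single column, the obj.shape[1] == 1 branch earlier in the function reduces obj to a single column, so the python-level fallback aggregation returns a Series rather than a DataFrame. A minimal standalone sketch, not part of the diff, with made-up column names:)

    import pandas as pd

    df = pd.DataFrame({"key": [1, 1, 2], "val": [1.0, 2.0, 3.0]})

    # Aggregating a single column at the python level yields a Series ...
    ser_result = df.groupby("key")["val"].aggregate(lambda x: x.sum())
    assert isinstance(ser_result, pd.Series)

    # ... while the code below unwraps result._data.blocks, so the Series is
    # wrapped back into a one-column (single-block) DataFrame first.
    frame_result = ser_result.to_frame()
    assert isinstance(frame_result, pd.DataFrame) and frame_result.shape[1] == 1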
                        result = result.to_frame()

                    result = cast(DataFrame, result)
                    # unwrap DataFrame to get array
                    if len(result._data.blocks) != 1:
                        # We've split an object block! Everything we've assumed
                        # about a single block input returning a single block output
                        # is a lie. To keep the code-path for the typical non-split case
                        # clean, we choose to clean up this mess later on.
                        split_items.append(locs)
                        split_frames.append(result)
                        continue

                    assert len(result._data.blocks) == 1
                    result = result._data.blocks[0].values
                    if isinstance(result, np.ndarray) and result.ndim == 1:
                        result = result.reshape(1, -1)

            assert not isinstance(result, DataFrame)

            if result is not no_result:
                # see if we can cast the block back to the original dtype
                result = maybe_downcast_numeric(result, block.dtype)

                if block.is_extension and isinstance(result, np.ndarray):
                    # e.g. block.values was an IntegerArray
                    # (1, N) case can occur if block.values was Categorical
                    # and result is ndarray[object]
                    assert result.ndim == 1 or result.shape[0] == 1
                    try:
                        # Cast back if feasible
                        result = type(block.values)._from_sequence(
                            result.ravel(), dtype=block.values.dtype
                        )
                    except ValueError:
                        # reshape to be valid for non-Extension Block
                        result = result.reshape(1, -1)

                for i, col in enumerate(result.columns):
Review thread on this loop:
- Perhaps misinterpreting but does this not have a huge performance penalty? IIUC tried column iteration back in #28782 but performance took a pretty big hit in doing so
- the actual aggregation isn't being done columnwise, just re-assembling new blocks
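(To make that reply concrete, a rough standalone sketch, assuming pandas-1.0-era internals such as make_block and made-up column names: the groupby aggregation runs once for the whole set of columns, and the per-column loop only slices the finished result and wraps each column's values into its own Block, with no further aggregation involved:)

    import pandas as pd
    from pandas.core.internals import make_block

    df = pd.DataFrame({"key": [1, 1, 2], "a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})

    # The aggregation itself happens once ...
    result = df.groupby("key")[["a", "b"]].aggregate("sum")

    # ... and only the re-assembly into Blocks is per column.
    blocks = []
    for i, col in enumerate(result.columns):
        vals = result.iloc[:, [i]].to_numpy().T  # shape (1, ngroups)
        blocks.append(make_block(vals, placement=[i], ndim=2))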
                    nb = result.iloc[:, [i]]._data.blocks[0]
                    loc = data.items.get_loc(col)
                    # FIXME: requires unique? GH#31735
                    res = _recast_result(nb.values, data.iget(loc).blocks[0].values)
                    nb2 = make_block(res, placement=[loc], ndim=2)
                    agg_blocks.append(nb2)

            else:
                assert not isinstance(result, DataFrame)
                assert result is not no_result
                result = _recast_result(result, block.values)
                agg_block: Block = block.make_block(result)
                agg_blocks.append(agg_block)

            new_items.append(locs)
            agg_blocks.append(agg_block)

        if not (agg_blocks or split_frames):
        if not agg_blocks:
            raise DataError("No numeric types to aggregate")

        if split_items:
            # Clean up the mess left over from split blocks.
            for locs, result in zip(split_items, split_frames):
                assert len(locs) == result.shape[1]
                for i, loc in enumerate(locs):
                    new_items.append(np.array([loc], dtype=locs.dtype))
                    agg_blocks.append(result.iloc[:, [i]]._data.blocks[0])

        # reset the locs in the blocks to correspond to our
        # current ordering
        indexer = np.concatenate(new_items)
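(Background on the split_items / split_frames bookkeeping and the per-column re-assembly above: the python-level fallback can turn a single object-dtype block into a multi-dtype, hence multi-block, result. A small standalone sketch with made-up data; the exact inferred dtypes can vary between pandas versions:)

    import pandas as pd

    # One object-dtype block holding two columns ...
    df = pd.DataFrame(
        {"key": [1, 1, 2], "a": [1, 2, 3], "b": ["x", "y", "z"]}, dtype=object
    )

    # ... aggregated through the python-level path can come back with a
    # different dtype per column, i.e. more than one Block, breaking the
    # one-block-in / one-block-out assumption described in the comments above.
    result = df.groupby("key").aggregate(lambda x: x.sum())
    print(result.dtypes)  # e.g. "a" inferred numeric while "b" stays object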
Review comment: can you move this somewhere else, pandas.core.dtypes.cast maybe
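(Assuming the request refers to the new _recast_result helper, one possible shape for the move is a module-level function next to maybe_downcast_numeric in pandas/core/dtypes/cast.py. The name below is just a placeholder, and the DataFrame / no_result assertions would stay at the groupby call site since they reference local state. Written here as a standalone sketch, so the import would be dropped once the function lives inside that module:)

    import numpy as np
    from pandas.core.dtypes.cast import maybe_downcast_numeric

    def maybe_recast_aggregation_result(result, values):
        # Try to cast an aggregation result back to the dtype of the original
        # block values; mirrors the nested _recast_result in the diff above.
        result = maybe_downcast_numeric(result, values.dtype)

        if not isinstance(values, np.ndarray) and isinstance(result, np.ndarray):
            # e.g. values was an IntegerArray; the (1, N) case can occur if
            # values was Categorical and result is ndarray[object]
            assert result.ndim == 1 or result.shape[0] == 1
            try:
                # Cast back if feasible
                result = type(values)._from_sequence(
                    result.ravel(), dtype=values.dtype
                )
            except ValueError:
                # reshape to be valid for a non-Extension Block
                result = result.reshape(1, -1)
        elif isinstance(result, np.ndarray) and result.ndim == 1:
            result = result.reshape(1, -1)

        return result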