-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH/PERF: enable column-wise reductions for EA-backed columns #32867
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
a9ca0fa
21aee0d
9f83f6e
a9706e0
07372e3
2d08450
c7f1dae
9e2a780
7f847e8
594d2b0
3d62e68
5b0370e
6e3c287
7088cfc
925d660
852331e
1c9a685
34731f2
a8e61d0
f233109
ec65c57
15ec9b6
2653d02
64e0069
bb0a47b
9323f0e
eb33f86
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7852,6 +7852,23 @@ def _count_level(self, level, axis=0, numeric_only=False): | |
def _reduce( | ||
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds | ||
): | ||
""" | ||
Reduce DataFrame over axis with given operation. | ||
|
||
Parameters | ||
---------- | ||
op : func | ||
The reducing function to be called on the values. | ||
name : str | ||
The name of the reduction. | ||
axis : int | ||
numeric_only : bool, optional | ||
filter_type : None or "bool" | ||
Set to "bool" for ops that give boolean results. | ||
skipna, **kwds : keywords to pass to the `op` function | ||
|
||
""" | ||
column_wise = kwds.pop("column_wise", False) | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
assert filter_type is None or filter_type == "bool", filter_type | ||
|
||
|
@@ -7898,6 +7915,19 @@ def _get_data(axis_matters): | |
raise NotImplementedError(msg) | ||
return data | ||
|
||
def blk_func(values): | ||
if isinstance(values, ExtensionArray): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. With this inside blk_func, shouldn't the block-wise operation have the same performance bump as the column-wise? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
It didn't actually change anything performance-wise (it's the same function being called as before). (It's possible that the block-wise way could be optimized to get rid of this difference, though. The main thing is that the block results are not in order.) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
This would be really nice. |
||
return values._reduce(name, skipna=skipna, **kwds) | ||
else: | ||
return op(values, axis=1, skipna=skipna, **kwds) | ||
|
||
if axis == 0 and column_wise: | ||
# column-wise reduction | ||
df = self | ||
if numeric_only is True: | ||
df = _get_data(axis_matters=True) | ||
return df._reduce_columns(blk_func) | ||
|
||
if numeric_only is not None and axis in [0, 1]: | ||
df = self | ||
if numeric_only is True: | ||
|
@@ -7908,12 +7938,6 @@ def _get_data(axis_matters): | |
|
||
out_dtype = "bool" if filter_type == "bool" else None | ||
|
||
def blk_func(values): | ||
if values.ndim == 1 and not isinstance(values, np.ndarray): | ||
# we can't pass axis=1 | ||
return op(values, axis=0, skipna=skipna, **kwds) | ||
return op(values, axis=1, skipna=skipna, **kwds) | ||
|
||
# After possibly _get_data and transposing, we are now in the | ||
# simple case where we can use BlockManager._reduce | ||
res = df._data.reduce(blk_func) | ||
|
@@ -7994,6 +8018,27 @@ def blk_func(values): | |
result = self._constructor_sliced(result, index=labels) | ||
return result | ||
|
||
def _reduce_columns(self, op): | ||
""" | ||
Reduce DataFrame column-wise. | ||
|
||
Parameters | ||
---------- | ||
op : func | ||
The reducing function to be called on the values. | ||
|
||
Returns | ||
------- | ||
Series | ||
""" | ||
result = [] | ||
|
||
for i in range(len(self.columns)): | ||
val = op(self._data.iget_values(i)) | ||
result.append(val) | ||
|
||
return self._constructor_sliced(result, index=self.columns) | ||
|
||
def nunique(self, axis=0, dropna=True) -> Series: | ||
""" | ||
Count distinct observations over requested axis. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typing the args would be a plus.