-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF: fix #32976 slow group by for categorical columns #33739
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,6 +40,7 @@ | |
from pandas.core.dtypes.missing import _maybe_fill, isna | ||
|
||
import pandas.core.algorithms as algorithms | ||
from pandas.core.arrays.categorical import Categorical | ||
from pandas.core.base import SelectionMixin | ||
import pandas.core.common as com | ||
from pandas.core.frame import DataFrame | ||
|
@@ -356,6 +357,29 @@ def get_group_levels(self) -> List[Index]: | |
|
||
_name_functions = {"ohlc": ["open", "high", "low", "close"]} | ||
|
||
_cat_method_blacklist = ( | ||
"add", | ||
"median", | ||
"prod", | ||
"sem", | ||
"cumsum", | ||
"sum", | ||
"cummin", | ||
"mean", | ||
"max", | ||
"skew", | ||
"cumprod", | ||
"cummax", | ||
"rank", | ||
"pct_change", | ||
"min", | ||
"var", | ||
"mad", | ||
"describe", | ||
"std", | ||
"quantile", | ||
) | ||
|
||
def _is_builtin_func(self, arg): | ||
""" | ||
if we define a builtin function for this argument, return it, | ||
|
@@ -460,7 +484,7 @@ def _cython_operation( | |
|
||
# categoricals are only 1d, so we | ||
# are not setup for dim transforming | ||
if is_categorical_dtype(values.dtype) or is_sparse(values.dtype): | ||
if is_sparse(values.dtype): | ||
raise NotImplementedError(f"{values.dtype} dtype not supported") | ||
elif is_datetime64_any_dtype(values.dtype): | ||
if how in ["add", "prod", "cumsum", "cumprod"]: | ||
|
@@ -481,6 +505,7 @@ def _cython_operation( | |
|
||
is_datetimelike = needs_i8_conversion(values.dtype) | ||
is_numeric = is_numeric_dtype(values.dtype) | ||
is_categorical = is_categorical_dtype(values) | ||
|
||
if is_datetimelike: | ||
values = values.view("int64") | ||
|
@@ -496,6 +521,17 @@ def _cython_operation( | |
values = ensure_int_or_float(values) | ||
elif is_numeric and not is_complex_dtype(values): | ||
values = ensure_float64(values) | ||
elif is_categorical: | ||
if how in self._cat_method_blacklist: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really don't like doing this. Can you elaborate when we can actually process this? listing methods is a bad idea generally. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not exactly sure what you mean by "when we can actually process this" this but I agree that listing methods isn't necessarily thorough and isn't robust. However, I've been unable to find a suitable alternative to blacklisting methods where we don't want to apply the aggregation on the category codes -- open to other ideas though. Please find more context in this thread. |
||
raise NotImplementedError( | ||
f"{values.dtype} dtype not supported for `how` argument {how}" | ||
) | ||
values, categories, ordered = ( | ||
values.codes.astype(np.int64), | ||
values.categories, | ||
values.ordered, | ||
) | ||
is_numeric = True | ||
else: | ||
values = values.astype(object) | ||
|
||
|
@@ -572,6 +608,11 @@ def _cython_operation( | |
result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) | ||
elif is_datetimelike and kind == "aggregate": | ||
result = result.astype(orig_values.dtype) | ||
elif is_categorical: | ||
# re-create categories | ||
result = Categorical.from_codes( | ||
result, categories=categories, ordered=ordered, | ||
) | ||
|
||
if is_extension_array_dtype(orig_values.dtype): | ||
result = maybe_cast_result(result=result, obj=orig_values, how=how) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what methods don't work?