Skip to content

Commit 165cfbb

Browse files
committed
PERF: fix #32976 slow group by for categorical columns
Aggregate categorical codes with fast cython aggregation for select `how` operations.
1 parent cf61be6 commit 165cfbb

File tree

3 files changed

+65
-2
lines changed

3 files changed

+65
-2
lines changed

asv_bench/benchmarks/groupby.py

+22
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from pandas import (
88
Categorical,
9+
CategoricalDtype,
910
DataFrame,
1011
MultiIndex,
1112
Series,
@@ -510,6 +511,27 @@ def time_groupby_extra_cat_nosort(self):
510511
self.df_extra_cat.groupby("a", sort=False)["b"].count()
511512

512513

514+
class CategoricalFrame:
    """Benchmark ``groupby(...).last()`` on a frame with a categorical column.

    GH 32976: group-by aggregation of categorical columns was slow; these
    timings track that code path.
    """

    def setup(self):
        """Build ``self.df_cat_values``: int group keys plus a categorical column."""
        SIZE = 100000
        GROUPS = 10000  # The larger, the more extreme the timing differences
        CARDINALITY = 10
        CAT = CategoricalDtype(list(range(CARDINALITY)))
        # Draw every value with one vectorized call per column rather than one
        # Python-level call per row; the resulting frame has the same shape,
        # dtypes and value ranges, but setup no longer dominates the run.
        df_int = DataFrame(
            {
                "group": np.random.randint(0, GROUPS, size=SIZE, dtype="int64"),
                "cat": np.random.choice(CAT.categories.to_numpy(), size=SIZE),
            }
        )
        self.df_cat_values = df_int.astype({"cat": CAT})

    def time_groupby(self):
        """Time ``last()`` per group using the default groupby options."""
        self.df_cat_values.groupby("group").last()

    def time_groupby_ordered(self):
        """Time ``last()`` per group with ``sort=True`` passed explicitly."""
        # NOTE(review): ``sort=True`` is presumably already the groupby default,
        # which would make this the same measurement as ``time_groupby`` --
        # confirm whether ``sort=False`` or an ordered dtype was intended.
        self.df_cat_values.groupby("group", sort=True).last()
533+
534+
513535
class Datelike:
514536
# GH 14338
515537
params = ["period_range", "date_range", "date_range_tz"]

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,7 @@ Performance improvements
426426
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
427427
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
428428
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
429+
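- Performance improvement in groupby aggregations such as :meth:`DataFrameGroupBy.last` when aggregating categorical data; the categorical codes are now aggregated directly for select operations (:issue:`32976`).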
429430

430431

431432
.. ---------------------------------------------------------------------------

pandas/core/groupby/generic.py

+42-2
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
ensure_int64,
4545
ensure_platform_int,
4646
is_bool,
47+
is_categorical_dtype,
4748
is_integer_dtype,
4849
is_interval_dtype,
4950
is_numeric_dtype,
@@ -59,6 +60,7 @@
5960
normalize_keyword_aggregation,
6061
)
6162
import pandas.core.algorithms as algorithms
63+
from pandas.core.arrays.categorical import Categorical
6264
from pandas.core.base import DataError, SpecificationError
6365
import pandas.core.common as com
6466
from pandas.core.construction import create_series_with_explicit_dtype
@@ -1056,9 +1058,47 @@ def _cython_agg_blocks(
10561058
result = no_result
10571059
locs = block.mgr_locs.as_array
10581060
try:
1059-
result, _ = self.grouper.aggregate(
1060-
block.values, how, axis=1, min_count=min_count
1061+
1062+
cat_method_blacklist = (
1063+
"add",
1064+
"median",
1065+
"prod",
1066+
"sem",
1067+
"cumsum",
1068+
"sum",
1069+
"cummin",
1070+
"mean",
1071+
"max",
1072+
"skew",
1073+
"cumprod",
1074+
"cummax",
1075+
"rank",
1076+
"pct_change",
1077+
"min",
1078+
"var",
1079+
"mad",
1080+
"describe",
1081+
"std",
1082+
"quantile",
10611083
)
1084+
if (
1085+
is_categorical_dtype(block.values)
1086+
and how not in cat_method_blacklist
1087+
):
1088+
# perf improvement: aggregate categorical codes with fast cython agg
1089+
# for select `how` operations
1090+
result_codes, _ = self.grouper.aggregate(
1091+
block.values.codes, how, axis=1, min_count=min_count
1092+
)
1093+
result = Categorical.from_codes(
1094+
result_codes,
1095+
categories=block.values.categories,
1096+
ordered=block.values.ordered,
1097+
)
1098+
else:
1099+
result, _ = self.grouper.aggregate(
1100+
block.values, how, axis=1, min_count=min_count
1101+
)
10621102
except NotImplementedError:
10631103
# generally if we have numeric_only=False
10641104
# and non-applicable functions

0 commit comments

Comments
 (0)