Skip to content

Commit 8286520

Browse files
committed
PERF: fix pandas-dev#32976 slow groupby for categorical columns
Aggregate the categorical codes with the fast Cython aggregation path for selected `how` operations (`first` and `last`).
1 parent cf61be6 commit 8286520

File tree

1 file changed

+18
-3
lines changed

1 file changed

+18
-3
lines changed

pandas/core/groupby/generic.py

+18 -3
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
ensure_int64,
4545
ensure_platform_int,
4646
is_bool,
47+
is_categorical_dtype,
4748
is_integer_dtype,
4849
is_interval_dtype,
4950
is_numeric_dtype,
@@ -59,6 +60,7 @@
5960
normalize_keyword_aggregation,
6061
)
6162
import pandas.core.algorithms as algorithms
63+
from pandas.core.arrays.categorical import Categorical
6264
from pandas.core.base import DataError, SpecificationError
6365
import pandas.core.common as com
6466
from pandas.core.construction import create_series_with_explicit_dtype
@@ -1056,9 +1058,22 @@ def _cython_agg_blocks(
10561058
result = no_result
10571059
locs = block.mgr_locs.as_array
10581060
try:
1059-
result, _ = self.grouper.aggregate(
1060-
block.values, how, axis=1, min_count=min_count
1061-
)
1061+
1062+
if is_categorical_dtype(block.values) and how in ('first', 'last'):
1063+
# perf improvement: aggregate categorical codes with fast cython agg
1064+
# for select `how` operations
1065+
result_codes, _ = self.grouper.aggregate(
1066+
block.values.codes, how, axis=1, min_count=min_count
1067+
)
1068+
result = Categorical.from_codes(
1069+
result_codes,
1070+
categories=block.values.categories,
1071+
ordered=block.values.ordered,
1072+
)
1073+
else:
1074+
result, _ = self.grouper.aggregate(
1075+
block.values, how, axis=1, min_count=min_count
1076+
)
10621077
except NotImplementedError:
10631078
# generally if we have numeric_only=False
10641079
# and non-applicable functions

0 commit comments

Comments
 (0)