PERF: fix #32976 slow group by for categorical columns

rtlee9 · rtlee9 · commit 165cfbb501d5 · 2020-04-22T23:04:33.000-07:00
Aggregate categorical codes with fast cython aggregation for select
`how` operations.
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -6,6 +6,7 @@
 
 from pandas import (
     Categorical,
+    CategoricalDtype,
     DataFrame,
     MultiIndex,
     Series,
@@ -510,6 +511,27 @@ def time_groupby_extra_cat_nosort(self):
         self.df_extra_cat.groupby("a", sort=False)["b"].count()
 
 
+class CategoricalFrame:
+    # Times groupby(...).last() on a frame that has one categorical column.
+    def setup(self):
+        SIZE = 100000
+        GROUPS = 10000  # The larger, the more extreme the timing differences
+        CARDINALITY = 10
+        CAT = CategoricalDtype(list(range(CARDINALITY)))
+        values = {  # one vectorized draw per column, not one call per row
+            "group": np.random.randint(0, GROUPS, size=SIZE),
+            "cat": np.random.choice(CAT.categories, size=SIZE),
+        }
+        self.df_cat_values = DataFrame(values).astype({"cat": CAT})
+
+    def time_groupby(self):
+        self.df_cat_values.groupby("group").last()
+
+    def time_groupby_ordered(self):
+        # NOTE(review): sort=True is already the default -- same as time_groupby?
+        self.df_cat_values.groupby("group", sort=True).last()
+
+
 class Datelike:
     # GH 14338
     params = ["period_range", "date_range", "date_range_tz"]
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -426,6 +426,7 @@ Performance improvements
   :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
   :issue:`32825`,  :issue:`32826`, :issue:`32856`, :issue:`32858`).
 - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
+- Performance improvement in groupby aggregations such as :meth:`DataFrameGroupBy.last` when aggregating categorical data (:issue:`32976`)
 
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -44,6 +44,7 @@
     ensure_int64,
     ensure_platform_int,
     is_bool,
+    is_categorical_dtype,
     is_integer_dtype,
     is_interval_dtype,
     is_numeric_dtype,
@@ -59,6 +60,7 @@
     normalize_keyword_aggregation,
 )
 import pandas.core.algorithms as algorithms
+from pandas.core.arrays.categorical import Categorical
 from pandas.core.base import DataError, SpecificationError
 import pandas.core.common as com
 from pandas.core.construction import create_series_with_explicit_dtype
@@ -1056,9 +1058,47 @@ def _cython_agg_blocks(
             result = no_result
             locs = block.mgr_locs.as_array
             try:
-                result, _ = self.grouper.aggregate(
-                    block.values, how, axis=1, min_count=min_count
+
+                cat_method_blacklist = (
+                    "add",
+                    "median",
+                    "prod",
+                    "sem",
+                    "cumsum",
+                    "sum",
+                    "cummin",
+                    "mean",
+                    "max",
+                    "skew",
+                    "cumprod",
+                    "cummax",
+                    "rank",
+                    "pct_change",
+                    "min",
+                    "var",
+                    "mad",
+                    "describe",
+                    "std",
+                    "quantile",
                 )
+                if (
+                    is_categorical_dtype(block.values)
+                    and how not in cat_method_blacklist
+                ):
+                    # perf improvement: aggregate categorical codes with fast cython agg
+                    # for select `how` operations
+                    result_codes, _ = self.grouper.aggregate(
+                        block.values.codes, how, axis=1, min_count=min_count
+                    )
+                    result = Categorical.from_codes(
+                        result_codes,
+                        categories=block.values.categories,
+                        ordered=block.values.ordered,
+                    )
+                else:
+                    result, _ = self.grouper.aggregate(
+                        block.values, how, axis=1, min_count=min_count
+                    )
             except NotImplementedError:
                 # generally if we have numeric_only=False
                 # and non-applicable functions