PERF: fix slow groupby for categorical columns (#32976)

rtlee9 · rtlee9 · commit 673573be1c0b · 2020-04-26T15:12:12.000-07:00
Aggregate categorical values via their integer codes using the fast
cython aggregation path for select `how` operations, then rebuild the
Categorical result from the aggregated codes.
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -6,6 +6,7 @@
 
 from pandas import (
     Categorical,
+    CategoricalDtype,
     DataFrame,
     MultiIndex,
     Series,
@@ -473,6 +474,7 @@ def time_sum(self):
 
 
 class Categories:
+    # benchmark grouping by categoricals
     def setup(self):
         N = 10 ** 5
         arr = np.random.random(N)
@@ -510,6 +512,33 @@ def time_groupby_extra_cat_nosort(self):
         self.df_extra_cat.groupby("a", sort=False)["b"].count()
 
 
+class CategoricalFrame:
+    # Benchmark groupby reductions on categorical *values* (GH #32976),
+    # parametrized over the Python type of the group keys and the categories.
+    param_names = ["groupby_type", "value_type"]
+    params = [(int, str), (int, str)]
+
+    def setup(self, groupby_type, value_type):
+        # Draw all random values in two vectorized NumPy calls instead of
+        # one scalar RNG call per row, which keeps per-parameter setup cheap.
+        SIZE = 100000
+        GROUPS = 10000  # The larger, the more extreme the timing differences
+        CARDINALITY = 10
+        CAT = CategoricalDtype([value_type(i) for i in range(CARDINALITY)])
+        keys = np.random.randint(0, GROUPS, size=SIZE).tolist()
+        cats = np.random.choice(CAT.categories, size=SIZE)
+        df = DataFrame({"group": [groupby_type(k) for k in keys], "cat": cats})
+        self.df_cat_values = df.astype({"cat": CAT})
+
+    def time_groupby(self, groupby_type, value_type):
+        # Reduce the categorical "cat" column with last() per "group" key.
+        self.df_cat_values.groupby("group").last()
+
+    def time_groupby_ordered(self, groupby_type, value_type):
+        # NOTE(review): sort=True is the default; same call as time_groupby?
+        self.df_cat_values.groupby("group", sort=True).last()
+
+
 class Datelike:
     # GH 14338
     params = ["period_range", "date_range", "date_range_tz"]
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -427,6 +427,7 @@ Performance improvements
   :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
   :issue:`32825`,  :issue:`32826`, :issue:`32856`, :issue:`32858`).
 - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
+- Performance improvement in :meth:`DataFrame.groupby` when aggregating categorical data (:issue:`32976`).
 
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -39,6 +39,7 @@
 from pandas.core.dtypes.missing import _maybe_fill, isna
 
 import pandas.core.algorithms as algorithms
+from pandas.core.arrays.categorical import Categorical
 from pandas.core.base import SelectionMixin
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
@@ -451,7 +452,7 @@ def _cython_operation(
 
         # categoricals are only 1d, so we
         # are not setup for dim transforming
-        if is_categorical_dtype(values) or is_sparse(values):
+        if is_sparse(values):
             raise NotImplementedError(f"{values.dtype} dtype not supported")
         elif is_datetime64_any_dtype(values):
             if how in ["add", "prod", "cumsum", "cumprod"]:
@@ -472,6 +473,29 @@ def _cython_operation(
 
         is_datetimelike = needs_i8_conversion(values.dtype)
         is_numeric = is_numeric_dtype(values.dtype)
+        is_categorical = is_categorical_dtype(values)
+        cat_method_blacklist = (
+            "add",
+            "median",
+            "prod",
+            "sem",
+            "cumsum",
+            "sum",
+            "cummin",
+            "mean",
+            "max",
+            "skew",
+            "cumprod",
+            "cummax",
+            "rank",
+            "pct_change",
+            "min",
+            "var",
+            "mad",
+            "describe",
+            "std",
+            "quantile",
+        )
 
         if is_datetimelike:
             values = values.view("int64")
@@ -487,6 +511,17 @@ def _cython_operation(
                 values = ensure_int_or_float(values)
         elif is_numeric and not is_complex_dtype(values):
             values = ensure_float64(values)
+        elif is_categorical:
+            if how in cat_method_blacklist:
+                raise NotImplementedError(
+                    f"{values.dtype} dtype not supported for `how` argument {how}"
+                )
+            values, categories, ordered = (
+                values.codes.astype(np.int64),
+                values.categories,
+                values.ordered,
+            )
+            is_numeric = True
         else:
             values = values.astype(object)
 
@@ -574,6 +609,11 @@ def _cython_operation(
             result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
         elif is_datetimelike and kind == "aggregate":
             result = result.astype(orig_values.dtype)
+        elif is_categorical:
+            # re-create categories
+            result = Categorical.from_codes(
+                result, categories=categories, ordered=ordered,
+            )
 
         return result, names
 
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -466,7 +466,7 @@ def test_agg_cython_category_not_implemented_fallback():
     result = df.groupby("col_num").col_cat.first()
     expected = pd.Series(
         [1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat"
-    )
+    ).astype("category")
     tm.assert_series_equal(result, expected)
 
     result = df.groupby("col_num").agg({"col_cat": "first"})