From 0fd3dde3f055aeb368eff7170ca6333d188eaa00 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Tue, 21 Apr 2020 23:32:05 -0700 Subject: [PATCH] PERF: fix #32976 slow group by for categorical columns Aggregate categorical codes with fast cython aggregation for select `how` operations. 8/1/20: rebase and move release note to 1.2 8/2/20: Update tests to expect categorical back 8/3/20: add PR as issue for whatsnew groupby api change --- asv_bench/benchmarks/groupby.py | 29 ++++++++++++++++ doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/core/groupby/ops.py | 43 +++++++++++++++++++++++- pandas/tests/groupby/test_categorical.py | 6 ++-- 4 files changed, 77 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 5ffda03fad80f..7c127b90c181c 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -6,6 +6,7 @@ from pandas import ( Categorical, + CategoricalDtype, DataFrame, MultiIndex, Series, @@ -473,6 +474,7 @@ def time_sum(self): class Categories: + # benchmark grouping by categoricals def setup(self): N = 10 ** 5 arr = np.random.random(N) @@ -510,6 +512,33 @@ def time_groupby_extra_cat_nosort(self): self.df_extra_cat.groupby("a", sort=False)["b"].count() +class CategoricalFrame: + # benchmark grouping with operations on categorical values (GH #32976) + param_names = ["groupby_type", "value_type", "agg_method"] + params = [(int,), (int, str), ("last", "head", "count")] + + def setup(self, groupby_type, value_type, agg_method): + SIZE = 100000 + GROUPS = 1000 + CARDINALITY = 10 + CAT = CategoricalDtype([value_type(i) for i in range(CARDINALITY)]) + df = DataFrame( + { + "group": [ + groupby_type(np.random.randint(0, GROUPS)) for i in range(SIZE) + ], + "cat": [np.random.choice(CAT.categories) for i in range(SIZE)], + } + ) + self.df_cat_values = df.astype({"cat": CAT}) + + def time_groupby(self, groupby_type, value_type, agg_method): + getattr(self.df_cat_values.groupby("group"), agg_method)() + + def time_groupby_ordered(self, groupby_type, value_type, agg_method): + getattr(self.df_cat_values.groupby("group", sort=True), agg_method)() + + class Datelike: # GH 14338 params = ["period_range", "date_range", "date_range_tz"] diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b16ca0a80c5b4..122a4b76ff0be 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -40,6 +40,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :meth:`DataFrame.groupby` when aggregating categorical data (:issue:`32976`) - - @@ -132,6 +133,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- :meth:`DataFrame.groupby` aggregations of categorical series will now return a :class:`Categorical` while preserving the codes and categories of the original series (:issue:`33739`) - - diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3aaeef3b63760..d2e8f2023e7bc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -40,6 +40,7 @@ from pandas.core.dtypes.missing import _maybe_fill, isna import pandas.core.algorithms as algorithms +from pandas.core.arrays.categorical import Categorical from pandas.core.base import SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -356,6 +357,29 @@ def get_group_levels(self) -> List[Index]: _name_functions = {"ohlc": ["open", "high", "low", "close"]} + _cat_method_blacklist = ( + "add", + "median", + "prod", + "sem", + "cumsum", + "sum", + "cummin", + "mean", + "max", + "skew", + "cumprod", + "cummax", + "rank", + "pct_change", + "min", + "var", + "mad", + "describe", + "std", + "quantile", + ) + def _is_builtin_func(self, arg): """ if we define a builtin function for this argument, return it, @@ -460,7 +484,7 @@ def _cython_operation( # categoricals are only 1d, so we # are not setup for dim transforming - if is_categorical_dtype(values.dtype) or is_sparse(values.dtype): + if is_sparse(values.dtype): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values.dtype): if how in ["add", "prod", "cumsum", "cumprod"]: @@ -481,6 +505,7 @@ def _cython_operation( is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) + is_categorical = is_categorical_dtype(values) if is_datetimelike: values = values.view("int64") @@ -496,6 +521,17 @@ def _cython_operation( values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) + elif is_categorical: + if how in self._cat_method_blacklist: + raise NotImplementedError( + f"{values.dtype} dtype not supported for `how` argument {how}" + ) + values, categories, ordered = ( + values.codes.astype(np.int64), + values.categories, + values.ordered, + ) + is_numeric = True else: values = values.astype(object) @@ -572,6 +608,11 @@ def _cython_operation( result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) + elif is_categorical: + # re-create categories + result = Categorical.from_codes( + result, categories=categories, ordered=ordered, + ) if is_extension_array_dtype(orig_values.dtype): result = maybe_cast_result(result=result, obj=orig_values, how=how) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0d447a70b540d..887e349fa51d5 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1505,7 +1505,9 @@ def test_groupy_first_returned_categorical_instead_of_dataframe(func): ) df_grouped = df.groupby("A")["B"] result = getattr(df_grouped, func)() - expected = pd.Series(["b"], index=pd.Index([1997], name="A"), name="B") + expected = pd.Series( + ["b"], index=pd.Index([1997], name="A"), name="B", dtype="category" + ).cat.as_ordered() tm.assert_series_equal(result, expected) @@ -1574,7 +1576,7 @@ def test_agg_cython_category_not_implemented_fallback(): result = df.groupby("col_num").col_cat.first() expected = pd.Series( [1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat" - ) + ).astype("category") tm.assert_series_equal(result, expected) result = df.groupby("col_num").agg({"col_cat": "first"})