diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index cf6f3f92068e8..1152c2253c42d 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -677,6 +677,17 @@ def time_groupby_extra_cat_sort(self):
     def time_groupby_extra_cat_nosort(self):
         self.df_extra_cat.groupby("a", sort=False)["b"].count()
 
+    def time_series_groupby_value_counts_many_categories(self):
+        # slicing does not drop categories, so "a" keeps every category here
+        df = self.df_extra_cat[0:10**4]
+        df.groupby("b")["a"].value_counts()
+
+    def time_series_groupby_value_counts_few_categories(self):
+        # same rows, restricted to the categories that actually occur in them
+        df = self.df_extra_cat[0:10**4].copy()
+        df["a"] = df["a"].cat.remove_unused_categories()
+        df.groupby("b")["a"].value_counts()
+
 
 class Datelike:
     # GH 14338
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 931d18dc349f3..cd2126826866f 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -468,6 +468,7 @@ Performance improvements
 - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
+- Performance improvement in :meth:`SeriesGroupBy.value_counts` with categorical values (:issue:`46202`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.bug_fixes:
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 245e33fb1a23b..5b79e37586b42 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -609,12 +609,39 @@ def value_counts(
 
         names = self.grouper.names + [self.obj.name]
 
-        if is_categorical_dtype(val.dtype) or (
-            bins is not None and not np.iterable(bins)
-        ):
+        if is_categorical_dtype(val.dtype):
+            # GH38672, GH46202: count categorical values with a single
+            # DataFrameGroupBy.value_counts call on a one-column frame
+            # rather than a per-group apply of Series.value_counts
+            # NOTE(review): ``bins`` is not used on this path - confirm intended
+            df = self.obj.to_frame()
+            df.columns = Index([self.obj.name])
+            groupby = DataFrameGroupBy(
+                df,
+                self.grouper,
+                axis=self.axis,
+                level=self.level,
+                grouper=self.grouper,
+                exclusions=self.exclusions,
+                as_index=self.as_index,
+                sort=self.sort,
+                group_keys=self.group_keys,
+                squeeze=self.squeeze,
+                observed=self.observed,
+                mutated=self.mutated,
+                dropna=self.dropna,
+            )
+            # forward ``dropna`` so it is not silently reset to its default
+            ser = groupby.value_counts(
+                normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
+            )
+            # keep the result named after the grouped Series
+            ser.name = self.obj.name
+            return ser
+
+        if bins is not None and not np.iterable(bins):
             # scalar bins cannot be done at top level
             # in a backward compatible way
-            # GH38672 relates to categorical dtype
             ser = self.apply(
                 Series.value_counts,
                 normalize=normalize,