Skip to content

Commit 4f42ecb

Browse files
BUG/PERF: SeriesGroupBy.value_counts sorting bug and categorical performance (pandas-dev#50548)
Co-authored-by: Simon Hawkins <[email protected]>
1 parent 9d8e704 commit 4f42ecb

File tree

4 files changed

+162
-123
lines changed

4 files changed

+162
-123
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line number | Diff line number | Diff line change
@@ -846,6 +846,7 @@ Performance improvements
846846
- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`)
847847
- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`)
848848
- Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsets (:issue:`35296`)
849+
- Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`)
849850

850851
.. ---------------------------------------------------------------------------
851852
.. _whatsnew_200.bug_fixes:
@@ -1020,6 +1021,7 @@ Groupby/resample/rolling
10201021
- Bug in :meth:`Resampler.size` caused a wide :class:`DataFrame` to be returned instead of a :class:`Series` with :class:`MultiIndex` (:issue:`46826`)
10211022
- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"idxmin"`` and ``"idxmax"`` arguments (:issue:`45986`)
10221023
- Bug in :class:`.DataFrameGroupBy` would raise when used with an empty DataFrame, categorical grouper, and ``dropna=False`` (:issue:`50634`)
1024+
- Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
10231025
-
10241026

10251027
Reshaping

pandas/core/groupby/generic.py

+7-119
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,6 @@
7575
reconstruct_func,
7676
validate_func_kwargs,
7777
)
78-
from pandas.core.arrays.categorical import Categorical
7978
import pandas.core.common as com
8079
from pandas.core.frame import DataFrame
8180
from pandas.core.groupby import base
@@ -86,14 +85,12 @@
8685
_apply_docs,
8786
_transform_template,
8887
)
89-
from pandas.core.groupby.grouper import get_grouper
9088
from pandas.core.indexes.api import (
9189
Index,
9290
MultiIndex,
9391
all_indexes_same,
9492
default_index,
9593
)
96-
from pandas.core.indexes.category import CategoricalIndex
9794
from pandas.core.series import Series
9895
from pandas.core.util.numba_ import maybe_use_numba
9996

@@ -647,6 +644,12 @@ def value_counts(
647644
bins=None,
648645
dropna: bool = True,
649646
) -> Series:
647+
if bins is None:
648+
result = self._value_counts(
649+
normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
650+
)
651+
assert isinstance(result, Series)
652+
return result
650653

651654
from pandas.core.reshape.merge import get_join_indexers
652655
from pandas.core.reshape.tile import cut
@@ -2224,122 +2227,7 @@ def value_counts(
22242227
3 male low US 0.25
22252228
4 male medium FR 0.25
22262229
"""
2227-
if self.axis == 1:
2228-
raise NotImplementedError(
2229-
"DataFrameGroupBy.value_counts only handles axis=0"
2230-
)
2231-
2232-
with self._group_selection_context():
2233-
df = self.obj
2234-
2235-
in_axis_names = {
2236-
grouping.name for grouping in self.grouper.groupings if grouping.in_axis
2237-
}
2238-
if isinstance(self._selected_obj, Series):
2239-
name = self._selected_obj.name
2240-
keys = [] if name in in_axis_names else [self._selected_obj]
2241-
else:
2242-
unique_cols = set(self._selected_obj.columns)
2243-
if subset is not None:
2244-
subsetted = set(subset)
2245-
clashing = subsetted & set(in_axis_names)
2246-
if clashing:
2247-
raise ValueError(
2248-
f"Keys {clashing} in subset cannot be in "
2249-
"the groupby column keys."
2250-
)
2251-
doesnt_exist = subsetted - unique_cols
2252-
if doesnt_exist:
2253-
raise ValueError(
2254-
f"Keys {doesnt_exist} in subset do not "
2255-
f"exist in the DataFrame."
2256-
)
2257-
else:
2258-
subsetted = unique_cols
2259-
2260-
keys = [
2261-
# Can't use .values because the column label needs to be preserved
2262-
self._selected_obj.iloc[:, idx]
2263-
for idx, name in enumerate(self._selected_obj.columns)
2264-
if name not in in_axis_names and name in subsetted
2265-
]
2266-
2267-
groupings = list(self.grouper.groupings)
2268-
for key in keys:
2269-
grouper, _, _ = get_grouper(
2270-
df,
2271-
key=key,
2272-
axis=self.axis,
2273-
sort=self.sort,
2274-
observed=False,
2275-
dropna=dropna,
2276-
)
2277-
groupings += list(grouper.groupings)
2278-
2279-
# Take the size of the overall columns
2280-
gb = df.groupby(
2281-
groupings,
2282-
sort=self.sort,
2283-
observed=self.observed,
2284-
dropna=self.dropna,
2285-
)
2286-
result_series = cast(Series, gb.size())
2287-
2288-
# GH-46357 Include non-observed categories
2289-
# of non-grouping columns regardless of `observed`
2290-
if any(
2291-
isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
2292-
and not grouping._observed
2293-
for grouping in groupings
2294-
):
2295-
levels_list = [ping.result_index for ping in groupings]
2296-
multi_index, _ = MultiIndex.from_product(
2297-
levels_list, names=[ping.name for ping in groupings]
2298-
).sortlevel()
2299-
result_series = result_series.reindex(multi_index, fill_value=0)
2300-
2301-
if normalize:
2302-
# Normalize the results by dividing by the original group sizes.
2303-
# We are guaranteed to have the first N levels be the
2304-
# user-requested grouping.
2305-
levels = list(
2306-
range(len(self.grouper.groupings), result_series.index.nlevels)
2307-
)
2308-
indexed_group_size = result_series.groupby(
2309-
result_series.index.droplevel(levels),
2310-
sort=self.sort,
2311-
dropna=self.dropna,
2312-
).transform("sum")
2313-
result_series /= indexed_group_size
2314-
2315-
# Handle groups of non-observed categories
2316-
result_series = result_series.fillna(0.0)
2317-
2318-
if sort:
2319-
# Sort the values and then resort by the main grouping
2320-
index_level = range(len(self.grouper.groupings))
2321-
result_series = result_series.sort_values(
2322-
ascending=ascending
2323-
).sort_index(level=index_level, sort_remaining=False)
2324-
2325-
result: Series | DataFrame
2326-
if self.as_index:
2327-
result = result_series
2328-
else:
2329-
# Convert to frame
2330-
name = "proportion" if normalize else "count"
2331-
index = result_series.index
2332-
columns = com.fill_missing_names(index.names)
2333-
if name in columns:
2334-
raise ValueError(
2335-
f"Column label '{name}' is duplicate of result column"
2336-
)
2337-
result_series.name = name
2338-
result_series.index = index.set_names(range(len(columns)))
2339-
result_frame = result_series.reset_index()
2340-
result_frame.columns = columns + [name]
2341-
result = result_frame
2342-
return result.__finalize__(self.obj, method="value_counts")
2230+
return self._value_counts(subset, normalize, sort, ascending, dropna)
23432231

23442232
def fillna(
23452233
self,

pandas/core/groupby/groupby.py

+133-4
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,7 @@ class providing the base-class of operations.
113113
numba_,
114114
ops,
115115
)
116+
from pandas.core.groupby.grouper import get_grouper
116117
from pandas.core.groupby.indexing import (
117118
GroupByIndexingMixin,
118119
GroupByNthSelector,
@@ -923,8 +924,6 @@ def __init__(
923924
self.dropna = dropna
924925

925926
if grouper is None:
926-
from pandas.core.groupby.grouper import get_grouper
927-
928927
grouper, exclusions, obj = get_grouper(
929928
obj,
930929
keys,
@@ -2160,6 +2159,138 @@ def var(
21602159
ddof=ddof,
21612160
)
21622161

2162+
@final
2163+
def _value_counts(
2164+
self,
2165+
subset: Sequence[Hashable] | None = None,
2166+
normalize: bool = False,
2167+
sort: bool = True,
2168+
ascending: bool = False,
2169+
dropna: bool = True,
2170+
) -> DataFrame | Series:
2171+
"""
2172+
Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.
2173+
2174+
SeriesGroupBy additionally supports a bins argument. See the docstring of
2175+
DataFrameGroupBy.value_counts for a description of arguments.
2176+
"""
2177+
if self.axis == 1:
2178+
raise NotImplementedError(
2179+
"DataFrameGroupBy.value_counts only handles axis=0"
2180+
)
2181+
2182+
with self._group_selection_context():
2183+
df = self.obj
2184+
2185+
in_axis_names = {
2186+
grouping.name for grouping in self.grouper.groupings if grouping.in_axis
2187+
}
2188+
if isinstance(self._selected_obj, Series):
2189+
name = self._selected_obj.name
2190+
keys = [] if name in in_axis_names else [self._selected_obj]
2191+
else:
2192+
unique_cols = set(self._selected_obj.columns)
2193+
if subset is not None:
2194+
subsetted = set(subset)
2195+
clashing = subsetted & set(in_axis_names)
2196+
if clashing:
2197+
raise ValueError(
2198+
f"Keys {clashing} in subset cannot be in "
2199+
"the groupby column keys."
2200+
)
2201+
doesnt_exist = subsetted - unique_cols
2202+
if doesnt_exist:
2203+
raise ValueError(
2204+
f"Keys {doesnt_exist} in subset do not "
2205+
f"exist in the DataFrame."
2206+
)
2207+
else:
2208+
subsetted = unique_cols
2209+
2210+
keys = [
2211+
# Can't use .values because the column label needs to be preserved
2212+
self._selected_obj.iloc[:, idx]
2213+
for idx, name in enumerate(self._selected_obj.columns)
2214+
if name not in in_axis_names and name in subsetted
2215+
]
2216+
2217+
groupings = list(self.grouper.groupings)
2218+
for key in keys:
2219+
grouper, _, _ = get_grouper(
2220+
df,
2221+
key=key,
2222+
axis=self.axis,
2223+
sort=self.sort,
2224+
observed=False,
2225+
dropna=dropna,
2226+
)
2227+
groupings += list(grouper.groupings)
2228+
2229+
# Take the size of the overall columns
2230+
gb = df.groupby(
2231+
groupings,
2232+
sort=self.sort,
2233+
observed=self.observed,
2234+
dropna=self.dropna,
2235+
)
2236+
result_series = cast(Series, gb.size())
2237+
2238+
# GH-46357 Include non-observed categories
2239+
# of non-grouping columns regardless of `observed`
2240+
if any(
2241+
isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
2242+
and not grouping._observed
2243+
for grouping in groupings
2244+
):
2245+
levels_list = [ping.result_index for ping in groupings]
2246+
multi_index, _ = MultiIndex.from_product(
2247+
levels_list, names=[ping.name for ping in groupings]
2248+
).sortlevel()
2249+
result_series = result_series.reindex(multi_index, fill_value=0)
2250+
2251+
if normalize:
2252+
# Normalize the results by dividing by the original group sizes.
2253+
# We are guaranteed to have the first N levels be the
2254+
# user-requested grouping.
2255+
levels = list(
2256+
range(len(self.grouper.groupings), result_series.index.nlevels)
2257+
)
2258+
indexed_group_size = result_series.groupby(
2259+
result_series.index.droplevel(levels),
2260+
sort=self.sort,
2261+
dropna=self.dropna,
2262+
).transform("sum")
2263+
result_series /= indexed_group_size
2264+
2265+
# Handle groups of non-observed categories
2266+
result_series = result_series.fillna(0.0)
2267+
2268+
if sort:
2269+
# Sort the values and then resort by the main grouping
2270+
index_level = range(len(self.grouper.groupings))
2271+
result_series = result_series.sort_values(
2272+
ascending=ascending
2273+
).sort_index(level=index_level, sort_remaining=False)
2274+
2275+
result: Series | DataFrame
2276+
if self.as_index:
2277+
result = result_series
2278+
else:
2279+
# Convert to frame
2280+
name = "proportion" if normalize else "count"
2281+
index = result_series.index
2282+
columns = com.fill_missing_names(index.names)
2283+
if name in columns:
2284+
raise ValueError(
2285+
f"Column label '{name}' is duplicate of result column"
2286+
)
2287+
result_series.name = name
2288+
result_series.index = index.set_names(range(len(columns)))
2289+
result_frame = result_series.reset_index()
2290+
result_frame.columns = columns + [name]
2291+
result = result_frame
2292+
return result.__finalize__(self.obj, method="value_counts")
2293+
21632294
@final
21642295
@Substitution(name="groupby")
21652296
@Appender(_common_see_also)
@@ -2944,8 +3075,6 @@ def _nth(
29443075

29453076
# create a grouper with the original parameters, but on dropped
29463077
# object
2947-
from pandas.core.groupby.grouper import get_grouper
2948-
29493078
grouper, _, _ = get_grouper(
29503079
dropped,
29513080
key=self.keys,

pandas/tests/groupby/test_value_counts.py

+20
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,26 @@ def test_series_groupby_value_counts_on_categorical():
199199
tm.assert_series_equal(result, expected)
200200

201201

202+
def test_series_groupby_value_counts_no_sort():
203+
# GH#50482
204+
df = DataFrame(
205+
{
206+
"gender": ["male", "male", "female", "male", "female", "male"],
207+
"education": ["low", "medium", "high", "low", "high", "low"],
208+
"country": ["US", "FR", "US", "FR", "FR", "FR"],
209+
}
210+
)
211+
gb = df.groupby(["country", "gender"], sort=False)["education"]
212+
result = gb.value_counts(sort=False)
213+
index = MultiIndex(
214+
levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]],
215+
codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]],
216+
names=["country", "gender", "education"],
217+
)
218+
expected = Series([1, 1, 1, 2, 1], index=index, name="education")
219+
tm.assert_series_equal(result, expected)
220+
221+
202222
@pytest.fixture
203223
def education_df():
204224
return DataFrame(

0 commit comments

Comments
 (0)