diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 3ac004ef335ac..366dc2e39e16a 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -846,6 +846,7 @@ Performance improvements
 - Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`)
 - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`)
 - Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsets (:issue:`35296`)
+- Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`)

 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:
@@ -1020,6 +1021,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Resampler.size` caused a wide :class:`DataFrame` to be returned instead of a :class:`Series` with :class:`MultiIndex` (:issue:`46826`)
 - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"idxmin"`` and ``"idxmax"`` arguments (:issue:`45986`)
 - Bug in :class:`.DataFrameGroupBy` would raise when used with an empty DataFrame, categorical grouper, and ``dropna=False`` (:issue:`50634`)
+- Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
 -

 Reshaping
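Both release-note entries above come out of the same refactor: SeriesGroupBy.value_counts is rerouted through a shared implementation. A minimal sketch (not part of the patch; the data shape, sizes, and column names are invented) of the workload the performance entry targets, counting categorical values per group:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 1_000_000
df = pd.DataFrame(
    {
        "key": rng.integers(0, 1_000, n),
        "val": pd.Categorical(rng.choice(["a", "b", "c", "d"], n)),
    }
)

# After this patch the call below is served by the shared GroupBy._value_counts
# (essentially a groupby .size() plus reindexing), rather than the legacy
# categorical handling in SeriesGroupBy.value_counts that is bypassed below.
counts = df.groupby("key")["val"].value_counts()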
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 15e11aea4b65b..02222fa3c8399 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -75,7 +75,6 @@
     reconstruct_func,
     validate_func_kwargs,
 )
-from pandas.core.arrays.categorical import Categorical
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
 from pandas.core.groupby import base
@@ -86,14 +85,12 @@
     _apply_docs,
     _transform_template,
 )
-from pandas.core.groupby.grouper import get_grouper
 from pandas.core.indexes.api import (
     Index,
     MultiIndex,
     all_indexes_same,
     default_index,
 )
-from pandas.core.indexes.category import CategoricalIndex
 from pandas.core.series import Series
 from pandas.core.util.numba_ import maybe_use_numba

@@ -647,6 +644,12 @@ def value_counts(
         bins=None,
         dropna: bool = True,
     ) -> Series:
+        if bins is None:
+            result = self._value_counts(
+                normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
+            )
+            assert isinstance(result, Series)
+            return result
         from pandas.core.reshape.merge import get_join_indexers
         from pandas.core.reshape.tile import cut

@@ -2224,122 +2227,7 @@ def value_counts(
         3    male    low       US        0.25
         4    male    medium    FR        0.25
         """
-        if self.axis == 1:
-            raise NotImplementedError(
-                "DataFrameGroupBy.value_counts only handles axis=0"
-            )
-
-        with self._group_selection_context():
-            df = self.obj
-
-            in_axis_names = {
-                grouping.name for grouping in self.grouper.groupings if grouping.in_axis
-            }
-            if isinstance(self._selected_obj, Series):
-                name = self._selected_obj.name
-                keys = [] if name in in_axis_names else [self._selected_obj]
-            else:
-                unique_cols = set(self._selected_obj.columns)
-                if subset is not None:
-                    subsetted = set(subset)
-                    clashing = subsetted & set(in_axis_names)
-                    if clashing:
-                        raise ValueError(
-                            f"Keys {clashing} in subset cannot be in "
-                            "the groupby column keys."
-                        )
-                    doesnt_exist = subsetted - unique_cols
-                    if doesnt_exist:
-                        raise ValueError(
-                            f"Keys {doesnt_exist} in subset do not "
-                            f"exist in the DataFrame."
-                        )
-                else:
-                    subsetted = unique_cols
-
-                keys = [
-                    # Can't use .values because the column label needs to be preserved
-                    self._selected_obj.iloc[:, idx]
-                    for idx, name in enumerate(self._selected_obj.columns)
-                    if name not in in_axis_names and name in subsetted
-                ]
-
-            groupings = list(self.grouper.groupings)
-            for key in keys:
-                grouper, _, _ = get_grouper(
-                    df,
-                    key=key,
-                    axis=self.axis,
-                    sort=self.sort,
-                    observed=False,
-                    dropna=dropna,
-                )
-                groupings += list(grouper.groupings)
-
-            # Take the size of the overall columns
-            gb = df.groupby(
-                groupings,
-                sort=self.sort,
-                observed=self.observed,
-                dropna=self.dropna,
-            )
-            result_series = cast(Series, gb.size())
-
-            # GH-46357 Include non-observed categories
-            # of non-grouping columns regardless of `observed`
-            if any(
-                isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
-                and not grouping._observed
-                for grouping in groupings
-            ):
-                levels_list = [ping.result_index for ping in groupings]
-                multi_index, _ = MultiIndex.from_product(
-                    levels_list, names=[ping.name for ping in groupings]
-                ).sortlevel()
-                result_series = result_series.reindex(multi_index, fill_value=0)
-
-            if normalize:
-                # Normalize the results by dividing by the original group sizes.
-                # We are guaranteed to have the first N levels be the
-                # user-requested grouping.
-                levels = list(
-                    range(len(self.grouper.groupings), result_series.index.nlevels)
-                )
-                indexed_group_size = result_series.groupby(
-                    result_series.index.droplevel(levels),
-                    sort=self.sort,
-                    dropna=self.dropna,
-                ).transform("sum")
-                result_series /= indexed_group_size
-
-                # Handle groups of non-observed categories
-                result_series = result_series.fillna(0.0)
-
-            if sort:
-                # Sort the values and then resort by the main grouping
-                index_level = range(len(self.grouper.groupings))
-                result_series = result_series.sort_values(
-                    ascending=ascending
-                ).sort_index(level=index_level, sort_remaining=False)
-
-            result: Series | DataFrame
-            if self.as_index:
-                result = result_series
-            else:
-                # Convert to frame
-                name = "proportion" if normalize else "count"
-                index = result_series.index
-                columns = com.fill_missing_names(index.names)
-                if name in columns:
-                    raise ValueError(
-                        f"Column label '{name}' is duplicate of result column"
-                    )
-                result_series.name = name
-                result_series.index = index.set_names(range(len(columns)))
-                result_frame = result_series.reset_index()
-                result_frame.columns = columns + [name]
-                result = result_frame
-        return result.__finalize__(self.obj, method="value_counts")
+        return self._value_counts(subset, normalize, sort, ascending, dropna)

     def fillna(
         self,
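The early return added to SeriesGroupBy.value_counts above only reroutes the bins-is-None case; when a bins argument is given, the call still falls through to the legacy cut/get_join_indexers path, so interval-binned counting is unchanged. A minimal sketch of both entry points (toy data, not taken from the patch):

import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2, 2], "val": [0.1, 0.4, 0.6, 0.9]})
gb = df.groupby("key")["val"]

# bins=None: dispatched to the shared GroupBy._value_counts defined below
per_group_counts = gb.value_counts()

# bins given: served by the pre-existing cut-based implementation
binned_counts = gb.value_counts(bins=2)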
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index dd7600c4dbf89..ca61cc72ef927 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -113,6 +113,7 @@ class providing the base-class of operations.
     numba_,
     ops,
 )
+from pandas.core.groupby.grouper import get_grouper
 from pandas.core.groupby.indexing import (
     GroupByIndexingMixin,
     GroupByNthSelector,
@@ -923,8 +924,6 @@ def __init__(
         self.dropna = dropna

         if grouper is None:
-            from pandas.core.groupby.grouper import get_grouper
-
             grouper, exclusions, obj = get_grouper(
                 obj,
                 keys,
@@ -2160,6 +2159,138 @@ def var(
             ddof=ddof,
         )

+    @final
+    def _value_counts(
+        self,
+        subset: Sequence[Hashable] | None = None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ) -> DataFrame | Series:
+        """
+        Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.
+
+        SeriesGroupBy additionally supports a bins argument. See the docstring of
+        DataFrameGroupBy.value_counts for a description of arguments.
+        """
+        if self.axis == 1:
+            raise NotImplementedError(
+                "DataFrameGroupBy.value_counts only handles axis=0"
+            )
+
+        with self._group_selection_context():
+            df = self.obj
+
+            in_axis_names = {
+                grouping.name for grouping in self.grouper.groupings if grouping.in_axis
+            }
+            if isinstance(self._selected_obj, Series):
+                name = self._selected_obj.name
+                keys = [] if name in in_axis_names else [self._selected_obj]
+            else:
+                unique_cols = set(self._selected_obj.columns)
+                if subset is not None:
+                    subsetted = set(subset)
+                    clashing = subsetted & set(in_axis_names)
+                    if clashing:
+                        raise ValueError(
+                            f"Keys {clashing} in subset cannot be in "
+                            "the groupby column keys."
+                        )
+                    doesnt_exist = subsetted - unique_cols
+                    if doesnt_exist:
+                        raise ValueError(
+                            f"Keys {doesnt_exist} in subset do not "
+                            f"exist in the DataFrame."
+                        )
+                else:
+                    subsetted = unique_cols
+
+                keys = [
+                    # Can't use .values because the column label needs to be preserved
+                    self._selected_obj.iloc[:, idx]
+                    for idx, name in enumerate(self._selected_obj.columns)
+                    if name not in in_axis_names and name in subsetted
+                ]
+
+            groupings = list(self.grouper.groupings)
+            for key in keys:
+                grouper, _, _ = get_grouper(
+                    df,
+                    key=key,
+                    axis=self.axis,
+                    sort=self.sort,
+                    observed=False,
+                    dropna=dropna,
+                )
+                groupings += list(grouper.groupings)
+
+            # Take the size of the overall columns
+            gb = df.groupby(
+                groupings,
+                sort=self.sort,
+                observed=self.observed,
+                dropna=self.dropna,
+            )
+            result_series = cast(Series, gb.size())
+
+            # GH-46357 Include non-observed categories
+            # of non-grouping columns regardless of `observed`
+            if any(
+                isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
+                and not grouping._observed
+                for grouping in groupings
+            ):
+                levels_list = [ping.result_index for ping in groupings]
+                multi_index, _ = MultiIndex.from_product(
+                    levels_list, names=[ping.name for ping in groupings]
+                ).sortlevel()
+                result_series = result_series.reindex(multi_index, fill_value=0)
+
+            if normalize:
+                # Normalize the results by dividing by the original group sizes.
+                # We are guaranteed to have the first N levels be the
+                # user-requested grouping.
+                levels = list(
+                    range(len(self.grouper.groupings), result_series.index.nlevels)
+                )
+                indexed_group_size = result_series.groupby(
+                    result_series.index.droplevel(levels),
+                    sort=self.sort,
+                    dropna=self.dropna,
+                ).transform("sum")
+                result_series /= indexed_group_size
+
+                # Handle groups of non-observed categories
+                result_series = result_series.fillna(0.0)
+
+            if sort:
+                # Sort the values and then resort by the main grouping
+                index_level = range(len(self.grouper.groupings))
+                result_series = result_series.sort_values(
+                    ascending=ascending
+                ).sort_index(level=index_level, sort_remaining=False)
+
+            result: Series | DataFrame
+            if self.as_index:
+                result = result_series
+            else:
+                # Convert to frame
+                name = "proportion" if normalize else "count"
+                index = result_series.index
+                columns = com.fill_missing_names(index.names)
+                if name in columns:
+                    raise ValueError(
+                        f"Column label '{name}' is duplicate of result column"
+                    )
+                result_series.name = name
+                result_series.index = index.set_names(range(len(columns)))
+                result_frame = result_series.reset_index()
+                result_frame.columns = columns + [name]
+                result = result_frame
+        return result.__finalize__(self.obj, method="value_counts")
+
     @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
@@ -2944,8 +3075,6 @@ def _nth(

         # create a grouper with the original parameters, but on dropped
         # object
-        from pandas.core.groupby.grouper import get_grouper
-
         grouper, _, _ = get_grouper(
             dropped,
             key=self.keys,
diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py
index 476eb4384fd76..f67fea9cd6c0e 100644
--- a/pandas/tests/groupby/test_value_counts.py
+++ b/pandas/tests/groupby/test_value_counts.py
@@ -199,6 +199,26 @@ def test_series_groupby_value_counts_on_categorical():
     tm.assert_series_equal(result, expected)


+def test_series_groupby_value_counts_no_sort():
+    # GH#50482
+    df = DataFrame(
+        {
+            "gender": ["male", "male", "female", "male", "female", "male"],
+            "education": ["low", "medium", "high", "low", "high", "low"],
+            "country": ["US", "FR", "US", "FR", "FR", "FR"],
+        }
+    )
+    gb = df.groupby(["country", "gender"], sort=False)["education"]
+    result = gb.value_counts(sort=False)
+    index = MultiIndex(
+        levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]],
+        codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]],
+        names=["country", "gender", "education"],
+    )
+    expected = Series([1, 1, 1, 2, 1], index=index, name="education")
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.fixture
 def education_df():
     return DataFrame(
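The new regression test pins down GH#50482. As a standalone repro using the same data, sort=False now keeps the first-observed order of groups and values instead of reordering by count; the commented output below is read off the expected MultiIndex and values in the test:

import pandas as pd

df = pd.DataFrame(
    {
        "gender": ["male", "male", "female", "male", "female", "male"],
        "education": ["low", "medium", "high", "low", "high", "low"],
        "country": ["US", "FR", "US", "FR", "FR", "FR"],
    }
)
gb = df.groupby(["country", "gender"], sort=False)["education"]

# Before the fix, sort=False was ignored and the result came back ordered by
# count. With the shared implementation the result matches the test above:
# country  gender  education
# US       male    low          1
# FR       male    medium       1
# US       female  high         1
# FR       male    low          2
#          female  high         1
# Name: education, dtype: int64
print(gb.value_counts(sort=False))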