From 8d7922b551a78b98a3d020995ef22256a062c996 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 3 Jan 2023 20:50:44 -0500 Subject: [PATCH 1/3] BUG/PERF: SeriesGroupBy.value_counts sorting bug and categorical performance --- doc/source/whatsnew/v2.0.0.rst | 2 + pandas/core/groupby/generic.py | 126 ++------------------ pandas/core/groupby/groupby.py | 133 ++++++++++++++++++++++ pandas/tests/groupby/test_value_counts.py | 20 ++++ 4 files changed, 162 insertions(+), 119 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 2f0ed266a02ab..3cd143a188ed0 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -784,6 +784,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`) - Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`) - Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) +- Performance improvement in :meth:`.SeriesGroupBy.value_count` with categorical dtype (:issue:`46202`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: @@ -949,6 +950,7 @@ Groupby/resample/rolling - Bug in :meth:`.SeriesGroupBy.nth` would raise when grouper contained NA values after subsetting from a :class:`DataFrameGroupBy` (:issue:`26454`) - Bug in :meth:`DataFrame.groupby` would not include a :class:`.Grouper` specified by ``key`` in the result when ``as_index=False`` (:issue:`50413`) - Bug in :meth:`.DataFrameGrouBy.value_counts` would raise when used with a :class:`.TimeGrouper` (:issue:`50486`) +- Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`) - Reshaping diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 955f65585963d..a47101285395c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -75,7 +75,6 @@ reconstruct_func, validate_func_kwargs, ) -from pandas.core.arrays.categorical import Categorical import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.groupby import base @@ -86,14 +85,12 @@ _apply_docs, _transform_template, ) -from pandas.core.groupby.grouper import get_grouper from pandas.core.indexes.api import ( Index, MultiIndex, all_indexes_same, default_index, ) -from pandas.core.indexes.category import CategoricalIndex from pandas.core.series import Series from pandas.core.shared_docs import _shared_docs from pandas.core.util.numba_ import maybe_use_numba @@ -648,6 +645,12 @@ def value_counts( bins=None, dropna: bool = True, ) -> Series: + if bins is None: + result = self._value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + assert isinstance(result, Series) + return result from pandas.core.reshape.merge import get_join_indexers from pandas.core.reshape.tile import cut @@ -2023,122 +2026,7 @@ def value_counts( 3 male low US 0.25 4 male medium FR 0.25 """ - if self.axis == 1: - raise NotImplementedError( - "DataFrameGroupBy.value_counts only handles axis=0" - ) - - with self._group_selection_context(): - df = self.obj - - in_axis_names = { - grouping.name for grouping in self.grouper.groupings if grouping.in_axis - } - if isinstance(self._selected_obj, Series): - name = self._selected_obj.name - keys = [] if name in in_axis_names else [self._selected_obj] - else: - unique_cols = set(self._selected_obj.columns) - if subset is not None: - subsetted = set(subset) - clashing = subsetted & set(in_axis_names) - if clashing: - raise ValueError( - f"Keys {clashing} in subset cannot be in " - "the groupby column keys." - ) - doesnt_exist = subsetted - unique_cols - if doesnt_exist: - raise ValueError( - f"Keys {doesnt_exist} in subset do not " - f"exist in the DataFrame." - ) - else: - subsetted = unique_cols - - keys = [ - # Can't use .values because the column label needs to be preserved - self._selected_obj.iloc[:, idx] - for idx, name in enumerate(self._selected_obj.columns) - if name not in in_axis_names and name in subsetted - ] - - groupings = list(self.grouper.groupings) - for key in keys: - grouper, _, _ = get_grouper( - df, - key=key, - axis=self.axis, - sort=self.sort, - observed=False, - dropna=dropna, - ) - groupings += list(grouper.groupings) - - # Take the size of the overall columns - gb = df.groupby( - groupings, - sort=self.sort, - observed=self.observed, - dropna=self.dropna, - ) - result_series = cast(Series, gb.size()) - - # GH-46357 Include non-observed categories - # of non-grouping columns regardless of `observed` - if any( - isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) - and not grouping._observed - for grouping in groupings - ): - levels_list = [ping.result_index for ping in groupings] - multi_index, _ = MultiIndex.from_product( - levels_list, names=[ping.name for ping in groupings] - ).sortlevel() - result_series = result_series.reindex(multi_index, fill_value=0) - - if normalize: - # Normalize the results by dividing by the original group sizes. - # We are guaranteed to have the first N levels be the - # user-requested grouping. - levels = list( - range(len(self.grouper.groupings), result_series.index.nlevels) - ) - indexed_group_size = result_series.groupby( - result_series.index.droplevel(levels), - sort=self.sort, - dropna=self.dropna, - ).transform("sum") - result_series /= indexed_group_size - - # Handle groups of non-observed categories - result_series = result_series.fillna(0.0) - - if sort: - # Sort the values and then resort by the main grouping - index_level = range(len(self.grouper.groupings)) - result_series = result_series.sort_values( - ascending=ascending - ).sort_index(level=index_level, sort_remaining=False) - - result: Series | DataFrame - if self.as_index: - result = result_series - else: - # Convert to frame - name = "proportion" if normalize else "count" - index = result_series.index - columns = com.fill_missing_names(index.names) - if name in columns: - raise ValueError( - f"Column label '{name}' is duplicate of result column" - ) - result_series.name = name - result_series.index = index.set_names(range(len(columns))) - result_frame = result_series.reset_index() - result_frame.columns = columns + [name] - result = result_frame - return result.__finalize__(self.obj, method="value_counts") + return self._value_counts(subset, normalize, sort, ascending, dropna) def fillna( self, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5061594708942..d3b3c624b76b8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -113,6 +113,7 @@ class providing the base-class of operations. numba_, ops, ) +from pandas.core.groupby.grouper import get_grouper from pandas.core.groupby.indexing import ( GroupByIndexingMixin, GroupByNthSelector, @@ -2157,6 +2158,138 @@ def var( ddof=ddof, ) + @final + def _value_counts( + self, + subset: Sequence[Hashable] | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrame | Series: + """ + Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy. + + SeriesGroupBy additionally supports a bins argument. See the docstring of + DataFrameGroupBy.value_counts for a description of arguments. + """ + if self.axis == 1: + raise NotImplementedError( + "DataFrameGroupBy.value_counts only handles axis=0" + ) + + with self._group_selection_context(): + df = self.obj + + in_axis_names = { + grouping.name for grouping in self.grouper.groupings if grouping.in_axis + } + if isinstance(self._selected_obj, Series): + name = self._selected_obj.name + keys = [] if name in in_axis_names else [self._selected_obj] + else: + unique_cols = set(self._selected_obj.columns) + if subset is not None: + subsetted = set(subset) + clashing = subsetted & set(in_axis_names) + if clashing: + raise ValueError( + f"Keys {clashing} in subset cannot be in " + "the groupby column keys." + ) + doesnt_exist = subsetted - unique_cols + if doesnt_exist: + raise ValueError( + f"Keys {doesnt_exist} in subset do not " + f"exist in the DataFrame." + ) + else: + subsetted = unique_cols + + keys = [ + # Can't use .values because the column label needs to be preserved + self._selected_obj.iloc[:, idx] + for idx, name in enumerate(self._selected_obj.columns) + if name not in in_axis_names and name in subsetted + ] + + groupings = list(self.grouper.groupings) + for key in keys: + grouper, _, _ = get_grouper( + df, + key=key, + axis=self.axis, + sort=self.sort, + observed=False, + dropna=dropna, + ) + groupings += list(grouper.groupings) + + # Take the size of the overall columns + gb = df.groupby( + groupings, + sort=self.sort, + observed=self.observed, + dropna=self.dropna, + ) + result_series = cast(Series, gb.size()) + + # GH-46357 Include non-observed categories + # of non-grouping columns regardless of `observed` + if any( + isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) + and not grouping._observed + for grouping in groupings + ): + levels_list = [ping.result_index for ping in groupings] + multi_index, _ = MultiIndex.from_product( + levels_list, names=[ping.name for ping in groupings] + ).sortlevel() + result_series = result_series.reindex(multi_index, fill_value=0) + + if normalize: + # Normalize the results by dividing by the original group sizes. + # We are guaranteed to have the first N levels be the + # user-requested grouping. + levels = list( + range(len(self.grouper.groupings), result_series.index.nlevels) + ) + indexed_group_size = result_series.groupby( + result_series.index.droplevel(levels), + sort=self.sort, + dropna=self.dropna, + ).transform("sum") + result_series /= indexed_group_size + + # Handle groups of non-observed categories + result_series = result_series.fillna(0.0) + + if sort: + # Sort the values and then resort by the main grouping + index_level = range(len(self.grouper.groupings)) + result_series = result_series.sort_values( + ascending=ascending + ).sort_index(level=index_level, sort_remaining=False) + + result: Series | DataFrame + if self.as_index: + result = result_series + else: + # Convert to frame + name = "proportion" if normalize else "count" + index = result_series.index + columns = com.fill_missing_names(index.names) + if name in columns: + raise ValueError( + f"Column label '{name}' is duplicate of result column" + ) + result_series.name = name + result_series.index = index.set_names(range(len(columns))) + result_frame = result_series.reset_index() + result_frame.columns = columns + [name] + result = result_frame + return result.__finalize__(self.obj, method="value_counts") + @final @Substitution(name="groupby") @Appender(_common_see_also) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 11ee13ec05fd6..f790e2ce592ce 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -194,3 +194,23 @@ def test_series_groupby_value_counts_on_categorical(): # dtype: int64 tm.assert_series_equal(result, expected) + + +def test_series_groupby_value_counts_no_sort(): + # GH#50482 + df = DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + gb = df.groupby(["country", "gender"], sort=False)["education"] + result = gb.value_counts(sort=False) + index = MultiIndex( + levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]], + codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]], + names=["country", "gender", "education"], + ) + expected = Series([1, 1, 1, 2, 1], index=index, name="education") + tm.assert_series_equal(result, expected) From ebc9c783d63012c7fcec73f4cda8c3f931c6dd73 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 10 Jan 2023 07:43:31 -0500 Subject: [PATCH 2/3] Update doc/source/whatsnew/v2.0.0.rst Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 56f494a662b10..7ac02110618bb 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -844,7 +844,7 @@ Performance improvements - Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) - Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsetes (:issue:`35296`) -- Performance improvement in :meth:`.SeriesGroupBy.value_count` with categorical dtype (:issue:`46202`) +- Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: From 22bb78cb696ef4fb7443936dedeaa3a122c897b3 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 10 Jan 2023 17:49:54 -0500 Subject: [PATCH 3/3] Remove unnecessary imports --- pandas/core/groupby/groupby.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7eb11867cfccf..ca61cc72ef927 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -924,8 +924,6 @@ def __init__( self.dropna = dropna if grouper is None: - from pandas.core.groupby.grouper import get_grouper - grouper, exclusions, obj = get_grouper( obj, keys, @@ -3077,8 +3075,6 @@ def _nth( # create a grouper with the original parameters, but on dropped # object - from pandas.core.groupby.grouper import get_grouper - grouper, _, _ = get_grouper( dropped, key=self.keys,