ENH: Add DataFrameGroupBy.value_counts (#44267)

johnzangwill · web-flow · commit 539545ba0ad5 · 2021-12-19T18:25:41.000-05:00
diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst
@@ -122,6 +122,7 @@ application to columns of a specific data type.
    DataFrameGroupBy.skew
    DataFrameGroupBy.take
    DataFrameGroupBy.tshift
+   DataFrameGroupBy.value_counts
 
 The following methods are available only for ``SeriesGroupBy`` objects.
 
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -217,6 +217,7 @@ Other enhancements
 - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`)
 - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
 - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
+- Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`)
 - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`)
 - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`)
 - :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba <http://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`)
diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py
@@ -143,6 +143,7 @@ class OutputKey:
         "take",
         "transform",
         "sample",
+        "value_counts",
     ]
 )
 # Valid values  of `name` for `groupby.transform(name)`
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -17,6 +17,7 @@
     Iterable,
     Mapping,
     NamedTuple,
+    Sequence,
     TypeVar,
     Union,
     cast,
@@ -76,6 +77,7 @@
     _transform_template,
     warn_dropping_nuisance_columns_deprecated,
 )
+from pandas.core.groupby.grouper import get_grouper
 from pandas.core.indexes.api import (
     Index,
     MultiIndex,
@@ -1569,6 +1571,193 @@ def func(df):
 
     boxplot = boxplot_frame_groupby
 
+    def value_counts(
+        self,
+        subset: Sequence[Hashable] | None = None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ) -> DataFrame | Series:
+        """
+        Return a Series or DataFrame containing counts of unique rows.
+
+        .. versionadded:: 1.4.0
+
+        Parameters
+        ----------
+        subset : list-like, optional
+            Columns to use when counting unique combinations.
+        normalize : bool, default False
+            Return proportions rather than frequencies.
+        sort : bool, default True
+            Sort by frequencies.
+        ascending : bool, default False
+            Sort in ascending order.
+        dropna : bool, default True
+            Don’t include counts of rows that contain NA values.
+
+        Returns
+        -------
+        Series or DataFrame
+            Series if the groupby as_index is True, otherwise DataFrame.
+
+        See Also
+        --------
+        Series.value_counts: Equivalent method on Series.
+        DataFrame.value_counts: Equivalent method on DataFrame.
+        SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.
+
+        Notes
+        -----
+        - If the groupby as_index is True then the returned Series will have a
+          MultiIndex with one level per input column.
+        - If the groupby as_index is False then the returned DataFrame will have an
+          additional column with the value_counts. The column is labelled 'count' or
+          'proportion', depending on the ``normalize`` parameter.
+
+        By default, rows that contain any NA values are omitted from
+        the result.
+
+        By default, the result will be in descending order so that the
+        first element of each group is the most frequently-occurring row.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({
+        ...    'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
+        ...    'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
+        ...    'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
+        ... })
+
+        >>> df
+            gender 	education 	country
+        0 	male 	low 	    US
+        1 	male 	medium 	    FR
+        2 	female 	high 	    US
+        3 	male 	low 	    FR
+        4 	female 	high 	    FR
+        5 	male 	low 	    FR
+
+        >>> df.groupby('gender').value_counts()
+        gender  education  country
+        female  high       FR         1
+                           US         1
+        male    low        FR         2
+                           US         1
+                medium     FR         1
+        dtype: int64
+
+        >>> df.groupby('gender').value_counts(ascending=True)
+        gender  education  country
+        female  high       FR         1
+                           US         1
+        male    low        US         1
+                medium     FR         1
+                low        FR         2
+        dtype: int64
+
+        >>> df.groupby('gender').value_counts(normalize=True)
+        gender  education  country
+        female  high       FR         0.50
+                           US         0.50
+        male    low        FR         0.50
+                           US         0.25
+                medium     FR         0.25
+        dtype: float64
+
+        >>> df.groupby('gender', as_index=False).value_counts()
+           gender education country  count
+        0  female      high      FR      1
+        1  female      high      US      1
+        2    male       low      FR      2
+        3    male       low      US      1
+        4    male    medium      FR      1
+
+        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
+           gender education country  proportion
+        0  female      high      FR        0.50
+        1  female      high      US        0.50
+        2    male       low      FR        0.50
+        3    male       low      US        0.25
+        4    male    medium      FR        0.25
+        """
+        if self.axis == 1:
+            raise NotImplementedError(
+                "DataFrameGroupBy.value_counts only handles axis=0"
+            )
+
+        with self._group_selection_context():
+            df = self.obj
+
+            in_axis_names = {
+                grouping.name for grouping in self.grouper.groupings if grouping.in_axis
+            }
+            if isinstance(self._selected_obj, Series):
+                name = self._selected_obj.name
+                keys = [] if name in in_axis_names else [self._selected_obj]
+            else:
+                keys = [
+                    # Can't use .values because the column label needs to be preserved
+                    self._selected_obj.iloc[:, idx]
+                    for idx, name in enumerate(self._selected_obj.columns)
+                    if name not in in_axis_names
+                ]
+
+            if subset is not None:
+                clashing = set(subset) & set(in_axis_names)
+                if clashing:
+                    raise ValueError(
+                        f"Keys {clashing} in subset cannot be in "
+                        "the groupby column keys"
+                    )
+
+            groupings = list(self.grouper.groupings)
+            for key in keys:
+                grouper, _, _ = get_grouper(
+                    df,
+                    key=key,
+                    axis=self.axis,
+                    sort=self.sort,
+                    dropna=dropna,
+                )
+                groupings += list(grouper.groupings)
+
+            # Take the size of the overall columns
+            gb = df.groupby(
+                groupings,
+                sort=self.sort,
+                observed=self.observed,
+                dropna=self.dropna,
+            )
+            result = cast(Series, gb.size())
+
+            if normalize:
+                # Normalize the results by dividing by the original group sizes.
+                # We are guaranteed to have the first N levels be the
+                # user-requested grouping.
+                levels = list(range(len(self.grouper.groupings), result.index.nlevels))
+                indexed_group_size = result.groupby(
+                    result.index.droplevel(levels),
+                    sort=self.sort,
+                    observed=self.observed,
+                    dropna=self.dropna,
+                ).transform("sum")
+
+                result /= indexed_group_size
+
+            if sort:
+                # Sort the values and then resort by the main grouping
+                index_level = range(len(self.grouper.groupings))
+                result = result.sort_values(ascending=ascending).sort_index(
+                    level=index_level, sort_remaining=False
+                )
+
+            if not self.as_index:
+                # Convert to frame
+                result = result.reset_index(name="proportion" if normalize else "count")
+            return result.__finalize__(self.obj, method="value_counts")
+
 
 def _wrap_transform_general_frame(
     obj: DataFrame, group: DataFrame, res: DataFrame | Series
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -800,7 +800,7 @@ def get_grouper(
 
     # what are we after, exactly?
     any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
-    any_groupers = any(isinstance(g, Grouper) for g in keys)
+    any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
     any_arraylike = any(
         isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
     )
diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py
@@ -319,6 +319,7 @@ def test_tab_completion(mframe):
         "pipe",
         "sample",
         "ewm",
+        "value_counts",
     }
     assert results == expected
 
diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py

Original file line number	Diff line number	Diff line change
`@@ -143,6 +143,7 @@ class OutputKey:`
`143`	`143`	`"take",`
`144`	`144`	`"transform",`
`145`	`145`	`"sample",`
	`146`	`+ "value_counts",`
`146`	`147`	`]`
`147`	`148`	`)`
`148`	`149`	# Valid values of `name` for `groupby.transform(name)`
Original file line number	Diff line number	Diff line change
`@@ -800,7 +800,7 @@ def get_grouper(`
`800`	`800`
`801`	`801`	`# what are we after, exactly?`
`802`	`802`	`any_callable = any(callable(g) or isinstance(g, dict) for g in keys)`
`803`		`- any_groupers = any(isinstance(g, Grouper) for g in keys)`
	`803`	`+ any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)`
`804`	`804`	`any_arraylike = any(`
`805`	`805`	`isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys`
`806`	`806`	`)`
Original file line number	Diff line number	Diff line change
`@@ -319,6 +319,7 @@ def test_tab_completion(mframe):`
`319`	`319`	`"pipe",`
`320`	`320`	`"sample",`
`321`	`321`	`"ewm",`
	`322`	`+ "value_counts",`
`322`	`323`	`}`
`323`	`324`	`assert results == expected`
`324`	`325`