Skip to content

Commit 539545b

Browse files
authored
ENH: Add DataFrameGroupBy.value_counts (#44267)
1 parent f2639df commit 539545b

File tree

7 files changed

+638
-1
lines changed

7 files changed

+638
-1
lines changed

doc/source/reference/groupby.rst

+1
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ application to columns of a specific data type.
122122
DataFrameGroupBy.skew
123123
DataFrameGroupBy.take
124124
DataFrameGroupBy.tshift
125+
DataFrameGroupBy.value_counts
125126

126127
The following methods are available only for ``SeriesGroupBy`` objects.
127128

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ Other enhancements
217217
- Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`)
218218
- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
219219
- :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
220+
- Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`)
220221
- :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`)
221222
- :meth:`read_excel` now accepts a ``decimal`` argument that allows the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`)
222223
- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now support `Numba <http://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`)

pandas/core/groupby/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ class OutputKey:
143143
"take",
144144
"transform",
145145
"sample",
146+
"value_counts",
146147
]
147148
)
148149
# Valid values of `name` for `groupby.transform(name)`

pandas/core/groupby/generic.py

+189
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
Iterable,
1818
Mapping,
1919
NamedTuple,
20+
Sequence,
2021
TypeVar,
2122
Union,
2223
cast,
@@ -76,6 +77,7 @@
7677
_transform_template,
7778
warn_dropping_nuisance_columns_deprecated,
7879
)
80+
from pandas.core.groupby.grouper import get_grouper
7981
from pandas.core.indexes.api import (
8082
Index,
8183
MultiIndex,
@@ -1569,6 +1571,193 @@ def func(df):
15691571

15701572
boxplot = boxplot_frame_groupby
15711573

1574+
def value_counts(
1575+
self,
1576+
subset: Sequence[Hashable] | None = None,
1577+
normalize: bool = False,
1578+
sort: bool = True,
1579+
ascending: bool = False,
1580+
dropna: bool = True,
1581+
) -> DataFrame | Series:
1582+
"""
1583+
Return a Series or DataFrame containing counts of unique rows.
1584+
1585+
.. versionadded:: 1.4.0
1586+
1587+
Parameters
1588+
----------
1589+
subset : list-like, optional
1590+
Columns to use when counting unique combinations.
1591+
normalize : bool, default False
1592+
Return proportions rather than frequencies.
1593+
sort : bool, default True
1594+
Sort by frequencies.
1595+
ascending : bool, default False
1596+
Sort in ascending order.
1597+
dropna : bool, default True
1598+
Don’t include counts of rows that contain NA values.
1599+
1600+
Returns
1601+
-------
1602+
Series or DataFrame
1603+
Series if the groupby as_index is True, otherwise DataFrame.
1604+
1605+
See Also
1606+
--------
1607+
Series.value_counts: Equivalent method on Series.
1608+
DataFrame.value_counts: Equivalent method on DataFrame.
1609+
SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.
1610+
1611+
Notes
1612+
-----
1613+
- If the groupby as_index is True then the returned Series will have a
1614+
MultiIndex with one level per input column.
1615+
- If the groupby as_index is False then the returned DataFrame will have an
1616+
additional column with the value_counts. The column is labelled 'count' or
1617+
'proportion', depending on the ``normalize`` parameter.
1618+
1619+
By default, rows that contain any NA values are omitted from
1620+
the result.
1621+
1622+
By default, the result will be in descending order so that the
1623+
first element of each group is the most frequently-occurring row.
1624+
1625+
Examples
1626+
--------
1627+
>>> df = pd.DataFrame({
1628+
... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
1629+
... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
1630+
... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
1631+
... })
1632+
1633+
>>> df
1634+
gender education country
1635+
0 male low US
1636+
1 male medium FR
1637+
2 female high US
1638+
3 male low FR
1639+
4 female high FR
1640+
5 male low FR
1641+
1642+
>>> df.groupby('gender').value_counts()
1643+
gender education country
1644+
female high FR 1
1645+
US 1
1646+
male low FR 2
1647+
US 1
1648+
medium FR 1
1649+
dtype: int64
1650+
1651+
>>> df.groupby('gender').value_counts(ascending=True)
1652+
gender education country
1653+
female high FR 1
1654+
US 1
1655+
male low US 1
1656+
medium FR 1
1657+
low FR 2
1658+
dtype: int64
1659+
1660+
>>> df.groupby('gender').value_counts(normalize=True)
1661+
gender education country
1662+
female high FR 0.50
1663+
US 0.50
1664+
male low FR 0.50
1665+
US 0.25
1666+
medium FR 0.25
1667+
dtype: float64
1668+
1669+
>>> df.groupby('gender', as_index=False).value_counts()
1670+
gender education country count
1671+
0 female high FR 1
1672+
1 female high US 1
1673+
2 male low FR 2
1674+
3 male low US 1
1675+
4 male medium FR 1
1676+
1677+
>>> df.groupby('gender', as_index=False).value_counts(normalize=True)
1678+
gender education country proportion
1679+
0 female high FR 0.50
1680+
1 female high US 0.50
1681+
2 male low FR 0.50
1682+
3 male low US 0.25
1683+
4 male medium FR 0.25
1684+
"""
1685+
if self.axis == 1:
1686+
raise NotImplementedError(
1687+
"DataFrameGroupBy.value_counts only handles axis=0"
1688+
)
1689+
1690+
with self._group_selection_context():
1691+
df = self.obj
1692+
1693+
in_axis_names = {
1694+
grouping.name for grouping in self.grouper.groupings if grouping.in_axis
1695+
}
1696+
if isinstance(self._selected_obj, Series):
1697+
name = self._selected_obj.name
1698+
keys = [] if name in in_axis_names else [self._selected_obj]
1699+
else:
1700+
keys = [
1701+
# Can't use .values because the column label needs to be preserved
1702+
self._selected_obj.iloc[:, idx]
1703+
for idx, name in enumerate(self._selected_obj.columns)
1704+
if name not in in_axis_names
1705+
]
1706+
1707+
if subset is not None:
1708+
clashing = set(subset) & set(in_axis_names)
1709+
if clashing:
1710+
raise ValueError(
1711+
f"Keys {clashing} in subset cannot be in "
1712+
"the groupby column keys"
1713+
)
1714+
1715+
groupings = list(self.grouper.groupings)
1716+
for key in keys:
1717+
grouper, _, _ = get_grouper(
1718+
df,
1719+
key=key,
1720+
axis=self.axis,
1721+
sort=self.sort,
1722+
dropna=dropna,
1723+
)
1724+
groupings += list(grouper.groupings)
1725+
1726+
# Take the size of the overall columns
1727+
gb = df.groupby(
1728+
groupings,
1729+
sort=self.sort,
1730+
observed=self.observed,
1731+
dropna=self.dropna,
1732+
)
1733+
result = cast(Series, gb.size())
1734+
1735+
if normalize:
1736+
# Normalize the results by dividing by the original group sizes.
1737+
# We are guaranteed to have the first N levels be the
1738+
# user-requested grouping.
1739+
levels = list(range(len(self.grouper.groupings), result.index.nlevels))
1740+
indexed_group_size = result.groupby(
1741+
result.index.droplevel(levels),
1742+
sort=self.sort,
1743+
observed=self.observed,
1744+
dropna=self.dropna,
1745+
).transform("sum")
1746+
1747+
result /= indexed_group_size
1748+
1749+
if sort:
1750+
# Sort the values and then resort by the main grouping
1751+
index_level = range(len(self.grouper.groupings))
1752+
result = result.sort_values(ascending=ascending).sort_index(
1753+
level=index_level, sort_remaining=False
1754+
)
1755+
1756+
if not self.as_index:
1757+
# Convert to frame
1758+
result = result.reset_index(name="proportion" if normalize else "count")
1759+
return result.__finalize__(self.obj, method="value_counts")
1760+
15721761

15731762
def _wrap_transform_general_frame(
15741763
obj: DataFrame, group: DataFrame, res: DataFrame | Series

pandas/core/groupby/grouper.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -800,7 +800,7 @@ def get_grouper(
800800

801801
# what are we after, exactly?
802802
any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
803-
any_groupers = any(isinstance(g, Grouper) for g in keys)
803+
any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
804804
any_arraylike = any(
805805
isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
806806
)

pandas/tests/groupby/test_allowlist.py

+1
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ def test_tab_completion(mframe):
319319
"pipe",
320320
"sample",
321321
"ewm",
322+
"value_counts",
322323
}
323324
assert results == expected
324325

0 commit comments

Comments
 (0)