Skip to content

Commit 8b200c1

Browse files
authored
ENH: Implement DataFrame.value_counts (#31247)
1 parent 9bf3a28 commit 8b200c1

File tree

6 files changed

+217
-1
lines changed

6 files changed

+217
-1
lines changed

doc/source/getting_started/basics.rst

+11
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,17 @@ of a 1D array of values. It can also be used as a function on regular arrays:
689689
s.value_counts()
690690
pd.value_counts(data)
691691
692+
.. versionadded:: 1.1.0
693+
694+
The :meth:`~DataFrame.value_counts` method can be used to count combinations across multiple columns.
695+
By default, all columns are used, but a subset can be selected using the ``subset`` argument.
696+
697+
.. ipython:: python
698+
699+
data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}
700+
frame = pd.DataFrame(data)
701+
frame.value_counts()
702+
692703
Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame:
693704

694705
.. ipython:: python

doc/source/reference/frame.rst

+1
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ Computations / descriptive stats
170170
DataFrame.std
171171
DataFrame.var
172172
DataFrame.nunique
173+
DataFrame.value_counts
173174

174175
Reindexing / selection / label manipulation
175176
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ Other API changes
5555

5656
- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
5757
will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
58+
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
5859
- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
5960
- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
6061
-

pandas/core/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -1196,6 +1196,7 @@ def value_counts(
11961196
--------
11971197
Series.count: Number of non-NA elements in a Series.
11981198
DataFrame.count: Number of non-NA elements in a DataFrame.
1199+
DataFrame.value_counts: Equivalent method on DataFrames.
11991200
12001201
Examples
12011202
--------

pandas/core/frame.py

+101-1
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@
111111
from pandas.core.indexes import base as ibase
112112
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
113113
from pandas.core.indexes.datetimes import DatetimeIndex
114-
from pandas.core.indexes.multi import maybe_droplevels
114+
from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
115115
from pandas.core.indexes.period import PeriodIndex
116116
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
117117
from pandas.core.internals import BlockManager
@@ -4569,6 +4569,10 @@ def drop_duplicates(
45694569
-------
45704570
DataFrame
45714571
DataFrame with duplicates removed or None if ``inplace=True``.
4572+
4573+
See Also
4574+
--------
4575+
DataFrame.value_counts: Count unique combinations of columns.
45724576
"""
45734577
if self.empty:
45744578
return self.copy()
@@ -4814,6 +4818,102 @@ def sort_index(
48144818
else:
48154819
return self._constructor(new_data).__finalize__(self)
48164820

4821+
def value_counts(
    self,
    subset: Optional[Sequence[Label]] = None,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
) -> "Series":
    """
    Return a Series containing counts of unique rows in the DataFrame.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    subset : list-like, optional
        Columns to use when counting unique combinations. When omitted,
        all columns of the DataFrame are used.
    normalize : bool, default False
        Return proportions rather than frequencies.
    sort : bool, default True
        Sort by frequencies.
    ascending : bool, default False
        Sort in ascending order.

    Returns
    -------
    Series

    See Also
    --------
    Series.value_counts: Equivalent method on Series.

    Notes
    -----
    The returned Series will have a MultiIndex with one level per input
    column. By default, rows that contain any NA values are omitted from
    the result. By default, the resulting Series will be in descending
    order so that the first element is the most frequently-occurring row.

    If ``subset`` is a single label rather than a list-like, the index is
    left exactly as produced by grouping on that column and is not wrapped
    in a MultiIndex.

    Examples
    --------
    >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
    ...                    'num_wings': [2, 0, 0, 0]},
    ...                   index=['falcon', 'dog', 'cat', 'ant'])
    >>> df
            num_legs  num_wings
    falcon         2          2
    dog            4          0
    cat            4          0
    ant            6          0

    >>> df.value_counts()
    num_legs  num_wings
    4         0            2
    6         0            1
    2         2            1
    dtype: int64

    >>> df.value_counts(sort=False)
    num_legs  num_wings
    2         2            1
    4         0            2
    6         0            1
    dtype: int64

    >>> df.value_counts(ascending=True)
    num_legs  num_wings
    2         2            1
    6         0            1
    4         0            2
    dtype: int64

    >>> df.value_counts(normalize=True)
    num_legs  num_wings
    4         0            0.50
    6         0            0.25
    2         2            0.25
    dtype: float64
    """
    # Function-scope import keeps this block self-contained.
    from pandas.core.dtypes.common import is_list_like

    if subset is None:
        subset = self.columns.tolist()

    counts = self.groupby(subset).size()

    if sort:
        counts = counts.sort_values(ascending=ascending)
    if normalize:
        counts /= counts.sum()

    # Force MultiIndex for a list-like subset holding a single column.
    # The list-like guard matters: ``len`` of a scalar label is either
    # meaningless (the character count of a string) or a TypeError (an
    # integer label), so scalar labels must not reach the length check.
    if is_list_like(subset) and len(subset) == 1:
        counts.index = MultiIndex.from_arrays(
            [counts.index], names=[counts.index.name]
        )

    return counts
4916+
48174917
def nlargest(self, n, columns, keep="first") -> "DataFrame":
48184918
"""
48194919
Return the first `n` rows ordered by `columns` in descending order.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import numpy as np
2+
3+
import pandas as pd
4+
import pandas._testing as tm
5+
6+
7+
def test_data_frame_value_counts_unsorted():
    """Check ``value_counts(sort=False)`` on a two-column frame."""
    animals = pd.DataFrame(
        data={"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    combos = pd.MultiIndex.from_arrays(
        [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[1, 2, 1], index=combos)

    result = animals.value_counts(sort=False)

    tm.assert_series_equal(result, expected)
22+
23+
24+
def test_data_frame_value_counts_ascending():
    """Check ``value_counts(ascending=True)`` on a two-column frame."""
    animals = pd.DataFrame(
        data={"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    combos = pd.MultiIndex.from_arrays(
        [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[1, 1, 2], index=combos)

    result = animals.value_counts(ascending=True)

    tm.assert_series_equal(result, expected)
39+
40+
41+
def test_data_frame_value_counts_default():
    """Check ``value_counts()`` with default arguments on a two-column frame."""
    animals = pd.DataFrame(
        data={"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    combos = pd.MultiIndex.from_arrays(
        [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[2, 1, 1], index=combos)

    result = animals.value_counts()

    tm.assert_series_equal(result, expected)
56+
57+
58+
def test_data_frame_value_counts_normalize():
    """Check ``value_counts(normalize=True)`` on a two-column frame."""
    animals = pd.DataFrame(
        data={"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    combos = pd.MultiIndex.from_arrays(
        [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[0.5, 0.25, 0.25], index=combos)

    result = animals.value_counts(normalize=True)

    tm.assert_series_equal(result, expected)
73+
74+
75+
def test_data_frame_value_counts_single_col_default():
    """Check that a one-column frame is compared against a one-level MultiIndex."""
    legs_only = pd.DataFrame({"num_legs": [2, 4, 4, 6]})
    single_level = pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"])
    expected = pd.Series(data=[2, 1, 1], index=single_level)

    result = legs_only.value_counts()

    tm.assert_series_equal(result, expected)
85+
86+
87+
def test_data_frame_value_counts_empty():
    """Check ``value_counts()`` on a frame with no columns against an empty int64 Series."""
    expected = pd.Series([], dtype=np.int64)

    result = pd.DataFrame().value_counts()

    tm.assert_series_equal(result, expected)
94+
95+
96+
def test_data_frame_value_counts_empty_normalize():
    """Check ``value_counts(normalize=True)`` on a frame with no columns against an empty float64 Series."""
    expected = pd.Series([], dtype=np.float64)

    result = pd.DataFrame().value_counts(normalize=True)

    tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)