Skip to content

Commit 670e9be

Browse files
authored
ENH: Add dropna argument to DataFrame.value_counts() (#41334)
1 parent 9ab182a commit 670e9be

File tree

3 files changed

+76
-1
lines changed

3 files changed

+76
-1
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ Other enhancements
226226
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
227227
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
228228
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
229+
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
229230
-
230231

231232
.. ---------------------------------------------------------------------------

pandas/core/frame.py

+31-1
Original file line numberDiff line numberDiff line change
@@ -6381,6 +6381,7 @@ def value_counts(
63816381
normalize: bool = False,
63826382
sort: bool = True,
63836383
ascending: bool = False,
6384+
dropna: bool = True,
63846385
):
63856386
"""
63866387
Return a Series containing counts of unique rows in the DataFrame.
@@ -6397,6 +6398,10 @@ def value_counts(
63976398
Sort by frequencies.
63986399
ascending : bool, default False
63996400
Sort in ascending order.
6401+
dropna : bool, default True
6402+
Don't include counts of rows that contain NA values.
6403+
6404+
.. versionadded:: 1.3.0
64006405
64016406
Returns
64026407
-------
@@ -6452,11 +6457,36 @@ def value_counts(
64526457
2 2 0.25
64536458
6 0 0.25
64546459
dtype: float64
6460+
6461+
With `dropna` set to `False`, we can also count rows with NA values.
6462+
6463+
>>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
6464+
... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
6465+
>>> df
6466+
first_name middle_name
6467+
0 John Smith
6468+
1 Anne <NA>
6469+
2 John <NA>
6470+
3 Beth Louise
6471+
6472+
>>> df.value_counts()
6473+
first_name middle_name
6474+
Beth Louise 1
6475+
John Smith 1
6476+
dtype: int64
6477+
6478+
>>> df.value_counts(dropna=False)
6479+
first_name middle_name
6480+
Anne NaN 1
6481+
Beth Louise 1
6482+
John Smith 1
6483+
NaN 1
6484+
dtype: int64
64556485
"""
64566486
if subset is None:
64576487
subset = self.columns.tolist()
64586488

6459-
counts = self.groupby(subset).grouper.size()
6489+
counts = self.groupby(subset, dropna=dropna).grouper.size()
64606490

64616491
if sort:
64626492
counts = counts.sort_values(ascending=ascending)

pandas/tests/frame/methods/test_value_counts.py

+44
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,47 @@ def test_data_frame_value_counts_empty_normalize():
100100
expected = pd.Series([], dtype=np.float64)
101101

102102
tm.assert_series_equal(result, expected)
103+
104+
105+
def test_data_frame_value_counts_dropna_true(nulls_fixture):
106+
# GH 41334
107+
df = pd.DataFrame(
108+
{
109+
"first_name": ["John", "Anne", "John", "Beth"],
110+
"middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
111+
},
112+
)
113+
result = df.value_counts()
114+
expected = pd.Series(
115+
data=[1, 1],
116+
index=pd.MultiIndex.from_arrays(
117+
[("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"]
118+
),
119+
)
120+
121+
tm.assert_series_equal(result, expected)
122+
123+
124+
def test_data_frame_value_counts_dropna_false(nulls_fixture):
125+
# GH 41334
126+
df = pd.DataFrame(
127+
{
128+
"first_name": ["John", "Anne", "John", "Beth"],
129+
"middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
130+
},
131+
)
132+
133+
result = df.value_counts(dropna=False)
134+
expected = pd.Series(
135+
data=[1, 1, 1, 1],
136+
index=pd.MultiIndex(
137+
levels=[
138+
pd.Index(["Anne", "Beth", "John"]),
139+
pd.Index(["Louise", "Smith", nulls_fixture]),
140+
],
141+
codes=[[0, 1, 2, 2], [2, 0, 1, 2]],
142+
names=["first_name", "middle_name"],
143+
),
144+
)
145+
146+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)