diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 320912ec38890..cddaaa295af01 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -224,6 +224,7 @@ Other enhancements
 - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
 - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
 - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
+- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
 -

 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 50837e1b3ed50..f3902b0a9d288 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -6380,6 +6380,7 @@ def value_counts(
         normalize: bool = False,
         sort: bool = True,
         ascending: bool = False,
+        dropna: bool = True,
     ):
         """
         Return a Series containing counts of unique rows in the DataFrame.
@@ -6396,6 +6397,10 @@ def value_counts(
             Sort by frequencies.
         ascending : bool, default False
             Sort in ascending order.
+        dropna : bool, default True
+            Don't include counts of rows that contain NA values.
+
+            .. versionadded:: 1.3.0

         Returns
         -------
@@ -6451,11 +6456,36 @@ def value_counts(
         2         2            0.25
         6         0            0.25
         dtype: float64
+
+        With `dropna` set to `False` we can also count rows with NA values.
+
+        >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
+        ...                    'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
+        >>> df
+          first_name middle_name
+        0       John       Smith
+        1       Anne        <NA>
+        2       John        <NA>
+        3       Beth      Louise
+
+        >>> df.value_counts()
+        first_name  middle_name
+        Beth        Louise         1
+        John        Smith          1
+        dtype: int64
+
+        >>> df.value_counts(dropna=False)
+        first_name  middle_name
+        Anne        NaN            1
+        Beth        Louise         1
+        John        Smith          1
+                    NaN            1
+        dtype: int64
         """
         if subset is None:
             subset = self.columns.tolist()

-        counts = self.groupby(subset).grouper.size()
+        counts = self.groupby(subset, dropna=dropna).grouper.size()

         if sort:
             counts = counts.sort_values(ascending=ascending)
diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py
index 23f9ebdb4479d..6e8528845ea6b 100644
--- a/pandas/tests/frame/methods/test_value_counts.py
+++ b/pandas/tests/frame/methods/test_value_counts.py
@@ -100,3 +100,47 @@ def test_data_frame_value_counts_empty_normalize():
     expected = pd.Series([], dtype=np.float64)

     tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_dropna_true(nulls_fixture):
+    # GH 41334
+    df = pd.DataFrame(
+        {
+            "first_name": ["John", "Anne", "John", "Beth"],
+            "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
+        },
+    )
+    result = df.value_counts()
+    expected = pd.Series(
+        data=[1, 1],
+        index=pd.MultiIndex.from_arrays(
+            [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"]
+        ),
+    )
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_dropna_false(nulls_fixture):
+    # GH 41334
+    df = pd.DataFrame(
+        {
+            "first_name": ["John", "Anne", "John", "Beth"],
+            "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
+        },
+    )
+
+    result = df.value_counts(dropna=False)
+    expected = pd.Series(
+        data=[1, 1, 1, 1],
+        index=pd.MultiIndex(
+            levels=[
+                pd.Index(["Anne", "Beth", "John"]),
+                pd.Index(["Louise", "Smith", nulls_fixture]),
+            ],
+            codes=[[0, 1, 2, 2], [2, 0, 1, 2]],
+            names=["first_name", "middle_name"],
+        ),
+    )
+
+    tm.assert_series_equal(result, expected)
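For reference, below is a small stand-alone sketch (not part of the patch itself) of how the new ``dropna`` keyword behaves once this change is available, assuming a pandas build that includes it (1.3.0 or later). The data mirrors the docstring example above; the final ``normalize`` call is only an illustration that the new keyword composes with the existing options.

    import pandas as pd

    # Same data as the docstring example: two rows have a missing middle name.
    df = pd.DataFrame(
        {
            "first_name": ["John", "Anne", "John", "Beth"],
            "middle_name": ["Smith", pd.NA, pd.NA, "Louise"],
        }
    )

    # Default behaviour (dropna=True): rows containing NA are excluded,
    # so only (Beth, Louise) and (John, Smith) are counted.
    print(df.value_counts())

    # New in this patch: keep rows with NA values in the counts,
    # yielding four unique rows, each with count 1.
    print(df.value_counts(dropna=False))

    # The keyword composes with the existing options, e.g. relative frequencies.
    print(df.value_counts(dropna=False, normalize=True))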