ENH: Add dropna argument to DataFrame.value_counts() (#41334)

connesy · web-flow · commit 670e9bef2137 · 2021-05-10T10:39:30.000-04:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -226,6 +226,7 @@ Other enhancements
 - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
 - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
 - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
+- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6381,6 +6381,7 @@ def value_counts(
         normalize: bool = False,
         sort: bool = True,
         ascending: bool = False,
+        dropna: bool = True,
     ):
         """
         Return a Series containing counts of unique rows in the DataFrame.
@@ -6397,6 +6398,10 @@ def value_counts(
             Sort by frequencies.
         ascending : bool, default False
             Sort in ascending order.
+        dropna : bool, default True
+            Don't include counts of rows that contain NA values.
+
+            .. versionadded:: 1.3.0
 
         Returns
         -------
@@ -6452,11 +6457,36 @@ def value_counts(
         2         2            0.25
         6         0            0.25
         dtype: float64
+
+        With `dropna` set to `False` we can also count rows with NA values.
+
+        >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
+        ...                    'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
+        >>> df
+          first_name middle_name
+        0       John       Smith
+        1       Anne        <NA>
+        2       John        <NA>
+        3       Beth      Louise
+
+        >>> df.value_counts()
+        first_name  middle_name
+        Beth        Louise         1
+        John        Smith          1
+        dtype: int64
+
+        >>> df.value_counts(dropna=False)
+        first_name  middle_name
+        Anne        NaN            1
+        Beth        Louise         1
+        John        Smith          1
+                    NaN            1
+        dtype: int64
         """
         if subset is None:
             subset = self.columns.tolist()
 
-        counts = self.groupby(subset).grouper.size()
+        counts = self.groupby(subset, dropna=dropna).grouper.size()
 
         if sort:
             counts = counts.sort_values(ascending=ascending)
diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py
@@ -100,3 +100,47 @@ def test_data_frame_value_counts_empty_normalize():
     expected = pd.Series([], dtype=np.float64)
 
     tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_dropna_true(nulls_fixture):
+    # GH 41334: the default dropna=True leaves out every row that holds a null
+    df = pd.DataFrame(
+        {
+            "first_name": ["John", "Anne", "John", "Beth"],
+            "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
+        },
+    )
+    result = df.value_counts()
+    expected = pd.Series(
+        data=[1, 1],
+        index=pd.MultiIndex.from_arrays(
+            [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"]
+        ),
+    )
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_dropna_false(nulls_fixture):
+    # GH 41334: dropna=False counts rows with a null middle_name as their own rows
+    df = pd.DataFrame(
+        {
+            "first_name": ["John", "Anne", "John", "Beth"],
+            "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
+        },
+    )
+
+    result = df.value_counts(dropna=False)
+    expected = pd.Series(
+        data=[1, 1, 1, 1],
+        index=pd.MultiIndex(
+            levels=[
+                pd.Index(["Anne", "Beth", "John"]),
+                pd.Index(["Louise", "Smith", nulls_fixture]),
+            ],
+            codes=[[0, 1, 2, 2], [2, 0, 1, 2]],
+            names=["first_name", "middle_name"],
+        ),
+    )
+
+    tm.assert_series_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -226,6 +226,7 @@ Other enhancements`
`226`	`226`	- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
`227`	`227`	- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
`228`	`228`	- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
	`229`	+- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
`229`	`230`	`-`
`230`	`231`
`231`	`232`	`.. ---------------------------------------------------------------------------`