From 304c369422d8b7b8b5ed285be3fc3dd4b134799c Mon Sep 17 00:00:00 2001 From: Stefan Mejlgaard Date: Wed, 5 May 2021 15:39:30 +0000 Subject: [PATCH 1/6] ENH: Add dropna argument to DataFrame.value_counts() --- pandas/core/frame.py | 30 ++++++++++++++- .../tests/frame/methods/test_value_counts.py | 38 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 50837e1b3ed50..126683f569340 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6380,6 +6380,7 @@ def value_counts( normalize: bool = False, sort: bool = True, ascending: bool = False, + dropna: bool = True, ): """ Return a Series containing counts of unique rows in the DataFrame. @@ -6396,6 +6397,8 @@ def value_counts( Sort by frequencies. ascending : bool, default False Sort in ascending order. + dropna : bool, default True + Don’t include counts of rows that contain NA values. Returns ------- @@ -6451,11 +6454,36 @@ def value_counts( 2 2 0.25 6 0 0.25 dtype: float64 + + **dropna** + With `dropna` set to `False` we can also count rows with NA values. + >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], + ... 
'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) + >>> df + first_name middle_name + 0 John Smith + 1 Anne <NA> + 2 John <NA> + 3 Beth Louise + + >>> df.value_counts() # dropna is True per default + first_name middle_name + Beth Louise 1 + John Smith 1 + dtype: int64 + + >>> df.value_counts(dropna=False) + first_name middle_name + Anne NaN 1 + Beth Louise 1 + John Smith 1 + NaN 1 + dtype: int64 """ if subset is None: subset = self.columns.tolist() - counts = self.groupby(subset).grouper.size() + counts = self.groupby(subset, dropna=dropna).grouper.size() if sort: counts = counts.sort_values(ascending=ascending) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 23f9ebdb4479d..19715ba92266a 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -100,3 +100,41 @@ def test_data_frame_value_counts_empty_normalize(): expected = pd.Series([], dtype=np.float64) tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_dropna_true(): + df = pd.DataFrame( + {'first_name': ['John', 'Anne', 'John', 'Beth'], + 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}, + ) + + result = df.value_counts() + expected = pd.Series( + data=[1, 1], + index=pd.MultiIndex.from_arrays( + [('Beth', 'John'), ('Louise', 'Smith')], names=["first_name", "middle_name"] + ), + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_dropna_false(): + df = pd.DataFrame( + {'first_name': ['John', 'Anne', 'John', 'Beth'], + 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}, + ) + + result = df.value_counts(dropna=False) + expected = pd.Series( + data=[1, 1, 1, 1], + index=pd.MultiIndex( + levels=[pd.Index(['Anne', 'Beth', 'John']), + pd.Index(['Louise', 'Smith', pd.NA])], + codes=[[0, 1, 2, 2], + [2, 0, 1, 2]], + names=['first_name', 'middle_name'], + ) + ) + + tm.assert_series_equal(result, expected) From
ebaad81bc66f3c00f951d38ec69b740b17564945 Mon Sep 17 00:00:00 2001 From: Stefan Mejlgaard Date: Wed, 5 May 2021 16:10:49 +0000 Subject: [PATCH 2/6] ENH: whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/frame.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 320912ec38890..cddaaa295af01 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -224,6 +224,7 @@ Other enhancements - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`) - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) +- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 126683f569340..6d82d851cd536 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6400,6 +6400,8 @@ def value_counts( dropna : bool, default True Don’t include counts of rows that contain NA values. + .. 
versionadded:: 1.3.0 + Returns ------- Series From 5216d01563f963f963a469b5c1380f707b8bb287 Mon Sep 17 00:00:00 2001 From: Stefan Mejlgaard Date: Wed, 5 May 2021 21:23:45 +0200 Subject: [PATCH 3/6] ENH: black and sphinx formatting --- pandas/core/frame.py | 2 ++ .../tests/frame/methods/test_value_counts.py | 27 +++++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6d82d851cd536..8556dc572a1fe 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6458,7 +6458,9 @@ def value_counts( dtype: float64 **dropna** + With `dropna` set to `False` we can also count rows with NA values. + >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) >>> df diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 19715ba92266a..f5e388c333886 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -104,15 +104,17 @@ def test_data_frame_value_counts_empty_normalize(): def test_data_frame_value_counts_dropna_true(): df = pd.DataFrame( - {'first_name': ['John', 'Anne', 'John', 'Beth'], - 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}, + { + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", pd.NA, pd.NA, "Louise"], + }, ) result = df.value_counts() expected = pd.Series( data=[1, 1], index=pd.MultiIndex.from_arrays( - [('Beth', 'John'), ('Louise', 'Smith')], names=["first_name", "middle_name"] + [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"] ), ) @@ -121,20 +123,23 @@ def test_data_frame_value_counts_dropna_true(): def test_data_frame_value_counts_dropna_false(): df = pd.DataFrame( - {'first_name': ['John', 'Anne', 'John', 'Beth'], - 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}, + { + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": 
["Smith", pd.NA, pd.NA, "Louise"], + }, ) result = df.value_counts(dropna=False) expected = pd.Series( data=[1, 1, 1, 1], index=pd.MultiIndex( - levels=[pd.Index(['Anne', 'Beth', 'John']), - pd.Index(['Louise', 'Smith', pd.NA])], - codes=[[0, 1, 2, 2], - [2, 0, 1, 2]], - names=['first_name', 'middle_name'], - ) + levels=[ + pd.Index(["Anne", "Beth", "John"]), + pd.Index(["Louise", "Smith", pd.NA]), + ], + codes=[[0, 1, 2, 2], [2, 0, 1, 2]], + names=["first_name", "middle_name"], + ), ) tm.assert_series_equal(result, expected) From b16fb3f705cac52dfe32640633a6959d9046ccfd Mon Sep 17 00:00:00 2001 From: Stefan Mejlgaard Date: Thu, 6 May 2021 09:05:01 +0200 Subject: [PATCH 4/6] ENH: Add github reference --- pandas/tests/frame/methods/test_value_counts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index f5e388c333886..0cd269c9b0517 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -103,6 +103,7 @@ def test_data_frame_value_counts_empty_normalize(): def test_data_frame_value_counts_dropna_true(): + # GH 41334 df = pd.DataFrame( { "first_name": ["John", "Anne", "John", "Beth"], @@ -122,6 +123,7 @@ def test_data_frame_value_counts_dropna_true(): def test_data_frame_value_counts_dropna_false(): + # GH 41334 df = pd.DataFrame( { "first_name": ["John", "Anne", "John", "Beth"], From aa94b3b1ca1a533c015359492720120a35f3fc24 Mon Sep 17 00:00:00 2001 From: Stefan Mejlgaard Date: Thu, 6 May 2021 10:43:14 +0200 Subject: [PATCH 5/6] ENH: Parametrize tests over all NaN types --- pandas/tests/frame/methods/test_value_counts.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 0cd269c9b0517..6e8528845ea6b 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ 
b/pandas/tests/frame/methods/test_value_counts.py @@ -102,15 +102,14 @@ def test_data_frame_value_counts_empty_normalize(): tm.assert_series_equal(result, expected) -def test_data_frame_value_counts_dropna_true(): +def test_data_frame_value_counts_dropna_true(nulls_fixture): # GH 41334 df = pd.DataFrame( { "first_name": ["John", "Anne", "John", "Beth"], - "middle_name": ["Smith", pd.NA, pd.NA, "Louise"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], }, ) - result = df.value_counts() expected = pd.Series( data=[1, 1], @@ -122,12 +121,12 @@ def test_data_frame_value_counts_dropna_true(): tm.assert_series_equal(result, expected) -def test_data_frame_value_counts_dropna_false(): +def test_data_frame_value_counts_dropna_false(nulls_fixture): # GH 41334 df = pd.DataFrame( { "first_name": ["John", "Anne", "John", "Beth"], - "middle_name": ["Smith", pd.NA, pd.NA, "Louise"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], }, ) @@ -137,7 +136,7 @@ def test_data_frame_value_counts_dropna_false(): index=pd.MultiIndex( levels=[ pd.Index(["Anne", "Beth", "John"]), - pd.Index(["Louise", "Smith", pd.NA]), + pd.Index(["Louise", "Smith", nulls_fixture]), ], codes=[[0, 1, 2, 2], [2, 0, 1, 2]], names=["first_name", "middle_name"], From f798c5b888d1bb28e424d5e39d1510b4e9782391 Mon Sep 17 00:00:00 2001 From: Stefan Mejlgaard Date: Fri, 7 May 2021 09:08:37 +0200 Subject: [PATCH 6/6] ENH: Remove unnecessary comment and label --- pandas/core/frame.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8556dc572a1fe..f3902b0a9d288 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6457,8 +6457,6 @@ def value_counts( 6 0 0.25 dtype: float64 - **dropna** - With `dropna` set to `False` we can also count rows with NA values. 
>>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], @@ -6470,7 +6468,7 @@ def value_counts( 2 John <NA> 3 Beth Louise - >>> df.value_counts() # dropna is True per default + >>> df.value_counts() first_name middle_name Beth Louise 1 John Smith 1