From 2ff69a37a2ee9728ebbd37fdad330d4dafa52d95 Mon Sep 17 00:00:00 2001 From: Andrew Eckart Date: Tue, 12 Oct 2021 14:14:32 -0500 Subject: [PATCH 1/2] ENH: Support observed keyword argument in Categorical.value_counts (#43498) --- pandas/core/arrays/categorical.py | 8 ++++++-- pandas/tests/extension/test_categorical.py | 9 +++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 759c7fb65374d..cf562e3a40b24 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1602,7 +1602,7 @@ def notna(self) -> np.ndarray: notnull = notna - def value_counts(self, dropna: bool = True): + def value_counts(self, dropna: bool = True, observed: bool = False): """ Return a Series containing counts of each category. @@ -1612,6 +1612,9 @@ def value_counts(self, dropna: bool = True): ---------- dropna : bool, default True Don't include counts of NaN. + observed : bool, default False + If True, only include counts for observed categories. + If False, include counts for all categories. Returns ------- @@ -1640,7 +1643,8 @@ def value_counts(self, dropna: bool = True): ix = coerce_indexer_dtype(ix, self.dtype.categories) ix = self._from_backing_data(ix) - return Series(count, index=CategoricalIndex(ix), dtype="int64") + counts = Series(count, index=CategoricalIndex(ix), dtype="int64") + return counts[counts != 0] if observed else counts # error: Argument 2 of "_empty" is incompatible with supertype # "NDArrayBackedExtensionArray"; supertype defines the argument type as diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 6a1a9512bc036..dfc8aaa48ebb0 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -163,6 +163,15 @@ class TestMethods(base.BaseMethodsTests): def test_value_counts(self, all_data, dropna): return super().test_value_counts(all_data, dropna) + def test_value_counts_observed(self, data): + data = data.add_categories(["#", "?"]) # Add some unobserved categories + series = pd.Series(data, dtype=data.categories.dtype) + result = data.value_counts(observed=True).sort_index() + expected = series.value_counts().sort_index() + self.assert_series_equal( + result, expected, check_index_type=False, check_categorical=False + ) + def test_combine_add(self, data_repeated): # GH 20825 # When adding categoricals in combine, result is a string From 30e0540a2e1073a23732981a8bc739f020939543 Mon Sep 17 00:00:00 2001 From: Andrew Eckart Date: Tue, 12 Oct 2021 18:00:35 -0500 Subject: [PATCH 2/2] PR feedback: issue reference and whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/tests/extension/test_categorical.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 7828c479e800f..55967c6fdbbfe 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -211,7 +211,7 @@ Other enhancements - :meth:`.GroupBy.mean` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`) - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`) - :meth:`NaT.to_numpy` ``dtype`` argument is now respected, so ``np.timedelta64`` can be returned (:issue:`44460`) -- +- :meth:`Categorical.value_counts` now supports the argument ``observed`` (:issue:`43498`) .. --------------------------------------------------------------------------- diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index dfc8aaa48ebb0..da3f8d11a4a18 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -164,6 +164,8 @@ def test_value_counts(self, all_data, dropna): return super().test_value_counts(all_data, dropna) def test_value_counts_observed(self, data): + # GH 43498 + # When observed=True is passed, unobserved categories should be omitted data = data.add_categories(["#", "?"]) # Add some unobserved categories series = pd.Series(data, dtype=data.categories.dtype) result = data.value_counts(observed=True).sort_index()