diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 11a6f2628ac52..381c83c4f6c8f 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -411,6 +411,24 @@ Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead. a.to_numpy(dtype="float", na_value=np.nan) +**value_counts returns a nullable integer dtype** + +:meth:`Series.value_counts` with a nullable integer dtype now returns a nullable +integer dtype for the values. + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + dtype('int64') + +*pandas 1.0.0* + +.. ipython:: python + + pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA` and :attr:`numpy.nan`. diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index c065fdeba2177..b8952fc016570 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -410,52 +410,6 @@ def astype(self, dtype, copy=True): data = self.to_numpy(na_value=na_value) return astype_nansafe(data, dtype, copy=False) - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN. 
- - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - - """ - - from pandas import Index, Series - - # compute counts on the data with no nans - data = self._data[~self._mask] - value_counts = Index(data).value_counts() - array = value_counts.values - - # TODO(extension) - # if we have allow Index to hold an ExtensionArray - # this is easier - index = value_counts.index.values.astype(bool).astype(object) - - # if we want nans, count the mask - if not dropna: - - # TODO(extension) - # appending to an Index *always* infers - # w/o passing the dtype - array = np.append(array, [self._mask.sum()]) - index = Index( - np.concatenate([index, np.array([np.nan], dtype=object)]), dtype=object - ) - - return Series(array, index=index) - def _values_for_argsort(self) -> np.ndarray: """ Return values for sorting. diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 50062f09495aa..90cb3f4ffe2c2 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -467,55 +467,6 @@ def _ndarray_values(self) -> np.ndarray: """ return self._data - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN. 
- - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - - """ - - from pandas import Index, Series - - # compute counts on the data with no nans - data = self._data[~self._mask] - value_counts = Index(data).value_counts() - array = value_counts.values - - # TODO(extension) - # if we have allow Index to hold an ExtensionArray - # this is easier - index = value_counts.index.astype(object) - - # if we want nans, count the mask - if not dropna: - - # TODO(extension) - # appending to an Index *always* infers - # w/o passing the dtype - array = np.append(array, [self._mask.sum()]) - index = Index( - np.concatenate( - [index.values, np.array([self.dtype.na_value], dtype=object)] - ), - dtype=object, - ) - - return Series(array, index=index) - def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: # TODO: https://github.com/pandas-dev/pandas/issues/30037 # use masked algorithms, rather than object-dtype / np.nan. diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 6fd9f1efbb408..53bb05873a8e9 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -201,3 +201,50 @@ def copy(self): data = data.copy() mask = mask.copy() return type(self)(data, mask, copy=False) + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. 
+ + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import Index, Series + from pandas.arrays import IntegerArray + + # compute counts on the data with no nans + data = self._data[~self._mask] + value_counts = Index(data).value_counts() + + # TODO(extension) + # if we allow Index to hold an ExtensionArray + # this is easier + index = value_counts.index.values.astype(object) + + # if we want nans, count the mask + if dropna: + counts = value_counts.values + else: + counts = np.empty(len(value_counts) + 1, dtype="int64") + counts[:-1] = value_counts + counts[-1] = self._mask.sum() + + index = Index( + np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), + dtype=object, + ) + + mask = np.zeros(len(counts), dtype="bool") + counts = IntegerArray(counts, mask) + + return Series(counts, index=index) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0da877fb1ad45..fe476ab6ffaa1 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -253,7 +253,7 @@ def _reduce(self, name, skipna=True, **kwargs): def value_counts(self, dropna=False): from pandas import value_counts - return value_counts(self._ndarray, dropna=dropna) + return value_counts(self._ndarray, dropna=dropna).astype("Int64") # Overrride parent because we have different return types. 
@classmethod diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ec7e35e5c6db4..32c1e697b15b5 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -237,3 +237,14 @@ def test_arrow_roundtrip(): tm.assert_frame_equal(result, df) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA + + +def test_value_counts_na(): + arr = pd.array(["a", "b", "a", pd.NA], dtype="string") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index b89aece3f982c..bc406e4cd9520 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -856,3 +856,14 @@ def test_arrow_roundtrip(): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.BooleanDtype) tm.assert_frame_equal(result, df) + + +def test_value_counts_na(): + arr = pd.array([True, False, pd.NA], dtype="boolean") + result = arr.value_counts(dropna=False) + expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([1, 1], index=[True, False], dtype="Int64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 6a3ef75157d5d..14dbdcafb3088 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -1039,6 +1039,17 @@ def test_stat_method(pandasmethname, kwargs): assert expected == result +def test_value_counts_na(): + arr = pd.array([1, 2, 1, pd.NA], 
dtype="Int64") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=[1, 2], dtype="Int64") + tm.assert_series_equal(result, expected) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 9c151b5482c9d..a7ce0fb097599 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -226,6 +226,10 @@ def test_searchsorted(self, data_for_sorting, as_series): sorter = np.array([1, 0]) assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + @pytest.mark.skip(reason="uses nullable integer") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 8e54543e5437c..afb8412f12ea9 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -209,7 +209,7 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): - @pytest.mark.parametrize("dropna", [True, False]) + @pytest.mark.skip(reason="uses nullable integer") def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 8519c2999ade3..86aed671f1b88 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -81,7 +81,9 @@ class TestNoReduce(base.BaseNoReduceTests): class TestMethods(base.BaseMethodsTests): - pass + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) class 
TestCasting(base.BaseCastingTests):