BUG: BooleanArray.value_counts dropna (#30824)

TomAugspurger · web-flow · commit 8bdd7b13cff5 · 2020-01-09T13:19:34.000-06:00
* BUG: BooleanArray.value_counts dropna Closes #30685
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -411,6 +411,24 @@ Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead.
 
    a.to_numpy(dtype="float", na_value=np.nan)
 
+**value_counts returns a nullable integer dtype**
+
+:meth:`Series.value_counts` on a nullable integer Series now returns a Series
+whose values (the counts) use a nullable integer dtype.
+
+*pandas 0.25.x*
+
+.. code-block:: python
+
+   >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype
+   dtype('int64')
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+   pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype
+
 See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA`
 and :attr:`numpy.nan`.
 
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -410,52 +410,6 @@ def astype(self, dtype, copy=True):
         data = self.to_numpy(na_value=na_value)
         return astype_nansafe(data, dtype, copy=False)
 
-    def value_counts(self, dropna=True):
-        """
-        Returns a Series containing counts of each category.
-
-        Every category will have an entry, even those with a count of 0.
-
-        Parameters
-        ----------
-        dropna : bool, default True
-            Don't include counts of NaN.
-
-        Returns
-        -------
-        counts : Series
-
-        See Also
-        --------
-        Series.value_counts
-
-        """
-
-        from pandas import Index, Series
-
-        # compute counts on the data with no nans
-        data = self._data[~self._mask]
-        value_counts = Index(data).value_counts()
-        array = value_counts.values
-
-        # TODO(extension)
-        # if we have allow Index to hold an ExtensionArray
-        # this is easier
-        index = value_counts.index.values.astype(bool).astype(object)
-
-        # if we want nans, count the mask
-        if not dropna:
-
-            # TODO(extension)
-            # appending to an Index *always* infers
-            # w/o passing the dtype
-            array = np.append(array, [self._mask.sum()])
-            index = Index(
-                np.concatenate([index, np.array([np.nan], dtype=object)]), dtype=object
-            )
-
-        return Series(array, index=index)
-
     def _values_for_argsort(self) -> np.ndarray:
         """
         Return values for sorting.
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -467,55 +467,6 @@ def _ndarray_values(self) -> np.ndarray:
         """
         return self._data
 
-    def value_counts(self, dropna=True):
-        """
-        Returns a Series containing counts of each category.
-
-        Every category will have an entry, even those with a count of 0.
-
-        Parameters
-        ----------
-        dropna : bool, default True
-            Don't include counts of NaN.
-
-        Returns
-        -------
-        counts : Series
-
-        See Also
-        --------
-        Series.value_counts
-
-        """
-
-        from pandas import Index, Series
-
-        # compute counts on the data with no nans
-        data = self._data[~self._mask]
-        value_counts = Index(data).value_counts()
-        array = value_counts.values
-
-        # TODO(extension)
-        # if we have allow Index to hold an ExtensionArray
-        # this is easier
-        index = value_counts.index.astype(object)
-
-        # if we want nans, count the mask
-        if not dropna:
-
-            # TODO(extension)
-            # appending to an Index *always* infers
-            # w/o passing the dtype
-            array = np.append(array, [self._mask.sum()])
-            index = Index(
-                np.concatenate(
-                    [index.values, np.array([self.dtype.na_value], dtype=object)]
-                ),
-                dtype=object,
-            )
-
-        return Series(array, index=index)
-
     def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
         # TODO: https://github.com/pandas-dev/pandas/issues/30037
         # use masked algorithms, rather than object-dtype / np.nan.
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -201,3 +201,50 @@ def copy(self):
         data = data.copy()
         mask = mask.copy()
         return type(self)(data, mask, copy=False)
+
+    def value_counts(self, dropna=True):
+        """
+        Returns a Series containing counts of each unique value.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Don't include counts of missing values.
+
+        Returns
+        -------
+        counts : Series
+
+        See Also
+        --------
+        Series.value_counts
+        """
+        from pandas import Index, Series
+        from pandas.arrays import IntegerArray
+
+        # compute counts on the data with no nans
+        data = self._data[~self._mask]
+        value_counts = Index(data).value_counts()
+
+        # TODO(extension)
+        # once Index is allowed to hold an ExtensionArray,
+        # this becomes easier
+        index = value_counts.index.values.astype(object)
+
+        # if missing values are kept (dropna=False), append the mask count
+        if dropna:
+            counts = value_counts.values
+        else:
+            counts = np.empty(len(value_counts) + 1, dtype="int64")
+            counts[:-1] = value_counts
+            counts[-1] = self._mask.sum()
+
+            index = Index(
+                np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
+                dtype=object,
+            )
+
+        mask = np.zeros(len(counts), dtype="bool")
+        counts = IntegerArray(counts, mask)
+
+        return Series(counts, index=index)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -253,7 +253,7 @@ def _reduce(self, name, skipna=True, **kwargs):
     def value_counts(self, dropna=False):
         from pandas import value_counts
 
-        return value_counts(self._ndarray, dropna=dropna)
+        return value_counts(self._ndarray, dropna=dropna).astype("Int64")
 
     # Overrride parent because we have different return types.
     @classmethod
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -239,3 +239,14 @@ def test_arrow_roundtrip():
     tm.assert_frame_equal(result, df)
     # ensure the missing value is represented by NA and not np.nan or None
     assert result.loc[2, "a"] is pd.NA
+
+
+def test_value_counts_na():
+    arr = pd.array(["a", "b", "a", pd.NA], dtype="string")
+    result = arr.value_counts(dropna=False)
+    expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
+    tm.assert_series_equal(result, expected)
+
+    result = arr.value_counts(dropna=True)
+    expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64")
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py
@@ -868,3 +868,14 @@ def test_arrow_roundtrip():
     result = table.to_pandas()
     assert isinstance(result["a"].dtype, pd.BooleanDtype)
     tm.assert_frame_equal(result, df)
+
+
+def test_value_counts_na():
+    arr = pd.array([True, False, pd.NA], dtype="boolean")
+    result = arr.value_counts(dropna=False)
+    expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
+    tm.assert_series_equal(result, expected)
+
+    result = arr.value_counts(dropna=True)
+    expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
@@ -1039,6 +1039,17 @@ def test_stat_method(pandasmethname, kwargs):
     assert expected == result
 
 
+def test_value_counts_na():
+    arr = pd.array([1, 2, 1, pd.NA], dtype="Int64")
+    result = arr.value_counts(dropna=False)
+    expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64")
+    tm.assert_series_equal(result, expected)
+
+    result = arr.value_counts(dropna=True)
+    expected = pd.Series([2, 1], index=[1, 2], dtype="Int64")
+    tm.assert_series_equal(result, expected)
+
+
 # TODO(jreback) - these need testing / are broken
 
 # shift
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
@@ -226,6 +226,10 @@ def test_searchsorted(self, data_for_sorting, as_series):
         sorter = np.array([1, 0])
         assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
 
+    @pytest.mark.skip(reason="uses nullable integer")
+    def test_value_counts(self, all_data, dropna):
+        return super().test_value_counts(all_data, dropna)
+
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
@@ -209,7 +209,7 @@ class TestMissing(base.BaseMissingTests):
 
 
 class TestMethods(base.BaseMethodsTests):
-    @pytest.mark.parametrize("dropna", [True, False])
+    @pytest.mark.skip(reason="uses nullable integer")
     def test_value_counts(self, all_data, dropna):
         all_data = all_data[:10]
         if dropna:
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -81,7 +81,9 @@ class TestNoReduce(base.BaseNoReduceTests):
 
 
 class TestMethods(base.BaseMethodsTests):
-    pass
+    @pytest.mark.skip(reason="returns nullable")
+    def test_value_counts(self, all_data, dropna):
+        return super().test_value_counts(all_data, dropna)
 
 
 class TestCasting(base.BaseCastingTests):