pandas-dev · phofl · Sep 30, 2023 · Sep 3, 2023 · Sep 3, 2023 · Sep 6, 2023
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -29,6 +29,7 @@ enhancement2
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
+- :meth:`Series.value_counts` now uses masked algorithms for masked (nullable) dtypes, including with ``dropna=False`` (:issue:`54984`)
 -
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
@@ -240,7 +240,7 @@ def value_count(
     values: np.ndarray,
     dropna: bool,
     mask: npt.NDArray[np.bool_] | None = ...,
-) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ...  # np.ndarray[same-as-values]
+) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ...  # np.ndarray[same-as-values]
 
 # arr and values should have same dtype
 def ismember(

diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -36,7 +36,7 @@ cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t
 cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
 {{endif}}
     cdef:
-        Py_ssize_t i = 0
+        Py_ssize_t i = 0, na_counter = 0, na_add = 0
         Py_ssize_t n = len(values)
         kh_{{ttype}}_t *table
 
@@ -49,9 +49,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
         bint uses_mask = mask is not None
         bint isna_entry = False
 
-    if uses_mask and not dropna:
-        raise NotImplementedError("uses_mask not implemented with dropna=False")
-
     # we track the order in which keys are first seen (GH39009),
     # khash-map isn't insertion-ordered, thus:
     #    table maps keys to counts
@@ -82,25 +79,31 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
     for i in range(n):
         val = {{to_c_type}}(values[i])
 
+        if uses_mask:
+            isna_entry = mask[i]
+
         if dropna:
-            if uses_mask:
-                isna_entry = mask[i]
-            else:
+            if not uses_mask:
                 isna_entry = is_nan_{{c_type}}(val)
 
         if not dropna or not isna_entry:
-            k = kh_get_{{ttype}}(table, val)
-            if k != table.n_buckets:
-                table.vals[k] += 1
+            if uses_mask and isna_entry:
+                na_counter += 1
             else:
-                k = kh_put_{{ttype}}(table, val, &ret)
-                table.vals[k] = 1
-                result_keys.append(val)
+                k = kh_get_{{ttype}}(table, val)
+                if k != table.n_buckets:
+                    table.vals[k] += 1
+                else:
+                    k = kh_put_{{ttype}}(table, val, &ret)
+                    table.vals[k] = 1
+                    result_keys.append(val)
     {{endif}}
 
     # collect counts in the order corresponding to result_keys:
+    if na_counter > 0:
+        na_add = 1
     cdef:
-        int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
+        int64_t[::1] result_counts = np.empty(table.size + na_add, dtype=np.int64)
 
     for i in range(table.size):
         {{if dtype == 'object'}}
@@ -110,9 +113,13 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
         {{endif}}
         result_counts[i] = table.vals[k]
 
+    if na_counter > 0:
+        result_counts[table.size] = na_counter  # NA total goes in the extra last slot
+        result_keys.append(val)  # placeholder key: whatever val held last, not a real NA
+
     kh_destroy_{{ttype}}(table)
 
-    return result_keys.to_array(), result_counts.base
+    return result_keys.to_array(), result_counts.base, na_counter
 
 
 @cython.wraparound(False)
@@ -399,10 +406,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
         ndarray[htfunc_t] modes
 
         int64_t[::1] counts
-        int64_t count, max_count = -1
+        int64_t count, _, max_count = -1
         Py_ssize_t nkeys, k, j = 0
 
-    keys, counts = value_count(values, dropna, mask=mask)
+    keys, counts, _ = value_count(values, dropna, mask=mask)
     nkeys = len(keys)
 
     modes = np.empty(nkeys, dtype=values.dtype)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -924,7 +924,7 @@ def value_counts_internal(
 
         else:
             values = _ensure_arraylike(values, func_name="value_counts")
-            keys, counts = value_counts_arraylike(values, dropna)
+            keys, counts, _ = value_counts_arraylike(values, dropna)
             if keys.dtype == np.float16:
                 keys = keys.astype(np.float32)
 
@@ -949,7 +949,7 @@ def value_counts_internal(
 # Called once from SparseArray, otherwise could be private
 def value_counts_arraylike(
     values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
-) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
+) -> tuple[ArrayLike, npt.NDArray[np.int64], int]:
     """
     Parameters
     ----------
@@ -965,7 +965,7 @@ def value_counts_arraylike(
     original = values
     values = _ensure_data(values)
 
-    keys, counts = htable.value_count(values, dropna, mask=mask)
+    keys, counts, na_counter = htable.value_count(values, dropna, mask=mask)
 
     if needs_i8_conversion(original.dtype):
         # datetime, timedelta, or period
@@ -975,7 +975,7 @@ def value_counts_arraylike(
             keys, counts = keys[mask], counts[mask]
 
     res_keys = _reconstruct_data(keys, original.dtype, original)
-    return res_keys, counts
+    return res_keys, counts, na_counter
 
 
 def duplicated(

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -1044,28 +1044,22 @@ def value_counts(self, dropna: bool = True) -> Series:
         )
         from pandas.arrays import IntegerArray
 
-        keys, value_counts = algos.value_counts_arraylike(
-            self._data, dropna=True, mask=self._mask
+        keys, value_counts, na_counter = algos.value_counts_arraylike(
+            self._data, dropna=dropna, mask=self._mask
         )
+        mask_index = np.zeros((len(value_counts),), dtype=np.bool_)
+        mask = mask_index.copy()
 
-        if dropna:
-            res = Series(value_counts, index=keys, name="count", copy=False)
-            res.index = res.index.astype(self.dtype)
-            res = res.astype("Int64")
-            return res
+        if na_counter > 0:
+            mask_index[-1] = True
 
-        # if we want nans, count the mask
-        counts = np.empty(len(value_counts) + 1, dtype="int64")
-        counts[:-1] = value_counts
-        counts[-1] = self._mask.sum()
-
-        index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value)
-        index = index.astype(self.dtype)
-
-        mask = np.zeros(len(counts), dtype="bool")
-        counts_array = IntegerArray(counts, mask)
-
-        return Series(counts_array, index=index, name="count", copy=False)
+        arr = IntegerArray(value_counts, mask)
+        index = Index(
+            self.dtype.construct_array_type()(
+                keys, mask_index  # type: ignore[arg-type]
+            )
+        )
+        return Series(arr, index=index, name="count", copy=False)
 
     @doc(ExtensionArray.equals)
     def equals(self, other) -> bool:

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -881,7 +881,7 @@ def value_counts(self, dropna: bool = True) -> Series:
             Series,
         )
 
-        keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
+        keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
         fcounts = self.sp_index.ngaps
         if fcounts > 0 and (not self._null_fill_value or not dropna):
             mask = isna(keys) if self._null_fill_value else keys == self.fill_value

diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
@@ -586,15 +586,26 @@ def test_value_count(self, dtype, writable):
         expected = (np.arange(N) + N).astype(dtype)
         values = np.repeat(expected, 5)
         values.flags.writeable = writable
-        keys, counts = ht.value_count(values, False)
+        keys, counts, _ = ht.value_count(values, False)
         tm.assert_numpy_array_equal(np.sort(keys), expected)
         assert np.all(counts == 5)
 
+    def test_value_count_mask(self, dtype):
+        if dtype == np.object_:
+            pytest.skip("mask not implemented for object dtype")
+        values = np.array([1] * 5, dtype=dtype)
+        mask = np.zeros((5,), dtype=np.bool_)
+        mask[1] = True  # positions 1 and 4 are flagged as missing
+        mask[4] = True
+        keys, counts, na_counter = ht.value_count(values, False, mask=mask)
+        assert len(keys) == 2  # the value 1 plus one trailing slot for the NAs
+        assert na_counter == 2  # both masked positions are counted as NA
+
     def test_value_count_stable(self, dtype, writable):
         # GH12679
         values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
         values.flags.writeable = writable
-        keys, counts = ht.value_count(values, False)
+        keys, counts, _ = ht.value_count(values, False)
         tm.assert_numpy_array_equal(keys, values)
         assert np.all(counts == 1)
 
@@ -685,9 +696,9 @@ def test_unique_label_indices():
 class TestHelpFunctionsWithNans:
     def test_value_count(self, dtype):
         values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
-        keys, counts = ht.value_count(values, True)
+        keys, counts, _ = ht.value_count(values, True)
         assert len(keys) == 0
-        keys, counts = ht.value_count(values, False)
+        keys, counts, _ = ht.value_count(values, False)
         assert len(keys) == 1 and np.all(np.isnan(keys))
         assert counts[0] == 3
 

diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py
@@ -250,3 +250,22 @@ def test_value_counts_complex_numbers(self, input_array, expected):
         # GH 17927
         result = Series(input_array).value_counts()
         tm.assert_series_equal(result, expected)
+
+    def test_value_counts_masked(self):
+        # GH#54984: value_counts on a masked dtype, with and without the NA entry
+        dtype = "Int64"
+        ser = Series([1, 2, None, 2, None, 3], dtype=dtype)
+        result = ser.value_counts(dropna=False)  # NA is counted as its own entry
+        expected = Series(
+            [2, 2, 1, 1],
+            index=Index([2, None, 1, 3], dtype=dtype),
+            dtype=dtype,
+            name="count",
+        )
+        tm.assert_series_equal(result, expected)
+
+        result = ser.value_counts(dropna=True)  # same data, NA entry left out
+        expected = Series(
+            [2, 1, 1], index=Index([2, 1, 3], dtype=dtype), dtype=dtype, name="count"
+        )
+        tm.assert_series_equal(result, expected)