Skip to content

Commit 6f0cd8d

Browse files
authored
ENH: Implement masked algorithm for value_counts (#54984)
1 parent c8e7a98 commit 6f0cd8d

File tree

8 files changed

+78
-46
lines changed

8 files changed

+78
-46
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ Other enhancements
7777
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
7878
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
7979
- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
80+
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
8081
-
8182

8283
.. ---------------------------------------------------------------------------

pandas/_libs/hashtable.pyi

+1-1
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def value_count(
240240
values: np.ndarray,
241241
dropna: bool,
242242
mask: npt.NDArray[np.bool_] | None = ...,
243-
) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ... # np.ndarray[same-as-values]
243+
) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ... # np.ndarray[same-as-values]
244244

245245
# arr and values should have same dtype
246246
def ismember(

pandas/_libs/hashtable_func_helper.pxi.in

+24-17
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t
3636
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
3737
{{endif}}
3838
cdef:
39-
Py_ssize_t i = 0
39+
Py_ssize_t i = 0, na_counter = 0, na_add = 0
4040
Py_ssize_t n = len(values)
4141
kh_{{ttype}}_t *table
4242

@@ -49,9 +49,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
4949
bint uses_mask = mask is not None
5050
bint isna_entry = False
5151

52-
if uses_mask and not dropna:
53-
raise NotImplementedError("uses_mask not implemented with dropna=False")
54-
5552
# we track the order in which keys are first seen (GH39009),
5653
# khash-map isn't insertion-ordered, thus:
5754
# table maps keys to counts
@@ -82,25 +79,31 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
8279
for i in range(n):
8380
val = {{to_c_type}}(values[i])
8481

82+
if uses_mask:
83+
isna_entry = mask[i]
84+
8585
if dropna:
86-
if uses_mask:
87-
isna_entry = mask[i]
88-
else:
86+
if not uses_mask:
8987
isna_entry = is_nan_{{c_type}}(val)
9088

9189
if not dropna or not isna_entry:
92-
k = kh_get_{{ttype}}(table, val)
93-
if k != table.n_buckets:
94-
table.vals[k] += 1
90+
if uses_mask and isna_entry:
91+
na_counter += 1
9592
else:
96-
k = kh_put_{{ttype}}(table, val, &ret)
97-
table.vals[k] = 1
98-
result_keys.append(val)
93+
k = kh_get_{{ttype}}(table, val)
94+
if k != table.n_buckets:
95+
table.vals[k] += 1
96+
else:
97+
k = kh_put_{{ttype}}(table, val, &ret)
98+
table.vals[k] = 1
99+
result_keys.append(val)
99100
{{endif}}
100101

101102
# collect counts in the order corresponding to result_keys:
103+
if na_counter > 0:
104+
na_add = 1
102105
cdef:
103-
int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
106+
int64_t[::1] result_counts = np.empty(table.size + na_add, dtype=np.int64)
104107

105108
for i in range(table.size):
106109
{{if dtype == 'object'}}
@@ -110,9 +113,13 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
110113
{{endif}}
111114
result_counts[i] = table.vals[k]
112115

116+
if na_counter > 0:
117+
result_counts[table.size] = na_counter
118+
result_keys.append(val)
119+
113120
kh_destroy_{{ttype}}(table)
114121

115-
return result_keys.to_array(), result_counts.base
122+
return result_keys.to_array(), result_counts.base, na_counter
116123

117124

118125
@cython.wraparound(False)
@@ -399,10 +406,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
399406
ndarray[htfunc_t] modes
400407

401408
int64_t[::1] counts
402-
int64_t count, max_count = -1
409+
int64_t count, _, max_count = -1
403410
Py_ssize_t nkeys, k, j = 0
404411

405-
keys, counts = value_count(values, dropna, mask=mask)
412+
keys, counts, _ = value_count(values, dropna, mask=mask)
406413
nkeys = len(keys)
407414

408415
modes = np.empty(nkeys, dtype=values.dtype)

pandas/core/algorithms.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -924,7 +924,7 @@ def value_counts_internal(
924924

925925
else:
926926
values = _ensure_arraylike(values, func_name="value_counts")
927-
keys, counts = value_counts_arraylike(values, dropna)
927+
keys, counts, _ = value_counts_arraylike(values, dropna)
928928
if keys.dtype == np.float16:
929929
keys = keys.astype(np.float32)
930930

@@ -949,7 +949,7 @@ def value_counts_internal(
949949
# Called from SparseArray and the masked arrays (pandas/core/arrays/masked.py), otherwise could be private
950950
def value_counts_arraylike(
951951
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
952-
) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
952+
) -> tuple[ArrayLike, npt.NDArray[np.int64], int]:
953953
"""
954954
Parameters
955955
----------
@@ -965,7 +965,7 @@ def value_counts_arraylike(
965965
original = values
966966
values = _ensure_data(values)
967967

968-
keys, counts = htable.value_count(values, dropna, mask=mask)
968+
keys, counts, na_counter = htable.value_count(values, dropna, mask=mask)
969969

970970
if needs_i8_conversion(original.dtype):
971971
# datetime, timedelta, or period
@@ -975,7 +975,7 @@ def value_counts_arraylike(
975975
keys, counts = keys[mask], counts[mask]
976976

977977
res_keys = _reconstruct_data(keys, original.dtype, original)
978-
return res_keys, counts
978+
return res_keys, counts, na_counter
979979

980980

981981
def duplicated(

pandas/core/arrays/masked.py

+13-19
Original file line numberDiff line numberDiff line change
@@ -1044,28 +1044,22 @@ def value_counts(self, dropna: bool = True) -> Series:
10441044
)
10451045
from pandas.arrays import IntegerArray
10461046

1047-
keys, value_counts = algos.value_counts_arraylike(
1048-
self._data, dropna=True, mask=self._mask
1047+
keys, value_counts, na_counter = algos.value_counts_arraylike(
1048+
self._data, dropna=dropna, mask=self._mask
10491049
)
1050+
mask_index = np.zeros((len(value_counts),), dtype=np.bool_)
1051+
mask = mask_index.copy()
10501052

1051-
if dropna:
1052-
res = Series(value_counts, index=keys, name="count", copy=False)
1053-
res.index = res.index.astype(self.dtype)
1054-
res = res.astype("Int64")
1055-
return res
1053+
if na_counter > 0:
1054+
mask_index[-1] = True
10561055

1057-
# if we want nans, count the mask
1058-
counts = np.empty(len(value_counts) + 1, dtype="int64")
1059-
counts[:-1] = value_counts
1060-
counts[-1] = self._mask.sum()
1061-
1062-
index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value)
1063-
index = index.astype(self.dtype)
1064-
1065-
mask = np.zeros(len(counts), dtype="bool")
1066-
counts_array = IntegerArray(counts, mask)
1067-
1068-
return Series(counts_array, index=index, name="count", copy=False)
1056+
arr = IntegerArray(value_counts, mask)
1057+
index = Index(
1058+
self.dtype.construct_array_type()(
1059+
keys, mask_index # type: ignore[arg-type]
1060+
)
1061+
)
1062+
return Series(arr, index=index, name="count", copy=False)
10691063

10701064
@doc(ExtensionArray.equals)
10711065
def equals(self, other) -> bool:

pandas/core/arrays/sparse/array.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -881,7 +881,7 @@ def value_counts(self, dropna: bool = True) -> Series:
881881
Series,
882882
)
883883

884-
keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
884+
keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
885885
fcounts = self.sp_index.ngaps
886886
if fcounts > 0 and (not self._null_fill_value or not dropna):
887887
mask = isna(keys) if self._null_fill_value else keys == self.fill_value

pandas/tests/libs/test_hashtable.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -586,15 +586,26 @@ def test_value_count(self, dtype, writable):
586586
expected = (np.arange(N) + N).astype(dtype)
587587
values = np.repeat(expected, 5)
588588
values.flags.writeable = writable
589-
keys, counts = ht.value_count(values, False)
589+
keys, counts, _ = ht.value_count(values, False)
590590
tm.assert_numpy_array_equal(np.sort(keys), expected)
591591
assert np.all(counts == 5)
592592

593+
def test_value_count_mask(self, dtype):
594+
if dtype == np.object_:
595+
pytest.skip("mask not implemented for object dtype")
596+
values = np.array([1] * 5, dtype=dtype)
597+
mask = np.zeros((5,), dtype=np.bool_)
598+
mask[1] = True
599+
mask[4] = True
600+
keys, counts, na_counter = ht.value_count(values, False, mask=mask)
601+
assert len(keys) == 2
602+
assert na_counter == 2
603+
593604
def test_value_count_stable(self, dtype, writable):
594605
# GH12679
595606
values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
596607
values.flags.writeable = writable
597-
keys, counts = ht.value_count(values, False)
608+
keys, counts, _ = ht.value_count(values, False)
598609
tm.assert_numpy_array_equal(keys, values)
599610
assert np.all(counts == 1)
600611

@@ -685,9 +696,9 @@ def test_unique_label_indices():
685696
class TestHelpFunctionsWithNans:
686697
def test_value_count(self, dtype):
687698
values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
688-
keys, counts = ht.value_count(values, True)
699+
keys, counts, _ = ht.value_count(values, True)
689700
assert len(keys) == 0
690-
keys, counts = ht.value_count(values, False)
701+
keys, counts, _ = ht.value_count(values, False)
691702
assert len(keys) == 1 and np.all(np.isnan(keys))
692703
assert counts[0] == 3
693704

pandas/tests/series/methods/test_value_counts.py

+19
Original file line numberDiff line numberDiff line change
@@ -250,3 +250,22 @@ def test_value_counts_complex_numbers(self, input_array, expected):
250250
# GH 17927
251251
result = Series(input_array).value_counts()
252252
tm.assert_series_equal(result, expected)
253+
254+
def test_value_counts_masked(self):
255+
# GH#54984
256+
dtype = "Int64"
257+
ser = Series([1, 2, None, 2, None, 3], dtype=dtype)
258+
result = ser.value_counts(dropna=False)
259+
expected = Series(
260+
[2, 2, 1, 1],
261+
index=Index([2, None, 1, 3], dtype=dtype),
262+
dtype=dtype,
263+
name="count",
264+
)
265+
tm.assert_series_equal(result, expected)
266+
267+
result = ser.value_counts(dropna=True)
268+
expected = Series(
269+
[2, 1, 1], index=Index([2, 1, 3], dtype=dtype), dtype=dtype, name="count"
270+
)
271+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)