Skip to content

Commit ebfdc71

Browse files
committed
Revert "ENH: Implement masked algorithm for value_counts (pandas-dev#54984)"
This reverts commit 6f0cd8d
1 parent 00f10db commit ebfdc71

File tree

7 files changed

+46
-77
lines changed

7 files changed

+46
-77
lines changed

pandas/_libs/hashtable.pyi

+1-1
Original file line number | Diff line number | Diff line change
@@ -240,7 +240,7 @@ def value_count(
240240
values: np.ndarray,
241241
dropna: bool,
242242
mask: npt.NDArray[np.bool_] | None = ...,
243-
) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ... # np.ndarray[same-as-values]
243+
) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ... # np.ndarray[same-as-values]
244244

245245
# arr and values should have same dtype
246246
def ismember(

pandas/_libs/hashtable_func_helper.pxi.in

+17-24
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t
3636
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
3737
{{endif}}
3838
cdef:
39-
Py_ssize_t i = 0, na_counter = 0, na_add = 0
39+
Py_ssize_t i = 0
4040
Py_ssize_t n = len(values)
4141
kh_{{ttype}}_t *table
4242

@@ -49,6 +49,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
4949
bint uses_mask = mask is not None
5050
bint isna_entry = False
5151

52+
if uses_mask and not dropna:
53+
raise NotImplementedError("uses_mask not implemented with dropna=False")
54+
5255
# we track the order in which keys are first seen (GH39009),
5356
# khash-map isn't insertion-ordered, thus:
5457
# table maps keys to counts
@@ -79,31 +82,25 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
7982
for i in range(n):
8083
val = {{to_c_type}}(values[i])
8184

82-
if uses_mask:
83-
isna_entry = mask[i]
84-
8585
if dropna:
86-
if not uses_mask:
86+
if uses_mask:
87+
isna_entry = mask[i]
88+
else:
8789
isna_entry = is_nan_{{c_type}}(val)
8890

8991
if not dropna or not isna_entry:
90-
if uses_mask and isna_entry:
91-
na_counter += 1
92+
k = kh_get_{{ttype}}(table, val)
93+
if k != table.n_buckets:
94+
table.vals[k] += 1
9295
else:
93-
k = kh_get_{{ttype}}(table, val)
94-
if k != table.n_buckets:
95-
table.vals[k] += 1
96-
else:
97-
k = kh_put_{{ttype}}(table, val, &ret)
98-
table.vals[k] = 1
99-
result_keys.append(val)
96+
k = kh_put_{{ttype}}(table, val, &ret)
97+
table.vals[k] = 1
98+
result_keys.append(val)
10099
{{endif}}
101100

102101
# collect counts in the order corresponding to result_keys:
103-
if na_counter > 0:
104-
na_add = 1
105102
cdef:
106-
int64_t[::1] result_counts = np.empty(table.size + na_add, dtype=np.int64)
103+
int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
107104

108105
for i in range(table.size):
109106
{{if dtype == 'object'}}
@@ -113,13 +110,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
113110
{{endif}}
114111
result_counts[i] = table.vals[k]
115112

116-
if na_counter > 0:
117-
result_counts[table.size] = na_counter
118-
result_keys.append(val)
119-
120113
kh_destroy_{{ttype}}(table)
121114

122-
return result_keys.to_array(), result_counts.base, na_counter
115+
return result_keys.to_array(), result_counts.base
123116

124117

125118
@cython.wraparound(False)
@@ -406,10 +399,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
406399
ndarray[htfunc_t] modes
407400

408401
int64_t[::1] counts
409-
int64_t count, _, max_count = -1
402+
int64_t count, max_count = -1
410403
Py_ssize_t nkeys, k, j = 0
411404

412-
keys, counts, _ = value_count(values, dropna, mask=mask)
405+
keys, counts = value_count(values, dropna, mask=mask)
413406
nkeys = len(keys)
414407

415408
modes = np.empty(nkeys, dtype=values.dtype)

pandas/core/algorithms.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -923,7 +923,7 @@ def value_counts_internal(
923923

924924
else:
925925
values = _ensure_arraylike(values, func_name="value_counts")
926-
keys, counts, _ = value_counts_arraylike(values, dropna)
926+
keys, counts = value_counts_arraylike(values, dropna)
927927
if keys.dtype == np.float16:
928928
keys = keys.astype(np.float32)
929929

@@ -948,7 +948,7 @@ def value_counts_internal(
948948
# Called once from SparseArray, otherwise could be private
949949
def value_counts_arraylike(
950950
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
951-
) -> tuple[ArrayLike, npt.NDArray[np.int64], int]:
951+
) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
952952
"""
953953
Parameters
954954
----------
@@ -964,7 +964,7 @@ def value_counts_arraylike(
964964
original = values
965965
values = _ensure_data(values)
966966

967-
keys, counts, na_counter = htable.value_count(values, dropna, mask=mask)
967+
keys, counts = htable.value_count(values, dropna, mask=mask)
968968

969969
if needs_i8_conversion(original.dtype):
970970
# datetime, timedelta, or period
@@ -974,7 +974,7 @@ def value_counts_arraylike(
974974
keys, counts = keys[mask], counts[mask]
975975

976976
res_keys = _reconstruct_data(keys, original.dtype, original)
977-
return res_keys, counts, na_counter
977+
return res_keys, counts
978978

979979

980980
def duplicated(

pandas/core/arrays/masked.py

+19-13
Original file line number | Diff line number | Diff line change
@@ -1052,22 +1052,28 @@ def value_counts(self, dropna: bool = True) -> Series:
10521052
)
10531053
from pandas.arrays import IntegerArray
10541054

1055-
keys, value_counts, na_counter = algos.value_counts_arraylike(
1056-
self._data, dropna=dropna, mask=self._mask
1055+
keys, value_counts = algos.value_counts_arraylike(
1056+
self._data, dropna=True, mask=self._mask
10571057
)
1058-
mask_index = np.zeros((len(value_counts),), dtype=np.bool_)
1059-
mask = mask_index.copy()
10601058

1061-
if na_counter > 0:
1062-
mask_index[-1] = True
1059+
if dropna:
1060+
res = Series(value_counts, index=keys, name="count", copy=False)
1061+
res.index = res.index.astype(self.dtype)
1062+
res = res.astype("Int64")
1063+
return res
10631064

1064-
arr = IntegerArray(value_counts, mask)
1065-
index = Index(
1066-
self.dtype.construct_array_type()(
1067-
keys, mask_index # type: ignore[arg-type]
1068-
)
1069-
)
1070-
return Series(arr, index=index, name="count", copy=False)
1065+
# if we want nans, count the mask
1066+
counts = np.empty(len(value_counts) + 1, dtype="int64")
1067+
counts[:-1] = value_counts
1068+
counts[-1] = self._mask.sum()
1069+
1070+
index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value)
1071+
index = index.astype(self.dtype)
1072+
1073+
mask = np.zeros(len(counts), dtype="bool")
1074+
counts_array = IntegerArray(counts, mask)
1075+
1076+
return Series(counts_array, index=index, name="count", copy=False)
10711077

10721078
@doc(ExtensionArray.equals)
10731079
def equals(self, other) -> bool:

pandas/core/arrays/sparse/array.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -890,7 +890,7 @@ def value_counts(self, dropna: bool = True) -> Series:
890890
Series,
891891
)
892892

893-
keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
893+
keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
894894
fcounts = self.sp_index.ngaps
895895
if fcounts > 0 and (not self._null_fill_value or not dropna):
896896
mask = isna(keys) if self._null_fill_value else keys == self.fill_value

pandas/tests/libs/test_hashtable.py

+4-15
Original file line number | Diff line number | Diff line change
@@ -586,26 +586,15 @@ def test_value_count(self, dtype, writable):
586586
expected = (np.arange(N) + N).astype(dtype)
587587
values = np.repeat(expected, 5)
588588
values.flags.writeable = writable
589-
keys, counts, _ = ht.value_count(values, False)
589+
keys, counts = ht.value_count(values, False)
590590
tm.assert_numpy_array_equal(np.sort(keys), expected)
591591
assert np.all(counts == 5)
592592

593-
def test_value_count_mask(self, dtype):
594-
if dtype == np.object_:
595-
pytest.skip("mask not implemented for object dtype")
596-
values = np.array([1] * 5, dtype=dtype)
597-
mask = np.zeros((5,), dtype=np.bool_)
598-
mask[1] = True
599-
mask[4] = True
600-
keys, counts, na_counter = ht.value_count(values, False, mask=mask)
601-
assert len(keys) == 2
602-
assert na_counter == 2
603-
604593
def test_value_count_stable(self, dtype, writable):
605594
# GH12679
606595
values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
607596
values.flags.writeable = writable
608-
keys, counts, _ = ht.value_count(values, False)
597+
keys, counts = ht.value_count(values, False)
609598
tm.assert_numpy_array_equal(keys, values)
610599
assert np.all(counts == 1)
611600

@@ -696,9 +685,9 @@ def test_unique_label_indices():
696685
class TestHelpFunctionsWithNans:
697686
def test_value_count(self, dtype):
698687
values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
699-
keys, counts, _ = ht.value_count(values, True)
688+
keys, counts = ht.value_count(values, True)
700689
assert len(keys) == 0
701-
keys, counts, _ = ht.value_count(values, False)
690+
keys, counts = ht.value_count(values, False)
702691
assert len(keys) == 1 and np.all(np.isnan(keys))
703692
assert counts[0] == 3
704693

pandas/tests/series/methods/test_value_counts.py

-19
Original file line number | Diff line number | Diff line change
@@ -250,22 +250,3 @@ def test_value_counts_complex_numbers(self, input_array, expected):
250250
# GH 17927
251251
result = Series(input_array).value_counts()
252252
tm.assert_series_equal(result, expected)
253-
254-
def test_value_counts_masked(self):
255-
# GH#54984
256-
dtype = "Int64"
257-
ser = Series([1, 2, None, 2, None, 3], dtype=dtype)
258-
result = ser.value_counts(dropna=False)
259-
expected = Series(
260-
[2, 2, 1, 1],
261-
index=Index([2, None, 1, 3], dtype=dtype),
262-
dtype=dtype,
263-
name="count",
264-
)
265-
tm.assert_series_equal(result, expected)
266-
267-
result = ser.value_counts(dropna=True)
268-
expected = Series(
269-
[2, 1, 1], index=Index([2, 1, 3], dtype=dtype), dtype=dtype, name="count"
270-
)
271-
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)