Skip to content

ENH: Implement masked algorithm for value_counts #54984

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Sep 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ enhancement2
Other enhancements
^^^^^^^^^^^^^^^^^^
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
-

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def value_count(
values: np.ndarray,
dropna: bool,
mask: npt.NDArray[np.bool_] | None = ...,
) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ... # np.ndarray[same-as-values]
) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ... # np.ndarray[same-as-values]

# arr and values should have same dtype
def ismember(
Expand Down
41 changes: 24 additions & 17 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
{{endif}}
cdef:
Py_ssize_t i = 0
Py_ssize_t i = 0, na_counter = 0, na_add = 0
Py_ssize_t n = len(values)
kh_{{ttype}}_t *table

Expand All @@ -49,9 +49,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
bint uses_mask = mask is not None
bint isna_entry = False

if uses_mask and not dropna:
raise NotImplementedError("uses_mask not implemented with dropna=False")

# we track the order in which keys are first seen (GH39009),
# khash-map isn't insertion-ordered, thus:
# table maps keys to counts
Expand Down Expand Up @@ -82,25 +79,31 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
for i in range(n):
val = {{to_c_type}}(values[i])

if uses_mask:
isna_entry = mask[i]

if dropna:
if uses_mask:
isna_entry = mask[i]
else:
if not uses_mask:
isna_entry = is_nan_{{c_type}}(val)

if not dropna or not isna_entry:
k = kh_get_{{ttype}}(table, val)
if k != table.n_buckets:
table.vals[k] += 1
if uses_mask and isna_entry:
na_counter += 1
else:
k = kh_put_{{ttype}}(table, val, &ret)
table.vals[k] = 1
result_keys.append(val)
k = kh_get_{{ttype}}(table, val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_{{ttype}}(table, val, &ret)
table.vals[k] = 1
result_keys.append(val)
{{endif}}

# collect counts in the order corresponding to result_keys:
if na_counter > 0:
na_add = 1
cdef:
int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
int64_t[::1] result_counts = np.empty(table.size + na_add, dtype=np.int64)

for i in range(table.size):
{{if dtype == 'object'}}
Expand All @@ -110,9 +113,13 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
{{endif}}
result_counts[i] = table.vals[k]

if na_counter > 0:
result_counts[table.size] = na_counter
result_keys.append(val)

kh_destroy_{{ttype}}(table)

return result_keys.to_array(), result_counts.base
return result_keys.to_array(), result_counts.base, na_counter


@cython.wraparound(False)
Expand Down Expand Up @@ -399,10 +406,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
ndarray[htfunc_t] modes

int64_t[::1] counts
int64_t count, max_count = -1
int64_t count, _, max_count = -1
Py_ssize_t nkeys, k, j = 0

keys, counts = value_count(values, dropna, mask=mask)
keys, counts, _ = value_count(values, dropna, mask=mask)
nkeys = len(keys)

modes = np.empty(nkeys, dtype=values.dtype)
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,7 +924,7 @@ def value_counts_internal(

else:
values = _ensure_arraylike(values, func_name="value_counts")
keys, counts = value_counts_arraylike(values, dropna)
keys, counts, _ = value_counts_arraylike(values, dropna)
if keys.dtype == np.float16:
keys = keys.astype(np.float32)

Expand All @@ -949,7 +949,7 @@ def value_counts_internal(
# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
) -> tuple[ArrayLike, npt.NDArray[np.int64], int]:
"""
Parameters
----------
Expand All @@ -965,7 +965,7 @@ def value_counts_arraylike(
original = values
values = _ensure_data(values)

keys, counts = htable.value_count(values, dropna, mask=mask)
keys, counts, na_counter = htable.value_count(values, dropna, mask=mask)

if needs_i8_conversion(original.dtype):
# datetime, timedelta, or period
Expand All @@ -975,7 +975,7 @@ def value_counts_arraylike(
keys, counts = keys[mask], counts[mask]

res_keys = _reconstruct_data(keys, original.dtype, original)
return res_keys, counts
return res_keys, counts, na_counter


def duplicated(
Expand Down
32 changes: 13 additions & 19 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -1044,28 +1044,22 @@ def value_counts(self, dropna: bool = True) -> Series:
)
from pandas.arrays import IntegerArray

keys, value_counts = algos.value_counts_arraylike(
self._data, dropna=True, mask=self._mask
keys, value_counts, na_counter = algos.value_counts_arraylike(
self._data, dropna=dropna, mask=self._mask
)
mask_index = np.zeros((len(value_counts),), dtype=np.bool_)
mask = mask_index.copy()

if dropna:
res = Series(value_counts, index=keys, name="count", copy=False)
res.index = res.index.astype(self.dtype)
res = res.astype("Int64")
return res
if na_counter > 0:
mask_index[-1] = True

# if we want nans, count the mask
counts = np.empty(len(value_counts) + 1, dtype="int64")
counts[:-1] = value_counts
counts[-1] = self._mask.sum()

index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value)
index = index.astype(self.dtype)

mask = np.zeros(len(counts), dtype="bool")
counts_array = IntegerArray(counts, mask)

return Series(counts_array, index=index, name="count", copy=False)
arr = IntegerArray(value_counts, mask)
index = Index(
self.dtype.construct_array_type()(
keys, mask_index # type: ignore[arg-type]
)
)
return Series(arr, index=index, name="count", copy=False)

@doc(ExtensionArray.equals)
def equals(self, other) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,7 @@ def value_counts(self, dropna: bool = True) -> Series:
Series,
)

keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
fcounts = self.sp_index.ngaps
if fcounts > 0 and (not self._null_fill_value or not dropna):
mask = isna(keys) if self._null_fill_value else keys == self.fill_value
Expand Down
19 changes: 15 additions & 4 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,15 +586,26 @@ def test_value_count(self, dtype, writable):
expected = (np.arange(N) + N).astype(dtype)
values = np.repeat(expected, 5)
values.flags.writeable = writable
keys, counts = ht.value_count(values, False)
keys, counts, _ = ht.value_count(values, False)
tm.assert_numpy_array_equal(np.sort(keys), expected)
assert np.all(counts == 5)

def test_value_count_mask(self, dtype):
    """Check that ``ht.value_count`` tallies masked entries separately.

    With ``dropna=False`` and a mask supplied, masked positions must not be
    counted under their underlying value. They are reported through the
    returned NA counter and as one extra trailing slot in keys/counts.
    """
    if dtype == np.object_:
        pytest.skip("mask not implemented for object dtype")
    values = np.array([1] * 5, dtype=dtype)
    mask = np.zeros((5,), dtype=np.bool_)
    # Mask two of the five identical values.
    mask[1] = True
    mask[4] = True
    keys, counts, na_counter = ht.value_count(values, False, mask=mask)
    # One real key plus one trailing slot standing in for the masked entries.
    assert len(keys) == 2
    assert len(counts) == 2
    # The three unmasked occurrences are counted under the real key ...
    assert keys[0] == 1
    assert counts[0] == 3
    # ... and the two masked ones only in the trailing slot / NA counter.
    assert counts[1] == 2
    assert na_counter == 2

def test_value_count_stable(self, dtype, writable):
# GH12679
values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
values.flags.writeable = writable
keys, counts = ht.value_count(values, False)
keys, counts, _ = ht.value_count(values, False)
tm.assert_numpy_array_equal(keys, values)
assert np.all(counts == 1)

Expand Down Expand Up @@ -685,9 +696,9 @@ def test_unique_label_indices():
class TestHelpFunctionsWithNans:
def test_value_count(self, dtype):
values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
keys, counts = ht.value_count(values, True)
keys, counts, _ = ht.value_count(values, True)
assert len(keys) == 0
keys, counts = ht.value_count(values, False)
keys, counts, _ = ht.value_count(values, False)
assert len(keys) == 1 and np.all(np.isnan(keys))
assert counts[0] == 3

Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/series/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,3 +250,22 @@ def test_value_counts_complex_numbers(self, input_array, expected):
# GH 17927
result = Series(input_array).value_counts()
tm.assert_series_equal(result, expected)

def test_value_counts_masked(self):
    """GH#54984: ``value_counts`` on masked ``Int64`` data, with and without NA."""
    nullable = "Int64"
    data = Series([1, 2, None, 2, None, 3], dtype=nullable)

    # (dropna, expected index labels, expected counts)
    cases = (
        (False, [2, None, 1, 3], [2, 2, 1, 1]),
        (True, [2, 1, 3], [2, 1, 1]),
    )
    for dropna, labels, tallies in cases:
        got = data.value_counts(dropna=dropna)
        want = Series(
            tallies,
            index=Index(labels, dtype=nullable),
            dtype=nullable,
            name="count",
        )
        tm.assert_series_equal(got, want)