
ENH: Support mask in duplicated algorithm #48150


Merged · 12 commits · Sep 13, 2022
23 changes: 23 additions & 0 deletions asv_bench/benchmarks/algorithms.py
@@ -95,6 +95,29 @@ def time_duplicated(self, unique, keep, dtype):
self.idx.duplicated(keep=keep)


class DuplicatedMaskedArray:

params = [
[True, False],
["first", "last", False],
["Int64", "Float64"],
]
param_names = ["unique", "keep", "dtype"]

def setup(self, unique, keep, dtype):
N = 10**5
data = pd.Series(np.arange(N), dtype=dtype)
data[list(range(1, N, 100))] = pd.NA
if not unique:
data = data.repeat(5)
self.ser = data
# cache is_unique
self.ser.is_unique

def time_duplicated(self, unique, keep, dtype):
self.ser.duplicated(keep=keep)


class Hashing:
def setup_cache(self):
N = 10**5
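For context, a minimal sketch of the user-facing behavior the new DuplicatedMaskedArray benchmark exercises (the example values are illustrative; it assumes NA entries in a masked array are treated as equal to one another, mirroring the existing NaN semantics of duplicated):

import pandas as pd

# Nullable integer Series with repeated values and repeated NAs.
ser = pd.Series([1, 1, pd.NA, pd.NA, 2], dtype="Int64")

# keep="first": the first occurrence of each value (including the NA group)
# is kept, later occurrences are flagged.
ser.duplicated(keep="first")   # expected: [False, True, False, True, False]

# keep=False: every member of a group that occurs more than once is flagged.
ser.duplicated(keep=False)     # expected: [True, True, True, True, False]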
1 change: 1 addition & 0 deletions pandas/_libs/hashtable.pyi
@@ -183,6 +183,7 @@ class IntpHashTable(HashTable): ...
def duplicated(
values: np.ndarray,
keep: Literal["last", "first", False] = ...,
mask: npt.NDArray[np.bool_] | None = ...,
) -> npt.NDArray[np.bool_]: ...
def mode(
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ...
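A rough sketch of how the extended duplicated stub is meant to be called at the hashtable layer (illustrative only; the array values and boolean mask below are made up, and the mask marks which positions are NA):

import numpy as np
from pandas._libs import hashtable as htable

values = np.array([1, 1, 2, 1], dtype=np.int64)
mask = np.array([False, True, False, True])  # positions 1 and 3 are NA

# The payload behind a masked position is ignored; the two NAs form their own
# duplicate group, so only the second NA is flagged with keep="first".
htable.duplicated(values, keep="first", mask=mask)
# expected: array([False, False, False,  True])

# Omitting the mask keeps the previous behavior (positions 1 and 3 would then
# be treated as duplicates of the value 1).
htable.duplicated(values, keep="first")
# expected: array([False,  True, False,  True])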
90 changes: 59 additions & 31 deletions pandas/_libs/hashtable_func_helper.pxi.in
@@ -118,9 +118,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None):
{{else}}
cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None):
{{endif}}
cdef:
int ret = 0
@@ -129,10 +129,12 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
{{else}}
PyObject* value
{{endif}}
Py_ssize_t i, n = len(values)
Py_ssize_t i, n = len(values), first_na = -1
khiter_t k
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
bint seen_na = False, uses_mask = mask is not None
bint seen_multiple_na = False

kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

@@ -147,9 +149,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
{{endif}}
for i in range(n - 1, -1, -1):
# equivalent: range(n)[::-1], which cython doesn't like in nogil
value = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, value, &ret)
out[i] = ret == 0
if uses_mask and mask[i]:
if seen_na:
out[i] = True
else:
out[i] = False
seen_na = True
else:
value = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, value, &ret)
out[i] = ret == 0

elif keep == 'first':
{{if dtype == 'object'}}
@@ -158,9 +167,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
with nogil:
{{endif}}
for i in range(n):
[Review comment] mroeschke (Member), Sep 6, 2022:
Would be nice if we could template the for i in ... loop based on {{ if keep == "first"/"last" }} to deduplicate this similar logic, but that could be a follow-up.

[Reply] Member Author:
Oh, good idea. Will do as a follow-up to keep the diff reviewable.
value = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, value, &ret)
out[i] = ret == 0
if uses_mask and mask[i]:
if seen_na:
out[i] = True
else:
out[i] = False
seen_na = True
else:
value = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, value, &ret)
out[i] = ret == 0

else:
{{if dtype == 'object'}}
@@ -169,15 +185,27 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
with nogil:
{{endif}}
for i in range(n):
value = {{to_c_type}}(values[i])
k = kh_get_{{ttype}}(table, value)
if k != table.n_buckets:
out[table.vals[k]] = 1
out[i] = 1
if uses_mask and mask[i]:
if not seen_na:
first_na = i
seen_na = True
out[i] = 0  # a lone NA is not a duplicate; out is np.empty, so it must be set here
elif not seen_multiple_na:
out[i] = 1
out[first_na] = 1
seen_multiple_na = True
else:
out[i] = 1

else:
k = kh_put_{{ttype}}(table, value, &ret)
table.vals[k] = i
out[i] = 0
value = {{to_c_type}}(values[i])
k = kh_get_{{ttype}}(table, value)
if k != table.n_buckets:
out[table.vals[k]] = 1
out[i] = 1
else:
k = kh_put_{{ttype}}(table, value, &ret)
table.vals[k] = i
out[i] = 0

kh_destroy_{{ttype}}(table)
return out
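To make the mask handling above easier to follow, here is a rough pure-Python restatement of the intended semantics (an illustrative sketch, not the actual Cython implementation): masked positions form a single NA group whose kept occurrence depends on keep, and with keep=False a lone NA is never flagged while two or more NAs flag the whole group.

def duplicated_reference(values, mask, keep="first"):
    # Reference sketch of the semantics; the real code uses a khash table.
    n = len(values)
    out = [False] * n

    if keep in ("first", "last"):
        order = range(n) if keep == "first" else range(n - 1, -1, -1)
        seen = set()
        seen_na = False
        for i in order:
            if mask[i]:
                out[i] = seen_na          # every NA after the kept one is a duplicate
                seen_na = True
            else:
                out[i] = values[i] in seen
                seen.add(values[i])
    else:  # keep=False
        first_pos = {}                    # value -> index of its first occurrence
        first_na = -1
        for i in range(n):
            if mask[i]:
                if first_na == -1:
                    first_na = i          # flag the first NA only if another NA shows up
                else:
                    out[i] = True
                    out[first_na] = True
            else:
                if values[i] in first_pos:
                    out[i] = True
                    out[first_pos[values[i]]] = True
                else:
                    first_pos[values[i]] = i
    return out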
@@ -301,37 +329,37 @@ cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=N
raise TypeError(values.dtype)


cpdef duplicated(ndarray[htfunc_t] values, object keep="first"):
cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None):
if htfunc_t is object:
return duplicated_object(values, keep)
return duplicated_object(values, keep, mask=mask)

elif htfunc_t is int8_t:
return duplicated_int8(values, keep)
return duplicated_int8(values, keep, mask=mask)
elif htfunc_t is int16_t:
return duplicated_int16(values, keep)
return duplicated_int16(values, keep, mask=mask)
elif htfunc_t is int32_t:
return duplicated_int32(values, keep)
return duplicated_int32(values, keep, mask=mask)
elif htfunc_t is int64_t:
return duplicated_int64(values, keep)
return duplicated_int64(values, keep, mask=mask)

elif htfunc_t is uint8_t:
return duplicated_uint8(values, keep)
return duplicated_uint8(values, keep, mask=mask)
elif htfunc_t is uint16_t:
return duplicated_uint16(values, keep)
return duplicated_uint16(values, keep, mask=mask)
elif htfunc_t is uint32_t:
return duplicated_uint32(values, keep)
return duplicated_uint32(values, keep, mask=mask)
elif htfunc_t is uint64_t:
return duplicated_uint64(values, keep)
return duplicated_uint64(values, keep, mask=mask)

elif htfunc_t is float64_t:
return duplicated_float64(values, keep)
return duplicated_float64(values, keep, mask=mask)
elif htfunc_t is float32_t:
return duplicated_float32(values, keep)
return duplicated_float32(values, keep, mask=mask)

elif htfunc_t is complex128_t:
return duplicated_complex128(values, keep)
return duplicated_complex128(values, keep, mask=mask)
elif htfunc_t is complex64_t:
return duplicated_complex64(values, keep)
return duplicated_complex64(values, keep, mask=mask)

else:
raise TypeError(values.dtype)
4 changes: 4 additions & 0 deletions pandas/core/algorithms.py
@@ -1041,6 +1041,10 @@ def duplicated(
-------
duplicated : ndarray[bool]
"""
if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype):
[Review comment] Member Author:
This assumes that we don't want to add duplicated to the EA interface.

[Reply] Member:
Might be good to add an issue to discuss this. Kind of unfortunate we need this exception for BaseMaskedDtype.

[Reply] Member Author:
Yeah, not happy about this either; will open a follow-up issue to discuss.

values = cast("BaseMaskedArray", values)
return htable.duplicated(values._data, keep=keep, mask=values._mask)

values = _ensure_data(values)
return htable.duplicated(values, keep=keep)
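As a usage illustration of the new branch (a hypothetical interactive session; pandas.core.algorithms is an internal module, and _data/_mask are the masked array's internal buffers):

import pandas as pd
from pandas.core import algorithms

arr = pd.array([1, 1, pd.NA, pd.NA], dtype="Int64")

# BaseMaskedArray stores payload and validity separately; the new branch forwards
# arr._data together with arr._mask instead of converting to float64 with NaN.
algorithms.duplicated(arr, keep="first")
# expected: array([False,  True, False,  True])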
