-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF: Use String HashTable in value_counts #52113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5695b24
0f20375
ada23cd
6470143
37b3b02
f725a16
b8eb45d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,34 +6,35 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in | |
|
||
{{py: | ||
|
||
# name, dtype, ttype, c_type, to_c_type | ||
dtypes = [('Complex128', 'complex128', 'complex128', | ||
# name, vector_name, dtype, ttype, c_type, to_c_type | ||
dtypes = [('complex128', 'Complex128', 'complex128', 'complex128', | ||
'khcomplex128_t', 'to_khcomplex128_t'), | ||
('Complex64', 'complex64', 'complex64', | ||
('complex64', 'Complex64', 'complex64', 'complex64', | ||
'khcomplex64_t', 'to_khcomplex64_t'), | ||
('Float64', 'float64', 'float64', 'float64_t', ''), | ||
('Float32', 'float32', 'float32', 'float32_t', ''), | ||
('UInt64', 'uint64', 'uint64', 'uint64_t', ''), | ||
('UInt32', 'uint32', 'uint32', 'uint32_t', ''), | ||
('UInt16', 'uint16', 'uint16', 'uint16_t', ''), | ||
('UInt8', 'uint8', 'uint8', 'uint8_t', ''), | ||
('Object', 'object', 'pymap', 'object', '<PyObject*>'), | ||
('Int64', 'int64', 'int64', 'int64_t', ''), | ||
('Int32', 'int32', 'int32', 'int32_t', ''), | ||
('Int16', 'int16', 'int16', 'int16_t', ''), | ||
('Int8', 'int8', 'int8', 'int8_t', '')] | ||
('float64', 'Float64', 'float64', 'float64', 'float64_t', ''), | ||
('float32', 'Float32', 'float32', 'float32', 'float32_t', ''), | ||
('uint64', 'UInt64', 'uint64', 'uint64', 'uint64_t', ''), | ||
('uint32', 'UInt32', 'uint32', 'uint32', 'uint32_t', ''), | ||
('uint16', 'UInt16', 'uint16', 'uint16', 'uint16_t', ''), | ||
('uint8', 'UInt8', 'uint8', 'uint8', 'uint8_t', ''), | ||
('object', 'Object', 'object', 'pymap', 'object', ''), | ||
('string', 'Object', 'object', 'str', 'char *', 'util.get_c_string'), | ||
('int64', 'Int64', 'int64', 'int64', 'int64_t', ''), | ||
('int32', 'Int32', 'int32', 'int32', 'int32_t', ''), | ||
('int16', 'Int16', 'int16', 'int16', 'int16_t', ''), | ||
('int8', 'Int8', 'int8', 'int8', 'int8_t', '')] | ||
|
||
}} | ||
|
||
{{for name, dtype, ttype, c_type, to_c_type in dtypes}} | ||
{{for name, vector_name, dtype, ttype, c_type, to_c_type in dtypes}} | ||
|
||
|
||
@cython.wraparound(False) | ||
@cython.boundscheck(False) | ||
{{if dtype == 'object'}} | ||
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None): | ||
cdef value_count_{{name}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None): | ||
{{else}} | ||
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None): | ||
cdef value_count_{{name}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None): | ||
{{endif}} | ||
cdef: | ||
Py_ssize_t i = 0 | ||
|
@@ -43,12 +44,18 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 | |
# Don't use Py_ssize_t, since table.n_buckets is unsigned | ||
khiter_t k | ||
|
||
{{if name != 'string'}} | ||
{{c_type}} val | ||
{{else}} | ||
object val | ||
{{endif}} | ||
|
||
int ret = 0 | ||
bint uses_mask = mask is not None | ||
bint isna_entry = False | ||
|
||
dict na_dict = dict() | ||
|
||
if uses_mask and not dropna: | ||
raise NotImplementedError("uses_mask not implemented with dropna=False") | ||
|
||
|
@@ -57,7 +64,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 | |
# table maps keys to counts | ||
# result_keys remembers the original order of keys | ||
|
||
result_keys = {{name}}Vector() | ||
result_keys = {{vector_name}}Vector() | ||
table = kh_init_{{ttype}}() | ||
|
||
{{if dtype == 'object'}} | ||
|
@@ -69,11 +76,20 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 | |
for i in range(n): | ||
val = values[i] | ||
if not dropna or not checknull(val): | ||
k = kh_get_{{ttype}}(table, {{to_c_type}}val) | ||
{{if name == 'string'}} | ||
if not isinstance(val, str): | ||
if val in na_dict: | ||
na_dict[val] += 1 | ||
else: | ||
na_dict[val] = 1 | ||
result_keys.append(val) | ||
continue | ||
{{endif}} | ||
k = kh_get_{{ttype}}(table, {{to_c_type}}(val)) | ||
if k != table.n_buckets: | ||
table.vals[k] += 1 | ||
else: | ||
k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret) | ||
k = kh_put_{{ttype}}(table, {{to_c_type}}(val), &ret) | ||
table.vals[k] = 1 | ||
result_keys.append(val) | ||
{{else}} | ||
|
@@ -100,11 +116,17 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 | |
|
||
# collect counts in the order corresponding to result_keys: | ||
cdef: | ||
int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64) | ||
int64_t[::1] result_counts = np.empty(len(result_keys), dtype=np.int64) | ||
|
||
for i in range(table.size): | ||
for i in range(len(result_keys)): | ||
{{if dtype == 'object'}} | ||
k = kh_get_{{ttype}}(table, result_keys.data[i]) | ||
val = <object>result_keys.data[i] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why did the object branch need to change? |
||
{{if name == 'string'}} | ||
if not isinstance(val, str): | ||
result_counts[i] = na_dict[val] | ||
continue | ||
{{endif}} | ||
k = kh_get_{{ttype}}(table, {{to_c_type}}(val)) | ||
{{else}} | ||
k = kh_get_{{ttype}}(table, result_keys.data.data[i]) | ||
{{endif}} | ||
|
@@ -115,27 +137,25 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 | |
return result_keys.to_array(), result_counts.base | ||
|
||
|
||
{{if name != 'string'}} | ||
@cython.wraparound(False) | ||
@cython.boundscheck(False) | ||
{{if dtype == 'object'}} | ||
cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None): | ||
cdef duplicated_{{name}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None): | ||
{{else}} | ||
cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None): | ||
cdef duplicated_{{name}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None): | ||
{{endif}} | ||
cdef: | ||
int ret = 0 | ||
{{if dtype != 'object'}} | ||
{{c_type}} value | ||
{{else}} | ||
PyObject* value | ||
{{endif}} | ||
Py_ssize_t i, n = len(values), first_na = -1 | ||
khiter_t k | ||
kh_{{ttype}}_t *table = kh_init_{{ttype}}() | ||
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') | ||
bint seen_na = False, uses_mask = mask is not None | ||
bint seen_multiple_na = False | ||
|
||
|
||
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) | ||
|
||
if keep not in ('last', 'first', False): | ||
|
@@ -197,19 +217,19 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons | |
|
||
kh_destroy_{{ttype}}(table) | ||
return out | ||
|
||
{{endif}} | ||
|
||
# ---------------------------------------------------------------------- | ||
# Membership | ||
# ---------------------------------------------------------------------- | ||
|
||
|
||
{{if name != 'string'}} | ||
@cython.wraparound(False) | ||
@cython.boundscheck(False) | ||
{{if dtype == 'object'}} | ||
cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): | ||
cdef ismember_{{name}}(ndarray[{{dtype}}] arr, ndarray[{{dtype}}] values): | ||
{{else}} | ||
cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): | ||
cdef ismember_{{name}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): | ||
{{endif}} | ||
""" | ||
Return boolean of values in arr on an | ||
|
@@ -230,11 +250,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): | |
int ret = 0 | ||
ndarray[uint8_t] result | ||
|
||
{{if dtype == "object"}} | ||
PyObject* val | ||
{{else}} | ||
{{c_type}} val | ||
{{endif}} | ||
|
||
kh_{{ttype}}_t *table = kh_init_{{ttype}}() | ||
|
||
|
@@ -267,7 +283,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): | |
|
||
kh_destroy_{{ttype}}(table) | ||
return result.view(np.bool_) | ||
|
||
{{endif}} | ||
# ---------------------------------------------------------------------- | ||
# Mode Computations | ||
# ---------------------------------------------------------------------- | ||
|
@@ -283,6 +299,8 @@ ctypedef fused htfunc_t: | |
|
||
cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): | ||
if htfunc_t is object: | ||
if lib.is_string_array(values, skipna=True): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a viable way to avoid needing |
||
return value_count_string(values, dropna, mask=mask) | ||
return value_count_object(values, dropna, mask=mask) | ||
|
||
elif htfunc_t is int8_t: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,9 +53,9 @@ cdef extern from "khash_python.h": | |
kh_pymap_t* kh_init_pymap() | ||
void kh_destroy_pymap(kh_pymap_t*) | ||
void kh_clear_pymap(kh_pymap_t*) | ||
khuint_t kh_get_pymap(kh_pymap_t*, PyObject*) | ||
khuint_t kh_get_pymap(kh_pymap_t*, object) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this just a cleanup or did it actually change the behavior in some way? I have a slight preference for the extern declarations to match exactly the way they are declared in the source headers to minimize having to interpret what is happening; at the end of the day the linker isn't going to type check just resolve the symbols |
||
void kh_resize_pymap(kh_pymap_t*, khuint_t) | ||
khuint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*) | ||
khuint_t kh_put_pymap(kh_pymap_t*, object, int*) | ||
void kh_del_pymap(kh_pymap_t*, khuint_t) | ||
|
||
bint kh_exist_pymap(kh_pymap_t*, khiter_t) | ||
|
@@ -76,7 +76,7 @@ cdef extern from "khash_python.h": | |
|
||
bint kh_exist_pyset(kh_pyset_t*, khiter_t) | ||
|
||
ctypedef char* kh_cstr_t | ||
ctypedef const char* kh_cstr_t | ||
|
||
ctypedef struct kh_str_t: | ||
khuint_t n_buckets, size, n_occupied, upper_bound | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you can make this const char *