PERF: Use String HashTable in value_counts #52113

Closed (wants to merge 7 commits)

11 changes: 9 additions & 2 deletions asv_bench/benchmarks/series_methods.py
@@ -194,15 +194,22 @@ def time_clip(self):


class ValueCounts:
params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object"]]
params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object", "string"]]
param_names = ["N", "dtype"]

def setup(self, N, dtype):
self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype)
ser = Series(np.random.randint(0, N, size=10 * N))
if dtype == "string":
self.s = tm.makeStringIndex(N)
else:
self.s = ser.astype(dtype)

def time_value_counts(self, N, dtype):
self.s.value_counts()

def peakmem_value_counts(self, N, dtype):
self.s.value_counts()


class ValueCountsEA:
params = [[10**3, 10**4, 10**5], [True, False]]
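For reference, a rough standalone timing sketch of the new "string" case follows; it is not the asv benchmark itself, and the generated labels merely stand in for tm.makeStringIndex(N).

# Illustrative approximation of the benchmark's "string" setup, not the asv class.
import timeit

import pandas as pd

N = 10**4
# stand-in for tm.makeStringIndex(N): an object-dtype Index of N distinct strings
s = pd.Index([f"label_{i}" for i in range(N)], dtype=object)

# the time_value_counts / peakmem_value_counts body: count the string labels
print(timeit.timeit(s.value_counts, number=100))
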
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
@@ -116,6 +116,7 @@ Deprecations

Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvement in :meth:`Series.value_counts`, :meth:`DataFrame.value_counts`, and :meth:`Index.value_counts` for strings (:issue:`14860`)
- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
- Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`)
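As a rough illustration (the data and sizes here are invented), these are the kinds of calls the entry above refers to:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
words = rng.choice([f"key_{i}" for i in range(1_000)], size=100_000)

ser = pd.Series(words, dtype=object)                # Python strings in an object-dtype Series
print(ser.value_counts().head())                    # Series.value_counts
print(pd.Index(ser).value_counts().head())          # Index.value_counts
print(ser.to_frame("word").value_counts().head())   # DataFrame.value_counts
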
1 change: 1 addition & 0 deletions pandas/_libs/hashtable.pyx
@@ -21,6 +21,7 @@ cnp.import_array()


from pandas._libs cimport util
from pandas._libs import lib
from pandas._libs.dtypes cimport numeric_object_t
from pandas._libs.khash cimport (
KHASH_TRACE_DOMAIN,
14 changes: 7 additions & 7 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -1269,7 +1269,7 @@ cdef class PyObjectHashTable(HashTable):
khiter_t k
hash(key)

k = kh_get_pymap(self.table, <PyObject*>key)
k = kh_get_pymap(self.table, key)
return k != self.table.n_buckets

def sizeof(self, deep: bool = False) -> int:
@@ -1296,7 +1296,7 @@ cdef class PyObjectHashTable(HashTable):
cdef:
khiter_t k

k = kh_get_pymap(self.table, <PyObject*>val)
k = kh_get_pymap(self.table, val)
if k != self.table.n_buckets:
return self.table.vals[k]
else:
@@ -1310,7 +1310,7 @@ cdef class PyObjectHashTable(HashTable):

hash(key)

k = kh_put_pymap(self.table, <PyObject*>key, &ret)
k = kh_put_pymap(self.table, key, &ret)
if kh_exist_pymap(self.table, k):
self.table.vals[k] = val
else:
@@ -1328,7 +1328,7 @@ cdef class PyObjectHashTable(HashTable):
val = values[i]
hash(val)

k = kh_put_pymap(self.table, <PyObject*>val, &ret)
k = kh_put_pymap(self.table, val, &ret)
self.table.vals[k] = i

def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
@@ -1345,7 +1345,7 @@ cdef class PyObjectHashTable(HashTable):
val = values[i]
hash(val)

k = kh_get_pymap(self.table, <PyObject*>val)
k = kh_get_pymap(self.table, val)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
else:
@@ -1418,10 +1418,10 @@ cdef class PyObjectHashTable(HashTable):
labels[i] = na_sentinel
continue

k = kh_get_pymap(self.table, <PyObject*>val)
k = kh_get_pymap(self.table, val)
if k == self.table.n_buckets:
# k hasn't been seen yet
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
k = kh_put_pymap(self.table, val, &ret)
uniques.append(val)
if return_inverse:
self.table.vals[k] = count
94 changes: 56 additions & 38 deletions pandas/_libs/hashtable_func_helper.pxi.in
@@ -6,34 +6,35 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

{{py:

# name, dtype, ttype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'complex128',
# name, vector_name, dtype, ttype, c_type, to_c_type
dtypes = [('complex128', 'Complex128', 'complex128', 'complex128',
'khcomplex128_t', 'to_khcomplex128_t'),
('Complex64', 'complex64', 'complex64',
('complex64', 'Complex64', 'complex64', 'complex64',
'khcomplex64_t', 'to_khcomplex64_t'),
('Float64', 'float64', 'float64', 'float64_t', ''),
('Float32', 'float32', 'float32', 'float32_t', ''),
('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
('Object', 'object', 'pymap', 'object', '<PyObject*>'),
('Int64', 'int64', 'int64', 'int64_t', ''),
('Int32', 'int32', 'int32', 'int32_t', ''),
('Int16', 'int16', 'int16', 'int16_t', ''),
('Int8', 'int8', 'int8', 'int8_t', '')]
('float64', 'Float64', 'float64', 'float64', 'float64_t', ''),
('float32', 'Float32', 'float32', 'float32', 'float32_t', ''),
('uint64', 'UInt64', 'uint64', 'uint64', 'uint64_t', ''),
('uint32', 'UInt32', 'uint32', 'uint32', 'uint32_t', ''),
('uint16', 'UInt16', 'uint16', 'uint16', 'uint16_t', ''),
('uint8', 'UInt8', 'uint8', 'uint8', 'uint8_t', ''),
('object', 'Object', 'object', 'pymap', 'object', ''),
('string', 'Object', 'object', 'str', 'char *', 'util.get_c_string'),
Review comment (Member): I think you can make this const char *

('int64', 'Int64', 'int64', 'int64', 'int64_t', ''),
('int32', 'Int32', 'int32', 'int32', 'int32_t', ''),
('int16', 'Int16', 'int16', 'int16', 'int16_t', ''),
('int8', 'Int8', 'int8', 'int8', 'int8_t', '')]

}}

{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
{{for name, vector_name, dtype, ttype, c_type, to_c_type in dtypes}}


@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None):
cdef value_count_{{name}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None):
{{else}}
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
cdef value_count_{{name}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
{{endif}}
cdef:
Py_ssize_t i = 0
@@ -43,12 +43,18 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
# Don't use Py_ssize_t, since table.n_buckets is unsigned
khiter_t k

{{if name != 'string'}}
{{c_type}} val
{{else}}
object val
{{endif}}

int ret = 0
bint uses_mask = mask is not None
bint isna_entry = False

dict na_dict = dict()

if uses_mask and not dropna:
raise NotImplementedError("uses_mask not implemented with dropna=False")

@@ -57,7 +64,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
# table maps keys to counts
# result_keys remembers the original order of keys

result_keys = {{name}}Vector()
result_keys = {{vector_name}}Vector()
table = kh_init_{{ttype}}()

{{if dtype == 'object'}}
@@ -69,11 +76,20 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
for i in range(n):
val = values[i]
if not dropna or not checknull(val):
k = kh_get_{{ttype}}(table, {{to_c_type}}val)
{{if name == 'string'}}
if not isinstance(val, str):
if val in na_dict:
na_dict[val] += 1
else:
na_dict[val] = 1
result_keys.append(val)
continue
{{endif}}
k = kh_get_{{ttype}}(table, {{to_c_type}}(val))
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret)
k = kh_put_{{ttype}}(table, {{to_c_type}}(val), &ret)
table.vals[k] = 1
result_keys.append(val)
{{else}}
@@ -100,11 +116,17 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8

# collect counts in the order corresponding to result_keys:
cdef:
int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
int64_t[::1] result_counts = np.empty(len(result_keys), dtype=np.int64)

for i in range(table.size):
for i in range(len(result_keys)):
{{if dtype == 'object'}}
k = kh_get_{{ttype}}(table, result_keys.data[i])
val = <object>result_keys.data[i]
Review comment (Member): Why did the object branch need to change?

{{if name == 'string'}}
if not isinstance(val, str):
result_counts[i] = na_dict[val]
continue
{{endif}}
k = kh_get_{{ttype}}(table, {{to_c_type}}(val))
{{else}}
k = kh_get_{{ttype}}(table, result_keys.data.data[i])
{{endif}}
@@ -115,27 +137,25 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
return result_keys.to_array(), result_counts.base


{{if name != 'string'}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None):
cdef duplicated_{{name}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None):
{{else}}
cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None):
cdef duplicated_{{name}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None):
{{endif}}
cdef:
int ret = 0
{{if dtype != 'object'}}
{{c_type}} value
{{else}}
PyObject* value
{{endif}}
Py_ssize_t i, n = len(values), first_na = -1
khiter_t k
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
bint seen_na = False, uses_mask = mask is not None
bint seen_multiple_na = False


kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

if keep not in ('last', 'first', False):
@@ -197,19 +217,19 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons

kh_destroy_{{ttype}}(table)
return out

{{endif}}

# ----------------------------------------------------------------------
# Membership
# ----------------------------------------------------------------------


{{if name != 'string'}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
cdef ismember_{{name}}(ndarray[{{dtype}}] arr, ndarray[{{dtype}}] values):
{{else}}
cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
cdef ismember_{{name}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{endif}}
"""
Return boolean of values in arr on an
@@ -230,11 +250,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
int ret = 0
ndarray[uint8_t] result

{{if dtype == "object"}}
PyObject* val
{{else}}
{{c_type}} val
{{endif}}

kh_{{ttype}}_t *table = kh_init_{{ttype}}()

@@ -267,7 +283,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):

kh_destroy_{{ttype}}(table)
return result.view(np.bool_)

{{endif}}
# ----------------------------------------------------------------------
# Mode Computations
# ----------------------------------------------------------------------
@@ -283,6 +299,8 @@ ctypedef fused htfunc_t:

cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
if htfunc_t is object:
if lib.is_string_array(values, skipna=True):
Review comment (Member): is there a viable way to avoid needing lib here? this file is relatively low-dependency and lib is the opposite

return value_count_string(values, dropna, mask=mask)
return value_count_object(values, dropna, mask=mask)

elif htfunc_t is int8_t:
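To summarize the new code path in plain terms: value_count() first checks, via lib.is_string_array, whether an object array holds only strings plus missing values; if so, string keys are counted in a char*-keyed khash table (through util.get_c_string), while the non-string entries, i.e. the NA-likes, are tallied in a small Python dict, and first-seen order is preserved for the result. The following is only a pure-Python model of that logic, not the Cython implementation:

# Pure-Python model of the value_count_string logic, for illustration only.
import numpy as np


def value_count_string_model(values, dropna):
    str_counts = {}   # stands in for the char*-keyed khash table
    na_counts = {}    # non-string entries (None, np.nan, ...) counted separately
    result_keys = []  # keys in first-seen order, like result_keys in the Cython code

    for val in values:
        is_null = not isinstance(val, str)  # after is_string_array, non-str implies null
        if dropna and is_null:
            continue
        counts = na_counts if is_null else str_counts
        if val in counts:
            counts[val] += 1
        else:
            counts[val] = 1
            result_keys.append(val)

    # collect counts in the order corresponding to result_keys
    result_counts = np.empty(len(result_keys), dtype=np.int64)
    for i, key in enumerate(result_keys):
        source = str_counts if isinstance(key, str) else na_counts
        result_counts[i] = source[key]
    return np.array(result_keys, dtype=object), result_counts


keys, counts = value_count_string_model(
    np.array(["a", "b", "a", None, "a"], dtype=object), dropna=False
)
print(list(keys), list(counts))  # ['a', 'b', None] [3, 1, 1]
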
6 changes: 3 additions & 3 deletions pandas/_libs/khash.pxd
@@ -53,9 +53,9 @@ cdef extern from "khash_python.h":
kh_pymap_t* kh_init_pymap()
void kh_destroy_pymap(kh_pymap_t*)
void kh_clear_pymap(kh_pymap_t*)
khuint_t kh_get_pymap(kh_pymap_t*, PyObject*)
khuint_t kh_get_pymap(kh_pymap_t*, object)
Review comment (Member): Is this just a cleanup or did it actually change the behavior in some way? I have a slight preference for the extern declarations to match exactly the way they are declared in the source headers, to minimize having to interpret what is happening; at the end of the day the linker isn't going to type check, just resolve the symbols.

void kh_resize_pymap(kh_pymap_t*, khuint_t)
khuint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*)
khuint_t kh_put_pymap(kh_pymap_t*, object, int*)
void kh_del_pymap(kh_pymap_t*, khuint_t)

bint kh_exist_pymap(kh_pymap_t*, khiter_t)
@@ -76,7 +76,7 @@ cdef extern from "khash_python.h":

bint kh_exist_pyset(kh_pyset_t*, khiter_t)

ctypedef char* kh_cstr_t
ctypedef const char* kh_cstr_t

ctypedef struct kh_str_t:
khuint_t n_buckets, size, n_occupied, upper_bound