diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 204393cbb76f2..edb917d65ae56 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -194,15 +194,22 @@ def time_clip(self): class ValueCounts: - params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object"]] + params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object", "string"]] param_names = ["N", "dtype"] def setup(self, N, dtype): - self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype) + ser = Series(np.random.randint(0, N, size=10 * N)) + if dtype == "string": + self.s = tm.makeStringIndex(N) + else: + self.s = ser.astype(dtype) def time_value_counts(self, N, dtype): self.s.value_counts() + def peakmem_value_counts(self, N, dtype): + self.s.value_counts() + class ValueCountsEA: params = [[10**3, 10**4, 10**5], [True, False]] diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index db900ddd1f85b..4754abb439182 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -116,6 +116,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :meth:`Series.value_counts`, :meth:`DataFrame.value_counts`, and :meth:`Index.value_counts` for strings (:issue:`14860`) - Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`) - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ccac3d0b50d45..17217a0eaf832 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -21,6 +21,7 @@ cnp.import_array() from pandas._libs cimport util +from pandas._libs import lib from pandas._libs.dtypes cimport numeric_object_t from pandas._libs.khash cimport ( KHASH_TRACE_DOMAIN, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index d4d3117a32ac9..4aac0a1600a14 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1269,7 +1269,7 @@ cdef class PyObjectHashTable(HashTable): khiter_t k hash(key) - k = kh_get_pymap(self.table, key) + k = kh_get_pymap(self.table, key) return k != self.table.n_buckets def sizeof(self, deep: bool = False) -> int: @@ -1296,7 +1296,7 @@ cdef class PyObjectHashTable(HashTable): cdef: khiter_t k - k = kh_get_pymap(self.table, val) + k = kh_get_pymap(self.table, val) if k != self.table.n_buckets: return self.table.vals[k] else: @@ -1310,7 +1310,7 @@ cdef class PyObjectHashTable(HashTable): hash(key) - k = kh_put_pymap(self.table, key, &ret) + k = kh_put_pymap(self.table, key, &ret) if kh_exist_pymap(self.table, k): self.table.vals[k] = val else: @@ -1328,7 +1328,7 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - k = kh_put_pymap(self.table, val, &ret) + k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = i def lookup(self, ndarray[object] values, object mask = None) -> ndarray: @@ -1345,7 +1345,7 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - k = kh_get_pymap(self.table, val) + k = kh_get_pymap(self.table, val) if k != self.table.n_buckets: locs[i] = self.table.vals[k] else: @@ -1418,10 +1418,10 @@ cdef class PyObjectHashTable(HashTable): labels[i] = na_sentinel continue - k = kh_get_pymap(self.table, val) + k = kh_get_pymap(self.table, val) if k == self.table.n_buckets: # k hasn't been seen yet - k = kh_put_pymap(self.table, val, &ret) + k = kh_put_pymap(self.table, val, &ret) uniques.append(val) if return_inverse: self.table.vals[k] = count diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index b9cf6011481af..ab5647bae8b94 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -6,34 +6,35 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dtype, ttype, c_type, to_c_type -dtypes = [('Complex128', 'complex128', 'complex128', +# name, vector_name, dtype, ttype, c_type, to_c_type +dtypes = [('complex128', 'Complex128', 'complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'), - ('Complex64', 'complex64', 'complex64', + ('complex64', 'Complex64', 'complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'), - ('Float64', 'float64', 'float64', 'float64_t', ''), - ('Float32', 'float32', 'float32', 'float32_t', ''), - ('UInt64', 'uint64', 'uint64', 'uint64_t', ''), - ('UInt32', 'uint32', 'uint32', 'uint32_t', ''), - ('UInt16', 'uint16', 'uint16', 'uint16_t', ''), - ('UInt8', 'uint8', 'uint8', 'uint8_t', ''), - ('Object', 'object', 'pymap', 'object', ''), - ('Int64', 'int64', 'int64', 'int64_t', ''), - ('Int32', 'int32', 'int32', 'int32_t', ''), - ('Int16', 'int16', 'int16', 'int16_t', ''), - ('Int8', 'int8', 'int8', 'int8_t', '')] + ('float64', 'Float64', 'float64', 'float64', 'float64_t', ''), + ('float32', 'Float32', 'float32', 'float32', 'float32_t', ''), + ('uint64', 'UInt64', 'uint64', 'uint64', 'uint64_t', ''), + ('uint32', 'UInt32', 'uint32', 'uint32', 'uint32_t', ''), + ('uint16', 'UInt16', 'uint16', 'uint16', 'uint16_t', ''), + ('uint8', 'UInt8', 'uint8', 'uint8', 'uint8_t', ''), + ('object', 'Object', 'object', 'pymap', 'object', ''), + ('string', 'Object', 'object', 'str', 'char *', 'util.get_c_string'), + ('int64', 'Int64', 'int64', 'int64', 'int64_t', ''), + ('int32', 'Int32', 'int32', 'int32', 'int32_t', ''), + ('int16', 'Int16', 'int16', 'int16', 'int16_t', ''), + ('int8', 'Int8', 'int8', 'int8', 'int8_t', '')] }} -{{for name, dtype, ttype, c_type, to_c_type in dtypes}} +{{for name, vector_name, dtype, ttype, c_type, to_c_type in dtypes}} @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None): +cdef value_count_{{name}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None): {{else}} -cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None): +cdef value_count_{{name}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None): {{endif}} cdef: Py_ssize_t i = 0 @@ -43,12 +44,18 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 # Don't use Py_ssize_t, since table.n_buckets is unsigned khiter_t k + {{if name != 'string'}} {{c_type}} val + {{else}} + object val + {{endif}} int ret = 0 bint uses_mask = mask is not None bint isna_entry = False + dict na_dict = dict() + if uses_mask and not dropna: raise NotImplementedError("uses_mask not implemented with dropna=False") @@ -57,7 +64,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 # table maps keys to counts # result_keys remembers the original order of keys - result_keys = {{name}}Vector() + result_keys = {{vector_name}}Vector() table = kh_init_{{ttype}}() {{if dtype == 'object'}} @@ -69,11 +76,20 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 for i in range(n): val = values[i] if not dropna or not checknull(val): - k = kh_get_{{ttype}}(table, {{to_c_type}}val) + {{if name == 'string'}} + if not isinstance(val, str): + if val in na_dict: + na_dict[val] += 1 + else: + na_dict[val] = 1 + result_keys.append(val) + continue + {{endif}} + k = kh_get_{{ttype}}(table, {{to_c_type}}(val)) if k != table.n_buckets: table.vals[k] += 1 else: - k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret) + k = kh_put_{{ttype}}(table, {{to_c_type}}(val), &ret) table.vals[k] = 1 result_keys.append(val) {{else}} @@ -100,11 +116,17 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 # collect counts in the order corresponding to result_keys: cdef: - int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64) + int64_t[::1] result_counts = np.empty(len(result_keys), dtype=np.int64) - for i in range(table.size): + for i in range(len(result_keys)): {{if dtype == 'object'}} - k = kh_get_{{ttype}}(table, result_keys.data[i]) + val = result_keys.data[i] + {{if name == 'string'}} + if not isinstance(val, str): + result_counts[i] = na_dict[val] + continue + {{endif}} + k = kh_get_{{ttype}}(table, {{to_c_type}}(val)) {{else}} k = kh_get_{{ttype}}(table, result_keys.data.data[i]) {{endif}} @@ -115,20 +137,17 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 return result_keys.to_array(), result_counts.base +{{if name != 'string'}} @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None): +cdef duplicated_{{name}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None): {{else}} -cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None): +cdef duplicated_{{name}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None): {{endif}} cdef: int ret = 0 - {{if dtype != 'object'}} {{c_type}} value - {{else}} - PyObject* value - {{endif}} Py_ssize_t i, n = len(values), first_na = -1 khiter_t k kh_{{ttype}}_t *table = kh_init_{{ttype}}() @@ -136,6 +155,7 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons bint seen_na = False, uses_mask = mask is not None bint seen_multiple_na = False + kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) if keep not in ('last', 'first', False): @@ -197,19 +217,19 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons kh_destroy_{{ttype}}(table) return out - +{{endif}} # ---------------------------------------------------------------------- # Membership # ---------------------------------------------------------------------- - +{{if name != 'string'}} @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): +cdef ismember_{{name}}(ndarray[{{dtype}}] arr, ndarray[{{dtype}}] values): {{else}} -cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): +cdef ismember_{{name}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} """ Return boolean of values in arr on an @@ -230,11 +250,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): int ret = 0 ndarray[uint8_t] result - {{if dtype == "object"}} - PyObject* val - {{else}} {{c_type}} val - {{endif}} kh_{{ttype}}_t *table = kh_init_{{ttype}}() @@ -267,7 +283,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): kh_destroy_{{ttype}}(table) return result.view(np.bool_) - +{{endif}} # ---------------------------------------------------------------------- # Mode Computations # ---------------------------------------------------------------------- @@ -283,6 +299,8 @@ ctypedef fused htfunc_t: cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): if htfunc_t is object: + if lib.is_string_array(values, skipna=True): + return value_count_string(values, dropna, mask=mask) return value_count_object(values, dropna, mask=mask) elif htfunc_t is int8_t: diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index a9f819e5e16db..f28e94ccf7ba9 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -53,9 +53,9 @@ cdef extern from "khash_python.h": kh_pymap_t* kh_init_pymap() void kh_destroy_pymap(kh_pymap_t*) void kh_clear_pymap(kh_pymap_t*) - khuint_t kh_get_pymap(kh_pymap_t*, PyObject*) + khuint_t kh_get_pymap(kh_pymap_t*, object) void kh_resize_pymap(kh_pymap_t*, khuint_t) - khuint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*) + khuint_t kh_put_pymap(kh_pymap_t*, object, int*) void kh_del_pymap(kh_pymap_t*, khuint_t) bint kh_exist_pymap(kh_pymap_t*, khiter_t) @@ -76,7 +76,7 @@ cdef extern from "khash_python.h": bint kh_exist_pyset(kh_pyset_t*, khiter_t) - ctypedef char* kh_cstr_t + ctypedef const char* kh_cstr_t ctypedef struct kh_str_t: khuint_t n_buckets, size, n_occupied, upper_bound