From 0b70965d4f4677ae529616d41156d8f738ea0f57 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 29 Nov 2020 20:49:54 +0100 Subject: [PATCH 01/16] remove unnecessary setting of keys kh_put_xxx, already does it! --- pandas/_libs/hashtable_class_helper.pxi.in | 3 --- pandas/_libs/hashtable_func_helper.pxi.in | 2 -- 2 files changed, 5 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b582ed1533a8e..916b9dd4d5aa5 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -365,7 +365,6 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 k = kh_put_{{dtype}}(self.table, key, &ret) - self.table.keys[k] = key if kh_exist_{{dtype}}(self.table, k): self.table.vals[k] = val else: @@ -698,7 +697,6 @@ cdef class StringHashTable(HashTable): v = get_c_string(key) k = kh_put_str(self.table, v, &ret) - self.table.keys[k] = v if kh_exist_str(self.table, k): self.table.vals[k] = val else: @@ -1022,7 +1020,6 @@ cdef class PyObjectHashTable(HashTable): hash(key) k = kh_put_pymap(self.table, key, &ret) - # self.table.keys[k] = key if kh_exist_pymap(self.table, k): self.table.vals[k] = val else: diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 7c5afa4ff6b27..0a252f2105081 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -184,7 +184,6 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): out[i] = 1 else: k = kh_put_{{ttype}}(table, value, &ret) - table.keys[k] = value table.vals[k] = i out[i] = 0 {{else}} @@ -197,7 +196,6 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): out[i] = 1 else: k = kh_put_{{ttype}}(table, value, &ret) - table.keys[k] = value table.vals[k] = i out[i] = 0 {{endif}} From 06f7a071923d055e5425cd6b63e8bd8d33db2815 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 29 Nov 2020 21:07:08 +0100 Subject: [PATCH 02/16] adding explicit c_type as preparation for complex, for which c_type will not be dtype_t --- pandas/_libs/hashtable_class_helper.pxi.in | 48 +++++++++++----------- pandas/_libs/hashtable_func_helper.pxi.in | 4 +- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 916b9dd4d5aa5..d5d3249cee476 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -303,22 +303,22 @@ cdef class HashTable: {{py: -# name, dtype, float_group -dtypes = [('Float64', 'float64', True), - ('UInt64', 'uint64', False), - ('Int64', 'int64', False), - ('Float32', 'float32', True), - ('UInt32', 'uint32', False), - ('Int32', 'int32', False), - ('UInt16', 'uint16', False), - ('Int16', 'int16', False), - ('UInt8', 'uint8', False), - ('Int8', 'int8', False)] +# name, dtype, c_type, float_group +dtypes = [('Float64', 'float64', 'float64_t', True), + ('UInt64', 'uint64', 'uint64_t', False), + ('Int64', 'int64', 'int64_t', False), + ('Float32', 'float32', 'float32_t', True), + ('UInt32', 'uint32', 'uint32_t', False), + ('Int32', 'int32', 'int32_t', False), + ('UInt16', 'uint16', 'uint16_t', False), + ('Int16', 'int16', 'int16_t', False), + ('UInt8', 'uint8', 'uint8_t', False), + ('Int8', 'int8', 'int8_t', False)] }} -{{for name, dtype, float_group in dtypes}} +{{for name, dtype, c_type, float_group in dtypes}} cdef class {{name}}HashTable(HashTable): @@ -371,11 +371,11 @@ cdef class {{name}}HashTable(HashTable): raise KeyError(key) @cython.boundscheck(False) - def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values): + def map(self, const {{c_type}}[:] keys, const int64_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 - {{dtype}}_t key + {{c_type}} key khiter_t k with nogil: @@ -385,11 +385,11 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = values[i] @cython.boundscheck(False) - def map_locations(self, const {{dtype}}_t[:] values): + def map_locations(self, const {{c_type}}[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 - {{dtype}}_t val + {{c_type}} val khiter_t k with nogil: @@ -399,11 +399,11 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = i @cython.boundscheck(False) - def lookup(self, const {{dtype}}_t[:] values): + def lookup(self, const {{c_type}}[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 - {{dtype}}_t val + {{c_type}} val khiter_t k intp_t[:] locs = np.empty(n, dtype=np.intp) @@ -420,7 +420,7 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + def _unique(self, const {{c_type}}[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, object mask=None, bint return_inverse=False): @@ -465,7 +465,7 @@ cdef class {{name}}HashTable(HashTable): Py_ssize_t i, idx, count = count_prior, n = len(values) int64_t[:] labels int ret = 0 - {{dtype}}_t val, na_value2 + {{c_type}} val, na_value2 khiter_t k {{name}}VectorData *ud bint use_na_value, use_mask @@ -538,7 +538,7 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + def unique(self, const {{c_type}}[:] values, bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -561,7 +561,7 @@ cdef class {{name}}HashTable(HashTable): return self._unique(values, uniques, ignore_na=False, return_inverse=return_inverse) - def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, + def factorize(self, const {{c_type}}[:] values, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -606,13 +606,13 @@ cdef class {{name}}HashTable(HashTable): return labels @cython.boundscheck(False) - def get_labels_groupby(self, const {{dtype}}_t[:] values): + def get_labels_groupby(self, const {{c_type}}[:] values): cdef: Py_ssize_t i, n = len(values) intp_t[:] labels Py_ssize_t idx, count = 0 int ret = 0 - {{dtype}}_t val + {{c_type}} val khiter_t k {{name}}Vector uniques = {{name}}Vector() {{name}}VectorData *ud diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 0a252f2105081..1bcfe5b5bb937 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -30,7 +30,7 @@ dtypes = [('float64', 'float64', 'float64_t'), cdef build_count_table_{{dtype}}(ndarray[{{dtype}}] values, kh_{{ttype}}_t *table, bint dropna): {{else}} -cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, +cdef build_count_table_{{dtype}}({{c_type}}[:] values, kh_{{ttype}}_t *table, bint dropna): {{endif}} cdef: @@ -138,7 +138,7 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): cdef: int ret = 0 {{if dtype != 'object'}} - {{dtype}}_t value + {{c_type}} value {{endif}} Py_ssize_t i, n = len(values) khiter_t k From 6fe568ed446c0cfce184d6bc1bf06d3b5881342a Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 27 Nov 2020 23:05:03 +0100 Subject: [PATCH 03/16] introducing complex hash tables --- pandas/_libs/hashtable.pxd | 18 +++ pandas/_libs/hashtable.pyx | 10 +- pandas/_libs/hashtable_class_helper.pxi.in | 117 ++++++++++++++---- pandas/_libs/hashtable_func_helper.pxi.in | 38 +++--- pandas/_libs/khash.pxd | 14 +++ .../_libs/khash_for_primitive_helper.pxi.in | 2 + pandas/_libs/src/klib/khash_python.h | 39 ++++++ 7 files changed, 199 insertions(+), 39 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 7b630c264753f..cc9341665b8db 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -1,12 +1,16 @@ from numpy cimport intp_t, ndarray from pandas._libs.khash cimport ( + complex64_t, + complex128_t, float32_t, float64_t, int8_t, int16_t, int32_t, int64_t, + kh_complex64_t, + kh_complex128_t, kh_float32_t, kh_float64_t, kh_int8_t, @@ -19,6 +23,8 @@ from pandas._libs.khash cimport ( kh_uint16_t, kh_uint32_t, kh_uint64_t, + khcomplex64_t, + khcomplex128_t, uint8_t, uint16_t, uint32_t, @@ -90,6 +96,18 @@ cdef class Float32HashTable(HashTable): cpdef get_item(self, float32_t val) cpdef set_item(self, float32_t key, Py_ssize_t val) +cdef class Complex64HashTable(HashTable): + cdef kh_complex64_t *table + + cpdef get_item(self, complex64_t val) + cpdef set_item(self, complex64_t key, Py_ssize_t val) + +cdef class Complex128HashTable(HashTable): + cdef kh_complex128_t *table + + cpdef get_item(self, complex128_t val) + cpdef set_item(self, complex128_t key, Py_ssize_t val) + cdef class PyObjectHashTable(HashTable): cdef kh_pymap_t *table diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 963fddd4d5af9..36e2e99bbf615 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -13,7 +13,15 @@ cnp.import_array() from pandas._libs cimport util -from pandas._libs.khash cimport KHASH_TRACE_DOMAIN, kh_str_t, khiter_t +from pandas._libs.khash cimport ( + KHASH_TRACE_DOMAIN, + are_equal_khcomplex64_t, + are_equal_khcomplex128_t, + kh_str_t, + khcomplex64_t, + khcomplex128_t, + khiter_t, +) from pandas._libs.missing cimport checknull diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index d5d3249cee476..5c1d413bb3412 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -8,7 +8,34 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: # name -cimported_types = ['float32', +complex_types = ['complex64', + 'complex128'] +}} + +{{for name in complex_types}} +cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil: + cdef kh{{name}}_t res + res.real = val.real + res.imag = val.imag + return res + +cdef {{name}}_t to_{{name}}(kh{{name}}_t val) nogil: + cdef {{name}}_t res + res.real = val.real + res.imag = val.imag + return res + +cdef bint is_nan_kh{{name}}_t(kh{{name}}_t val) nogil: + return val.real != val.real or val.imag != val.imag +{{endfor}} + + +{{py: + +# name +cimported_types = ['complex64', + 'complex128', + 'float32', 'float64', 'int8', 'int16', @@ -48,7 +75,9 @@ from pandas._libs.missing cimport C_NA # but is included for completeness (rather ObjectVector is used # for uniques in hashtables) -dtypes = [('Float64', 'float64', 'float64_t'), +dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), + ('Complex64', 'complex64', 'khcomplex64_t'), + ('Float64', 'float64', 'float64_t'), ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), ('Int32', 'int32', 'int32_t'), @@ -94,6 +123,8 @@ ctypedef fused vector_data: UInt8VectorData Float64VectorData Float32VectorData + Complex128VectorData + Complex64VectorData StringVectorData cdef inline bint needs_resize(vector_data *data) nogil: @@ -106,7 +137,9 @@ cdef inline bint needs_resize(vector_data *data) nogil: {{py: # name, dtype, c_type -dtypes = [('Float64', 'float64', 'float64_t'), +dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), + ('Complex64', 'complex64', 'khcomplex64_t'), + ('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t'), ('Int64', 'int64', 'int64_t'), ('Float32', 'float32', 'float32_t'), @@ -303,22 +336,24 @@ cdef class HashTable: {{py: -# name, dtype, c_type, float_group -dtypes = [('Float64', 'float64', 'float64_t', True), - ('UInt64', 'uint64', 'uint64_t', False), - ('Int64', 'int64', 'int64_t', False), - ('Float32', 'float32', 'float32_t', True), - ('UInt32', 'uint32', 'uint32_t', False), - ('Int32', 'int32', 'int32_t', False), - ('UInt16', 'uint16', 'uint16_t', False), - ('Int16', 'int16', 'int16_t', False), - ('UInt8', 'uint8', 'uint8_t', False), - ('Int8', 'int8', 'int8_t', False)] +# name, dtype, c_type, float_group, complex_group +dtypes = [('Complex128', 'complex128', 'khcomplex128_t', True, True), + ('Float64', 'float64', 'float64_t', True, False), + ('UInt64', 'uint64', 'uint64_t', False, False), + ('Int64', 'int64', 'int64_t', False, False), + ('Complex64', 'complex64', 'khcomplex64_t', True, True), + ('Float32', 'float32', 'float32_t', True, False), + ('UInt32', 'uint32', 'uint32_t', False, False), + ('Int32', 'int32', 'int32_t', False, False), + ('UInt16', 'uint16', 'uint16_t', False, False), + ('Int16', 'int16', 'int16_t', False, False), + ('UInt8', 'uint8', 'uint8_t', False, False), + ('Int8', 'int8', 'int8_t', False, False)] }} -{{for name, dtype, c_type, float_group in dtypes}} +{{for name, dtype, c_type, float_group, complex_group in dtypes}} cdef class {{name}}HashTable(HashTable): @@ -339,7 +374,13 @@ cdef class {{name}}HashTable(HashTable): def __contains__(self, object key): cdef: khiter_t k - k = kh_get_{{dtype}}(self.table, key) + {{c_type}} ckey + {{if complex_group}} + ckey = to_{{c_type}}(key) + {{else}} + ckey = key + {{endif}} + k = kh_get_{{dtype}}(self.table, ckey) return k != self.table.n_buckets def sizeof(self, deep=False): @@ -353,7 +394,13 @@ cdef class {{name}}HashTable(HashTable): cpdef get_item(self, {{dtype}}_t val): cdef: khiter_t k - k = kh_get_{{dtype}}(self.table, val) + {{c_type}} cval + {{if complex_group}} + cval = to_{{c_type}}(val) + {{else}} + cval = val + {{endif}} + k = kh_get_{{dtype}}(self.table, cval) if k != self.table.n_buckets: return self.table.vals[k] else: @@ -363,8 +410,13 @@ cdef class {{name}}HashTable(HashTable): cdef: khiter_t k int ret = 0 - - k = kh_put_{{dtype}}(self.table, key, &ret) + {{c_type}} ckey + {{if complex_group}} + ckey = to_{{c_type}}(key) + {{else}} + ckey = key + {{endif}} + k = kh_put_{{dtype}}(self.table, ckey, &ret) if kh_exist_{{dtype}}(self.table, k): self.table.vals[k] = val else: @@ -486,9 +538,17 @@ cdef class {{name}}HashTable(HashTable): # We use None, to make it optional, which requires `object` type # for the parameter. To please the compiler, we use na_value2, # which is only used if it's *specified*. - na_value2 = <{{dtype}}_t>na_value + {{if complex_group}} + na_value2 = to_{{c_type}}(na_value) + {{else}} + na_value2 = na_value + {{endif}} else: + {{if complex_group}} + na_value2 = to_{{c_type}}(0) + {{else}} na_value2 = 0 + {{endif}} with nogil: for i in range(n): @@ -499,10 +559,14 @@ cdef class {{name}}HashTable(HashTable): labels[i] = na_sentinel continue elif ignore_na and ( - {{if not name.lower().startswith(("uint", "int"))}} - val != val or - {{endif}} + {{if complex_group}} + not is_nan_{{c_type}}(val) or + (use_na_value and are_equal_{{c_type}}(val,na_value2)) + {{elif float_group}} + val != val or (use_na_value and val == na_value2) + {{else}} (use_na_value and val == na_value2) + {{endif}} ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, @@ -625,7 +689,12 @@ cdef class {{name}}HashTable(HashTable): val = values[i] # specific for groupby - {{if dtype != 'uint64'}} + {{if dtype == 'complex64' or dtype== 'complex128'}} + # TODO: what should be done here? + if val.real < 0: + labels[i] = -1 + continue + {{elif dtype != 'uint64'}} if val < 0: labels[i] = -1 continue diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 1bcfe5b5bb937..b864bcffa699e 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -6,22 +6,24 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# dtype, ttype, c_type -dtypes = [('float64', 'float64', 'float64_t'), - ('float32', 'float32', 'float32_t'), - ('uint64', 'uint64', 'uint64_t'), - ('uint32', 'uint32', 'uint32_t'), - ('uint16', 'uint16', 'uint16_t'), - ('uint8', 'uint8', 'uint8_t'), - ('object', 'pymap', 'object'), - ('int64', 'int64', 'int64_t'), - ('int32', 'int32', 'int32_t'), - ('int16', 'int16', 'int16_t'), - ('int8', 'int8', 'int8_t')] +# dtype, ttype, c_type, complex_group +dtypes = [('complex128', 'complex128', 'khcomplex128_t', True), + ('complex64', 'complex64', 'khcomplex64_t', True), + ('float64', 'float64', 'float64_t', False), + ('float32', 'float32', 'float32_t', False), + ('uint64', 'uint64', 'uint64_t', False), + ('uint32', 'uint32', 'uint32_t', False), + ('uint16', 'uint16', 'uint16_t', False), + ('uint8', 'uint8', 'uint8_t', False), + ('object', 'pymap', 'object', False), + ('int64', 'int64', 'int64_t', False), + ('int32', 'int32', 'int32_t', False), + ('int16', 'int16', 'int16_t', False), + ('int8', 'int8', 'int8_t', False)] }} -{{for dtype, ttype, c_type in dtypes}} +{{for dtype, ttype, c_type, complex_group in dtypes}} @cython.wraparound(False) @@ -63,6 +65,8 @@ cdef build_count_table_{{dtype}}({{c_type}}[:] values, {{if dtype == 'float64' or dtype == 'float32'}} if val == val or not dropna: + {{elif complex_group}} + if not is_nan_{{c_type}}(val) or not dropna: {{else}} if True: {{endif}} @@ -114,7 +118,11 @@ cpdef value_count_{{dtype}}({{c_type}}[:] values, bint dropna): with nogil: for k in range(table.n_buckets): if kh_exist_{{ttype}}(table, k): + {{if complex_group}} + result_keys[i] = to_{{dtype}}(table.keys[k]) + {{else}} result_keys[i] = table.keys[k] + {{endif}} result_counts[i] = table.vals[k] i += 1 {{endif}} @@ -279,7 +287,9 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): {{py: # dtype, ctype, table_type, npy_dtype -dtypes = [('float64', 'float64_t', 'float64', 'float64'), +dtypes = [('complex128', 'khcomplex128_t', 'complex128', 'complex128'), + ('complex64', 'khcomplex64_t', 'complex64', 'complex64'), + ('float64', 'float64_t', 'float64', 'float64'), ('float32', 'float32_t', 'float32', 'float32'), ('int64', 'int64_t', 'int64', 'int64'), ('int32', 'int32_t', 'int32', 'int32'), diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 0d0c5ae058b21..bb01c592ae6e0 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,5 +1,7 @@ from cpython.object cimport PyObject from numpy cimport ( + complex64_t, + complex128_t, float32_t, float64_t, int8_t, @@ -19,6 +21,18 @@ cdef extern from "khash_python.h": ctypedef uint32_t khint_t ctypedef khint_t khiter_t + ctypedef struct khcomplex128_t: + double real + double imag + + bint are_equal_khcomplex128_t "kh_complex_hash_equal" (khcomplex128_t a, khcomplex128_t b) nogil + + ctypedef struct khcomplex64_t: + float real + float imag + + bint are_equal_khcomplex64_t "kh_complex_hash_equal" (khcomplex64_t a, khcomplex64_t b) nogil + ctypedef struct kh_pymap_t: khint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in index db8d3e0b19417..dc7b11adb957b 100644 --- a/pandas/_libs/khash_for_primitive_helper.pxi.in +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -17,6 +17,8 @@ primitive_types = [('int64', 'int64_t'), ('uint16', 'uint16_t'), ('int8', 'int8_t'), ('uint8', 'uint8_t'), + ('complex64', 'khcomplex64_t'), + ('complex128', 'khcomplex128_t'), ] }} diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 8e4e61b4f3077..b311b6fb0041f 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -1,6 +1,19 @@ #include #include +#include + +//typedef struct { double real, imag; } khcomplex128_t; +//typedef struct { float real, imag; } khcomplex64_t; +//typedef __pyx_t_float_complex khcomplex64_t; +//typedef __pyx_t_double_complex khcomplex128_t; + + +typedef npy_complex64 khcomplex64_t; +typedef npy_complex128 khcomplex128_t; + + + // khash should report usage to tracemalloc #if PY_VERSION_HEX >= 0x03060000 #include @@ -128,6 +141,32 @@ KHASH_MAP_INIT_FLOAT64(float64, size_t) KHASH_MAP_INIT_FLOAT32(float32, size_t) +khint32_t PANDAS_INLINE kh_complex128_hash_func(khcomplex128_t val){ + return kh_float64_hash_func(val.real)^kh_float64_hash_func(val.imag); +} +khint32_t PANDAS_INLINE kh_complex64_hash_func(khcomplex64_t val){ + return kh_float32_hash_func(val.real)^kh_float32_hash_func(val.imag); +} + +#define kh_complex_hash_equal(a, b) \ + (kh_floats_hash_equal(a.real, b.real) && kh_floats_hash_equal(a.imag, b.imag)) + + +#define KHASH_MAP_INIT_COMPLEX64(name, khval_t) \ + KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, kh_complex_hash_equal) + +KHASH_MAP_INIT_COMPLEX64(complex64, size_t) + + +#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \ + KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, kh_complex_hash_equal) + +KHASH_MAP_INIT_COMPLEX128(complex128, size_t) + + +#define kh_exist_complex64(h, k) (kh_exist(h, k)) +#define kh_exist_complex128(h, k) (kh_exist(h, k)) + int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); From fbbf544392ac33dc7fb26b0717d9a7c68981703e Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 29 Nov 2020 23:52:57 +0100 Subject: [PATCH 04/16] ensure const-correctness of input is tested --- pandas/tests/libs/test_hashtable.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index a6fd421911d3e..d6037505efea4 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -80,6 +80,8 @@ def test_map(self, table_type, dtype): table = table_type() keys = np.arange(N).astype(dtype) vals = np.arange(N).astype(np.int64) + N + keys.flags.writeable = False + vals.flags.writeable = False table.map(keys, vals) for i in range(N): assert table.get_item(keys[i]) == i + N @@ -88,6 +90,7 @@ def test_map_locations(self, table_type, dtype): N = 8 table = table_type() keys = (np.arange(N) + N).astype(dtype) + keys.flags.writeable = False table.map_locations(keys) for i in range(N): assert table.get_item(keys[i]) == i @@ -96,6 +99,7 @@ def test_lookup(self, table_type, dtype): N = 3 table = table_type() keys = (np.arange(N) + N).astype(dtype) + keys.flags.writeable = False table.map_locations(keys) result = table.lookup(keys) expected = np.arange(N) @@ -121,6 +125,7 @@ def test_unique(self, table_type, dtype): table = table_type() expected = (np.arange(N) + N).astype(dtype) keys = np.repeat(expected, 5) + keys.flags.writeable = False unique = table.unique(keys) tm.assert_numpy_array_equal(unique, expected) @@ -254,6 +259,7 @@ def test_duplicated_first(self, dtype, type_suffix): N = 100 duplicated = get_ht_function("duplicated", type_suffix) values = np.repeat(np.arange(N).astype(dtype), 5) + values.flags.writeable = False result = duplicated(values) expected = np.ones_like(values, dtype=np.bool_) expected[::5] = False @@ -264,6 +270,8 @@ def test_ismember_yes(self, dtype, type_suffix): ismember = get_ht_function("ismember", type_suffix) arr = np.arange(N).astype(dtype) values = np.arange(N).astype(dtype) + arr.flags.writeable = False + values.flags.writeable = False result = ismember(arr, values) expected = np.ones_like(values, dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) From 1fe2d5555a0395ce83d5d48907400acf1c79a065 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 30 Nov 2020 00:13:20 +0100 Subject: [PATCH 05/16] const memoryviews of struct dtypes will work only with Cython>=0.29.22, so making a workaround --- pandas/_libs/hashtable_class_helper.pxi.in | 34 +++++++++++++++++----- pandas/_libs/hashtable_func_helper.pxi.in | 33 +++++++++++++++++---- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 5c1d413bb3412..c4867f1c35959 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -423,7 +423,7 @@ cdef class {{name}}HashTable(HashTable): raise KeyError(key) @cython.boundscheck(False) - def map(self, const {{c_type}}[:] keys, const int64_t[:] values): + def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -432,12 +432,16 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): + {{if complex_group}} + key = to_{{c_type}}(keys[i]) + {{else}} key = keys[i] + {{endif}} k = kh_put_{{dtype}}(self.table, key, &ret) self.table.vals[k] = values[i] @cython.boundscheck(False) - def map_locations(self, const {{c_type}}[:] values): + def map_locations(self, const {{dtype}}_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -446,12 +450,16 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): + {{if complex_group}} + val= to_{{c_type}}(values[i]) + {{else}} val = values[i] + {{endif}} k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = i @cython.boundscheck(False) - def lookup(self, const {{c_type}}[:] values): + def lookup(self, const {{dtype}}_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -461,7 +469,11 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): + {{if complex_group}} + val = to_{{c_type}}(values[i]) + {{else}} val = values[i] + {{endif}} k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: locs[i] = self.table.vals[k] @@ -472,7 +484,7 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, const {{c_type}}[:] values, {{name}}Vector uniques, + def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, object mask=None, bint return_inverse=False): @@ -552,7 +564,11 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): + {{if complex_group}} + val = to_{{c_type}}(values[i]) + {{else}} val = values[i] + {{endif}} if ignore_na and use_mask: if mask_values[i]: @@ -602,7 +618,7 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), np.asarray(labels) return uniques.to_array() - def unique(self, const {{c_type}}[:] values, bint return_inverse=False): + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -625,7 +641,7 @@ cdef class {{name}}HashTable(HashTable): return self._unique(values, uniques, ignore_na=False, return_inverse=return_inverse) - def factorize(self, const {{c_type}}[:] values, Py_ssize_t na_sentinel=-1, + def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -670,7 +686,7 @@ cdef class {{name}}HashTable(HashTable): return labels @cython.boundscheck(False) - def get_labels_groupby(self, const {{c_type}}[:] values): + def get_labels_groupby(self, const {{dtype}}_t[:] values): cdef: Py_ssize_t i, n = len(values) intp_t[:] labels @@ -686,7 +702,11 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): + {{if complex_group}} + val = to_{{c_type}}(values[i]) + {{else}} val = values[i] + {{endif}} # specific for groupby {{if dtype == 'complex64' or dtype== 'complex128'}} diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index b864bcffa699e..ecde398ef7812 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -141,7 +141,7 @@ cpdef value_count_{{dtype}}({{c_type}}[:] values, bint dropna): {{if dtype == 'object'}} def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): {{else}} -def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): +def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): {{endif}} cdef: int ret = 0 @@ -168,7 +168,12 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): with nogil: for i in range(n - 1, -1, -1): # equivalent: range(n)[::-1], which cython doesn't like in nogil - kh_put_{{ttype}}(table, values[i], &ret) + {{if complex_group}} + value = to_{{c_type}}(values[i]) + {{else}} + value = values[i] + {{endif}} + kh_put_{{ttype}}(table, value, &ret) out[i] = ret == 0 {{endif}} elif keep == 'first': @@ -179,7 +184,12 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): {{else}} with nogil: for i in range(n): - kh_put_{{ttype}}(table, values[i], &ret) + {{if complex_group}} + value = to_{{c_type}}(values[i]) + {{else}} + value = values[i] + {{endif}} + kh_put_{{ttype}}(table, value, &ret) out[i] = ret == 0 {{endif}} else: @@ -197,7 +207,11 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): {{else}} with nogil: for i in range(n): + {{if complex_group}} + value = to_{{c_type}}(values[i]) + {{else}} value = values[i] + {{endif}} k = kh_get_{{ttype}}(table, value) if k != table.n_buckets: out[table.vals[k]] = 1 @@ -221,7 +235,7 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): {{if dtype == 'object'}} def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): {{else}} -def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): +def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} """ Return boolean of values in arr on an @@ -254,7 +268,12 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): {{else}} with nogil: for i in range(n): - kh_put_{{ttype}}(table, values[i], &ret) + {{if complex_group}} + val = to_{{c_type}}(values[i]) + {{else}} + val = values[i] + {{endif}} + kh_put_{{ttype}}(table, val, &ret) {{endif}} # test membership @@ -269,7 +288,11 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): {{else}} with nogil: for i in range(n): + {{if complex_group}} + val = to_{{c_type}}(arr[i]) + {{else}} val = arr[i] + {{endif}} k = kh_get_{{ttype}}(table, val) result[i] = (k != table.n_buckets) {{endif}} From 57e2a72c8516c71f74a311f8de745a539119c528 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 30 Nov 2020 00:21:07 +0100 Subject: [PATCH 06/16] making count_values and mode accept constand input as well --- pandas/_libs/hashtable_func_helper.pxi.in | 15 +++++++++++---- pandas/tests/libs/test_hashtable.py | 2 ++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index ecde398ef7812..93990627d5fe3 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -32,7 +32,7 @@ dtypes = [('complex128', 'complex128', 'khcomplex128_t', True), cdef build_count_table_{{dtype}}(ndarray[{{dtype}}] values, kh_{{ttype}}_t *table, bint dropna): {{else}} -cdef build_count_table_{{dtype}}({{c_type}}[:] values, +cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values, kh_{{ttype}}_t *table, bint dropna): {{endif}} cdef: @@ -47,8 +47,11 @@ cdef build_count_table_{{dtype}}({{c_type}}[:] values, kh_resize_{{ttype}}(table, n // 10) for i in range(n): + {{if complex_group}} + val = to_{{c_type}}(values[i]) + {{else}} val = values[i] - + {{endif}} if not checknull(val) or not dropna: k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: @@ -61,7 +64,11 @@ cdef build_count_table_{{dtype}}({{c_type}}[:] values, kh_resize_{{ttype}}(table, n) for i in range(n): + {{if complex_group}} + val = to_{{c_type}}(values[i]) + {{else}} val = values[i] + {{endif}} {{if dtype == 'float64' or dtype == 'float32'}} if val == val or not dropna: @@ -84,7 +91,7 @@ cdef build_count_table_{{dtype}}({{c_type}}[:] values, {{if dtype == 'object'}} cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): {{else}} -cpdef value_count_{{dtype}}({{c_type}}[:] values, bint dropna): +cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: Py_ssize_t i = 0 @@ -338,7 +345,7 @@ def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna): {{else}} -def mode_{{dtype}}({{ctype}}[:] values, bint dropna): +def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: int count, max_count = 1 diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index d6037505efea4..61f151ecfae91 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -251,6 +251,7 @@ def test_value_count(self, dtype, type_suffix): value_count = get_ht_function("value_count", type_suffix) expected = (np.arange(N) + N).astype(dtype) values = np.repeat(expected, 5) + values.flags.writeable = False keys, counts = value_count(values, False) tm.assert_numpy_array_equal(np.sort(keys), expected) assert np.all(counts == 5) @@ -293,6 +294,7 @@ def test_mode(self, dtype, type_suffix): mode = get_ht_function("mode", type_suffix) values = np.repeat(np.arange(N).astype(dtype), 5) values[0] = 42 + values.flags.writeable = False result = mode(values, False) assert result == 42 From 259deff15d0b19e71b17dc97a870708acbb5a0e4 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 30 Nov 2020 00:22:56 +0100 Subject: [PATCH 07/16] activating tests for complex hashtables --- pandas/tests/libs/test_hashtable.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 61f151ecfae91..6f73cc56dfd8c 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -30,9 +30,11 @@ def get_allocated_khash_memory(): "table_type, dtype", [ (ht.PyObjectHashTable, np.object_), + (ht.Complex128HashTable, np.complex128), (ht.Int64HashTable, np.int64), (ht.UInt64HashTable, np.uint64), (ht.Float64HashTable, np.float64), + (ht.Complex64HashTable, np.complex64), (ht.Int32HashTable, np.int32), (ht.UInt32HashTable, np.uint32), (ht.Float32HashTable, np.float32), @@ -182,6 +184,8 @@ def test_tracemalloc_for_empty_StringHashTable(): [ (ht.Float64HashTable, np.float64), (ht.Float32HashTable, np.float32), + (ht.Complex128HashTable, np.complex128), + (ht.Complex64HashTable, np.complex64), ], ) class TestHashTableWithNans: @@ -233,9 +237,11 @@ def get_ht_function(fun_name, type_suffix): "dtype, type_suffix", [ (np.object_, "object"), + (np.complex128, "complex128"), (np.int64, "int64"), (np.uint64, "uint64"), (np.float64, "float64"), + (np.complex64, "complex64"), (np.int32, "int32"), (np.uint32, "uint32"), (np.float32, "float32"), @@ -304,6 +310,8 @@ def test_mode(self, dtype, type_suffix): [ (np.float64, "float64"), (np.float32, "float32"), + (np.complex128, "complex128"), + (np.complex64, "complex64"), ], ) class TestHelpFunctionsWithNans: From 22ee6e7b499fe7be834c4117c679264266cf93c5 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 30 Nov 2020 09:37:28 +0100 Subject: [PATCH 08/16] using to_c_type, rather than a switch --- pandas/_libs/hashtable_class_helper.pxi.in | 88 ++++++---------------- pandas/_libs/hashtable_func_helper.pxi.in | 70 ++++++----------- 2 files changed, 45 insertions(+), 113 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c4867f1c35959..ca94c23ccd607 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -336,24 +336,24 @@ cdef class HashTable: {{py: -# name, dtype, c_type, float_group, complex_group -dtypes = [('Complex128', 'complex128', 'khcomplex128_t', True, True), - ('Float64', 'float64', 'float64_t', True, False), - ('UInt64', 'uint64', 'uint64_t', False, False), - ('Int64', 'int64', 'int64_t', False, False), - ('Complex64', 'complex64', 'khcomplex64_t', True, True), - ('Float32', 'float32', 'float32_t', True, False), - ('UInt32', 'uint32', 'uint32_t', False, False), - ('Int32', 'int32', 'int32_t', False, False), - ('UInt16', 'uint16', 'uint16_t', False, False), - ('Int16', 'int16', 'int16_t', False, False), - ('UInt8', 'uint8', 'uint8_t', False, False), - ('Int8', 'int8', 'int8_t', False, False)] +# name, dtype, c_type, float_group, complex_group, to_c_type +dtypes = [('Complex128', 'complex128', 'khcomplex128_t', True, True, "to_khcomplex128_t"), + ('Float64', 'float64', 'float64_t', True, False, ""), + ('UInt64', 'uint64', 'uint64_t', False, False, ""), + ('Int64', 'int64', 'int64_t', False, False, ""), + ('Complex64', 'complex64', 'khcomplex64_t', True, True, "to_khcomplex64_t"), + ('Float32', 'float32', 'float32_t', True, False, ""), + ('UInt32', 'uint32', 'uint32_t', False, False, ""), + ('Int32', 'int32', 'int32_t', False, False, ""), + ('UInt16', 'uint16', 'uint16_t', False, False, ""), + ('Int16', 'int16', 'int16_t', False, False, ""), + ('UInt8', 'uint8', 'uint8_t', False, False, ""), + ('Int8', 'int8', 'int8_t', False, False, "")] }} -{{for name, dtype, c_type, float_group, complex_group in dtypes}} +{{for name, dtype, c_type, float_group, complex_group, to_c_type in dtypes}} cdef class {{name}}HashTable(HashTable): @@ -375,11 +375,7 @@ cdef class {{name}}HashTable(HashTable): cdef: khiter_t k {{c_type}} ckey - {{if complex_group}} - ckey = to_{{c_type}}(key) - {{else}} - ckey = key - {{endif}} + ckey = {{to_c_type}}(key) k = kh_get_{{dtype}}(self.table, ckey) return k != self.table.n_buckets @@ -395,11 +391,7 @@ cdef class {{name}}HashTable(HashTable): cdef: khiter_t k {{c_type}} cval - {{if complex_group}} - cval = to_{{c_type}}(val) - {{else}} - cval = val - {{endif}} + cval = {{to_c_type}}(val) k = kh_get_{{dtype}}(self.table, cval) if k != self.table.n_buckets: return self.table.vals[k] @@ -411,11 +403,7 @@ cdef class {{name}}HashTable(HashTable): khiter_t k int ret = 0 {{c_type}} ckey - {{if complex_group}} - ckey = to_{{c_type}}(key) - {{else}} - ckey = key - {{endif}} + ckey = {{to_c_type}}(key) k = kh_put_{{dtype}}(self.table, ckey, &ret) if kh_exist_{{dtype}}(self.table, k): self.table.vals[k] = val @@ -432,11 +420,7 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): - {{if complex_group}} - key = to_{{c_type}}(keys[i]) - {{else}} - key = keys[i] - {{endif}} + key = {{to_c_type}}(keys[i]) k = kh_put_{{dtype}}(self.table, key, &ret) self.table.vals[k] = values[i] @@ -450,11 +434,7 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): - {{if complex_group}} - val= to_{{c_type}}(values[i]) - {{else}} - val = values[i] - {{endif}} + val= {{to_c_type}}(values[i]) k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = i @@ -469,11 +449,7 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): - {{if complex_group}} - val = to_{{c_type}}(values[i]) - {{else}} - val = values[i] - {{endif}} + val = {{to_c_type}}(values[i]) k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: locs[i] = self.table.vals[k] @@ -550,25 +526,13 @@ cdef class {{name}}HashTable(HashTable): # We use None, to make it optional, which requires `object` type # for the parameter. To please the compiler, we use na_value2, # which is only used if it's *specified*. - {{if complex_group}} - na_value2 = to_{{c_type}}(na_value) - {{else}} - na_value2 = na_value - {{endif}} + na_value2 = {{to_c_type}}(na_value) else: - {{if complex_group}} - na_value2 = to_{{c_type}}(0) - {{else}} - na_value2 = 0 - {{endif}} + na_value2 = {{to_c_type}}(0) with nogil: for i in range(n): - {{if complex_group}} - val = to_{{c_type}}(values[i]) - {{else}} - val = values[i] - {{endif}} + val = {{to_c_type}}(values[i]) if ignore_na and use_mask: if mask_values[i]: @@ -702,11 +666,7 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): - {{if complex_group}} - val = to_{{c_type}}(values[i]) - {{else}} - val = values[i] - {{endif}} + val = {{to_c_type}}(values[i]) # specific for groupby {{if dtype == 'complex64' or dtype== 'complex128'}} diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 93990627d5fe3..04f74f3397d2c 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -6,24 +6,24 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# dtype, ttype, c_type, complex_group -dtypes = [('complex128', 'complex128', 'khcomplex128_t', True), - ('complex64', 'complex64', 'khcomplex64_t', True), - ('float64', 'float64', 'float64_t', False), - ('float32', 'float32', 'float32_t', False), - ('uint64', 'uint64', 'uint64_t', False), - ('uint32', 'uint32', 'uint32_t', False), - ('uint16', 'uint16', 'uint16_t', False), - ('uint8', 'uint8', 'uint8_t', False), - ('object', 'pymap', 'object', False), - ('int64', 'int64', 'int64_t', False), - ('int32', 'int32', 'int32_t', False), - ('int16', 'int16', 'int16_t', False), - ('int8', 'int8', 'int8_t', False)] +# dtype, ttype, c_type, complex_group, to_c_type +dtypes = [('complex128', 'complex128', 'khcomplex128_t', True, "to_khcomplex128_t"), + ('complex64', 'complex64', 'khcomplex64_t', True, "to_khcomplex64_t"), + ('float64', 'float64', 'float64_t', False, ""), + ('float32', 'float32', 'float32_t', False, ""), + ('uint64', 'uint64', 'uint64_t', False, ""), + ('uint32', 'uint32', 'uint32_t', False, ""), + ('uint16', 'uint16', 'uint16_t', False, ""), + ('uint8', 'uint8', 'uint8_t', False, ""), + ('object', 'pymap', 'object', False, ""), + ('int64', 'int64', 'int64_t', False, ""), + ('int32', 'int32', 'int32_t', False, ""), + ('int16', 'int16', 'int16_t', False, ""), + ('int8', 'int8', 'int8_t', False, "")] }} -{{for dtype, ttype, c_type, complex_group in dtypes}} +{{for dtype, ttype, c_type, complex_group, to_c_type in dtypes}} @cython.wraparound(False) @@ -47,11 +47,7 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values, kh_resize_{{ttype}}(table, n // 10) for i in range(n): - {{if complex_group}} - val = to_{{c_type}}(values[i]) - {{else}} val = values[i] - {{endif}} if not checknull(val) or not dropna: k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: @@ -64,11 +60,7 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values, kh_resize_{{ttype}}(table, n) for i in range(n): - {{if complex_group}} - val = to_{{c_type}}(values[i]) - {{else}} - val = values[i] - {{endif}} + val = {{to_c_type}}(values[i]) {{if dtype == 'float64' or dtype == 'float32'}} if val == val or not dropna: @@ -175,11 +167,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): with nogil: for i in range(n - 1, -1, -1): # equivalent: range(n)[::-1], which cython doesn't like in nogil - {{if complex_group}} - value = to_{{c_type}}(values[i]) - {{else}} - value = values[i] - {{endif}} + value = {{to_c_type}}(values[i]) kh_put_{{ttype}}(table, value, &ret) out[i] = ret == 0 {{endif}} @@ -191,11 +179,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): {{else}} with nogil: for i in range(n): - {{if complex_group}} - value = to_{{c_type}}(values[i]) - {{else}} - value = values[i] - {{endif}} + value = {{to_c_type}}(values[i]) kh_put_{{ttype}}(table, value, &ret) out[i] = ret == 0 {{endif}} @@ -214,11 +198,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): {{else}} with nogil: for i in range(n): - {{if complex_group}} - value = to_{{c_type}}(values[i]) - {{else}} - value = values[i] - {{endif}} + value = {{to_c_type}}(values[i]) k = kh_get_{{ttype}}(table, value) if k != table.n_buckets: out[table.vals[k]] = 1 @@ -275,11 +255,7 @@ def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{else}} with nogil: for i in range(n): - {{if complex_group}} - val = to_{{c_type}}(values[i]) - {{else}} - val = values[i] - {{endif}} + val = {{to_c_type}}(values[i]) kh_put_{{ttype}}(table, val, &ret) {{endif}} @@ -295,11 +271,7 @@ def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{else}} with nogil: for i in range(n): - {{if complex_group}} - val = to_{{c_type}}(arr[i]) - {{else}} - val = arr[i] - {{endif}} + val = {{to_c_type}}(arr[i]) k = kh_get_{{ttype}}(table, val) result[i] = (k != table.n_buckets) {{endif}} From 5bcd1066ee07a3848d79c09fb8e9ed5de94af14f Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 30 Nov 2020 10:09:45 +0100 Subject: [PATCH 09/16] getting rid of complex_group/float_group --- pandas/_libs/hashtable.pyx | 6 +- pandas/_libs/hashtable_class_helper.pxi.in | 76 +++++++++++++++------- pandas/_libs/hashtable_func_helper.pxi.in | 42 +++++------- pandas/_libs/khash.pxd | 7 +- 4 files changed, 79 insertions(+), 52 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 36e2e99bbf615..2c7780e0d95fd 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -15,8 +15,10 @@ cnp.import_array() from pandas._libs cimport util from pandas._libs.khash cimport ( KHASH_TRACE_DOMAIN, - are_equal_khcomplex64_t, - are_equal_khcomplex128_t, + are_equivalent_float32_t, + are_equivalent_float64_t, + are_equivalent_khcomplex64_t, + are_equivalent_khcomplex128_t, kh_str_t, khcomplex64_t, khcomplex128_t, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index ca94c23ccd607..4532f8c2566af 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -30,6 +30,44 @@ cdef bint is_nan_kh{{name}}_t(kh{{name}}_t val) nogil: {{endfor}} +{{py: + +# name +float_types = ['float64_t', + 'float32_t'] +}} + +{{for c_type in float_types}} + +cdef bint is_nan_{{c_type}}({{c_type}} val) nogil: + return val != val +{{endfor}} + + +{{py: + + +# name +int_types = ['int64_t', + 'int32_t', + 'int16_t', + 'int8_t', + 'uint64_t', + 'uint32_t', + 'uint16_t', + 'uint8_t',] +}} + +{{for c_type in int_types}} + +cdef bint is_nan_{{c_type}}({{c_type}} val) nogil: + return False + +cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil: + return val1 == val2 +{{endfor}} + + {{py: # name @@ -336,24 +374,24 @@ cdef class HashTable: {{py: -# name, dtype, c_type, float_group, complex_group, to_c_type -dtypes = [('Complex128', 'complex128', 'khcomplex128_t', True, True, "to_khcomplex128_t"), - ('Float64', 'float64', 'float64_t', True, False, ""), - ('UInt64', 'uint64', 'uint64_t', False, False, ""), - ('Int64', 'int64', 'int64_t', False, False, ""), - ('Complex64', 'complex64', 'khcomplex64_t', True, True, "to_khcomplex64_t"), - ('Float32', 'float32', 'float32_t', True, False, ""), - ('UInt32', 'uint32', 'uint32_t', False, False, ""), - ('Int32', 'int32', 'int32_t', False, False, ""), - ('UInt16', 'uint16', 'uint16_t', False, False, ""), - ('Int16', 'int16', 'int16_t', False, False, ""), - ('UInt8', 'uint8', 'uint8_t', False, False, ""), - ('Int8', 'int8', 'int8_t', False, False, "")] +# name, dtype, c_type, to_c_type +dtypes = [('Complex128', 'complex128', 'khcomplex128_t', "to_khcomplex128_t"), + ('Float64', 'float64', 'float64_t', ""), + ('UInt64', 'uint64', 'uint64_t', ""), + ('Int64', 'int64', 'int64_t', ""), + ('Complex64', 'complex64', 'khcomplex64_t', "to_khcomplex64_t"), + ('Float32', 'float32', 'float32_t', ""), + ('UInt32', 'uint32', 'uint32_t', ""), + ('Int32', 'int32', 'int32_t', ""), + ('UInt16', 'uint16', 'uint16_t', ""), + ('Int16', 'int16', 'int16_t', ""), + ('UInt8', 'uint8', 'uint8_t', ""), + ('Int8', 'int8', 'int8_t', "")] }} -{{for name, dtype, c_type, float_group, complex_group, to_c_type in dtypes}} +{{for name, dtype, c_type, to_c_type in dtypes}} cdef class {{name}}HashTable(HashTable): @@ -539,14 +577,8 @@ cdef class {{name}}HashTable(HashTable): labels[i] = na_sentinel continue elif ignore_na and ( - {{if complex_group}} - not is_nan_{{c_type}}(val) or - (use_na_value and are_equal_{{c_type}}(val,na_value2)) - {{elif float_group}} - val != val or (use_na_value and val == na_value2) - {{else}} - (use_na_value and val == na_value2) - {{endif}} + is_nan_{{c_type}}(val) or + (use_na_value and are_equivalent_{{c_type}}(val,na_value2)) ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 04f74f3397d2c..7e7dba34f12b0 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -6,24 +6,24 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# dtype, ttype, c_type, complex_group, to_c_type -dtypes = [('complex128', 'complex128', 'khcomplex128_t', True, "to_khcomplex128_t"), - ('complex64', 'complex64', 'khcomplex64_t', True, "to_khcomplex64_t"), - ('float64', 'float64', 'float64_t', False, ""), - ('float32', 'float32', 'float32_t', False, ""), - ('uint64', 'uint64', 'uint64_t', False, ""), - ('uint32', 'uint32', 'uint32_t', False, ""), - ('uint16', 'uint16', 'uint16_t', False, ""), - ('uint8', 'uint8', 'uint8_t', False, ""), - ('object', 'pymap', 'object', False, ""), - ('int64', 'int64', 'int64_t', False, ""), - ('int32', 'int32', 'int32_t', False, ""), - ('int16', 'int16', 'int16_t', False, ""), - ('int8', 'int8', 'int8_t', False, "")] +# dtype, ttype, c_type, to_c_type, to_dtype +dtypes = [('complex128', 'complex128', 'khcomplex128_t', "to_khcomplex128_t", "to_complex128"), + ('complex64', 'complex64', 'khcomplex64_t', "to_khcomplex64_t", "to_complex64"), + ('float64', 'float64', 'float64_t', "", ""), + ('float32', 'float32', 'float32_t', "", ""), + ('uint64', 'uint64', 'uint64_t', "", ""), + ('uint32', 'uint32', 'uint32_t', "", ""), + ('uint16', 'uint16', 'uint16_t', "", ""), + ('uint8', 'uint8', 'uint8_t', "", ""), + ('object', 'pymap', 'object', "", ""), + ('int64', 'int64', 'int64_t', "", ""), + ('int32', 'int32', 'int32_t', "", ""), + ('int16', 'int16', 'int16_t', "", ""), + ('int8', 'int8', 'int8_t', "", "")] }} -{{for dtype, ttype, c_type, complex_group, to_c_type in dtypes}} +{{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}} @cython.wraparound(False) @@ -62,13 +62,7 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values, for i in range(n): val = {{to_c_type}}(values[i]) - {{if dtype == 'float64' or dtype == 'float32'}} - if val == val or not dropna: - {{elif complex_group}} if not is_nan_{{c_type}}(val) or not dropna: - {{else}} - if True: - {{endif}} k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: table.vals[k] += 1 @@ -117,11 +111,7 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): with nogil: for k in range(table.n_buckets): if kh_exist_{{ttype}}(table, k): - {{if complex_group}} - result_keys[i] = to_{{dtype}}(table.keys[k]) - {{else}} - result_keys[i] = table.keys[k] - {{endif}} + result_keys[i] = {{to_dtype}}(table.keys[k]) result_counts[i] = table.vals[k] i += 1 {{endif}} diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index bb01c592ae6e0..19d8e48f6619e 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -25,13 +25,16 @@ cdef extern from "khash_python.h": double real double imag - bint are_equal_khcomplex128_t "kh_complex_hash_equal" (khcomplex128_t a, khcomplex128_t b) nogil + bint are_equivalent_khcomplex128_t "kh_complex_hash_equal" (khcomplex128_t a, khcomplex128_t b) nogil ctypedef struct khcomplex64_t: float real float imag - bint are_equal_khcomplex64_t "kh_complex_hash_equal" (khcomplex64_t a, khcomplex64_t b) nogil + bint are_equivalent_khcomplex64_t "kh_complex_hash_equal" (khcomplex64_t a, khcomplex64_t b) nogil + + bint are_equivalent_float64_t "kh_floats_hash_equal" (float64_t a, float64_t b) nogil + bint are_equivalent_float32_t "kh_floats_hash_equal" (float32_t a, float32_t b) nogil ctypedef struct kh_pymap_t: khint_t n_buckets, size, n_occupied, upper_bound From 432f839648bf95fd5e7c76183ef3c442f19e521d Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 30 Nov 2020 20:40:22 +0100 Subject: [PATCH 10/16] fixing style issues --- pandas/_libs/hashtable_class_helper.pxi.in | 10 +++++----- pandas/_libs/hashtable_func_helper.pxi.in | 8 +++++--- pandas/_libs/khash.pxd | 13 +++++++++---- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 4532f8c2566af..9c145a31b4a59 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -55,7 +55,7 @@ int_types = ['int64_t', 'uint64_t', 'uint32_t', 'uint16_t', - 'uint8_t',] + 'uint8_t'] }} {{for c_type in int_types}} @@ -564,9 +564,9 @@ cdef class {{name}}HashTable(HashTable): # We use None, to make it optional, which requires `object` type # for the parameter. To please the compiler, we use na_value2, # which is only used if it's *specified*. - na_value2 = {{to_c_type}}(na_value) + na_value2 = {{to_c_type}}(na_value) else: - na_value2 = {{to_c_type}}(0) + na_value2 = {{to_c_type}}(0) with nogil: for i in range(n): @@ -577,8 +577,8 @@ cdef class {{name}}HashTable(HashTable): labels[i] = na_sentinel continue elif ignore_na and ( - is_nan_{{c_type}}(val) or - (use_na_value and are_equivalent_{{c_type}}(val,na_value2)) + is_nan_{{c_type}}(val) or + (use_na_value and are_equivalent_{{c_type}}(val, na_value2)) ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 7e7dba34f12b0..ca53aea4d0cdc 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -7,8 +7,10 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: # dtype, ttype, c_type, to_c_type, to_dtype -dtypes = [('complex128', 'complex128', 'khcomplex128_t', "to_khcomplex128_t", "to_complex128"), - ('complex64', 'complex64', 'khcomplex64_t', "to_khcomplex64_t", "to_complex64"), +dtypes = [('complex128', 'complex128', 'khcomplex128_t', \ + "to_khcomplex128_t", "to_complex128"), + ('complex64', 'complex64', 'khcomplex64_t', \ + "to_khcomplex64_t", "to_complex64"), ('float64', 'float64', 'float64_t', "", ""), ('float32', 'float32', 'float32_t', "", ""), ('uint64', 'uint64', 'uint64_t', "", ""), @@ -111,7 +113,7 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): with nogil: for k in range(table.n_buckets): if kh_exist_{{ttype}}(table, k): - result_keys[i] = {{to_dtype}}(table.keys[k]) + result_keys[i] = {{to_dtype}}(table.keys[k]) result_counts[i] = table.vals[k] i += 1 {{endif}} diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 19d8e48f6619e..53b94c5a73b83 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -25,16 +25,21 @@ cdef extern from "khash_python.h": double real double imag - bint are_equivalent_khcomplex128_t "kh_complex_hash_equal" (khcomplex128_t a, khcomplex128_t b) nogil + bint are_equivalent_khcomplex128_t \ + "kh_complex_hash_equal" (khcomplex128_t a, khcomplex128_t b) nogil ctypedef struct khcomplex64_t: float real float imag - bint are_equivalent_khcomplex64_t "kh_complex_hash_equal" (khcomplex64_t a, khcomplex64_t b) nogil + bint are_equivalent_khcomplex64_t \ + "kh_complex_hash_equal" (khcomplex64_t a, khcomplex64_t b) nogil - bint are_equivalent_float64_t "kh_floats_hash_equal" (float64_t a, float64_t b) nogil - bint are_equivalent_float32_t "kh_floats_hash_equal" (float32_t a, float32_t b) nogil + bint are_equivalent_float64_t \ + "kh_floats_hash_equal" (float64_t a, float64_t b) nogil + + bint are_equivalent_float32_t \ + "kh_floats_hash_equal" (float32_t a, float32_t b) nogil ctypedef struct kh_pymap_t: khint_t n_buckets, size, n_occupied, upper_bound From a4903b0ed19a46032069a372c20301c4227cc398 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 30 Nov 2020 21:18:53 +0100 Subject: [PATCH 11/16] removing commented out code --- pandas/_libs/src/klib/khash_python.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index b311b6fb0041f..d5eb45ec231b8 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -1,14 +1,9 @@ #include #include -#include - -//typedef struct { double real, imag; } khcomplex128_t; -//typedef struct { float real, imag; } khcomplex64_t; -//typedef __pyx_t_float_complex khcomplex64_t; -//typedef __pyx_t_double_complex khcomplex128_t; - +// use numpy's definitions for complex +#include typedef npy_complex64 khcomplex64_t; typedef npy_complex128 khcomplex128_t; From bb1f3d5bc6f71d0adbad837a4840501255e2d122 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 30 Nov 2020 23:00:43 +0100 Subject: [PATCH 12/16] get_labels_groupby is only used with Int64HashTable, thus define it only for int64. What should happen for other types (encoding nans as negative values is not possible with all types) depends on how it is used (but it is not used right now) --- pandas/_libs/hashtable_class_helper.pxi.in | 9 ++------- pandas/tests/libs/test_hashtable.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 9c145a31b4a59..3c7afc65e4cdc 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -681,6 +681,7 @@ cdef class {{name}}HashTable(HashTable): ignore_na=True, return_inverse=True) return labels + {{if dtype == 'int64'}} @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): cdef: @@ -701,16 +702,9 @@ cdef class {{name}}HashTable(HashTable): val = {{to_c_type}}(values[i]) # specific for groupby - {{if dtype == 'complex64' or dtype== 'complex128'}} - # TODO: what should be done here? - if val.real < 0: - labels[i] = -1 - continue - {{elif dtype != 'uint64'}} if val < 0: labels[i] = -1 continue - {{endif}} k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: @@ -730,6 +724,7 @@ cdef class {{name}}HashTable(HashTable): arr_uniques = uniques.to_array() return np.asarray(labels), arr_uniques + {{endif}} {{endfor}} diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 6f73cc56dfd8c..c3886b1769b9b 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -156,6 +156,16 @@ def test_tracemalloc_for_empty(self, table_type, dtype): assert get_allocated_khash_memory() == 0 +def test_get_labels_groupby_for_Int64(): + table = ht.Int64HashTable() + vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) + arr, unique = table.get_labels_groupby(vals) + expected_arr = np.array([0, 1, -1, 1, 0, -1], dtype=np.int64) + expected_unique = np.array([1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(arr, expected_arr) + tm.assert_numpy_array_equal(unique, expected_unique) + + def test_tracemalloc_works_for_StringHashTable(): N = 1000 keys = np.arange(N).astype(np.compat.unicode).astype(np.object_) From 28c2f2d5d257f94a5d91f520c235e1ad728b565f Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 1 Dec 2020 21:27:01 +0100 Subject: [PATCH 13/16] fixing test case for 32bit build --- pandas/tests/libs/test_hashtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index c3886b1769b9b..67f50aa43efd1 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -162,7 +162,7 @@ def test_get_labels_groupby_for_Int64(): arr, unique = table.get_labels_groupby(vals) expected_arr = np.array([0, 1, -1, 1, 0, -1], dtype=np.int64) expected_unique = np.array([1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(arr, expected_arr) + tm.assert_numpy_array_equal(arr.astype(np.int64), expected_arr) tm.assert_numpy_array_equal(unique, expected_unique) From 50d7136ab5ad4acb3039e6cbe9ccca5a8f9da6d9 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 1 Dec 2020 21:54:39 +0100 Subject: [PATCH 14/16] minor style issues --- pandas/_libs/hashtable_class_helper.pxi.in | 24 ++++++++--------- pandas/_libs/hashtable_func_helper.pxi.in | 30 +++++++++++----------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3c7afc65e4cdc..0771a02811e76 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -375,18 +375,18 @@ cdef class HashTable: {{py: # name, dtype, c_type, to_c_type -dtypes = [('Complex128', 'complex128', 'khcomplex128_t', "to_khcomplex128_t"), - ('Float64', 'float64', 'float64_t', ""), - ('UInt64', 'uint64', 'uint64_t', ""), - ('Int64', 'int64', 'int64_t', ""), - ('Complex64', 'complex64', 'khcomplex64_t', "to_khcomplex64_t"), - ('Float32', 'float32', 'float32_t', ""), - ('UInt32', 'uint32', 'uint32_t', ""), - ('Int32', 'int32', 'int32_t', ""), - ('UInt16', 'uint16', 'uint16_t', ""), - ('Int16', 'int16', 'int16_t', ""), - ('UInt8', 'uint8', 'uint8_t', ""), - ('Int8', 'int8', 'int8_t', "")] +dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'), + ('Float64', 'float64', 'float64_t', ''), + ('UInt64', 'uint64', 'uint64_t', ''), + ('Int64', 'int64', 'int64_t', ''), + ('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'), + ('Float32', 'float32', 'float32_t', ''), + ('UInt32', 'uint32', 'uint32_t', ''), + ('Int32', 'int32', 'int32_t', ''), + ('UInt16', 'uint16', 'uint16_t', ''), + ('Int16', 'int16', 'int16_t', ''), + ('UInt8', 'uint8', 'uint8_t', ''), + ('Int8', 'int8', 'int8_t', '')] }} diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index ca53aea4d0cdc..f8f541235dcb7 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -7,21 +7,21 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: # dtype, ttype, c_type, to_c_type, to_dtype -dtypes = [('complex128', 'complex128', 'khcomplex128_t', \ - "to_khcomplex128_t", "to_complex128"), - ('complex64', 'complex64', 'khcomplex64_t', \ - "to_khcomplex64_t", "to_complex64"), - ('float64', 'float64', 'float64_t', "", ""), - ('float32', 'float32', 'float32_t', "", ""), - ('uint64', 'uint64', 'uint64_t', "", ""), - ('uint32', 'uint32', 'uint32_t', "", ""), - ('uint16', 'uint16', 'uint16_t', "", ""), - ('uint8', 'uint8', 'uint8_t', "", ""), - ('object', 'pymap', 'object', "", ""), - ('int64', 'int64', 'int64_t', "", ""), - ('int32', 'int32', 'int32_t', "", ""), - ('int16', 'int16', 'int16_t', "", ""), - ('int8', 'int8', 'int8_t', "", "")] +dtypes = [('complex128', 'complex128', 'khcomplex128_t', + 'to_khcomplex128_t', 'to_complex128'), + ('complex64', 'complex64', 'khcomplex64_t', + 'to_khcomplex64_t', 'to_complex64'), + ('float64', 'float64', 'float64_t', '', ''), + ('float32', 'float32', 'float32_t', '', ''), + ('uint64', 'uint64', 'uint64_t', '', ''), + ('uint32', 'uint32', 'uint32_t', '', ''), + ('uint16', 'uint16', 'uint16_t', '', ''), + ('uint8', 'uint8', 'uint8_t', '', ''), + ('object', 'pymap', 'object', '', ''), + ('int64', 'int64', 'int64_t', '', ''), + ('int32', 'int32', 'int32_t', '', ''), + ('int16', 'int16', 'int16_t', '', ''), + ('int8', 'int8', 'int8_t', '', '')] }} From 4efdc6733b76ddbf638c7628494991cca4ec22c6 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 2 Dec 2020 21:50:51 +0100 Subject: [PATCH 15/16] parameterize tests on writeable True/False --- pandas/tests/libs/test_hashtable.py | 39 +++++++++++++++-------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 67f50aa43efd1..894b126cc4269 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -75,33 +75,33 @@ def test_get_set_contains_len(self, table_type, dtype): table.get_item(index + 2) assert str(index + 2) in str(excinfo.value) - def test_map(self, table_type, dtype): + def test_map(self, table_type, dtype, writable): # PyObjectHashTable has no map-method if table_type != ht.PyObjectHashTable: N = 77 table = table_type() keys = np.arange(N).astype(dtype) vals = np.arange(N).astype(np.int64) + N - keys.flags.writeable = False - vals.flags.writeable = False + keys.flags.writeable = writable + vals.flags.writeable = writable table.map(keys, vals) for i in range(N): assert table.get_item(keys[i]) == i + N - def test_map_locations(self, table_type, dtype): + def test_map_locations(self, table_type, dtype, writable): N = 8 table = table_type() keys = (np.arange(N) + N).astype(dtype) - keys.flags.writeable = False + keys.flags.writeable = writable table.map_locations(keys) for i in range(N): assert table.get_item(keys[i]) == i - def test_lookup(self, table_type, dtype): + def test_lookup(self, table_type, dtype, writable): N = 3 table = table_type() keys = (np.arange(N) + N).astype(dtype) - keys.flags.writeable = False + keys.flags.writeable = writable table.map_locations(keys) result = table.lookup(keys) expected = np.arange(N) @@ -119,7 +119,7 @@ def test_lookup_wrong(self, table_type, dtype): result = table.lookup(wrong_keys) assert np.all(result == -1) - def test_unique(self, table_type, dtype): + def test_unique(self, table_type, dtype, writable): if dtype in (np.int8, np.uint8): N = 88 else: @@ -127,7 +127,7 @@ def test_unique(self, table_type, dtype): table = table_type() expected = (np.arange(N) + N).astype(dtype) keys = np.repeat(expected, 5) - keys.flags.writeable = False + keys.flags.writeable = writable unique = table.unique(keys) tm.assert_numpy_array_equal(unique, expected) @@ -156,9 +156,10 @@ def test_tracemalloc_for_empty(self, table_type, dtype): assert get_allocated_khash_memory() == 0 -def test_get_labels_groupby_for_Int64(): +def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) + vals.flags.writeable = writable arr, unique = table.get_labels_groupby(vals) expected_arr = np.array([0, 1, -1, 1, 0, -1], dtype=np.int64) expected_unique = np.array([1, 2], dtype=np.int64) @@ -262,33 +263,33 @@ def get_ht_function(fun_name, type_suffix): ], ) class TestHelpFunctions: - def test_value_count(self, dtype, type_suffix): + def test_value_count(self, dtype, type_suffix, writable): N = 43 value_count = get_ht_function("value_count", type_suffix) expected = (np.arange(N) + N).astype(dtype) values = np.repeat(expected, 5) - values.flags.writeable = False + values.flags.writeable = writable keys, counts = value_count(values, False) tm.assert_numpy_array_equal(np.sort(keys), expected) assert np.all(counts == 5) - def test_duplicated_first(self, dtype, type_suffix): + def test_duplicated_first(self, dtype, type_suffix, writable): N = 100 duplicated = get_ht_function("duplicated", type_suffix) values = np.repeat(np.arange(N).astype(dtype), 5) - values.flags.writeable = False + values.flags.writeable = writable result = duplicated(values) expected = np.ones_like(values, dtype=np.bool_) expected[::5] = False tm.assert_numpy_array_equal(result, expected) - def test_ismember_yes(self, dtype, type_suffix): + def test_ismember_yes(self, dtype, type_suffix, writable): N = 127 ismember = get_ht_function("ismember", type_suffix) arr = np.arange(N).astype(dtype) values = np.arange(N).astype(dtype) - arr.flags.writeable = False - values.flags.writeable = False + arr.flags.writeable = writable + values.flags.writeable = writable result = ismember(arr, values) expected = np.ones_like(values, dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) @@ -302,7 +303,7 @@ def test_ismember_no(self, dtype, type_suffix): expected = np.zeros_like(values, dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) - def test_mode(self, dtype, type_suffix): + def test_mode(self, dtype, type_suffix, writable): if dtype in (np.int8, np.uint8): N = 53 else: @@ -310,7 +311,7 @@ def test_mode(self, dtype, type_suffix): mode = get_ht_function("mode", type_suffix) values = np.repeat(np.arange(N).astype(dtype), 5) values[0] = 42 - values.flags.writeable = False + values.flags.writeable = writable result = mode(values, False) assert result == 42 From 9930a5582ad8b4e37510b95075878904b57bb465 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 2 Dec 2020 22:06:20 +0100 Subject: [PATCH 16/16] reshuffle code, so it becomes more obvious which functions are defined for which c_types --- pandas/_libs/hashtable_class_helper.pxi.in | 52 +++++++++++----------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 0771a02811e76..276f162545399 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -19,28 +19,13 @@ cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil: res.imag = val.imag return res + cdef {{name}}_t to_{{name}}(kh{{name}}_t val) nogil: cdef {{name}}_t res res.real = val.real res.imag = val.imag return res -cdef bint is_nan_kh{{name}}_t(kh{{name}}_t val) nogil: - return val.real != val.real or val.imag != val.imag -{{endfor}} - - -{{py: - -# name -float_types = ['float64_t', - 'float32_t'] -}} - -{{for c_type in float_types}} - -cdef bint is_nan_{{c_type}}({{c_type}} val) nogil: - return val != val {{endfor}} @@ -48,23 +33,39 @@ cdef bint is_nan_{{c_type}}({{c_type}} val) nogil: # name -int_types = ['int64_t', - 'int32_t', - 'int16_t', - 'int8_t', - 'uint64_t', - 'uint32_t', - 'uint16_t', - 'uint8_t'] +c_types = ['khcomplex128_t', + 'khcomplex64_t', + 'float64_t', + 'float32_t', + 'int64_t', + 'int32_t', + 'int16_t', + 'int8_t', + 'uint64_t', + 'uint32_t', + 'uint16_t', + 'uint8_t'] }} -{{for c_type in int_types}} +{{for c_type in c_types}} cdef bint is_nan_{{c_type}}({{c_type}} val) nogil: + {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }} + return val.real != val.real or val.imag != val.imag + {{elif c_type in {'float64_t', 'float32_t'} }} + return val != val + {{else}} return False + {{endif}} + +{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }} +# are_equivalent_{{c_type}} is cimported via khash.pxd +{{else}} cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil: return val1 == val2 +{{endif}} + {{endfor}} @@ -97,6 +98,7 @@ from pandas._libs.khash cimport ( kh_put_{{name}}, kh_resize_{{name}}, ) + {{endfor}} # ----------------------------------------------------------------------