From b871012448678068b14590accf4d8a41037d4b7f Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 17 Nov 2020 21:18:34 +0100 Subject: [PATCH 01/18] extracting khash for primitive types into a helper-file --- pandas/_libs/khash.pxd | 68 +---------------- .../_libs/khash_for_primitive_helper.pxi.in | 74 +++++++++++++++++++ setup.py | 1 + 3 files changed, 77 insertions(+), 66 deletions(-) create mode 100644 pandas/_libs/khash_for_primitive_helper.pxi.in diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 1bb3a158b4b1a..85f8c3d322770 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -67,72 +67,6 @@ cdef extern from "khash_python.h": void kh_destroy_str_starts(kh_str_starts_t*) nogil void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil - ctypedef struct kh_int64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - int64_t *keys - size_t *vals - - kh_int64_t* kh_init_int64() nogil - void kh_destroy_int64(kh_int64_t*) nogil - void kh_clear_int64(kh_int64_t*) nogil - khint_t kh_get_int64(kh_int64_t*, int64_t) nogil - void kh_resize_int64(kh_int64_t*, khint_t) nogil - khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil - void kh_del_int64(kh_int64_t*, khint_t) nogil - - bint kh_exist_int64(kh_int64_t*, khiter_t) nogil - - ctypedef uint64_t khuint64_t - - ctypedef struct kh_uint64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - khuint64_t *keys - size_t *vals - - kh_uint64_t* kh_init_uint64() nogil - void kh_destroy_uint64(kh_uint64_t*) nogil - void kh_clear_uint64(kh_uint64_t*) nogil - khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil - void kh_resize_uint64(kh_uint64_t*, khint_t) nogil - khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil - void kh_del_uint64(kh_uint64_t*, khint_t) nogil - - bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil - - ctypedef struct kh_float64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - float64_t *keys - size_t 
*vals - - kh_float64_t* kh_init_float64() nogil - void kh_destroy_float64(kh_float64_t*) nogil - void kh_clear_float64(kh_float64_t*) nogil - khint_t kh_get_float64(kh_float64_t*, float64_t) nogil - void kh_resize_float64(kh_float64_t*, khint_t) nogil - khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil - void kh_del_float64(kh_float64_t*, khint_t) nogil - - bint kh_exist_float64(kh_float64_t*, khiter_t) nogil - - ctypedef struct kh_int32_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - int32_t *keys - size_t *vals - - kh_int32_t* kh_init_int32() nogil - void kh_destroy_int32(kh_int32_t*) nogil - void kh_clear_int32(kh_int32_t*) nogil - khint_t kh_get_int32(kh_int32_t*, int32_t) nogil - void kh_resize_int32(kh_int32_t*, khint_t) nogil - khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil - void kh_del_int32(kh_int32_t*, khint_t) nogil - - bint kh_exist_int32(kh_int32_t*, khiter_t) nogil - # sweep factorize ctypedef struct kh_strbox_t: @@ -150,3 +84,5 @@ cdef extern from "khash_python.h": void kh_del_strbox(kh_strbox_t*, khint_t) nogil bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil + +include "khash_for_primitive_helper.pxi" diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in new file mode 100644 index 0000000000000..80a7abee4a07e --- /dev/null +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -0,0 +1,74 @@ +""" +Template for wrapping khash-tables for each primitive `dtype` + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + + + +cdef extern from "khash_python.h": + ctypedef struct kh_int64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + int64_t *keys + size_t *vals + + kh_int64_t* kh_init_int64() nogil + void kh_destroy_int64(kh_int64_t*) nogil + void kh_clear_int64(kh_int64_t*) nogil + khint_t kh_get_int64(kh_int64_t*, int64_t) nogil + void kh_resize_int64(kh_int64_t*, khint_t) nogil + khint_t 
kh_put_int64(kh_int64_t*, int64_t, int*) nogil + void kh_del_int64(kh_int64_t*, khint_t) nogil + + bint kh_exist_int64(kh_int64_t*, khiter_t) nogil + + ctypedef uint64_t khuint64_t + + ctypedef struct kh_uint64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + khuint64_t *keys + size_t *vals + + kh_uint64_t* kh_init_uint64() nogil + void kh_destroy_uint64(kh_uint64_t*) nogil + void kh_clear_uint64(kh_uint64_t*) nogil + khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil + void kh_resize_uint64(kh_uint64_t*, khint_t) nogil + khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil + void kh_del_uint64(kh_uint64_t*, khint_t) nogil + + bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil + + ctypedef struct kh_float64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + float64_t *keys + size_t *vals + + kh_float64_t* kh_init_float64() nogil + void kh_destroy_float64(kh_float64_t*) nogil + void kh_clear_float64(kh_float64_t*) nogil + khint_t kh_get_float64(kh_float64_t*, float64_t) nogil + void kh_resize_float64(kh_float64_t*, khint_t) nogil + khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil + void kh_del_float64(kh_float64_t*, khint_t) nogil + + bint kh_exist_float64(kh_float64_t*, khiter_t) nogil + + ctypedef struct kh_int32_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + int32_t *keys + size_t *vals + + kh_int32_t* kh_init_int32() nogil + void kh_destroy_int32(kh_int32_t*) nogil + void kh_clear_int32(kh_int32_t*) nogil + khint_t kh_get_int32(kh_int32_t*, int32_t) nogil + void kh_resize_int32(kh_int32_t*, khint_t) nogil + khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil + void kh_del_int32(kh_int32_t*, khint_t) nogil + + bint kh_exist_int32(kh_int32_t*, khiter_t) nogil diff --git a/setup.py b/setup.py index 78a789c808efb..56e3eaab0b85d 100755 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ def is_platform_mac(): "hashtable": [ "_libs/hashtable_class_helper.pxi.in", 
"_libs/hashtable_func_helper.pxi.in", + "_libs/khash_for_primitive_helper.pxi.in", ], "index": ["_libs/index_class_helper.pxi.in"], "sparse": ["_libs/sparse_op_helper.pxi.in"], From 8983425e61e95841bbaa34c49ca974e393ffc753 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 17 Nov 2020 21:45:14 +0100 Subject: [PATCH 02/18] use template for int64-map --- .../_libs/khash_for_primitive_helper.pxi.in | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in index 80a7abee4a07e..394388fb57597 100644 --- a/pandas/_libs/khash_for_primitive_helper.pxi.in +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -4,24 +4,34 @@ Template for wrapping khash-tables for each primitive `dtype` WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ +{{py: +# name, c_type +primitive_types = [('int64', 'int64_t')] +}} + +{{for name, c_type in primitive_types}} cdef extern from "khash_python.h": - ctypedef struct kh_int64_t: + ctypedef struct kh_{{name}}_t: khint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags - int64_t *keys + {{c_type}} *keys size_t *vals - kh_int64_t* kh_init_int64() nogil - void kh_destroy_int64(kh_int64_t*) nogil - void kh_clear_int64(kh_int64_t*) nogil - khint_t kh_get_int64(kh_int64_t*, int64_t) nogil - void kh_resize_int64(kh_int64_t*, khint_t) nogil - khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil - void kh_del_int64(kh_int64_t*, khint_t) nogil + kh_{{name}}_t* kh_init_{{name}}() nogil + void kh_destroy_{{name}}(kh_{{name}}_t*) nogil + void kh_clear_{{name}}(kh_{{name}}_t*) nogil + khint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil + void kh_resize_{{name}}(kh_{{name}}_t*, khint_t) nogil + khint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil + void kh_del_{{name}}(kh_{{name}}_t*, khint_t) nogil + + bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil - bint 
kh_exist_int64(kh_int64_t*, khiter_t) nogil +{{endfor}} + +cdef extern from "khash_python.h": ctypedef uint64_t khuint64_t From 9b3c5a5c4e75784315cc95804e3dd55776bdd2bb Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 17 Nov 2020 22:00:39 +0100 Subject: [PATCH 03/18] use template for uint64/float64/int32-map --- .../_libs/khash_for_primitive_helper.pxi.in | 54 ++----------------- 1 file changed, 5 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in index 394388fb57597..3a9c5a36784e8 100644 --- a/pandas/_libs/khash_for_primitive_helper.pxi.in +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -7,7 +7,11 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: # name, c_type -primitive_types = [('int64', 'int64_t')] +primitive_types = [('int64', 'int64_t'), + ('uint64', 'uint64_t'), + ('float64', 'float64_t'), + ('int32', 'int32_t'), + ] }} {{for name, c_type in primitive_types}} @@ -34,51 +38,3 @@ cdef extern from "khash_python.h": cdef extern from "khash_python.h": ctypedef uint64_t khuint64_t - - ctypedef struct kh_uint64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - khuint64_t *keys - size_t *vals - - kh_uint64_t* kh_init_uint64() nogil - void kh_destroy_uint64(kh_uint64_t*) nogil - void kh_clear_uint64(kh_uint64_t*) nogil - khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil - void kh_resize_uint64(kh_uint64_t*, khint_t) nogil - khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil - void kh_del_uint64(kh_uint64_t*, khint_t) nogil - - bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil - - ctypedef struct kh_float64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - float64_t *keys - size_t *vals - - kh_float64_t* kh_init_float64() nogil - void kh_destroy_float64(kh_float64_t*) nogil - void kh_clear_float64(kh_float64_t*) nogil - khint_t kh_get_float64(kh_float64_t*, float64_t) 
nogil - void kh_resize_float64(kh_float64_t*, khint_t) nogil - khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil - void kh_del_float64(kh_float64_t*, khint_t) nogil - - bint kh_exist_float64(kh_float64_t*, khiter_t) nogil - - ctypedef struct kh_int32_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - int32_t *keys - size_t *vals - - kh_int32_t* kh_init_int32() nogil - void kh_destroy_int32(kh_int32_t*) nogil - void kh_clear_int32(kh_int32_t*) nogil - khint_t kh_get_int32(kh_int32_t*, int32_t) nogil - void kh_resize_int32(kh_int32_t*, khint_t) nogil - khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil - void kh_del_int32(kh_int32_t*, khint_t) nogil - - bint kh_exist_int32(kh_int32_t*, khiter_t) nogil From e2f062b1011455af6817292d0e1ec859748c86bd Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 17 Nov 2020 22:01:30 +0100 Subject: [PATCH 04/18] remove unused define --- pandas/_libs/khash_for_primitive_helper.pxi.in | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in index 3a9c5a36784e8..674ff7f60b5ed 100644 --- a/pandas/_libs/khash_for_primitive_helper.pxi.in +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -34,7 +34,3 @@ cdef extern from "khash_python.h": bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil {{endfor}} - -cdef extern from "khash_python.h": - - ctypedef uint64_t khuint64_t From 8a7fc6c479ff03514540ecca48a76419b04dd76e Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 17 Nov 2020 22:46:42 +0100 Subject: [PATCH 05/18] introducing Int32HashTable --- pandas/_libs/hashtable.pxd | 8 ++++++++ pandas/_libs/hashtable.pyx | 6 ++++++ pandas/_libs/hashtable_class_helper.pxi.in | 8 ++++++-- pandas/_libs/hashtable_func_helper.pxi.in | 4 +++- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 75c273b35ee7d..2d1f8f6fd2688 100644 
--- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -2,8 +2,10 @@ from numpy cimport intp_t, ndarray from pandas._libs.khash cimport ( float64_t, + int32_t, int64_t, kh_float64_t, + kh_int32_t, kh_int64_t, kh_pymap_t, kh_str_t, @@ -28,6 +30,12 @@ cdef class Int64HashTable(HashTable): cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) +cdef class Int32HashTable(HashTable): + cdef kh_int32_t *table + + cpdef get_item(self, int32_t val) + cpdef set_item(self, int32_t key, Py_ssize_t val) + cdef class Float64HashTable(HashTable): cdef kh_float64_t *table diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 5a0cddb0af197..706c24360d024 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -16,23 +16,27 @@ from pandas._libs cimport util from pandas._libs.khash cimport ( kh_destroy_float64, kh_destroy_int64, + kh_destroy_int32, kh_destroy_pymap, kh_destroy_str, kh_destroy_uint64, kh_exist_float64, kh_exist_int64, + kh_exist_int32, kh_exist_pymap, kh_exist_str, kh_exist_uint64, kh_float64_t, kh_get_float64, kh_get_int64, + kh_get_int32, kh_get_pymap, kh_get_str, kh_get_strbox, kh_get_uint64, kh_init_float64, kh_init_int64, + kh_init_int32, kh_init_pymap, kh_init_str, kh_init_strbox, @@ -40,12 +44,14 @@ from pandas._libs.khash cimport ( kh_int64_t, kh_put_float64, kh_put_int64, + kh_put_int32, kh_put_pymap, kh_put_str, kh_put_strbox, kh_put_uint64, kh_resize_float64, kh_resize_int64, + kh_resize_int32, kh_resize_pymap, kh_resize_str, kh_resize_uint64, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index da91fa69b0dec..9e58a95982e95 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -21,6 +21,7 @@ from pandas._libs.missing cimport C_NA dtypes = [('Float64', 'float64', 'float64_t'), ('Int64', 'int64', 'int64_t'), + ('Int32', 'int32', 'int32_t'), ('String', 'string', 'char 
*'), ('UInt64', 'uint64', 'uint64_t')] }} @@ -49,6 +50,7 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data, ctypedef fused vector_data: Int64VectorData + Int32VectorData UInt64VectorData Float64VectorData StringVectorData @@ -65,7 +67,8 @@ cdef inline bint needs_resize(vector_data *data) nogil: # name, dtype, c_type dtypes = [('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t'), - ('Int64', 'int64', 'int64_t')] + ('Int64', 'int64', 'int64_t'), + ('Int32', 'int32', 'int32_t')] }} @@ -256,7 +259,8 @@ cdef class HashTable: # name, dtype, float_group, default_na_value dtypes = [('Float64', 'float64', True, 'np.nan'), ('UInt64', 'uint64', False, 0), - ('Int64', 'int64', False, 'NPY_NAT')] + ('Int64', 'int64', False, 'NPY_NAT'), + ('Int32', 'int32', False, 0)] }} diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 4a466ada765ca..edbf95945cf8f 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -10,7 +10,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in dtypes = [('float64', 'float64', 'float64_t'), ('uint64', 'uint64', 'uint64_t'), ('object', 'pymap', 'object'), - ('int64', 'int64', 'int64_t')] + ('int64', 'int64', 'int64_t'), + ('int32', 'int32', 'int32_t')] }} @@ -276,6 +277,7 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): # dtype, ctype, table_type, npy_dtype dtypes = [('float64', 'float64_t', 'float64', 'float64'), ('int64', 'int64_t', 'int64', 'int64'), + ('int32', 'int32_t', 'int32', 'int32'), ('uint64', 'uint64_t', 'uint64', 'uint64'), ('object', 'object', 'pymap', 'object_')] }} From 5ab4d6828ac4b6fa8ddaf9cc8815eb7d69e56771 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 17 Nov 2020 23:02:06 +0100 Subject: [PATCH 06/18] expanding some tests to test Int32HashTable --- pandas/tests/test_algos.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 34b7d0e73e914..3b6f5d145b500 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1517,6 +1517,7 @@ def test_get_unique(self): (ht.StringHashTable, ht.ObjectVector, "object", True), (ht.Float64HashTable, ht.Float64Vector, "float64", False), (ht.Int64HashTable, ht.Int64Vector, "int64", False), + (ht.Int32HashTable, ht.Int32Vector, "int32", False), (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False), ], ) @@ -1640,6 +1641,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): ht.StringHashTable, ht.Float64HashTable, ht.Int64HashTable, + ht.Int32HashTable, ht.UInt64HashTable, ], ) From d9ab327562b3158da6a5210095338c70762126f8 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 17 Nov 2020 23:21:40 +0100 Subject: [PATCH 07/18] moving cimport to helper, so it can become a template --- pandas/_libs/hashtable.pyx | 46 +--------------------- pandas/_libs/hashtable_class_helper.pxi.in | 23 +++++++++++ 2 files changed, 24 insertions(+), 45 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 706c24360d024..cc080a87cfb5b 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -13,51 +13,7 @@ cnp.import_array() from pandas._libs cimport util -from pandas._libs.khash cimport ( - kh_destroy_float64, - kh_destroy_int64, - kh_destroy_int32, - kh_destroy_pymap, - kh_destroy_str, - kh_destroy_uint64, - kh_exist_float64, - kh_exist_int64, - kh_exist_int32, - kh_exist_pymap, - kh_exist_str, - kh_exist_uint64, - kh_float64_t, - kh_get_float64, - kh_get_int64, - kh_get_int32, - kh_get_pymap, - kh_get_str, - kh_get_strbox, - kh_get_uint64, - kh_init_float64, - kh_init_int64, - kh_init_int32, - kh_init_pymap, - kh_init_str, - kh_init_strbox, - kh_init_uint64, - kh_int64_t, - kh_put_float64, - kh_put_int64, - kh_put_int32, - kh_put_pymap, - kh_put_str, - kh_put_strbox, - kh_put_uint64, - kh_resize_float64, - 
kh_resize_int64, - kh_resize_int32, - kh_resize_pymap, - kh_resize_str, - kh_resize_uint64, - kh_str_t, - khiter_t, -) +from pandas._libs.khash cimport kh_str_t, khiter_t from pandas._libs.missing cimport checknull diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 9e58a95982e95..679fbb980574b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -5,6 +5,29 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ +{{py: + +# name +cimported_types = ['float64', + 'int64', + 'int32', + 'pymap', + 'str', + 'strbox', + 'uint64'] +}} + +{{for name in cimported_types}} +from pandas._libs.khash cimport ( + kh_destroy_{{name}}, + kh_exist_{{name}}, + kh_get_{{name}}, + kh_init_{{name}}, + kh_put_{{name}}, + kh_resize_{{name}}, +) +{{endfor}} + # ---------------------------------------------------------------------- # VectorData # ---------------------------------------------------------------------- From 41d4b57703f562313e0818b31efa6e2faf04223b Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 18 Nov 2020 23:38:00 +0100 Subject: [PATCH 08/18] adding some tests for hashtables --- pandas/tests/libs/test_hashtable.py | 235 ++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 pandas/tests/libs/test_hashtable.py diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py new file mode 100644 index 0000000000000..600979f556f93 --- /dev/null +++ b/pandas/tests/libs/test_hashtable.py @@ -0,0 +1,235 @@ +import pytest +import numpy as np + +from pandas._libs import hashtable as ht +import pandas._testing as tm + + +@pytest.mark.parametrize( + "table_type, dtype", + [ + (ht.Int64HashTable, np.int64), + (ht.UInt64HashTable, np.uint64), + (ht.Float64HashTable, np.float64), + (ht.Int32HashTable, np.int32), + ], +) +class TestHashTable: + def test_get_set_contains_len(self, table_type, 
dtype): + index = 5 + table = table_type(55) + assert len(table) == 0 + assert index not in table + table.set_item(index, 42) + assert len(table) == 1 + assert index in table + assert table.get_item(index) == 42 + table.set_item(index + 1, 41) + assert index in table + assert index + 1 in table + assert len(table) == 2 + table.get_item(index) == 42 + table.get_item(index + 1) == 41 + table.set_item(index, 21) + assert index in table + assert index + 1 in table + assert len(table) == 2 + table.get_item(index) == 21 + table.get_item(index + 1) == 41 + assert index + 2 not in table + with pytest.raises(KeyError) as excinfo: + table.get_item(index + 2) + assert str(index + 2) in str(excinfo.value) + + def test_map(self, table_type, dtype): + N = 77 + table = table_type() + keys = np.arange(N).astype(dtype) + vals = np.arange(N).astype(np.int64) + N + table.map(keys, vals) + for i in range(N): + assert table.get_item(keys[i]) == i + N + + def test_map_locations(self, table_type, dtype): + N = 8 + table = table_type() + keys = (np.arange(N) + N).astype(dtype) + table.map_locations(keys) + for i in range(N): + assert table.get_item(keys[i]) == i + + def test_lookup(self, table_type, dtype): + N = 3 + table = table_type() + keys = (np.arange(N) + N).astype(dtype) + table.map_locations(keys) + result = table.lookup(keys) + expected = np.arange(N) + tm.assert_numpy_array_equal(result, expected) + + def test_lookup_wrong(self, table_type, dtype): + N = 512 + table = table_type() + keys = (np.arange(N) + N).astype(dtype) + table.map_locations(keys) + wrong_keys = np.arange(N).astype(dtype) + result = table.lookup(wrong_keys) + assert np.all(result == -1) + + def test_unique(self, table_type, dtype): + N = 1000 + table = table_type() + expected = (np.arange(N) + N).astype(dtype) + keys = np.repeat(expected, 5) + unique = table.unique(keys) + tm.assert_numpy_array_equal(unique, expected) + + +@pytest.mark.parametrize( + "table_type, dtype", + [ + (ht.Float64HashTable, 
np.float64), + ], +) +class TestHashTableWithNans: + def test_get_set_contains_len(self, table_type, dtype): + index = float("nan") + table = table_type() + assert index not in table + table.set_item(index, 42) + assert len(table) == 1 + assert index in table + assert table.get_item(index) == 42 + table.set_item(index, 41) + assert len(table) == 1 + assert index in table + assert table.get_item(index) == 41 + + def test_map(self, table_type, dtype): + N = 332 + table = table_type() + keys = np.full(N, np.nan, dtype=dtype) + vals = (np.arange(N) + N).astype(np.int64) + table.map(keys, vals) + assert len(table) == 1 + assert table.get_item(np.nan) == 2 * N - 1 + + def test_map_locations(self, table_type, dtype): + N = 10 + table = table_type() + keys = np.full(N, np.nan, dtype=dtype) + table.map_locations(keys) + assert len(table) == 1 + assert table.get_item(np.nan) == N - 1 + + def test_unique(self, table_type, dtype): + N = 1020 + table = table_type() + keys = np.full(N, np.nan, dtype=dtype) + unique = table.unique(keys) + assert np.all(np.isnan(unique)) and len(unique) == 1 + + +def get_ht_function(fun_name, type_suffix): + return getattr(ht, fun_name + "_" + type_suffix) + + +@pytest.mark.parametrize( + "dtype, type_suffix", + [ + (np.int64, "int64"), + (np.uint64, "uint64"), + (np.float64, "float64"), + (np.int32, "int32"), + ], +) +class TestHelpFunctions: + def test_value_count(self, dtype, type_suffix): + N = 43 + value_count = get_ht_function("value_count", type_suffix) + expected = (np.arange(N) + N).astype(dtype) + values = np.repeat(expected, 5) + keys, counts = value_count(values, False) + tm.assert_numpy_array_equal(np.sort(keys), expected) + assert np.all(counts == 5) + + def test_duplicated_first(self, dtype, type_suffix): + N = 100 + duplicated = get_ht_function("duplicated", type_suffix) + values = np.repeat(np.arange(N).astype(dtype), 5) + result = duplicated(values) + expected = np.ones_like(values, dtype=np.bool) + expected[::5] = False + 
tm.assert_numpy_array_equal(result, expected) + + def test_ismember_yes(self, dtype, type_suffix): + N = 127 + ismember = get_ht_function("ismember", type_suffix) + arr = np.arange(N).astype(dtype) + values = np.arange(N).astype(dtype) + result = ismember(arr, values) + expected = np.ones_like(values, dtype=np.bool) + tm.assert_numpy_array_equal(result, expected) + + def test_ismember_no(self, dtype, type_suffix): + N = 17 + ismember = get_ht_function("ismember", type_suffix) + arr = np.arange(N).astype(dtype) + values = (np.arange(N) + N).astype(dtype) + result = ismember(arr, values) + expected = np.zeros_like(values, dtype=np.bool) + tm.assert_numpy_array_equal(result, expected) + + def test_mode(self, dtype, type_suffix): + N = 11111 + mode = get_ht_function("mode", type_suffix) + values = np.repeat(np.arange(N).astype(dtype), 5) + values[0] = 42 + result = mode(values, False) + assert result == 42 + + +@pytest.mark.parametrize( + "dtype, type_suffix", + [ + (np.float64, "float64"), + ], +) +class TestHelpFunctionsWithNans: + def test_value_count(self, dtype, type_suffix): + value_count = get_ht_function("value_count", type_suffix) + values = np.array([np.nan, np.nan, np.nan], dtype=dtype) + keys, counts = value_count(values, True) + assert len(keys) == 0 + keys, counts = value_count(values, False) + assert len(keys) == 1 and np.all(np.isnan(keys)) + assert counts[0] == 3 + + def test_duplicated_first(self, dtype, type_suffix): + duplicated = get_ht_function("duplicated", type_suffix) + values = np.array([np.nan, np.nan, np.nan], dtype=dtype) + result = duplicated(values) + expected = np.array([False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_ismember_yes(self, dtype, type_suffix): + ismember = get_ht_function("ismember", type_suffix) + arr = np.array([np.nan, np.nan, np.nan], dtype=dtype) + values = np.array([np.nan, np.nan], dtype=dtype) + result = ismember(arr, values) + expected = np.array([True, True, True], dtype=np.bool) + 
tm.assert_numpy_array_equal(result, expected) + + def test_ismember_no(self, dtype, type_suffix): + ismember = get_ht_function("ismember", type_suffix) + arr = np.array([np.nan, np.nan, np.nan], dtype=dtype) + values = np.array([1], dtype=dtype) + result = ismember(arr, values) + expected = np.array([False, False, False], dtype=np.bool) + tm.assert_numpy_array_equal(result, expected) + + def test_mode(self, dtype, type_suffix): + mode = get_ht_function("mode", type_suffix) + values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype) + assert mode(values, True) == 42 + assert np.isnan(mode(values, False)) From 70c6fc5f669b7136e089156e6a28815ec593fa8e Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 19 Nov 2020 00:05:27 +0100 Subject: [PATCH 09/18] introducing UInt32HashTable --- pandas/_libs/hashtable.pxd | 8 ++++++++ pandas/_libs/hashtable_class_helper.pxi.in | 9 +++++++-- pandas/_libs/hashtable_func_helper.pxi.in | 2 ++ pandas/_libs/khash_for_primitive_helper.pxi.in | 1 + pandas/_libs/src/klib/khash.h | 5 +++++ pandas/tests/libs/test_hashtable.py | 5 ++++- 6 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 2d1f8f6fd2688..86aa9d7c2f4ab 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -9,7 +9,9 @@ from pandas._libs.khash cimport ( kh_int64_t, kh_pymap_t, kh_str_t, + kh_uint32_t, kh_uint64_t, + uint32_t, uint64_t, ) @@ -30,6 +32,12 @@ cdef class Int64HashTable(HashTable): cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) +cdef class UInt32HashTable(HashTable): + cdef kh_uint32_t *table + + cpdef get_item(self, uint32_t val) + cpdef set_item(self, uint32_t key, Py_ssize_t val) + cdef class Int32HashTable(HashTable): cdef kh_int32_t *table diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 679fbb980574b..426822533d7bc 100644 --- 
a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -9,11 +9,12 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # name cimported_types = ['float64', - 'int64', 'int32', + 'int64', 'pymap', 'str', 'strbox', + 'uint32', 'uint64'] }} @@ -46,7 +47,8 @@ dtypes = [('Float64', 'float64', 'float64_t'), ('Int64', 'int64', 'int64_t'), ('Int32', 'int32', 'int32_t'), ('String', 'string', 'char *'), - ('UInt64', 'uint64', 'uint64_t')] + ('UInt64', 'uint64', 'uint64_t'), + ('UInt32', 'uint32', 'uint32_t')] }} {{for name, dtype, c_type in dtypes}} @@ -75,6 +77,7 @@ ctypedef fused vector_data: Int64VectorData Int32VectorData UInt64VectorData + UInt32VectorData Float64VectorData StringVectorData @@ -91,6 +94,7 @@ cdef inline bint needs_resize(vector_data *data) nogil: dtypes = [('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t'), ('Int64', 'int64', 'int64_t'), + ('UInt32', 'uint32', 'uint32_t'), ('Int32', 'int32', 'int32_t')] }} @@ -283,6 +287,7 @@ cdef class HashTable: dtypes = [('Float64', 'float64', True, 'np.nan'), ('UInt64', 'uint64', False, 0), ('Int64', 'int64', False, 'NPY_NAT'), + ('UInt32', 'uint32', False, 0), ('Int32', 'int32', False, 0)] }} diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index edbf95945cf8f..85ae783bfce06 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -9,6 +9,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # dtype, ttype, c_type dtypes = [('float64', 'float64', 'float64_t'), ('uint64', 'uint64', 'uint64_t'), + ('uint32', 'uint32', 'uint32_t'), ('object', 'pymap', 'object'), ('int64', 'int64', 'int64_t'), ('int32', 'int32', 'int32_t')] @@ -279,6 +280,7 @@ dtypes = [('float64', 'float64_t', 'float64', 'float64'), ('int64', 'int64_t', 'int64', 'int64'), ('int32', 'int32_t', 'int32', 'int32'), ('uint64', 'uint64_t', 'uint64', 
'uint64'), + ('uint32', 'uint32_t', 'uint32', 'uint32'), ('object', 'object', 'pymap', 'object_')] }} diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in index 674ff7f60b5ed..8a11f747b9cdc 100644 --- a/pandas/_libs/khash_for_primitive_helper.pxi.in +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -11,6 +11,7 @@ primitive_types = [('int64', 'int64_t'), ('uint64', 'uint64_t'), ('float64', 'float64_t'), ('int32', 'int32_t'), + ('uint32', 'uint32_t'), ] }} diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index 61a4e80ea8cbc..5086115e50e12 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -591,6 +591,9 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) #define KHASH_MAP_INIT_INT(name, khval_t) \ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_UINT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + /*! 
@function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] @@ -635,9 +638,11 @@ typedef const char *kh_cstr_t; #define kh_exist_uint64(h, k) (kh_exist(h, k)) #define kh_exist_int64(h, k) (kh_exist(h, k)) #define kh_exist_int32(h, k) (kh_exist(h, k)) +#define kh_exist_uint32(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) +KHASH_MAP_INIT_UINT(uint32, size_t) KHASH_MAP_INIT_INT64(int64, size_t) KHASH_MAP_INIT_UINT64(uint64, size_t) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 600979f556f93..72221ef17ed23 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -1,7 +1,8 @@ -import pytest import numpy as np +import pytest from pandas._libs import hashtable as ht + import pandas._testing as tm @@ -12,6 +13,7 @@ (ht.UInt64HashTable, np.uint64), (ht.Float64HashTable, np.float64), (ht.Int32HashTable, np.int32), + (ht.UInt32HashTable, np.uint32), ], ) class TestHashTable: @@ -141,6 +143,7 @@ def get_ht_function(fun_name, type_suffix): (np.uint64, "uint64"), (np.float64, "float64"), (np.int32, "int32"), + (np.uint32, "uint32"), ], ) class TestHelpFunctions: From 15dfe49833ff9bcfed557843b2a5712c62c71db6 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 19 Nov 2020 20:35:45 +0100 Subject: [PATCH 10/18] formatting test case (and adding some missing asserts) --- pandas/tests/libs/test_hashtable.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 72221ef17ed23..41399c939f6ad 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -22,23 +22,27 @@ def test_get_set_contains_len(self, table_type, dtype): table = table_type(55) assert len(table) == 0 assert index not in table + table.set_item(index, 42) assert len(table) == 1 assert index in 
table assert table.get_item(index) == 42 + table.set_item(index + 1, 41) assert index in table assert index + 1 in table assert len(table) == 2 - table.get_item(index) == 42 - table.get_item(index + 1) == 41 + assert table.get_item(index) == 42 + assert table.get_item(index + 1) == 41 + table.set_item(index, 21) assert index in table assert index + 1 in table assert len(table) == 2 - table.get_item(index) == 21 - table.get_item(index + 1) == 41 + assert table.get_item(index) == 21 + assert table.get_item(index + 1) == 41 assert index + 2 not in table + with pytest.raises(KeyError) as excinfo: table.get_item(index + 2) assert str(index + 2) in str(excinfo.value) From 8975f065e2b99c025b0f427240dbb430155b66e9 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 19 Nov 2020 21:54:47 +0100 Subject: [PATCH 11/18] introducing Float32HashMap --- pandas/_libs/hashtable.pxd | 8 ++++++ pandas/_libs/hashtable_class_helper.pxi.in | 7 ++++- pandas/_libs/hashtable_func_helper.pxi.in | 4 ++- pandas/_libs/khash.pxd | 2 +- .../_libs/khash_for_primitive_helper.pxi.in | 1 + pandas/_libs/src/klib/khash.h | 2 ++ pandas/_libs/src/klib/khash_python.h | 28 +++++++++++++++++-- pandas/tests/libs/test_hashtable.py | 6 ++++ 8 files changed, 53 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 86aa9d7c2f4ab..4a27e2fd0a1fe 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -1,9 +1,11 @@ from numpy cimport intp_t, ndarray from pandas._libs.khash cimport ( + float32_t, float64_t, int32_t, int64_t, + kh_float32_t, kh_float64_t, kh_int32_t, kh_int64_t, @@ -50,6 +52,12 @@ cdef class Float64HashTable(HashTable): cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) +cdef class Float32HashTable(HashTable): + cdef kh_float32_t *table + + cpdef get_item(self, float32_t val) + cpdef set_item(self, float32_t key, Py_ssize_t val) + cdef class PyObjectHashTable(HashTable): cdef kh_pymap_t 
*table diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 426822533d7bc..7941c8d8b27c8 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -8,7 +8,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: # name -cimported_types = ['float64', +cimported_types = ['float32', + 'float64', 'int32', 'int64', 'pymap', @@ -44,6 +45,7 @@ from pandas._libs.missing cimport C_NA # for uniques in hashtables) dtypes = [('Float64', 'float64', 'float64_t'), + ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), ('Int32', 'int32', 'int32_t'), ('String', 'string', 'char *'), @@ -79,6 +81,7 @@ ctypedef fused vector_data: UInt64VectorData UInt32VectorData Float64VectorData + Float32VectorData StringVectorData cdef inline bint needs_resize(vector_data *data) nogil: @@ -94,6 +97,7 @@ cdef inline bint needs_resize(vector_data *data) nogil: dtypes = [('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t'), ('Int64', 'int64', 'int64_t'), + ('Float32', 'float32', 'float32_t'), ('UInt32', 'uint32', 'uint32_t'), ('Int32', 'int32', 'int32_t')] @@ -287,6 +291,7 @@ cdef class HashTable: dtypes = [('Float64', 'float64', True, 'np.nan'), ('UInt64', 'uint64', False, 0), ('Int64', 'int64', False, 'NPY_NAT'), + ('Float32', 'float32', True, 'np.nan'), ('UInt32', 'uint32', False, 0), ('Int32', 'int32', False, 0)] diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 85ae783bfce06..b0eb3e7aa0577 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -8,6 +8,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # dtype, ttype, c_type dtypes = [('float64', 'float64', 'float64_t'), + ('float32', 'float32', 'float32_t'), ('uint64', 'uint64', 'uint64_t'), ('uint32', 'uint32', 'uint32_t'), ('object', 'pymap', 'object'), @@ 
-56,7 +57,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, for i in range(n): val = values[i] - {{if dtype == 'float64'}} + {{if dtype == 'float64' or dtype == 'float32'}} if val == val or not dropna: {{else}} if True: @@ -277,6 +278,7 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): # dtype, ctype, table_type, npy_dtype dtypes = [('float64', 'float64_t', 'float64', 'float64'), + ('float32', 'float32_t', 'float32', 'float32'), ('int64', 'int64_t', 'int64', 'int64'), ('int32', 'int32_t', 'int32', 'int32'), ('uint64', 'uint64_t', 'uint64', 'uint64'), diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 85f8c3d322770..7dacbdf36c286 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,5 +1,5 @@ from cpython.object cimport PyObject -from numpy cimport float64_t, int32_t, int64_t, uint32_t, uint64_t +from numpy cimport float32_t, float64_t, int32_t, int64_t, uint32_t, uint64_t cdef extern from "khash_python.h": diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in index 8a11f747b9cdc..0c26a5883bcbe 100644 --- a/pandas/_libs/khash_for_primitive_helper.pxi.in +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -12,6 +12,7 @@ primitive_types = [('int64', 'int64_t'), ('float64', 'float64_t'), ('int32', 'int32_t'), ('uint32', 'uint32_t'), + ('float32', 'float32_t'), ] }} diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index 5086115e50e12..facacd2c2e160 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -130,6 +130,7 @@ typedef signed long long khint64_t; #endif typedef double khfloat64_t; +typedef double khfloat32_t; typedef khint32_t khint_t; typedef khint_t khiter_t; @@ -637,6 +638,7 @@ typedef const char *kh_cstr_t; #define kh_exist_float64(h, k) (kh_exist(h, k)) #define kh_exist_uint64(h, k) (kh_exist(h, k)) #define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_float32(h, 
k) (kh_exist(h, k)) #define kh_exist_int32(h, k) (kh_exist(h, k)) #define kh_exist_uint32(h, k) (kh_exist(h, k)) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index aebc229abddd2..c37f0e950baa7 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -23,6 +23,12 @@ khint64_t PANDAS_INLINE asint64(double key) { return val; } +khint32_t PANDAS_INLINE asint32(float key) { + khint32_t val; + memcpy(&val, &key, sizeof(float)); + return val; +} + #define ZERO_HASH 0 #define NAN_HASH 0 @@ -39,13 +45,31 @@ khint32_t PANDAS_INLINE kh_float64_hash_func(double val){ return murmur2_64to32(as_int); } -#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) +khint32_t PANDAS_INLINE kh_float32_hash_func(float val){ + // 0.0 and -0.0 should have the same hash: + if (val == 0.0f){ + return ZERO_HASH; + } + // all nans should have the same hash: + if ( val!=val ){ + return NAN_HASH; + } + khint32_t as_int = asint32(val); + return murmur2_32to32(as_int); +} + +#define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ - KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal) + KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal) KHASH_MAP_INIT_FLOAT64(float64, size_t) +#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \ + KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal) + +KHASH_MAP_INIT_FLOAT32(float32, size_t) + int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 41399c939f6ad..05bc86c2e34f5 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -14,6 +14,7 @@ (ht.Float64HashTable, np.float64), (ht.Int32HashTable, np.int32), 
(ht.UInt32HashTable, np.uint32), + (ht.Float32HashTable, np.float32), ], ) class TestHashTable: @@ -95,6 +96,7 @@ def test_unique(self, table_type, dtype): "table_type, dtype", [ (ht.Float64HashTable, np.float64), + (ht.Float32HashTable, np.float32), ], ) class TestHashTableWithNans: @@ -102,10 +104,12 @@ def test_get_set_contains_len(self, table_type, dtype): index = float("nan") table = table_type() assert index not in table + table.set_item(index, 42) assert len(table) == 1 assert index in table assert table.get_item(index) == 42 + table.set_item(index, 41) assert len(table) == 1 assert index in table @@ -148,6 +152,7 @@ def get_ht_function(fun_name, type_suffix): (np.float64, "float64"), (np.int32, "int32"), (np.uint32, "uint32"), + (np.float32, "float32"), ], ) class TestHelpFunctions: @@ -200,6 +205,7 @@ def test_mode(self, dtype, type_suffix): "dtype, type_suffix", [ (np.float64, "float64"), + (np.float32, "float32"), ], ) class TestHelpFunctionsWithNans: From 0ffd3b233f5ced0515e3ea09fe17cd80e50e6c5b Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 19 Nov 2020 22:36:44 +0100 Subject: [PATCH 12/18] introducing Int16HashTable and UInt16HashTable --- pandas/_libs/hashtable.pxd | 16 ++++++++++++++ pandas/_libs/hashtable_class_helper.pxi.in | 16 +++++++++++--- pandas/_libs/hashtable_func_helper.pxi.in | 6 +++++- pandas/_libs/khash.pxd | 11 +++++++++- .../_libs/khash_for_primitive_helper.pxi.in | 2 ++ pandas/_libs/src/klib/khash.h | 21 +++++++++++++++++++ pandas/tests/libs/test_hashtable.py | 4 ++++ 7 files changed, 71 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 4a27e2fd0a1fe..a8790fa00b908 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -3,16 +3,20 @@ from numpy cimport intp_t, ndarray from pandas._libs.khash cimport ( float32_t, float64_t, + int16_t, int32_t, int64_t, kh_float32_t, kh_float64_t, + kh_int16_t, kh_int32_t, kh_int64_t, kh_pymap_t, kh_str_t, + 
kh_uint16_t, kh_uint32_t, kh_uint64_t, + uint16_t, uint32_t, uint64_t, ) @@ -46,6 +50,18 @@ cdef class Int32HashTable(HashTable): cpdef get_item(self, int32_t val) cpdef set_item(self, int32_t key, Py_ssize_t val) +cdef class UInt16HashTable(HashTable): + cdef kh_uint16_t *table + + cpdef get_item(self, uint16_t val) + cpdef set_item(self, uint16_t key, Py_ssize_t val) + +cdef class Int16HashTable(HashTable): + cdef kh_int16_t *table + + cpdef get_item(self, int16_t val) + cpdef set_item(self, int16_t key, Py_ssize_t val) + cdef class Float64HashTable(HashTable): cdef kh_float64_t *table diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 7941c8d8b27c8..f215f12e997af 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -10,11 +10,13 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # name cimported_types = ['float32', 'float64', + 'int16', 'int32', 'int64', 'pymap', 'str', 'strbox', + 'uint16', 'uint32', 'uint64'] }} @@ -48,9 +50,11 @@ dtypes = [('Float64', 'float64', 'float64_t'), ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), ('Int32', 'int32', 'int32_t'), + ('Int16', 'int16', 'int16_t'), ('String', 'string', 'char *'), ('UInt64', 'uint64', 'uint64_t'), - ('UInt32', 'uint32', 'uint32_t')] + ('UInt32', 'uint32', 'uint32_t'), + ('UInt16', 'uint16', 'uint16_t')] }} {{for name, dtype, c_type in dtypes}} @@ -78,8 +82,10 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data, ctypedef fused vector_data: Int64VectorData Int32VectorData + Int16VectorData UInt64VectorData UInt32VectorData + UInt16VectorData Float64VectorData Float32VectorData StringVectorData @@ -99,7 +105,9 @@ dtypes = [('Float64', 'float64', 'float64_t'), ('Int64', 'int64', 'int64_t'), ('Float32', 'float32', 'float32_t'), ('UInt32', 'uint32', 'uint32_t'), - ('Int32', 'int32', 'int32_t')] + ('Int32', 'int32', 'int32_t'), + ('UInt16', 
'uint16', 'uint16_t'), + ('Int16', 'int16', 'int16_t')] }} @@ -293,7 +301,9 @@ dtypes = [('Float64', 'float64', True, 'np.nan'), ('Int64', 'int64', False, 'NPY_NAT'), ('Float32', 'float32', True, 'np.nan'), ('UInt32', 'uint32', False, 0), - ('Int32', 'int32', False, 0)] + ('Int32', 'int32', False, 0), + ('UInt16', 'uint16', False, 0), + ('Int16', 'int16', False, 0)] }} diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index b0eb3e7aa0577..a00e46cf7f0e3 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -11,9 +11,11 @@ dtypes = [('float64', 'float64', 'float64_t'), ('float32', 'float32', 'float32_t'), ('uint64', 'uint64', 'uint64_t'), ('uint32', 'uint32', 'uint32_t'), + ('uint16', 'uint16', 'uint16_t'), ('object', 'pymap', 'object'), ('int64', 'int64', 'int64_t'), - ('int32', 'int32', 'int32_t')] + ('int32', 'int32', 'int32_t'), + ('int16', 'int16', 'int16_t')] }} @@ -281,8 +283,10 @@ dtypes = [('float64', 'float64_t', 'float64', 'float64'), ('float32', 'float32_t', 'float32', 'float32'), ('int64', 'int64_t', 'int64', 'int64'), ('int32', 'int32_t', 'int32', 'int32'), + ('int16', 'int16_t', 'int16', 'int16'), ('uint64', 'uint64_t', 'uint64', 'uint64'), ('uint32', 'uint32_t', 'uint32', 'uint32'), + ('uint16', 'uint16_t', 'uint16', 'uint16'), ('object', 'object', 'pymap', 'object_')] }} diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 7dacbdf36c286..d97a5afe7288c 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,5 +1,14 @@ from cpython.object cimport PyObject -from numpy cimport float32_t, float64_t, int32_t, int64_t, uint32_t, uint64_t +from numpy cimport ( + float32_t, + float64_t, + int16_t, + int32_t, + int64_t, + uint16_t, + uint32_t, + uint64_t, +) cdef extern from "khash_python.h": diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in index 
0c26a5883bcbe..750a87b596eae 100644 --- a/pandas/_libs/khash_for_primitive_helper.pxi.in +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -13,6 +13,8 @@ primitive_types = [('int64', 'int64_t'), ('int32', 'int32_t'), ('uint32', 'uint32_t'), ('float32', 'float32_t'), + ('int16', 'int16_t'), + ('uint16', 'uint16_t'), ] }} diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index facacd2c2e160..607acee02dd9e 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -129,6 +129,12 @@ typedef unsigned long long khuint64_t; typedef signed long long khint64_t; #endif +#if UINT_MAX == 0xffffu +typedef unsigned int khint16_t; +#elif USHRT_MAX == 0xffffu +typedef unsigned short khint16_t; +#endif + typedef double khfloat64_t; typedef double khfloat32_t; @@ -616,6 +622,17 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) +/*! @function + @abstract Instantiate a hash map containing 16bit-integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT16(name, khval_t) \ + KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +#define KHASH_MAP_INIT_UINT16(name, khval_t) \ + KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + typedef const char *kh_cstr_t; /*! 
@function @@ -641,12 +658,16 @@ typedef const char *kh_cstr_t; #define kh_exist_float32(h, k) (kh_exist(h, k)) #define kh_exist_int32(h, k) (kh_exist(h, k)) #define kh_exist_uint32(h, k) (kh_exist(h, k)) +#define kh_exist_int16(h, k) (kh_exist(h, k)) +#define kh_exist_uint16(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) KHASH_MAP_INIT_UINT(uint32, size_t) KHASH_MAP_INIT_INT64(int64, size_t) KHASH_MAP_INIT_UINT64(uint64, size_t) +KHASH_MAP_INIT_INT16(int16, size_t) +KHASH_MAP_INIT_UINT16(uint16, size_t) #endif /* __AC_KHASH_H */ diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 05bc86c2e34f5..a7eec40c8857f 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -15,6 +15,8 @@ (ht.Int32HashTable, np.int32), (ht.UInt32HashTable, np.uint32), (ht.Float32HashTable, np.float32), + (ht.Int16HashTable, np.int16), + (ht.UInt16HashTable, np.uint16), ], ) class TestHashTable: @@ -153,6 +155,8 @@ def get_ht_function(fun_name, type_suffix): (np.int32, "int32"), (np.uint32, "uint32"), (np.float32, "float32"), + (np.int16, "int16"), + (np.uint16, "uint16"), ], ) class TestHelpFunctions: From c952d680579550f43cecac4907730c481e114b54 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 19 Nov 2020 23:10:26 +0100 Subject: [PATCH 13/18] introducing UInt8HashTable and Int8HashTable --- pandas/_libs/hashtable.pxd | 16 +++++++++++++++ pandas/_libs/hashtable_class_helper.pxi.in | 16 ++++++++++++--- pandas/_libs/hashtable_func_helper.pxi.in | 6 +++++- pandas/_libs/khash.pxd | 2 ++ .../_libs/khash_for_primitive_helper.pxi.in | 2 ++ pandas/_libs/src/klib/khash.h | 20 +++++++++++++++++++ pandas/tests/libs/test_hashtable.py | 19 +++++++++++++++--- 7 files changed, 74 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index a8790fa00b908..7b630c264753f 100644 --- a/pandas/_libs/hashtable.pxd +++ 
b/pandas/_libs/hashtable.pxd @@ -3,19 +3,23 @@ from numpy cimport intp_t, ndarray from pandas._libs.khash cimport ( float32_t, float64_t, + int8_t, int16_t, int32_t, int64_t, kh_float32_t, kh_float64_t, + kh_int8_t, kh_int16_t, kh_int32_t, kh_int64_t, kh_pymap_t, kh_str_t, + kh_uint8_t, kh_uint16_t, kh_uint32_t, kh_uint64_t, + uint8_t, uint16_t, uint32_t, uint64_t, @@ -62,6 +66,18 @@ cdef class Int16HashTable(HashTable): cpdef get_item(self, int16_t val) cpdef set_item(self, int16_t key, Py_ssize_t val) +cdef class UInt8HashTable(HashTable): + cdef kh_uint8_t *table + + cpdef get_item(self, uint8_t val) + cpdef set_item(self, uint8_t key, Py_ssize_t val) + +cdef class Int8HashTable(HashTable): + cdef kh_int8_t *table + + cpdef get_item(self, int8_t val) + cpdef set_item(self, int8_t key, Py_ssize_t val) + cdef class Float64HashTable(HashTable): cdef kh_float64_t *table diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f215f12e997af..915695161656c 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -10,12 +10,14 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # name cimported_types = ['float32', 'float64', + 'int8', 'int16', 'int32', 'int64', 'pymap', 'str', 'strbox', + 'uint8', 'uint16', 'uint32', 'uint64'] @@ -51,10 +53,12 @@ dtypes = [('Float64', 'float64', 'float64_t'), ('Int64', 'int64', 'int64_t'), ('Int32', 'int32', 'int32_t'), ('Int16', 'int16', 'int16_t'), + ('Int8', 'int8', 'int8_t'), ('String', 'string', 'char *'), ('UInt64', 'uint64', 'uint64_t'), ('UInt32', 'uint32', 'uint32_t'), - ('UInt16', 'uint16', 'uint16_t')] + ('UInt16', 'uint16', 'uint16_t'), + ('UInt8', 'uint8', 'uint8_t')] }} {{for name, dtype, c_type in dtypes}} @@ -83,9 +87,11 @@ ctypedef fused vector_data: Int64VectorData Int32VectorData Int16VectorData + Int8VectorData UInt64VectorData UInt32VectorData UInt16VectorData + UInt8VectorData 
Float64VectorData Float32VectorData StringVectorData @@ -107,7 +113,9 @@ dtypes = [('Float64', 'float64', 'float64_t'), ('UInt32', 'uint32', 'uint32_t'), ('Int32', 'int32', 'int32_t'), ('UInt16', 'uint16', 'uint16_t'), - ('Int16', 'int16', 'int16_t')] + ('Int16', 'int16', 'int16_t'), + ('UInt8', 'uint8', 'uint8_t'), + ('Int8', 'int8', 'int8_t')] }} @@ -303,7 +311,9 @@ dtypes = [('Float64', 'float64', True, 'np.nan'), ('UInt32', 'uint32', False, 0), ('Int32', 'int32', False, 0), ('UInt16', 'uint16', False, 0), - ('Int16', 'int16', False, 0)] + ('Int16', 'int16', False, 0), + ('UInt8', 'uint8', False, 0), + ('Int8', 'int8', False, 0)] }} diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index a00e46cf7f0e3..7c5afa4ff6b27 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -12,10 +12,12 @@ dtypes = [('float64', 'float64', 'float64_t'), ('uint64', 'uint64', 'uint64_t'), ('uint32', 'uint32', 'uint32_t'), ('uint16', 'uint16', 'uint16_t'), + ('uint8', 'uint8', 'uint8_t'), ('object', 'pymap', 'object'), ('int64', 'int64', 'int64_t'), ('int32', 'int32', 'int32_t'), - ('int16', 'int16', 'int16_t')] + ('int16', 'int16', 'int16_t'), + ('int8', 'int8', 'int8_t')] }} @@ -284,9 +286,11 @@ dtypes = [('float64', 'float64_t', 'float64', 'float64'), ('int64', 'int64_t', 'int64', 'int64'), ('int32', 'int32_t', 'int32', 'int32'), ('int16', 'int16_t', 'int16', 'int16'), + ('int8', 'int8_t', 'int8', 'int8'), ('uint64', 'uint64_t', 'uint64', 'uint64'), ('uint32', 'uint32_t', 'uint32', 'uint32'), ('uint16', 'uint16_t', 'uint16', 'uint16'), + ('uint8', 'uint8_t', 'uint8', 'uint8'), ('object', 'object', 'pymap', 'object_')] }} diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index d97a5afe7288c..8b082747bf22b 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -2,9 +2,11 @@ from cpython.object cimport PyObject from numpy cimport ( float32_t, float64_t, + int8_t, 
int16_t, int32_t, int64_t, + uint8_t, uint16_t, uint32_t, uint64_t, diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in index 750a87b596eae..db8d3e0b19417 100644 --- a/pandas/_libs/khash_for_primitive_helper.pxi.in +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -15,6 +15,8 @@ primitive_types = [('int64', 'int64_t'), ('float32', 'float32_t'), ('int16', 'int16_t'), ('uint16', 'uint16_t'), + ('int8', 'int8_t'), + ('uint8', 'uint8_t'), ] }} diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index 607acee02dd9e..d08e53b681202 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -135,6 +135,10 @@ typedef unsigned int khint16_t; typedef unsigned short khint16_t; #endif +#if UCHAR_MAX == 0xffu +typedef unsigned char khint8_t; +#endif + typedef double khfloat64_t; typedef double khfloat32_t; @@ -633,6 +637,18 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) #define KHASH_MAP_INIT_UINT16(name, khval_t) \ KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +/*! @function + @abstract Instantiate a hash map containing 8bit-integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT8(name, khval_t) \ + KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +#define KHASH_MAP_INIT_UINT8(name, khval_t) \ + KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + + typedef const char *kh_cstr_t; /*! 
@function @@ -660,6 +676,8 @@ typedef const char *kh_cstr_t; #define kh_exist_uint32(h, k) (kh_exist(h, k)) #define kh_exist_int16(h, k) (kh_exist(h, k)) #define kh_exist_uint16(h, k) (kh_exist(h, k)) +#define kh_exist_int8(h, k) (kh_exist(h, k)) +#define kh_exist_uint8(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) @@ -668,6 +686,8 @@ KHASH_MAP_INIT_INT64(int64, size_t) KHASH_MAP_INIT_UINT64(uint64, size_t) KHASH_MAP_INIT_INT16(int16, size_t) KHASH_MAP_INIT_UINT16(uint16, size_t) +KHASH_MAP_INIT_INT16(int8, size_t) +KHASH_MAP_INIT_UINT16(uint8, size_t) #endif /* __AC_KHASH_H */ diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index a7eec40c8857f..61768b9de05ee 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -17,6 +17,8 @@ (ht.Float32HashTable, np.float32), (ht.Int16HashTable, np.int16), (ht.UInt16HashTable, np.uint16), + (ht.Int8HashTable, np.int8), + (ht.UInt8HashTable, np.uint8), ], ) class TestHashTable: @@ -77,7 +79,10 @@ def test_lookup(self, table_type, dtype): tm.assert_numpy_array_equal(result, expected) def test_lookup_wrong(self, table_type, dtype): - N = 512 + if dtype in (np.int8, np.uint8): + N = 100 + else: + N = 512 table = table_type() keys = (np.arange(N) + N).astype(dtype) table.map_locations(keys) @@ -86,7 +91,10 @@ def test_lookup_wrong(self, table_type, dtype): assert np.all(result == -1) def test_unique(self, table_type, dtype): - N = 1000 + if dtype in (np.int8, np.uint8): + N = 88 + else: + N = 1000 table = table_type() expected = (np.arange(N) + N).astype(dtype) keys = np.repeat(expected, 5) @@ -157,6 +165,8 @@ def get_ht_function(fun_name, type_suffix): (np.float32, "float32"), (np.int16, "int16"), (np.uint16, "uint16"), + (np.int8, "int8"), + (np.uint8, "uint8"), ], ) class TestHelpFunctions: @@ -197,7 +207,10 @@ def test_ismember_no(self, dtype, type_suffix): tm.assert_numpy_array_equal(result, expected) def 
test_mode(self, dtype, type_suffix): - N = 11111 + if dtype in (np.int8, np.uint8): + N = 53 + else: + N = 11111 mode = get_ht_function("mode", type_suffix) values = np.repeat(np.arange(N).astype(dtype), 5) values[0] = 42 From 6d026b27194bd6d560257ca2301d5ad69def0364 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 20 Nov 2020 21:44:14 +0100 Subject: [PATCH 14/18] fixing minor issues with tests --- pandas/tests/libs/test_hashtable.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 61768b9de05ee..5ef110e9672f0 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -76,7 +76,7 @@ def test_lookup(self, table_type, dtype): table.map_locations(keys) result = table.lookup(keys) expected = np.arange(N) - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result.astype(np.int64), expected.astype(np.int64)) def test_lookup_wrong(self, table_type, dtype): if dtype in (np.int8, np.uint8): @@ -184,7 +184,7 @@ def test_duplicated_first(self, dtype, type_suffix): duplicated = get_ht_function("duplicated", type_suffix) values = np.repeat(np.arange(N).astype(dtype), 5) result = duplicated(values) - expected = np.ones_like(values, dtype=np.bool) + expected = np.ones_like(values, dtype=np.bool_) expected[::5] = False tm.assert_numpy_array_equal(result, expected) @@ -194,7 +194,7 @@ def test_ismember_yes(self, dtype, type_suffix): arr = np.arange(N).astype(dtype) values = np.arange(N).astype(dtype) result = ismember(arr, values) - expected = np.ones_like(values, dtype=np.bool) + expected = np.ones_like(values, dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) def test_ismember_no(self, dtype, type_suffix): @@ -203,7 +203,7 @@ def test_ismember_no(self, dtype, type_suffix): arr = np.arange(N).astype(dtype) values = (np.arange(N) + N).astype(dtype) result = ismember(arr, values) - expected 
= np.zeros_like(values, dtype=np.bool) + expected = np.zeros_like(values, dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) def test_mode(self, dtype, type_suffix): @@ -247,7 +247,7 @@ def test_ismember_yes(self, dtype, type_suffix): arr = np.array([np.nan, np.nan, np.nan], dtype=dtype) values = np.array([np.nan, np.nan], dtype=dtype) result = ismember(arr, values) - expected = np.array([True, True, True], dtype=np.bool) + expected = np.array([True, True, True], dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) def test_ismember_no(self, dtype, type_suffix): @@ -255,7 +255,7 @@ def test_ismember_no(self, dtype, type_suffix): arr = np.array([np.nan, np.nan, np.nan], dtype=dtype) values = np.array([1], dtype=dtype) result = ismember(arr, values) - expected = np.array([False, False, False], dtype=np.bool) + expected = np.array([False, False, False], dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) def test_mode(self, dtype, type_suffix): From a2e567939ed4f620dcb18f970a63582db8cf5c8b Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 20 Nov 2020 21:53:45 +0100 Subject: [PATCH 15/18] adding comment why unsigned int is used for maps --- pandas/_libs/src/klib/khash.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index d08e53b681202..9de50aa6a2285 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -599,6 +599,13 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ + +// we implicitly convert signed int to unsigned int, thus potential overflows +// for operations (<<,*,+) don't trigger undefined behavior, also >>-operator +// is implementation defined for signed ints if sign-bit is set. 
+// because we never really "get" the keys, there will be no convertion from +// unsigend int to (signed) int (which would be implementation defined behavior) +// this holds also for 16- and 8-bit integers #define KHASH_MAP_INIT_INT(name, khval_t) \ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) From 897512d5eba6076340ca4e61068816ab54289f82 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 20 Nov 2020 22:05:06 +0100 Subject: [PATCH 16/18] use unsigned ints also for 64-bit maps, for the sake of consistency but also to avoid undefined/implementation defined behaviors in case of an overflow --- pandas/_libs/src/klib/khash.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index 9de50aa6a2285..ecd15d1893c23 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -122,11 +122,9 @@ typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX -typedef unsigned long khuint64_t; -typedef signed long khint64_t; +typedef unsigned long khint64_t; #else -typedef unsigned long long khuint64_t; -typedef signed long long khint64_t; +typedef unsigned long long khint64_t; #endif #if UINT_MAX == 0xffffu @@ -605,7 +603,7 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) // is implementation defined for signed ints if sign-bit is set. 
// because we never really "get" the keys, there will be no convertion from // unsigend int to (signed) int (which would be implementation defined behavior) -// this holds also for 16- and 8-bit integers +// this holds also for 64-, 16- and 8-bit integers #define KHASH_MAP_INIT_INT(name, khval_t) \ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) @@ -617,7 +615,7 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_UINT64(name) \ - KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) @@ -628,7 +626,7 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_UINT64(name, khval_t) \ - KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) From 17a3feed54890d806e25cd731e9559be60d27366 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 20 Nov 2020 22:20:51 +0100 Subject: [PATCH 17/18] adding missing dependency, otherwise changing khash.h does not trigger rebuild --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 56e3eaab0b85d..9f33c045df6ed 100755 --- a/setup.py +++ b/setup.py @@ -526,7 +526,10 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "_libs.hashtable": { "pyxfile": "_libs/hashtable", "include": klib_include, - "depends": (["pandas/_libs/src/klib/khash_python.h"] + _pxi_dep["hashtable"]), + "depends": ( + ["pandas/_libs/src/klib/khash_python.h", "pandas/_libs/src/klib/khash.h"] + + 
_pxi_dep["hashtable"] + ), }, "_libs.index": { "pyxfile": "_libs/index", From 3a4c2bcff2de207b3d993b3d5ec5dd7d2da291df Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 20 Nov 2020 22:29:52 +0100 Subject: [PATCH 18/18] removing not really needed default_na_value --- pandas/_libs/hashtable_class_helper.pxi.in | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 915695161656c..f7001c165870e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -303,22 +303,22 @@ cdef class HashTable: {{py: -# name, dtype, float_group, default_na_value -dtypes = [('Float64', 'float64', True, 'np.nan'), - ('UInt64', 'uint64', False, 0), - ('Int64', 'int64', False, 'NPY_NAT'), - ('Float32', 'float32', True, 'np.nan'), - ('UInt32', 'uint32', False, 0), - ('Int32', 'int32', False, 0), - ('UInt16', 'uint16', False, 0), - ('Int16', 'int16', False, 0), - ('UInt8', 'uint8', False, 0), - ('Int8', 'int8', False, 0)] +# name, dtype, float_group +dtypes = [('Float64', 'float64', True), + ('UInt64', 'uint64', False), + ('Int64', 'int64', False), + ('Float32', 'float32', True), + ('UInt32', 'uint32', False), + ('Int32', 'int32', False), + ('UInt16', 'uint16', False), + ('Int16', 'int16', False), + ('UInt8', 'uint8', False), + ('Int8', 'int8', False)] }} -{{for name, dtype, float_group, default_na_value in dtypes}} +{{for name, dtype, float_group in dtypes}} cdef class {{name}}HashTable(HashTable): @@ -487,7 +487,7 @@ cdef class {{name}}HashTable(HashTable): # which is only used if it's *specified*. na_value2 = <{{dtype}}_t>na_value else: - na_value2 = {{default_na_value}} + na_value2 = 0 with nogil: for i in range(n):