Skip to content

PERF: Introducing HashTables for datatypes with 8,16 and 32 bits #37920

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Nov 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
@@ -1,13 +1,27 @@
from numpy cimport intp_t, ndarray

from pandas._libs.khash cimport (
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
kh_float32_t,
kh_float64_t,
kh_int8_t,
kh_int16_t,
kh_int32_t,
kh_int64_t,
kh_pymap_t,
kh_str_t,
kh_uint8_t,
kh_uint16_t,
kh_uint32_t,
kh_uint64_t,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)

Expand All @@ -28,12 +42,54 @@ cdef class Int64HashTable(HashTable):
cpdef get_item(self, int64_t val)
cpdef set_item(self, int64_t key, Py_ssize_t val)

# Declares the HashTable subclass whose items are keyed by uint32_t
# (declaration only; no method bodies appear here).
cdef class UInt32HashTable(HashTable):
# Pointer to the kh_uint32_t table (type cimported from pandas._libs.khash).
cdef kh_uint32_t *table

# Takes a uint32_t; presumably looks up the entry stored for `val` --
# TODO confirm against the implementation.
cpdef get_item(self, uint32_t val)
# Takes a uint32_t key and a Py_ssize_t value; presumably stores `val`
# under `key` -- TODO confirm against the implementation.
cpdef set_item(self, uint32_t key, Py_ssize_t val)

# Declares the HashTable subclass whose items are keyed by int32_t
# (declaration only; no method bodies appear here).
cdef class Int32HashTable(HashTable):
# Pointer to the kh_int32_t table (type cimported from pandas._libs.khash).
cdef kh_int32_t *table

# Takes an int32_t; presumably looks up the entry stored for `val` --
# TODO confirm against the implementation.
cpdef get_item(self, int32_t val)
# Takes an int32_t key and a Py_ssize_t value; presumably stores `val`
# under `key` -- TODO confirm against the implementation.
cpdef set_item(self, int32_t key, Py_ssize_t val)

# Declares the HashTable subclass whose items are keyed by uint16_t
# (declaration only; no method bodies appear here).
cdef class UInt16HashTable(HashTable):
# Pointer to the kh_uint16_t table (type cimported from pandas._libs.khash).
cdef kh_uint16_t *table

# Takes a uint16_t; presumably looks up the entry stored for `val` --
# TODO confirm against the implementation.
cpdef get_item(self, uint16_t val)
# Takes a uint16_t key and a Py_ssize_t value; presumably stores `val`
# under `key` -- TODO confirm against the implementation.
cpdef set_item(self, uint16_t key, Py_ssize_t val)

# Declares the HashTable subclass whose items are keyed by int16_t
# (declaration only; no method bodies appear here).
cdef class Int16HashTable(HashTable):
# Pointer to the kh_int16_t table (type cimported from pandas._libs.khash).
cdef kh_int16_t *table

# Takes an int16_t; presumably looks up the entry stored for `val` --
# TODO confirm against the implementation.
cpdef get_item(self, int16_t val)
# Takes an int16_t key and a Py_ssize_t value; presumably stores `val`
# under `key` -- TODO confirm against the implementation.
cpdef set_item(self, int16_t key, Py_ssize_t val)

# Declares the HashTable subclass whose items are keyed by uint8_t
# (declaration only; no method bodies appear here).
cdef class UInt8HashTable(HashTable):
# Pointer to the kh_uint8_t table (type cimported from pandas._libs.khash).
cdef kh_uint8_t *table

# Takes a uint8_t; presumably looks up the entry stored for `val` --
# TODO confirm against the implementation.
cpdef get_item(self, uint8_t val)
# Takes a uint8_t key and a Py_ssize_t value; presumably stores `val`
# under `key` -- TODO confirm against the implementation.
cpdef set_item(self, uint8_t key, Py_ssize_t val)

# Declares the HashTable subclass whose items are keyed by int8_t
# (declaration only; no method bodies appear here).
cdef class Int8HashTable(HashTable):
# Pointer to the kh_int8_t table (type cimported from pandas._libs.khash).
cdef kh_int8_t *table

# Takes an int8_t; presumably looks up the entry stored for `val` --
# TODO confirm against the implementation.
cpdef get_item(self, int8_t val)
# Takes an int8_t key and a Py_ssize_t value; presumably stores `val`
# under `key` -- TODO confirm against the implementation.
cpdef set_item(self, int8_t key, Py_ssize_t val)

# Declares the HashTable subclass whose items are keyed by float64_t
# (declaration only; no method bodies appear here).
cdef class Float64HashTable(HashTable):
# Pointer to the kh_float64_t table (type cimported from pandas._libs.khash).
cdef kh_float64_t *table

# Takes a float64_t; presumably looks up the entry stored for `val` --
# TODO confirm against the implementation.
cpdef get_item(self, float64_t val)
# Takes a float64_t key and a Py_ssize_t value; presumably stores `val`
# under `key` -- TODO confirm against the implementation.
cpdef set_item(self, float64_t key, Py_ssize_t val)

# Declares the HashTable subclass whose items are keyed by float32_t
# (declaration only; no method bodies appear here).
cdef class Float32HashTable(HashTable):
# Pointer to the kh_float32_t table (type cimported from pandas._libs.khash).
cdef kh_float32_t *table

# Takes a float32_t; presumably looks up the entry stored for `val` --
# TODO confirm against the implementation.
cpdef get_item(self, float32_t val)
# Takes a float32_t key and a Py_ssize_t value; presumably stores `val`
# under `key` -- TODO confirm against the implementation.
cpdef set_item(self, float32_t key, Py_ssize_t val)

cdef class PyObjectHashTable(HashTable):
cdef kh_pymap_t *table

Expand Down
40 changes: 1 addition & 39 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,45 +13,7 @@ cnp.import_array()


from pandas._libs cimport util
from pandas._libs.khash cimport (
kh_destroy_float64,
kh_destroy_int64,
kh_destroy_pymap,
kh_destroy_str,
kh_destroy_uint64,
kh_exist_float64,
kh_exist_int64,
kh_exist_pymap,
kh_exist_str,
kh_exist_uint64,
kh_float64_t,
kh_get_float64,
kh_get_int64,
kh_get_pymap,
kh_get_str,
kh_get_strbox,
kh_get_uint64,
kh_init_float64,
kh_init_int64,
kh_init_pymap,
kh_init_str,
kh_init_strbox,
kh_init_uint64,
kh_int64_t,
kh_put_float64,
kh_put_int64,
kh_put_pymap,
kh_put_str,
kh_put_strbox,
kh_put_uint64,
kh_resize_float64,
kh_resize_int64,
kh_resize_pymap,
kh_resize_str,
kh_resize_uint64,
kh_str_t,
khiter_t,
)
from pandas._libs.khash cimport kh_str_t, khiter_t
from pandas._libs.missing cimport checknull


Expand Down
73 changes: 65 additions & 8 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,35 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""


{{py:

# name: the suffix substituted into kh_destroy_{{name}}, kh_exist_{{name}},
# kh_get_{{name}}, kh_init_{{name}}, kh_put_{{name}} and kh_resize_{{name}}
# by the template loop that iterates over this list.
cimported_types = ['float32',
'float64',
'int8',
'int16',
'int32',
'int64',
'pymap',
'str',
'strbox',
'uint8',
'uint16',
'uint32',
'uint64']
}}

# For every name in cimported_types, cimport the six khash functions
# (destroy, exist, get, init, put, resize) for that name from
# pandas._libs.khash; the loop emits one cimport statement per name.
{{for name in cimported_types}}
from pandas._libs.khash cimport (
kh_destroy_{{name}},
kh_exist_{{name}},
kh_get_{{name}},
kh_init_{{name}},
kh_put_{{name}},
kh_resize_{{name}},
)
{{endfor}}

# ----------------------------------------------------------------------
# VectorData
# ----------------------------------------------------------------------
Expand All @@ -20,9 +49,16 @@ from pandas._libs.missing cimport C_NA
# for uniques in hashtables)

dtypes = [('Float64', 'float64', 'float64_t'),
('Float32', 'float32', 'float32_t'),
('Int64', 'int64', 'int64_t'),
('Int32', 'int32', 'int32_t'),
('Int16', 'int16', 'int16_t'),
('Int8', 'int8', 'int8_t'),
('String', 'string', 'char *'),
('UInt64', 'uint64', 'uint64_t')]
('UInt64', 'uint64', 'uint64_t'),
('UInt32', 'uint32', 'uint32_t'),
('UInt16', 'uint16', 'uint16_t'),
('UInt8', 'uint8', 'uint8_t')]
}}

{{for name, dtype, c_type in dtypes}}
Expand All @@ -49,8 +85,15 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data,

ctypedef fused vector_data:
Int64VectorData
Int32VectorData
Int16VectorData
Int8VectorData
UInt64VectorData
UInt32VectorData
UInt16VectorData
UInt8VectorData
Float64VectorData
Float32VectorData
StringVectorData

cdef inline bint needs_resize(vector_data *data) nogil:
Expand All @@ -65,7 +108,14 @@ cdef inline bint needs_resize(vector_data *data) nogil:
# name, dtype, c_type
dtypes = [('Float64', 'float64', 'float64_t'),
('UInt64', 'uint64', 'uint64_t'),
('Int64', 'int64', 'int64_t')]
('Int64', 'int64', 'int64_t'),
('Float32', 'float32', 'float32_t'),
('UInt32', 'uint32', 'uint32_t'),
('Int32', 'int32', 'int32_t'),
('UInt16', 'uint16', 'uint16_t'),
('Int16', 'int16', 'int16_t'),
('UInt8', 'uint8', 'uint8_t'),
('Int8', 'int8', 'int8_t')]

}}

Expand Down Expand Up @@ -253,15 +303,22 @@ cdef class HashTable:

{{py:

# name, dtype, float_group, default_na_value
dtypes = [('Float64', 'float64', True, 'np.nan'),
('UInt64', 'uint64', False, 0),
('Int64', 'int64', False, 'NPY_NAT')]
# name, dtype, float_group
dtypes = [('Float64', 'float64', True),
('UInt64', 'uint64', False),
('Int64', 'int64', False),
('Float32', 'float32', True),
('UInt32', 'uint32', False),
('Int32', 'int32', False),
('UInt16', 'uint16', False),
('Int16', 'int16', False),
('UInt8', 'uint8', False),
('Int8', 'int8', False)]

}}


{{for name, dtype, float_group, default_na_value in dtypes}}
{{for name, dtype, float_group in dtypes}}

cdef class {{name}}HashTable(HashTable):

Expand Down Expand Up @@ -430,7 +487,7 @@ cdef class {{name}}HashTable(HashTable):
# which is only used if it's *specified*.
na_value2 = <{{dtype}}_t>na_value
else:
na_value2 = {{default_na_value}}
na_value2 = 0

with nogil:
for i in range(n):
Expand Down
18 changes: 16 additions & 2 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,16 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

# dtype, ttype, c_type
dtypes = [('float64', 'float64', 'float64_t'),
('float32', 'float32', 'float32_t'),
('uint64', 'uint64', 'uint64_t'),
('uint32', 'uint32', 'uint32_t'),
('uint16', 'uint16', 'uint16_t'),
('uint8', 'uint8', 'uint8_t'),
('object', 'pymap', 'object'),
('int64', 'int64', 'int64_t')]
('int64', 'int64', 'int64_t'),
('int32', 'int32', 'int32_t'),
('int16', 'int16', 'int16_t'),
('int8', 'int8', 'int8_t')]

}}

Expand Down Expand Up @@ -54,7 +61,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
for i in range(n):
val = values[i]

{{if dtype == 'float64'}}
{{if dtype == 'float64' or dtype == 'float32'}}
if val == val or not dropna:
{{else}}
if True:
Expand Down Expand Up @@ -275,8 +282,15 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values):

# dtype, ctype, table_type, npy_dtype
dtypes = [('float64', 'float64_t', 'float64', 'float64'),
('float32', 'float32_t', 'float32', 'float32'),
('int64', 'int64_t', 'int64', 'int64'),
('int32', 'int32_t', 'int32', 'int32'),
('int16', 'int16_t', 'int16', 'int16'),
('int8', 'int8_t', 'int8', 'int8'),
('uint64', 'uint64_t', 'uint64', 'uint64'),
('uint32', 'uint32_t', 'uint32', 'uint32'),
('uint16', 'uint16_t', 'uint16', 'uint16'),
('uint8', 'uint8_t', 'uint8', 'uint8'),
('object', 'object', 'pymap', 'object_')]
}}

Expand Down
81 changes: 14 additions & 67 deletions pandas/_libs/khash.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
from cpython.object cimport PyObject
from numpy cimport float64_t, int32_t, int64_t, uint32_t, uint64_t
from numpy cimport (
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)


cdef extern from "khash_python.h":
Expand Down Expand Up @@ -67,72 +78,6 @@ cdef extern from "khash_python.h":
void kh_destroy_str_starts(kh_str_starts_t*) nogil
void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil

# khash table struct with int64_t keys and size_t values, followed by the
# functions that operate on it; every function is declared nogil.
# NOTE(review): presumably these mirror definitions in khash_python.h -- the
# enclosing extern block is only partly visible here; confirm.
ctypedef struct kh_int64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
int64_t *keys
size_t *vals

# NOTE(review): what each function does is inferred from its name only
# (init / destroy / clear / get / resize / put / del) -- confirm in the header.
kh_int64_t* kh_init_int64() nogil
void kh_destroy_int64(kh_int64_t*) nogil
void kh_clear_int64(kh_int64_t*) nogil
khint_t kh_get_int64(kh_int64_t*, int64_t) nogil
void kh_resize_int64(kh_int64_t*, khint_t) nogil
# The trailing int* is presumably an out-parameter -- TODO confirm.
khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil
void kh_del_int64(kh_int64_t*, khint_t) nogil

# Takes a table pointer and a khiter_t and returns a bint.
bint kh_exist_int64(kh_int64_t*, khiter_t) nogil

# khuint64_t is an alias of uint64_t; it is the key type of kh_uint64_t below.
ctypedef uint64_t khuint64_t

# khash table struct with khuint64_t keys and size_t values, followed by the
# functions that operate on it; every function is declared nogil.
ctypedef struct kh_uint64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
khuint64_t *keys
size_t *vals

# NOTE(review): what each function does is inferred from its name only
# (init / destroy / clear / get / resize / put / del) -- confirm in the header.
kh_uint64_t* kh_init_uint64() nogil
void kh_destroy_uint64(kh_uint64_t*) nogil
void kh_clear_uint64(kh_uint64_t*) nogil
khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil
void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
# The trailing int* is presumably an out-parameter -- TODO confirm.
khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil
void kh_del_uint64(kh_uint64_t*, khint_t) nogil

# Takes a table pointer and a khiter_t and returns a bint.
bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil

# khash table struct with float64_t keys and size_t values, followed by the
# functions that operate on it; every function is declared nogil.
ctypedef struct kh_float64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
float64_t *keys
size_t *vals

# NOTE(review): what each function does is inferred from its name only
# (init / destroy / clear / get / resize / put / del) -- confirm in the header.
kh_float64_t* kh_init_float64() nogil
void kh_destroy_float64(kh_float64_t*) nogil
void kh_clear_float64(kh_float64_t*) nogil
khint_t kh_get_float64(kh_float64_t*, float64_t) nogil
void kh_resize_float64(kh_float64_t*, khint_t) nogil
# The trailing int* is presumably an out-parameter -- TODO confirm.
khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil
void kh_del_float64(kh_float64_t*, khint_t) nogil

# Takes a table pointer and a khiter_t and returns a bint.
bint kh_exist_float64(kh_float64_t*, khiter_t) nogil

# khash table struct with int32_t keys and size_t values, followed by the
# functions that operate on it; every function is declared nogil.
ctypedef struct kh_int32_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
int32_t *keys
size_t *vals

# NOTE(review): what each function does is inferred from its name only
# (init / destroy / clear / get / resize / put / del) -- confirm in the header.
kh_int32_t* kh_init_int32() nogil
void kh_destroy_int32(kh_int32_t*) nogil
void kh_clear_int32(kh_int32_t*) nogil
khint_t kh_get_int32(kh_int32_t*, int32_t) nogil
void kh_resize_int32(kh_int32_t*, khint_t) nogil
# The trailing int* is presumably an out-parameter -- TODO confirm.
khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil
void kh_del_int32(kh_int32_t*, khint_t) nogil

# Takes a table pointer and a khiter_t and returns a bint.
bint kh_exist_int32(kh_int32_t*, khiter_t) nogil

# sweep factorize

ctypedef struct kh_strbox_t:
Expand All @@ -150,3 +95,5 @@ cdef extern from "khash_python.h":
void kh_del_strbox(kh_strbox_t*, khint_t) nogil

bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil

include "khash_for_primitive_helper.pxi"
Loading