Skip to content

PERF: Introducing HashTables for datatypes with 8,16 and 32 bits #37920

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Nov 21, 2020
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@ from numpy cimport intp_t, ndarray

from pandas._libs.khash cimport (
float64_t,
int32_t,
int64_t,
kh_float64_t,
kh_int32_t,
kh_int64_t,
kh_pymap_t,
kh_str_t,
kh_uint32_t,
kh_uint64_t,
uint32_t,
uint64_t,
)

Expand All @@ -28,6 +32,18 @@ cdef class Int64HashTable(HashTable):
cpdef get_item(self, int64_t val)
cpdef set_item(self, int64_t key, Py_ssize_t val)

cdef class UInt32HashTable(HashTable):
cdef kh_uint32_t *table

cpdef get_item(self, uint32_t val)
cpdef set_item(self, uint32_t key, Py_ssize_t val)

cdef class Int32HashTable(HashTable):
cdef kh_int32_t *table

cpdef get_item(self, int32_t val)
cpdef set_item(self, int32_t key, Py_ssize_t val)

cdef class Float64HashTable(HashTable):
cdef kh_float64_t *table

Expand Down
40 changes: 1 addition & 39 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,45 +13,7 @@ cnp.import_array()


from pandas._libs cimport util
from pandas._libs.khash cimport (
kh_destroy_float64,
kh_destroy_int64,
kh_destroy_pymap,
kh_destroy_str,
kh_destroy_uint64,
kh_exist_float64,
kh_exist_int64,
kh_exist_pymap,
kh_exist_str,
kh_exist_uint64,
kh_float64_t,
kh_get_float64,
kh_get_int64,
kh_get_pymap,
kh_get_str,
kh_get_strbox,
kh_get_uint64,
kh_init_float64,
kh_init_int64,
kh_init_pymap,
kh_init_str,
kh_init_strbox,
kh_init_uint64,
kh_int64_t,
kh_put_float64,
kh_put_int64,
kh_put_pymap,
kh_put_str,
kh_put_strbox,
kh_put_uint64,
kh_resize_float64,
kh_resize_int64,
kh_resize_pymap,
kh_resize_str,
kh_resize_uint64,
kh_str_t,
khiter_t,
)
from pandas._libs.khash cimport kh_str_t, khiter_t
from pandas._libs.missing cimport checknull


Expand Down
38 changes: 35 additions & 3 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,30 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""


{{py:

# name
cimported_types = ['float64',
'int32',
'int64',
'pymap',
'str',
'strbox',
'uint32',
'uint64']
}}

{{for name in cimported_types}}
from pandas._libs.khash cimport (
kh_destroy_{{name}},
kh_exist_{{name}},
kh_get_{{name}},
kh_init_{{name}},
kh_put_{{name}},
kh_resize_{{name}},
)
{{endfor}}

# ----------------------------------------------------------------------
# VectorData
# ----------------------------------------------------------------------
Expand All @@ -21,8 +45,10 @@ from pandas._libs.missing cimport C_NA

dtypes = [('Float64', 'float64', 'float64_t'),
('Int64', 'int64', 'int64_t'),
('Int32', 'int32', 'int32_t'),
('String', 'string', 'char *'),
('UInt64', 'uint64', 'uint64_t')]
('UInt64', 'uint64', 'uint64_t'),
('UInt32', 'uint32', 'uint32_t')]
}}

{{for name, dtype, c_type in dtypes}}
Expand All @@ -49,7 +75,9 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data,

ctypedef fused vector_data:
Int64VectorData
Int32VectorData
UInt64VectorData
UInt32VectorData
Float64VectorData
StringVectorData

Expand All @@ -65,7 +93,9 @@ cdef inline bint needs_resize(vector_data *data) nogil:
# name, dtype, c_type
dtypes = [('Float64', 'float64', 'float64_t'),
('UInt64', 'uint64', 'uint64_t'),
('Int64', 'int64', 'int64_t')]
('Int64', 'int64', 'int64_t'),
('UInt32', 'uint32', 'uint32_t'),
('Int32', 'int32', 'int32_t')]

}}

Expand Down Expand Up @@ -256,7 +286,9 @@ cdef class HashTable:
# name, dtype, float_group, default_na_value
dtypes = [('Float64', 'float64', True, 'np.nan'),
('UInt64', 'uint64', False, 0),
('Int64', 'int64', False, 'NPY_NAT')]
('Int64', 'int64', False, 'NPY_NAT'),
('UInt32', 'uint32', False, 0),
('Int32', 'int32', False, 0)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this migth be a problem not sure

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are you referring to the 0 here?

i looked at this and dont think its an issue, bc it is only a placeholder that never gets used

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This field (default_na_value) is probably not really needed.

Just set na_value2 to 0 here:

{{dtype}}_t val, na_value2

and drop the whole else-branch here:
else:
na_value2 = {{default_na_value}}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah that was my comment, but agree not germane for now.


}}

Expand Down
6 changes: 5 additions & 1 deletion pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
# dtype, ttype, c_type
dtypes = [('float64', 'float64', 'float64_t'),
('uint64', 'uint64', 'uint64_t'),
('uint32', 'uint32', 'uint32_t'),
('object', 'pymap', 'object'),
('int64', 'int64', 'int64_t')]
('int64', 'int64', 'int64_t'),
('int32', 'int32', 'int32_t')]

}}

Expand Down Expand Up @@ -276,7 +278,9 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values):
# dtype, ctype, table_type, npy_dtype
dtypes = [('float64', 'float64_t', 'float64', 'float64'),
('int64', 'int64_t', 'int64', 'int64'),
('int32', 'int32_t', 'int32', 'int32'),
('uint64', 'uint64_t', 'uint64', 'uint64'),
('uint32', 'uint32_t', 'uint32', 'uint32'),
('object', 'object', 'pymap', 'object_')]
}}

Expand Down
68 changes: 2 additions & 66 deletions pandas/_libs/khash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -67,72 +67,6 @@ cdef extern from "khash_python.h":
void kh_destroy_str_starts(kh_str_starts_t*) nogil
void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil

ctypedef struct kh_int64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
int64_t *keys
size_t *vals

kh_int64_t* kh_init_int64() nogil
void kh_destroy_int64(kh_int64_t*) nogil
void kh_clear_int64(kh_int64_t*) nogil
khint_t kh_get_int64(kh_int64_t*, int64_t) nogil
void kh_resize_int64(kh_int64_t*, khint_t) nogil
khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil
void kh_del_int64(kh_int64_t*, khint_t) nogil

bint kh_exist_int64(kh_int64_t*, khiter_t) nogil

ctypedef uint64_t khuint64_t

ctypedef struct kh_uint64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
khuint64_t *keys
size_t *vals

kh_uint64_t* kh_init_uint64() nogil
void kh_destroy_uint64(kh_uint64_t*) nogil
void kh_clear_uint64(kh_uint64_t*) nogil
khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil
void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil
void kh_del_uint64(kh_uint64_t*, khint_t) nogil

bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil

ctypedef struct kh_float64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
float64_t *keys
size_t *vals

kh_float64_t* kh_init_float64() nogil
void kh_destroy_float64(kh_float64_t*) nogil
void kh_clear_float64(kh_float64_t*) nogil
khint_t kh_get_float64(kh_float64_t*, float64_t) nogil
void kh_resize_float64(kh_float64_t*, khint_t) nogil
khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil
void kh_del_float64(kh_float64_t*, khint_t) nogil

bint kh_exist_float64(kh_float64_t*, khiter_t) nogil

ctypedef struct kh_int32_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
int32_t *keys
size_t *vals

kh_int32_t* kh_init_int32() nogil
void kh_destroy_int32(kh_int32_t*) nogil
void kh_clear_int32(kh_int32_t*) nogil
khint_t kh_get_int32(kh_int32_t*, int32_t) nogil
void kh_resize_int32(kh_int32_t*, khint_t) nogil
khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil
void kh_del_int32(kh_int32_t*, khint_t) nogil

bint kh_exist_int32(kh_int32_t*, khiter_t) nogil

# sweep factorize

ctypedef struct kh_strbox_t:
Expand All @@ -150,3 +84,5 @@ cdef extern from "khash_python.h":
void kh_del_strbox(kh_strbox_t*, khint_t) nogil

bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil

include "khash_for_primitive_helper.pxi"
37 changes: 37 additions & 0 deletions pandas/_libs/khash_for_primitive_helper.pxi.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
Template for wrapping khash-tables for each primitive `dtype`

WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

{{py:

# name, c_type
primitive_types = [('int64', 'int64_t'),
('uint64', 'uint64_t'),
('float64', 'float64_t'),
('int32', 'int32_t'),
('uint32', 'uint32_t'),
]
}}

{{for name, c_type in primitive_types}}

cdef extern from "khash_python.h":
ctypedef struct kh_{{name}}_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
{{c_type}} *keys
size_t *vals

kh_{{name}}_t* kh_init_{{name}}() nogil
void kh_destroy_{{name}}(kh_{{name}}_t*) nogil
void kh_clear_{{name}}(kh_{{name}}_t*) nogil
khint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil
void kh_resize_{{name}}(kh_{{name}}_t*, khint_t) nogil
khint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil
void kh_del_{{name}}(kh_{{name}}_t*, khint_t) nogil

bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil

{{endfor}}
5 changes: 5 additions & 0 deletions pandas/_libs/src/klib/khash.h
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,9 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key)
#define KHASH_MAP_INIT_INT(name, khval_t) \
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

#define KHASH_MAP_INIT_UINT(name, khval_t) \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment about why the int32 version can be reused verbatim for the uint32 case

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, it is another way around: a comment is needed why int32-version uses verbatim the uint32 case:) khint32_t is an unsigned int:

typedef unsigned int khint32_t;
so it only naturally to use it for uint32-version

The trick for int32 is that the conversion from signed int to unsigned int is well defined by the standard (see e.g. https://stackoverflow.com/a/50632), thus we can safely pass an int as key (and we don't have something like "get_key", thus it is never casted back to signed int32, which would be implementation defined behavior (and thus not the original value).

Why it is done this way? For example, pandas has tried to improve the situation for int64:

typedef unsigned long khuint64_t;
typedef signed long khint64_t;

The problem now is, that for signed int64 there is implementation defined behavior (>>) and undefined behavior (<<) for negative or "sufficiently large" keys in the hash function:

#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)

as key is now a signed int (see e.g. https://stackoverflow.com/a/4009922).

I would be surprised to see a (sane) compiler which wouldn't do "the right thing" on x86/x86_64, but still...

The hash-function should be probably be

#define kh_int64_hash_func(key) (khint32_t)(((khint64_t)(key))>>33^((khint64_t)(key))^((khint64_t)(key))<<11)

just to be safe.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, it is another way around: a comment is needed why int32-version uses verbatim the uint32 case:)

fine by me as long as its clear to the next reader

KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
Expand Down Expand Up @@ -635,9 +638,11 @@ typedef const char *kh_cstr_t;
#define kh_exist_uint64(h, k) (kh_exist(h, k))
#define kh_exist_int64(h, k) (kh_exist(h, k))
#define kh_exist_int32(h, k) (kh_exist(h, k))
#define kh_exist_uint32(h, k) (kh_exist(h, k))

KHASH_MAP_INIT_STR(str, size_t)
KHASH_MAP_INIT_INT(int32, size_t)
KHASH_MAP_INIT_UINT(uint32, size_t)
KHASH_MAP_INIT_INT64(int64, size_t)
KHASH_MAP_INIT_UINT64(uint64, size_t)

Expand Down
Loading