-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF: Introducing HashTables for datatypes with 8,16 and 32 bits #37920
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
b871012
8983425
9b3c5a5
e2f062b
8a7fc6c
5ab4d68
d9ab327
41d4b57
70c6fc5
15dfe49
8975f06
0ffd3b2
c952d68
6d026b2
a2e5679
897512d
17a3fee
3a4c2bc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
""" | ||
Template for wrapping khash-tables for each primitive `dtype` | ||
|
||
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in | ||
""" | ||
|
||
{{py: | ||
|
||
# name, c_type | ||
primitive_types = [('int64', 'int64_t'), | ||
('uint64', 'uint64_t'), | ||
('float64', 'float64_t'), | ||
('int32', 'int32_t'), | ||
('uint32', 'uint32_t'), | ||
] | ||
}} | ||
|
||
{{for name, c_type in primitive_types}} | ||
|
||
cdef extern from "khash_python.h": | ||
ctypedef struct kh_{{name}}_t: | ||
khint_t n_buckets, size, n_occupied, upper_bound | ||
uint32_t *flags | ||
{{c_type}} *keys | ||
size_t *vals | ||
|
||
kh_{{name}}_t* kh_init_{{name}}() nogil | ||
void kh_destroy_{{name}}(kh_{{name}}_t*) nogil | ||
void kh_clear_{{name}}(kh_{{name}}_t*) nogil | ||
khint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil | ||
void kh_resize_{{name}}(kh_{{name}}_t*, khint_t) nogil | ||
khint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil | ||
void kh_del_{{name}}(kh_{{name}}_t*, khint_t) nogil | ||
|
||
bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil | ||
|
||
{{endfor}} |
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -591,6 +591,9 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) | |||||||||
#define KHASH_MAP_INIT_INT(name, khval_t) \ | ||||||||||
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) | ||||||||||
|
||||||||||
#define KHASH_MAP_INIT_UINT(name, khval_t) \ | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a comment about why the int32 version can be reused verbatim for the uint32 case There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, it is another way around: a comment is needed why int32-version uses verbatim the uint32 case:) pandas/pandas/_libs/src/klib/khash.h Line 119 in 4cfa97a
The trick for int32 is that the conversion from signed int to unsigned int is well defined by the standard (see e.g. https://stackoverflow.com/a/50632), thus we can safely pass an int as key (and we don't have something like "get_key", thus it is never casted back to signed int32, which would be implementation defined behavior (and thus not the original value). Why it is done this way? For example, pandas has tried to improve the situation for int64: pandas/pandas/_libs/src/klib/khash.h Lines 125 to 126 in 4cfa97a
The problem now is, that for signed int64 there is implementation defined behavior (>>) and undefined behavior (<<) for negative or "sufficiently large" keys in the hash function: pandas/pandas/_libs/src/klib/khash.h Line 411 in 4cfa97a
as I would be surprised to see a (sane) compiler which wouldn't do "the right thing" on x86/x86_64, but still... The hash-function should be probably be
just to be safe. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
fine by me as long as its clear to the next reader |
||||||||||
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) | ||||||||||
|
||||||||||
/*! @function | ||||||||||
@abstract Instantiate a hash map containing 64-bit integer keys | ||||||||||
@param name Name of the hash table [symbol] | ||||||||||
|
@@ -635,9 +638,11 @@ typedef const char *kh_cstr_t; | |||||||||
#define kh_exist_uint64(h, k) (kh_exist(h, k)) | ||||||||||
#define kh_exist_int64(h, k) (kh_exist(h, k)) | ||||||||||
#define kh_exist_int32(h, k) (kh_exist(h, k)) | ||||||||||
#define kh_exist_uint32(h, k) (kh_exist(h, k)) | ||||||||||
|
||||||||||
KHASH_MAP_INIT_STR(str, size_t) | ||||||||||
KHASH_MAP_INIT_INT(int32, size_t) | ||||||||||
KHASH_MAP_INIT_UINT(uint32, size_t) | ||||||||||
KHASH_MAP_INIT_INT64(int64, size_t) | ||||||||||
KHASH_MAP_INIT_UINT64(uint64, size_t) | ||||||||||
|
||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this migth be a problem not sure
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
are you referring to the 0 here?
i looked at this and dont think its an issue, bc it is only a placeholder that never gets used
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This field (
default_na_value
) is probably not really needed.Just set
na_value2
to 0 here:pandas/pandas/_libs/hashtable_class_helper.pxi.in
Line 410 in 4cfa97a
and drop the whole else-branch here:
pandas/pandas/_libs/hashtable_class_helper.pxi.in
Lines 432 to 433 in 4cfa97a
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah that was my comment, but agree not germane for now.