Skip to content

Commit fb2bd10

Browse files
authored
PERF: Introducing HashTables for datatypes with 8,16 and 32 bits (#37920)
1 parent 6454943 commit fb2bd10

11 files changed

+551
-125
lines changed

pandas/_libs/hashtable.pxd

+56
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,27 @@
11
from numpy cimport intp_t, ndarray
22

33
from pandas._libs.khash cimport (
4+
float32_t,
45
float64_t,
6+
int8_t,
7+
int16_t,
8+
int32_t,
59
int64_t,
10+
kh_float32_t,
611
kh_float64_t,
12+
kh_int8_t,
13+
kh_int16_t,
14+
kh_int32_t,
715
kh_int64_t,
816
kh_pymap_t,
917
kh_str_t,
18+
kh_uint8_t,
19+
kh_uint16_t,
20+
kh_uint32_t,
1021
kh_uint64_t,
22+
uint8_t,
23+
uint16_t,
24+
uint32_t,
1125
uint64_t,
1226
)
1327

@@ -28,12 +42,54 @@ cdef class Int64HashTable(HashTable):
2842
cpdef get_item(self, int64_t val)
2943
cpdef set_item(self, int64_t key, Py_ssize_t val)
3044

45+
cdef class UInt32HashTable(HashTable):
46+
cdef kh_uint32_t *table
47+
48+
cpdef get_item(self, uint32_t val)
49+
cpdef set_item(self, uint32_t key, Py_ssize_t val)
50+
51+
cdef class Int32HashTable(HashTable):
52+
cdef kh_int32_t *table
53+
54+
cpdef get_item(self, int32_t val)
55+
cpdef set_item(self, int32_t key, Py_ssize_t val)
56+
57+
cdef class UInt16HashTable(HashTable):
58+
cdef kh_uint16_t *table
59+
60+
cpdef get_item(self, uint16_t val)
61+
cpdef set_item(self, uint16_t key, Py_ssize_t val)
62+
63+
cdef class Int16HashTable(HashTable):
64+
cdef kh_int16_t *table
65+
66+
cpdef get_item(self, int16_t val)
67+
cpdef set_item(self, int16_t key, Py_ssize_t val)
68+
69+
cdef class UInt8HashTable(HashTable):
70+
cdef kh_uint8_t *table
71+
72+
cpdef get_item(self, uint8_t val)
73+
cpdef set_item(self, uint8_t key, Py_ssize_t val)
74+
75+
cdef class Int8HashTable(HashTable):
76+
cdef kh_int8_t *table
77+
78+
cpdef get_item(self, int8_t val)
79+
cpdef set_item(self, int8_t key, Py_ssize_t val)
80+
3181
cdef class Float64HashTable(HashTable):
3282
cdef kh_float64_t *table
3383

3484
cpdef get_item(self, float64_t val)
3585
cpdef set_item(self, float64_t key, Py_ssize_t val)
3686

87+
cdef class Float32HashTable(HashTable):
88+
cdef kh_float32_t *table
89+
90+
cpdef get_item(self, float32_t val)
91+
cpdef set_item(self, float32_t key, Py_ssize_t val)
92+
3793
cdef class PyObjectHashTable(HashTable):
3894
cdef kh_pymap_t *table
3995

pandas/_libs/hashtable.pyx

+1-39
Original file line numberDiff line numberDiff line change
@@ -13,45 +13,7 @@ cnp.import_array()
1313

1414

1515
from pandas._libs cimport util
16-
from pandas._libs.khash cimport (
17-
kh_destroy_float64,
18-
kh_destroy_int64,
19-
kh_destroy_pymap,
20-
kh_destroy_str,
21-
kh_destroy_uint64,
22-
kh_exist_float64,
23-
kh_exist_int64,
24-
kh_exist_pymap,
25-
kh_exist_str,
26-
kh_exist_uint64,
27-
kh_float64_t,
28-
kh_get_float64,
29-
kh_get_int64,
30-
kh_get_pymap,
31-
kh_get_str,
32-
kh_get_strbox,
33-
kh_get_uint64,
34-
kh_init_float64,
35-
kh_init_int64,
36-
kh_init_pymap,
37-
kh_init_str,
38-
kh_init_strbox,
39-
kh_init_uint64,
40-
kh_int64_t,
41-
kh_put_float64,
42-
kh_put_int64,
43-
kh_put_pymap,
44-
kh_put_str,
45-
kh_put_strbox,
46-
kh_put_uint64,
47-
kh_resize_float64,
48-
kh_resize_int64,
49-
kh_resize_pymap,
50-
kh_resize_str,
51-
kh_resize_uint64,
52-
kh_str_t,
53-
khiter_t,
54-
)
16+
from pandas._libs.khash cimport kh_str_t, khiter_t
5517
from pandas._libs.missing cimport checknull
5618

5719

pandas/_libs/hashtable_class_helper.pxi.in

+65-8
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,35 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
55
"""
66

77

8+
{{py:
9+
10+
# name
11+
cimported_types = ['float32',
12+
'float64',
13+
'int8',
14+
'int16',
15+
'int32',
16+
'int64',
17+
'pymap',
18+
'str',
19+
'strbox',
20+
'uint8',
21+
'uint16',
22+
'uint32',
23+
'uint64']
24+
}}
25+
26+
{{for name in cimported_types}}
27+
from pandas._libs.khash cimport (
28+
kh_destroy_{{name}},
29+
kh_exist_{{name}},
30+
kh_get_{{name}},
31+
kh_init_{{name}},
32+
kh_put_{{name}},
33+
kh_resize_{{name}},
34+
)
35+
{{endfor}}
36+
837
# ----------------------------------------------------------------------
938
# VectorData
1039
# ----------------------------------------------------------------------
@@ -20,9 +49,16 @@ from pandas._libs.missing cimport C_NA
2049
# for uniques in hashtables)
2150

2251
dtypes = [('Float64', 'float64', 'float64_t'),
52+
('Float32', 'float32', 'float32_t'),
2353
('Int64', 'int64', 'int64_t'),
54+
('Int32', 'int32', 'int32_t'),
55+
('Int16', 'int16', 'int16_t'),
56+
('Int8', 'int8', 'int8_t'),
2457
('String', 'string', 'char *'),
25-
('UInt64', 'uint64', 'uint64_t')]
58+
('UInt64', 'uint64', 'uint64_t'),
59+
('UInt32', 'uint32', 'uint32_t'),
60+
('UInt16', 'uint16', 'uint16_t'),
61+
('UInt8', 'uint8', 'uint8_t')]
2662
}}
2763

2864
{{for name, dtype, c_type in dtypes}}
@@ -49,8 +85,15 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data,
4985

5086
ctypedef fused vector_data:
5187
Int64VectorData
88+
Int32VectorData
89+
Int16VectorData
90+
Int8VectorData
5291
UInt64VectorData
92+
UInt32VectorData
93+
UInt16VectorData
94+
UInt8VectorData
5395
Float64VectorData
96+
Float32VectorData
5497
StringVectorData
5598

5699
cdef inline bint needs_resize(vector_data *data) nogil:
@@ -65,7 +108,14 @@ cdef inline bint needs_resize(vector_data *data) nogil:
65108
# name, dtype, c_type
66109
dtypes = [('Float64', 'float64', 'float64_t'),
67110
('UInt64', 'uint64', 'uint64_t'),
68-
('Int64', 'int64', 'int64_t')]
111+
('Int64', 'int64', 'int64_t'),
112+
('Float32', 'float32', 'float32_t'),
113+
('UInt32', 'uint32', 'uint32_t'),
114+
('Int32', 'int32', 'int32_t'),
115+
('UInt16', 'uint16', 'uint16_t'),
116+
('Int16', 'int16', 'int16_t'),
117+
('UInt8', 'uint8', 'uint8_t'),
118+
('Int8', 'int8', 'int8_t')]
69119

70120
}}
71121

@@ -253,15 +303,22 @@ cdef class HashTable:
253303

254304
{{py:
255305

256-
# name, dtype, float_group, default_na_value
257-
dtypes = [('Float64', 'float64', True, 'np.nan'),
258-
('UInt64', 'uint64', False, 0),
259-
('Int64', 'int64', False, 'NPY_NAT')]
306+
# name, dtype, float_group
307+
dtypes = [('Float64', 'float64', True),
308+
('UInt64', 'uint64', False),
309+
('Int64', 'int64', False),
310+
('Float32', 'float32', True),
311+
('UInt32', 'uint32', False),
312+
('Int32', 'int32', False),
313+
('UInt16', 'uint16', False),
314+
('Int16', 'int16', False),
315+
('UInt8', 'uint8', False),
316+
('Int8', 'int8', False)]
260317

261318
}}
262319

263320

264-
{{for name, dtype, float_group, default_na_value in dtypes}}
321+
{{for name, dtype, float_group in dtypes}}
265322

266323
cdef class {{name}}HashTable(HashTable):
267324

@@ -430,7 +487,7 @@ cdef class {{name}}HashTable(HashTable):
430487
# which is only used if it's *specified*.
431488
na_value2 = <{{dtype}}_t>na_value
432489
else:
433-
na_value2 = {{default_na_value}}
490+
na_value2 = 0
434491

435492
with nogil:
436493
for i in range(n):

pandas/_libs/hashtable_func_helper.pxi.in

+16-2
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,16 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
88

99
# dtype, ttype, c_type
1010
dtypes = [('float64', 'float64', 'float64_t'),
11+
('float32', 'float32', 'float32_t'),
1112
('uint64', 'uint64', 'uint64_t'),
13+
('uint32', 'uint32', 'uint32_t'),
14+
('uint16', 'uint16', 'uint16_t'),
15+
('uint8', 'uint8', 'uint8_t'),
1216
('object', 'pymap', 'object'),
13-
('int64', 'int64', 'int64_t')]
17+
('int64', 'int64', 'int64_t'),
18+
('int32', 'int32', 'int32_t'),
19+
('int16', 'int16', 'int16_t'),
20+
('int8', 'int8', 'int8_t')]
1421

1522
}}
1623

@@ -54,7 +61,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
5461
for i in range(n):
5562
val = values[i]
5663

57-
{{if dtype == 'float64'}}
64+
{{if dtype == 'float64' or dtype == 'float32'}}
5865
if val == val or not dropna:
5966
{{else}}
6067
if True:
@@ -275,8 +282,15 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values):
275282

276283
# dtype, ctype, table_type, npy_dtype
277284
dtypes = [('float64', 'float64_t', 'float64', 'float64'),
285+
('float32', 'float32_t', 'float32', 'float32'),
278286
('int64', 'int64_t', 'int64', 'int64'),
287+
('int32', 'int32_t', 'int32', 'int32'),
288+
('int16', 'int16_t', 'int16', 'int16'),
289+
('int8', 'int8_t', 'int8', 'int8'),
279290
('uint64', 'uint64_t', 'uint64', 'uint64'),
291+
('uint32', 'uint32_t', 'uint32', 'uint32'),
292+
('uint16', 'uint16_t', 'uint16', 'uint16'),
293+
('uint8', 'uint8_t', 'uint8', 'uint8'),
280294
('object', 'object', 'pymap', 'object_')]
281295
}}
282296

pandas/_libs/khash.pxd

+14-67
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
from cpython.object cimport PyObject
2-
from numpy cimport float64_t, int32_t, int64_t, uint32_t, uint64_t
2+
from numpy cimport (
3+
float32_t,
4+
float64_t,
5+
int8_t,
6+
int16_t,
7+
int32_t,
8+
int64_t,
9+
uint8_t,
10+
uint16_t,
11+
uint32_t,
12+
uint64_t,
13+
)
314

415

516
cdef extern from "khash_python.h":
@@ -67,72 +78,6 @@ cdef extern from "khash_python.h":
6778
void kh_destroy_str_starts(kh_str_starts_t*) nogil
6879
void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil
6980

70-
ctypedef struct kh_int64_t:
71-
khint_t n_buckets, size, n_occupied, upper_bound
72-
uint32_t *flags
73-
int64_t *keys
74-
size_t *vals
75-
76-
kh_int64_t* kh_init_int64() nogil
77-
void kh_destroy_int64(kh_int64_t*) nogil
78-
void kh_clear_int64(kh_int64_t*) nogil
79-
khint_t kh_get_int64(kh_int64_t*, int64_t) nogil
80-
void kh_resize_int64(kh_int64_t*, khint_t) nogil
81-
khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil
82-
void kh_del_int64(kh_int64_t*, khint_t) nogil
83-
84-
bint kh_exist_int64(kh_int64_t*, khiter_t) nogil
85-
86-
ctypedef uint64_t khuint64_t
87-
88-
ctypedef struct kh_uint64_t:
89-
khint_t n_buckets, size, n_occupied, upper_bound
90-
uint32_t *flags
91-
khuint64_t *keys
92-
size_t *vals
93-
94-
kh_uint64_t* kh_init_uint64() nogil
95-
void kh_destroy_uint64(kh_uint64_t*) nogil
96-
void kh_clear_uint64(kh_uint64_t*) nogil
97-
khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil
98-
void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
99-
khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil
100-
void kh_del_uint64(kh_uint64_t*, khint_t) nogil
101-
102-
bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil
103-
104-
ctypedef struct kh_float64_t:
105-
khint_t n_buckets, size, n_occupied, upper_bound
106-
uint32_t *flags
107-
float64_t *keys
108-
size_t *vals
109-
110-
kh_float64_t* kh_init_float64() nogil
111-
void kh_destroy_float64(kh_float64_t*) nogil
112-
void kh_clear_float64(kh_float64_t*) nogil
113-
khint_t kh_get_float64(kh_float64_t*, float64_t) nogil
114-
void kh_resize_float64(kh_float64_t*, khint_t) nogil
115-
khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil
116-
void kh_del_float64(kh_float64_t*, khint_t) nogil
117-
118-
bint kh_exist_float64(kh_float64_t*, khiter_t) nogil
119-
120-
ctypedef struct kh_int32_t:
121-
khint_t n_buckets, size, n_occupied, upper_bound
122-
uint32_t *flags
123-
int32_t *keys
124-
size_t *vals
125-
126-
kh_int32_t* kh_init_int32() nogil
127-
void kh_destroy_int32(kh_int32_t*) nogil
128-
void kh_clear_int32(kh_int32_t*) nogil
129-
khint_t kh_get_int32(kh_int32_t*, int32_t) nogil
130-
void kh_resize_int32(kh_int32_t*, khint_t) nogil
131-
khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil
132-
void kh_del_int32(kh_int32_t*, khint_t) nogil
133-
134-
bint kh_exist_int32(kh_int32_t*, khiter_t) nogil
135-
13681
# sweep factorize
13782

13883
ctypedef struct kh_strbox_t:
@@ -150,3 +95,5 @@ cdef extern from "khash_python.h":
15095
void kh_del_strbox(kh_strbox_t*, khint_t) nogil
15196

15297
bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
98+
99+
include "khash_for_primitive_helper.pxi"

0 commit comments

Comments
 (0)