Skip to content

Commit 275ef41

Browse files
committed
Add hashtable for C buffer type (ptr + len) avoiding strlen overhead
1 parent 921b897 commit 275ef41

File tree

3 files changed

+178
-1
lines changed

3 files changed

+178
-1
lines changed

pandas/hashtable.pyx

+123-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
1+
from cpython cimport (PyObject, Py_INCREF, PyList_Check, PyTuple_Check,
2+
PyString_AsStringAndSize, PyDict_Copy)
23

34
from khash cimport *
45
from numpy cimport *
@@ -843,6 +844,127 @@ cdef class PyObjectHashTable(HashTable):
843844
return labels
844845

845846

847+
# Build a (pointer, length) view of the byte string `s` without copying.
#
# The pointer written by PyString_AsStringAndSize refers to the string
# object's own storage, so the returned cbuf_t is only usable while `s` is
# alive.
#
# Declared `except *` so that a failure of PyString_AsStringAndSize (for
# example when `s` is not a string object) is propagated to the caller.
# Without an exception clause a struct-returning cdef function has no way to
# signal the error, and the caller would go on to hash whatever happened to be
# in `output`.
cdef inline cbuf_t to_cbuf(object s) except *:
    cdef cbuf_t output
    # Give the struct defined contents before the C API call fills it in.
    output.buf = NULL
    output.len = 0
    PyString_AsStringAndSize(s, <char**>&output.buf, &output.len)
    return output
851+
852+
853+
cdef class CBufHashTable(HashTable):
    """
    Hash table keyed on C buffers (pointer + explicit length) obtained from
    Python string objects via ``to_cbuf``, mapping each key to a ``size_t``
    value.

    NOTE(review): the table stores the buffer pointer taken from the key
    object and nothing in this class takes a reference to that object, so
    callers presumably have to keep the key objects alive for as long as
    the table is in use -- confirm.
    """
    cdef kh_cbuf_map_t *table

    def __cinit__(self, int size_hint=1):
        # `size_hint` is a C int and therefore can never be None, so the
        # table is always resized to the requested hint.
        self.table = kh_init_cbuf_map()
        if self.table == NULL:
            raise MemoryError()
        kh_resize_cbuf_map(self.table, size_hint)

    def __dealloc__(self):
        # __dealloc__ also runs when __cinit__ failed part-way through.
        if self.table != NULL:
            kh_destroy_cbuf_map(self.table)
            self.table = NULL

    cdef inline int check_type(self, object val):
        # Truthy when `val` is a string object that can be used as a key.
        return util.is_string_object(val)

    cpdef get_item(self, object val):
        """
        Return the value stored under key ``val``.

        Raises
        ------
        KeyError
            If ``val`` is not present in the table.
        """
        cdef khiter_t it
        it = kh_get_cbuf_map(self.table, to_cbuf(val))
        # A lookup miss is reported as an iterator equal to n_buckets.
        if it != self.table.n_buckets:
            return self.table.vals[it]
        else:
            raise KeyError(val)

    def get_iter_test(self, object key, Py_ssize_t iterations):
        """
        Look ``key`` up ``iterations`` times and discard the result.

        Returns None; this looks like a timing helper for the lookup path.
        """
        cdef khiter_t it
        cdef Py_ssize_t i, val
        for i in range(iterations):
            it = kh_get_cbuf_map(self.table, to_cbuf(key))
            if it != self.table.n_buckets:
                val = self.table.vals[it]

    cpdef set_item(self, object key, Py_ssize_t val):
        """
        Store ``val`` under ``key``, inserting the key if necessary.

        Raises
        ------
        KeyError
            If the slot returned by the insertion is not marked as occupied.
        """
        cdef:
            khiter_t it
            int ret = 0
            cbuf_t buf

        buf = to_cbuf(key)

        it = kh_put_cbuf_map(self.table, buf, &ret)
        # Point the stored key at the buffer of the object passed in now.
        self.table.keys[it] = buf
        if kh_exist_cbuf_map(self.table, it):
            self.table.vals[it] = val
        else:
            raise KeyError(key)

    def get_indexer(self, ndarray[object] values):
        """
        Return an int64 array holding, for each element of ``values``, the
        value stored for it in the table, or -1 when it is not present.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
            cbuf_t buf
            int64_t[::1] out = labels
            khiter_t it
            kh_cbuf_map_t *table = self.table

        for i in range(n):
            buf = to_cbuf(values[i])
            it = kh_get_cbuf_map(table, buf)
            if it != table.n_buckets:
                out[i] = table.vals[it]
            else:
                out[i] = -1
        return labels

    def unique(self, ndarray[object] values):
        """
        Return the elements of ``values`` that were not already keys of the
        table, in order of first appearance.

        Each such element is inserted as a key; no value is written for the
        new slot here.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            cbuf_t buf
            khiter_t it
            ObjectVector uniques = ObjectVector()

        for i in range(n):
            val = values[i]
            buf = to_cbuf(val)
            it = kh_get_cbuf_map(self.table, buf)
            if it == self.table.n_buckets:
                it = kh_put_cbuf_map(self.table, buf, &ret)
                uniques.append(val)

        return uniques.to_array()

    def factorize(self, ndarray[object] values):
        """
        Label every element of ``values`` with an integer code.

        Keys already in the table keep their stored value as label; each new
        key is inserted with the next counter value (starting at 0).

        Returns
        -------
        (reverse, labels)
            ``reverse`` is a dict mapping each newly assigned label to the
            value that received it; ``labels`` is an int64 array with one
            label per input element.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
            list reverse = []
            Py_ssize_t count = 0
            int ret = 0
            object val
            cbuf_t buf
            khiter_t it

        for i in range(n):
            val = values[i]
            buf = to_cbuf(val)
            it = kh_get_cbuf_map(self.table, buf)
            if it != self.table.n_buckets:
                labels[i] = self.table.vals[it]
            else:
                it = kh_put_cbuf_map(self.table, buf, &ret)
                self.table.vals[it] = count
                reverse.append(val)
                labels[i] = count
                count += 1

        # Build the label -> value mapping with dict(); PyDict_Copy only
        # accepts an existing dict and fails on an enumerate object.
        return dict(enumerate(reverse)), labels
964+
965+
966+
967+
846968
cdef class Factorizer:
847969
cdef public PyObjectHashTable table
848970
cdef public ObjectVector uniques

pandas/src/khash.pxd

+18
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,21 @@ cdef extern from "khash_python.h":
115115

116116
bint kh_exist_strbox(kh_strbox_t*, khiter_t)
117117

118+
# (pointer, length) byte-buffer view; mirrors the `cbuf_t` typedef in
# khash_python.h. `len` is an explicit length, so users of the struct do not
# need to scan `buf` for a terminator.
ctypedef struct cbuf_t:
119+
kh_cstr_t buf
120+
Py_ssize_t len
121+
122+
# Table struct for the `cbuf_map` hash map that khash_python.h instantiates
# with KHASH_INIT(cbuf_map, cbuf_t, size_t, ...): keys are cbuf_t, values are
# size_t. hashtable.pyx compares lookup results against `n_buckets` to detect
# a missing key.
ctypedef struct kh_cbuf_map_t:
123+
khint_t n_buckets, size, n_occupied, upper_bound
124+
uint32_t *flags
125+
cbuf_t *keys
126+
size_t *vals
126+
size_t *vals
127+
128+
# Operations on the cbuf_map table. Keys are passed by value as cbuf_t.
# kh_get_cbuf_map's result is compared with `n_buckets` by callers to detect
# a miss; kh_put_cbuf_map reports its status through the int* argument.
inline kh_cbuf_map_t* kh_init_cbuf_map()
129+
inline void kh_destroy_cbuf_map(kh_cbuf_map_t*)
130+
inline void kh_clear_cbuf_map(kh_cbuf_map_t*)
131+
inline khint_t kh_get_cbuf_map(kh_cbuf_map_t*, cbuf_t)
132+
inline void kh_resize_cbuf_map(kh_cbuf_map_t*, khint_t)
133+
inline khint_t kh_put_cbuf_map(kh_cbuf_map_t*, cbuf_t, int*)
134+
135+
# Defined in khash_python.h as a macro wrapping kh_exist.
bint kh_exist_cbuf_map(kh_cbuf_map_t*, khiter_t)

pandas/src/klib/khash_python.h

+37
Original file line numberDiff line numberDiff line change
@@ -168,4 +168,41 @@ KHASH_SET_INIT_PYOBJECT(pyset)
168168
KHASH_INIT(strbox, kh_cstr_t, kh_pyobject_t, 1,
169169
str_xxhash_hash_func, kh_str_hash_equal)
170170

171+
/* Plain old C buffer structure: a pointer plus an explicit length.
 * cbuf_xxhash and cbuf_equal below work from `len` and never look for a
 * terminating NUL in `buf`. */
172+
typedef struct {
173+
kh_cstr_t buf;
174+
Py_ssize_t len;
175+
} cbuf_t;
176+
177+
/* Hash function for cbuf_t keys.
 *
 * Buffers of length 0 and 1 are answered from XXH32_EMPTY_HASH and the
 * XXH32_ONECHAR_HASH table (indexed by the single byte); every other length
 * is hashed by calling XXH32 over the buffer with XXH32_SEED.
 *
 * NOTE(review): val.len is a Py_ssize_t; whether XXH32's length parameter
 * can hold every such value is not visible here -- confirm. */
static khint_t PANDAS_INLINE cbuf_xxhash(cbuf_t val) {
    if (val.len == 0) {
        return XXH32_EMPTY_HASH;
    }
    if (val.len == 1) {
        return XXH32_ONECHAR_HASH[(uint8_t)val.buf[0]];
    }
    return XXH32(val.buf, val.len, XXH32_SEED);
}
187+
188+
/* Equality function for cbuf_t keys.
 *
 * Returns 1 when both buffers have the same length and the same bytes,
 * 0 otherwise. Two views of the same memory with the same length compare
 * equal without touching the bytes. */
static int PANDAS_INLINE cbuf_equal(cbuf_t a, cbuf_t b) {
    /* Same type as the `len` field: an `int` index would overflow before
     * reaching the end of a buffer longer than INT_MAX. */
    Py_ssize_t i;

    if (a.len != b.len) {
        return 0;
    }
    if (a.buf == b.buf) {
        return 1;
    }
    for (i = 0; i < a.len; ++i) {
        if (a.buf[i] != b.buf[i]) {
            return 0;
        }
    }
    return 1;
}
203+
204+
/* [cbuf_t -> size_t] hash map, hashed with cbuf_xxhash and compared with
 * cbuf_equal. */
205+
KHASH_INIT(cbuf_map, cbuf_t, size_t, 1, cbuf_xxhash, cbuf_equal)
206+
/* Function-style wrapper around kh_exist; khash.pxd declares it as
 * `bint kh_exist_cbuf_map(kh_cbuf_map_t*, khiter_t)`. */
#define kh_exist_cbuf_map(h, k) (kh_exist(h, k))
207+
171208
#endif /* _KLIB_KHASH_PYTHON_H_ */

0 commit comments

Comments
 (0)