Skip to content

Commit 681f9f5

Browse files
committed
ENH: int64 tinkering with khash
1 parent f596a22 commit 681f9f5

File tree

2 files changed

+108
-1
lines changed

2 files changed

+108
-1
lines changed

pandas/src/khash.pxd

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ cdef extern from "khash.h":
5858
ctypedef struct kh_int64_t:
5959
khint_t n_buckets, size, n_occupied, upper_bound
6060
uint32_t *flags
61-
kh_cstr_t *keys
61+
int64_t *keys
6262
Py_ssize_t *vals
6363

6464
inline kh_int64_t* kh_init_int64()

pandas/src/sandbox.pyx

+107
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,113 @@ cdef class StringHashTable:
321321
# return None
322322
return reverse, labels, counts[:count].copy()
323323

324+
cdef class Int64HashTable:
    """Hash table mapping int64 keys to Py_ssize_t values, backed by khash.

    The underlying ``kh_int64_t`` table is allocated in ``__cinit__`` and
    released in ``__dealloc__``.
    """

    cdef:
        kh_int64_t *table

    def __init__(self, size_hint=1):
        """Optionally resize the table up front.

        Parameters
        ----------
        size_hint : int or None, default 1
            Passed to ``kh_resize_int64``; ``None`` skips the resize.
        """
        if size_hint is not None:
            kh_resize_int64(self.table, size_hint)

    def __cinit__(self):
        self.table = kh_init_int64()

    def __dealloc__(self):
        kh_destroy_int64(self.table)

    cdef inline int check_type(self, object val):
        # NOTE(review): this tests for a *string*, which looks copied from
        # StringHashTable. Nothing in this class calls it -- confirm the
        # intended check before relying on it for int64 keys.
        return PyString_Check(val)

    cpdef get_item(self, int64_t val):
        """Return the value stored for key ``val``.

        Raises
        ------
        KeyError
            If ``val`` is not in the table.
        """
        cdef khiter_t k
        k = kh_get_int64(self.table, val)
        # kh_get returns n_buckets (the end iterator) when the key is absent.
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    def get_iter_test(self, int64_t key, Py_ssize_t iterations):
        """Look up ``key`` ``iterations`` times and discard the result.

        Returns None; appears to exist only to exercise the lookup path in
        a loop (presumably for timing -- confirm with the caller).
        """
        cdef:
            Py_ssize_t i, val = 0
            khiter_t k

        for i in range(iterations):
            # Look up the requested key (the original passed the
            # uninitialised local ``val`` here instead of ``key``).
            k = kh_get_int64(self.table, key)
            if k != self.table.n_buckets:
                val = self.table.vals[k]

    cpdef set_item(self, int64_t key, Py_ssize_t val):
        """Insert ``key`` if needed and store ``val`` for it.

        Raises
        ------
        KeyError
            If the slot returned by ``kh_put_int64`` is not occupied.
        """
        cdef:
            khiter_t k
            int ret

        k = kh_put_int64(self.table, key, &ret)
        self.table.keys[k] = key
        if kh_exist_int64(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    def map_locations(self, ndarray[int64_t] values):
        """Store, for each element of ``values``, its position in the array.

        A key that occurs more than once ends up mapped to the position of
        its last occurrence, because later writes overwrite earlier ones.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            int ret
            int64_t val
            khiter_t k

        for i in range(n):
            val = values[i]
            k = kh_put_int64(self.table, val, &ret)
            self.table.vals[k] = i

    def lookup_locations(self, ndarray[int64_t] values):
        """Return the stored value for each element of ``values``.

        Returns
        -------
        ndarray[int32_t]
            Same length as ``values``; -1 marks keys not in the table.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            int64_t val
            khiter_t k
            ndarray[int32_t] locs = np.empty(n, dtype='i4')

        for i in range(n):
            val = values[i]
            k = kh_get_int64(self.table, val)
            if k != self.table.n_buckets:
                locs[i] = self.table.vals[k]
            else:
                locs[i] = -1

        return locs

    def factorize(self, ndarray[object] values):
        """Assign consecutive integer labels to the values, in first-seen order.

        Each element of ``values`` is coerced to int64 before lookup. Keys
        already present in the table are not relabelled: their stored value
        is used directly as the label and as the index into ``counts``.

        Returns
        -------
        reverse : dict
            Maps each newly assigned label to its int64 value.
        labels : ndarray[int32_t]
            Label of every element of ``values``.
        counts : ndarray[int32_t]
            Occurrence count per newly assigned label (a copy, trimmed to
            the number of labels assigned by this call).
        """
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[int32_t] labels = np.empty(n, dtype=np.int32)
            ndarray[int32_t] counts = np.empty(n, dtype=np.int32)
            dict reverse = {}
            Py_ssize_t idx, count = 0
            int ret
            int64_t val
            khiter_t k

        for i in range(n):
            val = values[i]
            k = kh_get_int64(self.table, val)
            if k != self.table.n_buckets:
                # Seen before: reuse its label and bump the tally.
                idx = self.table.vals[k]
                labels[i] = idx
                counts[idx] = counts[idx] + 1
            else:
                # First occurrence. The lookup above just missed, so this
                # put always creates a fresh slot; the former
                # "delete when ret == 0" branch could never fire and would
                # have removed the slot right before it was written.
                k = kh_put_int64(self.table, val, &ret)
                self.table.vals[k] = count
                reverse[count] = val
                labels[i] = count
                counts[count] = 1
                count += 1

        return reverse, labels, counts[:count].copy()
430+
324431
from libc.stdlib cimport free
325432

326433
cdef class PyObjectHashTable:

0 commit comments

Comments
 (0)