pandas-dev · jreback · Nov 21, 2020 · Nov 17, 2020 · Nov 17, 2020 · Nov 17, 2020
diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
@@ -2,12 +2,16 @@ from numpy cimport intp_t, ndarray
 
 from pandas._libs.khash cimport (
     float64_t,
+    int32_t,
     int64_t,
     kh_float64_t,
+    kh_int32_t,
     kh_int64_t,
     kh_pymap_t,
     kh_str_t,
+    kh_uint32_t,
     kh_uint64_t,
+    uint32_t,
     uint64_t,
 )
 
@@ -28,6 +32,18 @@ cdef class Int64HashTable(HashTable):
     cpdef get_item(self, int64_t val)
     cpdef set_item(self, int64_t key, Py_ssize_t val)
 
+cdef class UInt32HashTable(HashTable):
+    cdef kh_uint32_t *table
+
+    cpdef get_item(self, uint32_t val)
+    cpdef set_item(self, uint32_t key, Py_ssize_t val)
+
+cdef class Int32HashTable(HashTable):
+    cdef kh_int32_t *table
+
+    cpdef get_item(self, int32_t val)
+    cpdef set_item(self, int32_t key, Py_ssize_t val)
+
 cdef class Float64HashTable(HashTable):
     cdef kh_float64_t *table
 

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -13,45 +13,7 @@ cnp.import_array()
 
 
 from pandas._libs cimport util
-from pandas._libs.khash cimport (
-    kh_destroy_float64,
-    kh_destroy_int64,
-    kh_destroy_pymap,
-    kh_destroy_str,
-    kh_destroy_uint64,
-    kh_exist_float64,
-    kh_exist_int64,
-    kh_exist_pymap,
-    kh_exist_str,
-    kh_exist_uint64,
-    kh_float64_t,
-    kh_get_float64,
-    kh_get_int64,
-    kh_get_pymap,
-    kh_get_str,
-    kh_get_strbox,
-    kh_get_uint64,
-    kh_init_float64,
-    kh_init_int64,
-    kh_init_pymap,
-    kh_init_str,
-    kh_init_strbox,
-    kh_init_uint64,
-    kh_int64_t,
-    kh_put_float64,
-    kh_put_int64,
-    kh_put_pymap,
-    kh_put_str,
-    kh_put_strbox,
-    kh_put_uint64,
-    kh_resize_float64,
-    kh_resize_int64,
-    kh_resize_pymap,
-    kh_resize_str,
-    kh_resize_uint64,
-    kh_str_t,
-    khiter_t,
-)
+from pandas._libs.khash cimport kh_str_t, khiter_t
 from pandas._libs.missing cimport checknull
 
 

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -5,6 +5,30 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """
 
 
+{{py:
+
+# name
+cimported_types = ['float64',
+                   'int32',
+                   'int64',
+                   'pymap',
+                   'str',
+                   'strbox',
+                   'uint32',
+                   'uint64']
+}}
+
+{{for name in cimported_types}}
+from pandas._libs.khash cimport (
+    kh_destroy_{{name}},
+    kh_exist_{{name}},
+    kh_get_{{name}},
+    kh_init_{{name}},
+    kh_put_{{name}},
+    kh_resize_{{name}},
+)
+{{endfor}}
+
 # ----------------------------------------------------------------------
 # VectorData
 # ----------------------------------------------------------------------
@@ -21,8 +45,10 @@ from pandas._libs.missing cimport C_NA
 
 dtypes = [('Float64', 'float64', 'float64_t'),
           ('Int64', 'int64', 'int64_t'),
+          ('Int32', 'int32', 'int32_t'),
           ('String', 'string', 'char *'),
-          ('UInt64', 'uint64', 'uint64_t')]
+          ('UInt64', 'uint64', 'uint64_t'),
+          ('UInt32', 'uint32', 'uint32_t')]
 }}
 
 {{for name, dtype, c_type in dtypes}}
@@ -49,7 +75,9 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data,
 
 ctypedef fused vector_data:
     Int64VectorData
+    Int32VectorData
     UInt64VectorData
+    UInt32VectorData
     Float64VectorData
     StringVectorData
 
@@ -65,7 +93,9 @@ cdef inline bint needs_resize(vector_data *data) nogil:
 # name, dtype, c_type
 dtypes = [('Float64', 'float64', 'float64_t'),
           ('UInt64', 'uint64', 'uint64_t'),
-          ('Int64', 'int64', 'int64_t')]
+          ('Int64', 'int64', 'int64_t'),
+          ('UInt32', 'uint32', 'uint32_t'),
+          ('Int32', 'int32', 'int32_t')]
 
 }}
 
@@ -256,7 +286,9 @@ cdef class HashTable:
 # name, dtype, float_group, default_na_value
 dtypes = [('Float64', 'float64', True, 'np.nan'),
           ('UInt64', 'uint64', False, 0),
-          ('Int64', 'int64', False, 'NPY_NAT')]
+          ('Int64', 'int64', False, 'NPY_NAT'),
+          ('UInt32', 'uint32', False, 0),
+          ('Int32', 'int32', False, 0)]
 {{dtype}}_t val, na_value2 
 else: 
     na_value2 = {{default_na_value}} 
 {{dtype}}_t val, na_value2 
 else: 
     na_value2 = {{default_na_value}} 
 
 }}
 

diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -9,8 +9,10 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 # dtype, ttype, c_type
 dtypes = [('float64', 'float64', 'float64_t'),
           ('uint64', 'uint64', 'uint64_t'),
+          ('uint32', 'uint32', 'uint32_t'),
           ('object', 'pymap', 'object'),
-          ('int64', 'int64', 'int64_t')]
+          ('int64', 'int64', 'int64_t'),
+          ('int32', 'int32', 'int32_t')]
 
 }}
 
@@ -276,7 +278,9 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values):
 # dtype, ctype, table_type, npy_dtype
 dtypes = [('float64', 'float64_t', 'float64', 'float64'),
           ('int64', 'int64_t', 'int64', 'int64'),
+          ('int32', 'int32_t', 'int32', 'int32'),
           ('uint64', 'uint64_t', 'uint64', 'uint64'),
+          ('uint32', 'uint32_t', 'uint32', 'uint32'),
           ('object', 'object', 'pymap', 'object_')]
 }}
 

diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd
@@ -67,72 +67,6 @@ cdef extern from "khash_python.h":
     void kh_destroy_str_starts(kh_str_starts_t*) nogil
     void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil
 
-    ctypedef struct kh_int64_t:
-        khint_t n_buckets, size, n_occupied, upper_bound
-        uint32_t *flags
-        int64_t *keys
-        size_t *vals
-
-    kh_int64_t* kh_init_int64() nogil
-    void kh_destroy_int64(kh_int64_t*) nogil
-    void kh_clear_int64(kh_int64_t*) nogil
-    khint_t kh_get_int64(kh_int64_t*, int64_t) nogil
-    void kh_resize_int64(kh_int64_t*, khint_t) nogil
-    khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil
-    void kh_del_int64(kh_int64_t*, khint_t) nogil
-
-    bint kh_exist_int64(kh_int64_t*, khiter_t) nogil
-
-    ctypedef uint64_t khuint64_t
-
-    ctypedef struct kh_uint64_t:
-        khint_t n_buckets, size, n_occupied, upper_bound
-        uint32_t *flags
-        khuint64_t *keys
-        size_t *vals
-
-    kh_uint64_t* kh_init_uint64() nogil
-    void kh_destroy_uint64(kh_uint64_t*) nogil
-    void kh_clear_uint64(kh_uint64_t*) nogil
-    khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil
-    void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
-    khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil
-    void kh_del_uint64(kh_uint64_t*, khint_t) nogil
-
-    bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil
-
-    ctypedef struct kh_float64_t:
-        khint_t n_buckets, size, n_occupied, upper_bound
-        uint32_t *flags
-        float64_t *keys
-        size_t *vals
-
-    kh_float64_t* kh_init_float64() nogil
-    void kh_destroy_float64(kh_float64_t*) nogil
-    void kh_clear_float64(kh_float64_t*) nogil
-    khint_t kh_get_float64(kh_float64_t*, float64_t) nogil
-    void kh_resize_float64(kh_float64_t*, khint_t) nogil
-    khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil
-    void kh_del_float64(kh_float64_t*, khint_t) nogil
-
-    bint kh_exist_float64(kh_float64_t*, khiter_t) nogil
-
-    ctypedef struct kh_int32_t:
-        khint_t n_buckets, size, n_occupied, upper_bound
-        uint32_t *flags
-        int32_t *keys
-        size_t *vals
-
-    kh_int32_t* kh_init_int32() nogil
-    void kh_destroy_int32(kh_int32_t*) nogil
-    void kh_clear_int32(kh_int32_t*) nogil
-    khint_t kh_get_int32(kh_int32_t*, int32_t) nogil
-    void kh_resize_int32(kh_int32_t*, khint_t) nogil
-    khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil
-    void kh_del_int32(kh_int32_t*, khint_t) nogil
-
-    bint kh_exist_int32(kh_int32_t*, khiter_t) nogil
-
     # sweep factorize
 
     ctypedef struct kh_strbox_t:
@@ -150,3 +84,5 @@ cdef extern from "khash_python.h":
     void kh_del_strbox(kh_strbox_t*, khint_t) nogil
 
     bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
+
+include "khash_for_primitive_helper.pxi"
diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in
@@ -0,0 +1,37 @@
+"""
+Template for wrapping khash-tables for each primitive `dtype`
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+{{py:
+
+# name, c_type
+primitive_types = [('int64', 'int64_t'),
+                   ('uint64', 'uint64_t'),
+                   ('float64', 'float64_t'),
+                   ('int32', 'int32_t'),
+                   ('uint32', 'uint32_t'),
+                  ]
+}}
+
+{{for name, c_type in primitive_types}}
+
+cdef extern from "khash_python.h":
+    ctypedef struct kh_{{name}}_t:
+        khint_t n_buckets, size, n_occupied, upper_bound
+        uint32_t *flags
+        {{c_type}} *keys
+        size_t *vals
+
+    kh_{{name}}_t* kh_init_{{name}}() nogil
+    void kh_destroy_{{name}}(kh_{{name}}_t*) nogil
+    void kh_clear_{{name}}(kh_{{name}}_t*) nogil
+    khint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil
+    void kh_resize_{{name}}(kh_{{name}}_t*, khint_t) nogil
+    khint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil
+    void kh_del_{{name}}(kh_{{name}}_t*, khint_t) nogil
+
+    bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil
+
+{{endfor}}
diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h
@@ -591,6 +591,9 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key)
 #define KHASH_MAP_INIT_INT(name, khval_t)								\
 	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
 
+#define KHASH_MAP_INIT_UINT(name, khval_t)								\
 typedef unsigned int khint32_t; 
 typedef unsigned long khuint64_t; 
 typedef signed long khint64_t; 
 #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) 
 typedef unsigned int khint32_t; 
 typedef unsigned long khuint64_t; 
 typedef signed long khint64_t; 
 #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) 
+	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
 /*! @function
   @abstract     Instantiate a hash map containing 64-bit integer keys
   @param  name  Name of the hash table [symbol]
@@ -635,9 +638,11 @@ typedef const char *kh_cstr_t;
 #define kh_exist_uint64(h, k) (kh_exist(h, k))
 #define kh_exist_int64(h, k) (kh_exist(h, k))
 #define kh_exist_int32(h, k) (kh_exist(h, k))
+#define kh_exist_uint32(h, k) (kh_exist(h, k))
 
 KHASH_MAP_INIT_STR(str, size_t)
 KHASH_MAP_INIT_INT(int32, size_t)
+KHASH_MAP_INIT_UINT(uint32, size_t)
 KHASH_MAP_INIT_INT64(int64, size_t)
 KHASH_MAP_INIT_UINT64(uint64, size_t)