pandas-dev · jreback · Nov 21, 2020 · Nov 17, 2020 · Nov 17, 2020 · Nov 17, 2020
diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
@@ -1,13 +1,27 @@
 from numpy cimport intp_t, ndarray
 
 from pandas._libs.khash cimport (
+    float32_t,
     float64_t,
+    int8_t,
+    int16_t,
+    int32_t,
     int64_t,
+    kh_float32_t,
     kh_float64_t,
+    kh_int8_t,
+    kh_int16_t,
+    kh_int32_t,
     kh_int64_t,
     kh_pymap_t,
     kh_str_t,
+    kh_uint8_t,
+    kh_uint16_t,
+    kh_uint32_t,
     kh_uint64_t,
+    uint8_t,
+    uint16_t,
+    uint32_t,
     uint64_t,
 )
 
@@ -28,12 +42,54 @@ cdef class Int64HashTable(HashTable):
     cpdef get_item(self, int64_t val)
     cpdef set_item(self, int64_t key, Py_ssize_t val)
 
+cdef class UInt32HashTable(HashTable):
+    cdef kh_uint32_t *table
+
+    cpdef get_item(self, uint32_t val)
+    cpdef set_item(self, uint32_t key, Py_ssize_t val)
+
+cdef class Int32HashTable(HashTable):
+    cdef kh_int32_t *table
+
+    cpdef get_item(self, int32_t val)
+    cpdef set_item(self, int32_t key, Py_ssize_t val)
+
+cdef class UInt16HashTable(HashTable):
+    cdef kh_uint16_t *table
+
+    cpdef get_item(self, uint16_t val)
+    cpdef set_item(self, uint16_t key, Py_ssize_t val)
+
+cdef class Int16HashTable(HashTable):
+    cdef kh_int16_t *table
+
+    cpdef get_item(self, int16_t val)
+    cpdef set_item(self, int16_t key, Py_ssize_t val)
+
+cdef class UInt8HashTable(HashTable):
+    cdef kh_uint8_t *table
+
+    cpdef get_item(self, uint8_t val)
+    cpdef set_item(self, uint8_t key, Py_ssize_t val)
+
+cdef class Int8HashTable(HashTable):
+    cdef kh_int8_t *table
+
+    cpdef get_item(self, int8_t val)
+    cpdef set_item(self, int8_t key, Py_ssize_t val)
+
 cdef class Float64HashTable(HashTable):
     cdef kh_float64_t *table
 
     cpdef get_item(self, float64_t val)
     cpdef set_item(self, float64_t key, Py_ssize_t val)
 
+cdef class Float32HashTable(HashTable):
+    cdef kh_float32_t *table
+
+    cpdef get_item(self, float32_t val)
+    cpdef set_item(self, float32_t key, Py_ssize_t val)
+
 cdef class PyObjectHashTable(HashTable):
     cdef kh_pymap_t *table
 

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -13,45 +13,7 @@ cnp.import_array()
 
 
 from pandas._libs cimport util
-from pandas._libs.khash cimport (
-    kh_destroy_float64,
-    kh_destroy_int64,
-    kh_destroy_pymap,
-    kh_destroy_str,
-    kh_destroy_uint64,
-    kh_exist_float64,
-    kh_exist_int64,
-    kh_exist_pymap,
-    kh_exist_str,
-    kh_exist_uint64,
-    kh_float64_t,
-    kh_get_float64,
-    kh_get_int64,
-    kh_get_pymap,
-    kh_get_str,
-    kh_get_strbox,
-    kh_get_uint64,
-    kh_init_float64,
-    kh_init_int64,
-    kh_init_pymap,
-    kh_init_str,
-    kh_init_strbox,
-    kh_init_uint64,
-    kh_int64_t,
-    kh_put_float64,
-    kh_put_int64,
-    kh_put_pymap,
-    kh_put_str,
-    kh_put_strbox,
-    kh_put_uint64,
-    kh_resize_float64,
-    kh_resize_int64,
-    kh_resize_pymap,
-    kh_resize_str,
-    kh_resize_uint64,
-    kh_str_t,
-    khiter_t,
-)
+from pandas._libs.khash cimport kh_str_t, khiter_t
 from pandas._libs.missing cimport checknull
 
 

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -5,6 +5,35 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """
 
 
+{{py:
+
+# name: khash type suffix; the loop below cimports the kh_*_<name> functions for each entry
+cimported_types = ['float32',
+                   'float64',
+                   'int8',
+                   'int16',
+                   'int32',
+                   'int64',
+                   'pymap',
+                   'str',
+                   'strbox',
+                   'uint8',
+                   'uint16',
+                   'uint32',
+                   'uint64']
+}}
+
+{{for name in cimported_types}}
+from pandas._libs.khash cimport (
+    kh_destroy_{{name}},
+    kh_exist_{{name}},
+    kh_get_{{name}},
+    kh_init_{{name}},
+    kh_put_{{name}},
+    kh_resize_{{name}},
+)
+{{endfor}}
+
 # ----------------------------------------------------------------------
 # VectorData
 # ----------------------------------------------------------------------
@@ -20,9 +49,16 @@ from pandas._libs.missing cimport C_NA
 # for uniques in hashtables)
 
 dtypes = [('Float64', 'float64', 'float64_t'),
+          ('Float32', 'float32', 'float32_t'),
           ('Int64', 'int64', 'int64_t'),
+          ('Int32', 'int32', 'int32_t'),
+          ('Int16', 'int16', 'int16_t'),
+          ('Int8', 'int8', 'int8_t'),
           ('String', 'string', 'char *'),
-          ('UInt64', 'uint64', 'uint64_t')]
+          ('UInt64', 'uint64', 'uint64_t'),
+          ('UInt32', 'uint32', 'uint32_t'),
+          ('UInt16', 'uint16', 'uint16_t'),
+          ('UInt8', 'uint8', 'uint8_t')]
 }}
 
 {{for name, dtype, c_type in dtypes}}
@@ -49,8 +85,15 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data,
 
 ctypedef fused vector_data:
     Int64VectorData
+    Int32VectorData
+    Int16VectorData
+    Int8VectorData
     UInt64VectorData
+    UInt32VectorData
+    UInt16VectorData
+    UInt8VectorData
     Float64VectorData
+    Float32VectorData
     StringVectorData
 
 cdef inline bint needs_resize(vector_data *data) nogil:
@@ -65,7 +108,14 @@ cdef inline bint needs_resize(vector_data *data) nogil:
 # name, dtype, c_type
 dtypes = [('Float64', 'float64', 'float64_t'),
           ('UInt64', 'uint64', 'uint64_t'),
-          ('Int64', 'int64', 'int64_t')]
+          ('Int64', 'int64', 'int64_t'),
+          ('Float32', 'float32', 'float32_t'),
+          ('UInt32', 'uint32', 'uint32_t'),
+          ('Int32', 'int32', 'int32_t'),
+          ('UInt16', 'uint16', 'uint16_t'),
+          ('Int16', 'int16', 'int16_t'),
+          ('UInt8', 'uint8', 'uint8_t'),
+          ('Int8', 'int8', 'int8_t')]
 
 }}
 
@@ -253,15 +303,22 @@ cdef class HashTable:
 
 {{py:
 
-# name, dtype, float_group, default_na_value
-dtypes = [('Float64', 'float64', True, 'np.nan'),
-          ('UInt64', 'uint64', False, 0),
-          ('Int64', 'int64', False, 'NPY_NAT')]
+# name, dtype, float_group
+dtypes = [('Float64', 'float64', True),
+          ('UInt64', 'uint64', False),
+          ('Int64', 'int64', False),
+          ('Float32', 'float32', True),
+          ('UInt32', 'uint32', False),
+          ('Int32', 'int32', False),
+          ('UInt16', 'uint16', False),
+          ('Int16', 'int16', False),
+          ('UInt8', 'uint8', False),
+          ('Int8', 'int8', False)]
 
 }}
 
 
-{{for name, dtype, float_group, default_na_value in dtypes}}
+{{for name, dtype, float_group in dtypes}}
 
 cdef class {{name}}HashTable(HashTable):
 
@@ -430,7 +487,7 @@ cdef class {{name}}HashTable(HashTable):
             # which is only used if it's *specified*.
             na_value2 = <{{dtype}}_t>na_value
         else:
-            na_value2 = {{default_na_value}}
+            na_value2 = 0
 
         with nogil:
             for i in range(n):

diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -8,9 +8,16 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 
 # dtype, ttype, c_type
 dtypes = [('float64', 'float64', 'float64_t'),
+          ('float32', 'float32', 'float32_t'),
           ('uint64', 'uint64', 'uint64_t'),
+          ('uint32', 'uint32', 'uint32_t'),
+          ('uint16', 'uint16', 'uint16_t'),
+          ('uint8', 'uint8', 'uint8_t'),
           ('object', 'pymap', 'object'),
-          ('int64', 'int64', 'int64_t')]
+          ('int64', 'int64', 'int64_t'),
+          ('int32', 'int32', 'int32_t'),
+          ('int16', 'int16', 'int16_t'),
+          ('int8', 'int8', 'int8_t')]
 
 }}
 
@@ -54,7 +61,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
         for i in range(n):
             val = values[i]
 
-            {{if dtype == 'float64'}}
+            {{if dtype == 'float64' or dtype == 'float32'}}
             if val == val or not dropna:
             {{else}}
             if True:
@@ -275,8 +282,15 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values):
 
 # dtype, ctype, table_type, npy_dtype
 dtypes = [('float64', 'float64_t', 'float64', 'float64'),
+          ('float32', 'float32_t', 'float32', 'float32'),
           ('int64', 'int64_t', 'int64', 'int64'),
+          ('int32', 'int32_t', 'int32', 'int32'),
+          ('int16', 'int16_t', 'int16', 'int16'),
+          ('int8', 'int8_t', 'int8', 'int8'),
           ('uint64', 'uint64_t', 'uint64', 'uint64'),
+          ('uint32', 'uint32_t', 'uint32', 'uint32'),
+          ('uint16', 'uint16_t', 'uint16', 'uint16'),
+          ('uint8', 'uint8_t', 'uint8', 'uint8'),
           ('object', 'object', 'pymap', 'object_')]
 }}
 

diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd
@@ -1,5 +1,16 @@
 from cpython.object cimport PyObject
-from numpy cimport float64_t, int32_t, int64_t, uint32_t, uint64_t
+from numpy cimport (
+    float32_t,
+    float64_t,
+    int8_t,
+    int16_t,
+    int32_t,
+    int64_t,
+    uint8_t,
+    uint16_t,
+    uint32_t,
+    uint64_t,
+)
 
 
 cdef extern from "khash_python.h":
@@ -67,72 +78,6 @@ cdef extern from "khash_python.h":
     void kh_destroy_str_starts(kh_str_starts_t*) nogil
     void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil
 
-    ctypedef struct kh_int64_t:
-        khint_t n_buckets, size, n_occupied, upper_bound
-        uint32_t *flags
-        int64_t *keys
-        size_t *vals
-
-    kh_int64_t* kh_init_int64() nogil
-    void kh_destroy_int64(kh_int64_t*) nogil
-    void kh_clear_int64(kh_int64_t*) nogil
-    khint_t kh_get_int64(kh_int64_t*, int64_t) nogil
-    void kh_resize_int64(kh_int64_t*, khint_t) nogil
-    khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil
-    void kh_del_int64(kh_int64_t*, khint_t) nogil
-
-    bint kh_exist_int64(kh_int64_t*, khiter_t) nogil
-
-    ctypedef uint64_t khuint64_t
-
-    ctypedef struct kh_uint64_t:
-        khint_t n_buckets, size, n_occupied, upper_bound
-        uint32_t *flags
-        khuint64_t *keys
-        size_t *vals
-
-    kh_uint64_t* kh_init_uint64() nogil
-    void kh_destroy_uint64(kh_uint64_t*) nogil
-    void kh_clear_uint64(kh_uint64_t*) nogil
-    khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil
-    void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
-    khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil
-    void kh_del_uint64(kh_uint64_t*, khint_t) nogil
-
-    bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil
-
-    ctypedef struct kh_float64_t:
-        khint_t n_buckets, size, n_occupied, upper_bound
-        uint32_t *flags
-        float64_t *keys
-        size_t *vals
-
-    kh_float64_t* kh_init_float64() nogil
-    void kh_destroy_float64(kh_float64_t*) nogil
-    void kh_clear_float64(kh_float64_t*) nogil
-    khint_t kh_get_float64(kh_float64_t*, float64_t) nogil
-    void kh_resize_float64(kh_float64_t*, khint_t) nogil
-    khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil
-    void kh_del_float64(kh_float64_t*, khint_t) nogil
-
-    bint kh_exist_float64(kh_float64_t*, khiter_t) nogil
-
-    ctypedef struct kh_int32_t:
-        khint_t n_buckets, size, n_occupied, upper_bound
-        uint32_t *flags
-        int32_t *keys
-        size_t *vals
-
-    kh_int32_t* kh_init_int32() nogil
-    void kh_destroy_int32(kh_int32_t*) nogil
-    void kh_clear_int32(kh_int32_t*) nogil
-    khint_t kh_get_int32(kh_int32_t*, int32_t) nogil
-    void kh_resize_int32(kh_int32_t*, khint_t) nogil
-    khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil
-    void kh_del_int32(kh_int32_t*, khint_t) nogil
-
-    bint kh_exist_int32(kh_int32_t*, khiter_t) nogil
-
     # sweep factorize
 
     ctypedef struct kh_strbox_t:
@@ -150,3 +95,5 @@ cdef extern from "khash_python.h":
     void kh_del_strbox(kh_strbox_t*, khint_t) nogil
 
     bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
+
+include "khash_for_primitive_helper.pxi"