pandas-dev · jreback · May 10, 2021 · Apr 14, 2021 · Apr 14, 2021 · Apr 16, 2021
diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
@@ -128,10 +128,12 @@ cdef struct Int64VectorData:
     int64_t *data
     Py_ssize_t n, m
 
-cdef class Int64Vector:
+cdef class Vector:
+    cdef bint external_view_exists
+
+cdef class Int64Vector(Vector):
     cdef Int64VectorData *data
     cdef ndarray ao
-    cdef bint external_view_exists
 
     cdef resize(self)
     cpdef ndarray to_array(self)

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -56,19 +56,25 @@ include "hashtable_class_helper.pxi"
 include "hashtable_func_helper.pxi"
 
 cdef class Factorizer:
-    cdef public:
-        PyObjectHashTable table
-        ObjectVector uniques
+    cdef readonly:
         Py_ssize_t count
 
-    def __init__(self, size_hint: int):
-        self.table = PyObjectHashTable(size_hint)
-        self.uniques = ObjectVector()
+    def __cinit__(self, size_hint: int):
         self.count = 0
 
     def get_count(self) -> int:
         return self.count
 
+
+cdef class ObjectFactorizer(Factorizer):
+    cdef public:
+        PyObjectHashTable table
+        ObjectVector uniques
+
+    def __cinit__(self, size_hint: int):
+        self.table = PyObjectHashTable(size_hint)
+        self.uniques = ObjectVector()
+
     def factorize(
         self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
     ) -> np.ndarray:
@@ -105,24 +111,15 @@ cdef class Factorizer:
         self.count = len(self.uniques)
         return labels
 
-    def unique(self, ndarray[object] values):
-        # just for fun
-        return self.table.unique(values)
 
-
-cdef class Int64Factorizer:
+cdef class Int64Factorizer(Factorizer):
     cdef public:
         Int64HashTable table
         Int64Vector uniques
-        Py_ssize_t count
 
-    def __init__(self, size_hint: int):
+    def __cinit__(self, size_hint: int):
         self.table = Int64HashTable(size_hint)
         self.uniques = Int64Vector()
-        self.count = 0
-
-    def get_count(self) -> int:
-        return self.count
 
     def factorize(self, const int64_t[:] values, sort=False,
                   na_sentinel=-1, na_value=None) -> np.ndarray:

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -127,6 +127,8 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
 
 
 {{if dtype != 'int64'}}
+# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
+#  by IntervalTree
 
 ctypedef struct {{name}}VectorData:
     {{c_type}} *data
@@ -167,6 +169,14 @@ cdef inline bint needs_resize(vector_data *data) nogil:
 # Vector
 # ----------------------------------------------------------------------
 
+cdef class Vector:
+    # external_view_exists itself is declared in hashtable.pxd;
+    #  __cinit__ below only gives it its initial value
+
+    def __cinit__(self):
+        self.external_view_exists = False
+
+
 {{py:
 
 # name, dtype, c_type
@@ -187,11 +197,12 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
 
 {{for name, dtype, c_type in dtypes}}
 
-cdef class {{name}}Vector:
+cdef class {{name}}Vector(Vector):
 
+    # For int64 we have to put this declaration in the .pxd file;
+    # Int64Vector is the only one we need exposed for other cython files.
     {{if dtype != 'int64'}}
     cdef:
-        bint external_view_exists
         {{name}}VectorData *data
         ndarray ao
     {{endif}}
@@ -201,7 +212,6 @@ cdef class {{name}}Vector:
             sizeof({{name}}VectorData))
         if not self.data:
             raise MemoryError()
-        self.external_view_exists = False
         self.data.n = 0
         self.data.m = _INIT_VEC_CAP
         self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
@@ -246,17 +256,15 @@ cdef class {{name}}Vector:
 
 {{endfor}}
 
-cdef class StringVector:
+cdef class StringVector(Vector):
 
     cdef:
         StringVectorData *data
-        bint external_view_exists
 
     def __cinit__(self):
         self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
         if not self.data:
             raise MemoryError()
-        self.external_view_exists = False
         self.data.n = 0
         self.data.m = _INIT_VEC_CAP
         self.data.data = <char **>malloc(self.data.m * sizeof(char *))
@@ -314,16 +322,14 @@ cdef class StringVector:
             self.append(x[i])
 
 
-cdef class ObjectVector:
+cdef class ObjectVector(Vector):
 
     cdef:
         PyObject **data
         Py_ssize_t n, m
         ndarray ao
-        bint external_view_exists
 
     def __cinit__(self):
-        self.external_view_exists = False
         self.n = 0
         self.m = _INIT_VEC_CAP
         self.ao = np.empty(_INIT_VEC_CAP, dtype=object)

diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -31,9 +31,9 @@ dtypes = [('Complex128', 'complex128', 'complex128',
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
+cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
 {{else}}
-cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
+cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 {{endif}}
     cdef:
         Py_ssize_t i = 0
@@ -107,9 +107,9 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
+cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
 {{else}}
-def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
+cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
 {{endif}}
     cdef:
         int ret = 0
@@ -189,9 +189,9 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
+cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
 {{else}}
-def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
+cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
 {{endif}}
     """
     Return boolean of values in arr on an
@@ -256,9 +256,9 @@ def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-def mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
+cdef mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
 {{else}}
-def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
+cdef mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 {{endif}}
     cdef:
         {{if dtype == 'object'}}
@@ -310,3 +310,163 @@ def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
     return modes[:j + 1]
 
 {{endfor}}
+
+
+ctypedef fused htfunc_t:  # element types the fused dispatchers below branch on
+    complex128_t
+    complex64_t
+    float64_t
+    float32_t
+    uint64_t
+    uint32_t
+    uint16_t
+    uint8_t
+    int64_t
+    int32_t
+    int16_t
+    int8_t
+    object
+
+
+cpdef value_count(ndarray[htfunc_t] values, bint dropna):
+    if htfunc_t is object:
+        return value_count_object(values, dropna)
+    # signed integers
+    elif htfunc_t is int8_t:
+        return value_count_int8(values, dropna)
+    elif htfunc_t is int16_t:
+        return value_count_int16(values, dropna)
+    elif htfunc_t is int32_t:
+        return value_count_int32(values, dropna)
+    elif htfunc_t is int64_t:
+        return value_count_int64(values, dropna)
+    # unsigned integers
+    elif htfunc_t is uint8_t:
+        return value_count_uint8(values, dropna)
+    elif htfunc_t is uint16_t:
+        return value_count_uint16(values, dropna)
+    elif htfunc_t is uint32_t:
+        return value_count_uint32(values, dropna)
+    elif htfunc_t is uint64_t:
+        return value_count_uint64(values, dropna)
+    # floats
+    elif htfunc_t is float64_t:
+        return value_count_float64(values, dropna)
+    elif htfunc_t is float32_t:
+        return value_count_float32(values, dropna)
+    # complex
+    elif htfunc_t is complex128_t:
+        return value_count_complex128(values, dropna)
+    elif htfunc_t is complex64_t:
+        return value_count_complex64(values, dropna)
+    # safeguard only: every member of htfunc_t is matched above
+    else:
+        raise TypeError(values.dtype)
+
+
+cpdef duplicated(ndarray[htfunc_t] values, object keep="first"):
+    if htfunc_t is object:
+        return duplicated_object(values, keep)
+    # signed integers
+    elif htfunc_t is int8_t:
+        return duplicated_int8(values, keep)
+    elif htfunc_t is int16_t:
+        return duplicated_int16(values, keep)
+    elif htfunc_t is int32_t:
+        return duplicated_int32(values, keep)
+    elif htfunc_t is int64_t:
+        return duplicated_int64(values, keep)
+    # unsigned integers
+    elif htfunc_t is uint8_t:
+        return duplicated_uint8(values, keep)
+    elif htfunc_t is uint16_t:
+        return duplicated_uint16(values, keep)
+    elif htfunc_t is uint32_t:
+        return duplicated_uint32(values, keep)
+    elif htfunc_t is uint64_t:
+        return duplicated_uint64(values, keep)
+    # floats
+    elif htfunc_t is float64_t:
+        return duplicated_float64(values, keep)
+    elif htfunc_t is float32_t:
+        return duplicated_float32(values, keep)
+    # complex
+    elif htfunc_t is complex128_t:
+        return duplicated_complex128(values, keep)
+    elif htfunc_t is complex64_t:
+        return duplicated_complex64(values, keep)
+    # safeguard only: every member of htfunc_t is matched above
+    else:
+        raise TypeError(values.dtype)
+
+
+cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values):
+    if htfunc_t is object:
+        return ismember_object(arr, values)
+    # signed integers
+    elif htfunc_t is int8_t:
+        return ismember_int8(arr, values)
+    elif htfunc_t is int16_t:
+        return ismember_int16(arr, values)
+    elif htfunc_t is int32_t:
+        return ismember_int32(arr, values)
+    elif htfunc_t is int64_t:
+        return ismember_int64(arr, values)
+    # unsigned integers
+    elif htfunc_t is uint8_t:
+        return ismember_uint8(arr, values)
+    elif htfunc_t is uint16_t:
+        return ismember_uint16(arr, values)
+    elif htfunc_t is uint32_t:
+        return ismember_uint32(arr, values)
+    elif htfunc_t is uint64_t:
+        return ismember_uint64(arr, values)
+    # floats
+    elif htfunc_t is float64_t:
+        return ismember_float64(arr, values)
+    elif htfunc_t is float32_t:
+        return ismember_float32(arr, values)
+    # complex
+    elif htfunc_t is complex128_t:
+        return ismember_complex128(arr, values)
+    elif htfunc_t is complex64_t:
+        return ismember_complex64(arr, values)
+    # safeguard only: every member of htfunc_t is matched above
+    else:
+        raise TypeError(values.dtype)
+
+
+cpdef mode(ndarray[htfunc_t] values, bint dropna):
+    if htfunc_t is object:
+        return mode_object(values, dropna)
+    # signed integers
+    elif htfunc_t is int8_t:
+        return mode_int8(values, dropna)
+    elif htfunc_t is int16_t:
+        return mode_int16(values, dropna)
+    elif htfunc_t is int32_t:
+        return mode_int32(values, dropna)
+    elif htfunc_t is int64_t:
+        return mode_int64(values, dropna)
+    # unsigned integers
+    elif htfunc_t is uint8_t:
+        return mode_uint8(values, dropna)
+    elif htfunc_t is uint16_t:
+        return mode_uint16(values, dropna)
+    elif htfunc_t is uint32_t:
+        return mode_uint32(values, dropna)
+    elif htfunc_t is uint64_t:
+        return mode_uint64(values, dropna)
+    # floats
+    elif htfunc_t is float64_t:
+        return mode_float64(values, dropna)
+    elif htfunc_t is float32_t:
+        return mode_float32(values, dropna)
+    # complex
+    elif htfunc_t is complex128_t:
+        return mode_complex128(values, dropna)
+    elif htfunc_t is complex64_t:
+        return mode_complex64(values, dropna)
+    # safeguard only: every member of htfunc_t is matched above
+    else:
+        raise TypeError(values.dtype)