pandas-dev · jreback · May 10, 2021 · Apr 14, 2021 · Apr 14, 2021 · Apr 16, 2021
diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
@@ -128,10 +128,12 @@ cdef struct Int64VectorData:
     int64_t *data
     Py_ssize_t n, m
 
-cdef class Int64Vector:
+cdef class Vector:
+    cdef bint external_view_exists
+
+cdef class Int64Vector(Vector):
     cdef Int64VectorData *data
     cdef ndarray ao
-    cdef bint external_view_exists
 
     cdef resize(self)
     cpdef ndarray to_array(self)

diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
@@ -12,34 +12,28 @@ def unique_label_indices(
 
 
 class Factorizer:
-    table: PyObjectHashTable
-    uniques: ObjectVector
     count: int
 
     def __init__(self, size_hint: int): ...
     def get_count(self) -> int: ...
 
+
+class ObjectFactorizer(Factorizer):
+    table: PyObjectHashTable
+    uniques: ObjectVector
+
     def factorize(
         self,
-        values: np.ndarray,  # np.ndarray[object]
+        values: np.ndarray,  # ndarray[object]
         sort: bool = ...,
         na_sentinel=...,
         na_value=...,
     ) -> np.ndarray: ...  # np.ndarray[intp]
 
-    def unique(
-        self,
-        values: np.ndarray,  # np.ndarray[object]
-    ) -> np.ndarray: ... # np.ndarray[object]
-
 
-class Int64Factorizer:
+class Int64Factorizer(Factorizer):
     table: Int64HashTable
     uniques: Int64Vector
-    count: int
-
-    def __init__(self, size_hint: int): ...
-    def get_count(self) -> int: ...
 
     def factorize(
         self,
@@ -240,3 +234,26 @@ def value_count_int64(
     np.ndarray,  # np.ndarray[np.int64]
     np.ndarray,  # np.ndarray[np.int64]
 ]: ...
+
+
+def duplicated(
+    values: np.ndarray,
+    keep: Literal["last", "first", False] = ...,
+) -> np.ndarray: ...  # np.ndarray[bool]
+
+def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ...
+
+def value_count(
+    values: np.ndarray,
+    dropna: bool,
+) -> tuple[
+    np.ndarray,
+    np.ndarray,  # np.ndarray[np.int64]
+]: ...
+
+
+# arr and values should have the same dtype
+def ismember(
+    arr: np.ndarray,
+    values: np.ndarray,
+) -> np.ndarray: ...  # np.ndarray[bool]
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -56,19 +56,25 @@ include "hashtable_class_helper.pxi"
 include "hashtable_func_helper.pxi"
 
 cdef class Factorizer:
-    cdef public:
-        PyObjectHashTable table
-        ObjectVector uniques
+    cdef readonly:
         Py_ssize_t count
 
-    def __init__(self, size_hint: int):
-        self.table = PyObjectHashTable(size_hint)
-        self.uniques = ObjectVector()
+    def __cinit__(self, size_hint: int):
         self.count = 0
 
     def get_count(self) -> int:
         return self.count
 
+
+cdef class ObjectFactorizer(Factorizer):
+    cdef public:
+        PyObjectHashTable table
+        ObjectVector uniques
+
+    def __cinit__(self, size_hint: int):
+        self.table = PyObjectHashTable(size_hint)
+        self.uniques = ObjectVector()
+
     def factorize(
         self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
     ) -> np.ndarray:
@@ -105,24 +111,15 @@ cdef class Factorizer:
         self.count = len(self.uniques)
         return labels
 
-    def unique(self, ndarray[object] values):
-        # just for fun
-        return self.table.unique(values)
 
-
-cdef class Int64Factorizer:
+cdef class Int64Factorizer(Factorizer):
     cdef public:
         Int64HashTable table
         Int64Vector uniques
-        Py_ssize_t count
 
-    def __init__(self, size_hint: int):
+    def __cinit__(self, size_hint: int):
         self.table = Int64HashTable(size_hint)
         self.uniques = Int64Vector()
-        self.count = 0
-
-    def get_count(self) -> int:
-        return self.count
 
     def factorize(self, const int64_t[:] values, sort=False,
                   na_sentinel=-1, na_value=None) -> np.ndarray:

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -127,6 +127,8 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
 
 
 {{if dtype != 'int64'}}
+# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
+#  by IntervalTree
 
 ctypedef struct {{name}}VectorData:
     {{c_type}} *data
@@ -167,6 +169,14 @@ cdef inline bint needs_resize(vector_data *data) nogil:
 # Vector
 # ----------------------------------------------------------------------
 
+cdef class Vector:
+    # external_view_exists is declared for Vector in hashtable.pxd, so it is
+    #  not re-declared here.
+
+    def __cinit__(self):
+        self.external_view_exists = False
+
+
 {{py:
 
 # name, dtype, c_type
@@ -187,11 +197,12 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
 
 {{for name, dtype, c_type in dtypes}}
 
-cdef class {{name}}Vector:
+cdef class {{name}}Vector(Vector):
 
+    # For int64 these attributes are declared in the .pxd file instead;
+    # Int64Vector is the only vector we need exposed to other Cython files.
     {{if dtype != 'int64'}}
     cdef:
-        bint external_view_exists
         {{name}}VectorData *data
         ndarray ao
     {{endif}}
@@ -201,7 +212,6 @@ cdef class {{name}}Vector:
             sizeof({{name}}VectorData))
         if not self.data:
             raise MemoryError()
-        self.external_view_exists = False
         self.data.n = 0
         self.data.m = _INIT_VEC_CAP
         self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
@@ -246,17 +256,15 @@ cdef class {{name}}Vector:
 
 {{endfor}}
 
-cdef class StringVector:
+cdef class StringVector(Vector):
 
     cdef:
         StringVectorData *data
-        bint external_view_exists
 
     def __cinit__(self):
         self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
         if not self.data:
             raise MemoryError()
-        self.external_view_exists = False
         self.data.n = 0
         self.data.m = _INIT_VEC_CAP
         self.data.data = <char **>malloc(self.data.m * sizeof(char *))
@@ -314,16 +322,14 @@ cdef class StringVector:
             self.append(x[i])
 
 
-cdef class ObjectVector:
+cdef class ObjectVector(Vector):
 
     cdef:
         PyObject **data
         Py_ssize_t n, m
         ndarray ao
-        bint external_view_exists
 
     def __cinit__(self):
-        self.external_view_exists = False
         self.n = 0
         self.m = _INIT_VEC_CAP
         self.ao = np.empty(_INIT_VEC_CAP, dtype=object)