From 7ff424e6d9699f19d514ca5417d49d70b8f1b2ca Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 14 Apr 2021 07:21:18 -0700
Subject: [PATCH 1/2] REF: handle dtype dispatch in libhashtable

---
 pandas/_libs/hashtable_func_helper.pxi.in     | 176 +++++++++++++++++-
 pandas/core/algorithms.py                     |  28 +--
 pandas/core/arrays/categorical.py             |   8 +-
 pandas/core/frame.py                          |   4 +-
 pandas/core/indexes/multi.py                  |   4 +-
 pandas/core/reshape/merge.py                  |   3 +-
 pandas/tests/indexes/multi/test_duplicates.py |   2 +-
 pandas/tests/libs/test_hashtable.py           |   4 +-
 pandas/tests/test_algos.py                    |   2 +-
 9 files changed, 188 insertions(+), 43 deletions(-)

diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index 772d83e67394c..ceb473a0b06af 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -31,9 +31,9 @@ dtypes = [('Complex128', 'complex128', 'complex128',
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
+cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
 {{else}}
-cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
+cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 {{endif}}
     cdef:
         Py_ssize_t i = 0
@@ -107,9 +107,9 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
+cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
 {{else}}
-def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
+cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
 {{endif}}
     cdef:
         int ret = 0
@@ -189,9 +189,9 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
+cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
 {{else}}
-def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
+cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
 {{endif}}
     """
     Return boolean of values in arr on an
@@ -256,9 +256,9 @@ def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-def mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
+cdef mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
 {{else}}
-def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
+cdef mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 {{endif}}
     cdef:
         {{if dtype == 'object'}}
@@ -310,3 +310,163 @@ def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
     return modes[:j + 1]
 
 {{endfor}}
+
+
+ctypedef fused htfunc_t:
+    complex128_t
+    complex64_t
+    float64_t
+    float32_t
+    uint64_t
+    uint32_t
+    uint16_t
+    uint8_t
+    int64_t
+    int32_t
+    int16_t
+    int8_t
+    object
+
+
+cpdef value_count(ndarray[htfunc_t] values, bint dropna):
+    if htfunc_t is object:
+        return value_count_object(values, dropna)
+
+    elif htfunc_t is int8_t:
+        return value_count_int8(values, dropna)
+    elif htfunc_t is int16_t:
+        return value_count_int16(values, dropna)
+    elif htfunc_t is int32_t:
+        return value_count_int32(values, dropna)
+    elif htfunc_t is int64_t:
+        return value_count_int64(values, dropna)
+
+    elif htfunc_t is uint8_t:
+        return value_count_uint8(values, dropna)
+    elif htfunc_t is uint16_t:
+        return value_count_uint16(values, dropna)
+    elif htfunc_t is uint32_t:
+        return value_count_uint32(values, dropna)
+    elif htfunc_t is uint64_t:
+        return value_count_uint64(values, dropna)
+
+    elif htfunc_t is float64_t:
+        return value_count_float64(values, dropna)
+    elif htfunc_t is float32_t:
+        return value_count_float32(values, dropna)
+
+    elif htfunc_t is complex128_t:
+        return value_count_complex128(values, dropna)
+    elif htfunc_t is complex64_t:
+        return value_count_complex64(values, dropna)
+
+    else:
+        raise TypeError(values.dtype)
+
+
+cpdef duplicated(ndarray[htfunc_t] values, object keep="first"):
+    if htfunc_t is object:
+        return duplicated_object(values, keep)
+
+    elif htfunc_t is int8_t:
+        return duplicated_int8(values, keep)
+    elif htfunc_t is int16_t:
+        return duplicated_int16(values, keep)
+    elif htfunc_t is int32_t:
+        return duplicated_int32(values, keep)
+    elif htfunc_t is int64_t:
+        return duplicated_int64(values, keep)
+
+    elif htfunc_t is uint8_t:
+        return duplicated_uint8(values, keep)
+    elif htfunc_t is uint16_t:
+        return duplicated_uint16(values, keep)
+    elif htfunc_t is uint32_t:
+        return duplicated_uint32(values, keep)
+    elif htfunc_t is uint64_t:
+        return duplicated_uint64(values, keep)
+
+    elif htfunc_t is float64_t:
+        return duplicated_float64(values, keep)
+    elif htfunc_t is float32_t:
+        return duplicated_float32(values, keep)
+
+    elif htfunc_t is complex128_t:
+        return duplicated_complex128(values, keep)
+    elif htfunc_t is complex64_t:
+        return duplicated_complex64(values, keep)
+
+    else:
+        raise TypeError(values.dtype)
+
+
+cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values):
+    if htfunc_t is object:
+        return ismember_object(arr, values)
+
+    elif htfunc_t is int8_t:
+        return ismember_int8(arr, values)
+    elif htfunc_t is int16_t:
+        return ismember_int16(arr, values)
+    elif htfunc_t is int32_t:
+        return ismember_int32(arr, values)
+    elif htfunc_t is int64_t:
+        return ismember_int64(arr, values)
+
+    elif htfunc_t is uint8_t:
+        return ismember_uint8(arr, values)
+    elif htfunc_t is uint16_t:
+        return ismember_uint16(arr, values)
+    elif htfunc_t is uint32_t:
+        return ismember_uint32(arr, values)
+    elif htfunc_t is uint64_t:
+        return ismember_uint64(arr, values)
+
+    elif htfunc_t is float64_t:
+        return ismember_float64(arr, values)
+    elif htfunc_t is float32_t:
+        return ismember_float32(arr, values)
+
+    elif htfunc_t is complex128_t:
+        return ismember_complex128(arr, values)
+    elif htfunc_t is complex64_t:
+        return ismember_complex64(arr, values)
+
+    else:
+        raise TypeError(values.dtype)
+
+
+cpdef mode(ndarray[htfunc_t] values, bint dropna):
+    if htfunc_t is object:
+        return mode_object(values, dropna)
+
+    elif htfunc_t is int8_t:
+        return mode_int8(values, dropna)
+    elif htfunc_t is int16_t:
+        return mode_int16(values, dropna)
+    elif htfunc_t is int32_t:
+        return mode_int32(values, dropna)
+    elif htfunc_t is int64_t:
+        return mode_int64(values, dropna)
+
+    elif htfunc_t is uint8_t:
+        return mode_uint8(values, dropna)
+    elif htfunc_t is uint16_t:
+        return mode_uint16(values, dropna)
+    elif htfunc_t is uint32_t:
+        return mode_uint32(values, dropna)
+    elif htfunc_t is uint64_t:
+        return mode_uint64(values, dropna)
+
+    elif htfunc_t is float64_t:
+        return mode_float64(values, dropna)
+    elif htfunc_t is float32_t:
+        return mode_float32(values, dropna)
+
+    elif htfunc_t is complex128_t:
+        return mode_complex128(values, dropna)
+    elif htfunc_t is complex64_t:
+        return mode_complex64(values, dropna)
+
+    else:
+        raise TypeError(values.dtype)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 9e2dd846f0379..6a5d1470a2c9b 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -215,7 +215,7 @@ def _reconstruct_data(
     Parameters
     ----------
     values : np.ndarray or ExtensionArray
-    dtype : np.ndtype or ExtensionDtype
+    dtype : np.dtype or ExtensionDtype
     original : AnyArrayLike
 
     Returns
@@ -519,10 +519,7 @@ def f(c, v):
         )
         values = values.astype(common, copy=False)
         comps = comps.astype(common, copy=False)
-        name = common.name
-        if name == "bool":
-            name = "uint8"
-        f = getattr(htable, f"ismember_{name}")
+        f = htable.ismember
 
     return f(comps, values)
 
@@ -891,24 +888,17 @@ def value_counts_arraylike(values, dropna: bool):
     values = _ensure_arraylike(values)
     original = values
     values, _ = _ensure_data(values)
-    ndtype = values.dtype.name
+
+    # TODO: handle uint8
+    keys, counts = htable.value_count(values, dropna)
 
     if needs_i8_conversion(original.dtype):
         # datetime, timedelta, or period
 
-        keys, counts = htable.value_count_int64(values, dropna)
-
         if dropna:
             msk = keys != iNaT
             keys, counts = keys[msk], counts[msk]
 
-    else:
-        # ndarray like
-
-        # TODO: handle uint8
-        f = getattr(htable, f"value_count_{ndtype}")
-        keys, counts = f(values, dropna)
-
     keys = _reconstruct_data(keys, original.dtype, original)
 
     return keys, counts
@@ -934,9 +924,7 @@ def duplicated(values: ArrayLike, keep: str | bool = "first") -> np.ndarray:
     duplicated : ndarray[bool]
     """
     values, _ = _ensure_data(values)
-    ndtype = values.dtype.name
-    f = getattr(htable, f"duplicated_{ndtype}")
-    return f(values, keep=keep)
+    return htable.duplicated(values, keep=keep)
 
 
 def mode(values, dropna: bool = True) -> Series:
@@ -974,10 +962,8 @@ def mode(values, dropna: bool = True) -> Series:
         values = values[~mask]
 
     values, _ = _ensure_data(values)
-    ndtype = values.dtype.name
 
-    f = getattr(htable, f"mode_{ndtype}")
-    result = f(values, dropna=dropna)
+    result = htable.mode(values, dropna=dropna)
     try:
         result = np.sort(result)
     except TypeError as err:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index f2b5ad447a0cf..855930d616eb2 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2113,11 +2113,9 @@ def mode(self, dropna=True):
         if dropna:
             good = self._codes != -1
             codes = self._codes[good]
-        # error: Incompatible types in assignment (expression has type "List[Any]",
-        # variable has type "ndarray")
-        codes = sorted(  # type: ignore[assignment]
-            htable.mode_int64(ensure_int64(codes), dropna)
-        )
+
+        codes = htable.mode(codes, dropna)
+        codes.sort()
         codes = coerce_indexer_dtype(codes, self.dtype.categories)
         return self._from_backing_data(codes)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 045776c3f5c50..42a13e5956b3d 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -43,6 +43,7 @@
     lib,
     properties,
 )
+from pandas._libs.hashtable import duplicated
 from pandas._libs.lib import no_default
 from pandas._typing import (
     AggFuncType,
@@ -6022,7 +6023,6 @@ def duplicated(
         4     True
         dtype: bool
         """
-        from pandas._libs.hashtable import duplicated_int64
 
         if self.empty:
             return self._constructor_sliced(dtype=bool)
@@ -6055,7 +6055,7 @@ def f(vals):
         labels, shape = map(list, zip(*map(f, vals)))
 
         ids = get_group_index(labels, shape, sort=False, xnull=False)
-        result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index)
+        result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
         return result.__finalize__(self, method="duplicated")
 
     # ----------------------------------------------------------------------
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 3305610a4022e..f551823717a41 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -25,7 +25,7 @@
     index as libindex,
     lib,
 )
-from pandas._libs.hashtable import duplicated_int64
+from pandas._libs.hashtable import duplicated
 from pandas._typing import (
     AnyArrayLike,
     DtypeObj,
@@ -1614,7 +1614,7 @@ def duplicated(self, keep="first") -> np.ndarray:
         shape = map(len, self.levels)
         ids = get_group_index(self.codes, shape, sort=False, xnull=False)
 
-        return duplicated_int64(ids, keep)
+        return duplicated(ids, keep)
 
     # error: Cannot override final attribute "_duplicated"
     # (previously declared in base class "IndexOpsMixin")
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 8cee0dd2abb88..553d305ba4977 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -2138,6 +2138,7 @@ def _factorize_keys(
         # "_values_for_factorize"
         rk, _ = rk._values_for_factorize()  # type: ignore[union-attr,assignment]
 
+    klass: type[libhashtable.Factorizer]
     if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
         # GH#23917 TODO: needs tests for case where lk is integer-dtype
         #  and rk is datetime-dtype
@@ -2152,7 +2153,7 @@ def _factorize_keys(
         rk = ensure_int64(np.asarray(rk, dtype=np.int64))
 
     else:
-        klass = libhashtable.Factorizer
+        klass = libhashtable.ObjectFactorizer
         lk = ensure_object(lk)
         rk = ensure_object(rk)
 
diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py
index bc0b6e0b028a8..ea59d55989f8b 100644
--- a/pandas/tests/indexes/multi/test_duplicates.py
+++ b/pandas/tests/indexes/multi/test_duplicates.py
@@ -253,7 +253,7 @@ def test_duplicated_large(keep):
     mi = MultiIndex(levels=levels, codes=codes)
 
     result = mi.duplicated(keep=keep)
-    expected = hashtable.duplicated_object(mi.values, keep=keep)
+    expected = hashtable.duplicated(mi.values, keep=keep)
     tm.assert_numpy_array_equal(result, expected)
 
 
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index 04a8aeefbfcd6..aeff591e3f0dc 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -278,7 +278,7 @@ def test_unique(self, table_type, dtype):
 
 
 def get_ht_function(fun_name, type_suffix):
-    return getattr(ht, fun_name + "_" + type_suffix)
+    return getattr(ht, fun_name)
 
 
 @pytest.mark.parametrize(
@@ -374,7 +374,7 @@ def test_modes_with_nans():
     values = np.array([True, pd.NA, np.nan], dtype=np.object_)
     # pd.Na and np.nan will have the same representative: np.nan
     # thus we have 2 nans and 1 True
-    modes = ht.mode_object(values, False)
+    modes = ht.mode(values, False)
     assert modes.size == 1
     assert np.isnan(modes[0])
 
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 127baae6e9352..e73aaecee5d4a 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -191,7 +191,7 @@ def test_factorize_nan(self):
         # rizer.factorize should not raise an exception if na_sentinel indexes
         # outside of reverse_indexer
         key = np.array([1, 2, 1, np.nan], dtype="O")
-        rizer = ht.Factorizer(len(key))
+        rizer = ht.ObjectFactorizer(len(key))
         for na_sentinel in (-1, 20):
             ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
             expected = np.array([0, 1, 0, na_sentinel], dtype="int32")

From 39187135b5ea2d8e7eca2eaf330e11720c0e8a5e Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 14 Apr 2021 07:21:56 -0700
Subject: [PATCH 2/2] REF: Share Factorizer/Vector code

---
 pandas/_libs/hashtable.pxd                 |  6 +++--
 pandas/_libs/hashtable.pyx                 | 31 ++++++++++------------
 pandas/_libs/hashtable_class_helper.pxi.in | 24 ++++++++++-------
 3 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
index a5679af44ac06..80d7ab58dc559 100644
--- a/pandas/_libs/hashtable.pxd
+++ b/pandas/_libs/hashtable.pxd
@@ -128,10 +128,12 @@ cdef struct Int64VectorData:
     int64_t *data
     Py_ssize_t n, m
 
-cdef class Int64Vector:
+cdef class Vector:
+    cdef bint external_view_exists
+
+cdef class Int64Vector(Vector):
     cdef Int64VectorData *data
     cdef ndarray ao
-    cdef bint external_view_exists
 
     cdef resize(self)
     cpdef ndarray to_array(self)
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 1e2a336f12444..b23daf49d4a91 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -56,19 +56,25 @@ include "hashtable_class_helper.pxi"
 include "hashtable_func_helper.pxi"
 
 cdef class Factorizer:
-    cdef public:
-        PyObjectHashTable table
-        ObjectVector uniques
+    cdef readonly:
         Py_ssize_t count
 
-    def __init__(self, size_hint: int):
-        self.table = PyObjectHashTable(size_hint)
-        self.uniques = ObjectVector()
+    def __cinit__(self, size_hint: int):
         self.count = 0
 
     def get_count(self) -> int:
         return self.count
 
+
+cdef class ObjectFactorizer(Factorizer):
+    cdef public:
+        PyObjectHashTable table
+        ObjectVector uniques
+
+    def __cinit__(self, size_hint: int):
+        self.table = PyObjectHashTable(size_hint)
+        self.uniques = ObjectVector()
+
     def factorize(
         self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
     ) -> np.ndarray:
@@ -105,24 +111,15 @@ cdef class Factorizer:
         self.count = len(self.uniques)
         return labels
 
-    def unique(self, ndarray[object] values):
-        # just for fun
-        return self.table.unique(values)
 
-
-cdef class Int64Factorizer:
+cdef class Int64Factorizer(Factorizer):
     cdef public:
         Int64HashTable table
         Int64Vector uniques
-        Py_ssize_t count
 
-    def __init__(self, size_hint: int):
+    def __cinit__(self, size_hint: int):
         self.table = Int64HashTable(size_hint)
         self.uniques = Int64Vector()
-        self.count = 0
-
-    def get_count(self) -> int:
-        return self.count
 
     def factorize(self, const int64_t[:] values, sort=False,
                   na_sentinel=-1, na_value=None) -> np.ndarray:
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index b80a127be970d..3745a1957fd3a 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -127,6 +127,8 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
 
 
 {{if dtype != 'int64'}}
+# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
+#  by IntervalTree
 
 ctypedef struct {{name}}VectorData:
     {{c_type}} *data
@@ -167,6 +169,14 @@ cdef inline bint needs_resize(vector_data *data) nogil:
 # Vector
 # ----------------------------------------------------------------------
 
+cdef class Vector:
+    # NB: external_view_exists is declared (as a plain cdef attribute, not
+    #  readonly) in hashtable.pxd, so it cannot be re-declared here.
+
+    def __cinit__(self):
+        self.external_view_exists = False
+
+
 {{py:
 
 # name, dtype, c_type
@@ -187,11 +197,12 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
 
 {{for name, dtype, c_type in dtypes}}
 
-cdef class {{name}}Vector:
+cdef class {{name}}Vector(Vector):
 
+    # For int64 the attribute declarations below live in hashtable.pxd instead;
+    # Int64Vector is the only one we need exposed for other cython files.
     {{if dtype != 'int64'}}
     cdef:
-        bint external_view_exists
         {{name}}VectorData *data
         ndarray ao
     {{endif}}
@@ -201,7 +212,6 @@ cdef class {{name}}Vector:
             sizeof({{name}}VectorData))
         if not self.data:
             raise MemoryError()
-        self.external_view_exists = False
         self.data.n = 0
         self.data.m = _INIT_VEC_CAP
         self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
@@ -246,17 +256,15 @@ cdef class {{name}}Vector:
 
 {{endfor}}
 
-cdef class StringVector:
+cdef class StringVector(Vector):
 
     cdef:
         StringVectorData *data
-        bint external_view_exists
 
     def __cinit__(self):
         self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
         if not self.data:
             raise MemoryError()
-        self.external_view_exists = False
         self.data.n = 0
         self.data.m = _INIT_VEC_CAP
         self.data.data = <char **>malloc(self.data.m * sizeof(char *))
@@ -314,16 +322,14 @@ cdef class StringVector:
             self.append(x[i])
 
 
-cdef class ObjectVector:
+cdef class ObjectVector(Vector):
 
     cdef:
         PyObject **data
         Py_ssize_t n, m
         ndarray ao
-        bint external_view_exists
 
     def __cinit__(self):
-        self.external_view_exists = False
         self.n = 0
         self.m = _INIT_VEC_CAP
         self.ao = np.empty(_INIT_VEC_CAP, dtype=object)