[ENH] introducing IntpHashTable and making unique_label_indices use intp (#40653)

realead · web-flow · commit cd8930b86a8c · 2021-10-06T20:36:47.000-04:00
diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
@@ -192,6 +192,7 @@ class UInt16HashTable(HashTable): ...
 class UInt8HashTable(HashTable): ...
 class StringHashTable(HashTable): ...
 class PyObjectHashTable(HashTable): ...
+class IntpHashTable(HashTable): ...
 
 def duplicated_int64(
     values: np.ndarray,  # const int64_t[:] values
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -65,6 +65,18 @@ cdef Py_ssize_t _INIT_VEC_CAP = 128
 include "hashtable_class_helper.pxi"
 include "hashtable_func_helper.pxi"
 
+
+# alias the platform-dependent intp variants to the fixed-width implementations:
+if np.dtype(np.intp) == np.dtype(np.int64):
+    IntpHashTable = Int64HashTable
+    unique_label_indices = _unique_label_indices_int64
+elif np.dtype(np.intp) == np.dtype(np.int32):
+    IntpHashTable = Int32HashTable
+    unique_label_indices = _unique_label_indices_int32
+else:
+    raise ValueError(np.dtype(np.intp))
+
+
 cdef class Factorizer:
     cdef readonly:
         Py_ssize_t count
@@ -168,38 +180,3 @@ cdef class Int64Factorizer(Factorizer):
 
         self.count = len(self.uniques)
         return labels
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def unique_label_indices(const int64_t[:] labels) -> ndarray:
-    """
-    Indices of the first occurrences of the unique labels
-    *excluding* -1. equivalent to:
-        np.unique(labels, return_index=True)[1]
-    """
-    cdef:
-        int ret = 0
-        Py_ssize_t i, n = len(labels)
-        kh_int64_t *table = kh_init_int64()
-        Int64Vector idx = Int64Vector()
-        ndarray[int64_t, ndim=1] arr
-        Int64VectorData *ud = idx.data
-
-    kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
-    with nogil:
-        for i in range(n):
-            kh_put_int64(table, labels[i], &ret)
-            if ret != 0:
-                if needs_resize(ud):
-                    with gil:
-                        idx.resize()
-                append_data_int64(ud, i)
-
-    kh_destroy_int64(table)
-
-    arr = idx.to_array()
-    arr = arr[np.asarray(labels)[arr].argsort()]
-
-    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -470,3 +470,51 @@ cpdef mode(ndarray[htfunc_t] values, bint dropna):
 
     else:
         raise TypeError(values.dtype)
+
+
+{{py:
+
+# name, dtype, ttype, c_type
+dtypes = [('Int64', 'int64', 'int64', 'int64_t'),
+          ('Int32', 'int32', 'int32', 'int32_t'), ]
+
+}}
+
+{{for name, dtype, ttype, c_type in dtypes}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
+    """
+    Indices of the first occurrence of each unique label, ordered by label
+    value and *excluding* -1. For labels without -1 this is equivalent to:
+        np.unique(labels, return_index=True)[1]
+    """
+    cdef:
+        int ret = 0
+        Py_ssize_t i, n = len(labels)
+        kh_{{ttype}}_t *table = kh_init_{{ttype}}()
+        {{name}}Vector idx = {{name}}Vector()
+        ndarray[{{c_type}}, ndim=1] arr
+        {{name}}VectorData *ud = idx.data
+
+    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
+
+    with nogil:
+        for i in range(n):
+            kh_put_{{ttype}}(table, labels[i], &ret)
+            if ret != 0:
+                if needs_resize(ud):
+                    with gil:
+                        idx.resize()
+                append_data_{{ttype}}(ud, i)
+
+    kh_destroy_{{ttype}}(table)
+
+    arr = idx.to_array()
+    arr = arr[np.asarray(labels)[arr].argsort()]
+
+    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
+
+{{endfor}}
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
@@ -261,8 +261,7 @@ def decons_obs_group_ids(
         out = decons_group_index(obs_ids, shape)
         return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]
 
-    # TODO: unique_label_indices only used here, should take ndarray[np.intp]
-    indexer = unique_label_indices(ensure_int64(comp_ids))
+    indexer = unique_label_indices(comp_ids)
     return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
 
 
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
@@ -44,6 +44,7 @@ def get_allocated_khash_memory():
         (ht.UInt16HashTable, np.uint16),
         (ht.Int8HashTable, np.int8),
         (ht.UInt8HashTable, np.uint8),
+        (ht.IntpHashTable, np.intp),
     ],
 )
 class TestHashTable:
@@ -389,6 +390,7 @@ def get_ht_function(fun_name, type_suffix):
         (np.uint16, "uint16"),
         (np.int8, "int8"),
         (np.uint8, "uint8"),
+        (np.intp, "intp"),
     ],
 )
 class TestHelpFunctions:
@@ -471,6 +473,14 @@ def test_modes_with_nans():
     assert np.isnan(modes[0])
 
 
+def test_unique_label_indices_intp(writable):
+    keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp)
+    keys.flags.writeable = writable
+    result = ht.unique_label_indices(keys)
+    expected = np.array([0, 1, 5], dtype=np.intp)
+    tm.assert_numpy_array_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "dtype, type_suffix",
     [
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -1741,7 +1741,7 @@ def test_quantile():
 
 def test_unique_label_indices():
 
-    a = np.random.randint(1, 1 << 10, 1 << 15).astype("int64")
+    a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp)
 
     left = ht.unique_label_indices(a)
     right = np.unique(a, return_index=True)[1]