BUG: get_indexer returned dtype (#36431)

alexhlim · web-flow · commit 54f23e8f1f02 · 2020-09-19T15:56:05.000-04:00
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -295,6 +295,7 @@ Indexing
 
 - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__getitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`)
 - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`)
+- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays were returned instead of ``intp`` arrays. (:issue:`36359`)
 -
 
 Missing
diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
@@ -1,7 +1,7 @@
 from pandas._libs.khash cimport (
     kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t, kh_str_t, uint64_t,
     int64_t, float64_t)
-from numpy cimport ndarray
+from numpy cimport ndarray, intp_t
 
 # prototypes for sharing
 
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -347,7 +347,7 @@ cdef class {{name}}HashTable(HashTable):
             int ret = 0
             {{dtype}}_t val
             khiter_t k
-            int64_t[:] locs = np.empty(n, dtype=np.int64)
+            intp_t[:] locs = np.empty(n, dtype=np.intp)
 
         with nogil:
             for i in range(n):
@@ -551,15 +551,15 @@ cdef class {{name}}HashTable(HashTable):
     def get_labels_groupby(self, const {{dtype}}_t[:] values):
         cdef:
             Py_ssize_t i, n = len(values)
-            int64_t[:] labels
+            intp_t[:] labels
             Py_ssize_t idx, count = 0
             int ret = 0
             {{dtype}}_t val
             khiter_t k
             {{name}}Vector uniques = {{name}}Vector()
             {{name}}VectorData *ud
 
-        labels = np.empty(n, dtype=np.int64)
+        labels = np.empty(n, dtype=np.intp)
         ud = uniques.data
 
         with nogil:
@@ -648,8 +648,8 @@ cdef class StringHashTable(HashTable):
     def get_indexer(self, ndarray[object] values):
         cdef:
             Py_ssize_t i, n = len(values)
-            ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
-            int64_t *resbuf = <int64_t*>labels.data
+            ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
+            intp_t *resbuf = <intp_t*>labels.data
             khiter_t k
             kh_str_t *table = self.table
             const char *v
@@ -680,7 +680,7 @@ cdef class StringHashTable(HashTable):
             object val
             const char *v
             khiter_t k
-            int64_t[:] locs = np.empty(n, dtype=np.int64)
+            intp_t[:] locs = np.empty(n, dtype=np.intp)
 
         # these by-definition *must* be strings
         vecs = <const char **>malloc(n * sizeof(char *))
@@ -986,7 +986,7 @@ cdef class PyObjectHashTable(HashTable):
             int ret = 0
             object val
             khiter_t k
-            int64_t[:] locs = np.empty(n, dtype=np.int64)
+            intp_t[:] locs = np.empty(n, dtype=np.intp)
 
         for i in range(n):
             val = values[i]
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -266,7 +266,7 @@ cdef class IndexEngine:
         """
         cdef:
             ndarray values, x
-            ndarray[int64_t] result, missing
+            ndarray[intp_t] result, missing
             set stargets, remaining_stargets
             dict d = {}
             object val
@@ -283,8 +283,8 @@ cdef class IndexEngine:
         else:
             n_alloc = n
 
-        result = np.empty(n_alloc, dtype=np.int64)
-        missing = np.empty(n_t, dtype=np.int64)
+        result = np.empty(n_alloc, dtype=np.intp)
+        missing = np.empty(n_t, dtype=np.intp)
 
         # map each starget to its position in the index
         if stargets and len(stargets) < 5 and self.is_monotonic_increasing:
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
@@ -201,4 +201,4 @@ def test_get_indexer_non_unique_dtype_mismatch():
     # GH 25459
     indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0]))
     tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes)
-    tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing)
+    tm.assert_numpy_array_equal(np.array([0], dtype=np.intp), missing)

Original file line number	Diff line number	Diff line change
`@@ -295,6 +295,7 @@ Indexing`
`295`	`295`
`296`	`296`	- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__getitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`)
`297`	`297`	- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`)
	`298`	+- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays were returned instead of ``intp`` arrays. (:issue:`36359`)
`298`	`299`	`-`
`299`	`300`
`300`	`301`	`Missing`