From 2b07e2e25848caf5d80a951abe41c333896c088d Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Tue, 15 Sep 2020 13:52:04 +0000 Subject: [PATCH 1/7] BUG: IndexEngine get_indexer methods return intp arrays --- pandas/_libs/index.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 8155e7e6c074a..f02ae2e3832a0 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -255,7 +255,7 @@ cdef class IndexEngine: def get_indexer(self, values): self._ensure_mapping_populated() - return self.mapping.lookup(values) + return self.mapping.lookup(values).astype('intp') def get_indexer_non_unique(self, targets): """ @@ -266,7 +266,7 @@ cdef class IndexEngine: """ cdef: ndarray values, x - ndarray[int64_t] result, missing + ndarray[intp_t] result, missing set stargets, remaining_stargets dict d = {} object val @@ -283,8 +283,8 @@ cdef class IndexEngine: else: n_alloc = n - result = np.empty(n_alloc, dtype=np.int64) - missing = np.empty(n_t, dtype=np.int64) + result = np.empty(n_alloc, dtype=np.intp) + missing = np.empty(n_t, dtype=np.intp) # map each starget to its position in the index if stargets and len(stargets) < 5 and self.is_monotonic_increasing: From e4066e273e32438edf7cc157f660fa8423350f0e Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 17 Sep 2020 14:53:17 +0000 Subject: [PATCH 2/7] TST: changing missing dtype to intp --- pandas/tests/base/test_misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 9523fba953ad0..b8468a5acf277 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -201,4 +201,4 @@ def test_get_indexer_non_unique_dtype_mismatch(): # GH 25459 indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) - tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) 
+ tm.assert_numpy_array_equal(np.array([0], dtype=np.intp), missing) From 4a4721a5300864617d71b1ef005b3d28ee5db814 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 17 Sep 2020 16:31:06 +0000 Subject: [PATCH 3/7] adding feature to whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8b18b56929acd..4241e2266fd8b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -290,6 +290,7 @@ Indexing - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) +- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays are returned instead of ``intp``. 
(:issue:`36359`) - Missing From d875e36306c4080a5f3c10ea3458463b93f63507 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 17 Sep 2020 22:28:02 +0000 Subject: [PATCH 4/7] CLN: reverting astype (#36431) --- pandas/_libs/index.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f02ae2e3832a0..e31c3739f456d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -255,7 +255,7 @@ cdef class IndexEngine: def get_indexer(self, values): self._ensure_mapping_populated() - return self.mapping.lookup(values).astype('intp') + return self.mapping.lookup(values) def get_indexer_non_unique(self, targets): """ From 4d1d0760979a4b73cb46174e59afe82e5b2c9184 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 17 Sep 2020 22:29:56 +0000 Subject: [PATCH 5/7] BUG: HashTable.lookup to use intp arrays (#36431) --- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 0499eabf708af..2650bea921b3f 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -1,7 +1,7 @@ from pandas._libs.khash cimport ( kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t, kh_str_t, uint64_t, int64_t, float64_t) -from numpy cimport ndarray +from numpy cimport ndarray, intp_t # prototypes for sharing diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 5e4da96d57e42..5b854f7a56e7b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -347,7 +347,7 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{dtype}}_t val khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) + intp_t[:] locs = np.empty(n, dtype=np.intp) with nogil: for i in range(n): @@ -680,7 +680,7 @@ cdef class StringHashTable(HashTable): object val const char *v khiter_t k - 
int64_t[:] locs = np.empty(n, dtype=np.int64) + intp_t[:] locs = np.empty(n, dtype=np.intp) # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) @@ -986,7 +986,7 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) + intp_t[:] locs = np.empty(n, dtype=np.intp) for i in range(n): val = values[i] From ad2cdfced3ac3c74b5adaad5b56b4167d2fe3975 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 17 Sep 2020 22:35:05 +0000 Subject: [PATCH 6/7] BUG: HashTable.get_labels_groupby to use intp arrays (#36431) --- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 5b854f7a56e7b..c1dba0bab4ac8 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -551,7 +551,7 @@ cdef class {{name}}HashTable(HashTable): def get_labels_groupby(self, const {{dtype}}_t[:] values): cdef: Py_ssize_t i, n = len(values) - int64_t[:] labels + intp_t[:] labels Py_ssize_t idx, count = 0 int ret = 0 {{dtype}}_t val @@ -559,7 +559,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}Vector uniques = {{name}}Vector() {{name}}VectorData *ud - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.intp) ud = uniques.data with nogil: From aef31e2817a985ca4e003d466247e98e7ef4fc31 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 17 Sep 2020 22:55:17 +0000 Subject: [PATCH 7/7] BUG: HashTable.get_indexer to use intp arrays (#36431) --- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c1dba0bab4ac8..da91fa69b0dec 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -648,8 +648,8 @@ cdef class 
StringHashTable(HashTable): def get_indexer(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - int64_t *resbuf = labels.data + ndarray[intp_t] labels = np.empty(n, dtype=np.intp) + intp_t *resbuf = labels.data khiter_t k kh_str_t *table = self.table const char *v