diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 94bd8b49777cf..a6bfff9bb00fe 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -191,7 +191,7 @@ def is_lexsorted(list_of_arrays: list) -> bint: @cython.boundscheck(False) @cython.wraparound(False) -def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): +def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups): """ Compute a 1-d indexer. @@ -200,7 +200,7 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): Parameters ---------- - index: int64 ndarray + index: np.ndarray[np.intp] Mappings from group -> position. ngroups: int64 Number of groups. @@ -209,7 +209,7 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): ------- ndarray[intp_t, ndim=1] Indexer - ndarray[int64_t, ndim=1] + ndarray[intp_t, ndim=1] Group Counts Notes @@ -218,13 +218,12 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): """ cdef: Py_ssize_t i, loc, label, n - ndarray[int64_t] counts, where - ndarray[intp_t] indexer + ndarray[intp_t] indexer, where, counts - counts = np.zeros(ngroups + 1, dtype=np.int64) + counts = np.zeros(ngroups + 1, dtype=np.intp) n = len(index) indexer = np.zeros(n, dtype=np.intp) - where = np.zeros(ngroups + 1, dtype=np.int64) + where = np.zeros(ngroups + 1, dtype=np.intp) with nogil: diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index cdf4ef3b119d2..929cb86c41036 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -8,6 +8,32 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # take_1d, take_2d # ---------------------------------------------------------------------- + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_intp_intp( + const intp_t[:] values, + const intp_t[:] indexer, + intp_t[::1] out, + intp_t fill_value=-1, +): + cdef: + Py_ssize_t i, n, idx + intp_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i in range(n): + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + {{py: # c_type_in, c_type_out diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f09a6c04aecbf..11e08bfd181b0 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -37,6 +37,7 @@ from pandas._libs.util cimport ( ) from pandas._libs.algos import ( + ensure_platform_int, groupsort_indexer, rank_1d, take_2d_axis1_float64_float64, @@ -111,7 +112,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts + ndarray[intp_t] _counts ndarray[float64_t, ndim=2] data ndarray[intp_t] indexer float64_t* ptr @@ -121,7 +122,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, ngroups = len(counts) N, K = (values).shape - indexer, _counts = groupsort_indexer(labels, ngroups) + indexer, _counts = groupsort_indexer(ensure_platform_int(labels), ngroups) counts[:] = _counts[1:] data = np.empty((K, N), dtype=np.float64) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index c2947de943e1a..7888a15a7cb26 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -21,10 +21,9 @@ from numpy cimport ( cnp.import_array() from pandas._libs.algos import ( - ensure_int64, - ensure_platform_int, groupsort_indexer, take_1d_int64_int64, + take_1d_intp_intp, ) @@ -34,16 +33,16 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, cdef: Py_ssize_t i, j, k, count = 0 ndarray[intp_t] left_sorter, right_sorter - ndarray[int64_t] left_count, right_count - ndarray[int64_t] left_indexer, right_indexer - int64_t lc, rc + ndarray[intp_t] left_count, right_count + ndarray[intp_t] left_indexer, right_indexer + intp_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 Py_ssize_t offset # NA group in location 0 - left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) - right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -58,8 +57,8 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, left_pos = left_count[0] right_pos = right_count[0] - left_indexer = np.empty(count, dtype=np.int64) - right_indexer = np.empty(count, dtype=np.int64) + left_indexer = np.empty(count, dtype=np.intp) + right_indexer = np.empty(count, dtype=np.intp) with nogil: for i in range(1, max_groups + 1): @@ -85,17 +84,17 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count + ndarray[intp_t] left_count, right_count ndarray[intp_t] rev, left_sorter, right_sorter - ndarray[int64_t] left_indexer, right_indexer - int64_t lc, rc + ndarray[intp_t] left_indexer, right_indexer + intp_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 Py_ssize_t offset # NA group in location 0 - left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) - right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -109,8 +108,8 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, left_pos = left_count[0] right_pos = right_count[0] - left_indexer = np.empty(count, dtype=np.int64) - right_indexer = np.empty(count, dtype=np.int64) + left_indexer = np.empty(count, dtype=np.intp) + right_indexer = np.empty(count, dtype=np.intp) with nogil: for i in range(1, max_groups + 1): @@ -142,11 +141,10 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, # this is a short-cut to avoid groupsort_indexer # otherwise, the `else` path also works in this case rev = np.empty(len(left), dtype=np.intp) - rev.put(ensure_platform_int(left_sorter), np.arange(len(left))) + rev.put(left_sorter, np.arange(len(left))) else: rev, _ = groupsort_indexer(left_indexer, len(left)) - rev = ensure_platform_int(rev) right_indexer = right_indexer.take(rev) left_indexer = left_indexer.take(rev) @@ -159,16 +157,16 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, cdef: Py_ssize_t i, j, k, count = 0 ndarray[intp_t] left_sorter, right_sorter - ndarray[int64_t] left_count, right_count - ndarray[int64_t] left_indexer, right_indexer - int64_t lc, rc - int64_t left_pos = 0, right_pos = 0 + ndarray[intp_t] left_count, right_count + ndarray[intp_t] left_indexer, right_indexer + intp_t lc, rc + intp_t left_pos = 0, right_pos = 0 Py_ssize_t offset, position = 0 # NA group in location 0 - left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) - right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -185,8 +183,8 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, left_pos = left_count[0] right_pos = right_count[0] - left_indexer = np.empty(count, dtype=np.int64) - right_indexer = np.empty(count, dtype=np.int64) + left_indexer = np.empty(count, dtype=np.intp) + right_indexer = np.empty(count, dtype=np.intp) with nogil: for i in range(1, max_groups + 1): @@ -217,31 +215,29 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, _get_result_indexer(right_sorter, right_indexer)) -cdef ndarray[int64_t] _get_result_indexer( - ndarray[intp_t] sorter, ndarray[int64_t] indexer +cdef ndarray[intp_t] _get_result_indexer( + ndarray[intp_t] sorter, ndarray[intp_t] indexer ): if len(sorter) > 0: # cython-only equivalent to # `res = algos.take_nd(sorter, indexer, fill_value=-1)` - res = np.empty(len(indexer), dtype=np.int64) - take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1) - # FIXME: sorter is intp_t, not int64_t, opposite for indexer; - # will this break on 32bit builds? + res = np.empty(len(indexer), dtype=np.intp) + take_1d_intp_intp(sorter, indexer, res, -1) else: # length-0 case - res = np.empty(len(indexer), dtype=np.int64) + res = np.empty(len(indexer), dtype=np.intp) res[:] = -1 return res -def ffill_indexer(const int64_t[:] indexer): +def ffill_indexer(const intp_t[:] indexer): cdef: Py_ssize_t i, n = len(indexer) - ndarray[int64_t] result - int64_t val, last_obs + ndarray[intp_t] result + intp_t val, last_obs - result = np.empty(n, dtype=np.int64) + result = np.empty(n, dtype=np.intp) last_obs = -1 for i in range(n): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1398db6960cc8..3c88590991d77 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1981,9 +1981,9 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ categories = self.categories r, counts = libalgos.groupsort_indexer( - self.codes.astype("int64", copy=False), categories.size + ensure_platform_int(self.codes), categories.size ) - counts = counts.cumsum() + counts = ensure_int64(counts).cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) return dict(zip(categories, _result)) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e5e7b446d9cb2..094f4a67d2e61 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4154,7 +4154,7 @@ def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray: return np.empty(0, dtype=np.intp) if len(labels) == 1: - return get_group_index_sorter(labels[0]) + return get_group_index_sorter(ensure_platform_int(labels[0])) # find indexers of beginning of each set of # same-key labels w.r.t all but last level @@ -4224,7 +4224,7 @@ def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray: if level == 0: # outer most level, take the fast route ngroups = 1 + new_lev_codes.max() left_indexer, counts = libalgos.groupsort_indexer( - ensure_int64(new_lev_codes), ngroups + new_lev_codes, ngroups ) # missing values are placed first; drop them! diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 88fcc13502439..02c41538ca123 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -604,7 +604,7 @@ def get_group_index_sorter( (alpha + beta * ngroups) < (count * np.log(count)) # type: ignore[operator] ) if do_groupsort: - sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups) + sorter, _ = algos.groupsort_indexer(ensure_platform_int(group_index), ngroups) # sorter _should_ already be intp, but mypy is not yet able to verify else: sorter = group_index.argsort(kind="mergesort") diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py index f5426c71511bb..eeb66f8941260 100644 --- a/pandas/tests/libs/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -264,8 +264,8 @@ def test_left_outer_join_bug(): lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False) - exp_lidx = np.arange(len(left), dtype=np.int64) - exp_ridx = -np.ones(len(left), dtype=np.int64) + exp_lidx = np.arange(len(left), dtype=np.intp) + exp_ridx = -np.ones(len(left), dtype=np.intp) exp_ridx[left == 1] = 1 exp_ridx[left == 3] = 0 diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c8df18ddaeebe..cd800b3f3a452 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2116,8 +2116,8 @@ def test_is_lexsorted(): def test_groupsort_indexer(): - a = np.random.randint(0, 1000, 100).astype(np.int64) - b = np.random.randint(0, 1000, 100).astype(np.int64) + a = np.random.randint(0, 1000, 100).astype(np.intp) + b = np.random.randint(0, 1000, 100).astype(np.intp) result = libalgos.groupsort_indexer(a, 1000)[0]