diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 047eb848b7540..5783d3c2353aa 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -199,10 +199,8 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): Returns ------- - ndarray[intp_t, ndim=1] - Indexer - ndarray[int64_t, ndim=1] - Group Counts + tuple + 1-d indexer ordered by groups, group counts. Notes ----- @@ -210,12 +208,11 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): """ cdef: Py_ssize_t i, loc, label, n - ndarray[int64_t] counts, where - ndarray[intp_t] indexer + ndarray[int64_t] counts, where, result counts = np.zeros(ngroups + 1, dtype=np.int64) n = len(index) - indexer = np.zeros(n, dtype=np.intp) + result = np.zeros(n, dtype=np.int64) where = np.zeros(ngroups + 1, dtype=np.int64) with nogil: @@ -231,10 +228,10 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): # this is our indexer for i in range(n): label = index[i] + 1 - indexer[where[label]] = i + result[where[label]] = i where[label] += 1 - return indexer, counts + return result, counts @cython.boundscheck(False) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index cdf4ef3b119d2..4eefd9d1f7267 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -66,7 +66,7 @@ def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values, {{else}} def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, {{endif}} - const intp_t[:] indexer, + const int64_t[:] indexer, {{c_type_out}}[:] out, fill_value=np.nan): @@ -102,7 +102,7 @@ def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, {{else}} def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} - ndarray[intp_t] indexer, + ndarray[int64_t] indexer, {{c_type_out}}[:, :] out, fill_value=np.nan): cdef: @@ -156,7 +156,7 @@ def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, {{else}} def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} - ndarray[intp_t] indexer, + ndarray[int64_t] indexer, {{c_type_out}}[:, :] out, fill_value=np.nan): diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 89020f2078584..1bfb66cbf21ac 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -19,7 +19,6 @@ from numpy cimport ( int16_t, int32_t, int64_t, - intp_t, ndarray, uint8_t, uint16_t, @@ -142,7 +141,6 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, Py_ssize_t i, j, N, K, ngroups, size ndarray[int64_t] _counts ndarray[float64_t, ndim=2] data - ndarray[intp_t] indexer float64_t* ptr assert min_count == -1, "'min_count' only used in add and prod" diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index c2947de943e1a..511b373bc7e1f 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -33,8 +33,7 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[intp_t] left_sorter, right_sorter - ndarray[int64_t] left_count, right_count + ndarray[int64_t] left_count, right_count, left_sorter, right_sorter ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 @@ -85,8 +84,8 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count - ndarray[intp_t] rev, left_sorter, right_sorter + ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray rev ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 @@ -158,8 +157,7 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[intp_t] left_sorter, right_sorter - ndarray[int64_t] left_count, right_count + ndarray[int64_t] left_count, right_count, left_sorter, right_sorter ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc int64_t left_pos = 0, right_pos = 0 @@ -217,16 +215,12 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, _get_result_indexer(right_sorter, right_indexer)) -cdef ndarray[int64_t] _get_result_indexer( - ndarray[intp_t] sorter, ndarray[int64_t] indexer -): +cdef _get_result_indexer(ndarray[int64_t] sorter, ndarray[int64_t] indexer): if len(sorter) > 0: # cython-only equivalent to # `res = algos.take_nd(sorter, indexer, fill_value=-1)` res = np.empty(len(indexer), dtype=np.int64) - take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1) - # FIXME: sorter is intp_t, not int64_t, opposite for indexer; - # will this break on 32bit builds? + take_1d_int64_int64(sorter, indexer, res, -1) else: # length-0 case res = np.empty(len(indexer), dtype=np.int64) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index ba1b2a0f0e76e..c1abd8bbf39d0 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -117,10 +117,10 @@ def _take_nd_ndarray( ) -> np.ndarray: if indexer is None: - indexer = np.arange(arr.shape[axis], dtype=np.intp) + indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = ensure_platform_int(indexer) + indexer = ensure_int64(indexer, copy=False) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( arr, indexer, out, fill_value, allow_fill ) @@ -317,7 +317,7 @@ def _get_take_nd_function( if func is None: def func(arr, indexer, out, fill_value=np.nan): - indexer = ensure_platform_int(indexer) + indexer = ensure_int64(indexer) _take_nd_object( arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info ) @@ -468,7 +468,7 @@ def wrapper( def _take_nd_object( arr: np.ndarray, - indexer: np.ndarray, # np.ndarray[np.intp] + indexer: np.ndarray, out: np.ndarray, axis: int, fill_value, @@ -544,5 +544,4 @@ def _take_preprocess_indexer_and_fill_value( # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() - indexer = ensure_platform_int(indexer) return indexer, dtype, fill_value, mask_info diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 769ae52744c74..53929ad9eadc8 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1961,8 +1961,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: Returns ------- - Dict[Hashable, np.ndarray[np.intp]] - dict of categories -> indexers + dict of categories -> indexers Examples -------- @@ -1980,7 +1979,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ categories = self.categories r, counts = libalgos.groupsort_indexer( - self.codes.astype("int64", copy=False), categories.size + self.codes.astype("int64"), categories.size ) counts = counts.cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 3aa4d26f7dc8f..10c13327c79d3 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -606,7 +606,6 @@ def get_group_index_sorter( ) if do_groupsort: sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups) - # sorter _should_ already be intp, but mypy is not yet able to verify else: sorter = group_index.argsort(kind="mergesort") return ensure_platform_int(sorter) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index da438826a939a..f685680515a8f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1717,9 +1717,9 @@ def test_groupby_categorical_indices_unused_categories(): grouped = df.groupby("key", sort=False) result = grouped.indices expected = { - "b": np.array([0, 1], dtype="intp"), - "a": np.array([2], dtype="intp"), - "c": np.array([], dtype="intp"), + "b": np.array([0, 1], dtype="int64"), + "a": np.array([2], dtype="int64"), + "c": np.array([], dtype="int64"), } assert result.keys() == expected.keys() for key in result.keys(): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c8df18ddaeebe..876df69ae7f63 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2123,19 +2123,19 @@ def test_groupsort_indexer(): # need to use a stable sort # np.argsort returns int, groupsort_indexer - # always returns intp + # always returns int64 expected = np.argsort(a, kind="mergesort") - expected = expected.astype(np.intp) + expected = expected.astype(np.int64) tm.assert_numpy_array_equal(result, expected) # compare with lexsort # np.lexsort returns int, groupsort_indexer - # always returns intp + # always returns int64 key = a * 1000 + b result = libalgos.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) - expected = expected.astype(np.intp) + expected = expected.astype(np.int64) tm.assert_numpy_array_equal(result, expected)