diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8b643c03b6a19..5352ca53e1b54 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -13,14 +13,16 @@ import numpy as np cimport numpy as cnp from numpy cimport ( - NPY_INT64, + NPY_INTP, int64_t, + intp_t, ndarray, ) cnp.import_array() from pandas._libs.algos import ensure_int64 + from pandas._libs.util cimport is_integer_object @@ -30,7 +32,7 @@ cdef class BlockPlacement: # __slots__ = '_as_slice', '_as_array', '_len' cdef: slice _as_slice - ndarray _as_array # Note: this still allows `None` + ndarray _as_array # Note: this still allows `None`; will be intp_t bint _has_slice, _has_array, _is_known_slice_like def __cinit__(self, val): @@ -53,12 +55,12 @@ cdef class BlockPlacement: self._as_slice = slc self._has_slice = True else: - arr = np.empty(0, dtype=np.int64) + arr = np.empty(0, dtype=np.intp) self._as_array = arr self._has_array = True else: # Cython memoryview interface requires ndarray to be writeable. 
- arr = np.require(val, dtype=np.int64, requirements='W') + arr = np.require(val, dtype=np.intp, requirements='W') assert arr.ndim == 1, arr.shape self._as_array = arr self._has_array = True @@ -125,8 +127,8 @@ cdef class BlockPlacement: if not self._has_array: start, stop, step, _ = slice_get_indices_ex(self._as_slice) # NOTE: this is the C-optimized equivalent of - # `np.arange(start, stop, step, dtype=np.int64)` - self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64) + # `np.arange(start, stop, step, dtype=np.intp)` + self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INTP) self._has_array = True return self._as_array @@ -325,13 +327,13 @@ cdef slice_getitem(slice slc, ind): else: # NOTE: # this is the C-optimized equivalent of - # `np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]` - return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INT64)[ind] + # `np.arange(s_start, s_stop, s_step, dtype=np.intp)[ind]` + return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind] @cython.boundscheck(False) @cython.wraparound(False) -cdef slice indexer_as_slice(int64_t[:] vals): +cdef slice indexer_as_slice(intp_t[:] vals): cdef: Py_ssize_t i, n, start, stop int64_t d diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1ff481553e413..fc3e1ecfb55c1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -451,7 +451,7 @@ def fast_zip(list ndarrays) -> ndarray[object]: return result -def get_reverse_indexer(const int64_t[:] indexer, Py_ssize_t length): +def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray: """ Reverse indexing operation. @@ -459,14 +459,25 @@ def get_reverse_indexer(const int64_t[:] indexer, Py_ssize_t length): indexer_inv[indexer[x]] = x - .. note:: If indexer is not unique, only first occurrence is accounted. 
+ Parameters + ---------- + indexer : np.ndarray[np.intp] + length : int + + Returns + ------- + np.ndarray[np.intp] + + Notes + ----- + If indexer is not unique, only first occurrence is accounted for. """ cdef: Py_ssize_t i, n = len(indexer) - ndarray[int64_t] rev_indexer - int64_t idx + ndarray[intp_t] rev_indexer + intp_t idx - rev_indexer = np.empty(length, dtype=np.int64) + rev_indexer = np.empty(length, dtype=np.intp) rev_indexer[:] = -1 for i in range(n): idx = indexer[i] @@ -808,23 +819,32 @@ def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner, @cython.boundscheck(False) @cython.wraparound(False) -def get_level_sorter(const int64_t[:] label, const int64_t[:] starts): +def get_level_sorter( + ndarray[int64_t, ndim=1] codes, const intp_t[:] starts +) -> ndarray: """ Argsort for a single level of a multi-index, keeping the order of higher levels unchanged. `starts` points to starts of same-key indices w.r.t to leading levels; equivalent to: - np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort') + np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort') + starts[i] for i in range(len(starts) - 1)]) + + Parameters + ---------- + codes : np.ndarray[int64_t, ndim=1] + starts : np.ndarray[np.intp, ndim=1] + + Returns + ------- + np.ndarray[np.intp, ndim=1] """ cdef: - int64_t l, r - Py_ssize_t i - ndarray[int64_t, ndim=1] out = np.empty(len(label), dtype=np.int64) - ndarray[int64_t, ndim=1] label_arr = np.asarray(label) + Py_ssize_t i, l, r + ndarray[intp_t, ndim=1] out = np.empty(len(codes), dtype=np.intp) for i in range(len(starts) - 1): l, r = starts[i], starts[i + 1] - out[l:r] = l + label_arr[l:r].argsort(kind='mergesort') + out[l:r] = l + codes[l:r].argsort(kind='mergesort') return out diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 74e96015b4544..a222a8cc464fb 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -991,10 +991,10 @@ def __init__(self, data: FrameOrSeries,
labels, ngroups: int, axis: int = 0): @cache_readonly def slabels(self): # Sorted labels - return algorithms.take_nd(self.labels, self.sort_idx, allow_fill=False) + return algorithms.take_nd(self.labels, self._sort_idx, allow_fill=False) @cache_readonly - def sort_idx(self): + def _sort_idx(self) -> np.ndarray: # np.ndarray[np.intp] # Counting sort indexer return get_group_index_sorter(self.labels, self.ngroups) @@ -1013,7 +1013,7 @@ def __iter__(self): @cache_readonly def sorted_data(self) -> FrameOrSeries: - return self.data.take(self.sort_idx, axis=self.axis) + return self.data.take(self._sort_idx, axis=self.axis) def _chop(self, sdata, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3a468758ab3fd..a5c0a5c6694e5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4135,13 +4135,22 @@ def _join_level( """ from pandas.core.indexes.multi import MultiIndex - def _get_leaf_sorter(labels): + def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray: """ Returns sorter for the inner most level while preserving the order of higher levels. + + Parameters + ---------- + labels : list[np.ndarray] + Each ndarray has signed integer dtype, not necessarily identical. 
+ + Returns + ------- + np.ndarray[np.intp] """ if labels[0].size == 0: - return np.empty(0, dtype="int64") + return np.empty(0, dtype=np.intp) if len(labels) == 1: return get_group_index_sorter(labels[0]) @@ -4154,7 +4163,7 @@ def _get_leaf_sorter(labels): starts = np.hstack(([True], tic, [True])).nonzero()[0] lab = ensure_int64(labels[-1]) - return lib.get_level_sorter(lab, ensure_int64(starts)) + return lib.get_level_sorter(lab, ensure_platform_int(starts)) if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): raise TypeError("Join on level between two MultiIndex objects is ambiguous") @@ -4189,12 +4198,12 @@ def _get_leaf_sorter(labels): join_index = left[left_indexer] else: - left_lev_indexer = ensure_int64(left_lev_indexer) + left_lev_indexer = ensure_platform_int(left_lev_indexer) rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) old_codes = left.codes[level] - new_lev_codes = algos.take_nd( - rev_indexer, old_codes[old_codes != -1], allow_fill=False - ) + + taker = old_codes[old_codes != -1] + new_lev_codes = rev_indexer.take(taker) new_codes = list(left.codes) new_codes[level] = new_lev_codes @@ -4204,6 +4213,7 @@ def _get_leaf_sorter(labels): if keep_order: # just drop missing values. o.w. keep order left_indexer = np.arange(len(left), dtype=np.intp) + left_indexer = cast(np.ndarray, left_indexer) mask = new_lev_codes != -1 if not mask.all(): new_codes = [lab[mask] for lab in new_codes] @@ -4213,11 +4223,12 @@ def _get_leaf_sorter(labels): if level == 0: # outer most level, take the fast route ngroups = 1 + new_lev_codes.max() left_indexer, counts = libalgos.groupsort_indexer( - new_lev_codes, ngroups + ensure_int64(new_lev_codes), ngroups ) # missing values are placed first; drop them! 
- left_indexer = left_indexer[counts[0] :] + # error: Value of type "Optional[ndarray]" is not indexable + left_indexer = left_indexer[counts[0] :] # type: ignore[index] new_codes = [lab[left_indexer] for lab in new_codes] else: # sort the leaves diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 97492f35232e3..31aa5e301d17c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1952,7 +1952,7 @@ def _sort_levels_monotonic(self) -> MultiIndex: lev = lev.take(indexer) # indexer to reorder the level codes - indexer = ensure_int64(indexer) + indexer = ensure_platform_int(indexer) ri = lib.get_reverse_indexer(indexer, len(indexer)) level_codes = algos.take_nd(ri, level_codes) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6a0286b1c40ef..613669b8cc1d8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -145,7 +145,7 @@ def _indexer_and_to_sort(self): ngroups = len(obs_ids) indexer = get_group_index_sorter(comp_index, ngroups) - + indexer = ensure_platform_int(indexer) return indexer, to_sort @cache_readonly diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 720643d3d98aa..10c13327c79d3 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -582,6 +582,16 @@ def get_group_index_sorter( Both algorithms are `stable` sort and that is necessary for correctness of groupby operations. e.g. 
consider: df.groupby(key)[col].transform('first') + + Parameters + ---------- + group_index : np.ndarray + signed integer dtype + ngroups : int or None, default None + + Returns + ------- + np.ndarray[np.intp] """ if ngroups is None: # error: Incompatible types in assignment (expression has type "number[Any]", @@ -596,9 +606,9 @@ def get_group_index_sorter( ) if do_groupsort: sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups) - return ensure_platform_int(sorter) else: - return group_index.argsort(kind="mergesort") + sorter = group_index.argsort(kind="mergesort") + return ensure_platform_int(sorter) def compress_group_index(group_index, sort: bool = True): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index c63d5271f1fae..ef1c3ec0c2860 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -259,7 +259,7 @@ def _check(blk): def test_mgr_locs(self): assert isinstance(self.fblock.mgr_locs, BlockPlacement) tm.assert_numpy_array_equal( - self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.int64) + self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.intp) ) def test_attrs(self): @@ -277,7 +277,7 @@ def test_delete(self): newb.delete(0) assert isinstance(newb.mgr_locs, BlockPlacement) tm.assert_numpy_array_equal( - newb.mgr_locs.as_array, np.array([2, 4], dtype=np.int64) + newb.mgr_locs.as_array, np.array([2, 4], dtype=np.intp) ) assert (newb.values[0] == 1).all() @@ -285,14 +285,14 @@ def test_delete(self): newb.delete(1) assert isinstance(newb.mgr_locs, BlockPlacement) tm.assert_numpy_array_equal( - newb.mgr_locs.as_array, np.array([0, 4], dtype=np.int64) + newb.mgr_locs.as_array, np.array([0, 4], dtype=np.intp) ) assert (newb.values[1] == 2).all() newb = self.fblock.copy() newb.delete(2) tm.assert_numpy_array_equal( - newb.mgr_locs.as_array, np.array([0, 2], dtype=np.int64) + newb.mgr_locs.as_array, np.array([0, 2], dtype=np.intp) ) 
assert (newb.values[1] == 1).all() @@ -665,7 +665,7 @@ def test_consolidate_ordering_issues(self, mgr): assert cons.nblocks == 1 assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement) tm.assert_numpy_array_equal( - cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.int64) + cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.intp) ) def test_reindex_items(self): @@ -1095,7 +1095,7 @@ def test_slice_iter(self, slc, expected): ) def test_slice_to_array_conversion(self, slc, arr): tm.assert_numpy_array_equal( - BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.int64) + BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.intp) ) def test_blockplacement_add(self): diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 60c42878497c2..0532de9998c5f 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -197,9 +197,9 @@ def test_maybe_booleans_to_slice(self): assert result == slice(0, 0) def test_get_reverse_indexer(self): - indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64) + indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.intp) result = lib.get_reverse_indexer(indexer, 5) - expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) + expected = np.array([4, 2, 3, 6, 7], dtype=np.intp) tm.assert_numpy_array_equal(result, expected)