Skip to content

Commit b519386

Browse files
authored
TYP: get_reverse_indexer, get_group_index_sorter (#40476)
1 parent b524462 commit b519386

File tree

9 files changed

+88
-45
lines changed

9 files changed

+88
-45
lines changed

pandas/_libs/internals.pyx

+11-9
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,16 @@ import numpy as np
1313

1414
cimport numpy as cnp
1515
from numpy cimport (
16-
NPY_INT64,
16+
NPY_INTP,
1717
int64_t,
18+
intp_t,
1819
ndarray,
1920
)
2021

2122
cnp.import_array()
2223

2324
from pandas._libs.algos import ensure_int64
25+
2426
from pandas._libs.util cimport is_integer_object
2527

2628

@@ -30,7 +32,7 @@ cdef class BlockPlacement:
3032
# __slots__ = '_as_slice', '_as_array', '_len'
3133
cdef:
3234
slice _as_slice
33-
ndarray _as_array # Note: this still allows `None`
35+
ndarray _as_array # Note: this still allows `None`; will be intp_t
3436
bint _has_slice, _has_array, _is_known_slice_like
3537

3638
def __cinit__(self, val):
@@ -53,12 +55,12 @@ cdef class BlockPlacement:
5355
self._as_slice = slc
5456
self._has_slice = True
5557
else:
56-
arr = np.empty(0, dtype=np.int64)
58+
arr = np.empty(0, dtype=np.intp)
5759
self._as_array = arr
5860
self._has_array = True
5961
else:
6062
# Cython memoryview interface requires ndarray to be writeable.
61-
arr = np.require(val, dtype=np.int64, requirements='W')
63+
arr = np.require(val, dtype=np.intp, requirements='W')
6264
assert arr.ndim == 1, arr.shape
6365
self._as_array = arr
6466
self._has_array = True
@@ -125,8 +127,8 @@ cdef class BlockPlacement:
125127
if not self._has_array:
126128
start, stop, step, _ = slice_get_indices_ex(self._as_slice)
127129
# NOTE: this is the C-optimized equivalent of
128-
# `np.arange(start, stop, step, dtype=np.int64)`
129-
self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64)
130+
# `np.arange(start, stop, step, dtype=np.intp)`
131+
self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INTP)
130132
self._has_array = True
131133

132134
return self._as_array
@@ -325,13 +327,13 @@ cdef slice_getitem(slice slc, ind):
325327
else:
326328
# NOTE:
327329
# this is the C-optimized equivalent of
328-
# `np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]`
329-
return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INT64)[ind]
330+
# `np.arange(s_start, s_stop, s_step, dtype=np.intp)[ind]`
331+
return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind]
330332

331333

332334
@cython.boundscheck(False)
333335
@cython.wraparound(False)
334-
cdef slice indexer_as_slice(int64_t[:] vals):
336+
cdef slice indexer_as_slice(intp_t[:] vals):
335337
cdef:
336338
Py_ssize_t i, n, start, stop
337339
int64_t d

pandas/_libs/lib.pyx

+32-12
Original file line numberDiff line numberDiff line change
@@ -451,22 +451,33 @@ def fast_zip(list ndarrays) -> ndarray[object]:
451451
return result
452452

453453

454-
def get_reverse_indexer(const int64_t[:] indexer, Py_ssize_t length):
454+
def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
455455
"""
456456
Reverse indexing operation.
457457

458458
Given `indexer`, make `indexer_inv` of it, such that::
459459

460460
indexer_inv[indexer[x]] = x
461461

462-
.. note:: If indexer is not unique, only first occurrence is accounted.
462+
Parameters
463+
----------
464+
indexer : np.ndarray[np.intp]
465+
length : int
466+
467+
Returns
468+
-------
469+
np.ndarray[np.intp]
470+
471+
Notes
472+
-----
473+
If indexer is not unique, only the first occurrence is accounted for.
463474
"""
464475
cdef:
465476
Py_ssize_t i, n = len(indexer)
466-
ndarray[int64_t] rev_indexer
467-
int64_t idx
477+
ndarray[intp_t] rev_indexer
478+
intp_t idx
468479

469-
rev_indexer = np.empty(length, dtype=np.int64)
480+
rev_indexer = np.empty(length, dtype=np.intp)
470481
rev_indexer[:] = -1
471482
for i in range(n):
472483
idx = indexer[i]
@@ -808,23 +819,32 @@ def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner,
808819

809820
@cython.boundscheck(False)
810821
@cython.wraparound(False)
811-
def get_level_sorter(const int64_t[:] label, const int64_t[:] starts):
822+
def get_level_sorter(
823+
ndarray[int64_t, ndim=1] codes, const intp_t[:] starts
824+
) -> ndarray:
812825
"""
813826
Argsort for a single level of a multi-index, keeping the order of higher
814827
levels unchanged. `starts` points to starts of same-key indices with respect
815828
to leading levels; equivalent to:
816-
np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort')
829+
np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort')
817830
+ starts[i] for i in range(len(starts) - 1)])
831+
832+
Parameters
833+
----------
834+
codes : np.ndarray[np.int64, ndim=1]
835+
starts : np.ndarray[np.intp, ndim=1]
836+
837+
Returns
838+
-------
839+
np.ndarray[np.intp, ndim=1]
818840
"""
819841
cdef:
820-
int64_t l, r
821-
Py_ssize_t i
822-
ndarray[int64_t, ndim=1] out = np.empty(len(label), dtype=np.int64)
823-
ndarray[int64_t, ndim=1] label_arr = np.asarray(label)
842+
Py_ssize_t i, l, r
843+
ndarray[intp_t, ndim=1] out = np.empty(len(codes), dtype=np.intp)
824844

825845
for i in range(len(starts) - 1):
826846
l, r = starts[i], starts[i + 1]
827-
out[l:r] = l + label_arr[l:r].argsort(kind='mergesort')
847+
out[l:r] = l + codes[l:r].argsort(kind='mergesort')
828848

829849
return out
830850

pandas/core/groupby/ops.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -991,10 +991,10 @@ def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0):
991991
@cache_readonly
992992
def slabels(self):
993993
# Sorted labels
994-
return algorithms.take_nd(self.labels, self.sort_idx, allow_fill=False)
994+
return algorithms.take_nd(self.labels, self._sort_idx, allow_fill=False)
995995

996996
@cache_readonly
997-
def sort_idx(self):
997+
def _sort_idx(self) -> np.ndarray: # np.ndarray[np.intp]
998998
# Counting sort indexer
999999
return get_group_index_sorter(self.labels, self.ngroups)
10001000

@@ -1013,7 +1013,7 @@ def __iter__(self):
10131013

10141014
@cache_readonly
10151015
def sorted_data(self) -> FrameOrSeries:
1016-
return self.data.take(self.sort_idx, axis=self.axis)
1016+
return self.data.take(self._sort_idx, axis=self.axis)
10171017

10181018
def _chop(self, sdata, slice_obj: slice) -> NDFrame:
10191019
raise AbstractMethodError(self)

pandas/core/indexes/base.py

+20-9
Original file line numberDiff line numberDiff line change
@@ -4135,13 +4135,22 @@ def _join_level(
41354135
"""
41364136
from pandas.core.indexes.multi import MultiIndex
41374137

4138-
def _get_leaf_sorter(labels):
4138+
def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray:
41394139
"""
41404140
Returns sorter for the innermost level while preserving the
41414141
order of higher levels.
4142+
4143+
Parameters
4144+
----------
4145+
labels : list[np.ndarray]
4146+
Each ndarray has signed integer dtype, not necessarily identical.
4147+
4148+
Returns
4149+
-------
4150+
np.ndarray[np.intp]
41424151
"""
41434152
if labels[0].size == 0:
4144-
return np.empty(0, dtype="int64")
4153+
return np.empty(0, dtype=np.intp)
41454154

41464155
if len(labels) == 1:
41474156
return get_group_index_sorter(labels[0])
@@ -4154,7 +4163,7 @@ def _get_leaf_sorter(labels):
41544163

41554164
starts = np.hstack(([True], tic, [True])).nonzero()[0]
41564165
lab = ensure_int64(labels[-1])
4157-
return lib.get_level_sorter(lab, ensure_int64(starts))
4166+
return lib.get_level_sorter(lab, ensure_platform_int(starts))
41584167

41594168
if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
41604169
raise TypeError("Join on level between two MultiIndex objects is ambiguous")
@@ -4189,12 +4198,12 @@ def _get_leaf_sorter(labels):
41894198
join_index = left[left_indexer]
41904199

41914200
else:
4192-
left_lev_indexer = ensure_int64(left_lev_indexer)
4201+
left_lev_indexer = ensure_platform_int(left_lev_indexer)
41934202
rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level))
41944203
old_codes = left.codes[level]
4195-
new_lev_codes = algos.take_nd(
4196-
rev_indexer, old_codes[old_codes != -1], allow_fill=False
4197-
)
4204+
4205+
taker = old_codes[old_codes != -1]
4206+
new_lev_codes = rev_indexer.take(taker)
41984207

41994208
new_codes = list(left.codes)
42004209
new_codes[level] = new_lev_codes
@@ -4204,6 +4213,7 @@ def _get_leaf_sorter(labels):
42044213

42054214
if keep_order: # just drop missing values. o.w. keep order
42064215
left_indexer = np.arange(len(left), dtype=np.intp)
4216+
left_indexer = cast(np.ndarray, left_indexer)
42074217
mask = new_lev_codes != -1
42084218
if not mask.all():
42094219
new_codes = [lab[mask] for lab in new_codes]
@@ -4213,11 +4223,12 @@ def _get_leaf_sorter(labels):
42134223
if level == 0: # outermost level, take the fast route
42144224
ngroups = 1 + new_lev_codes.max()
42154225
left_indexer, counts = libalgos.groupsort_indexer(
4216-
new_lev_codes, ngroups
4226+
ensure_int64(new_lev_codes), ngroups
42174227
)
42184228

42194229
# missing values are placed first; drop them!
4220-
left_indexer = left_indexer[counts[0] :]
4230+
# error: Value of type "Optional[ndarray]" is not indexable
4231+
left_indexer = left_indexer[counts[0] :] # type: ignore[index]
42214232
new_codes = [lab[left_indexer] for lab in new_codes]
42224233

42234234
else: # sort the leaves

pandas/core/indexes/multi.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1952,7 +1952,7 @@ def _sort_levels_monotonic(self) -> MultiIndex:
19521952
lev = lev.take(indexer)
19531953

19541954
# indexer to reorder the level codes
1955-
indexer = ensure_int64(indexer)
1955+
indexer = ensure_platform_int(indexer)
19561956
ri = lib.get_reverse_indexer(indexer, len(indexer))
19571957
level_codes = algos.take_nd(ri, level_codes)
19581958

pandas/core/reshape/reshape.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def _indexer_and_to_sort(self):
145145
ngroups = len(obs_ids)
146146

147147
indexer = get_group_index_sorter(comp_index, ngroups)
148-
148+
indexer = ensure_platform_int(indexer)
149149
return indexer, to_sort
150150

151151
@cache_readonly

pandas/core/sorting.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,16 @@ def get_group_index_sorter(
582582
Both algorithms are stable sorts, which is necessary for correctness of
583583
groupby operations. e.g. consider:
584584
df.groupby(key)[col].transform('first')
585+
586+
Parameters
587+
----------
588+
group_index : np.ndarray
589+
signed integer dtype
590+
ngroups : int or None, default None
591+
592+
Returns
593+
-------
594+
np.ndarray[np.intp]
585595
"""
586596
if ngroups is None:
587597
# error: Incompatible types in assignment (expression has type "number[Any]",
@@ -596,9 +606,9 @@ def get_group_index_sorter(
596606
)
597607
if do_groupsort:
598608
sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
599-
return ensure_platform_int(sorter)
600609
else:
601-
return group_index.argsort(kind="mergesort")
610+
sorter = group_index.argsort(kind="mergesort")
611+
return ensure_platform_int(sorter)
602612

603613

604614
def compress_group_index(group_index, sort: bool = True):

pandas/tests/internals/test_internals.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ def _check(blk):
259259
def test_mgr_locs(self):
260260
assert isinstance(self.fblock.mgr_locs, BlockPlacement)
261261
tm.assert_numpy_array_equal(
262-
self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.int64)
262+
self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.intp)
263263
)
264264

265265
def test_attrs(self):
@@ -277,22 +277,22 @@ def test_delete(self):
277277
newb.delete(0)
278278
assert isinstance(newb.mgr_locs, BlockPlacement)
279279
tm.assert_numpy_array_equal(
280-
newb.mgr_locs.as_array, np.array([2, 4], dtype=np.int64)
280+
newb.mgr_locs.as_array, np.array([2, 4], dtype=np.intp)
281281
)
282282
assert (newb.values[0] == 1).all()
283283

284284
newb = self.fblock.copy()
285285
newb.delete(1)
286286
assert isinstance(newb.mgr_locs, BlockPlacement)
287287
tm.assert_numpy_array_equal(
288-
newb.mgr_locs.as_array, np.array([0, 4], dtype=np.int64)
288+
newb.mgr_locs.as_array, np.array([0, 4], dtype=np.intp)
289289
)
290290
assert (newb.values[1] == 2).all()
291291

292292
newb = self.fblock.copy()
293293
newb.delete(2)
294294
tm.assert_numpy_array_equal(
295-
newb.mgr_locs.as_array, np.array([0, 2], dtype=np.int64)
295+
newb.mgr_locs.as_array, np.array([0, 2], dtype=np.intp)
296296
)
297297
assert (newb.values[1] == 1).all()
298298

@@ -665,7 +665,7 @@ def test_consolidate_ordering_issues(self, mgr):
665665
assert cons.nblocks == 1
666666
assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement)
667667
tm.assert_numpy_array_equal(
668-
cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.int64)
668+
cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.intp)
669669
)
670670

671671
def test_reindex_items(self):
@@ -1095,7 +1095,7 @@ def test_slice_iter(self, slc, expected):
10951095
)
10961096
def test_slice_to_array_conversion(self, slc, arr):
10971097
tm.assert_numpy_array_equal(
1098-
BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.int64)
1098+
BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.intp)
10991099
)
11001100

11011101
def test_blockplacement_add(self):

pandas/tests/libs/test_lib.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -197,9 +197,9 @@ def test_maybe_booleans_to_slice(self):
197197
assert result == slice(0, 0)
198198

199199
def test_get_reverse_indexer(self):
200-
indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64)
200+
indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.intp)
201201
result = lib.get_reverse_indexer(indexer, 5)
202-
expected = np.array([4, 2, 3, 6, 7], dtype=np.int64)
202+
expected = np.array([4, 2, 3, 6, 7], dtype=np.intp)
203203
tm.assert_numpy_array_equal(result, expected)
204204

205205

0 commit comments

Comments
 (0)