Skip to content

TYP: get_reverse_indexer, get_group_index_sorter #40476

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,16 @@ import numpy as np

cimport numpy as cnp
from numpy cimport (
NPY_INT64,
NPY_INTP,
int64_t,
intp_t,
ndarray,
)

cnp.import_array()

from pandas._libs.algos import ensure_int64

from pandas._libs.util cimport is_integer_object


Expand All @@ -30,7 +32,7 @@ cdef class BlockPlacement:
# __slots__ = '_as_slice', '_as_array', '_len'
cdef:
slice _as_slice
ndarray _as_array # Note: this still allows `None`
ndarray _as_array # Note: this still allows `None`; will be intp_t
bint _has_slice, _has_array, _is_known_slice_like

def __cinit__(self, val):
Expand All @@ -53,12 +55,12 @@ cdef class BlockPlacement:
self._as_slice = slc
self._has_slice = True
else:
arr = np.empty(0, dtype=np.int64)
arr = np.empty(0, dtype=np.intp)
self._as_array = arr
self._has_array = True
else:
# Cython memoryview interface requires ndarray to be writeable.
arr = np.require(val, dtype=np.int64, requirements='W')
arr = np.require(val, dtype=np.intp, requirements='W')
assert arr.ndim == 1, arr.shape
self._as_array = arr
self._has_array = True
Expand Down Expand Up @@ -125,8 +127,8 @@ cdef class BlockPlacement:
if not self._has_array:
start, stop, step, _ = slice_get_indices_ex(self._as_slice)
# NOTE: this is the C-optimized equivalent of
# `np.arange(start, stop, step, dtype=np.int64)`
self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64)
# `np.arange(start, stop, step, dtype=np.intp)`
self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INTP)
self._has_array = True

return self._as_array
Expand Down Expand Up @@ -325,13 +327,13 @@ cdef slice_getitem(slice slc, ind):
else:
# NOTE:
# this is the C-optimized equivalent of
# `np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]`
return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INT64)[ind]
# `np.arange(s_start, s_stop, s_step, dtype=np.intp)[ind]`
return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind]


@cython.boundscheck(False)
@cython.wraparound(False)
cdef slice indexer_as_slice(int64_t[:] vals):
cdef slice indexer_as_slice(intp_t[:] vals):
cdef:
Py_ssize_t i, n, start, stop
int64_t d
Expand Down
44 changes: 32 additions & 12 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -451,22 +451,33 @@ def fast_zip(list ndarrays) -> ndarray[object]:
return result


def get_reverse_indexer(const int64_t[:] indexer, Py_ssize_t length):
def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
"""
Reverse indexing operation.

Given `indexer`, make `indexer_inv` of it, such that::

indexer_inv[indexer[x]] = x

.. note:: If indexer is not unique, only first occurrence is accounted.
Parameters
----------
indexer : np.ndarray[np.intp]
length : int

Returns
-------
np.ndarray[np.intp]

Notes
-----
    If indexer is not unique, only the first occurrence is accounted for.
"""
cdef:
Py_ssize_t i, n = len(indexer)
ndarray[int64_t] rev_indexer
int64_t idx
ndarray[intp_t] rev_indexer
intp_t idx

rev_indexer = np.empty(length, dtype=np.int64)
rev_indexer = np.empty(length, dtype=np.intp)
rev_indexer[:] = -1
for i in range(n):
idx = indexer[i]
Expand Down Expand Up @@ -808,23 +819,32 @@ def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner,

@cython.boundscheck(False)
@cython.wraparound(False)
def get_level_sorter(const int64_t[:] label, const int64_t[:] starts):
def get_level_sorter(
ndarray[int64_t, ndim=1] codes, const intp_t[:] starts
) -> ndarray:
"""
Argsort for a single level of a multi-index, keeping the order of higher
levels unchanged. `starts` points to starts of same-key indices w.r.t
to leading levels; equivalent to:
np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort')
np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort')
+ starts[i] for i in range(len(starts) - 1)])

Parameters
----------
codes : np.ndarray[int64_t, ndim=1]
starts : np.ndarray[intp, ndim=1]

Returns
-------
    np.ndarray[np.intp, ndim=1]
"""
cdef:
int64_t l, r
Py_ssize_t i
ndarray[int64_t, ndim=1] out = np.empty(len(label), dtype=np.int64)
ndarray[int64_t, ndim=1] label_arr = np.asarray(label)
Py_ssize_t i, l, r
ndarray[intp_t, ndim=1] out = np.empty(len(codes), dtype=np.intp)

for i in range(len(starts) - 1):
l, r = starts[i], starts[i + 1]
out[l:r] = l + label_arr[l:r].argsort(kind='mergesort')
out[l:r] = l + codes[l:r].argsort(kind='mergesort')

return out

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -991,10 +991,10 @@ def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0):
@cache_readonly
def slabels(self):
# Sorted labels
return algorithms.take_nd(self.labels, self.sort_idx, allow_fill=False)
return algorithms.take_nd(self.labels, self._sort_idx, allow_fill=False)

@cache_readonly
def sort_idx(self):
def _sort_idx(self) -> np.ndarray: # np.ndarray[np.intp]
# Counting sort indexer
return get_group_index_sorter(self.labels, self.ngroups)

Expand All @@ -1013,7 +1013,7 @@ def __iter__(self):

@cache_readonly
def sorted_data(self) -> FrameOrSeries:
return self.data.take(self.sort_idx, axis=self.axis)
return self.data.take(self._sort_idx, axis=self.axis)

def _chop(self, sdata, slice_obj: slice) -> NDFrame:
raise AbstractMethodError(self)
Expand Down
29 changes: 20 additions & 9 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4135,13 +4135,22 @@ def _join_level(
"""
from pandas.core.indexes.multi import MultiIndex

def _get_leaf_sorter(labels):
def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray:
"""
Returns sorter for the inner most level while preserving the
order of higher levels.

Parameters
----------
labels : list[np.ndarray]
Each ndarray has signed integer dtype, not necessarily identical.

Returns
-------
np.ndarray[np.intp]
"""
if labels[0].size == 0:
return np.empty(0, dtype="int64")
return np.empty(0, dtype=np.intp)

if len(labels) == 1:
return get_group_index_sorter(labels[0])
Expand All @@ -4154,7 +4163,7 @@ def _get_leaf_sorter(labels):

starts = np.hstack(([True], tic, [True])).nonzero()[0]
lab = ensure_int64(labels[-1])
return lib.get_level_sorter(lab, ensure_int64(starts))
return lib.get_level_sorter(lab, ensure_platform_int(starts))

if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
raise TypeError("Join on level between two MultiIndex objects is ambiguous")
Expand Down Expand Up @@ -4189,12 +4198,12 @@ def _get_leaf_sorter(labels):
join_index = left[left_indexer]

else:
left_lev_indexer = ensure_int64(left_lev_indexer)
left_lev_indexer = ensure_platform_int(left_lev_indexer)
rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level))
old_codes = left.codes[level]
new_lev_codes = algos.take_nd(
rev_indexer, old_codes[old_codes != -1], allow_fill=False
)

taker = old_codes[old_codes != -1]
new_lev_codes = rev_indexer.take(taker)

new_codes = list(left.codes)
new_codes[level] = new_lev_codes
Expand All @@ -4204,6 +4213,7 @@ def _get_leaf_sorter(labels):

if keep_order: # just drop missing values. o.w. keep order
left_indexer = np.arange(len(left), dtype=np.intp)
left_indexer = cast(np.ndarray, left_indexer)
mask = new_lev_codes != -1
if not mask.all():
new_codes = [lab[mask] for lab in new_codes]
Expand All @@ -4213,11 +4223,12 @@ def _get_leaf_sorter(labels):
if level == 0: # outer most level, take the fast route
ngroups = 1 + new_lev_codes.max()
left_indexer, counts = libalgos.groupsort_indexer(
new_lev_codes, ngroups
ensure_int64(new_lev_codes), ngroups
)

# missing values are placed first; drop them!
left_indexer = left_indexer[counts[0] :]
# error: Value of type "Optional[ndarray]" is not indexable
left_indexer = left_indexer[counts[0] :] # type: ignore[index]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you could maybe leave this one as a "fix later" ignore instead of a cast as we can fix ourselves by typing libalgos.groupsort_indexer (i.e. adding algos.pyi)

new_codes = [lab[left_indexer] for lab in new_codes]

else: # sort the leaves
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1952,7 +1952,7 @@ def _sort_levels_monotonic(self) -> MultiIndex:
lev = lev.take(indexer)

# indexer to reorder the level codes
indexer = ensure_int64(indexer)
indexer = ensure_platform_int(indexer)
ri = lib.get_reverse_indexer(indexer, len(indexer))
level_codes = algos.take_nd(ri, level_codes)

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def _indexer_and_to_sort(self):
ngroups = len(obs_ids)

indexer = get_group_index_sorter(comp_index, ngroups)

indexer = ensure_platform_int(indexer)
return indexer, to_sort

@cache_readonly
Expand Down
14 changes: 12 additions & 2 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,16 @@ def get_group_index_sorter(
Both algorithms are `stable` sort and that is necessary for correctness of
groupby operations. e.g. consider:
df.groupby(key)[col].transform('first')

Parameters
----------
group_index : np.ndarray
signed integer dtype
ngroups : int or None, default None

Returns
-------
np.ndarray[np.intp]
"""
if ngroups is None:
# error: Incompatible types in assignment (expression has type "number[Any]",
Expand All @@ -596,9 +606,9 @@ def get_group_index_sorter(
)
if do_groupsort:
sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
return ensure_platform_int(sorter)
else:
return group_index.argsort(kind="mergesort")
sorter = group_index.argsort(kind="mergesort")
return ensure_platform_int(sorter)


def compress_group_index(group_index, sort: bool = True):
Expand Down
12 changes: 6 additions & 6 deletions pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def _check(blk):
def test_mgr_locs(self):
assert isinstance(self.fblock.mgr_locs, BlockPlacement)
tm.assert_numpy_array_equal(
self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.int64)
self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.intp)
)

def test_attrs(self):
Expand All @@ -277,22 +277,22 @@ def test_delete(self):
newb.delete(0)
assert isinstance(newb.mgr_locs, BlockPlacement)
tm.assert_numpy_array_equal(
newb.mgr_locs.as_array, np.array([2, 4], dtype=np.int64)
newb.mgr_locs.as_array, np.array([2, 4], dtype=np.intp)
)
assert (newb.values[0] == 1).all()

newb = self.fblock.copy()
newb.delete(1)
assert isinstance(newb.mgr_locs, BlockPlacement)
tm.assert_numpy_array_equal(
newb.mgr_locs.as_array, np.array([0, 4], dtype=np.int64)
newb.mgr_locs.as_array, np.array([0, 4], dtype=np.intp)
)
assert (newb.values[1] == 2).all()

newb = self.fblock.copy()
newb.delete(2)
tm.assert_numpy_array_equal(
newb.mgr_locs.as_array, np.array([0, 2], dtype=np.int64)
newb.mgr_locs.as_array, np.array([0, 2], dtype=np.intp)
)
assert (newb.values[1] == 1).all()

Expand Down Expand Up @@ -665,7 +665,7 @@ def test_consolidate_ordering_issues(self, mgr):
assert cons.nblocks == 1
assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement)
tm.assert_numpy_array_equal(
cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.int64)
cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.intp)
)

def test_reindex_items(self):
Expand Down Expand Up @@ -1095,7 +1095,7 @@ def test_slice_iter(self, slc, expected):
)
def test_slice_to_array_conversion(self, slc, arr):
tm.assert_numpy_array_equal(
BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.int64)
BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.intp)
)

def test_blockplacement_add(self):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/libs/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,9 @@ def test_maybe_booleans_to_slice(self):
assert result == slice(0, 0)

def test_get_reverse_indexer(self):
indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64)
indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.intp)
result = lib.get_reverse_indexer(indexer, 5)
expected = np.array([4, 2, 3, 6, 7], dtype=np.int64)
expected = np.array([4, 2, 3, 6, 7], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)


Expand Down