diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi new file mode 100644 index 0000000000000..979619c3d14c4 --- /dev/null +++ b/pandas/_libs/index.pyi @@ -0,0 +1,86 @@ +import numpy as np + +class IndexEngine: + over_size_threshold: bool + + def __init__(self, vgetter, n: int): ... + + def __contains__(self, val: object) -> bool: ... + + # -> int | slice | np.ndarray[bool] + def get_loc(self, val: object) -> int | slice | np.ndarray: ... + + def sizeof(self, deep: bool = False) -> int: ... + def __sizeof__(self) -> int: ... + + @property + def is_unique(self) -> bool: ... + + @property + def is_monotonic_increasing(self) -> bool: ... + + @property + def is_monotonic_decreasing(self) -> bool: ... + + def get_backfill_indexer(self, other: np.ndarray, limit: int | None =...) -> np.ndarray: ... + def get_pad_indexer(self, other: np.ndarray, limit: int | None =...) -> np.ndarray: ... + + @property + def is_mapping_populated(self) -> bool: ... + + def clear_mapping(self): ... + def get_indexer(self, values: np.ndarray) -> np.ndarray: ... # np.ndarray[np.intp] + def get_indexer_non_unique( + self, + targets: np.ndarray, + ) -> tuple[ + np.ndarray, # np.ndarray[np.intp] + np.ndarray, # np.ndarray[np.intp] + ]: ... + + +class Float64Engine(IndexEngine): ... +class Float32Engine(IndexEngine): ... + +class Int64Engine(IndexEngine): ... +class Int32Engine(IndexEngine): ... +class Int16Engine(IndexEngine): ... +class Int8Engine(IndexEngine): ... + +class UInt64Engine(IndexEngine): ... +class UInt32Engine(IndexEngine): ... +class UInt16Engine(IndexEngine): ... +class UInt8Engine(IndexEngine): ... + +class ObjectEngine(IndexEngine): ... + +class DatetimeEngine(Int64Engine): ... +class TimedeltaEngine(DatetimeEngine): ... +class PeriodEngine(Int64Engine): ... 
+ + +class BaseMultiIndexCodesEngine: + levels: list[np.ndarray] + offsets: np.ndarray # ndarray[uint64_t, ndim=1] + + def __init__( + self, + levels: list[np.ndarray], # all entries hashable + labels: list[np.ndarray], # all entries integer-dtyped + offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] + ): ... + + def get_indexer( + self, + target: np.ndarray, # np.ndarray[object] + ) -> np.ndarray: ... # np.ndarray[np.intp] + + def _extract_level_codes(self, target: object): ... + + def get_indexer_with_fill( + self, + target: np.ndarray, # np.ndarray[object] of tuples + values: np.ndarray, # np.ndarray[object] of tuples + method: str, + limit: int | None, + ) -> np.ndarray: ... # np.ndarray[np.intp] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 71f4b0c0ae18f..47e6d417bb925 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -259,7 +259,7 @@ cdef class IndexEngine: self.monotonic_inc = 0 self.monotonic_dec = 0 - def get_indexer(self, ndarray values): + def get_indexer(self, ndarray values) -> np.ndarray: self._ensure_mapping_populated() return self.mapping.lookup(values) @@ -269,6 +269,11 @@ cdef class IndexEngine: return the labels in the same order as the target and a missing indexer into the targets (which correspond to the -1 indices in the results + + Returns + ------- + indexer : np.ndarray[np.intp] + missing : np.ndarray[np.intp] """ cdef: ndarray values, x @@ -455,7 +460,7 @@ cdef class DatetimeEngine(Int64Engine): # we may get datetime64[ns] or timedelta64[ns], cast these to int64 return super().get_indexer_non_unique(targets.view("i8")) - def get_indexer(self, ndarray values): + def get_indexer(self, ndarray values) -> np.ndarray: self._ensure_mapping_populated() if values.dtype != self._get_box_dtype(): return np.repeat(-1, len(values)).astype(np.intp) @@ -572,17 +577,17 @@ cdef class BaseMultiIndexCodesEngine: # integers representing labels: we will use its get_loc and get_indexer self._base.__init__(self, lambda: 
lab_ints, len(lab_ints)) - def _codes_to_ints(self, codes): + def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray: raise NotImplementedError("Implemented by subclass") - def _extract_level_codes(self, object target): + def _extract_level_codes(self, ndarray[object] target) -> np.ndarray: """ Map the requested list of (tuple) keys to their integer representations for searching in the underlying integer index. Parameters ---------- - target : list-like of keys + target : ndarray[object] Each key is a tuple, with a label for each level of the index. Returns @@ -607,7 +612,7 @@ cdef class BaseMultiIndexCodesEngine: Returns ------- - np.ndarray[int64_t, ndim=1] of the indexer of `target` into + np.ndarray[intp_t, ndim=1] of the indexer of `target` into `self.values` """ lab_ints = self._extract_level_codes(target) @@ -635,7 +640,7 @@ cdef class BaseMultiIndexCodesEngine: the same as the length of all tuples in `values` values : ndarray[object] of tuples must be sorted and all have the same length. Should be the set of - the MultiIndex's values. Needed only if `method` is not None + the MultiIndex's values. method: string "backfill" or "pad" limit: int or None @@ -643,7 +648,7 @@ cdef class BaseMultiIndexCodesEngine: Returns ------- - np.ndarray[int64_t, ndim=1] of the indexer of `target` into `values`, + np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`, filled with the `method` (and optionally `limit`) specified """ assert method in ("backfill", "pad") @@ -714,9 +719,7 @@ cdef class BaseMultiIndexCodesEngine: return self._base.get_loc(self, lab_int) - def get_indexer_non_unique(self, ndarray target): - # This needs to be overridden just because the default one works on - # target._values, and target can be itself a MultiIndex. 
+ def get_indexer_non_unique(self, ndarray[object] target): lab_ints = self._extract_level_codes(target) indexer = self._base.get_indexer_non_unique(self, lab_ints) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 67cd6c63c1faa..0a2893ac49a49 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -405,12 +405,7 @@ def _cmp_method(self, other, op): _str_na_value = StringDtype.na_value def _str_map(self, f, na_value=None, dtype: Optional[Dtype] = None): - from pandas.arrays import ( - BooleanArray, - IntegerArray, - StringArray, - ) - from pandas.core.arrays.string_ import StringDtype + from pandas.arrays import BooleanArray if dtype is None: dtype = StringDtype() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fc4eeebc86642..e85d09a479d16 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -320,7 +320,7 @@ def _outer_indexer( # would we like our indexing holder to defer to us _defer_to_indexing = False - _engine_type = libindex.ObjectEngine + _engine_type: Type[libindex.IndexEngine] = libindex.ObjectEngine # whether we support partial string indexing. Overridden # in DatetimeIndex and PeriodIndex _supports_partial_string_indexing = False @@ -723,8 +723,8 @@ def _cleanup(self) -> None: self._engine.clear_mapping() @cache_readonly - def _engine(self) -> libindex.ObjectEngine: - # property, for now, slow to look up + def _engine(self) -> libindex.IndexEngine: + # For base class (object dtype) we get ObjectEngine # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. 
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e194148f0fc24..bac00b2399121 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -7,6 +7,7 @@ Any, List, Optional, + Sequence, Tuple, TypeVar, Union, @@ -536,7 +537,7 @@ def shift(self: _T, periods: int = 1, freq=None) -> _T: # -------------------------------------------------------------------- # List-like Methods - def _get_delete_freq(self, loc: int): + def _get_delete_freq(self, loc: Union[int, slice, Sequence[int]]): """ Find the `freq` for self.delete(loc). """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fedb955ce83b9..9751e12c373cd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -557,7 +557,7 @@ def from_tuples( arrays = [[]] * len(names) elif isinstance(tuples, (np.ndarray, Index)): if isinstance(tuples, Index): - tuples = tuples._values + tuples = np.asarray(tuples._values) arrays = list(lib.tuples_to_object_array(tuples).T) elif isinstance(tuples, list): @@ -2689,11 +2689,16 @@ def _get_indexer( target, method=method, limit=limit, tolerance=tolerance ) + # TODO: explicitly raise here? we only have one test that + # gets here, and it is checking that we raise with method="nearest" + if method == "pad" or method == "backfill": if tolerance is not None: raise NotImplementedError( "tolerance not implemented yet for MultiIndex" ) + # TODO: get_indexer_with_fill docstring says values must be _sorted_ + # but that doesn't appear to be enforced indexer = self._engine.get_indexer_with_fill( target=target._values, values=self._values, method=method, limit=limit ) @@ -2705,6 +2710,8 @@ def _get_indexer( else: indexer = self._engine.get_indexer(target._values) + # Note: we only get here (in extant tests at least) with + # target.nlevels == self.nlevels return ensure_platform_int(indexer) def get_slice_bound(