From 24cc97e61a9edfbd1a42cb4e7d5ed68089dc792b Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Mar 2021 12:18:31 -0700 Subject: [PATCH 1/3] TYP: index.pyi --- pandas/_libs/index.pyi | 88 +++++++++++++++++++++++++++++ pandas/_libs/index.pyx | 34 +++++------ pandas/core/arrays/string_.py | 7 +-- pandas/core/groupby/ops.py | 10 +--- pandas/core/indexes/base.py | 6 +- pandas/core/indexes/datetimelike.py | 3 +- pandas/core/indexes/multi.py | 12 +++- pandas/core/strings/object_array.py | 3 +- 8 files changed, 127 insertions(+), 36 deletions(-) create mode 100644 pandas/_libs/index.pyi diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi new file mode 100644 index 0000000000000..0ea635f1af049 --- /dev/null +++ b/pandas/_libs/index.pyi @@ -0,0 +1,88 @@ +from typing import Optional + +import numpy as np + +class IndexEngine: + over_size_threshold: bool + + def __init__(self, vgetter, n: int): ... + + def __contains__(self, val: object) -> bool: ... + + # -> int | slice | np.ndarray[bool] + def get_loc(self, val: object) -> int | slice | np.ndarray: ... + + def sizeof(self, deep: bool = False) -> int: ... + def __sizeof__(self) -> int: ... + + @property + def is_unique(self) -> bool: ... + + @property + def is_monotonic_increasing(self) -> bool: ... + + @property + def is_monotonic_decreasing(self) -> bool: ... + + def get_backfill_indexer(self, other: np.ndarray, limit: int | None =...) -> np.ndarray: ... + def get_pad_indexer(self, other: np.ndarray, limit: int | None =...) -> np.ndarray: ... + + @property + def is_mapping_populated(self) -> bool: ... + + def clear_mapping(self): ... + def get_indexer(self, values: np.ndarray) -> np.ndarray: ... # np.ndarray[np.intp] + def get_indexer_non_unique( + self, + targets: np.ndarray, + ) -> tuple[ + np.ndarray, # np.ndarray[np.intp] + np.ndarray, # np.ndarray[np.intp] + ]: ... + + +class Float64Engine(IndexEngine): ... +class Float32Engine(IndexEngine): ... + +class Int64Engine(IndexEngine): ... +class Int32Engine(IndexEngine): ... +class Int16Engine(IndexEngine): ... +class Int8Engine(IndexEngine): ... + +class UInt64Engine(IndexEngine): ... +class UInt32Engine(IndexEngine): ... +class UInt16Engine(IndexEngine): ... +class UInt8Engine(IndexEngine): ... + +class ObjectEngine(IndexEngine): ... + +class DatetimeEngine(Int64Engine): ... +class TimedeltaEngine(DatetimeEngine): ... +class PeriodEngine(Int64Engine): ... + + +class BaseMultiIndexCodesEngine: + levels: list[np.ndarray] + offsets: np.ndarray # ndarray[uint64_t, ndim=1] + + def __init__( + self, + levels: list[np.ndarray], # all entries hashable + labels: list[np.ndarray], # all entries integer-dtyped + offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] + ): ... + + def get_indexer( + self, + target: np.ndarray, # np.ndarray[object] + ) -> np.ndarray: ... # np.ndarray[np.intp] + + def _extract_level_codes(self, target: object): ... + + def get_indexer_with_fill( + self, + target: np.ndarray, # np.ndarray[object] of tuples + values: np.ndarray, # np.ndarray[object] of tuples + method: str, + limit: int | None, + ) -> np.ndarray: ... # np.ndarray[np.int64] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 9159fa03c12c0..2525c4d7e90bc 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -259,7 +259,7 @@ cdef class IndexEngine: self.monotonic_inc = 0 self.monotonic_dec = 0 - def get_indexer(self, ndarray values): + def get_indexer(self, ndarray values) -> np.ndarray: self._ensure_mapping_populated() return self.mapping.lookup(values) @@ -269,6 +269,11 @@ cdef class IndexEngine: return the labels in the same order as the target and a missing indexer into the targets (which correspond to the -1 indices in the results + + Returns + ------- + indexer : np.ndarray[np.intp] + missing : np.ndarray[np.intp] """ cdef: ndarray values, x @@ -455,22 +460,22 @@ cdef class DatetimeEngine(Int64Engine): # we may get datetime64[ns] or timedelta64[ns], cast these to int64 return super().get_indexer_non_unique(targets.view("i8")) - def get_indexer(self, ndarray values): + def get_indexer(self, ndarray values) -> np.ndarray: self._ensure_mapping_populated() if values.dtype != self._get_box_dtype(): - return np.repeat(-1, len(values)).astype('i4') + return np.repeat(np.intp(-1), len(values)) values = np.asarray(values).view('i8') return self.mapping.lookup(values) def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: if other.dtype != self._get_box_dtype(): - return np.repeat(-1, len(other)).astype('i4') + return np.repeat(np.intp(-1), len(other)) other = np.asarray(other).view('i8') return algos.pad(self._get_index_values(), other, limit=limit) def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: if other.dtype != self._get_box_dtype(): - return np.repeat(-1, len(other)).astype('i4') + return np.repeat(np.intp(-1), len(other)) other = np.asarray(other).view('i8') return algos.backfill(self._get_index_values(), other, limit=limit) @@ -572,17 +577,17 @@ cdef class BaseMultiIndexCodesEngine: # integers representing labels: we will use its get_loc and get_indexer self._base.__init__(self, lambda: lab_ints, len(lab_ints)) - def _codes_to_ints(self, codes): + def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray: raise NotImplementedError("Implemented by subclass") - def _extract_level_codes(self, object target): + def _extract_level_codes(self, ndarray[object] target) -> np.ndarray: """ Map the requested list of (tuple) keys to their integer representations for searching in the underlying integer index. Parameters ---------- - target : list-like of keys + target : ndarray[object] Each key is a tuple, with a label for each level of the index. Returns @@ -607,7 +612,7 @@ cdef class BaseMultiIndexCodesEngine: Returns ------- - np.ndarray[int64_t, ndim=1] of the indexer of `target` into + np.ndarray[intp_t, ndim=1] of the indexer of `target` into `self.values` """ lab_ints = self._extract_level_codes(target) @@ -635,7 +640,7 @@ cdef class BaseMultiIndexCodesEngine: the same as the length of all tuples in `values` values : ndarray[object] of tuples must be sorted and all have the same length. Should be the set of - the MultiIndex's values. Needed only if `method` is not None + the MultiIndex's values. method: string "backfill" or "pad" limit: int or None @@ -694,9 +699,8 @@ cdef class BaseMultiIndexCodesEngine: next_code += 1 # get the indexer, and undo the sorting of `target.values` - sorted_indexer = ( - algos.backfill if method == "backfill" else algos.pad - )(new_codes, new_target_codes, limit=limit).astype('int64') + algo = algos.backfill if method == "backfill" else algos.pad + sorted_indexer = algo(new_codes, new_target_codes, limit=limit).astype("int64") return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): @@ -715,9 +719,7 @@ cdef class BaseMultiIndexCodesEngine: return self._base.get_loc(self, lab_int) - def get_indexer_non_unique(self, ndarray target): - # This needs to be overridden just because the default one works on - # target._values, and target can be itself a MultiIndex. + def get_indexer_non_unique(self, ndarray[object] target): lab_ints = self._extract_level_codes(target) indexer = self._base.get_indexer_non_unique(self, lab_ints) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 67cd6c63c1faa..0a2893ac49a49 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -405,12 +405,7 @@ def _cmp_method(self, other, op): _str_na_value = StringDtype.na_value def _str_map(self, f, na_value=None, dtype: Optional[Dtype] = None): - from pandas.arrays import ( - BooleanArray, - IntegerArray, - StringArray, - ) - from pandas.core.arrays.string_ import StringDtype + from pandas.arrays import BooleanArray if dtype is None: dtype = StringDtype() diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 74e96015b4544..e00cf397e474b 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -786,14 +786,10 @@ def _aggregate_series_pure_python(self, obj: Series, func: F): counts[label] = group.shape[0] result[label] = res - result = lib.maybe_convert_objects(result, try_float=False) - # error: Incompatible types in assignment (expression has type - # "Union[ExtensionArray, ndarray]", variable has type "ndarray") - result = maybe_cast_result( # type: ignore[assignment] - result, obj, numeric_only=True - ) + converted = lib.maybe_convert_objects(result, try_float=False) + out = maybe_cast_result(converted, obj, numeric_only=True) - return result, counts + return out, counts class BinGrouper(BaseGrouper): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3a468758ab3fd..aa71a899984b9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -319,7 +319,7 @@ def _outer_indexer( # would we like our indexing holder to defer to us _defer_to_indexing = False - _engine_type = libindex.ObjectEngine + _engine_type: Type[libindex.IndexEngine] = libindex.ObjectEngine # whether we support partial string indexing. Overridden # in DatetimeIndex and PeriodIndex _supports_partial_string_indexing = False @@ -722,8 +722,8 @@ def _cleanup(self) -> None: self._engine.clear_mapping() @cache_readonly - def _engine(self) -> libindex.ObjectEngine: - # property, for now, slow to look up + def _engine(self) -> libindex.IndexEngine: + # For base class (object dtype) we get ObjectEngine # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 31ad8b7d8a295..e1f0d0fdfacda 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -7,6 +7,7 @@ Any, List, Optional, + Sequence, Tuple, TypeVar, Union, @@ -535,7 +536,7 @@ def shift(self: _T, periods: int = 1, freq=None) -> _T: # -------------------------------------------------------------------- # List-like Methods - def _get_delete_freq(self, loc: int): + def _get_delete_freq(self, loc: Union[int, Sequence[int]]): """ Find the `freq` for self.delete(loc). """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 97492f35232e3..99f83a4f286a3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -559,6 +559,7 @@ def from_tuples( if isinstance(tuples, Index): tuples = tuples._values + tuples = cast(np.ndarray, tuples) arrays = list(lib.tuples_to_object_array(tuples).T) elif isinstance(tuples, list): arrays = list(lib.to_object_array_tuples(tuples).T) @@ -1111,8 +1112,8 @@ def _engine(self): # Check the total number of bits needed for our representation: if lev_bits[0] > 64: # The levels would overflow a 64 bit uint - use Python integers: - return MultiIndexPyIntEngine(self.levels, self.codes, offsets) - return MultiIndexUIntEngine(self.levels, self.codes, offsets) + return MultiIndexPyIntEngine(list(self.levels), self.codes, offsets) + return MultiIndexUIntEngine(list(self.levels), self.codes, offsets) @property def _constructor(self) -> Callable[..., MultiIndex]: @@ -2698,11 +2699,16 @@ def _get_indexer( target, method=method, limit=limit, tolerance=tolerance ) + # TODO: explicitly raise here? we only have one test that + # gets here, and it is checking that we raise with method="nearest" + if method == "pad" or method == "backfill": if tolerance is not None: raise NotImplementedError( "tolerance not implemented yet for MultiIndex" ) + # TODO: get_indexer_with_fill docstring says values must be _sorted_ + # but that doesn't appear to be enforced indexer = self._engine.get_indexer_with_fill( target=target._values, values=self._values, method=method, limit=limit ) @@ -2714,6 +2720,8 @@ def _get_indexer( else: indexer = self._engine.get_indexer(target._values) + # Note: we only get here (in extant tests at least) with + # target.nlevels == self.nlevels return ensure_platform_int(indexer) def get_slice_bound( diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index edf32bade0657..db924e0b00819 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -73,8 +73,9 @@ def _str_map(self, f, na_value=None, dtype: Optional[Dtype] = None): arr = np.asarray(arr, dtype=object) # type: ignore[assignment] mask = isna(arr) convert = not np.all(mask) + ndarr = np.asarray(arr) try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) + result = lib.map_infer_mask(ndarr, f, mask.view(np.uint8), convert) except (TypeError, AttributeError) as e: # Reraise the exception if callable `f` got wrong number of args. # The user may want to be warned by this, instead of getting NaN From 3d5cf0e1490e35086393f0cf4e3a11db65234cab Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Mar 2021 07:51:36 -0700 Subject: [PATCH 2/3] revert unnecessary --- pandas/_libs/index.pyi | 2 -- pandas/core/indexes/multi.py | 7 +++---- pandas/core/strings/object_array.py | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 0ea635f1af049..979619c3d14c4 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -1,5 +1,3 @@ -from typing import Optional - import numpy as np class IndexEngine: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 99f83a4f286a3..5816c0231db18 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -557,9 +557,8 @@ def from_tuples( arrays = [[]] * len(names) elif isinstance(tuples, (np.ndarray, Index)): if isinstance(tuples, Index): - tuples = tuples._values + tuples = np.asarray(tuples._values) - tuples = cast(np.ndarray, tuples) arrays = list(lib.tuples_to_object_array(tuples).T) elif isinstance(tuples, list): arrays = list(lib.to_object_array_tuples(tuples).T) @@ -1112,8 +1111,8 @@ def _engine(self): # Check the total number of bits needed for our representation: if lev_bits[0] > 64: # The levels would overflow a 64 bit uint - use Python integers: - return MultiIndexPyIntEngine(list(self.levels), self.codes, offsets) - return MultiIndexUIntEngine(list(self.levels), self.codes, offsets) + return MultiIndexPyIntEngine(self.levels, self.codes, offsets) + return MultiIndexUIntEngine(self.levels, self.codes, offsets) @property def _constructor(self) -> Callable[..., MultiIndex]: diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index db924e0b00819..edf32bade0657 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -73,9 +73,8 @@ def _str_map(self, f, na_value=None, dtype: Optional[Dtype] = None): arr = np.asarray(arr, dtype=object) # type: ignore[assignment] mask = isna(arr) convert = not np.all(mask) - ndarr = np.asarray(arr) try: - result = lib.map_infer_mask(ndarr, f, mask.view(np.uint8), convert) + result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) except (TypeError, AttributeError) as e: # Reraise the exception if callable `f` got wrong number of args. # The user may want to be warned by this, instead of getting NaN From 70270f344a27f7fb087b8f58ef819b96fbbb5c9f Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Mar 2021 07:07:07 -0700 Subject: [PATCH 3/3] add slice to _get_delete_freq --- pandas/core/indexes/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e4d379fc9aed0..bac00b2399121 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -537,7 +537,7 @@ def shift(self: _T, periods: int = 1, freq=None) -> _T: # -------------------------------------------------------------------- # List-like Methods - def _get_delete_freq(self, loc: Union[int, Sequence[int]]): + def _get_delete_freq(self, loc: Union[int, slice, Sequence[int]]): """ Find the `freq` for self.delete(loc). """