Skip to content

Commit 2c12853

Browse files
authored
REF/PERF: MultiIndex.get_indexer with method (#55839)
* refactor MultiIndex._get_fill_indexer
* whatsnew
* mypy
1 parent 56a4d57 commit 2c12853

File tree

4 files changed

+7
-103
lines changed

4 files changed

+7
-103
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,7 @@ Performance improvements
324324
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
325325
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
326326
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
327+
- Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`)
327328
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
328329
- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
329330
- Performance improvement when indexing into a non-unique index (:issue:`55816`)

pandas/_libs/index.pyi

-7
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,6 @@ class BaseMultiIndexCodesEngine:
8080
) -> None: ...
8181
def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
8282
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
83-
def get_indexer_with_fill(
84-
self,
85-
target: np.ndarray, # np.ndarray[object] of tuples
86-
values: np.ndarray, # np.ndarray[object] of tuples
87-
method: str,
88-
limit: int | None,
89-
) -> npt.NDArray[np.intp]: ...
9083

9184
class ExtensionEngine:
9285
def __init__(self, values: ExtensionArray) -> None: ...

pandas/_libs/index.pyx

-85
Original file line numberDiff line numberDiff line change
@@ -753,91 +753,6 @@ cdef class BaseMultiIndexCodesEngine:
753753
"""
754754
return self._base.get_indexer(self, target)
755755

756-
def get_indexer_with_fill(self, ndarray target, ndarray values,
757-
str method, object limit) -> np.ndarray:
758-
"""
759-
Returns an array giving the positions of each value of `target` in
760-
`values`, where -1 represents a value in `target` which does not
761-
appear in `values`
762-
763-
If `method` is "backfill" then the position for a value in `target`
764-
which does not appear in `values` is that of the next greater value
765-
in `values` (if one exists), and -1 if there is no such value.
766-
767-
Similarly, if the method is "pad" then the position for a value in
768-
`target` which does not appear in `values` is that of the next smaller
769-
value in `values` (if one exists), and -1 if there is no such value.
770-
771-
Parameters
772-
----------
773-
target: ndarray[object] of tuples
774-
need not be sorted, but all must have the same length, which must be
775-
the same as the length of all tuples in `values`
776-
values : ndarray[object] of tuples
777-
must be sorted and all have the same length. Should be the set of
778-
the MultiIndex's values.
779-
method: string
780-
"backfill" or "pad"
781-
limit: int or None
782-
if provided, limit the number of fills to this value
783-
784-
Returns
785-
-------
786-
np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
787-
filled with the `method` (and optionally `limit`) specified
788-
"""
789-
assert method in ("backfill", "pad")
790-
cdef:
791-
int64_t i, j, next_code
792-
int64_t num_values, num_target_values
793-
ndarray[int64_t, ndim=1] target_order
794-
ndarray[object, ndim=1] target_values
795-
ndarray[int64_t, ndim=1] new_codes, new_target_codes
796-
ndarray[intp_t, ndim=1] sorted_indexer
797-
798-
target_order = np.argsort(target).astype("int64")
799-
target_values = target[target_order]
800-
num_values, num_target_values = len(values), len(target_values)
801-
new_codes, new_target_codes = (
802-
np.empty((num_values,)).astype("int64"),
803-
np.empty((num_target_values,)).astype("int64"),
804-
)
805-
806-
# `values` and `target_values` are both sorted, so we walk through them
807-
# and memoize the (ordered) set of indices in the (implicit) merged-and
808-
# sorted list of the two which belong to each of them
809-
# the effect of this is to create a factorization for the (sorted)
810-
# merger of the index values, where `new_codes` and `new_target_codes`
811-
# are the subset of the factors which appear in `values` and `target`,
812-
# respectively
813-
i, j, next_code = 0, 0, 0
814-
while i < num_values and j < num_target_values:
815-
val, target_val = values[i], target_values[j]
816-
if val <= target_val:
817-
new_codes[i] = next_code
818-
i += 1
819-
if target_val <= val:
820-
new_target_codes[j] = next_code
821-
j += 1
822-
next_code += 1
823-
824-
# at this point, at least one should have reached the end
825-
# the remaining values of the other should be added to the end
826-
assert i == num_values or j == num_target_values
827-
while i < num_values:
828-
new_codes[i] = next_code
829-
i += 1
830-
next_code += 1
831-
while j < num_target_values:
832-
new_target_codes[j] = next_code
833-
j += 1
834-
next_code += 1
835-
836-
# get the indexer, and undo the sorting of `target.values`
837-
algo = algos.backfill if method == "backfill" else algos.pad
838-
sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
839-
return sorted_indexer[np.argsort(target_order)]
840-
841756
def get_loc(self, object key):
842757
if is_definitely_invalid_key(key):
843758
raise TypeError(f"'{key}' is an invalid key")

pandas/core/indexes/base.py

+6 -11
Original file line numberDiff line numberDiff line change
@@ -4023,17 +4023,12 @@ def _get_fill_indexer(
40234023
if self._is_multi:
40244024
if not (self.is_monotonic_increasing or self.is_monotonic_decreasing):
40254025
raise ValueError("index must be monotonic increasing or decreasing")
4026-
# error: "IndexEngine" has no attribute "get_indexer_with_fill"
4027-
engine = self._engine
4028-
with warnings.catch_warnings():
4029-
# TODO: We need to fix this. Casting to int64 in cython
4030-
warnings.filterwarnings("ignore", category=RuntimeWarning)
4031-
return engine.get_indexer_with_fill( # type: ignore[union-attr]
4032-
target=target._values,
4033-
values=self._values,
4034-
method=method,
4035-
limit=limit,
4036-
)
4026+
encoded = self.append(target)._engine.values # type: ignore[union-attr]
4027+
self_encoded = Index(encoded[: len(self)])
4028+
target_encoded = Index(encoded[len(self) :])
4029+
return self_encoded._get_fill_indexer(
4030+
target_encoded, method, limit, tolerance
4031+
)
40374032

40384033
if self.is_monotonic_increasing and target.is_monotonic_increasing:
40394034
target_values = target._get_engine_target()

0 commit comments

Comments
 (0)