Skip to content

Commit 5a9f898

Browse files
committed
avoid zip and unzip to improve the speed of get_indexer
1 parent 0158382 commit 5a9f898

File tree

2 files changed

+38
-7
lines changed

2 files changed

+38
-7
lines changed

pandas/_libs/index.pyx

+5-4
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ cdef class BaseMultiIndexCodesEngine:
584584
def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
585585
raise NotImplementedError("Implemented by subclass")
586586

587-
def _extract_level_codes(self, ndarray[object] target) -> np.ndarray:
587+
def _extract_level_codes(self, list target) -> np.ndarray:
588588
"""
589589
Map the requested list of (tuple) keys to their integer representations
590590
for searching in the underlying integer index.
@@ -599,11 +599,12 @@ cdef class BaseMultiIndexCodesEngine:
599599
int_keys : 1-dimensional array of dtype uint64 or object
600600
Integers representing one combination each
601601
"""
602-
level_codes = [lev.get_indexer(codes) + 1 for lev, codes
603-
in zip(self.levels, zip(*target))]
602+
n_levels = len(self.levels)
603+
level_codes = [self.levels[i].get_indexer(target[i]) + 1
604+
for i in range(n_levels)]
604605
return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
605606

606-
def get_indexer(self, ndarray[object] target) -> np.ndarray:
607+
def get_indexer(self, list target) -> np.ndarray:
607608
"""
608609
Returns an array giving the positions of each value of `target` in
609610
`self.values`, where -1 represents a value in `target` which does not

pandas/core/indexes/multi.py

+33-3
Original file line numberDiff line numberDiff line change
@@ -715,6 +715,24 @@ def _values(self) -> np.ndarray:
715715
arr = lib.fast_zip(values)
716716
return arr
717717

718+
@cache_readonly
719+
def _values_for_indexer(self):
720+
values = []
721+
722+
for i in range(self.nlevels):
723+
vals = self._get_level_values(i)
724+
if is_categorical_dtype(vals.dtype):
725+
vals = cast("CategoricalIndex", vals)
726+
vals = vals._data._internal_get_values()
727+
if isinstance(vals.dtype, ExtensionDtype):
728+
vals = vals.astype(object)
729+
# error: Incompatible types in assignment (expression has type "ndarray",
730+
# variable has type "Index")
731+
vals = np.array(vals, copy=False) # type: ignore[assignment]
732+
values.append(vals)
733+
734+
return values
735+
718736
@property
719737
def values(self) -> np.ndarray:
720738
return self._values
@@ -2666,6 +2684,15 @@ def _get_partial_string_timestamp_match_key(self, key):
26662684

26672685
return key
26682686

2687+
def _get_engine_target(self) -> np.ndarray:
2688+
"""
2689+
Override base
2690+
Get the ndarray that we can pass to the IndexEngine constructor.
2691+
"""
2692+
# error: Incompatible return value type (got "Union[ExtensionArray,
2693+
# ndarray]", expected "ndarray")
2694+
return self._values_for_indexer # type: ignore[return-value]
2695+
26692696
def _get_indexer(
26702697
self,
26712698
target: Index,
@@ -2686,7 +2713,7 @@ def _get_indexer(
26862713

26872714
# let's instead try with a straight Index
26882715
if method is None:
2689-
return Index(self._values).get_indexer(
2716+
return Index(self._values_for_indexer).get_indexer(
26902717
target, method=method, limit=limit, tolerance=tolerance
26912718
)
26922719

@@ -2701,15 +2728,18 @@ def _get_indexer(
27012728
# TODO: get_indexer_with_fill docstring says values must be _sorted_
27022729
# but that doesn't appear to be enforced
27032730
indexer = self._engine.get_indexer_with_fill(
2704-
target=target._values, values=self._values, method=method, limit=limit
2731+
target=target._values_for_indexer,
2732+
values=self._values_for_indexer,
2733+
method=method,
2734+
limit=limit,
27052735
)
27062736
elif method == "nearest":
27072737
raise NotImplementedError(
27082738
"method='nearest' not implemented yet "
27092739
"for MultiIndex; see GitHub issue 9365"
27102740
)
27112741
else:
2712-
indexer = self._engine.get_indexer(target._values)
2742+
indexer = self._engine.get_indexer(target._values_for_indexer)
27132743

27142744
# Note: we only get here (in extant tests at least) with
27152745
# target.nlevels == self.nlevels

0 commit comments

Comments
 (0)