Skip to content

Commit 2278923

Browse files
authored
REF/PERF: MultiIndex.get_locs to use boolean arrays internally (#46330)
1 parent 98298aa commit 2278923

File tree

2 files changed

+69
-123
lines changed

2 files changed

+69
-123
lines changed

doc/source/whatsnew/v1.5.0.rst

+1-1
Original file line number | Diff line number | Diff line change
@@ -318,7 +318,7 @@ Performance improvements
318318
- Performance improvement in :meth:`.GroupBy.diff` (:issue:`16706`)
319319
- Performance improvement in :meth:`.GroupBy.transform` when broadcasting values for user-defined functions (:issue:`45708`)
320320
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`)
321-
- Performance improvement in :meth:`MultiIndex.get_locs` (:issue:`45681`, :issue:`46040`)
321+
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`45681`, :issue:`46040`, :issue:`46330`)
322322
- Performance improvement in :attr:`MultiIndex.values` when the MultiIndex contains levels of type DatetimeIndex, TimedeltaIndex or ExtensionDtypes (:issue:`46288`)
323323
- Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
324324
- Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)

pandas/core/indexes/multi.py

+68-122
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,6 @@
8484
get_unanimous_names,
8585
)
8686
from pandas.core.indexes.frozen import FrozenList
87-
from pandas.core.indexes.numeric import Int64Index
8887
from pandas.core.ops.invalid import make_invalid_op
8988
from pandas.core.sorting import (
9089
get_group_index,
@@ -3160,47 +3159,37 @@ def maybe_mi_droplevels(indexer, levels):
31603159
return indexer, result_index
31613160

31623161
def _get_level_indexer(
3163-
self, key, level: int = 0, indexer: Int64Index | None = None
3162+
self, key, level: int = 0, indexer: npt.NDArray[np.bool_] | None = None
31643163
):
31653164
# `level` kwarg is _always_ positional, never name
3166-
# return an indexer, boolean array or a slice showing where the key is
3165+
# return a boolean array or slice showing where the key is
31673166
# in the totality of values
31683167
# if the indexer is provided, then use this
31693168

31703169
level_index = self.levels[level]
31713170
level_codes = self.codes[level]
31723171

31733172
def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
3174-
# given the inputs and the codes/indexer, compute an indexer set
3175-
# if we have a provided indexer, then this need not consider
3176-
# the entire labels set
3177-
r = np.arange(start, stop, step)
3178-
3179-
if indexer is not None and len(indexer) != len(codes):
3180-
3181-
# we have an indexer which maps the locations in the labels
3182-
# that we have already selected (and is not an indexer for the
3183-
# entire set) otherwise this is wasteful so we only need to
3184-
# examine locations that are in this set the only magic here is
3185-
# that the result are the mappings to the set that we have
3186-
# selected
3187-
from pandas import Series
3188-
3189-
mapper = Series(indexer)
3190-
indexer = codes.take(ensure_platform_int(indexer))
3191-
result = Series(Index(indexer).isin(r).nonzero()[0])
3192-
m = result.map(mapper)
3193-
# error: Incompatible types in assignment (expression has type
3194-
# "ndarray", variable has type "Series")
3195-
m = np.asarray(m) # type: ignore[assignment]
3196-
3173+
# Compute a bool indexer to identify the positions to take.
3174+
# If we have an existing indexer, we only need to examine the
3175+
# subset of positions where the existing indexer is True.
3176+
if indexer is not None:
3177+
# we only need to look at the subset of codes where the
3178+
# existing indexer equals True
3179+
codes = codes[indexer]
3180+
3181+
if step is None or step == 1:
3182+
new_indexer = (codes >= start) & (codes < stop)
31973183
else:
3198-
# error: Incompatible types in assignment (expression has type
3199-
# "ndarray", variable has type "Series")
3200-
m = np.zeros(len(codes), dtype=bool) # type: ignore[assignment]
3201-
m[np.in1d(codes, r, assume_unique=Index(codes).is_unique)] = True
3184+
r = np.arange(start, stop, step, dtype=codes.dtype)
3185+
new_indexer = algos.isin(codes, r)
3186+
3187+
if indexer is None:
3188+
return new_indexer
32023189

3203-
return m
3190+
indexer = indexer.copy()
3191+
indexer[indexer] = new_indexer
3192+
return indexer
32043193

32053194
if isinstance(key, slice):
32063195
# handle a slice, returning a slice if we can
@@ -3327,62 +3316,41 @@ def get_locs(self, seq):
33273316
f"on levels {true_slices}, lexsort depth {self._lexsort_depth}"
33283317
)
33293318

3330-
n = len(self)
3331-
# indexer is the list of all positions that we want to take; it
3332-
# is created on the first entry in seq and narrowed down as we
3333-
# look at remaining entries
3334-
indexer = None
3335-
33363319
if any(x is Ellipsis for x in seq):
33373320
raise NotImplementedError(
33383321
"MultiIndex does not support indexing with Ellipsis"
33393322
)
33403323

3341-
def _convert_to_indexer(r) -> Int64Index:
3342-
# return an indexer
3343-
if isinstance(r, slice):
3344-
m = np.zeros(n, dtype=bool)
3345-
m[r] = True
3346-
r = m.nonzero()[0]
3347-
elif com.is_bool_indexer(r):
3348-
if len(r) != n:
3349-
raise ValueError(
3350-
"cannot index with a boolean indexer "
3351-
"that is not the same length as the "
3352-
"index"
3353-
)
3354-
r = r.nonzero()[0]
3355-
return Int64Index(r)
3324+
n = len(self)
33563325

3357-
def _update_indexer(idxr: Index, indexer: Index | None) -> Index:
3358-
if indexer is None:
3359-
return idxr
3360-
indexer_intersection = indexer.intersection(idxr)
3361-
if indexer_intersection.empty and not idxr.empty and not indexer.empty:
3362-
raise KeyError(seq)
3363-
return indexer_intersection
3326+
def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]:
3327+
if isinstance(indexer, slice):
3328+
new_indexer = np.zeros(n, dtype=np.bool_)
3329+
new_indexer[indexer] = True
3330+
return new_indexer
3331+
return indexer
3332+
3333+
# a bool indexer for the positions we want to take
3334+
indexer: npt.NDArray[np.bool_] | None = None
33643335

33653336
for i, k in enumerate(seq):
33663337

3338+
lvl_indexer: npt.NDArray[np.bool_] | slice | None = None
3339+
33673340
if com.is_bool_indexer(k):
3368-
# a boolean indexer, must be the same length!
3369-
k = np.asarray(k)
3370-
lvl_indexer = _convert_to_indexer(k)
3371-
indexer = _update_indexer(lvl_indexer, indexer=indexer)
3341+
if len(k) != n:
3342+
raise ValueError(
3343+
"cannot index with a boolean indexer that "
3344+
"is not the same length as the index"
3345+
)
3346+
lvl_indexer = np.asarray(k)
33723347

33733348
elif is_list_like(k):
3374-
# a collection of labels to include from this level (these
3375-
# are or'd)
3376-
3377-
indexers: Int64Index | None = None
3349+
# a collection of labels to include from this level (these are or'd)
33783350

33793351
# GH#27591 check if this is a single tuple key in the level
33803352
try:
3381-
# Argument "indexer" to "_get_level_indexer" of "MultiIndex"
3382-
# has incompatible type "Index"; expected "Optional[Int64Index]"
3383-
lev_loc = self._get_level_indexer(
3384-
k, level=i, indexer=indexer # type: ignore[arg-type]
3385-
)
3353+
lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer)
33863354
except (InvalidIndexError, TypeError, KeyError) as err:
33873355
# InvalidIndexError e.g. non-hashable, fall back to treating
33883356
# this as a sequence of labels
@@ -3394,11 +3362,8 @@ def _update_indexer(idxr: Index, indexer: Index | None) -> Index:
33943362
# e.g. slice
33953363
raise err
33963364
try:
3397-
# Argument "indexer" to "_get_level_indexer" of "MultiIndex"
3398-
# has incompatible type "Index"; expected
3399-
# "Optional[Int64Index]"
3400-
item_lvl_indexer = self._get_level_indexer(
3401-
x, level=i, indexer=indexer # type: ignore[arg-type]
3365+
item_indexer = self._get_level_indexer(
3366+
x, level=i, indexer=indexer
34023367
)
34033368
except KeyError:
34043369
# ignore not founds; see discussion in GH#39424
@@ -3418,82 +3383,63 @@ def _update_indexer(idxr: Index, indexer: Index | None) -> Index:
34183383
)
34193384
continue
34203385
else:
3421-
idxrs = _convert_to_indexer(item_lvl_indexer)
3422-
3423-
if indexers is None:
3424-
indexers = idxrs
3386+
if lvl_indexer is None:
3387+
lvl_indexer = _to_bool_indexer(item_indexer)
3388+
elif isinstance(item_indexer, slice):
3389+
lvl_indexer[item_indexer] = True # type: ignore[index]
34253390
else:
3426-
indexers = indexers.union(idxrs, sort=False)
3427-
3428-
else:
3429-
idxrs = _convert_to_indexer(lev_loc)
3430-
if indexers is None:
3431-
indexers = idxrs
3432-
else:
3433-
indexers = indexers.union(idxrs, sort=False)
3391+
lvl_indexer |= item_indexer
34343392

3435-
if indexers is not None:
3436-
indexer = _update_indexer(indexers, indexer=indexer)
3437-
else:
3393+
if lvl_indexer is None:
34383394
# no matches we are done
34393395
# test_loc_getitem_duplicates_multiindex_empty_indexer
34403396
return np.array([], dtype=np.intp)
34413397

34423398
elif com.is_null_slice(k):
34433399
# empty slice
3444-
if indexer is None:
3445-
indexer = Index(np.arange(n))
3400+
if indexer is None and i == len(seq) - 1:
3401+
return np.arange(n, dtype=np.intp)
3402+
continue
34463403

3447-
elif isinstance(k, slice):
3404+
else:
3405+
# a slice or a single label
3406+
lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer)
34483407

3449-
# a slice, include BOTH of the labels
3450-
# Argument "indexer" to "_get_level_indexer" of "MultiIndex" has
3451-
# incompatible type "Index"; expected "Optional[Int64Index]"
3452-
lvl_indexer = self._get_level_indexer(
3453-
k,
3454-
level=i,
3455-
indexer=indexer, # type: ignore[arg-type]
3456-
)
3457-
indexer = _update_indexer(
3458-
_convert_to_indexer(lvl_indexer),
3459-
indexer=indexer,
3460-
)
3408+
# update indexer
3409+
lvl_indexer = _to_bool_indexer(lvl_indexer)
3410+
if indexer is None:
3411+
indexer = lvl_indexer
34613412
else:
3462-
# a single label
3463-
lvl_indexer = self._get_loc_level(k, level=i)[0]
3464-
indexer = _update_indexer(
3465-
_convert_to_indexer(lvl_indexer),
3466-
indexer=indexer,
3467-
)
3413+
indexer &= lvl_indexer
3414+
if not np.any(indexer) and np.any(lvl_indexer):
3415+
raise KeyError(seq)
34683416

34693417
# empty indexer
34703418
if indexer is None:
34713419
return np.array([], dtype=np.intp)
34723420

3473-
assert isinstance(indexer, Int64Index), type(indexer)
3474-
indexer = self._reorder_indexer(seq, indexer)
3475-
3476-
return indexer._values.astype(np.intp, copy=False)
3421+
pos_indexer = indexer.nonzero()[0]
3422+
return self._reorder_indexer(seq, pos_indexer)
34773423

34783424
# --------------------------------------------------------------------
34793425

34803426
def _reorder_indexer(
34813427
self,
34823428
seq: tuple[Scalar | Iterable | AnyArrayLike, ...],
3483-
indexer: Int64Index,
3484-
) -> Int64Index:
3429+
indexer: npt.NDArray[np.intp],
3430+
) -> npt.NDArray[np.intp]:
34853431
"""
3486-
Reorder an indexer of a MultiIndex (self) so that the label are in the
3432+
Reorder an indexer of a MultiIndex (self) so that the labels are in the
34873433
same order as given in seq
34883434
34893435
Parameters
34903436
----------
34913437
seq : label/slice/list/mask or a sequence of such
3492-
indexer: an Int64Index indexer of self
3438+
indexer: a position indexer of self
34933439
34943440
Returns
34953441
-------
3496-
indexer : a sorted Int64Index indexer of self ordered as seq
3442+
indexer : a sorted position indexer of self ordered as seq
34973443
"""
34983444
# If the index is lexsorted and the list_like label in seq are sorted
34993445
# then we do not need to sort

0 commit comments

Comments
 (0)