Skip to content

Commit f34376d

Browse files
authored
Backport PR pandas-dev#42270: PERF: CategoricalIndex.get_indexer (pandas-dev#42282)
* Backport PR pandas-dev#42270: PERF: CategoricalIndex.get_indexer
* port _maybe_cast_listlike_indexer
1 parent 8d856e1 commit f34376d

File tree

2 files changed

+31
-3
lines changed

2 files changed

+31
-3
lines changed

pandas/core/indexes/base.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -3411,12 +3411,25 @@ def get_indexer(
34113411
# matched to Interval scalars
34123412
return self._get_indexer_non_comparable(target, method=method, unique=True)
34133413

3414+
if is_categorical_dtype(self.dtype):
3415+
# _maybe_cast_listlike_indexer ensures target has our dtype
3416+
# (could improve perf by doing _should_compare check earlier?)
3417+
assert is_dtype_equal(self.dtype, target.dtype)
3418+
3419+
indexer = self._engine.get_indexer(target.codes)
3420+
if self.hasnans and target.hasnans:
3421+
loc = self.get_loc(np.nan)
3422+
mask = target.isna()
3423+
indexer[mask] = loc
3424+
return indexer
3425+
34143426
if is_categorical_dtype(target.dtype):
34153427
# potential fastpath
34163428
# get an indexer for unique categories then propagate to codes via take_nd
3417-
# Note: calling get_indexer instead of _get_indexer causes
3418-
# RecursionError GH#42088
3419-
categories_indexer = self._get_indexer(target.categories)
3429+
# get_indexer instead of _get_indexer needed for MultiIndex cases
3430+
# e.g. test_append_different_columns_types
3431+
categories_indexer = self.get_indexer(target.categories)
3432+
34203433
indexer = algos.take_nd(categories_indexer, target.codes, fill_value=-1)
34213434

34223435
if (not self._is_multi and self.hasnans) and target.hasnans:

pandas/core/indexes/category.py

+15
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,21 @@ def _maybe_cast_indexer(self, key) -> int:
487487
return -1
488488
raise
489489

490+
def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:
    """
    Cast a list-like indexer to a CategoricalIndex built on our categories.

    Parameters
    ----------
    values : CategoricalIndex, Categorical, or other list-like
        The indexer to convert.

    Returns
    -------
    CategoricalIndex
        Built via ``_simple_new`` from a Categorical backed by codes that
        refer to ``self.categories``.
    """
    if isinstance(values, CategoricalIndex):
        # Unwrap to the backing Categorical so both categorical containers
        # go through the same branch below.
        values = values._data

    if isinstance(values, Categorical):
        # Indexing on codes is more efficient than looking up each value,
        # so re-encode the incoming Categorical against our categories.
        cat = self._data._encode_with_my_categories(values)
    else:
        # Generic list-like: locate each value within our categories and
        # use those positions as the codes of the new Categorical.
        codes = self.categories.get_indexer(values)
        # Match the dtype of our own codes before wrapping them.
        codes = codes.astype(self.codes.dtype, copy=False)
        cat = self._data._from_backing_data(codes)
    return type(self)._simple_new(cat)
504+
490505
def _get_indexer(
491506
self,
492507
target: Index,

0 commit comments

Comments
 (0)