diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 438313f3e58e2..c5fe7ce3c673b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -985,6 +985,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) +- Bug in :meth:`IntervalIndex.get_indexer` returning ``-1`` for NA entries of ``target`` when ``target`` has ``CategoricalDtype`` and both the index and the target contain NA values (:issue:`41934`) - Bug in :meth:`Series.loc` raising a ``ValueError`` when input was filtered with a Boolean list and values to set were a list with lower dimension (:issue:`20438`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.__setitem__` raising a ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4de95079f6480..dc0343a6199e7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3403,6 +3403,22 @@ def get_indexer( # matched to Interval scalars return self._get_indexer_non_comparable(target, method=method, unique=True) + if is_categorical_dtype(target.dtype): + # potential fastpath + # get an indexer for unique categories then propagate to codes via take_nd + categories_indexer = self.get_indexer(target.categories) + indexer = algos.take_nd(categories_indexer, target.codes, fill_value=-1) + + if (not self._is_multi and self.hasnans) and target.hasnans: + # Exclude MultiIndex because hasnans raises NotImplementedError + # we should only get here if we are unique, so loc is an integer + # GH#41934 + loc = self.get_loc(np.nan) + mask = target.isna() 
+ indexer[mask] = loc + + return ensure_platform_int(indexer) + pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer( diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 232ca9068abc6..40fee095ee614 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -7,10 +7,8 @@ ) import textwrap from typing import ( - TYPE_CHECKING, Any, Hashable, - cast, ) import numpy as np @@ -46,7 +44,6 @@ ) from pandas.core.dtypes.common import ( ensure_platform_int, - is_categorical_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, @@ -63,7 +60,6 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.missing import is_valid_na_for_dtype -from pandas.core.algorithms import take_nd from pandas.core.arrays.interval import ( IntervalArray, _interval_shared_docs, @@ -91,9 +87,6 @@ timedelta_range, ) -if TYPE_CHECKING: - from pandas import CategoricalIndex - _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( @@ -666,11 +659,7 @@ def _get_indexer( left_indexer = self.left.get_indexer(target.left) right_indexer = self.right.get_indexer(target.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) - elif is_categorical_dtype(target.dtype): - target = cast("CategoricalIndex", target) - # get an indexer for unique categories then propagate to codes via take_nd - categories_indexer = self.get_indexer(target.categories) - indexer = take_nd(categories_indexer, target.codes, fill_value=-1) + elif not is_object_dtype(target): # homogeneous scalar index: use IntervalTree target = self._maybe_convert_i8(target) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index a5a921f42c3ef..aa3359d775c5a 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -275,6 +275,26 @@ 
def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_categorical_with_nans(self): + # GH#41934 NA values in both the index and the categorical target must be matched to each other + ii = IntervalIndex.from_breaks(range(5)) + ii2 = ii.append(IntervalIndex([np.nan])) + ci2 = CategoricalIndex(ii2) + + result = ii2.get_indexer(ci2) + expected = np.arange(5, dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + # not all target entries match: the first interval is sliced off the index, so it maps to -1 + result = ii2[1:].get_indexer(ci2[::-1]) + expected = np.array([3, 2, 1, 0, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + # non-unique target holding the NA value twice; both occurrences map to the NA position + result = ii2.get_indexer(ci2.append(ci2)) + expected = np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( "tuples, closed", [