From add19006da9ad2cab63a9efb5ed00d00435de010 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Jun 2021 14:55:29 -0700 Subject: [PATCH 1/4] REF+BUG: IntervalIndex.get_indexer with categorical both have nans --- pandas/core/indexes/base.py | 16 +++++++++++++++ pandas/core/indexes/interval.py | 13 +----------- .../tests/indexes/interval/test_indexing.py | 20 +++++++++++++++++++ 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 73f21f908d55d..f584318b77add 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3403,6 +3403,22 @@ def get_indexer( # matched to Interval scalars return self._get_indexer_non_comparable(target, method=method, unique=True) + if is_categorical_dtype(target.dtype) and not is_categorical_dtype(self.dtype): + # TODO: we can remove the self.dtype condition following GH#41933 + # potential fastpath + # get an indexer for unique categories then propagate to codes via take_nd + categories_indexer = self.get_indexer(target.categories) + indexer = algos.take_nd(categories_indexer, target.codes, fill_value=-1) + + if (not self._is_multi and self.hasnans) and target.hasnans: + # Exclude MultiIndex because hasnans raises NotImplementedError + # we should only get here if we are unique, so loc is an integer + loc = self.get_loc(np.nan) + mask = target.isna() + indexer[mask] = loc + + return ensure_platform_int(indexer) + pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer( diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 7c96336103212..85ff5f482191f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -7,10 +7,8 @@ ) import textwrap from typing import ( - TYPE_CHECKING, Any, Hashable, - cast, ) import numpy as np @@ -46,7 +44,6 @@ ) from pandas.core.dtypes.common import ( ensure_platform_int, - is_categorical_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, @@ -63,7 +60,6 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.missing import is_valid_na_for_dtype -from pandas.core.algorithms import take_nd from pandas.core.arrays.interval import ( IntervalArray, _interval_shared_docs, @@ -91,9 +87,6 @@ timedelta_range, ) -if TYPE_CHECKING: - from pandas import CategoricalIndex - _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( @@ -668,11 +661,7 @@ def _get_indexer( left_indexer = self.left.get_indexer(target.left) right_indexer = self.right.get_indexer(target.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) - elif is_categorical_dtype(target.dtype): - target = cast("CategoricalIndex", target) - # get an indexer for unique categories then propagate to codes via take_nd - categories_indexer = self.get_indexer(target.categories) - indexer = take_nd(categories_indexer, target.codes, fill_value=-1) + elif not is_object_dtype(target): # homogeneous scalar index: use IntervalTree target = self._maybe_convert_i8(target) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index a5a921f42c3ef..dac044b0eaa95 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -275,6 +275,26 @@ def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_categorical_with_nans(self): + # nans in both index and in target + ii = IntervalIndex.from_breaks(range(5)) + ii2 = ii.append(IntervalIndex([np.nan])) + ci2 = CategoricalIndex(ii2) + + result = ii2.get_indexer(ci2) + expected = np.arange(5, dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + # not-all-matches + result = ii2[1:].get_indexer(ci2[::-1]) + expected = np.array([3, 2, 1, 0, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + # non-unique target, non-unique nans + result = ii2.get_indexer(ci2.append(ci2)) + expected = np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( "tuples, closed", [ From 819db87cd85d03d9a0b6bf2f34bbc088baee7b05 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Jun 2021 14:57:21 -0700 Subject: [PATCH 2/4] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d7a6c2c3f0e1a..603396fb8487e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -957,6 +957,7 @@ Indexing - Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) +- Bug in :meth:`IntervalIndex.get_indexer` when ``target`` has ``CategoricalDtype`` and both the index and the target contain NA values (:issue:`??`) - Bug in :meth:`Series.loc` raising a ``ValueError`` when input was filtered with a Boolean list and values to set were a list with lower dimension (:issue:`20438`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.__setitem__` raising a ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`) From b6251fa9aa1e51d32cfb62575e2be974135258fb Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Jun 2021 14:59:11 -0700 Subject: [PATCH 3/4] GH refs --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/indexes/base.py | 1 + pandas/tests/indexes/interval/test_indexing.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 603396fb8487e..afc4875d2ee29 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -957,7 +957,7 @@ Indexing - Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) -- Bug in :meth:`IntervalIndex.get_indexer` when ``target`` has ``CategoricalDtype`` and both the index and the target contain NA values (:issue:`??`) +- Bug in :meth:`IntervalIndex.get_indexer` when ``target`` has ``CategoricalDtype`` and both the index and the target contain NA values (:issue:`41934`) - Bug in :meth:`Series.loc` raising a ``ValueError`` when input was filtered with a Boolean list and values to set were a list with lower dimension (:issue:`20438`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.__setitem__` raising a ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f584318b77add..7efce6e836924 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3413,6 +3413,7 @@ def get_indexer( if (not self._is_multi and self.hasnans) and target.hasnans: # Exclude MultiIndex because hasnans raises NotImplementedError # we should only get here if we are unique, so loc is an integer + # GH#41934 loc = self.get_loc(np.nan) mask = target.isna() indexer[mask] = loc diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index dac044b0eaa95..aa3359d775c5a 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -276,7 +276,7 @@ def test_get_indexer_categorical(self, target, ordered): tm.assert_numpy_array_equal(result, expected) def test_get_indexer_categorical_with_nans(self): - # nans in both index and in target + # GH#41934 nans in both index and in target ii = IntervalIndex.from_breaks(range(5)) ii2 = ii.append(IntervalIndex([np.nan])) ci2 = CategoricalIndex(ii2) From 1b3162c52f0b68719ff46a351a6ab2b519323fb4 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Jun 2021 17:37:32 -0700 Subject: [PATCH 4/4] remove now-unnecessary check --- pandas/core/indexes/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f420a3a561bbf..dc0343a6199e7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3403,8 +3403,7 @@ def get_indexer( # matched to Interval scalars return self._get_indexer_non_comparable(target, method=method, unique=True) - if is_categorical_dtype(target.dtype) and not is_categorical_dtype(self.dtype): - # TODO: we can remove the self.dtype condition following GH#41933 + if is_categorical_dtype(target.dtype): # potential fastpath # get an indexer for unique categories then propagate to codes via take_nd categories_indexer = self.get_indexer(target.categories)