Skip to content

BUG: IntervalIndex.get_indexer with a categorical target when both contain NaNs #41934

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,7 @@ Indexing
^^^^^^^^
- Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`)
- Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`)
- Bug in :meth:`IntervalIndex.get_indexer` when ``target`` has ``CategoricalDtype`` and both the index and the target contain NA values (:issue:`41934`)
- Bug in :meth:`Series.loc` raising a ``ValueError`` when input was filtered with a Boolean list and values to set were a list with lower dimension (:issue:`20438`)
- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
- Bug in :meth:`DataFrame.__setitem__` raising a ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`)
Expand Down
16 changes: 16 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3403,6 +3403,22 @@ def get_indexer(
# matched to Interval scalars
return self._get_indexer_non_comparable(target, method=method, unique=True)

if is_categorical_dtype(target.dtype):
# potential fastpath
# get an indexer for unique categories then propagate to codes via take_nd
categories_indexer = self.get_indexer(target.categories)
indexer = algos.take_nd(categories_indexer, target.codes, fill_value=-1)

if (not self._is_multi and self.hasnans) and target.hasnans:
# Exclude MultiIndex because hasnans raises NotImplementedError
# we should only get here if we are unique, so loc is an integer
# GH#41934
loc = self.get_loc(np.nan)
mask = target.isna()
indexer[mask] = loc

return ensure_platform_int(indexer)

pself, ptarget = self._maybe_promote(target)
if pself is not self or ptarget is not target:
return pself.get_indexer(
Expand Down
13 changes: 1 addition & 12 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,8 @@
)
import textwrap
from typing import (
TYPE_CHECKING,
Any,
Hashable,
cast,
)

import numpy as np
Expand Down Expand Up @@ -46,7 +44,6 @@
)
from pandas.core.dtypes.common import (
ensure_platform_int,
is_categorical_dtype,
is_datetime64tz_dtype,
is_datetime_or_timedelta_dtype,
is_dtype_equal,
Expand All @@ -63,7 +60,6 @@
from pandas.core.dtypes.dtypes import IntervalDtype
from pandas.core.dtypes.missing import is_valid_na_for_dtype

from pandas.core.algorithms import take_nd
from pandas.core.arrays.interval import (
IntervalArray,
_interval_shared_docs,
Expand Down Expand Up @@ -91,9 +87,6 @@
timedelta_range,
)

if TYPE_CHECKING:
from pandas import CategoricalIndex

_index_doc_kwargs = dict(ibase._index_doc_kwargs)

_index_doc_kwargs.update(
Expand Down Expand Up @@ -666,11 +659,7 @@ def _get_indexer(
left_indexer = self.left.get_indexer(target.left)
right_indexer = self.right.get_indexer(target.right)
indexer = np.where(left_indexer == right_indexer, left_indexer, -1)
elif is_categorical_dtype(target.dtype):
target = cast("CategoricalIndex", target)
# get an indexer for unique categories then propagate to codes via take_nd
categories_indexer = self.get_indexer(target.categories)
indexer = take_nd(categories_indexer, target.codes, fill_value=-1)

elif not is_object_dtype(target):
# homogeneous scalar index: use IntervalTree
target = self._maybe_convert_i8(target)
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/indexes/interval/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,26 @@ def test_get_indexer_categorical(self, target, ordered):
expected = index.get_indexer(target)
tm.assert_numpy_array_equal(result, expected)

def test_get_indexer_categorical_with_nans(self):
# GH#41934 nans in both index and in target
ii = IntervalIndex.from_breaks(range(5))
ii2 = ii.append(IntervalIndex([np.nan]))
ci2 = CategoricalIndex(ii2)

result = ii2.get_indexer(ci2)
expected = np.arange(5, dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)

# not-all-matches
result = ii2[1:].get_indexer(ci2[::-1])
expected = np.array([3, 2, 1, 0, -1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)

# non-unique target, non-unique nans
result = ii2.get_indexer(ci2.append(ci2))
expected = np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize(
"tuples, closed",
[
Expand Down