Skip to content

Commit 255bc29

Browse files
jbrockmendel authored and luckyvs1 committed
REF: de-duplicate get_indexer methods (pandas-dev#38372)
1 parent e561c7d commit 255bc29

File tree

10 files changed

+60
-76
lines changed

10 files changed

+60
-76
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,7 @@ Interval
127127

128128
Indexing
129129
^^^^^^^^
130+
- Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`)
130131
- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
131132
-
132133
-

pandas/core/indexes/base.py

+25-5
Original file line number | Diff line number | Diff line change
@@ -3147,6 +3147,11 @@ def get_indexer(
31473147
method = missing.clean_reindex_fill_method(method)
31483148
target = ensure_index(target)
31493149

3150+
self._check_indexing_method(method)
3151+
3152+
if not self._index_as_unique:
3153+
raise InvalidIndexError(self._requires_unique_msg)
3154+
31503155
# Treat boolean labels passed to a numeric index as not found. Without
31513156
# this fix False and True would be treated as 0 and 1 respectively.
31523157
# (GH #16877)
@@ -3174,11 +3179,6 @@ def _get_indexer(
31743179
target, method=method, limit=limit, tolerance=tolerance
31753180
)
31763181

3177-
if not self.is_unique:
3178-
raise InvalidIndexError(
3179-
"Reindexing only valid with uniquely valued Index objects"
3180-
)
3181-
31823182
if method == "pad" or method == "backfill":
31833183
indexer = self._get_fill_indexer(target, method, limit, tolerance)
31843184
elif method == "nearest":
@@ -3199,6 +3199,24 @@ def _get_indexer(
31993199

32003200
return ensure_platform_int(indexer)
32013201

3202+
def _check_indexing_method(self, method):
3203+
"""
3204+
Raise if we have a get_indexer `method` that is not supported or valid.
3205+
"""
3206+
# GH#37871 for now this is only for IntervalIndex and CategoricalIndex
3207+
if not (is_interval_dtype(self.dtype) or is_categorical_dtype(self.dtype)):
3208+
return
3209+
3210+
if method is None:
3211+
return
3212+
3213+
if method in ["bfill", "backfill", "pad", "ffill", "nearest"]:
3214+
raise NotImplementedError(
3215+
f"method {method} not yet implemented for {type(self).__name__}"
3216+
)
3217+
3218+
raise ValueError("Invalid fill method")
3219+
32023220
def _convert_tolerance(self, tolerance, target):
32033221
# override this method on subclasses
32043222
tolerance = np.asarray(tolerance)
@@ -5014,6 +5032,8 @@ def _index_as_unique(self):
50145032
"""
50155033
return self.is_unique
50165034

5035+
_requires_unique_msg = "Reindexing only valid with uniquely valued Index objects"
5036+
50175037
@final
50185038
def _maybe_promote(self, other: "Index"):
50195039
"""

pandas/core/indexes/category.py

+1-4
Original file line number | Diff line number | Diff line change
@@ -494,14 +494,11 @@ def _reindex_non_unique(self, target):
494494
def _maybe_cast_indexer(self, key) -> int:
495495
return self._data._unbox_scalar(key)
496496

497-
@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
498497
def _get_indexer(
499498
self, target: "Index", method=None, limit=None, tolerance=None
500499
) -> np.ndarray:
501500

502-
self._check_indexing_method(method)
503-
504-
if self.is_unique and self.equals(target):
501+
if self.equals(target):
505502
return np.arange(len(self), dtype="intp")
506503

507504
return self._get_indexer_non_unique(target._values)[0]

pandas/core/indexes/extension.py

-15
Original file line number | Diff line number | Diff line change
@@ -254,21 +254,6 @@ def searchsorted(self, value, side="left", sorter=None) -> np.ndarray:
254254

255255
# ---------------------------------------------------------------------
256256

257-
def _check_indexing_method(self, method):
258-
"""
259-
Raise if we have a get_indexer `method` that is not supported or valid.
260-
"""
261-
# GH#37871 for now this is only for IntervalIndex and CategoricalIndex
262-
if method is None:
263-
return
264-
265-
if method in ["bfill", "backfill", "pad", "ffill", "nearest"]:
266-
raise NotImplementedError(
267-
f"method {method} not yet implemented for {type(self).__name__}"
268-
)
269-
270-
raise ValueError("Invalid fill method")
271-
272257
def _get_engine_target(self) -> np.ndarray:
273258
return np.asarray(self._data)
274259

pandas/core/indexes/interval.py

+5-26
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset
1414
from pandas._typing import AnyArrayLike, DtypeObj, Label
1515
from pandas.errors import InvalidIndexError
16-
from pandas.util._decorators import Appender, Substitution, cache_readonly
16+
from pandas.util._decorators import Appender, cache_readonly
1717
from pandas.util._exceptions import rewrite_exception
1818

1919
from pandas.core.dtypes.cast import (
@@ -646,23 +646,6 @@ def get_loc(
646646
return mask.argmax()
647647
return lib.maybe_booleans_to_slice(mask.view("u1"))
648648

649-
@Substitution(
650-
**dict(
651-
_index_doc_kwargs,
652-
**{
653-
"raises_section": textwrap.dedent(
654-
"""
655-
Raises
656-
------
657-
NotImplementedError
658-
If any method argument other than the default of
659-
None is specified as these are not yet implemented.
660-
"""
661-
)
662-
},
663-
)
664-
)
665-
@Appender(_index_shared_docs["get_indexer"])
666649
def _get_indexer(
667650
self,
668651
target: Index,
@@ -671,14 +654,6 @@ def _get_indexer(
671654
tolerance: Optional[Any] = None,
672655
) -> np.ndarray:
673656

674-
self._check_indexing_method(method)
675-
676-
if self.is_overlapping:
677-
raise InvalidIndexError(
678-
"cannot handle overlapping indices; "
679-
"use IntervalIndex.get_indexer_non_unique"
680-
)
681-
682657
if isinstance(target, IntervalIndex):
683658
# equal indexes -> 1:1 positional match
684659
if self.equals(target):
@@ -767,6 +742,10 @@ def _get_indexer_pointwise(self, target: Index) -> Tuple[np.ndarray, np.ndarray]
767742
def _index_as_unique(self):
768743
return not self.is_overlapping
769744

745+
_requires_unique_msg = (
746+
"cannot handle overlapping indices; use IntervalIndex.get_indexer_non_unique"
747+
)
748+
770749
def _convert_slice_indexer(self, key: slice, kind: str):
771750
if not (key.step is None or key.step == 1):
772751
# GH#31658 if label-based, we require step == 1,

pandas/core/indexes/multi.py

+1-5
Original file line number | Diff line number | Diff line change
@@ -2595,11 +2595,10 @@ def _get_partial_string_timestamp_match_key(self, key):
25952595

25962596
return key
25972597

2598-
@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
25992598
def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):
26002599

26012600
# empty indexer
2602-
if is_list_like(target) and not len(target):
2601+
if not len(target):
26032602
return ensure_platform_int(np.array([]))
26042603

26052604
if not isinstance(target, MultiIndex):
@@ -2613,9 +2612,6 @@ def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):
26132612
target, method=method, limit=limit, tolerance=tolerance
26142613
)
26152614

2616-
if not self.is_unique:
2617-
raise ValueError("Reindexing only valid with uniquely valued Index objects")
2618-
26192615
if method == "pad" or method == "backfill":
26202616
if tolerance is not None:
26212617
raise NotImplementedError(

pandas/core/indexes/period.py

+2-7
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
from pandas._libs.tslibs.parsing import DateParseError, parse_time_string
1010
from pandas._typing import DtypeObj
1111
from pandas.errors import InvalidIndexError
12-
from pandas.util._decorators import Appender, cache_readonly, doc
12+
from pandas.util._decorators import cache_readonly, doc
1313

1414
from pandas.core.dtypes.common import (
1515
is_bool_dtype,
@@ -31,11 +31,7 @@
3131
)
3232
import pandas.core.common as com
3333
import pandas.core.indexes.base as ibase
34-
from pandas.core.indexes.base import (
35-
_index_shared_docs,
36-
ensure_index,
37-
maybe_extract_name,
38-
)
34+
from pandas.core.indexes.base import ensure_index, maybe_extract_name
3935
from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
4036
from pandas.core.indexes.datetimes import DatetimeIndex, Index
4137
from pandas.core.indexes.extension import inherit_names
@@ -448,7 +444,6 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False)
448444
# ------------------------------------------------------------------------
449445
# Indexing Methods
450446

451-
@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
452447
def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):
453448

454449
if not self._should_compare(target):

pandas/core/indexes/range.py

+2-3
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
from pandas._libs.lib import no_default
1111
from pandas._typing import Label
1212
from pandas.compat.numpy import function as nv
13-
from pandas.util._decorators import Appender, cache_readonly, doc
13+
from pandas.util._decorators import cache_readonly, doc
1414

1515
from pandas.core.dtypes.common import (
1616
ensure_platform_int,
@@ -28,7 +28,7 @@
2828
import pandas.core.common as com
2929
from pandas.core.construction import extract_array
3030
import pandas.core.indexes.base as ibase
31-
from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name
31+
from pandas.core.indexes.base import maybe_extract_name
3232
from pandas.core.indexes.numeric import Float64Index, Int64Index
3333
from pandas.core.ops.common import unpack_zerodim_and_defer
3434

@@ -354,7 +354,6 @@ def get_loc(self, key, method=None, tolerance=None):
354354
raise KeyError(key)
355355
return super().get_loc(key, method=method, tolerance=tolerance)
356356

357-
@Appender(_index_shared_docs["get_indexer"])
358357
def _get_indexer(self, target, method=None, limit=None, tolerance=None):
359358
if com.any_not_none(method, tolerance, limit) or not is_list_like(target):
360359
return super()._get_indexer(

pandas/tests/indexes/categorical/test_indexing.py

+21-10
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33

4+
from pandas.errors import InvalidIndexError
5+
46
import pandas as pd
57
from pandas import CategoricalIndex, Index, IntervalIndex, Timestamp
68
import pandas._testing as tm
@@ -204,18 +206,19 @@ def test_get_indexer_base(self):
204206
with pytest.raises(ValueError, match="Invalid fill method"):
205207
idx.get_indexer(idx, method="invalid")
206208

207-
def test_get_indexer_non_unique(self):
209+
def test_get_indexer_requires_unique(self):
208210
np.random.seed(123456789)
209211

210212
ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
211213
oidx = Index(np.array(ci))
212214

215+
msg = "Reindexing only valid with uniquely valued Index objects"
216+
213217
for n in [1, 2, 5, len(ci)]:
214218
finder = oidx[np.random.randint(0, len(ci), size=n)]
215-
expected = oidx.get_indexer_non_unique(finder)[0]
216219

217-
actual = ci.get_indexer(finder)
218-
tm.assert_numpy_array_equal(expected, actual)
220+
with pytest.raises(InvalidIndexError, match=msg):
221+
ci.get_indexer(finder)
219222

220223
# see gh-17323
221224
#
@@ -224,19 +227,27 @@ def test_get_indexer_non_unique(self):
224227
# respect duplicates instead of taking
225228
# the fast-track path.
226229
for finder in [list("aabbca"), list("aababca")]:
227-
expected = oidx.get_indexer_non_unique(finder)[0]
228230

229-
actual = ci.get_indexer(finder)
230-
tm.assert_numpy_array_equal(expected, actual)
231+
with pytest.raises(InvalidIndexError, match=msg):
232+
ci.get_indexer(finder)
231233

232-
def test_get_indexer(self):
234+
def test_get_indexer_non_unique(self):
233235

234236
idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
235237
idx2 = CategoricalIndex(list("abf"))
236238

237239
for indexer in [idx2, list("abf"), Index(list("abf"))]:
238-
r1 = idx1.get_indexer(idx2)
239-
tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp))
240+
msg = "Reindexing only valid with uniquely valued Index objects"
241+
with pytest.raises(InvalidIndexError, match=msg):
242+
idx1.get_indexer(idx2)
243+
244+
r1, _ = idx1.get_indexer_non_unique(idx2)
245+
expected = np.array([0, 1, 2, -1], dtype=np.intp)
246+
tm.assert_almost_equal(r1, expected)
247+
248+
def test_get_indexer_method(self):
249+
idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
250+
idx2 = CategoricalIndex(list("abf"))
240251

241252
msg = "method pad not yet implemented for CategoricalIndex"
242253
with pytest.raises(NotImplementedError, match=msg):

pandas/tests/indexes/common.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -205,9 +205,10 @@ def test_reindex_base(self):
205205
def test_get_indexer_consistency(self, index):
206206
# See GH 16819
207207
if isinstance(index, IntervalIndex):
208+
# requires index.is_non_overlapping
208209
return
209210

210-
if index.is_unique or isinstance(index, CategoricalIndex):
211+
if index.is_unique:
211212
indexer = index.get_indexer(index[0:2])
212213
assert isinstance(indexer, np.ndarray)
213214
assert indexer.dtype == np.intp

0 commit comments

Comments
 (0)