Skip to content

REF: de-duplicate get_indexer methods #38372

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Dec 11, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ Interval

Indexing
^^^^^^^^
- Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`)
- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
-
-
Expand Down
30 changes: 25 additions & 5 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3147,6 +3147,11 @@ def get_indexer(
method = missing.clean_reindex_fill_method(method)
target = ensure_index(target)

self._check_indexing_method(method)

if not self._index_as_unique:
raise InvalidIndexError(self._requires_unique_msg)

# Treat boolean labels passed to a numeric index as not found. Without
# this fix False and True would be treated as 0 and 1 respectively.
# (GH #16877)
Expand Down Expand Up @@ -3174,11 +3179,6 @@ def _get_indexer(
target, method=method, limit=limit, tolerance=tolerance
)

if not self.is_unique:
raise InvalidIndexError(
"Reindexing only valid with uniquely valued Index objects"
)

if method == "pad" or method == "backfill":
indexer = self._get_fill_indexer(target, method, limit, tolerance)
elif method == "nearest":
Expand All @@ -3199,6 +3199,24 @@ def _get_indexer(

return ensure_platform_int(indexer)

def _check_indexing_method(self, method):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

type at some point

"""
Raise if we have a get_indexer `method` that is not supported or valid.
"""
# GH#37871 for now this is only for IntervalIndex and CategoricalIndex
if not (is_interval_dtype(self.dtype) or is_categorical_dtype(self.dtype)):
return

if method is None:
return

if method in ["bfill", "backfill", "pad", "ffill", "nearest"]:
raise NotImplementedError(
f"method {method} not yet implemented for {type(self).__name__}"
)

raise ValueError("Invalid fill method")

def _convert_tolerance(self, tolerance, target):
# override this method on subclasses
tolerance = np.asarray(tolerance)
Expand Down Expand Up @@ -5014,6 +5032,8 @@ def _index_as_unique(self):
"""
return self.is_unique

_requires_unique_msg = "Reindexing only valid with uniquely valued Index objects"

@final
def _maybe_promote(self, other: "Index"):
"""
Expand Down
5 changes: 1 addition & 4 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,14 +494,11 @@ def _reindex_non_unique(self, target):
def _maybe_cast_indexer(self, key) -> int:
return self._data._unbox_scalar(key)

@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def _get_indexer(
self, target: "Index", method=None, limit=None, tolerance=None
) -> np.ndarray:

self._check_indexing_method(method)

if self.is_unique and self.equals(target):
if self.equals(target):
return np.arange(len(self), dtype="intp")

return self._get_indexer_non_unique(target._values)[0]
Expand Down
15 changes: 0 additions & 15 deletions pandas/core/indexes/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,21 +254,6 @@ def searchsorted(self, value, side="left", sorter=None) -> np.ndarray:

# ---------------------------------------------------------------------

def _check_indexing_method(self, method):
"""
Raise if we have a get_indexer `method` that is not supported or valid.
"""
# GH#37871 for now this is only for IntervalIndex and CategoricalIndex
if method is None:
return

if method in ["bfill", "backfill", "pad", "ffill", "nearest"]:
raise NotImplementedError(
f"method {method} not yet implemented for {type(self).__name__}"
)

raise ValueError("Invalid fill method")

def _get_engine_target(self) -> np.ndarray:
return np.asarray(self._data)

Expand Down
31 changes: 5 additions & 26 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset
from pandas._typing import AnyArrayLike, DtypeObj, Label
from pandas.errors import InvalidIndexError
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.util._decorators import Appender, cache_readonly
from pandas.util._exceptions import rewrite_exception

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -646,23 +646,6 @@ def get_loc(
return mask.argmax()
return lib.maybe_booleans_to_slice(mask.view("u1"))

@Substitution(
**dict(
_index_doc_kwargs,
**{
"raises_section": textwrap.dedent(
"""
Raises
------
NotImplementedError
If any method argument other than the default of
None is specified as these are not yet implemented.
"""
)
},
)
)
@Appender(_index_shared_docs["get_indexer"])
def _get_indexer(
self,
target: Index,
Expand All @@ -671,14 +654,6 @@ def _get_indexer(
tolerance: Optional[Any] = None,
) -> np.ndarray:

self._check_indexing_method(method)

if self.is_overlapping:
raise InvalidIndexError(
"cannot handle overlapping indices; "
"use IntervalIndex.get_indexer_non_unique"
)

if isinstance(target, IntervalIndex):
# equal indexes -> 1:1 positional match
if self.equals(target):
Expand Down Expand Up @@ -767,6 +742,10 @@ def _get_indexer_pointwise(self, target: Index) -> Tuple[np.ndarray, np.ndarray]
def _index_as_unique(self):
return not self.is_overlapping

_requires_unique_msg = (
"cannot handle overlapping indices; use IntervalIndex.get_indexer_non_unique"
)

def _convert_slice_indexer(self, key: slice, kind: str):
if not (key.step is None or key.step == 1):
# GH#31658 if label-based, we require step == 1,
Expand Down
6 changes: 1 addition & 5 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2595,11 +2595,10 @@ def _get_partial_string_timestamp_match_key(self, key):

return key

@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):

# empty indexer
if is_list_like(target) and not len(target):
if not len(target):
return ensure_platform_int(np.array([]))

if not isinstance(target, MultiIndex):
Expand All @@ -2613,9 +2612,6 @@ def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):
target, method=method, limit=limit, tolerance=tolerance
)

if not self.is_unique:
raise ValueError("Reindexing only valid with uniquely valued Index objects")

if method == "pad" or method == "backfill":
if tolerance is not None:
raise NotImplementedError(
Expand Down
9 changes: 2 additions & 7 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pandas._libs.tslibs.parsing import DateParseError, parse_time_string
from pandas._typing import DtypeObj
from pandas.errors import InvalidIndexError
from pandas.util._decorators import Appender, cache_readonly, doc
from pandas.util._decorators import cache_readonly, doc

from pandas.core.dtypes.common import (
is_bool_dtype,
Expand All @@ -31,11 +31,7 @@
)
import pandas.core.common as com
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
_index_shared_docs,
ensure_index,
maybe_extract_name,
)
from pandas.core.indexes.base import ensure_index, maybe_extract_name
from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
from pandas.core.indexes.datetimes import DatetimeIndex, Index
from pandas.core.indexes.extension import inherit_names
Expand Down Expand Up @@ -448,7 +444,6 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False)
# ------------------------------------------------------------------------
# Indexing Methods

@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None):

if not self._should_compare(target):
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pandas._libs.lib import no_default
from pandas._typing import Label
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender, cache_readonly, doc
from pandas.util._decorators import cache_readonly, doc

from pandas.core.dtypes.common import (
ensure_platform_int,
Expand All @@ -28,7 +28,7 @@
import pandas.core.common as com
from pandas.core.construction import extract_array
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name
from pandas.core.indexes.base import maybe_extract_name
from pandas.core.indexes.numeric import Float64Index, Int64Index
from pandas.core.ops.common import unpack_zerodim_and_defer

Expand Down Expand Up @@ -354,7 +354,6 @@ def get_loc(self, key, method=None, tolerance=None):
raise KeyError(key)
return super().get_loc(key, method=method, tolerance=tolerance)

@Appender(_index_shared_docs["get_indexer"])
def _get_indexer(self, target, method=None, limit=None, tolerance=None):
if com.any_not_none(method, tolerance, limit) or not is_list_like(target):
return super()._get_indexer(
Expand Down
31 changes: 21 additions & 10 deletions pandas/tests/indexes/categorical/test_indexing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest

from pandas.errors import InvalidIndexError

import pandas as pd
from pandas import CategoricalIndex, Index, IntervalIndex, Timestamp
import pandas._testing as tm
Expand Down Expand Up @@ -204,18 +206,19 @@ def test_get_indexer_base(self):
with pytest.raises(ValueError, match="Invalid fill method"):
idx.get_indexer(idx, method="invalid")

def test_get_indexer_non_unique(self):
def test_get_indexer_requires_unique(self):
np.random.seed(123456789)

ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
oidx = Index(np.array(ci))

msg = "Reindexing only valid with uniquely valued Index objects"

for n in [1, 2, 5, len(ci)]:
finder = oidx[np.random.randint(0, len(ci), size=n)]
expected = oidx.get_indexer_non_unique(finder)[0]

actual = ci.get_indexer(finder)
tm.assert_numpy_array_equal(expected, actual)
with pytest.raises(InvalidIndexError, match=msg):
ci.get_indexer(finder)

# see gh-17323
#
Expand All @@ -224,19 +227,27 @@ def test_get_indexer_non_unique(self):
# respect duplicates instead of taking
# the fast-track path.
for finder in [list("aabbca"), list("aababca")]:
expected = oidx.get_indexer_non_unique(finder)[0]

actual = ci.get_indexer(finder)
tm.assert_numpy_array_equal(expected, actual)
with pytest.raises(InvalidIndexError, match=msg):
ci.get_indexer(finder)

def test_get_indexer(self):
def test_get_indexer_non_unique(self):

idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
idx2 = CategoricalIndex(list("abf"))

for indexer in [idx2, list("abf"), Index(list("abf"))]:
r1 = idx1.get_indexer(idx2)
tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp))
msg = "Reindexing only valid with uniquely valued Index objects"
with pytest.raises(InvalidIndexError, match=msg):
idx1.get_indexer(idx2)

r1, _ = idx1.get_indexer_non_unique(idx2)
expected = np.array([0, 1, 2, -1], dtype=np.intp)
tm.assert_almost_equal(r1, expected)

def test_get_indexer_method(self):
idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
idx2 = CategoricalIndex(list("abf"))

msg = "method pad not yet implemented for CategoricalIndex"
with pytest.raises(NotImplementedError, match=msg):
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,9 +205,10 @@ def test_reindex_base(self):
def test_get_indexer_consistency(self, index):
# See GH 16819
if isinstance(index, IntervalIndex):
# requires index.is_non_overlapping
return

if index.is_unique or isinstance(index, CategoricalIndex):
if index.is_unique:
indexer = index.get_indexer(index[0:2])
assert isinstance(indexer, np.ndarray)
assert indexer.dtype == np.intp
Expand Down