From c22eb72d5113e9edaee16e861c2ce0ceef904e21 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Jun 2021 11:55:04 -0700 Subject: [PATCH 1/2] REF: remove Index._convert_list_indexer --- pandas/core/indexes/base.py | 18 +----------------- pandas/core/indexes/category.py | 12 ------------ pandas/core/indexes/interval.py | 14 -------------- pandas/core/indexing.py | 25 ++++++++++++++++++++++--- pandas/tests/indexing/test_indexing.py | 2 +- 5 files changed, 24 insertions(+), 47 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 47b99f11c32fe..457ec8ab702d5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3720,7 +3720,7 @@ def _convert_listlike_indexer(self, keyarr): else: keyarr = self._convert_arr_indexer(keyarr) - indexer = self._convert_list_indexer(keyarr) + indexer = None return indexer, keyarr def _convert_arr_indexer(self, keyarr) -> np.ndarray: @@ -3738,22 +3738,6 @@ def _convert_arr_indexer(self, keyarr) -> np.ndarray: """ return com.asarray_tuplesafe(keyarr) - def _convert_list_indexer(self, keyarr): - """ - Convert a list-like indexer to the appropriate dtype. - - Parameters - ---------- - keyarr : Index (or sub-class) - Indexer to convert. - kind : iloc, loc, optional - - Returns - ------- - positional indexer or None - """ - return None - @final def _invalid_indexer(self, form: str_t, key) -> TypeError: """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 1541885887dab..554cf33e22555 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -525,18 +525,6 @@ def _get_indexer_non_unique( indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), ensure_platform_int(missing) - @doc(Index._convert_list_indexer) - def _convert_list_indexer(self, keyarr): - # Return our indexer or raise if all of the values are not included in - # the categories - - if self.categories._defer_to_indexing: - # See tests.indexing.interval.test_interval:test_loc_getitem_frame - indexer = self.categories._convert_list_indexer(keyarr) - return Index(self.codes).get_indexer_for(indexer) - - return self.get_indexer_for(keyarr) - # -------------------------------------------------------------------- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 06ab7fdbcf872..58c2b3e26ce06 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -815,20 +815,6 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default): self._deprecated_arg(kind, "kind", "_maybe_cast_slice_bound") return getattr(self, side)._maybe_cast_slice_bound(label, side) - @Appender(Index._convert_list_indexer.__doc__) - def _convert_list_indexer(self, keyarr): - """ - we are passed a list-like indexer. Return the - indexer for matching intervals. - """ - locs = self.get_indexer_for(keyarr) - - # we have missing values - if (locs == -1).any(): - raise KeyError(keyarr[locs == -1].tolist()) - - return locs - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: if not isinstance(dtype, IntervalDtype): return False diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d5555561088eb..cf8187ffb761e 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -51,8 +51,11 @@ length_of_indexer, ) from pandas.core.indexes.api import ( + CategoricalIndex, Index, + IntervalIndex, MultiIndex, + ensure_index, ) if TYPE_CHECKING: @@ -1297,6 +1300,11 @@ def _get_listlike_indexer(self, key, axis: int): keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) self._validate_read_indexer(keyarr, indexer, axis) + + if isinstance(ax, (IntervalIndex, CategoricalIndex)): + # take instead of reindex to preserve dtype. For IntervalIndex + # this is to map integers to the Intervals they match to. + keyarr = ax.take(indexer) return keyarr, indexer def _validate_read_indexer(self, key, indexer, axis: int): @@ -1329,13 +1337,24 @@ def _validate_read_indexer(self, key, indexer, axis: int): missing = (missing_mask).sum() if missing: + ax = self.obj._get_axis(axis) + + # TODO: remove special-case; this is just to keep exception + # message tests from raising while debugging + use_interval_msg = isinstance(ax, IntervalIndex) or ( + isinstance(ax, CategoricalIndex) + and isinstance(ax.categories, IntervalIndex) + ) + if missing == len(indexer): axis_name = self.obj._get_axis_name(axis) + if use_interval_msg: + raise KeyError(list(key)) raise KeyError(f"None of [{key}] are in the [{axis_name}]") - ax = self.obj._get_axis(axis) - - not_found = list(set(key) - set(ax)) + not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) + if use_interval_msg: + raise KeyError(not_found) raise KeyError(f"{not_found} not in index") diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 772aa97c47233..9c6a39c991912 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -338,7 +338,7 @@ def test_multitype_list_index_access(self): # GH 10610 df = DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23]) - with pytest.raises(KeyError, match=re.escape("'[-8, 26] not in index'")): + with pytest.raises(KeyError, match=re.escape("'[26, -8] not in index'")): df[[22, 26, -8]] assert df[21].shape[0] == df.shape[0] From 2e06c46ade52eeed6276918cb77bbce8922e3bfc Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Jun 2021 13:06:43 -0700 Subject: [PATCH 2/2] update to standardize exception messages --- pandas/core/indexing.py | 4 +--- pandas/tests/indexing/interval/test_interval.py | 8 ++++---- pandas/tests/indexing/interval/test_interval_new.py | 3 ++- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index cf8187ffb761e..66de374121fb0 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1349,12 +1349,10 @@ def _validate_read_indexer(self, key, indexer, axis: int): if missing == len(indexer): axis_name = self.obj._get_axis_name(axis) if use_interval_msg: - raise KeyError(list(key)) + key = list(key) raise KeyError(f"None of [{key}] are in the [{axis_name}]") not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) - if use_interval_msg: - raise KeyError(not_found) raise KeyError(f"{not_found} not in index") diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 395e9297a8fde..503e39041a49f 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -65,10 +65,10 @@ def test_getitem_non_matching(self, series_with_interval_index, indexer_sl): # this is a departure from our current # indexing scheme, but simpler - with pytest.raises(KeyError, match=r"^\[-1\]$"): + with pytest.raises(KeyError, match=r"\[-1\] not in index"): indexer_sl(ser)[[-1, 3, 4, 5]] - with pytest.raises(KeyError, match=r"^\[-1\]$"): + with pytest.raises(KeyError, match=r"\[-1\] not in index"): indexer_sl(ser)[[-1, 3]] @pytest.mark.slow @@ -107,11 +107,11 @@ def test_loc_getitem_frame(self): expected = df.take([4, 5, 4, 5]) tm.assert_frame_equal(result, expected) - with pytest.raises(KeyError, match=r"^\[10\]$"): + with pytest.raises(KeyError, match=r"None of \[\[10\]\] are"): df.loc[[10]] # partial missing - with pytest.raises(KeyError, match=r"^\[10\]$"): + with pytest.raises(KeyError, match=r"\[10\] not in index"): df.loc[[10, 4]] diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 34dc5d604e90d..aad6523357df6 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -150,7 +150,8 @@ def test_loc_with_overlap(self, indexer_sl): with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): indexer_sl(ser)[Interval(3, 5)] - with pytest.raises(KeyError, match=r"^\[Interval\(3, 5, closed='right'\)\]$"): + msg = r"None of \[\[Interval\(3, 5, closed='right'\)\]\]" + with pytest.raises(KeyError, match=msg): indexer_sl(ser)[[Interval(3, 5)]] # slices with interval (only exact matches)