From 291a72599fe4b54edc5a6b65ff5ac67db414f1a8 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 11 Jul 2021 11:47:32 -0700 Subject: [PATCH 1/3] REF: implement get_indexer_strict --- pandas/core/frame.py | 2 +- pandas/core/indexes/base.py | 78 +++++++++++++++++++++++++++++++ pandas/core/indexes/multi.py | 39 ++++++++-------- pandas/core/indexing.py | 90 +----------------------------------- 4 files changed, 102 insertions(+), 107 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 43adb4df7fcb4..841b0fe4f0195 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3456,7 +3456,7 @@ def __getitem__(self, key): else: if is_iterator(key): key = list(key) - indexer = self.loc._get_listlike_indexer(key, axis=1)[1] + indexer = self.columns.get_indexer_strict(key, "columns")[1] # take() does not accept boolean indexers if getattr(indexer, "dtype", None) == bool: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2f1d85f1340a4..cff022d6db06a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5390,6 +5390,84 @@ def get_indexer_for(self, target) -> np.ndarray: indexer, _ = self.get_indexer_non_unique(target) return indexer + def get_indexer_strict(self, key, axis_name: str) -> np.ndarray: + """ + Analogue to get_indexer that raises if any elements are missing. + """ + keyarr = key + if not isinstance(keyarr, Index): + keyarr = com.asarray_tuplesafe(keyarr) + + if self._index_as_unique: + indexer = self.get_indexer_for(keyarr) + keyarr = self.reindex(keyarr)[0] + else: + keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr) + + self._raise_if_missing(keyarr, indexer, axis_name) + + if ( + needs_i8_conversion(self.dtype) + or is_categorical_dtype(self.dtype) + or is_interval_dtype(self.dtype) + ): + # For CategoricalIndex take instead of reindex to preserve dtype. + # For IntervalIndex this is to map integers to the Intervals they match to. + keyarr = self.take(indexer) + if keyarr.dtype.kind in ["m", "M"]: + # DTI/TDI.take can infer a freq in some cases when we dont want one + if isinstance(key, list) or ( + isinstance(key, type(self)) and key.freq is None + ): + keyarr = keyarr._with_freq(None) + + return keyarr, indexer + + def _raise_if_missing(self, key, indexer, axis_name: str): + """ + Check that indexer can be used to return a result. + + e.g. at least one element was found, + unless the list of keys was actually empty. + + Parameters + ---------- + key : list-like + Targeted labels (only used to show correct error message). + indexer: array-like of booleans + Indices corresponding to the key, + (with -1 indicating not found). + axis_name : str + + Raises + ------ + KeyError + If at least one key was requested but none was found. + """ + if len(key) == 0: + return + + # Count missing values + missing_mask = indexer < 0 + nmissing = missing_mask.sum() + + if nmissing: + + # TODO: remove special-case; this is just to keep exception + # message tests from raising while debugging + use_interval_msg = is_interval_dtype(self.dtype) or ( + is_categorical_dtype(self.dtype) + and is_interval_dtype(self.categories.dtype) + ) + + if nmissing == len(indexer): + if use_interval_msg: + key = list(key) + raise KeyError(f"None of [{key}] are in the [{axis_name}]") + + not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) + raise KeyError(f"{not_found} not in index") + @overload def _get_indexer_non_comparable( self, target: Index, method, unique: Literal[True] = ... diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8903d29782610..6b02358ab2dcc 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2542,29 +2542,32 @@ def _get_values_for_loc(self, series: Series, loc, key): new_ser = series._constructor(new_values, index=new_index, name=series.name) return new_ser.__finalize__(series) - def _convert_listlike_indexer(self, keyarr) -> np.ndarray | None: - """ - Analogous to get_indexer when we are partial-indexing on our first level. - - Parameters - ---------- - keyarr : Index, np.ndarray, or ExtensionArray - Indexer to convert. + def get_indexer_strict(self, key, axis_name) -> np.ndarray: - Returns - ------- - np.ndarray[intp] or None - """ - indexer = None + keyarr = key + if not isinstance(keyarr, Index): + keyarr = com.asarray_tuplesafe(keyarr) - # are we indexing a specific level if len(keyarr) and not isinstance(keyarr[0], tuple): _, indexer = self.reindex(keyarr, level=0) - # take all if indexer is None: + # exact match indexer = np.arange(len(self), dtype=np.intp) - return indexer + + else: + self._raise_if_missing(key, indexer, axis_name) + return self[indexer], indexer + + return super().get_indexer_strict(key, axis_name) + + def _raise_if_missing(self, key, indexer, axis_name: str): + keyarr = key + if not isinstance(key, Index): + keyarr = com.asarray_tuplesafe(key) + + if len(keyarr) and not isinstance(keyarr[0], tuple): + # i.e. same condition for special case in MultiIndex.get_indexer_strict check = self.levels[0].get_indexer(keyarr) mask = check == -1 @@ -2574,8 +2577,8 @@ def _convert_listlike_indexer(self, keyarr) -> np.ndarray | None: # We get here when levels still contain values which are not # actually in Index anymore raise KeyError(f"{keyarr} not in index") - - return indexer + else: + return super()._raise_if_missing(key, indexer, axis_name) def _get_partial_string_timestamp_match_key(self, key): """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 387dcca6897b7..f85f74ff7f3b3 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -30,7 +30,6 @@ is_object_dtype, is_scalar, is_sequence, - needs_i8_conversion, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( @@ -56,11 +55,8 @@ length_of_indexer, ) from pandas.core.indexes.api import ( - CategoricalIndex, Index, - IntervalIndex, MultiIndex, - ensure_index, ) if TYPE_CHECKING: @@ -1300,94 +1296,12 @@ def _get_listlike_indexer(self, key, axis: int): Indexer for the return object, -1 denotes keys not found. """ ax = self.obj._get_axis(axis) + axis_name = self.obj._get_axis_name(axis) - keyarr = key - if not isinstance(keyarr, Index): - keyarr = com.asarray_tuplesafe(keyarr) - - if isinstance(ax, MultiIndex): - # get_indexer expects a MultiIndex or sequence of tuples, but - # we may be doing partial-indexing, so need an extra check - - # Have the index compute an indexer or return None - # if it cannot handle: - indexer = ax._convert_listlike_indexer(keyarr) - # We only act on all found values: - if indexer is not None and (indexer != -1).all(): - # _validate_read_indexer is a no-op if no -1s, so skip - return ax[indexer], indexer - - if ax._index_as_unique: - indexer = ax.get_indexer_for(keyarr) - keyarr = ax.reindex(keyarr)[0] - else: - keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) - - self._validate_read_indexer(keyarr, indexer, axis) - - if needs_i8_conversion(ax.dtype) or isinstance( - ax, (IntervalIndex, CategoricalIndex) - ): - # For CategoricalIndex take instead of reindex to preserve dtype. - # For IntervalIndex this is to map integers to the Intervals they match to. - keyarr = ax.take(indexer) - if keyarr.dtype.kind in ["m", "M"]: - # DTI/TDI.take can infer a freq in some cases when we dont want one - if isinstance(key, list) or ( - isinstance(key, type(ax)) and key.freq is None - ): - keyarr = keyarr._with_freq(None) + keyarr, indexer = ax.get_indexer_strict(key, axis_name) return keyarr, indexer - def _validate_read_indexer(self, key, indexer, axis: int): - """ - Check that indexer can be used to return a result. - - e.g. at least one element was found, - unless the list of keys was actually empty. - - Parameters - ---------- - key : list-like - Targeted labels (only used to show correct error message). - indexer: array-like of booleans - Indices corresponding to the key, - (with -1 indicating not found). - axis : int - Dimension on which the indexing is being made. - - Raises - ------ - KeyError - If at least one key was requested but none was found. - """ - if len(key) == 0: - return - - # Count missing values: - missing_mask = indexer < 0 - missing = (missing_mask).sum() - - if missing: - ax = self.obj._get_axis(axis) - - # TODO: remove special-case; this is just to keep exception - # message tests from raising while debugging - use_interval_msg = isinstance(ax, IntervalIndex) or ( - isinstance(ax, CategoricalIndex) - and isinstance(ax.categories, IntervalIndex) - ) - - if missing == len(indexer): - axis_name = self.obj._get_axis_name(axis) - if use_interval_msg: - key = list(key) - raise KeyError(f"None of [{key}] are in the [{axis_name}]") - - not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) - raise KeyError(f"{not_found} not in index") - @doc(IndexingMixin.iloc) class _iLocIndexer(_LocationIndexer): From 4e7e62825c8fc3fc181e1755775114129965b85a Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 11 Jul 2021 11:49:36 -0700 Subject: [PATCH 2/3] privatize get_indexer_strict->_get_indexer_strict --- pandas/core/frame.py | 2 +- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/multi.py | 6 +++--- pandas/core/indexing.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 841b0fe4f0195..ddb91d9329710 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3456,7 +3456,7 @@ def __getitem__(self, key): else: if is_iterator(key): key = list(key) - indexer = self.columns.get_indexer_strict(key, "columns")[1] + indexer = self.columns._get_indexer_strict(key, "columns")[1] # take() does not accept boolean indexers if getattr(indexer, "dtype", None) == bool: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cff022d6db06a..eb0af1c3802ba 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5390,7 +5390,7 @@ def get_indexer_for(self, target) -> np.ndarray: indexer, _ = self.get_indexer_non_unique(target) return indexer - def get_indexer_strict(self, key, axis_name: str) -> np.ndarray: + def _get_indexer_strict(self, key, axis_name: str) -> np.ndarray: """ Analogue to get_indexer that raises if any elements are missing. """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6b02358ab2dcc..9b89eb10c89a6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2542,7 +2542,7 @@ def _get_values_for_loc(self, series: Series, loc, key): new_ser = series._constructor(new_values, index=new_index, name=series.name) return new_ser.__finalize__(series) - def get_indexer_strict(self, key, axis_name) -> np.ndarray: + def _get_indexer_strict(self, key, axis_name) -> np.ndarray: keyarr = key if not isinstance(keyarr, Index): @@ -2559,7 +2559,7 @@ def get_indexer_strict(self, key, axis_name) -> np.ndarray: self._raise_if_missing(key, indexer, axis_name) return self[indexer], indexer - return super().get_indexer_strict(key, axis_name) + return super()._get_indexer_strict(key, axis_name) def _raise_if_missing(self, key, indexer, axis_name: str): keyarr = key @@ -2567,7 +2567,7 @@ def _raise_if_missing(self, key, indexer, axis_name: str): keyarr = com.asarray_tuplesafe(key) if len(keyarr) and not isinstance(keyarr[0], tuple): - # i.e. same condition for special case in MultiIndex.get_indexer_strict + # i.e. same condition for special case in MultiIndex._get_indexer_strict check = self.levels[0].get_indexer(keyarr) mask = check == -1 diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f85f74ff7f3b3..371cf58f593ce 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1298,7 +1298,7 @@ def _get_listlike_indexer(self, key, axis: int): ax = self.obj._get_axis(axis) axis_name = self.obj._get_axis_name(axis) - keyarr, indexer = ax.get_indexer_strict(key, axis_name) + keyarr, indexer = ax._get_indexer_strict(key, axis_name) return keyarr, indexer From 79a2e263baf5c47437e4d213b51792518b74accd Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 11 Jul 2021 14:38:10 -0700 Subject: [PATCH 3/3] mypy fixup --- pandas/core/indexes/base.py | 13 +++++++++---- pandas/core/indexes/multi.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index eb0af1c3802ba..1d8497545f50a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5390,7 +5390,7 @@ def get_indexer_for(self, target) -> np.ndarray: indexer, _ = self.get_indexer_non_unique(target) return indexer - def _get_indexer_strict(self, key, axis_name: str) -> np.ndarray: + def _get_indexer_strict(self, key, axis_name: str_t) -> tuple[Index, np.ndarray]: """ Analogue to get_indexer that raises if any elements are missing. """ @@ -5417,13 +5417,15 @@ def _get_indexer_strict(self, key, axis_name: str) -> np.ndarray: if keyarr.dtype.kind in ["m", "M"]: # DTI/TDI.take can infer a freq in some cases when we dont want one if isinstance(key, list) or ( - isinstance(key, type(self)) and key.freq is None + isinstance(key, type(self)) + # "Index" has no attribute "freq" + and key.freq is None # type: ignore[attr-defined] ): keyarr = keyarr._with_freq(None) return keyarr, indexer - def _raise_if_missing(self, key, indexer, axis_name: str): + def _raise_if_missing(self, key, indexer, axis_name: str_t): """ Check that indexer can be used to return a result. @@ -5457,7 +5459,10 @@ def _raise_if_missing(self, key, indexer, axis_name: str): # message tests from raising while debugging use_interval_msg = is_interval_dtype(self.dtype) or ( is_categorical_dtype(self.dtype) - and is_interval_dtype(self.categories.dtype) + # "Index" has no attribute "categories" [attr-defined] + and is_interval_dtype( + self.categories.dtype # type: ignore[attr-defined] + ) ) if nmissing == len(indexer): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9b89eb10c89a6..87d83dfce052d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2542,7 +2542,7 @@ def _get_values_for_loc(self, series: Series, loc, key): new_ser = series._constructor(new_values, index=new_index, name=series.name) return new_ser.__finalize__(series) - def _get_indexer_strict(self, key, axis_name) -> np.ndarray: + def _get_indexer_strict(self, key, axis_name: str) -> tuple[Index, np.ndarray]: keyarr = key if not isinstance(keyarr, Index):