diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 87a049c77dc32..9f011bc9d2651 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1680,26 +1680,6 @@ def _box_func(self, i: int): return np.NaN return self.categories[i] - def _validate_listlike(self, target: ArrayLike) -> np.ndarray: - """ - Extract integer codes we can use for comparison. - - Notes - ----- - If a value in target is not present, it gets coded as -1. - """ - - if isinstance(target, Categorical): - # Indexing on codes is more efficient if categories are the same, - # so we can apply some optimizations based on the degree of - # dtype-matching. - cat = self._encode_with_my_categories(target) - codes = cat._codes - else: - codes = self.categories.get_indexer(target) - - return codes - def _unbox_scalar(self, key) -> int: # searchsorted is very performance sensitive. By converting codes # to same dtype as self.codes, we get much faster performance. @@ -1707,10 +1687,6 @@ def _unbox_scalar(self, key) -> int: code = self._codes.dtype.type(code) return code - def _unbox_listlike(self, value): - unboxed = self.categories.get_indexer(value) - return unboxed.astype(self._ndarray.dtype, copy=False) - # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): @@ -1884,7 +1860,8 @@ def _validate_setitem_value(self, value): "category, set the categories first" ) - return self._unbox_listlike(rvalue) + codes = self.categories.get_indexer(rvalue) + return codes.astype(self._ndarray.dtype, copy=False) def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 3f04339803bf6..64037f5757a38 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -48,6 +48,9 @@ def recode_for_groupby( """ # we only care about observed values if observed: + # In cases with c.ordered, this is equivalent to + # return c.remove_unused_categories(), c + unique_codes = unique1d(c.codes) take_codes = unique_codes[unique_codes != -1] diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 859c26a40e50d..24bd60a7356dd 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -8,7 +8,7 @@ from pandas._libs import index as libindex from pandas._libs.hashtable import duplicated_int64 from pandas._libs.lib import no_default -from pandas._typing import Label +from pandas._typing import ArrayLike, Label from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( @@ -542,7 +542,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "method='nearest' not implemented yet for CategoricalIndex" ) - codes = self._values._validate_listlike(target._values) + # Note: we use engine.get_indexer_non_unique below because, even if + # `target` is unique, any non-category entries in it will be encoded + # as -1 by _get_codes_for_get_indexer, so `codes` may not be unique. + codes = self._get_codes_for_get_indexer(target._values) indexer, _ = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer) @@ -550,10 +553,30 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) - codes = self._values._validate_listlike(target._values) + codes = self._get_codes_for_get_indexer(target._values) indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), missing + def _get_codes_for_get_indexer(self, target: ArrayLike) -> np.ndarray: + """ + Extract integer codes we can use for comparison. + + Notes + ----- + If a value in target is not present, it gets coded as -1. + """ + + if isinstance(target, Categorical): + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + cat = self._data._encode_with_my_categories(target) + codes = cat._codes + else: + codes = self.categories.get_indexer(target) + + return codes + @doc(Index._convert_list_indexer) def _convert_list_indexer(self, keyarr): # Return our indexer or raise if all of the values are not included in diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d49e834fedb2d..dd45a00155721 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1951,21 +1951,23 @@ def _factorize_keys( rk, _ = rk._values_for_factorize() elif ( - is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk) + is_categorical_dtype(lk.dtype) + and is_categorical_dtype(rk.dtype) + and is_dtype_equal(lk.dtype, rk.dtype) ): assert isinstance(lk, Categorical) assert isinstance(rk, Categorical) # Cast rk to encoding so we can compare codes with lk - rk = lk._validate_listlike(rk) + rk = lk._encode_with_my_categories(rk) lk = ensure_int64(lk.codes) - rk = ensure_int64(rk) + rk = ensure_int64(rk.codes) elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): lk, _ = lk._values_for_factorize() rk, _ = rk._values_for_factorize() - if is_integer_dtype(lk) and is_integer_dtype(rk): + if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype klass = libhashtable.Int64Factorizer