diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 45623f182144b..b6cd0e325f8a6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -93,7 +93,7 @@ def func(self, other): if is_scalar(other): if other in self.categories: - i = self.categories.get_loc(other) + i = self._unbox_scalar(other) ret = op(self._codes, i) if opname not in {"__eq__", "__ge__", "__gt__"}: @@ -1184,8 +1184,7 @@ def _validate_searchsorted_value(self, value): # searchsorted is very performance sensitive. By converting codes # to same dtype as self.codes, we get much faster performance. if is_scalar(value): - codes = self.categories.get_loc(value) - codes = self.codes.dtype.type(codes) + codes = self._unbox_scalar(value) else: locs = [self.categories.get_loc(x) for x in value] codes = np.array(locs, dtype=self.codes.dtype) @@ -1212,7 +1211,7 @@ def _validate_fill_value(self, fill_value): if isna(fill_value): fill_value = -1 elif fill_value in self.categories: - fill_value = self.categories.get_loc(fill_value) + fill_value = self._unbox_scalar(fill_value) else: raise ValueError( f"'fill_value={fill_value}' is not present " @@ -1680,7 +1679,7 @@ def fillna(self, value=None, method=None, limit=None): if isna(value): codes[mask] = -1 else: - codes[mask] = self.categories.get_loc(value) + codes[mask] = self._unbox_scalar(value) else: raise TypeError( @@ -1734,6 +1733,17 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray: return codes + def _unbox_scalar(self, key) -> int: + # searchsorted is very performance sensitive. By converting codes + # to same dtype as self.codes, we get much faster performance. + code = self.categories.get_loc(key) + code = self._codes.dtype.type(code) + return code + + def _unbox_listlike(self, value): + unboxed = self.categories.get_indexer(value) + return unboxed.astype(self._ndarray.dtype, copy=False) + # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): @@ -1925,11 +1935,7 @@ def _validate_setitem_value(self, value): "category, set the categories first" ) - lindexer = self.categories.get_indexer(rvalue) - if isinstance(lindexer, np.ndarray) and lindexer.dtype.kind == "i": - lindexer = lindexer.astype(self._ndarray.dtype) - - return lindexer + return self._unbox_listlike(rvalue) def _validate_setitem_key(self, key): if lib.is_integer(key): @@ -2155,8 +2161,7 @@ def unique(self): return cat.set_categories(cat.categories.take(take_codes)) def _values_for_factorize(self): - codes = self.codes.astype("int64") - return codes, -1 + return self._ndarray, -1 @classmethod def _from_factorized(cls, uniques, original): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 377996344dbbc..14f713530868a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -697,7 +697,7 @@ def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT: return new_obj def _values_for_factorize(self): - return self.asi8, iNaT + return self._ndarray, iNaT @classmethod def _from_factorized(cls, values, original): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 85ef3e58576e3..829cf767c448f 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -512,10 +512,8 @@ def _reindex_non_unique(self, target): # -------------------------------------------------------------------- # Indexing Methods - def _maybe_cast_indexer(self, key): - code = self.categories.get_loc(key) - code = self.codes.dtype.type(code) - return code + def _maybe_cast_indexer(self, key) -> int: + return self._data._unbox_scalar(key) @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None):