REF: remove Categorical._validate_listlike, unbox_listlike (#37724)

jbrockmendel · web-flow · commit e534b10c9ef3 · 2020-11-09T22:36:12.000-05:00
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1680,37 +1680,13 @@ def _box_func(self, i: int):
             return np.NaN
         return self.categories[i]
 
-    def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
-        """
-        Extract integer codes we can use for comparison.
-
-        Notes
-        -----
-        If a value in target is not present, it gets coded as -1.
-        """
-
-        if isinstance(target, Categorical):
-            # Indexing on codes is more efficient if categories are the same,
-            #  so we can apply some optimizations based on the degree of
-            #  dtype-matching.
-            cat = self._encode_with_my_categories(target)
-            codes = cat._codes
-        else:
-            codes = self.categories.get_indexer(target)
-
-        return codes
-
     def _unbox_scalar(self, key) -> int:
         # searchsorted is very performance sensitive. By converting codes
         # to same dtype as self.codes, we get much faster performance.
         code = self.categories.get_loc(key)
         code = self._codes.dtype.type(code)
         return code
 
-    def _unbox_listlike(self, value):
-        unboxed = self.categories.get_indexer(value)
-        return unboxed.astype(self._ndarray.dtype, copy=False)
-
     # ------------------------------------------------------------------
 
     def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
@@ -1884,7 +1860,8 @@ def _validate_setitem_value(self, value):
                 "category, set the categories first"
             )
 
-        return self._unbox_listlike(rvalue)
+        codes = self.categories.get_indexer(rvalue)
+        return codes.astype(self._ndarray.dtype, copy=False)
 
     def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
         """
diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py
@@ -48,6 +48,9 @@ def recode_for_groupby(
     """
     # we only care about observed values
     if observed:
+        # In cases with c.ordered, this is equivalent to
+        #  return c.remove_unused_categories(), c
+
         unique_codes = unique1d(c.codes)
 
         take_codes = unique_codes[unique_codes != -1]
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -8,7 +8,7 @@
 from pandas._libs import index as libindex
 from pandas._libs.hashtable import duplicated_int64
 from pandas._libs.lib import no_default
-from pandas._typing import Label
+from pandas._typing import ArrayLike, Label
 from pandas.util._decorators import Appender, cache_readonly, doc
 
 from pandas.core.dtypes.common import (
@@ -542,18 +542,41 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 "method='nearest' not implemented yet for CategoricalIndex"
             )
 
-        codes = self._values._validate_listlike(target._values)
+        # Note: we use engine.get_indexer_non_unique below because, even if
+        #  `target` is unique, any non-category entries in it will be encoded
+        #  as -1 by _get_codes_for_get_indexer, so `codes` may not be unique.
+        codes = self._get_codes_for_get_indexer(target._values)
         indexer, _ = self._engine.get_indexer_non_unique(codes)
         return ensure_platform_int(indexer)
 
     @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
     def get_indexer_non_unique(self, target):
         target = ibase.ensure_index(target)
 
-        codes = self._values._validate_listlike(target._values)
+        codes = self._get_codes_for_get_indexer(target._values)
         indexer, missing = self._engine.get_indexer_non_unique(codes)
         return ensure_platform_int(indexer), missing
 
+    def _get_codes_for_get_indexer(self, target: ArrayLike) -> np.ndarray:
+        """
+        Extract integer codes we can use for comparison.
+
+        Notes
+        -----
+        If a value in target is not present, it gets coded as -1.
+        """
+
+        if isinstance(target, Categorical):
+            # Indexing on codes is more efficient if categories are the same,
+            #  so we can apply some optimizations based on the degree of
+            #  dtype-matching.
+            cat = self._data._encode_with_my_categories(target)
+            codes = cat._codes
+        else:
+            codes = self.categories.get_indexer(target)
+
+        return codes
+
     @doc(Index._convert_list_indexer)
     def _convert_list_indexer(self, keyarr):
         # Return our indexer or raise if all of the values are not included in
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -1951,21 +1951,23 @@ def _factorize_keys(
         rk, _ = rk._values_for_factorize()
 
     elif (
-        is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk)
+        is_categorical_dtype(lk.dtype)
+        and is_categorical_dtype(rk.dtype)
+        and is_dtype_equal(lk.dtype, rk.dtype)
     ):
         assert isinstance(lk, Categorical)
         assert isinstance(rk, Categorical)
         # Cast rk to encoding so we can compare codes with lk
-        rk = lk._validate_listlike(rk)
+        rk = lk._encode_with_my_categories(rk)
 
         lk = ensure_int64(lk.codes)
-        rk = ensure_int64(rk)
+        rk = ensure_int64(rk.codes)
 
     elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype):
         lk, _ = lk._values_for_factorize()
         rk, _ = rk._values_for_factorize()
 
-    if is_integer_dtype(lk) and is_integer_dtype(rk):
+    if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
         # GH#23917 TODO: needs tests for case where lk is integer-dtype
         #  and rk is datetime-dtype
         klass = libhashtable.Int64Factorizer