From 0d7a41ba7820ebbe130f5753963376e844cbf54b Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 5 Nov 2020 07:16:05 -0800
Subject: [PATCH 1/5] REF: implement Categorical.encode_with_my_categories

---
 pandas/core/arrays/categorical.py | 30 +++++++++++++++++++++++-------
 pandas/core/dtypes/concat.py      |  2 +-
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 626fb495dec03..e5a5718d96cbe 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1694,9 +1694,8 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
             # Indexing on codes is more efficient if categories are the same,
             #  so we can apply some optimizations based on the degree of
             #  dtype-matching.
-            codes = recode_for_categories(
-                target.codes, target.categories, self.categories, copy=False
-            )
+            cat = self.encode_with_my_categories(target)
+            codes = cat._codes
         else:
             codes = self.categories.get_indexer(target)
 
@@ -1868,8 +1867,8 @@ def _validate_setitem_value(self, value):
                     "without identical categories"
                 )
             # is_dtype_equal implies categories_match_up_to_permutation
-            new_codes = self._validate_listlike(value)
-            value = Categorical.from_codes(new_codes, dtype=self.dtype)
+            value = self.encode_with_my_categories(value)
+            return value._codes
 
         # wrap scalars and hashable-listlikes in list
         rvalue = value if not is_hashable(value) else [value]
@@ -2101,8 +2100,8 @@ def equals(self, other: object) -> bool:
         if not isinstance(other, Categorical):
             return False
         elif self._categories_match_up_to_permutation(other):
-            other_codes = self._validate_listlike(other)
-            return np.array_equal(self._codes, other_codes)
+            other = self.encode_with_my_categories(other)
+            return np.array_equal(self._codes, other._codes)
         return False
 
     @classmethod
@@ -2113,6 +2112,23 @@ def _concat_same_type(self, to_concat):
 
     # ------------------------------------------------------------------
 
+    def encode_with_my_categories(self, other: "Categorical") -> "Categorical":
+        """
+        Re-encode another categorical using this Categorical's categories.
+
+        Notes
+        -----
+        This assumes we have already checked
+        self._categories_match_up_to_permutation(other).
+        """
+        # Indexing on codes is more efficient if categories are the same,
+        #  so we can apply some optimizations based on the degree of
+        #  dtype-matching.
+        codes = recode_for_categories(
+            other.codes, other.categories, self.categories, copy=False
+        )
+        return self._from_backing_data(codes)
+
     def _categories_match_up_to_permutation(self, other: "Categorical") -> bool:
         """
         Returns True if categoricals are the same dtype
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index 99dc01ef421d1..11f8ed342fe2c 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -301,7 +301,7 @@ def _maybe_unwrap(x):
         categories = first.categories
         ordered = first.ordered
 
-        all_codes = [first._validate_listlike(x) for x in to_union]
+        all_codes = [first.encode_with_my_categories(x)._codes for x in to_union]
         new_codes = np.concatenate(all_codes)
 
         if sort_categories and not ignore_order and ordered:

From 48940d47b896df8df757db1a60f6789f895a3980 Mon Sep 17 00:00:00 2001
From: Micael Jarniac <micael@jarniac.com>
Date: Thu, 5 Nov 2020 09:35:46 -0300
Subject: [PATCH 2/5] DOC: Fix typo (#37636)

"columns(s)" sounded odd; I believe it was supposed to be just "column(s)".
---
 pandas/core/frame.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9d223ba2bab0c..049d2c4888a69 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -6449,7 +6449,7 @@ def update(
         See Also
         --------
         dict.update : Similar method for dictionaries.
-        DataFrame.merge : For column(s)-on-columns(s) operations.
+        DataFrame.merge : For column(s)-on-column(s) operations.
 
         Examples
         --------
@@ -7985,7 +7985,7 @@ def join(
 
         See Also
         --------
-        DataFrame.merge : For column(s)-on-columns(s) operations.
+        DataFrame.merge : For column(s)-on-column(s) operations.
 
         Notes
         -----

From 16de2a01049a461856a169dcab24fd42009eff23 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 8 Nov 2020 18:50:42 -0800
Subject: [PATCH 3/5] Use _encode_with_my_categories instead of
 _validate_listlike

---
 pandas/core/reshape/merge.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index aa883d518f8d1..436a622aa4ff0 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -1948,21 +1948,23 @@ def _factorize_keys(
         rk, _ = rk._values_for_factorize()
 
     elif (
-        is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk)
+        is_categorical_dtype(lk.dtype)
+        and is_categorical_dtype(rk.dtype)
+        and is_dtype_equal(lk.dtype, rk.dtype)
     ):
         assert isinstance(lk, Categorical)
         assert isinstance(rk, Categorical)
         # Cast rk to encoding so we can compare codes with lk
-        rk = lk._validate_listlike(rk)
+        rk = lk._encode_with_my_categories(rk)
 
         lk = ensure_int64(lk.codes)
-        rk = ensure_int64(rk)
+        rk = ensure_int64(rk.codes)
 
     elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype):
         lk, _ = lk._values_for_factorize()
         rk, _ = rk._values_for_factorize()
 
-    if is_integer_dtype(lk) and is_integer_dtype(rk):
+    if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
         # GH#23917 TODO: needs tests for case where lk is integer-dtype
         #  and rk is datetime-dtype
         klass = libhashtable.Int64Factorizer

From 12e397904f925ee7ed3fac233651792b2fa60c72 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 9 Nov 2020 13:07:26 -0800
Subject: [PATCH 4/5] REF: Categorical._validate_listlike ->
 CategoricalIndex._get_codes_for_get_indexer

---
 pandas/core/arrays/categorical.py  | 20 --------------------
 pandas/core/groupby/categorical.py |  3 +++
 pandas/core/indexes/category.py    | 29 ++++++++++++++++++++++++++---
 3 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 87a049c77dc32..4f8e1b5c2abbf 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1680,26 +1680,6 @@ def _box_func(self, i: int):
             return np.NaN
         return self.categories[i]
 
-    def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
-        """
-        Extract integer codes we can use for comparison.
-
-        Notes
-        -----
-        If a value in target is not present, it gets coded as -1.
-        """
-
-        if isinstance(target, Categorical):
-            # Indexing on codes is more efficient if categories are the same,
-            #  so we can apply some optimizations based on the degree of
-            #  dtype-matching.
-            cat = self._encode_with_my_categories(target)
-            codes = cat._codes
-        else:
-            codes = self.categories.get_indexer(target)
-
-        return codes
-
     def _unbox_scalar(self, key) -> int:
         # searchsorted is very performance sensitive. By converting codes
         # to same dtype as self.codes, we get much faster performance.
diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py
index 3f04339803bf6..64037f5757a38 100644
--- a/pandas/core/groupby/categorical.py
+++ b/pandas/core/groupby/categorical.py
@@ -48,6 +48,9 @@ def recode_for_groupby(
     """
     # we only care about observed values
     if observed:
+        # In cases with c.ordered, this is equivalent to
+        #  return c.remove_unused_categories(), c
+
         unique_codes = unique1d(c.codes)
 
         take_codes = unique_codes[unique_codes != -1]
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 859c26a40e50d..24bd60a7356dd 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -8,7 +8,7 @@
 from pandas._libs import index as libindex
 from pandas._libs.hashtable import duplicated_int64
 from pandas._libs.lib import no_default
-from pandas._typing import Label
+from pandas._typing import ArrayLike, Label
 from pandas.util._decorators import Appender, cache_readonly, doc
 
 from pandas.core.dtypes.common import (
@@ -542,7 +542,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 "method='nearest' not implemented yet for CategoricalIndex"
             )
 
-        codes = self._values._validate_listlike(target._values)
+        # Note: we use engine.get_indexer_non_unique below because, even if
+        #  `target` is unique, any non-category entries in it will be encoded
+        #  as -1 by _get_codes_for_get_indexer, so `codes` may not be unique.
+        codes = self._get_codes_for_get_indexer(target._values)
         indexer, _ = self._engine.get_indexer_non_unique(codes)
         return ensure_platform_int(indexer)
 
@@ -550,10 +553,30 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
     def get_indexer_non_unique(self, target):
         target = ibase.ensure_index(target)
 
-        codes = self._values._validate_listlike(target._values)
+        codes = self._get_codes_for_get_indexer(target._values)
         indexer, missing = self._engine.get_indexer_non_unique(codes)
         return ensure_platform_int(indexer), missing
 
+    def _get_codes_for_get_indexer(self, target: ArrayLike) -> np.ndarray:
+        """
+        Extract integer codes we can use for comparison.
+
+        Notes
+        -----
+        If a value in target is not present, it gets coded as -1.
+        """
+
+        if isinstance(target, Categorical):
+            # Indexing on codes is more efficient if categories are the same,
+            #  so we can apply some optimizations based on the degree of
+            #  dtype-matching.
+            cat = self._data._encode_with_my_categories(target)
+            codes = cat._codes
+        else:
+            codes = self.categories.get_indexer(target)
+
+        return codes
+
     @doc(Index._convert_list_indexer)
     def _convert_list_indexer(self, keyarr):
         # Return our indexer or raise if all of the values are not included in

From 30d4cf88032fad5112728b621294c6a8af68344c Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 9 Nov 2020 13:13:18 -0800
Subject: [PATCH 5/5] CLN: remove unbox_listlike

---
 pandas/core/arrays/categorical.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 4f8e1b5c2abbf..9f011bc9d2651 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1687,10 +1687,6 @@ def _unbox_scalar(self, key) -> int:
         code = self._codes.dtype.type(code)
         return code
 
-    def _unbox_listlike(self, value):
-        unboxed = self.categories.get_indexer(value)
-        return unboxed.astype(self._ndarray.dtype, copy=False)
-
     # ------------------------------------------------------------------
 
     def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
@@ -1864,7 +1860,8 @@ def _validate_setitem_value(self, value):
                 "category, set the categories first"
             )
 
-        return self._unbox_listlike(rvalue)
+        codes = self.categories.get_indexer(rvalue)
+        return codes.astype(self._ndarray.dtype, copy=False)
 
     def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
         """