From 0d7a41ba7820ebbe130f5753963376e844cbf54b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 5 Nov 2020 07:16:05 -0800 Subject: [PATCH 1/5] REF: implement Categorical.encode_with_my_categories --- pandas/core/arrays/categorical.py | 30 +++++++++++++++++++++++------- pandas/core/dtypes/concat.py | 2 +- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 626fb495dec03..e5a5718d96cbe 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1694,9 +1694,8 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray: # Indexing on codes is more efficient if categories are the same, # so we can apply some optimizations based on the degree of # dtype-matching. - codes = recode_for_categories( - target.codes, target.categories, self.categories, copy=False - ) + cat = self.encode_with_my_categories(target) + codes = cat._codes else: codes = self.categories.get_indexer(target) @@ -1868,8 +1867,8 @@ def _validate_setitem_value(self, value): "without identical categories" ) # is_dtype_equal implies categories_match_up_to_permutation - new_codes = self._validate_listlike(value) - value = Categorical.from_codes(new_codes, dtype=self.dtype) + value = self.encode_with_my_categories(value) + return value._codes # wrap scalars and hashable-listlikes in list rvalue = value if not is_hashable(value) else [value] @@ -2101,8 +2100,8 @@ def equals(self, other: object) -> bool: if not isinstance(other, Categorical): return False elif self._categories_match_up_to_permutation(other): - other_codes = self._validate_listlike(other) - return np.array_equal(self._codes, other_codes) + other = self.encode_with_my_categories(other) + return np.array_equal(self._codes, other._codes) return False @classmethod @@ -2113,6 +2112,23 @@ def _concat_same_type(self, to_concat): # ------------------------------------------------------------------ + def encode_with_my_categories(self, other: "Categorical") -> "Categorical": + """ + Re-encode another categorical using this Categorical's categories. + + Notes + ----- + This assumes we have already checked + self._categories_match_up_to_permutation(other). + """ + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + codes = recode_for_categories( + other.codes, other.categories, self.categories, copy=False + ) + return self._from_backing_data(codes) + def _categories_match_up_to_permutation(self, other: "Categorical") -> bool: """ Returns True if categoricals are the same dtype diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 99dc01ef421d1..11f8ed342fe2c 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -301,7 +301,7 @@ def _maybe_unwrap(x): categories = first.categories ordered = first.ordered - all_codes = [first._validate_listlike(x) for x in to_union] + all_codes = [first.encode_with_my_categories(x)._codes for x in to_union] new_codes = np.concatenate(all_codes) if sort_categories and not ignore_order and ordered: From 48940d47b896df8df757db1a60f6789f895a3980 Mon Sep 17 00:00:00 2001 From: Micael Jarniac Date: Thu, 5 Nov 2020 09:35:46 -0300 Subject: [PATCH 2/5] DOC: Fix typo (#37636) "columns(s)" sounded odd, I believe it was supposed to be just "column(s)". --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9d223ba2bab0c..049d2c4888a69 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6449,7 +6449,7 @@ def update( See Also -------- dict.update : Similar method for dictionaries. - DataFrame.merge : For column(s)-on-columns(s) operations. + DataFrame.merge : For column(s)-on-column(s) operations. Examples -------- @@ -7985,7 +7985,7 @@ def join( See Also -------- - DataFrame.merge : For column(s)-on-columns(s) operations. + DataFrame.merge : For column(s)-on-column(s) operations. Notes ----- From 16de2a01049a461856a169dcab24fd42009eff23 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 8 Nov 2020 18:50:42 -0800 Subject: [PATCH 3/5] Use _encode_with_my_categories instead of _validate_listlike --- pandas/core/reshape/merge.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index aa883d518f8d1..436a622aa4ff0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1948,21 +1948,23 @@ def _factorize_keys( rk, _ = rk._values_for_factorize() elif ( - is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk) + is_categorical_dtype(lk.dtype) + and is_categorical_dtype(rk.dtype) + and is_dtype_equal(lk.dtype, rk.dtype) ): assert isinstance(lk, Categorical) assert isinstance(rk, Categorical) # Cast rk to encoding so we can compare codes with lk - rk = lk._validate_listlike(rk) + rk = lk._encode_with_my_categories(rk) lk = ensure_int64(lk.codes) - rk = ensure_int64(rk) + rk = ensure_int64(rk.codes) elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): lk, _ = lk._values_for_factorize() rk, _ = rk._values_for_factorize() - if is_integer_dtype(lk) and is_integer_dtype(rk): + if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype klass = libhashtable.Int64Factorizer From 12e397904f925ee7ed3fac233651792b2fa60c72 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 9 Nov 2020 13:07:26 -0800 Subject: [PATCH 4/5] REF: Categorical._validate_listlike -> CategoricalIndex._get_codes_for_get_indexer --- pandas/core/arrays/categorical.py | 20 -------------------- pandas/core/groupby/categorical.py | 3 +++ pandas/core/indexes/category.py | 29 ++++++++++++++++++++++++++--- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 87a049c77dc32..4f8e1b5c2abbf 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1680,26 +1680,6 @@ def _box_func(self, i: int): return np.NaN return self.categories[i] - def _validate_listlike(self, target: ArrayLike) -> np.ndarray: - """ - Extract integer codes we can use for comparison. - - Notes - ----- - If a value in target is not present, it gets coded as -1. - """ - - if isinstance(target, Categorical): - # Indexing on codes is more efficient if categories are the same, - # so we can apply some optimizations based on the degree of - # dtype-matching. - cat = self._encode_with_my_categories(target) - codes = cat._codes - else: - codes = self.categories.get_indexer(target) - - return codes - def _unbox_scalar(self, key) -> int: # searchsorted is very performance sensitive. By converting codes # to same dtype as self.codes, we get much faster performance. diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 3f04339803bf6..64037f5757a38 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -48,6 +48,9 @@ def recode_for_groupby( """ # we only care about observed values if observed: + # In cases with c.ordered, this is equivalent to + # return c.remove_unused_categories(), c + unique_codes = unique1d(c.codes) take_codes = unique_codes[unique_codes != -1] diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 859c26a40e50d..24bd60a7356dd 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -8,7 +8,7 @@ from pandas._libs import index as libindex from pandas._libs.hashtable import duplicated_int64 from pandas._libs.lib import no_default -from pandas._typing import Label +from pandas._typing import ArrayLike, Label from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( @@ -542,7 +542,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "method='nearest' not implemented yet for CategoricalIndex" ) - codes = self._values._validate_listlike(target._values) + # Note: we use engine.get_indexer_non_unique below because, even if + # `target` is unique, any non-category entries in it will be encoded + # as -1 by _get_codes_for_get_indexer, so `codes` may not be unique. + codes = self._get_codes_for_get_indexer(target._values) indexer, _ = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer) @@ -550,10 +553,30 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) - codes = self._values._validate_listlike(target._values) + codes = self._get_codes_for_get_indexer(target._values) indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), missing + def _get_codes_for_get_indexer(self, target: ArrayLike) -> np.ndarray: + """ + Extract integer codes we can use for comparison. + + Notes + ----- + If a value in target is not present, it gets coded as -1. + """ + + if isinstance(target, Categorical): + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + cat = self._data._encode_with_my_categories(target) + codes = cat._codes + else: + codes = self.categories.get_indexer(target) + + return codes + @doc(Index._convert_list_indexer) def _convert_list_indexer(self, keyarr): # Return our indexer or raise if all of the values are not included in From 30d4cf88032fad5112728b621294c6a8af68344c Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 9 Nov 2020 13:13:18 -0800 Subject: [PATCH 5/5] CLN: remove unbox_listlike --- pandas/core/arrays/categorical.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4f8e1b5c2abbf..9f011bc9d2651 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1687,10 +1687,6 @@ def _unbox_scalar(self, key) -> int: code = self._codes.dtype.type(code) return code - def _unbox_listlike(self, value): - unboxed = self.categories.get_indexer(value) - return unboxed.astype(self._ndarray.dtype, copy=False) - # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): @@ -1864,7 +1860,8 @@ def _validate_setitem_value(self, value): "category, set the categories first" ) - return self._unbox_listlike(rvalue) + codes = self.categories.get_indexer(rvalue) + return codes.astype(self._ndarray.dtype, copy=False) def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """