Skip to content

CLN: de-duplicate recode_for_categories #37548

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 2, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 23 additions & 20 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ def func(self, other):

if not self.ordered and not self.categories.equals(other.categories):
# both unordered and different order
other_codes = _get_codes_for_values(other, self.categories)
other_codes = recode_for_categories(
other.codes, other.categories, self.categories, copy=False
)
else:
other_codes = other._codes

Expand Down Expand Up @@ -354,9 +356,7 @@ def __init__(
dtype = CategoricalDtype(categories, dtype.ordered)

elif is_categorical_dtype(values.dtype):
old_codes = (
values._values.codes if isinstance(values, ABCSeries) else values.codes
)
old_codes = extract_array(values).codes
codes = recode_for_categories(
old_codes, values.dtype.categories, dtype.categories
)
Expand Down Expand Up @@ -1706,17 +1706,9 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
# Indexing on codes is more efficient if categories are the same,
# so we can apply some optimizations based on the degree of
# dtype-matching.
if self.categories.equals(target.categories):
# We use the same codes, so can go directly to the engine
codes = target.codes
elif self.is_dtype_equal(target):
# We have the same categories up to a reshuffling of codes.
codes = recode_for_categories(
target.codes, target.categories, self.categories
)
else:
code_indexer = self.categories.get_indexer(target.categories)
codes = take_1d(code_indexer, target.codes, fill_value=-1)
codes = recode_for_categories(
target.codes, target.categories, self.categories, copy=False
)
else:
codes = self.categories.get_indexer(target)

Expand Down Expand Up @@ -2472,9 +2464,11 @@ def _delegate_method(self, name, *args, **kwargs):
# utility routines


def _get_codes_for_values(values, categories):
def _get_codes_for_values(values, categories) -> np.ndarray:
"""
utility routine to turn values into codes given the specified categories

If `values` is known to be a Categorical, use recode_for_categories instead.
"""
dtype_equal = is_dtype_equal(values.dtype, categories.dtype)

Expand Down Expand Up @@ -2504,14 +2498,18 @@ def _get_codes_for_values(values, categories):
return coerce_indexer_dtype(t.lookup(vals), cats)


def recode_for_categories(codes: np.ndarray, old_categories, new_categories):
def recode_for_categories(
codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
"""
Convert a set of codes to a new set of categories

Parameters
----------
codes : np.ndarray
old_categories, new_categories : Index
copy : bool, default True
Whether to copy if the codes are unchanged.

Returns
-------
Expand All @@ -2527,14 +2525,19 @@ def recode_for_categories(codes: np.ndarray, old_categories, new_categories):
"""
if len(old_categories) == 0:
# All null anyway, so just retain the nulls
return codes.copy()
if copy:
return codes.copy()
return codes
elif new_categories.equals(old_categories):
# Same categories, so no need to actually recode
return codes.copy()
if copy:
return codes.copy()
return codes

indexer = coerce_indexer_dtype(
new_categories.get_indexer(old_categories), new_categories
)
new_codes = take_1d(indexer, codes.copy(), fill_value=-1)
new_codes = take_1d(indexer, codes, fill_value=-1)
return new_codes


Expand Down