Skip to content

Commit 7e0bf1c

Browse files
authored
REF: implement Categorical._validate_listlike (#36274)
1 parent 39c5e29 commit 7e0bf1c

File tree

4 files changed

+39
-53
lines changed

4 files changed

+39
-53
lines changed

pandas/core/arrays/categorical.py

+32-12
Original file line number | Diff line number | Diff line change
@@ -1716,6 +1716,35 @@ def _box_func(self, i: int):
17161716
return np.NaN
17171717
return self.categories[i]
17181718

1719+
def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
    """
    Express ``target`` as integer codes relative to ``self.categories``.

    Parameters
    ----------
    target : ArrayLike
        Either a Categorical, or any other values that
        ``self.categories.get_indexer`` accepts.

    Returns
    -------
    np.ndarray
        Codes that can be compared against this Categorical's own codes.

    Notes
    -----
    If a value in target is not present, it gets coded as -1.
    """
    own_categories = self.categories

    if not isinstance(target, Categorical):
        # Plain values: look each one up in our categories directly.
        return own_categories.get_indexer(target)

    # For a Categorical we can work on its codes instead of its values;
    # how much work is needed depends on how closely the dtypes match.
    if own_categories.equals(target.categories):
        # Identical categories: the codes already mean the same thing.
        return target.codes

    if self.is_dtype_equal(target):
        # Same categories up to a reshuffling, so only a recode is needed.
        return recode_for_categories(
            target.codes, target.categories, own_categories
        )

    # Different categories: locate each of target's categories in ours,
    # then map target's codes through that lookup, filling with -1.
    category_positions = own_categories.get_indexer(target.categories)
    return take_1d(category_positions, target.codes, fill_value=-1)
1747+
17191748
# ------------------------------------------------------------------
17201749

17211750
def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
@@ -1890,11 +1919,8 @@ def _validate_setitem_value(self, value):
18901919
"Cannot set a Categorical with another, "
18911920
"without identical categories"
18921921
)
1893-
if not self.categories.equals(value.categories):
1894-
new_codes = recode_for_categories(
1895-
value.codes, value.categories, self.categories
1896-
)
1897-
value = Categorical.from_codes(new_codes, dtype=self.dtype)
1922+
new_codes = self._validate_listlike(value)
1923+
value = Categorical.from_codes(new_codes, dtype=self.dtype)
18981924

18991925
rvalue = value if is_list_like(value) else [value]
19001926

@@ -2164,13 +2190,7 @@ def equals(self, other: object) -> bool:
21642190
if not isinstance(other, Categorical):
21652191
return False
21662192
elif self.is_dtype_equal(other):
2167-
if self.categories.equals(other.categories):
2168-
# fastpath to avoid re-coding
2169-
other_codes = other._codes
2170-
else:
2171-
other_codes = recode_for_categories(
2172-
other.codes, other.categories, self.categories
2173-
)
2193+
other_codes = self._validate_listlike(other)
21742194
return np.array_equal(self._codes, other_codes)
21752195
return False
21762196

pandas/core/dtypes/concat.py

+2-8
Original file line number | Diff line number | Diff line change
@@ -301,14 +301,8 @@ def _maybe_unwrap(x):
301301
categories = first.categories
302302
ordered = first.ordered
303303

304-
if all(first.categories.equals(other.categories) for other in to_union[1:]):
305-
new_codes = np.concatenate([c.codes for c in to_union])
306-
else:
307-
codes = [first.codes] + [
308-
recode_for_categories(other.codes, other.categories, first.categories)
309-
for other in to_union[1:]
310-
]
311-
new_codes = np.concatenate(codes)
304+
all_codes = [first._validate_listlike(x) for x in to_union]
305+
new_codes = np.concatenate(all_codes)
312306

313307
if sort_categories and not ignore_order and ordered:
314308
raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

pandas/core/indexes/category.py

+3-26
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,7 @@
2323
from pandas.core.dtypes.missing import is_valid_nat_for_dtype, notna
2424

2525
from pandas.core import accessor
26-
from pandas.core.algorithms import take_1d
27-
from pandas.core.arrays.categorical import Categorical, contains, recode_for_categories
26+
from pandas.core.arrays.categorical import Categorical, contains
2827
import pandas.core.common as com
2928
from pandas.core.construction import extract_array
3029
import pandas.core.indexes.base as ibase
@@ -558,37 +557,15 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
558557
"method='nearest' not implemented yet for CategoricalIndex"
559558
)
560559

561-
if isinstance(target, CategoricalIndex) and self._values.is_dtype_equal(target):
562-
if self._values.equals(target._values):
563-
# we have the same codes
564-
codes = target.codes
565-
else:
566-
codes = recode_for_categories(
567-
target.codes, target.categories, self._values.categories
568-
)
569-
else:
570-
if isinstance(target, CategoricalIndex):
571-
code_indexer = self.categories.get_indexer(target.categories)
572-
codes = take_1d(code_indexer, target.codes, fill_value=-1)
573-
else:
574-
codes = self.categories.get_indexer(target)
575-
560+
codes = self._values._validate_listlike(target._values)
576561
indexer, _ = self._engine.get_indexer_non_unique(codes)
577562
return ensure_platform_int(indexer)
578563

579564
@Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
580565
def get_indexer_non_unique(self, target):
581566
target = ibase.ensure_index(target)
582567

583-
if isinstance(target, CategoricalIndex):
584-
# Indexing on codes is more efficient if categories are the same:
585-
if target.categories is self.categories:
586-
target = target.codes
587-
indexer, missing = self._engine.get_indexer_non_unique(target)
588-
return ensure_platform_int(indexer), missing
589-
target = target._values
590-
591-
codes = self.categories.get_indexer(target)
568+
codes = self._values._validate_listlike(target._values)
592569
indexer, missing = self._engine.get_indexer_non_unique(codes)
593570
return ensure_platform_int(indexer), missing
594571

pandas/core/reshape/merge.py

+2-7
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,6 @@
4343
from pandas import Categorical, Index, MultiIndex
4444
from pandas.core import groupby
4545
import pandas.core.algorithms as algos
46-
from pandas.core.arrays.categorical import recode_for_categories
4746
import pandas.core.common as com
4847
from pandas.core.construction import extract_array
4948
from pandas.core.frame import _merge_doc
@@ -1936,12 +1935,8 @@ def _factorize_keys(
19361935
):
19371936
assert isinstance(lk, Categorical)
19381937
assert isinstance(rk, Categorical)
1939-
if lk.categories.equals(rk.categories):
1940-
# if we exactly match in categories, allow us to factorize on codes
1941-
rk = rk.codes
1942-
else:
1943-
# Same categories in different orders -> recode
1944-
rk = recode_for_categories(rk.codes, rk.categories, lk.categories)
1938+
# Cast rk to encoding so we can compare codes with lk
1939+
rk = lk._validate_listlike(rk)
19451940

19461941
lk = ensure_int64(lk.codes)
19471942
rk = ensure_int64(rk)

0 commit comments

Comments
 (0)