Skip to content

Commit e534b10

Browse files
authored
REF: remove Categorical._validate_listlike, unbox_listlike (#37724)
1 parent aa390ec commit e534b10

File tree

4 files changed

+37
-32
lines changed

4 files changed

+37
-32
lines changed

pandas/core/arrays/categorical.py

+2-25
Original file line numberDiff line numberDiff line change
@@ -1680,37 +1680,13 @@ def _box_func(self, i: int):
16801680
return np.NaN
16811681
return self.categories[i]
16821682

1683-
def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
1684-
"""
1685-
Extract integer codes we can use for comparison.
1686-
1687-
Notes
1688-
-----
1689-
If a value in target is not present, it gets coded as -1.
1690-
"""
1691-
1692-
if isinstance(target, Categorical):
1693-
# Indexing on codes is more efficient if categories are the same,
1694-
# so we can apply some optimizations based on the degree of
1695-
# dtype-matching.
1696-
cat = self._encode_with_my_categories(target)
1697-
codes = cat._codes
1698-
else:
1699-
codes = self.categories.get_indexer(target)
1700-
1701-
return codes
1702-
17031683
def _unbox_scalar(self, key) -> int:
17041684
# searchsorted is very performance sensitive. By converting codes
17051685
# to the same dtype as self.codes, we get much faster performance.
17061686
code = self.categories.get_loc(key)
17071687
code = self._codes.dtype.type(code)
17081688
return code
17091689

1710-
def _unbox_listlike(self, value):
1711-
unboxed = self.categories.get_indexer(value)
1712-
return unboxed.astype(self._ndarray.dtype, copy=False)
1713-
17141690
# ------------------------------------------------------------------
17151691

17161692
def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
@@ -1884,7 +1860,8 @@ def _validate_setitem_value(self, value):
18841860
"category, set the categories first"
18851861
)
18861862

1887-
return self._unbox_listlike(rvalue)
1863+
codes = self.categories.get_indexer(rvalue)
1864+
return codes.astype(self._ndarray.dtype, copy=False)
18881865

18891866
def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
18901867
"""

pandas/core/groupby/categorical.py

+3
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ def recode_for_groupby(
4848
"""
4949
# we only care about observed values
5050
if observed:
51+
# In cases with c.ordered, this is equivalent to
52+
# return c.remove_unused_categories(), c
53+
5154
unique_codes = unique1d(c.codes)
5255

5356
take_codes = unique_codes[unique_codes != -1]

pandas/core/indexes/category.py

+26-3
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pandas._libs import index as libindex
99
from pandas._libs.hashtable import duplicated_int64
1010
from pandas._libs.lib import no_default
11-
from pandas._typing import Label
11+
from pandas._typing import ArrayLike, Label
1212
from pandas.util._decorators import Appender, cache_readonly, doc
1313

1414
from pandas.core.dtypes.common import (
@@ -542,18 +542,41 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
542542
"method='nearest' not implemented yet for CategoricalIndex"
543543
)
544544

545-
codes = self._values._validate_listlike(target._values)
545+
# Note: we use engine.get_indexer_non_unique below because, even if
546+
# `target` is unique, any non-category entries in it will be encoded
547+
# as -1 by _get_codes_for_get_indexer, so `codes` may not be unique.
548+
codes = self._get_codes_for_get_indexer(target._values)
546549
indexer, _ = self._engine.get_indexer_non_unique(codes)
547550
return ensure_platform_int(indexer)
548551

549552
@Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
550553
def get_indexer_non_unique(self, target):
551554
target = ibase.ensure_index(target)
552555

553-
codes = self._values._validate_listlike(target._values)
556+
codes = self._get_codes_for_get_indexer(target._values)
554557
indexer, missing = self._engine.get_indexer_non_unique(codes)
555558
return ensure_platform_int(indexer), missing
556559

560+
def _get_codes_for_get_indexer(self, target: ArrayLike) -> np.ndarray:
561+
"""
562+
Extract integer codes we can use for comparison.
563+
564+
Notes
565+
-----
566+
If a value in target is not present, it gets coded as -1.
567+
"""
568+
569+
if isinstance(target, Categorical):
570+
# Indexing on codes is more efficient if categories are the same,
571+
# so we can apply some optimizations based on the degree of
572+
# dtype-matching.
573+
cat = self._data._encode_with_my_categories(target)
574+
codes = cat._codes
575+
else:
576+
codes = self.categories.get_indexer(target)
577+
578+
return codes
579+
557580
@doc(Index._convert_list_indexer)
558581
def _convert_list_indexer(self, keyarr):
559582
# Return our indexer or raise if all of the values are not included in

pandas/core/reshape/merge.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -1951,21 +1951,23 @@ def _factorize_keys(
19511951
rk, _ = rk._values_for_factorize()
19521952

19531953
elif (
1954-
is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk)
1954+
is_categorical_dtype(lk.dtype)
1955+
and is_categorical_dtype(rk.dtype)
1956+
and is_dtype_equal(lk.dtype, rk.dtype)
19551957
):
19561958
assert isinstance(lk, Categorical)
19571959
assert isinstance(rk, Categorical)
19581960
# Cast rk to encoding so we can compare codes with lk
1959-
rk = lk._validate_listlike(rk)
1961+
rk = lk._encode_with_my_categories(rk)
19601962

19611963
lk = ensure_int64(lk.codes)
1962-
rk = ensure_int64(rk)
1964+
rk = ensure_int64(rk.codes)
19631965

19641966
elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype):
19651967
lk, _ = lk._values_for_factorize()
19661968
rk, _ = rk._values_for_factorize()
19671969

1968-
if is_integer_dtype(lk) and is_integer_dtype(rk):
1970+
if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
19691971
# GH#23917 TODO: needs tests for case where lk is integer-dtype
19701972
# and rk is datetime-dtype
19711973
klass = libhashtable.Int64Factorizer

0 commit comments

Comments
 (0)