Skip to content

Commit a45c7b0

Browse files
committed
API/PERF: Categorical.searchsorted faster and returns scalar
1 parent 4f71755 commit a45c7b0

File tree

4 files changed

+26
-15
lines changed

4 files changed

+26
-15
lines changed

doc/source/whatsnew/v0.24.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -937,6 +937,8 @@ Other API Changes
937937
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
938938
- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
939939
- Comparing :class:`Timedelta` to be less or greater than unknown types now raises a ``TypeError`` instead of returning ``False`` (:issue:`20829`)
940+
- :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`).
941+
- :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather than a ``ValueError`` if a searched-for key is not found in its categories (:issue:`23466`).
940942
- :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`).
941943

942944
.. _whatsnew_0240.deprecations:
@@ -1052,6 +1054,7 @@ Performance Improvements
10521054
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
10531055
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
10541056
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)
1057+
- Improved performance of :meth:`Categorical.searchsorted` (:issue:`23466`)
10551058
- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex`
10561059
(i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains`
10571060
is likewise much faster (:issue:`21369`, :issue:`21508`)

pandas/core/arrays/categorical.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -1335,6 +1335,16 @@ def memory_usage(self, deep=False):
13351335
return self._codes.nbytes + self.dtype.categories.memory_usage(
13361336
deep=deep)
13371337

1338+
def _ensure_codes_dtype(self, code):
1339+
"""
1340+
Ensure that ``code`` has the same dtype as self.codes.
1341+
"""
1342+
dtype = self.codes.dtype
1343+
if is_scalar(code):
1344+
return dtype.type(code)
1345+
else:
1346+
return np.array(code, dtype=dtype)
1347+
13381348
@Substitution(klass='Categorical')
13391349
@Appender(_shared_docs['searchsorted'])
13401350
def searchsorted(self, value, side='left', sorter=None):
@@ -1343,16 +1353,13 @@ def searchsorted(self, value, side='left', sorter=None):
13431353
".as_ordered() to change the Categorical to an "
13441354
"ordered one")
13451355

1346-
from pandas.core.series import Series
1347-
1348-
values_as_codes = _get_codes_for_values(Series(value).values,
1349-
self.categories)
1350-
1351-
if -1 in values_as_codes:
1352-
raise ValueError("Value(s) to be inserted must be in categories.")
1356+
if is_scalar(value):
1357+
codes = self.categories.get_loc(value)
1358+
else:
1359+
codes = [self.categories.get_loc(val) for val in value]
1360+
codes = self._ensure_codes_dtype(codes)
13531361

1354-
return self.codes.searchsorted(values_as_codes, side=side,
1355-
sorter=sorter)
1362+
return self.codes.searchsorted(codes, side=side, sorter=sorter)
13561363

13571364
def isna(self):
13581365
"""

pandas/core/indexes/category.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,7 @@ def get_loc(self, key, method=None):
460460
array([False, True, False, True], dtype=bool)
461461
"""
462462
code = self.categories.get_loc(key)
463-
code = self.codes.dtype.type(code)
463+
code = self.values._ensure_codes_dtype(code)
464464
try:
465465
return self._engine.get_loc(code)
466466
except KeyError:

pandas/tests/arrays/categorical/test_analytics.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,10 @@ def test_searchsorted(self):
8585

8686
# Searching for single item argument, side='left' (default)
8787
res_cat = c1.searchsorted('apple')
88+
assert res_cat == 2
89+
8890
res_ser = s1.searchsorted('apple')
8991
exp = np.array([2], dtype=np.intp)
90-
tm.assert_numpy_array_equal(res_cat, exp)
9192
tm.assert_numpy_array_equal(res_ser, exp)
9293

9394
# Searching for single item array, side='left' (default)
@@ -105,13 +106,13 @@ def test_searchsorted(self):
105106
tm.assert_numpy_array_equal(res_ser, exp)
106107

107108
# Searching for a single value that is not from the Categorical
108-
pytest.raises(ValueError, lambda: c1.searchsorted('cucumber'))
109-
pytest.raises(ValueError, lambda: s1.searchsorted('cucumber'))
109+
pytest.raises(KeyError, lambda: c1.searchsorted('cucumber'))
110+
pytest.raises(KeyError, lambda: s1.searchsorted('cucumber'))
110111

111112
# Searching for multiple values, one of which is not from the Categorical
112-
pytest.raises(ValueError,
113+
pytest.raises(KeyError,
113114
lambda: c1.searchsorted(['bread', 'cucumber']))
114-
pytest.raises(ValueError,
115+
pytest.raises(KeyError,
115116
lambda: s1.searchsorted(['bread', 'cucumber']))
116117

117118
# searchsorted call for unordered Categorical

0 commit comments

Comments
 (0)