Skip to content

Commit 86b42d0

Browse files
committed
BUG: Categorical.searchsorted(): use provided categorical order
Previously, it used lexical order instead of the provided categorical order. Tests updated accordingly. Closes #14522
1 parent f1cfe5b commit 86b42d0

File tree

3 files changed

+54
-46
lines changed

3 files changed

+54
-46
lines changed

doc/source/whatsnew/v0.19.2.txt

+2
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,5 @@ Bug Fixes
9696
- Bug in ``.plot(kind='kde')`` which did not drop missing values to generate the KDE Plot, instead generating an empty plot. (:issue:`14821`)
9797

9898
- Bug in ``unstack()`` where, if called with a list of column(s) as an argument, all columns were coerced to ``object`` regardless of their dtypes (:issue:`11847`)
99+
100+
- Bug in ``Categorical.searchsorted()`` where alphabetical order was used instead of the provided categorical order (:issue:`14522`)

pandas/core/categorical.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -1086,10 +1086,15 @@ def searchsorted(self, value, side='left', sorter=None):
10861086
"ordered one")
10871087

10881088
from pandas.core.series import Series
1089-
values_as_codes = self.categories.values.searchsorted(
1090-
Series(value).values, side=side)
10911089

1092-
return self.codes.searchsorted(values_as_codes, sorter=sorter)
1090+
values_as_codes = _get_codes_for_values(Series(value).values,
1091+
self.categories)
1092+
1093+
if -1 in values_as_codes:
1094+
raise ValueError("Value(s) to be inserted must be in categories.")
1095+
1096+
return self.codes.searchsorted(values_as_codes, side=side,
1097+
sorter=sorter)
10931098

10941099
def isnull(self):
10951100
"""

pandas/tests/test_categorical.py

+44-43
Original file line numberDiff line numberDiff line change
@@ -1548,54 +1548,55 @@ def test_memory_usage(self):
15481548

15491549
def test_searchsorted(self):
15501550
# https://github.com/pandas-dev/pandas/issues/8420
1551-
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk'])
1552-
s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts'])
1553-
c1 = pd.Categorical(s1, ordered=True)
1554-
c2 = pd.Categorical(s2, ordered=True)
1555-
1556-
# Single item array
1557-
res = c1.searchsorted(['bread'])
1558-
chk = s1.searchsorted(['bread'])
1559-
exp = np.array([1], dtype=np.intp)
1560-
self.assert_numpy_array_equal(res, exp)
1561-
self.assert_numpy_array_equal(res, chk)
1562-
1563-
# Scalar version of single item array
1564-
# Categorical return np.array like pd.Series, but different from
1565-
# np.array.searchsorted()
1566-
res = c1.searchsorted('bread')
1567-
chk = s1.searchsorted('bread')
1568-
exp = np.array([1], dtype=np.intp)
1569-
self.assert_numpy_array_equal(res, exp)
1570-
self.assert_numpy_array_equal(res, chk)
1551+
# https://github.com/pandas-dev/pandas/issues/14522
1552+
1553+
c1 = pd.Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
1554+
categories=['cheese', 'milk', 'apple', 'bread'],
1555+
ordered=True)
1556+
s1 = pd.Series(c1)
1557+
c2 = pd.Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
1558+
categories=['cheese', 'milk', 'apple', 'bread'],
1559+
ordered=False)
1560+
s2 = pd.Series(c2)
1561+
1562+
# Searching for single item argument, side='left' (default)
1563+
res_cat = c1.searchsorted('apple')
1564+
res_ser = s1.searchsorted('apple')
1565+
exp = np.array([2], dtype=np.intp)
1566+
self.assert_numpy_array_equal(res_cat, exp)
1567+
self.assert_numpy_array_equal(res_ser, exp)
1568+
1569+
# Searching for single item array, side='left' (default)
1570+
res_cat = c1.searchsorted(['bread'])
1571+
res_ser = s1.searchsorted(['bread'])
1572+
exp = np.array([3], dtype=np.intp)
1573+
self.assert_numpy_array_equal(res_cat, exp)
1574+
self.assert_numpy_array_equal(res_ser, exp)
1575+
1576+
# Searching for several items array, side='right'
1577+
res_cat = c1.searchsorted(['apple', 'bread'], side='right')
1578+
res_ser = s1.searchsorted(['apple', 'bread'], side='right')
1579+
exp = np.array([3, 5], dtype=np.intp)
1580+
self.assert_numpy_array_equal(res_cat, exp)
1581+
self.assert_numpy_array_equal(res_ser, exp)
15711582

1572-
# Searching for a value that is not present in the Categorical
1573-
res = c1.searchsorted(['bread', 'eggs'])
1574-
chk = s1.searchsorted(['bread', 'eggs'])
1575-
exp = np.array([1, 4], dtype=np.intp)
1576-
self.assert_numpy_array_equal(res, exp)
1577-
self.assert_numpy_array_equal(res, chk)
1583+
# Searching for a single value that is not from the Categorical
1584+
self.assertRaises(ValueError, lambda: c1.searchsorted('cucumber'))
1585+
self.assertRaises(ValueError, lambda: s1.searchsorted('cucumber'))
15781586

1579-
# Searching for a value that is not present, to the right
1580-
res = c1.searchsorted(['bread', 'eggs'], side='right')
1581-
chk = s1.searchsorted(['bread', 'eggs'], side='right')
1582-
exp = np.array([3, 4], dtype=np.intp) # eggs before milk
1583-
self.assert_numpy_array_equal(res, exp)
1584-
self.assert_numpy_array_equal(res, chk)
1585-
1586-
# As above, but with a sorter array to reorder an unsorted array
1587-
res = c2.searchsorted(['bread', 'eggs'], side='right',
1588-
sorter=[0, 1, 2, 3, 5, 4])
1589-
chk = s2.searchsorted(['bread', 'eggs'], side='right',
1590-
sorter=[0, 1, 2, 3, 5, 4])
1591-
# eggs after donuts, after switching milk and donuts
1592-
exp = np.array([3, 5], dtype=np.intp)
1593-
self.assert_numpy_array_equal(res, exp)
1594-
self.assert_numpy_array_equal(res, chk)
1587+
# Searching for multiple values, one of which is not in the Categorical
1588+
self.assertRaises(ValueError,
1589+
lambda: c1.searchsorted(['bread', 'cucumber']))
1590+
self.assertRaises(ValueError,
1591+
lambda: s1.searchsorted(['bread', 'cucumber']))
1592+
1593+
# searchsorted call for unordered Categorical
1594+
self.assertRaises(ValueError, lambda: c2.searchsorted('apple'))
1595+
self.assertRaises(ValueError, lambda: s2.searchsorted('apple'))
15951596

15961597
with tm.assert_produces_warning(FutureWarning):
15971598
res = c1.searchsorted(v=['bread'])
1598-
exp = np.array([1], dtype=np.intp)
1599+
exp = np.array([3], dtype=np.intp)
15991600
tm.assert_numpy_array_equal(res, exp)
16001601

16011602
def test_deprecated_labels(self):

0 commit comments

Comments
 (0)