Skip to content

Commit 0252385

Browse files
nathalier authored and jreback committed
BUG: Categorical.searchsorted(): use provided categorical order
closes #14522

Previously, it used lexical order instead of the provided categorical order.

Author: Nathalie Rud <[email protected]>

Closes #14697 from nathalier/gh-14522 and squashes the following commits:

86b42d0 [Nathalie Rud] BUG: Categorical.searchsorted(): use provided categorical order
1 parent 7dd451d commit 0252385

File tree

3 files changed

+53
-46
lines changed

3 files changed

+53
-46
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ Bug Fixes
319319

320320
- Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising an ``IndexError`` (:issue:`14998`)
321321

322+
- Bug in ``Categorical.searchsorted()`` where alphabetical order was used instead of the provided categorical order (:issue:`14522`)
322323

323324

324325

pandas/core/categorical.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -1086,10 +1086,15 @@ def searchsorted(self, value, side='left', sorter=None):
10861086
"ordered one")
10871087

10881088
from pandas.core.series import Series
1089-
values_as_codes = self.categories.values.searchsorted(
1090-
Series(value).values, side=side)
10911089

1092-
return self.codes.searchsorted(values_as_codes, sorter=sorter)
1090+
values_as_codes = _get_codes_for_values(Series(value).values,
1091+
self.categories)
1092+
1093+
if -1 in values_as_codes:
1094+
raise ValueError("Value(s) to be inserted must be in categories.")
1095+
1096+
return self.codes.searchsorted(values_as_codes, side=side,
1097+
sorter=sorter)
10931098

10941099
def isnull(self):
10951100
"""

pandas/tests/test_categorical.py

+44-43
Original file line numberDiff line numberDiff line change
@@ -1569,54 +1569,55 @@ def test_memory_usage(self):
15691569

15701570
def test_searchsorted(self):
15711571
# https://github.com/pandas-dev/pandas/issues/8420
1572-
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk'])
1573-
s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts'])
1574-
c1 = pd.Categorical(s1, ordered=True)
1575-
c2 = pd.Categorical(s2, ordered=True)
1576-
1577-
# Single item array
1578-
res = c1.searchsorted(['bread'])
1579-
chk = s1.searchsorted(['bread'])
1580-
exp = np.array([1], dtype=np.intp)
1581-
self.assert_numpy_array_equal(res, exp)
1582-
self.assert_numpy_array_equal(res, chk)
1583-
1584-
# Scalar version of single item array
1585-
# Categorical return np.array like pd.Series, but different from
1586-
# np.array.searchsorted()
1587-
res = c1.searchsorted('bread')
1588-
chk = s1.searchsorted('bread')
1589-
exp = np.array([1], dtype=np.intp)
1590-
self.assert_numpy_array_equal(res, exp)
1591-
self.assert_numpy_array_equal(res, chk)
1572+
# https://github.com/pandas-dev/pandas/issues/14522
1573+
1574+
c1 = pd.Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
1575+
categories=['cheese', 'milk', 'apple', 'bread'],
1576+
ordered=True)
1577+
s1 = pd.Series(c1)
1578+
c2 = pd.Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
1579+
categories=['cheese', 'milk', 'apple', 'bread'],
1580+
ordered=False)
1581+
s2 = pd.Series(c2)
1582+
1583+
# Searching for single item argument, side='left' (default)
1584+
res_cat = c1.searchsorted('apple')
1585+
res_ser = s1.searchsorted('apple')
1586+
exp = np.array([2], dtype=np.intp)
1587+
self.assert_numpy_array_equal(res_cat, exp)
1588+
self.assert_numpy_array_equal(res_ser, exp)
1589+
1590+
# Searching for single item array, side='left' (default)
1591+
res_cat = c1.searchsorted(['bread'])
1592+
res_ser = s1.searchsorted(['bread'])
1593+
exp = np.array([3], dtype=np.intp)
1594+
self.assert_numpy_array_equal(res_cat, exp)
1595+
self.assert_numpy_array_equal(res_ser, exp)
1596+
1597+
# Searching for several items array, side='right'
1598+
res_cat = c1.searchsorted(['apple', 'bread'], side='right')
1599+
res_ser = s1.searchsorted(['apple', 'bread'], side='right')
1600+
exp = np.array([3, 5], dtype=np.intp)
1601+
self.assert_numpy_array_equal(res_cat, exp)
1602+
self.assert_numpy_array_equal(res_ser, exp)
15921603

1593-
# Searching for a value that is not present in the Categorical
1594-
res = c1.searchsorted(['bread', 'eggs'])
1595-
chk = s1.searchsorted(['bread', 'eggs'])
1596-
exp = np.array([1, 4], dtype=np.intp)
1597-
self.assert_numpy_array_equal(res, exp)
1598-
self.assert_numpy_array_equal(res, chk)
1604+
# Searching for a single value that is not from the Categorical
1605+
self.assertRaises(ValueError, lambda: c1.searchsorted('cucumber'))
1606+
self.assertRaises(ValueError, lambda: s1.searchsorted('cucumber'))
15991607

1600-
# Searching for a value that is not present, to the right
1601-
res = c1.searchsorted(['bread', 'eggs'], side='right')
1602-
chk = s1.searchsorted(['bread', 'eggs'], side='right')
1603-
exp = np.array([3, 4], dtype=np.intp) # eggs before milk
1604-
self.assert_numpy_array_equal(res, exp)
1605-
self.assert_numpy_array_equal(res, chk)
1606-
1607-
# As above, but with a sorter array to reorder an unsorted array
1608-
res = c2.searchsorted(['bread', 'eggs'], side='right',
1609-
sorter=[0, 1, 2, 3, 5, 4])
1610-
chk = s2.searchsorted(['bread', 'eggs'], side='right',
1611-
sorter=[0, 1, 2, 3, 5, 4])
1612-
# eggs after donuts, after switching milk and donuts
1613-
exp = np.array([3, 5], dtype=np.intp)
1614-
self.assert_numpy_array_equal(res, exp)
1615-
self.assert_numpy_array_equal(res, chk)
1608+
# Searching for multiple values, one of which is not in the Categorical
1609+
self.assertRaises(ValueError,
1610+
lambda: c1.searchsorted(['bread', 'cucumber']))
1611+
self.assertRaises(ValueError,
1612+
lambda: s1.searchsorted(['bread', 'cucumber']))
1613+
1614+
# searchsorted call for unordered Categorical
1615+
self.assertRaises(ValueError, lambda: c2.searchsorted('apple'))
1616+
self.assertRaises(ValueError, lambda: s2.searchsorted('apple'))
16161617

16171618
with tm.assert_produces_warning(FutureWarning):
16181619
res = c1.searchsorted(v=['bread'])
1619-
exp = np.array([1], dtype=np.intp)
1620+
exp = np.array([3], dtype=np.intp)
16201621
tm.assert_numpy_array_equal(res, exp)
16211622

16221623
def test_deprecated_labels(self):

0 commit comments

Comments
 (0)