diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index ea59f3fbf493a..6fe1d5b8fc8af 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -96,3 +96,5 @@ Bug Fixes - Bug in ``.plot(kind='kde')`` which did not drop missing values to generate the KDE Plot, instead generating an empty plot. (:issue:`14821`) - Bug in ``unstack()`` if called with a list of column(s) as an argument, regardless of the dtypes of all columns, they get coerced to ``object`` (:issue:`11847`) + +- Bug in ``Categorical.searchsorted()`` where alphabetical order was used instead of the provided categorical order (:issue:`14522`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 5124dc44e2fc8..5d88812801b7b 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1086,10 +1086,15 @@ def searchsorted(self, value, side='left', sorter=None): "ordered one") from pandas.core.series import Series - values_as_codes = self.categories.values.searchsorted( - Series(value).values, side=side) - return self.codes.searchsorted(values_as_codes, sorter=sorter) + values_as_codes = _get_codes_for_values(Series(value).values, + self.categories) + + if -1 in values_as_codes: + raise ValueError("Value(s) to be inserted must be in categories.") + + return self.codes.searchsorted(values_as_codes, side=side, + sorter=sorter) def isnull(self): """ diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 5d2c317cc0f81..edec6db8ecbbb 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1548,54 +1548,55 @@ def test_memory_usage(self): def test_searchsorted(self): # https://github.com/pandas-dev/pandas/issues/8420 - s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk']) - s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts']) - c1 = pd.Categorical(s1, ordered=True) - c2 = pd.Categorical(s2, ordered=True) - - # Single item array - res = 
c1.searchsorted(['bread']) - chk = s1.searchsorted(['bread']) - exp = np.array([1], dtype=np.intp) - self.assert_numpy_array_equal(res, exp) - self.assert_numpy_array_equal(res, chk) - - # Scalar version of single item array - # Categorical return np.array like pd.Series, but different from - # np.array.searchsorted() - res = c1.searchsorted('bread') - chk = s1.searchsorted('bread') - exp = np.array([1], dtype=np.intp) - self.assert_numpy_array_equal(res, exp) - self.assert_numpy_array_equal(res, chk) + # https://github.com/pandas-dev/pandas/issues/14522 + + c1 = pd.Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'], + categories=['cheese', 'milk', 'apple', 'bread'], + ordered=True) + s1 = pd.Series(c1) + c2 = pd.Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'], + categories=['cheese', 'milk', 'apple', 'bread'], + ordered=False) + s2 = pd.Series(c2) + + # Searching for single item argument, side='left' (default) + res_cat = c1.searchsorted('apple') + res_ser = s1.searchsorted('apple') + exp = np.array([2], dtype=np.intp) + self.assert_numpy_array_equal(res_cat, exp) + self.assert_numpy_array_equal(res_ser, exp) + + # Searching for single item array, side='left' (default) + res_cat = c1.searchsorted(['bread']) + res_ser = s1.searchsorted(['bread']) + exp = np.array([3], dtype=np.intp) + self.assert_numpy_array_equal(res_cat, exp) + self.assert_numpy_array_equal(res_ser, exp) + + # Searching for several items array, side='right' + res_cat = c1.searchsorted(['apple', 'bread'], side='right') + res_ser = s1.searchsorted(['apple', 'bread'], side='right') + exp = np.array([3, 5], dtype=np.intp) + self.assert_numpy_array_equal(res_cat, exp) + self.assert_numpy_array_equal(res_ser, exp) - # Searching for a value that is not present in the Categorical - res = c1.searchsorted(['bread', 'eggs']) - chk = s1.searchsorted(['bread', 'eggs']) - exp = np.array([1, 4], dtype=np.intp) - self.assert_numpy_array_equal(res, exp) - self.assert_numpy_array_equal(res, chk) + 
# Searching for a single value that is not from the Categorical + self.assertRaises(ValueError, lambda: c1.searchsorted('cucumber')) + self.assertRaises(ValueError, lambda: s1.searchsorted('cucumber')) - # Searching for a value that is not present, to the right - res = c1.searchsorted(['bread', 'eggs'], side='right') - chk = s1.searchsorted(['bread', 'eggs'], side='right') - exp = np.array([3, 4], dtype=np.intp) # eggs before milk - self.assert_numpy_array_equal(res, exp) - self.assert_numpy_array_equal(res, chk) - - # As above, but with a sorter array to reorder an unsorted array - res = c2.searchsorted(['bread', 'eggs'], side='right', - sorter=[0, 1, 2, 3, 5, 4]) - chk = s2.searchsorted(['bread', 'eggs'], side='right', - sorter=[0, 1, 2, 3, 5, 4]) - # eggs after donuts, after switching milk and donuts - exp = np.array([3, 5], dtype=np.intp) - self.assert_numpy_array_equal(res, exp) - self.assert_numpy_array_equal(res, chk) + # Searching for multiple values, one of which is not in the Categorical + self.assertRaises(ValueError, + lambda: c1.searchsorted(['bread', 'cucumber'])) + self.assertRaises(ValueError, + lambda: s1.searchsorted(['bread', 'cucumber'])) + + # searchsorted call for unordered Categorical + self.assertRaises(ValueError, lambda: c2.searchsorted('apple')) + self.assertRaises(ValueError, lambda: s2.searchsorted('apple')) with tm.assert_produces_warning(FutureWarning): res = c1.searchsorted(v=['bread']) - exp = np.array([1], dtype=np.intp) + exp = np.array([3], dtype=np.intp) tm.assert_numpy_array_equal(res, exp) def test_deprecated_labels(self):