From a21dc4d6f0f893b4f6b2a70776e25b8df6829d32 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 28 Jun 2018 22:43:27 +0100 Subject: [PATCH] PERF/API: Remove ordered requirement in Categorical.searchsorted --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/arrays/categorical.py | 7 ---- pandas/core/base.py | 14 +++++++ .../arrays/categorical/test_analytics.py | 41 +++++++------------ 4 files changed, 29 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3406f52b06a61..2668734031ee1 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -173,7 +173,7 @@ Categorical - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) -- +- :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`) - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 870628500af21..33d1de01fa3db 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1399,13 +1399,6 @@ def memory_usage(self, deep=False): @Substitution(klass="Categorical") @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): - if not self.ordered: - raise ValueError( - "Categorical not ordered\nyou can use " - ".as_ordered() to change the Categorical to an " - "ordered one" - ) - from pandas.core.series import Series codes = _get_codes_for_values(Series(value).values, self.categories) diff --git a/pandas/core/base.py b/pandas/core/base.py index 910b05c47071d..7df3ae97ccad2 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1515,6 +1515,12 @@ def factorize(self, sort=False, na_sentinel=-1): corresponding elements in `value` were inserted before the indices, the order of `self` would be preserved. + .. note:: + + The %(klass)s *must* be monotonically sorted, otherwise + wrong locations will likely be returned. Pandas does *not* + check this for you. + Parameters ---------- value : array_like @@ -1540,6 +1546,7 @@ def factorize(self, sort=False, na_sentinel=-1): See Also -------- + sort_values numpy.searchsorted Notes @@ -1578,6 +1585,13 @@ def factorize(self, sort=False, na_sentinel=-1): >>> x.searchsorted(['bread'], side='right') array([3]) + + If the values are not monotonically sorted, wrong locations + may be returned: + + >>> x = pd.Series([2, 1, 3]) + >>> x.searchsorted(1) + 0 # wrong result, correct would be 1 """ @Substitution(klass="Index") diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index d8831d7e6bf36..86750244d5fb5 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -78,42 +78,36 @@ def test_mode(self, values, categories, exp_mode): exp = Categorical(exp_mode, categories=categories, ordered=True) tm.assert_categorical_equal(res, exp) - def test_searchsorted(self): + def test_searchsorted(self, ordered_fixture): # https://github.com/pandas-dev/pandas/issues/8420 # https://github.com/pandas-dev/pandas/issues/14522 - c1 = Categorical( - ["cheese", "milk", "apple", "bread", "bread"], - categories=["cheese", "milk", "apple", "bread"], - ordered=True, - ) - s1 = Series(c1) - c2 = Categorical( + cat = Categorical( ["cheese", "milk", "apple", "bread", "bread"], categories=["cheese", "milk", "apple", "bread"], - ordered=False, + ordered=ordered_fixture, ) - s2 = Series(c2) + ser = Series(cat) # Searching for single item argument, side='left' (default) - res_cat = c1.searchsorted("apple") + res_cat = cat.searchsorted("apple") assert res_cat == 2 assert is_scalar(res_cat) - res_ser = s1.searchsorted("apple") + res_ser = ser.searchsorted("apple") assert res_ser == 2 assert is_scalar(res_ser) # Searching for single item array, side='left' (default) - res_cat = c1.searchsorted(["bread"]) - res_ser = s1.searchsorted(["bread"]) + res_cat = cat.searchsorted(["bread"]) + res_ser = ser.searchsorted(["bread"]) exp = np.array([3], dtype=np.intp) tm.assert_numpy_array_equal(res_cat, exp) tm.assert_numpy_array_equal(res_ser, exp) # Searching for several items array, side='right' - res_cat = c1.searchsorted(["apple", "bread"], side="right") - res_ser = s1.searchsorted(["apple", "bread"], side="right") + res_cat = cat.searchsorted(["apple", "bread"], side="right") + res_ser = ser.searchsorted(["apple", "bread"], side="right") exp = np.array([3, 5], dtype=np.intp) tm.assert_numpy_array_equal(res_cat, exp) tm.assert_numpy_array_equal(res_ser, exp) @@ -121,22 +115,15 @@ def test_searchsorted(self): # Searching for a single value that is not from the Categorical msg = r"Value\(s\) to be inserted must be in categories" with pytest.raises(KeyError, match=msg): - c1.searchsorted("cucumber") + cat.searchsorted("cucumber") with pytest.raises(KeyError, match=msg): - s1.searchsorted("cucumber") + ser.searchsorted("cucumber") # Searching for multiple values one of each is not from the Categorical with pytest.raises(KeyError, match=msg): - c1.searchsorted(["bread", "cucumber"]) + cat.searchsorted(["bread", "cucumber"]) with pytest.raises(KeyError, match=msg): - s1.searchsorted(["bread", "cucumber"]) - - # searchsorted call for unordered Categorical - msg = "Categorical not ordered" - with pytest.raises(ValueError, match=msg): - c2.searchsorted("apple") - with pytest.raises(ValueError, match=msg): - s2.searchsorted("apple") + ser.searchsorted(["bread", "cucumber"]) def test_unique(self): # categories are reordered based on value when ordered=False