From 97568690b180997ba352cc44b911a950f333bf15 Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 2 Nov 2018 22:10:53 +0000 Subject: [PATCH 1/3] API/PERF: Categorical.searchsorted faster and returns scalar --- doc/source/whatsnew/v0.24.0.rst | 3 +++ pandas/core/arrays/categorical.py | 25 ++++++++++++------- pandas/core/indexes/category.py | 2 +- .../arrays/categorical/test_analytics.py | 11 ++++---- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 69232fa836102..67cc21f2725e1 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1007,6 +1007,8 @@ Other API Changes - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - Comparing :class:`Timedelta` to be less or greater than unknown types now raises a ``TypeError`` instead of returning ``False`` (:issue:`20829`) +- :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`). +- :meth:`Categorical.searchsorted` now raises a ``keyError`` rather that a ``ValueError``, if a search for key is not found in its categories (:issue:`23466`). - :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`). - The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. 
(:issue:`23614`) @@ -1130,6 +1132,7 @@ Performance Improvements - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) +- Improved performance of :meth:`Categorical.searchsorted` (:issue:`23466`) - Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` is likewise much faster (:issue:`21369`, :issue:`21508`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d226d8c2e7ee2..06346b76d7929 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1335,6 +1335,16 @@ def memory_usage(self, deep=False): return self._codes.nbytes + self.dtype.categories.memory_usage( deep=deep) + def _ensure_codes_dtype(self, code): + """ + Ensure ``code`` has the same dtype as self.codes. 
+ """ + dtype = self.codes.dtype + if is_scalar(code): + return dtype.type(code) + else: + return np.array(code, dtype=dtype) + @Substitution(klass='Categorical') @Appender(_shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): @@ -1343,16 +1353,13 @@ def searchsorted(self, value, side='left', sorter=None): ".as_ordered() to change the Categorical to an " "ordered one") - from pandas.core.series import Series - - values_as_codes = _get_codes_for_values(Series(value).values, - self.categories) - - if -1 in values_as_codes: - raise ValueError("Value(s) to be inserted must be in categories.") + if is_scalar(value): + codes = self.categories.get_loc(value) + else: + codes = [self.categories.get_loc(val) for val in value] + codes = self._ensure_codes_dtype(codes) - return self.codes.searchsorted(values_as_codes, side=side, - sorter=sorter) + return self.codes.searchsorted(codes, side=side, sorter=sorter) def isna(self): """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 6e2f0b00fcd6e..7db80c6261b8e 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -465,7 +465,7 @@ def get_loc(self, key, method=None): array([False, True, False, True], dtype=bool) """ code = self.categories.get_loc(key) - code = self.codes.dtype.type(code) + code = self.values._ensure_codes_dtype(code) try: return self._engine.get_loc(code) except KeyError: diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index ea6facd66a1a3..d2bba3d12d096 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -85,9 +85,10 @@ def test_searchsorted(self): # Searching for single item argument, side='left' (default) res_cat = c1.searchsorted('apple') + assert res_cat == 2 + res_ser = s1.searchsorted('apple') exp = np.array([2], dtype=np.intp) - tm.assert_numpy_array_equal(res_cat, exp) 
tm.assert_numpy_array_equal(res_ser, exp) # Searching for single item array, side='left' (default) @@ -105,13 +106,13 @@ def test_searchsorted(self): tm.assert_numpy_array_equal(res_ser, exp) # Searching for a single value that is not from the Categorical - pytest.raises(ValueError, lambda: c1.searchsorted('cucumber')) - pytest.raises(ValueError, lambda: s1.searchsorted('cucumber')) + pytest.raises(KeyError, lambda: c1.searchsorted('cucumber')) + pytest.raises(KeyError, lambda: s1.searchsorted('cucumber')) # Searching for multiple values one of each is not from the Categorical - pytest.raises(ValueError, + pytest.raises(KeyError, lambda: c1.searchsorted(['bread', 'cucumber'])) - pytest.raises(ValueError, + pytest.raises(KeyError, lambda: s1.searchsorted(['bread', 'cucumber'])) # searchsorted call for unordered Categorical From 1476a3e4a8506972cc6bf2c8dd77d17b7daf15f1 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 3 Nov 2018 14:29:42 +0000 Subject: [PATCH 2/3] Updated according to comments --- pandas/core/arrays/categorical.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 06346b76d7929..301ad8a71396b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1356,7 +1356,9 @@ def searchsorted(self, value, side='left', sorter=None): if is_scalar(value): codes = self.categories.get_loc(value) else: - codes = [self.categories.get_loc(val) for val in value] + codes = self.categories.get_indexer(value) + if -1 in codes: + raise KeyError("All values not in self.categories") codes = self._ensure_codes_dtype(codes) return self.codes.searchsorted(codes, side=side, sorter=sorter) From d7b68732724fce7c093d80e2123ddf29ec5818c6 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 14 Nov 2018 20:12:49 +0000 Subject: [PATCH 3/3] track back to use _codes_for_values --- doc/source/whatsnew/v0.24.0.rst | 3 +-- pandas/core/arrays/categorical.py | 23 
+++++-------------- pandas/core/indexes/category.py | 2 +- .../arrays/categorical/test_analytics.py | 3 +-- 4 files changed, 9 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 67cc21f2725e1..3c95453aa2089 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1008,7 +1008,7 @@ Other API Changes - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - Comparing :class:`Timedelta` to be less or greater than unknown types now raises a ``TypeError`` instead of returning ``False`` (:issue:`20829`) - :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`). -- :meth:`Categorical.searchsorted` now raises a ``keyError`` rather that a ``ValueError``, if a search for key is not found in its categories (:issue:`23466`). +- :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather than a ``ValueError`` if a searched-for key is not found in its categories (:issue:`23466`). - :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`). - The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. 
(:issue:`23614`) @@ -1132,7 +1132,6 @@ Performance Improvements - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) -- Improved performance of :meth:`Categorical.searchsorted` (:issue:`23466`) - Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` is likewise much faster (:issue:`21369`, :issue:`21508`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 301ad8a71396b..276ef6426a51b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1335,16 +1335,6 @@ def memory_usage(self, deep=False): return self._codes.nbytes + self.dtype.categories.memory_usage( deep=deep) - def _ensure_codes_dtype(self, code): - """ - Ensure ``code`` has the same dtype as self.codes. 
- """ - dtype = self.codes.dtype - if is_scalar(code): - return dtype.type(code) - else: - return np.array(code, dtype=dtype) - @Substitution(klass='Categorical') @Appender(_shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): @@ -1353,13 +1343,12 @@ def searchsorted(self, value, side='left', sorter=None): ".as_ordered() to change the Categorical to an " "ordered one") - if is_scalar(value): - codes = self.categories.get_loc(value) - else: - codes = self.categories.get_indexer(value) - if -1 in codes: - raise KeyError("All values not in self.categories") - codes = self._ensure_codes_dtype(codes) + from pandas.core.series import Series + codes = _get_codes_for_values(Series(value).values, self.categories) + if -1 in codes: + raise KeyError("Value(s) to be inserted must be in categories.") + + codes = codes[0] if is_scalar(value) else codes return self.codes.searchsorted(codes, side=side, sorter=sorter) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 7db80c6261b8e..6e2f0b00fcd6e 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -465,7 +465,7 @@ def get_loc(self, key, method=None): array([False, True, False, True], dtype=bool) """ code = self.categories.get_loc(key) - code = self.values._ensure_codes_dtype(code) + code = self.codes.dtype.type(code) try: return self._engine.get_loc(code) except KeyError: diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index d2bba3d12d096..4251273e424dd 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -88,8 +88,7 @@ def test_searchsorted(self): assert res_cat == 2 res_ser = s1.searchsorted('apple') - exp = np.array([2], dtype=np.intp) - tm.assert_numpy_array_equal(res_ser, exp) + assert res_ser == 2 # Searching for single item array, side='left' (default) res_cat = c1.searchsorted(['bread'])