From 97568690b180997ba352cc44b911a950f333bf15 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Fri, 2 Nov 2018 22:10:53 +0000
Subject: [PATCH 1/3] API/PERF: Categorical.searchsorted faster and returns
 scalar

---
 doc/source/whatsnew/v0.24.0.rst               |  3 +++
 pandas/core/arrays/categorical.py             | 25 ++++++++++++-------
 pandas/core/indexes/category.py               |  2 +-
 .../arrays/categorical/test_analytics.py      | 11 ++++----
 4 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 69232fa836102..67cc21f2725e1 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -1007,6 +1007,8 @@ Other API Changes
 - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
 - Comparing :class:`Timedelta` to be less or greater than unknown types now raises a ``TypeError`` instead of returning ``False`` (:issue:`20829`)
+- :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`).
+- :meth:`Categorical.searchsorted` now raises a ``keyError`` rather that a ``ValueError``, if a search for key is not found in its categories (:issue:`23466`).
 - :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`).
 - The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`)
 
@@ -1130,6 +1132,7 @@ Performance Improvements
 - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
 - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)
+- Improved performance of :meth:`Categorical.searchsorted` (:issue:`23466`)
 - Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex`
   (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains`
   is likewise much faster (:issue:`21369`, :issue:`21508`)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index d226d8c2e7ee2..06346b76d7929 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1335,6 +1335,16 @@ def memory_usage(self, deep=False):
         return self._codes.nbytes + self.dtype.categories.memory_usage(
             deep=deep)
 
+    def _ensure_codes_dtype(self, code):
+        """
+        Ensure ``code`` has the same dtype as self.codes.
+        """
+        dtype = self.codes.dtype
+        if is_scalar(code):
+            return dtype.type(code)
+        else:
+            return np.array(code, dtype=dtype)
+
     @Substitution(klass='Categorical')
     @Appender(_shared_docs['searchsorted'])
     def searchsorted(self, value, side='left', sorter=None):
@@ -1343,16 +1353,13 @@ def searchsorted(self, value, side='left', sorter=None):
                              ".as_ordered() to change the Categorical to an "
                              "ordered one")
 
-        from pandas.core.series import Series
-
-        values_as_codes = _get_codes_for_values(Series(value).values,
-                                                self.categories)
-
-        if -1 in values_as_codes:
-            raise ValueError("Value(s) to be inserted must be in categories.")
+        if is_scalar(value):
+            codes = self.categories.get_loc(value)
+        else:
+            codes = [self.categories.get_loc(val) for val in value]
+        codes = self._ensure_codes_dtype(codes)
 
-        return self.codes.searchsorted(values_as_codes, side=side,
-                                       sorter=sorter)
+        return self.codes.searchsorted(codes, side=side, sorter=sorter)
 
     def isna(self):
         """
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 6e2f0b00fcd6e..7db80c6261b8e 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -465,7 +465,7 @@ def get_loc(self, key, method=None):
         array([False,  True, False,  True], dtype=bool)
         """
         code = self.categories.get_loc(key)
-        code = self.codes.dtype.type(code)
+        code = self.values._ensure_codes_dtype(code)
         try:
             return self._engine.get_loc(code)
         except KeyError:
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
index ea6facd66a1a3..d2bba3d12d096 100644
--- a/pandas/tests/arrays/categorical/test_analytics.py
+++ b/pandas/tests/arrays/categorical/test_analytics.py
@@ -85,9 +85,10 @@ def test_searchsorted(self):
 
         # Searching for single item argument, side='left' (default)
         res_cat = c1.searchsorted('apple')
+        assert res_cat == 2
+
         res_ser = s1.searchsorted('apple')
         exp = np.array([2], dtype=np.intp)
-        tm.assert_numpy_array_equal(res_cat, exp)
         tm.assert_numpy_array_equal(res_ser, exp)
 
         # Searching for single item array, side='left' (default)
@@ -105,13 +106,13 @@ def test_searchsorted(self):
         tm.assert_numpy_array_equal(res_ser, exp)
 
         # Searching for a single value that is not from the Categorical
-        pytest.raises(ValueError, lambda: c1.searchsorted('cucumber'))
-        pytest.raises(ValueError, lambda: s1.searchsorted('cucumber'))
+        pytest.raises(KeyError, lambda: c1.searchsorted('cucumber'))
+        pytest.raises(KeyError, lambda: s1.searchsorted('cucumber'))
 
         # Searching for multiple values one of each is not from the Categorical
-        pytest.raises(ValueError,
+        pytest.raises(KeyError,
                       lambda: c1.searchsorted(['bread', 'cucumber']))
-        pytest.raises(ValueError,
+        pytest.raises(KeyError,
                       lambda: s1.searchsorted(['bread', 'cucumber']))
 
         # searchsorted call for unordered Categorical

From 1476a3e4a8506972cc6bf2c8dd77d17b7daf15f1 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sat, 3 Nov 2018 14:29:42 +0000
Subject: [PATCH 2/3] Use get_indexer for list-like values per review comments

---
 pandas/core/arrays/categorical.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 06346b76d7929..301ad8a71396b 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1356,7 +1356,9 @@ def searchsorted(self, value, side='left', sorter=None):
         if is_scalar(value):
             codes = self.categories.get_loc(value)
         else:
-            codes = [self.categories.get_loc(val) for val in value]
+            codes = self.categories.get_indexer(value)
+            if -1 in codes:
+                raise KeyError("All values not in self.categories")
         codes = self._ensure_codes_dtype(codes)
 
         return self.codes.searchsorted(codes, side=side, sorter=sorter)

From d7b68732724fce7c093d80e2123ddf29ec5818c6 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Wed, 14 Nov 2018 20:12:49 +0000
Subject: [PATCH 3/3] Revert to using _get_codes_for_values

---
 doc/source/whatsnew/v0.24.0.rst               |  3 +--
 pandas/core/arrays/categorical.py             | 23 +++++--------------
 pandas/core/indexes/category.py               |  2 +-
 .../arrays/categorical/test_analytics.py      |  3 +--
 4 files changed, 9 insertions(+), 22 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 67cc21f2725e1..3c95453aa2089 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -1008,7 +1008,7 @@ Other API Changes
 - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
 - Comparing :class:`Timedelta` to be less or greater than unknown types now raises a ``TypeError`` instead of returning ``False`` (:issue:`20829`)
 - :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`).
-- :meth:`Categorical.searchsorted` now raises a ``keyError`` rather that a ``ValueError``, if a search for key is not found in its categories (:issue:`23466`).
+- :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather than a ``ValueError``, if a searched-for key is not found in its categories (:issue:`23466`).
 - :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`).
 - The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`)
 
@@ -1132,7 +1132,6 @@ Performance Improvements
 - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
 - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)
-- Improved performance of :meth:`Categorical.searchsorted` (:issue:`23466`)
 - Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex`
   (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains`
   is likewise much faster (:issue:`21369`, :issue:`21508`)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 301ad8a71396b..276ef6426a51b 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1335,16 +1335,6 @@ def memory_usage(self, deep=False):
         return self._codes.nbytes + self.dtype.categories.memory_usage(
             deep=deep)
 
-    def _ensure_codes_dtype(self, code):
-        """
-        Ensure ``code`` has the same dtype as self.codes.
-        """
-        dtype = self.codes.dtype
-        if is_scalar(code):
-            return dtype.type(code)
-        else:
-            return np.array(code, dtype=dtype)
-
     @Substitution(klass='Categorical')
     @Appender(_shared_docs['searchsorted'])
     def searchsorted(self, value, side='left', sorter=None):
@@ -1353,13 +1343,12 @@ def searchsorted(self, value, side='left', sorter=None):
                              ".as_ordered() to change the Categorical to an "
                              "ordered one")
 
-        if is_scalar(value):
-            codes = self.categories.get_loc(value)
-        else:
-            codes = self.categories.get_indexer(value)
-            if -1 in codes:
-                raise KeyError("All values not in self.categories")
-        codes = self._ensure_codes_dtype(codes)
+        from pandas.core.series import Series
+        codes = _get_codes_for_values(Series(value).values, self.categories)
+        if -1 in codes:
+            raise KeyError("Value(s) to be inserted must be in categories.")
+
+        codes = codes[0] if is_scalar(value) else codes
 
         return self.codes.searchsorted(codes, side=side, sorter=sorter)
 
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 7db80c6261b8e..6e2f0b00fcd6e 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -465,7 +465,7 @@ def get_loc(self, key, method=None):
         array([False,  True, False,  True], dtype=bool)
         """
         code = self.categories.get_loc(key)
-        code = self.values._ensure_codes_dtype(code)
+        code = self.codes.dtype.type(code)
         try:
             return self._engine.get_loc(code)
         except KeyError:
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
index d2bba3d12d096..4251273e424dd 100644
--- a/pandas/tests/arrays/categorical/test_analytics.py
+++ b/pandas/tests/arrays/categorical/test_analytics.py
@@ -88,8 +88,7 @@ def test_searchsorted(self):
         assert res_cat == 2
 
         res_ser = s1.searchsorted('apple')
-        exp = np.array([2], dtype=np.intp)
-        tm.assert_numpy_array_equal(res_ser, exp)
+        assert res_ser == 2
 
         # Searching for single item array, side='left' (default)
         res_cat = c1.searchsorted(['bread'])