From ac94414a9506b485be9b5b5a7b5eb6cabf5f4481 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 19 Dec 2018 07:10:36 -0600
Subject: [PATCH 1/5] ENH: ExtensionArray.searchsorted

---
 doc/source/whatsnew/v0.24.0.rst            |  1 +
 pandas/core/arrays/base.py                 | 49 ++++++++++++++++++++++
 pandas/core/arrays/sparse.py               | 10 +++++
 pandas/tests/extension/base/methods.py     | 20 +++++++++
 pandas/tests/extension/json/test_json.py   |  4 ++
 pandas/tests/extension/test_categorical.py |  4 ++
 pandas/tests/extension/test_sparse.py      |  4 ++
 7 files changed, 92 insertions(+)

diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index fe5e4a57c557a..60227652e6e7d 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -990,6 +990,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 - ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`)
 - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore
   the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`)
+- :meth:`~pandas.api.types.ExtensionArray.searchsorted` has been added (:issue:``)
 - An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`)
 - Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`).
 - The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index cf145064fd7b1..e80af43c624fe 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -64,6 +64,7 @@ class ExtensionArray(object):
     * unique
     * factorize / _values_for_factorize
     * argsort / _values_for_argsort
+    * searchsorted
 
     The remaining methods implemented on this class should be performant,
     as they only compose abstract methods. Still, a more efficient
@@ -505,6 +506,54 @@ def unique(self):
         uniques = unique(self.astype(object))
         return self._from_sequence(uniques, dtype=self.dtype)
 
+    def searchsorted(self, v, side="left", sorter=None):
+        """
+        Find indices where elements should be inserted to maintain order.
+
+        .. versionadded:: 0.25.0
+
+        Find the indices into a sorted array `self` (a) such that, if the
+        corresponding elements in `v` were inserted before the indices, the
+        order of `self` would be preserved.
+
+        Assuming that `self` is sorted:
+
+        ======  ============================
+        `side`  returned index `i` satisfies
+        ======  ============================
+        left    ``self[i-1] < v <= self[i]``
+        right   ``self[i-1] <= v < self[i]``
+        ======  ============================
+
+        Parameters
+        ----------
+        v : array_like
+            Values to insert into `self`.
+        side : {'left', 'right'}, optional
+            If 'left', the index of the first suitable location found is given.
+            If 'right', return the last such index.  If there is no suitable
+            index, return either 0 or N (where N is the length of `self`).
+        sorter : 1-D array_like, optional
+            Optional array of integer indices that sort `self` into ascending
+            order. They are typically the result of argsort.
+
+        Returns
+        -------
+        indices : array of ints
+            Array of insertion points with the same shape as `v`.
+
+        See Also
+        --------
+        numpy.searchsorted : Similar method from NumPy.
+        """
+        # Note: the base tests provided by pandas only test the basics.
+        # We do not test
+        # 1. Values outside the range of the `data_for_sorting` fixture
+        # 2. Values between the values in the `data_for_sorting` fixture
+        # 3. Missing values.
+        arr = self.astype(object)
+        return arr.searchsorted(v, side=side, sorter=sorter)
+
     def _values_for_factorize(self):
         # type: () -> Tuple[ndarray, Any]
         """
diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index 9e1d2efc21b81..4ceeaa1fa9157 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -1166,6 +1166,16 @@ def _take_without_fill(self, indices):
 
         return taken
 
+    def searchsorted(self, v, side="left", sorter=None):
+        msg = "searchsorted requires high memory usage."
+        warnings.warn(msg, PerformanceWarning, stacklevel=2)
+        # np.asarray handles scalars (0-d array) and array-likes uniformly,
+        # so no is_scalar() special case is needed before searching.
+        v = np.asarray(v)
+        return np.asarray(self, dtype=self.dtype.subtype).searchsorted(
+            v, side, sorter
+        )
+
     def copy(self, deep=False):
         if deep:
             values = self.sp_values.copy()
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index 4a409a84f3db4..93e6e70372478 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -231,6 +231,26 @@ def test_hash_pandas_object_works(self, data, as_frame):
         b = pd.util.hash_pandas_object(data)
         self.assert_equal(a, b)
 
+    def test_searchsorted(self, data_for_sorting):
+        b, c, a = data_for_sorting
+        arr = type(data_for_sorting)._from_sequence([a, b, c])
+        assert arr.searchsorted(a) == 0
+        assert arr.searchsorted(a, side="right") == 1
+
+        assert arr.searchsorted(b) == 1
+        assert arr.searchsorted(b, side="right") == 2
+
+        assert arr.searchsorted(c) == 2
+        assert arr.searchsorted(c, side="right") == 3
+
+        result = arr.searchsorted(arr.take([0, 2]))
+        expected = np.array([0, 2])
+        tm.assert_numpy_array_equal(result, expected)
+
+        # sorter
+        sorter = np.array([2, 0, 1])  # argsort of the fixture [b, c, a]
+        assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
+
     @pytest.mark.parametrize("as_frame", [True, False])
     def test_where_series(self, data, na_value, as_frame):
         assert data[0] != data[1]
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index a35997b07fd83..9ee131950f19c 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -232,6 +232,10 @@ def test_where_series(self, data, na_value):
         # with shapes (4,) (4,) (0,)
         super().test_where_series(data, na_value)
 
+    @pytest.mark.skip(reason="Can't compare dicts.")
+    def test_searchsorted(self, data_for_sorting):
+        super().test_searchsorted(data_for_sorting)
+
 
 class TestCasting(BaseJSON, base.BaseCastingTests):
     @pytest.mark.skip(reason="failing on np.array(self, dtype=str)")
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
index 6106bc3d58620..c876db416470c 100644
--- a/pandas/tests/extension/test_categorical.py
+++ b/pandas/tests/extension/test_categorical.py
@@ -189,6 +189,10 @@ def test_combine_add(self, data_repeated):
     def test_fillna_length_mismatch(self, data_missing):
         super().test_fillna_length_mismatch(data_missing)
 
+    def test_searchsorted(self, data_for_sorting):
+        if not data_for_sorting.ordered:
+            pytest.skip("searchsorted requires ordered data.")
+
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
index ea849a78cda12..7fceade674595 100644
--- a/pandas/tests/extension/test_sparse.py
+++ b/pandas/tests/extension/test_sparse.py
@@ -286,6 +286,10 @@ def test_combine_first(self, data):
             pytest.skip("TODO(SparseArray.__setitem__ will preserve dtype.")
         super(TestMethods, self).test_combine_first(data)
 
+    def test_searchsorted(self, data_for_sorting):
+        with tm.assert_produces_warning(PerformanceWarning):
+            super(TestMethods, self).test_searchsorted(data_for_sorting)
+
 
 class TestCasting(BaseSparseTests, base.BaseCastingTests):
     pass

From 58418abee8b1a02f9b4b1512286b0db96aaf35aa Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 19 Dec 2018 07:22:17 -0600
Subject: [PATCH 2/5] PR number

---
 doc/source/whatsnew/v0.24.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 60227652e6e7d..b26a30f0cd2b8 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -990,7 +990,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 - ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`)
 - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore
   the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`)
-- :meth:`~pandas.api.types.ExtensionArray.searchsorted` has been added (:issue:``)
+- :meth:`~pandas.api.extensions.ExtensionArray.searchsorted` has been added (:issue:`24350`)
 - An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`)
 - Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`).
 - The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`)

From fee9c1a332de0f272f9c4480915bb0686e44724b Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 19 Dec 2018 08:16:32 -0600
Subject: [PATCH 3/5] 32-bit compat

---
 pandas/tests/extension/base/methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index 93e6e70372478..daa7fe05b9cc8 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -244,7 +244,7 @@ def test_searchsorted(self, data_for_sorting):
         assert arr.searchsorted(c, side="right") == 3
 
         result = arr.searchsorted(arr.take([0, 2]))
-        expected = np.array([0, 2])
+        expected = np.array([0, 2], dtype=np.intp)
         tm.assert_numpy_array_equal(result, expected)
 
         # sorter

From ff8bbc361b5bed4c244ba38a4a49c0e2e79758e5 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 19 Dec 2018 08:28:41 -0600
Subject: [PATCH 4/5] updates

---
 pandas/core/arrays/base.py             | 2 +-
 pandas/core/base.py                    | 2 +-
 pandas/tests/extension/base/methods.py | 7 ++++++-
 pandas/tests/extension/test_sparse.py  | 6 ++++--
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index e80af43c624fe..7226311614bc2 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -510,7 +510,7 @@ def searchsorted(self, v, side="left", sorter=None):
         """
         Find indices where elements should be inserted to maintain order.
 
-        .. versionadded:: 0.25.0
+        .. versionadded:: 0.24.0
 
         Find the indices into a sorted array `self` (a) such that, if the
         corresponding elements in `v` were inserted before the indices, the
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 928e90977f95b..2b1dc4e86663a 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -1399,7 +1399,7 @@ def factorize(self, sort=False, na_sentinel=-1):
     @Appender(_shared_docs['searchsorted'])
     def searchsorted(self, value, side='left', sorter=None):
         # needs coercion on the key (DatetimeIndex does already)
-        return self.values.searchsorted(value, side=side, sorter=sorter)
+        return self._values.searchsorted(value, side=side, sorter=sorter)
 
     def drop_duplicates(self, keep='first', inplace=False):
         inplace = validate_bool_kwarg(inplace, 'inplace')
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index daa7fe05b9cc8..00d5c18c28edd 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -231,9 +231,13 @@ def test_hash_pandas_object_works(self, data, as_frame):
         b = pd.util.hash_pandas_object(data)
         self.assert_equal(a, b)
 
-    def test_searchsorted(self, data_for_sorting):
+    @pytest.mark.parametrize("as_series", [True, False])
+    def test_searchsorted(self, data_for_sorting, as_series):
         b, c, a = data_for_sorting
         arr = type(data_for_sorting)._from_sequence([a, b, c])
+
+        if as_series:
+            arr = pd.Series(arr)
         assert arr.searchsorted(a) == 0
         assert arr.searchsorted(a, side="right") == 1
 
@@ -245,6 +249,7 @@ def test_searchsorted(self, data_for_sorting):
 
         result = arr.searchsorted(arr.take([0, 2]))
         expected = np.array([0, 2], dtype=np.intp)
+
         tm.assert_numpy_array_equal(result, expected)
 
         # sorter
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
index 7fceade674595..257eb44cd94fe 100644
--- a/pandas/tests/extension/test_sparse.py
+++ b/pandas/tests/extension/test_sparse.py
@@ -286,9 +286,11 @@ def test_combine_first(self, data):
             pytest.skip("TODO(SparseArray.__setitem__ will preserve dtype.")
         super(TestMethods, self).test_combine_first(data)
 
-    def test_searchsorted(self, data_for_sorting):
+    @pytest.mark.parametrize("as_series", [True, False])
+    def test_searchsorted(self, data_for_sorting, as_series):
         with tm.assert_produces_warning(PerformanceWarning):
-            super(TestMethods, self).test_searchsorted(data_for_sorting)
+            super(TestMethods, self).test_searchsorted(data_for_sorting,
+                                                       as_series=as_series)
 
 
 class TestCasting(BaseSparseTests, base.BaseCastingTests):

From a91fcec63155c010a1f6677b61dbd6d0235b111d Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 28 Dec 2018 13:54:45 -0600
Subject: [PATCH 5/5] v -> value

---
 pandas/core/arrays/base.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 0f4bcf41a9331..2d4f8ca9c2cee 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -519,7 +519,7 @@ def unique(self):
         uniques = unique(self.astype(object))
         return self._from_sequence(uniques, dtype=self.dtype)
 
-    def searchsorted(self, v, side="left", sorter=None):
+    def searchsorted(self, value, side="left", sorter=None):
         """
         Find indices where elements should be inserted to maintain order.
 
@@ -540,7 +540,7 @@ def searchsorted(self, v, side="left", sorter=None):
 
         Parameters
         ----------
-        v : array_like
+        value : array_like
             Values to insert into `self`.
         side : {'left', 'right'}, optional
             If 'left', the index of the first suitable location found is given.
@@ -553,7 +553,7 @@ def searchsorted(self, v, side="left", sorter=None):
         Returns
         -------
         indices : array of ints
-            Array of insertion points with the same shape as `v`.
+            Array of insertion points with the same shape as `value`.
 
         See Also
         --------
@@ -565,7 +565,7 @@ def searchsorted(self, v, side="left", sorter=None):
         # 2. Values between the values in the `data_for_sorting` fixture
         # 3. Missing values.
         arr = self.astype(object)
-        return arr.searchsorted(v, side=side, sorter=sorter)
+        return arr.searchsorted(value, side=side, sorter=sorter)
 
     def _values_for_factorize(self):
         # type: () -> Tuple[ndarray, Any]