ENH: Implement Categorical.searchsorted(v, side, sorter) GH8420

stevesimmons · jreback · commit 5928075f5871 · 2014-12-05T08:35:30.000-05:00
diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
@@ -69,6 +69,7 @@ Enhancements
 - Added the ability to specify the SQL type of columns when writing a DataFrame to a database (:issue:`8778`).
 - Added ability to export Categorical data to Stata (:issue:`8633`).  See :ref:`here <io.stata-categorical>` for limitations of categorical variables exported to Stata data files.
 - Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here <io.hdf5-categorical>` for an example and caveats w.r.t. prior versions of pandas.
+- Added support for ``searchsorted()`` on `Categorical` class (:issue:`8420`).
 - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`).
 - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here<remote_data.ga>`.
 - Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`).  See :ref:`here <io.stata-categorical>` for more information on importing categorical variables from Stata data files.
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -782,7 +782,61 @@ def nbytes(self):
         return self._codes.nbytes + self._categories.values.nbytes
 
     def searchsorted(self, v, side='left', sorter=None):
-        raise NotImplementedError("See https://github.com/pydata/pandas/issues/8420")
+        """Find indices where elements should be inserted to maintain order.
+
+        Find the indices into a sorted Categorical `self` such that, if the
+        corresponding elements in `v` were inserted before the indices, the
+        order of `self` would be preserved.
+
+        Parameters
+        ----------
+        v : array_like
+            Array-like values or a scalar value, to insert/search for in `self`.
+        side : {'left', 'right'}, optional
+            If 'left', the index of the first suitable location found is given.
+            If 'right', return the last such index.  If there is no suitable
+            index, return either 0 or N (where N is the length of `self`).
+        sorter : 1-D array_like, optional
+            Optional array of integer indices that sort `self` into ascending
+            order. They are typically the result of ``np.argsort``.
+
+        Returns
+        -------
+        indices : array of ints
+            Array of insertion points with the same shape as `v`.
+
+        See Also
+        --------
+        Series.searchsorted
+        numpy.searchsorted
+
+        Notes
+        -----
+        Binary search is used to find the required insertion points.
+
+        Examples
+        --------
+        >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk' ])
+        [apple, bread, bread, cheese, milk]
+        Categories (4, object): [apple < bread < cheese < milk]
+        >>> x.searchsorted('bread')
+        array([1])     # Note: an array, not a scalar
+        >>> x.searchsorted(['bread'])
+        array([1])
+        >>> x.searchsorted(['bread', 'eggs'])
+        array([1, 4])
+        >>> x.searchsorted(['bread', 'eggs'], side='right')
+        array([3, 4])       # eggs before milk
+        >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
+        >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
+        array([3, 5])       # eggs after donuts, once the sorter swaps milk and donuts
+        """
+        if not self.ordered:
+            raise ValueError("searchsorted requires an ordered Categorical.")
+
+        from pandas.core.series import Series
+        values_as_codes = self.categories.values.searchsorted(Series(v).values, side)
+        return self.codes.searchsorted(values_as_codes, sorter=sorter)
 
     def isnull(self):
         """
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -889,13 +889,47 @@ def test_nbytes(self):
         self.assertEqual(cat.nbytes, exp)
 
     def test_searchsorted(self):
+        # https://github.com/pydata/pandas/issues/8420
+        s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ])
+        s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
+        c1 = pd.Categorical(s1)
+        c2 = pd.Categorical(s2)
+
+        # Single item array
+        res = c1.searchsorted(['bread'])
+        chk = s1.searchsorted(['bread'])
+        exp = np.array([1])
+        self.assert_numpy_array_equal(res, exp)
+        self.assert_numpy_array_equal(res, chk)
 
-        # See https://github.com/pydata/pandas/issues/8420
-        # TODO: implement me...
-        cat = pd.Categorical([1,2,3])
-        def f():
-            cat.searchsorted(3)
-        self.assertRaises(NotImplementedError, f)
+        # Scalar version of single item array
+        # Categorical returns an np.array, as pd.Series does, unlike np.ndarray.searchsorted() which returns a scalar
+        res = c1.searchsorted('bread')
+        chk = s1.searchsorted('bread')
+        exp = np.array([1])
+        self.assert_numpy_array_equal(res, exp)
+        self.assert_numpy_array_equal(res, chk)
+       
+        # Searching for a value that is not present in the Categorical 
+        res = c1.searchsorted(['bread', 'eggs'])
+        chk = s1.searchsorted(['bread', 'eggs'])
+        exp = np.array([1, 4])
+        self.assert_numpy_array_equal(res, exp)
+        self.assert_numpy_array_equal(res, chk)
+
+        # Searching for a value that is not present, to the right
+        res = c1.searchsorted(['bread', 'eggs'], side='right')
+        chk = s1.searchsorted(['bread', 'eggs'], side='right')
+        exp = np.array([3, 4])	    # eggs before milk
+        self.assert_numpy_array_equal(res, exp)
+        self.assert_numpy_array_equal(res, chk)
+
+        # As above, but with a sorter array to reorder an unsorted array
+        res = c2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
+        chk = s2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
+        exp = np.array([3, 5])       # eggs after donuts, after switching milk and donuts 
+        self.assert_numpy_array_equal(res, exp)
+        self.assert_numpy_array_equal(res, chk)
 
     def test_deprecated_labels(self):
         # TODO: labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier