Skip to content

Commit 5928075

Browse files
stevesimmons authored and jreback committed
ENH: Implement Categorical.searchsorted(v, side, sorter) GH8420
1 parent 526f33c commit 5928075

File tree

3 files changed

+96
-7
lines changed

3 files changed

+96
-7
lines changed

doc/source/whatsnew/v0.15.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ Enhancements
6969
- Added the ability to specify the SQL type of columns when writing a DataFrame to a database (:issue:`8778`).
7070
- Added ability to export Categorical data to Stata (:issue:`8633`). See :ref:`here <io.stata-categorical>` for limitations of categorical variables exported to Stata data files.
7171
- Added ability to export Categorical data to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here <io.hdf5-categorical>` for an example and caveats w.r.t. prior versions of pandas.
72+
- Added support for ``searchsorted()`` on `Categorical` class (:issue:`8420`).
7273
- Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`).
7374
- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here<remote_data.ga>`.
7475
- Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here <io.stata-categorical>` for more information on importing categorical variables from Stata data files.

pandas/core/categorical.py

+55-1
Original file line numberDiff line numberDiff line change
@@ -782,7 +782,61 @@ def nbytes(self):
782782
return self._codes.nbytes + self._categories.values.nbytes
783783

784784
def searchsorted(self, v, side='left', sorter=None):
785-
raise NotImplementedError("See https://github.com/pydata/pandas/issues/8420")
785+
"""Find indices where elements should be inserted to maintain order.
786+
787+
Find the indices into a sorted Categorical `self` such that, if the
788+
corresponding elements in `v` were inserted before the indices, the
789+
order of `self` would be preserved.
790+
791+
Parameters
792+
----------
793+
v : array_like
794+
Array-like values or a scalar value, to insert/search for in `self`.
795+
side : {'left', 'right'}, optional
796+
If 'left', the index of the first suitable location found is given.
797+
If 'right', return the last such index. If there is no suitable
798+
index, return either 0 or N (where N is the length of `self`).
799+
sorter : 1-D array_like, optional
800+
Optional array of integer indices that sort `self` into ascending
801+
order. They are typically the result of ``np.argsort``.
802+
803+
Returns
804+
-------
805+
indices : array of ints
806+
Array of insertion points with the same shape as `v`.
807+
808+
See Also
809+
--------
810+
Series.searchsorted
811+
numpy.searchsorted
812+
813+
Notes
814+
-----
815+
Binary search is used to find the required insertion points.
816+
817+
Examples
818+
--------
819+
>>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk' ])
820+
[apple, bread, bread, cheese, milk]
821+
Categories (4, object): [apple < bread < cheese < milk]
822+
>>> x.searchsorted('bread')
823+
array([1]) # Note: an array, not a scalar
824+
>>> x.searchsorted(['bread'])
825+
array([1])
826+
>>> x.searchsorted(['bread', 'eggs'])
827+
array([1, 4])
828+
>>> x.searchsorted(['bread', 'eggs'], side='right')
829+
array([3, 4]) # eggs before milk
830+
>>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
831+
>>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
832+
array([3, 5]) # eggs after donuts, after switching milk and donuts
833+
"""
834+
if not self.ordered:
835+
raise ValueError("searchsorted requires an ordered Categorical.")
836+
837+
from pandas.core.series import Series
838+
values_as_codes = self.categories.values.searchsorted(Series(v).values, side)
839+
return self.codes.searchsorted(values_as_codes, sorter=sorter)
786840

787841
def isnull(self):
788842
"""

pandas/tests/test_categorical.py

+40-6
Original file line numberDiff line numberDiff line change
@@ -889,13 +889,47 @@ def test_nbytes(self):
889889
self.assertEqual(cat.nbytes, exp)
890890

891891
def test_searchsorted(self):
892+
# https://github.com/pydata/pandas/issues/8420
893+
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ])
894+
s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
895+
c1 = pd.Categorical(s1)
896+
c2 = pd.Categorical(s2)
897+
898+
# Single item array
899+
res = c1.searchsorted(['bread'])
900+
chk = s1.searchsorted(['bread'])
901+
exp = np.array([1])
902+
self.assert_numpy_array_equal(res, exp)
903+
self.assert_numpy_array_equal(res, chk)
892904

893-
# See https://github.com/pydata/pandas/issues/8420
894-
# TODO: implement me...
895-
cat = pd.Categorical([1,2,3])
896-
def f():
897-
cat.searchsorted(3)
898-
self.assertRaises(NotImplementedError, f)
905+
# Scalar version of single item array
906+
# Categorical returns an np.array, like pd.Series does, which differs from np.array.searchsorted()
907+
res = c1.searchsorted('bread')
908+
chk = s1.searchsorted('bread')
909+
exp = np.array([1])
910+
self.assert_numpy_array_equal(res, exp)
911+
self.assert_numpy_array_equal(res, chk)
912+
913+
# Searching for a value that is not present in the Categorical
914+
res = c1.searchsorted(['bread', 'eggs'])
915+
chk = s1.searchsorted(['bread', 'eggs'])
916+
exp = np.array([1, 4])
917+
self.assert_numpy_array_equal(res, exp)
918+
self.assert_numpy_array_equal(res, chk)
919+
920+
# Searching for a value that is not present, to the right
921+
res = c1.searchsorted(['bread', 'eggs'], side='right')
922+
chk = s1.searchsorted(['bread', 'eggs'], side='right')
923+
exp = np.array([3, 4]) # eggs before milk
924+
self.assert_numpy_array_equal(res, exp)
925+
self.assert_numpy_array_equal(res, chk)
926+
927+
# As above, but with a sorter array to reorder an unsorted array
928+
res = c2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
929+
chk = s2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
930+
exp = np.array([3, 5]) # eggs after donuts, after switching milk and donuts
931+
self.assert_numpy_array_equal(res, exp)
932+
self.assert_numpy_array_equal(res, chk)
899933

900934
def test_deprecated_labels(self):
901935
# TODO: labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier

0 commit comments

Comments
 (0)