From 0047759a9be4b1e2b4490fb885ae8dd458c72147 Mon Sep 17 00:00:00 2001
From: Stephen Simmons <mail@stevesimmons.com>
Date: Sat, 29 Nov 2014 15:42:43 +0000
Subject: [PATCH 1/5] Implement Categorical.searchsorted(v, side, sorter)

---
 pandas/core/categorical.py       | 61 +++++++++++++++++++++++++++++++-
 pandas/tests/test_categorical.py | 23 ++++++++----
 2 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index eb0429ad4a0cd..956c11d7d6429 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -776,7 +776,66 @@ def nbytes(self):
         return self._codes.nbytes + self._categories.values.nbytes
 
     def searchsorted(self, v, side='left', sorter=None):
-        raise NotImplementedError("See https://github.com/pydata/pandas/issues/8420")
+        """Find indices where elements should be inserted to maintain order.
+
+        Find the indices into a sorted Categorical `self` such that, if the
+        corresponding elements in `v` were inserted before the indices, the
+        order of `self` would be preserved.
+
+        Parameters
+        ----------
+        v : array_like
+            Array-like values or a scalar value, to insert/search for in `self`.
+        side : {'left', 'right'}, optional
+            If 'left', the index of the first suitable location found is given.
+            If 'right', return the last such index.  If there is no suitable
+            index, return either 0 or N (where N is the length of `self`).
+        sorter : 1-D array_like, optional
+            Optional array of integer indices that sort `self` into ascending
+            order. They are typically the result of ``np.argsort``.
+
+        Returns
+        -------
+        indices : array of ints
+            Array of insertion points with the same shape as `v`.
+
+        See Also
+        --------
+        Series.searchsorted
+        numpy.searchsorted
+
+        Notes
+        -----
+        Binary search is used to find the required insertion points.
+
+        Examples
+        --------
+        >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk' ])
+        [apple, bread, bread, cheese, milk]
+        Categories (4, object): [apple < bread < cheese < milk]
+        >>> x.searchsorted('bread')
+        array([1])
+        >>> x.searchsorted(['bread', 'eggs'])
+        array([1, 4])
+        >>> x.searchsorted(['bread', 'eggs'], side='right')
+        array([3, 4])	    # eggs before milk
+        >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
+        >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
+        array([3, 5])       # eggs before donuts, after switching milk and donuts 
+        """
+        # Fixes https://github.com/pydata/pandas/issues/8420
+        # Uses searchsorted twice, first to map the value to one of the codes,
+        # then to map the found code to the index into the Categorical.
+        # 'side' gets applied to the first one only, otherwise when side='right'
+        # any non-matching values jump too far to the right.
+        if not self.ordered:
+            raise ValueError("searchsorted requires an ordered Categorical.")
+
+        from pandas.core.series import Series	# Local import to avoid circular ref
+        values_as_codes = self.categories.values.searchsorted(Series(v).values, side)
+        indices = self.codes.searchsorted(values_as_codes, sorter=sorter)
+        return indices
+
 
     def isnull(self):
         """
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index dc82abfb40e02..8700647d275df 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -882,13 +882,24 @@ def test_nbytes(self):
         self.assertEqual(cat.nbytes, exp)
 
     def test_searchsorted(self):
+        cats1 = ['apple', 'bread', 'bread', 'cheese', 'milk' ]
+        cats2 = ['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]
+
+        for values in ( 'bread', ['bread'], ['bread','eggs'] ):
+            for side in ( 'left', 'right' ):
+                for cats, sorter in [ (cats1, None), (cats2, [0,1,2,3,5,4] ) ]:
+                    s = pd.Series(cats)
+                    c = pd.Categorical(cats)
+                    # print("values=%r, side=%r, sorter=%r" % (values, side, sorter))
+                    catRes = c.searchsorted(values, side=side, sorter=sorter)
+                    seriesRes = s.searchsorted(values, side=side, sorter=sorter)
+                    #print("--> %r" % (catRes,))
+                    assert type(catRes) == type(seriesRes)
+                    if isinstance( catRes, np.ndarray  ):
+                        self.assertTrue( (catRes - seriesRes == 0).all() )
+                    else:
+                        self.assertEqual(catRes, seriesRes)
 
-        # See https://github.com/pydata/pandas/issues/8420
-        # TODO: implement me...
-        cat = pd.Categorical([1,2,3])
-        def f():
-            cat.searchsorted(3)
-        self.assertRaises(NotImplementedError, f)
 
     def test_deprecated_labels(self):
         # TODO: labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier

From 769cf88549627dc1668c3561de4b0d5c80a5ec58 Mon Sep 17 00:00:00 2001
From: SteveSimmons <mail@stevesimmons.com>
Date: Sat, 29 Nov 2014 16:03:23 +0000
Subject: [PATCH 2/5] Fix text in comment

---
 pandas/core/categorical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 956c11d7d6429..02376e1883f15 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -821,7 +821,7 @@ def searchsorted(self, v, side='left', sorter=None):
         array([3, 4])	    # eggs before milk
         >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
         >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
-        array([3, 5])       # eggs before donuts, after switching milk and donuts 
+        array([3, 5])       # eggs after donuts, after switching milk and donuts 
         """
         # Fixes https://github.com/pydata/pandas/issues/8420
         # Uses searchsorted twice, first to map the value to one of the codes,

From 300129500ff80e61fe049eaf6f60ba511161a95a Mon Sep 17 00:00:00 2001
From: Stephen Simmons <mail@stevesimmons.com>
Date: Sun, 30 Nov 2014 23:01:48 +0000
Subject: [PATCH 3/5] Incorporate categorical.searchsorted() feedback from
 jreback

---
 pandas/core/categorical.py       | 12 +-----
 pandas/tests/test_categorical.py | 67 ++++++++++++++++++++++++--------
 2 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 02376e1883f15..03cd44ebf34b5 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -823,19 +823,11 @@ def searchsorted(self, v, side='left', sorter=None):
         >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
         array([3, 5])       # eggs after donuts, after switching milk and donuts 
         """
-        # Fixes https://github.com/pydata/pandas/issues/8420
-        # Uses searchsorted twice, first to map the value to one of the codes,
-        # then to map the found code to the index into the Categorical.
-        # 'side' gets applied to the first one only, otherwise when side='right'
-        # any non-matching values jump too far to the right.
         if not self.ordered:
             raise ValueError("searchsorted requires an ordered Categorical.")
 
-        from pandas.core.series import Series	# Local import to avoid circular ref
-        values_as_codes = self.categories.values.searchsorted(Series(v).values, side)
-        indices = self.codes.searchsorted(values_as_codes, sorter=sorter)
-        return indices
-
+        values_as_codes = self.categories.values.searchsorted(np.asarray(v), side)
+        return self.codes.searchsorted(values_as_codes, sorter=sorter)
 
     def isnull(self):
         """
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 8700647d275df..05fc0c0fec39b 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -882,24 +882,57 @@ def test_nbytes(self):
         self.assertEqual(cat.nbytes, exp)
 
     def test_searchsorted(self):
-        cats1 = ['apple', 'bread', 'bread', 'cheese', 'milk' ]
-        cats2 = ['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]
-
-        for values in ( 'bread', ['bread'], ['bread','eggs'] ):
-            for side in ( 'left', 'right' ):
-                for cats, sorter in [ (cats1, None), (cats2, [0,1,2,3,5,4] ) ]:
-                    s = pd.Series(cats)
-                    c = pd.Categorical(cats)
-                    # print("values=%r, side=%r, sorter=%r" % (values, side, sorter))
-                    catRes = c.searchsorted(values, side=side, sorter=sorter)
-                    seriesRes = s.searchsorted(values, side=side, sorter=sorter)
-                    #print("--> %r" % (catRes,))
-                    assert type(catRes) == type(seriesRes)
-                    if isinstance( catRes, np.ndarray  ):
-                        self.assertTrue( (catRes - seriesRes == 0).all() )
-                    else:
-                        self.assertEqual(catRes, seriesRes)
+        # https://github.com/pydata/pandas/issues/8420
+        s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ])
+        s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
+        c1 = pd.Categorical(s1)
+        c2 = pd.Categorical(s2)
+
+        # Single item array
+        res = c1.searchsorted(['bread'])
+        chk = s1.searchsorted(['bread'])
+        exp = np.array([1])
+        self.assert_numpy_array_equal(res, exp)
+        self.assert_numpy_array_equal(res, chk)
+
+        # Scalar version of the single-item array case.
+        # It is ambiguous what Categorical should return here, because
+        # np.ndarray.searchsorted() returns a scalar for a scalar input
+        # while pd.Series.searchsorted() returns an array.
+        # The result depends on whether Categorical.searchsorted(v)
+        # converts v with np.asarray() (scalar result) or with
+        # pd.Series(v).values (array result).
+        # The test code here follows np.ndarray.searchsorted().
+        # The commented-out lines below follow pd.Series.searchsorted().
+        res = c1.searchsorted('bread')
+        chk = np.array(s1).searchsorted('bread')
+        exp = 1
+        #exp = np.array([1])
+        #chk = s1.searchsorted('bread')
+        #exp = np.array([1])
+        self.assert_numpy_array_equal(res, exp)
+        self.assert_numpy_array_equal(res, chk)
+       
+        # Searching for a value that is not present in the Categorical 
+        res = c1.searchsorted(['bread', 'eggs'])
+        chk = s1.searchsorted(['bread', 'eggs'])
+        exp = np.array([1, 4])
+        self.assert_numpy_array_equal(res, exp)
+        self.assert_numpy_array_equal(res, chk)
 
+        # Searching for a value that is not present, to the right
+        res = c1.searchsorted(['bread', 'eggs'], side='right')
+        chk = s1.searchsorted(['bread', 'eggs'], side='right')
+        exp = np.array([3, 4])	    # eggs before milk
+        self.assert_numpy_array_equal(res, exp)
+        self.assert_numpy_array_equal(res, chk)
+
+        # As above, but with a sorter array to reorder an unsorted array
+        res = c2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
+        chk = s2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
+        exp = np.array([3, 5])       # eggs after donuts, after switching milk and donuts 
+        self.assert_numpy_array_equal(res, exp)
+        self.assert_numpy_array_equal(res, chk)
 
     def test_deprecated_labels(self):
         # TODO: labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier

From ed2cbecf6daa225a6f23ccb312d05fed0e15c7de Mon Sep 17 00:00:00 2001
From: Stephen Simmons <mail@stevesimmons.com>
Date: Sun, 30 Nov 2014 23:08:06 +0000
Subject: [PATCH 4/5] Fix scalar example in docstring for
 Categorical.searchsorted()

---
 pandas/core/categorical.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 03cd44ebf34b5..0e586f22a3190 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -814,6 +814,8 @@ def searchsorted(self, v, side='left', sorter=None):
         [apple, bread, bread, cheese, milk]
         Categories (4, object): [apple < bread < cheese < milk]
         >>> x.searchsorted('bread')
+        1
+        >>> x.searchsorted(['bread'])
         array([1])
         >>> x.searchsorted(['bread', 'eggs'])
         array([1, 4])

From 1b0db931902382d336a339faad6de13681630a7d Mon Sep 17 00:00:00 2001
From: Stephen Simmons <mail@stevesimmons.com>
Date: Tue, 2 Dec 2014 00:09:18 +0000
Subject: [PATCH 5/5] Fix for GH#8944. x.size=='L' returns scalar. Needs to be
 x['size']=='L' to give a boolean array.

---
 doc/source/cookbook.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst
index 8378873db9a65..6e411626ca770 100644
--- a/doc/source/cookbook.rst
+++ b/doc/source/cookbook.rst
@@ -489,9 +489,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
 .. ipython:: python
 
    def GrowUp(x):
-      avg_weight = sum(x[x.size == 'S'].weight * 1.5) 
-      avg_weight += sum(x[x.size == 'M'].weight * 1.25)
-      avg_weight += sum(x[x.size == 'L'].weight)
+      avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) 
+      avg_weight += sum(x[x['size'] == 'M'].weight * 1.25)
+      avg_weight += sum(x[x['size'] == 'L'].weight)
       avg_weight = avg_weight / len(x)
       return pd.Series(['L',avg_weight,True], index=['size', 'weight', 'adult'])