From 0047759a9be4b1e2b4490fb885ae8dd458c72147 Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Sat, 29 Nov 2014 15:42:43 +0000 Subject: [PATCH 1/5] Implement Categorical.searchsorted(v, side, sorter) --- pandas/core/categorical.py | 61 +++++++++++++++++++++++++++++++- pandas/tests/test_categorical.py | 23 ++++++++---- 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index eb0429ad4a0cd..956c11d7d6429 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -776,7 +776,66 @@ def nbytes(self): return self._codes.nbytes + self._categories.values.nbytes def searchsorted(self, v, side='left', sorter=None): - raise NotImplementedError("See https://github.com/pydata/pandas/issues/8420") + """Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted Categorical `self` such that, if the + corresponding elements in `v` were inserted before the indices, the + order of `self` would be preserved. + + Parameters + ---------- + v : array_like + Array-like values or a scalar value, to insert/search for in `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `v`. + + See Also + -------- + Series.searchsorted + numpy.searchsorted + + Notes + ----- + Binary search is used to find the required insertion points. 
+ + Examples + -------- + >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk' ]) + [apple, bread, bread, cheese, milk] + Categories (4, object): [apple < bread < cheese < milk] + >>> x.searchsorted('bread') + array([1]) + >>> x.searchsorted(['bread', 'eggs']) + array([1, 4]) + >>> x.searchsorted(['bread', 'eggs'], side='right') + array([3, 4]) # eggs before milk + >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) + >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + array([3, 5]) # eggs before donuts, after switching milk and donuts + """ + # Fixes https://github.com/pydata/pandas/issues/8420 + # Uses searchsorted twice, first to map the value to one of the codes, + # then to map the found code to the index into the Categorical. + # 'side' gets applied to the first one only, otherwise when side='right' + # any non-matching values jump too far to the right. + if not self.ordered: + raise ValueError("searchsorted requires an ordered Categorical.") + + from pandas.core.series import Series # Local import to avoid circular ref + values_as_codes = self.categories.values.searchsorted(Series(v).values, side) + indices = self.codes.searchsorted(values_as_codes, sorter=sorter) + return indices + def isnull(self): """ diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index dc82abfb40e02..8700647d275df 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -882,13 +882,24 @@ def test_nbytes(self): self.assertEqual(cat.nbytes, exp) def test_searchsorted(self): + cats1 = ['apple', 'bread', 'bread', 'cheese', 'milk' ] + cats2 = ['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ] + + for values in ( 'bread', ['bread'], ['bread','eggs'] ): + for side in ( 'left', 'right' ): + for cats, sorter in [ (cats1, None), (cats2, [0,1,2,3,5,4] ) ]: + s = pd.Series(cats) + c = pd.Categorical(cats) + # print("values=%r, side=%r, sorter=%r" % (values, 
side, sorter)) + catRes = c.searchsorted(values, side=side, sorter=sorter) + seriesRes = s.searchsorted(values, side=side, sorter=sorter) + #print("--> %r" % (catRes,)) + assert type(catRes) == type(seriesRes) + if isinstance( catRes, np.ndarray ): + self.assertTrue( (catRes - seriesRes == 0).all() ) + else: + self.assertEqual(catRes, seriesRes) - # See https://github.com/pydata/pandas/issues/8420 - # TODO: implement me... - cat = pd.Categorical([1,2,3]) - def f(): - cat.searchsorted(3) - self.assertRaises(NotImplementedError, f) def test_deprecated_labels(self): # TODO: labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier From 769cf88549627dc1668c3561de4b0d5c80a5ec58 Mon Sep 17 00:00:00 2001 From: SteveSimmons Date: Sat, 29 Nov 2014 16:03:23 +0000 Subject: [PATCH 2/5] Fix text in comment --- pandas/core/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 956c11d7d6429..02376e1883f15 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -821,7 +821,7 @@ def searchsorted(self, v, side='left', sorter=None): array([3, 4]) # eggs before milk >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) - array([3, 5]) # eggs before donuts, after switching milk and donuts + array([3, 5]) # eggs after donuts, after switching milk and donuts """ # Fixes https://github.com/pydata/pandas/issues/8420 # Uses searchsorted twice, first to map the value to one of the codes, From 300129500ff80e61fe049eaf6f60ba511161a95a Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Sun, 30 Nov 2014 23:01:48 +0000 Subject: [PATCH 3/5] Incorporate categorical.searchsorted() feedback from jreback --- pandas/core/categorical.py | 12 +----- pandas/tests/test_categorical.py | 67 ++++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 27 deletions(-) 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 02376e1883f15..03cd44ebf34b5 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -823,19 +823,11 @@ def searchsorted(self, v, side='left', sorter=None): >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) array([3, 5]) # eggs after donuts, after switching milk and donuts """ - # Fixes https://github.com/pydata/pandas/issues/8420 - # Uses searchsorted twice, first to map the value to one of the codes, - # then to map the found code to the index into the Categorical. - # 'side' gets applied to the first one only, otherwise when side='right' - # any non-matching values jump too far to the right. if not self.ordered: raise ValueError("searchsorted requires an ordered Categorical.") - from pandas.core.series import Series # Local import to avoid circular ref - values_as_codes = self.categories.values.searchsorted(Series(v).values, side) - indices = self.codes.searchsorted(values_as_codes, sorter=sorter) - return indices - + values_as_codes = self.categories.values.searchsorted(np.asarray(v), side) + return self.codes.searchsorted(values_as_codes, sorter=sorter) def isnull(self): """ diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 8700647d275df..05fc0c0fec39b 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -882,24 +882,57 @@ def test_nbytes(self): self.assertEqual(cat.nbytes, exp) def test_searchsorted(self): - cats1 = ['apple', 'bread', 'bread', 'cheese', 'milk' ] - cats2 = ['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ] - - for values in ( 'bread', ['bread'], ['bread','eggs'] ): - for side in ( 'left', 'right' ): - for cats, sorter in [ (cats1, None), (cats2, [0,1,2,3,5,4] ) ]: - s = pd.Series(cats) - c = pd.Categorical(cats) - # print("values=%r, side=%r, sorter=%r" % (values, side, sorter)) - catRes = c.searchsorted(values, side=side, sorter=sorter) - 
seriesRes = s.searchsorted(values, side=side, sorter=sorter) - #print("--> %r" % (catRes,)) - assert type(catRes) == type(seriesRes) - if isinstance( catRes, np.ndarray ): - self.assertTrue( (catRes - seriesRes == 0).all() ) - else: - self.assertEqual(catRes, seriesRes) + # https://github.com/pydata/pandas/issues/8420 + s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ]) + s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) + c1 = pd.Categorical(s1) + c2 = pd.Categorical(s2) + + # Single item array + res = c1.searchsorted(['bread']) + chk = s1.searchsorted(['bread']) + exp = np.array([1]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # Scalar version of single item array + # Ambiguous what Categorical should return as np.array returns + # a scalar and pd.Series returns an array. + # We get different results depending on whether + # Categorical.searchsorted(v) passes v through np.asarray() + # or pd.Series(v).values. The former returns scalar, the + # latter an array. + # Test code here follows np.array.searchsorted(). + # Commented out lines below follow pd.Series. 
+ res = c1.searchsorted('bread') + chk = np.array(s1).searchsorted('bread') + exp = 1 + #exp = np.array([1]) + #chk = s1.searchsorted('bread') + #exp = np.array([1]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # Searching for a value that is not present in the Categorical + res = c1.searchsorted(['bread', 'eggs']) + chk = s1.searchsorted(['bread', 'eggs']) + exp = np.array([1, 4]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + # Searching for a value that is not present, to the right + res = c1.searchsorted(['bread', 'eggs'], side='right') + chk = s1.searchsorted(['bread', 'eggs'], side='right') + exp = np.array([3, 4]) # eggs before milk + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # As above, but with a sorter array to reorder an unsorted array + res = c2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + chk = s2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + exp = np.array([3, 5]) # eggs after donuts, after switching milk and donuts + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) def test_deprecated_labels(self): # TODO: labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier From ed2cbecf6daa225a6f23ccb312d05fed0e15c7de Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Sun, 30 Nov 2014 23:08:06 +0000 Subject: [PATCH 4/5] Fix scalar example in docstring for Categorical.searchsorted() --- pandas/core/categorical.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 03cd44ebf34b5..0e586f22a3190 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -814,6 +814,8 @@ def searchsorted(self, v, side='left', sorter=None): [apple, bread, bread, cheese, milk] Categories (4, object): [apple < bread < cheese < milk] >>> x.searchsorted('bread') + 1 + >>> 
x.searchsorted(['bread']) array([1]) >>> x.searchsorted(['bread', 'eggs']) array([1, 4]) From 1b0db931902382d336a339faad6de13681630a7d Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Tue, 2 Dec 2014 00:09:18 +0000 Subject: [PATCH 5/5] Fix for GH#8944. x.size=='L' returns scalar. Needs to be x['size']=='L' to give a boolean array. --- doc/source/cookbook.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 8378873db9a65..6e411626ca770 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -489,9 +489,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python def GrowUp(x): - avg_weight = sum(x[x.size == 'S'].weight * 1.5) - avg_weight += sum(x[x.size == 'M'].weight * 1.25) - avg_weight += sum(x[x.size == 'L'].weight) + avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) + avg_weight += sum(x[x['size'] == 'M'].weight * 1.25) + avg_weight += sum(x[x['size'] == 'L'].weight) avg_weight = avg_weight / len(x) return pd.Series(['L',avg_weight,True], index=['size', 'weight', 'adult'])