diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 8378873db9a65..6e411626ca770 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -489,9 +489,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python def GrowUp(x): - avg_weight = sum(x[x.size == 'S'].weight * 1.5) - avg_weight += sum(x[x.size == 'M'].weight * 1.25) - avg_weight += sum(x[x.size == 'L'].weight) + avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) + avg_weight += sum(x[x['size'] == 'M'].weight * 1.25) + avg_weight += sum(x[x['size'] == 'L'].weight) avg_weight = avg_weight / len(x) return pd.Series(['L',avg_weight,True], index=['size', 'weight', 'adult']) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index eb0429ad4a0cd..0e586f22a3190 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -776,7 +776,60 @@ def nbytes(self): return self._codes.nbytes + self._categories.values.nbytes def searchsorted(self, v, side='left', sorter=None): - raise NotImplementedError("See https://github.com/pydata/pandas/issues/8420") + """Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted Categorical `self` such that, if the + corresponding elements in `v` were inserted before the indices, the + order of `self` would be preserved. + + Parameters + ---------- + v : array_like + Array-like values or a scalar value, to insert/search for in `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `v`. 
+ + See Also + -------- + Series.searchsorted + numpy.searchsorted + + Notes + ----- + Binary search is used to find the required insertion points. + + Examples + -------- + >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk' ]) + [apple, bread, bread, cheese, milk] + Categories (4, object): [apple < bread < cheese < milk] + >>> x.searchsorted('bread') + 1 + >>> x.searchsorted(['bread']) + array([1]) + >>> x.searchsorted(['bread', 'eggs']) + array([1, 4]) + >>> x.searchsorted(['bread', 'eggs'], side='right') + array([3, 4]) # eggs before milk + >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) + >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + array([3, 5]) # eggs after donuts, after switching milk and donuts + """ + if not self.ordered: + raise ValueError("searchsorted requires an ordered Categorical.") + + values_as_codes = self.categories.values.searchsorted(np.asarray(v), side) + return self.codes.searchsorted(values_as_codes, sorter=sorter) def isnull(self): """ diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index dc82abfb40e02..05fc0c0fec39b 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -882,13 +882,57 @@ def test_nbytes(self): self.assertEqual(cat.nbytes, exp) def test_searchsorted(self): + # https://github.com/pydata/pandas/issues/8420 + s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ]) + s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) + c1 = pd.Categorical(s1) + c2 = pd.Categorical(s2) + + # Single item array + res = c1.searchsorted(['bread']) + chk = s1.searchsorted(['bread']) + exp = np.array([1]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # Scalar version of single item array + # Ambiguous what Categorical should return as np.array returns + # a scalar and pd.Series returns an array. 
+ # We get different results depending on whether + # Categorical.searchsorted(v) passes v through np.asarray() + # or pd.Series(v).values. The former returns scalar, the + # latter an array. + # Test code here follows np.array.searchsorted(). + # Commented out lines below follow pd.Series. + res = c1.searchsorted('bread') + chk = np.array(s1).searchsorted('bread') + exp = 1 + #exp = np.array([1]) + #chk = s1.searchsorted('bread') + #exp = np.array([1]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # Searching for a value that is not present in the Categorical + res = c1.searchsorted(['bread', 'eggs']) + chk = s1.searchsorted(['bread', 'eggs']) + exp = np.array([1, 4]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) - # See https://github.com/pydata/pandas/issues/8420 - # TODO: implement me... - cat = pd.Categorical([1,2,3]) - def f(): - cat.searchsorted(3) - self.assertRaises(NotImplementedError, f) + # Searching for a value that is not present, to the right + res = c1.searchsorted(['bread', 'eggs'], side='right') + chk = s1.searchsorted(['bread', 'eggs'], side='right') + exp = np.array([3, 4]) # eggs before milk + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # As above, but with a sorter array to reorder an unsorted array + res = c2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + chk = s2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + exp = np.array([3, 5]) # eggs after donuts, after switching milk and donuts + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) def test_deprecated_labels(self): # TODO: labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier