-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Categorical.searchsorted(): use provided categorical order #14697
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1086,10 +1086,15 @@ def searchsorted(self, value, side='left', sorter=None): | |
"ordered one") | ||
|
||
from pandas.core.series import Series | ||
values_as_codes = self.categories.values.searchsorted( | ||
Series(value).values, side=side) | ||
|
||
return self.codes.searchsorted(values_as_codes, sorter=sorter) | ||
values_as_codes = _get_codes_for_values(Series(value).values, | ||
self.categories) | ||
|
||
if -1 in values_as_codes: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do this check after you search; otherwise you end up scanning the data twice. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, could you explain it, please? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See here: https://github.com/pandas-dev/pandas/blob/master/pandas/computation/pytables.py#L203. The idea IS to use searchsorted. Then check the 0's (only). If they are not actual categories, then you would raise. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Isn't searching for 0's in the result of searchsorted() generally the same as searching for -1 among the codes to be inserted? |
||
raise ValueError("Value(s) to be inserted must be in categories.") | ||
|
||
return self.codes.searchsorted(values_as_codes, side=side, | ||
sorter=sorter) | ||
|
||
def isnull(self): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1548,54 +1548,55 @@ def test_memory_usage(self): | |
|
||
def test_searchsorted(self): | ||
# https://github.com/pandas-dev/pandas/issues/8420 | ||
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk']) | ||
s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts']) | ||
c1 = pd.Categorical(s1, ordered=True) | ||
c2 = pd.Categorical(s2, ordered=True) | ||
|
||
# Single item array | ||
res = c1.searchsorted(['bread']) | ||
chk = s1.searchsorted(['bread']) | ||
exp = np.array([1], dtype=np.intp) | ||
self.assert_numpy_array_equal(res, exp) | ||
self.assert_numpy_array_equal(res, chk) | ||
|
||
# Scalar version of single item array | ||
# Categorical return np.array like pd.Series, but different from | ||
# np.array.searchsorted() | ||
res = c1.searchsorted('bread') | ||
chk = s1.searchsorted('bread') | ||
exp = np.array([1], dtype=np.intp) | ||
self.assert_numpy_array_equal(res, exp) | ||
self.assert_numpy_array_equal(res, chk) | ||
# https://github.com/pandas-dev/pandas/issues/14522 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Check these for Series (as well as Categorical). |
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a one-line comment about what the guarantees are here? |
||
c1 = pd.Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'], | ||
categories=['cheese', 'milk', 'apple', 'bread'], | ||
ordered=True) | ||
s1 = pd.Series(c1) | ||
c2 = pd.Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'], | ||
categories=['cheese', 'milk', 'apple', 'bread'], | ||
ordered=False) | ||
s2 = pd.Series(c2) | ||
|
||
# Searching for single item argument, side='left' (default) | ||
res_cat = c1.searchsorted('apple') | ||
res_ser = s1.searchsorted('apple') | ||
exp = np.array([2], dtype=np.intp) | ||
self.assert_numpy_array_equal(res_cat, exp) | ||
self.assert_numpy_array_equal(res_ser, exp) | ||
|
||
# Searching for single item array, side='left' (default) | ||
res_cat = c1.searchsorted(['bread']) | ||
res_ser = s1.searchsorted(['bread']) | ||
exp = np.array([3], dtype=np.intp) | ||
self.assert_numpy_array_equal(res_cat, exp) | ||
self.assert_numpy_array_equal(res_ser, exp) | ||
|
||
# Searching for several items array, side='right' | ||
res_cat = c1.searchsorted(['apple', 'bread'], side='right') | ||
res_ser = s1.searchsorted(['apple', 'bread'], side='right') | ||
exp = np.array([3, 5], dtype=np.intp) | ||
self.assert_numpy_array_equal(res_cat, exp) | ||
self.assert_numpy_array_equal(res_ser, exp) | ||
|
||
# Searching for a value that is not present in the Categorical | ||
res = c1.searchsorted(['bread', 'eggs']) | ||
chk = s1.searchsorted(['bread', 'eggs']) | ||
exp = np.array([1, 4], dtype=np.intp) | ||
self.assert_numpy_array_equal(res, exp) | ||
self.assert_numpy_array_equal(res, chk) | ||
# Searching for a single value that is not from the Categorical | ||
self.assertRaises(ValueError, lambda: c1.searchsorted('cucumber')) | ||
self.assertRaises(ValueError, lambda: s1.searchsorted('cucumber')) | ||
|
||
# Searching for a value that is not present, to the right | ||
res = c1.searchsorted(['bread', 'eggs'], side='right') | ||
chk = s1.searchsorted(['bread', 'eggs'], side='right') | ||
exp = np.array([3, 4], dtype=np.intp) # eggs before milk | ||
self.assert_numpy_array_equal(res, exp) | ||
self.assert_numpy_array_equal(res, chk) | ||
|
||
# As above, but with a sorter array to reorder an unsorted array | ||
res = c2.searchsorted(['bread', 'eggs'], side='right', | ||
sorter=[0, 1, 2, 3, 5, 4]) | ||
chk = s2.searchsorted(['bread', 'eggs'], side='right', | ||
sorter=[0, 1, 2, 3, 5, 4]) | ||
# eggs after donuts, after switching milk and donuts | ||
exp = np.array([3, 5], dtype=np.intp) | ||
self.assert_numpy_array_equal(res, exp) | ||
self.assert_numpy_array_equal(res, chk) | ||
# Searching for multiple values, one of which is not from the Categorical | ||
self.assertRaises(ValueError, | ||
lambda: c1.searchsorted(['bread', 'cucumber'])) | ||
self.assertRaises(ValueError, | ||
lambda: s1.searchsorted(['bread', 'cucumber'])) | ||
|
||
# searchsorted call for unordered Categorical | ||
self.assertRaises(ValueError, lambda: c2.searchsorted('apple')) | ||
self.assertRaises(ValueError, lambda: s2.searchsorted('apple')) | ||
|
||
with tm.assert_produces_warning(FutureWarning): | ||
res = c1.searchsorted(v=['bread']) | ||
exp = np.array([1], dtype=np.intp) | ||
exp = np.array([3], dtype=np.intp) | ||
tm.assert_numpy_array_equal(res, exp) | ||
|
||
def test_deprecated_labels(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you move to 0.20.0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
and put under Other API changes