-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
PERF: __contains__ method for Categorical #21022
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
4836a29
40f1cac
5f8b5e0
c5387a7
a3550e8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -193,3 +193,47 @@ def time_categorical_series_is_monotonic_increasing(self): | |
|
||
def time_categorical_series_is_monotonic_decreasing(self):
    """Time evaluation of ``is_monotonic_decreasing`` on ``self.s``.

    ``self.s`` is created outside this block (presumably in the benchmark's
    ``setup``; confirm against the enclosing class).
    """
    # The attribute access itself is the timed work; the result is discarded.
    self.s.is_monotonic_decreasing
|
||
|
||
class Contains(object):
    """Benchmark ``value in obj`` for Categorical and CategoricalIndex.

    The data holds the letters "a", "b" and "c" with categories "a".."d", so
    the ``value`` parameter covers: a category that occurs ("a", "c"), a
    category that never occurs ("d"), a label that is not a category ("z"),
    and a missing value (``np.nan``). ``has_nan`` controls whether the data
    itself contains missing values.
    """

    params = (["a", "c", "d", "z", np.nan], [True, False])
    param_names = ["value", "has_nan"]

    def setup(self, value, has_nan):
        """Build the CategoricalIndex and Categorical that are timed."""
        n = 10 ** 4
        obj_values = list("a" * n + "b" * n + "c" * n)
        if has_nan:
            # Put a NaN at both ends; dropping the last two letters keeps
            # the total length at 3 * n.
            obj_values = [np.nan] + obj_values[:-2] + [np.nan]

        self.ci = pd.CategoricalIndex(obj_values, categories=list("abcd"))
        self.cat = pd.Categorical(obj_values, categories=list("abcd"))

    def time_contains_index(self, value, has_nan):
        """Time a membership test against the CategoricalIndex."""
        value in self.ci

    def time_cat_isin(self, value, has_nan):
        """Time a membership test against the Categorical."""
        value in self.cat
|
||
|
||
class Indexing(object):
    """Benchmark ``.loc`` label lookups on objects with a CategoricalIndex.

    ``value`` is the label looked up; ``has_nan`` controls whether the index
    contains missing values.
    """

    params = (["a", "c"], [True, False])
    param_names = ["value", "has_nan"]

    def setup(self, value, has_nan):
        """Build the DataFrame and Series that are timed."""
        n = 10 ** 4
        obj_values = list("a" * n + "b" * n + "c" * n)
        if has_nan:
            # Put a NaN at both ends; dropping the last two letters keeps
            # the total length at 3 * n.
            obj_values = [np.nan] + obj_values[:-2] + [np.nan]

        # Review feedback on this benchmark: do not assign to self what the
        # timed methods never read. The index is therefore a local, and the
        # standalone Categorical that used to be built here is gone.
        ci = pd.CategoricalIndex(obj_values, categories=list("abcd"))
        self.df = pd.DataFrame(dict(A=range(n * 3)), index=ci)
        self.ser = pd.Series(range(n * 3), index=ci)

    def time_loc_df(self, value, has_nan):
        """Time a label lookup on the DataFrame."""
        self.df.loc[value]

    def time_loc_ser(self, value, has_nan):
        """Time a label lookup on the Series."""
        self.ser.loc[value]
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,7 +64,7 @@ Performance Improvements | |
~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
||
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`) | ||
- | ||
- Improved performance of indexing on a Series/DataFrame with a ``CategoricalIndex`` (:issue:`21022`)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add the issue number, use double-backticks on CategoricalIndex |
||
|
||
.. _whatsnew_0240.docs: | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1847,6 +1847,15 @@ def __iter__(self): | |
"""Returns an Iterator over the values of this Categorical.""" | ||
return iter(self.get_values().tolist()) | ||
|
||
def __contains__(self, key):
    """Return True if `key` is one of the values of this Categorical.

    A category counts as contained only when at least one element actually
    uses it. A missing-value `key` is contained when any element is missing.

    Raises
    ------
    TypeError
        If `key` is not hashable.
    """
    # Membership is only defined for hashable keys; fail up front with the
    # same TypeError a hash-based lookup raises.
    hash(key)
    try:
        # One lookup instead of a containment check followed by get_loc.
        loc = self.categories.get_loc(key)
    except (KeyError, TypeError):
        # Not a category, so `key` can only be present as a missing value.
        # NOTE(review): per the discussion on this change (see #21369), the
        # isna()-based check measured about the same speed as the
        # alternative that was considered and was judged to read better.
        return bool(isna(key) and self.isna().any())
    # The category's position is also the code stored for its elements.
    return loc in self.codes
|
||
def _tidy_repr(self, max_vals=10, footer=True): | ||
""" a short repr displaying only max_vals and an optional (but default | ||
footer) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably don't need all of these params; maybe just "a", "d" and np.nan.