Skip to content

PERF: __contains__ method for Categorical #21022

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,3 +193,53 @@ def time_categorical_series_is_monotonic_increasing(self):

def time_categorical_series_is_monotonic_decreasing(self):
    """Time reading ``is_monotonic_decreasing`` on the benchmark Series."""
    series = self.s
    # Bare attribute access: the benchmark measures evaluating the
    # property; the resulting value is intentionally discarded.
    series.is_monotonic_decreasing


class Contains(object):
    """Benchmarks for ``value in obj`` on Categorical/CategoricalIndex.

    Parametrized over the looked-up value and over whether the data
    contains missing values.
    """

    # First axis: the value searched for.
    #   "b"    -> present in the array
    #   "d"    -> declared as a category but used by no element
    #   "z"    -> neither a category nor an element
    #   np.nan -> missing-value lookup
    # Second axis: whether NaNs are placed in the data.
    params = (["b", "d", "z", np.nan], [True, False])
    param_names = ["value", "has_nan"]

    def setup(self, value, has_nan):
        """Build a CategoricalIndex and a Categorical over the same data."""
        n = 10 ** 4
        letters = list("a" * n) + list("b" * n) + list("c" * n)
        if has_nan:
            # Trade the last two elements for a leading and a trailing NaN
            # so that the total length is unchanged.
            letters = [np.nan] + letters[:-2] + [np.nan]

        categories = list("abcd")
        self.ci = pd.CategoricalIndex(letters, categories=categories)
        self.cat = pd.Categorical(letters, categories=list(categories))

    def time_contains_index(self, value, has_nan):
        """Time a membership check against the CategoricalIndex."""
        value in self.ci

    def time_cat_isin(self, value, has_nan):
        """Time a membership check against the Categorical."""
        value in self.cat


class Indexing(object):
    """Benchmarks for ``.loc`` lookups on a CategoricalIndex-backed
    Series and DataFrame.
    """

    # First axis: the label looked up; second axis: whether the index
    # contains NaNs.
    params = (["a", "c"], [True, False])
    param_names = ["value", "has_nan"]

    def setup(self, value, has_nan):
        """Create a DataFrame and a Series sharing one CategoricalIndex."""
        n = 10 ** 4
        labels = list("a" * n) + list("b" * n) + list("c" * n)
        if has_nan:
            # Swap the last two labels for a NaN at each end, keeping the
            # overall length at 3 * n.
            labels = [np.nan] + labels[:-2] + [np.nan]

        index = pd.CategoricalIndex(labels, categories=list("abcd"))
        total = n * 3
        self.df = pd.DataFrame({"A": range(total)}, index=index)
        self.ser = pd.Series(range(total), index=index)

    def time_loc_df(self, value, has_nan):
        """Time label-based row selection on the DataFrame."""
        self.df.loc[value]

    def time_loc_ser(self, value, has_nan):
        """Time label-based selection on the Series."""
        self.ser.loc[value]
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
-
- Improved performance of indexing on a Series/DataFrame with a ``CategoricalIndex`` (:issue:`21022`)

.. _whatsnew_0240.docs:

Expand All @@ -83,7 +83,7 @@ Bug Fixes
Categorical
^^^^^^^^^^^

-
- Fixed an issue where membership checks on ``CategoricalIndex`` with interval values could return false positives (:issue:`21022`)
-
-

Expand Down
13 changes: 13 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1847,6 +1847,19 @@ def __iter__(self):
"""Returns an Iterator over the values of this Categorical."""
return iter(self.get_values().tolist())

def __contains__(self, key):
    """Return True if `key` is an element of this Categorical.

    A key that is merely one of the declared categories, but is used by
    no element, is *not* considered contained.

    Parameters
    ----------
    key : hashable
        The value to look for. A missing value (as determined by
        ``isna``) matches any missing element.

    Returns
    -------
    bool

    Raises
    ------
    TypeError
        If `key` is not hashable.
    """
    # Fail fast on unhashable keys instead of silently returning False.
    hash(key)

    if isna(key):
        return bool(self.isna().any())

    # EAFP: a single lookup resolves the category position. A key that is
    # not a category at all cannot be an element, so a failed lookup means
    # "not contained" rather than an error escaping from ``in``.
    try:
        loc = self.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # ``get_loc`` may return a position, a slice or a boolean mask
    # (e.g. for interval categories); normalise all of them to an array
    # of integer positions before comparing against the codes.
    if isinstance(loc, slice):
        loc = np.arange(len(self.categories))[loc]
    else:
        loc = np.asarray(loc)
        if loc.dtype == bool:
            loc = np.flatnonzero(loc)

    return bool(np.isin(self._codes, loc).any())

def _tidy_repr(self, max_vals=10, footer=True):
""" a short repr displaying only max_vals and an optional (but default
footer)
Expand Down
10 changes: 0 additions & 10 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,20 +323,10 @@ def _reverse_indexer(self):

@Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs)
def __contains__(self, key):
    # NOTE: no docstring here on purpose; the shared docstring is attached
    # by the ``Appender`` decorator.

    # Raise TypeError for unhashable keys rather than returning False.
    hash(key)

    if self.categories._defer_to_indexing:  # e.g. interval categories
        # Being a category is necessary but not sufficient: a category
        # that no element uses must not be reported as contained.
        # Checking the categories first also short-circuits keys that are
        # not categories at all before the values are consulted.
        return key in self.categories and key in self.values

    return key in self.values

@Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
def contains(self, key):
    # NOTE: no docstring here on purpose; the shared docstring is attached
    # by the ``Appender`` decorator.

    # Raise TypeError for unhashable keys rather than returning False.
    hash(key)

    if self.categories._defer_to_indexing:  # e.g. interval categories
        # A key that is only a declared category, with no element using
        # it, is not contained in the index: require a hit in the values
        # as well as in the categories.
        return self.categories.contains(key) and key in self.values

    return key in self.values

def __array__(self, dtype=None):
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,17 @@ def test_contains(self):
list('aabbca') + [np.nan], categories=list('cabdef'))
assert np.nan in ci

ci = CategoricalIndex(
list('aaa'), categories=list('cabdef'))
assert 'f' not in ci

def test_containst_defer_to_indexing(self):
intervals = pd.interval_range(1, 4)
cat = pd.CategoricalIndex(list(intervals[:-1]), categories=intervals)
assert intervals[0] in cat
assert intervals[1] in cat
assert intervals[2] not in cat

def test_min_max(self):

ci = self.create_index(ordered=False)
Expand Down