diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 5464e7cba22c3..48f42621d183d 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -193,3 +193,16 @@ def time_categorical_series_is_monotonic_increasing(self): def time_categorical_series_is_monotonic_decreasing(self): self.s.is_monotonic_decreasing + + +class Contains(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 + self.ci = tm.makeCategoricalIndex(N) + self.cat = self.ci.categories[0] + + def time_contains(self): + self.cat in self.ci diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 3e4326dea2ecc..1ac6e21adc46d 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -24,7 +24,9 @@ Fixed Regressions Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Improved performance of membership checks in :class:`CategoricalIndex` + (i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains` + is likewise much faster (:issue:`21369`) - Documentation Changes diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 587090fa72def..7f2860a963423 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -325,19 +325,31 @@ def _reverse_indexer(self): def __contains__(self, key): hash(key) - if self.categories._defer_to_indexing: - return key in self.categories + if isna(key): # if key is a NaN, check if any NaN is in self. + return self.isna().any() + + # is key in self.categories? Then get its location. + # If not (i.e. KeyError), it logically can't be in self either + try: + loc = self.categories.get_loc(key) + except KeyError: + return False - return key in self.values + # loc is the location of key in self.categories, but also the value + # for key in self.codes and in self._engine. key may be in categories, + # but still not in self, check this. Example: + # 'b' in CategoricalIndex(['a'], categories=['a', 'b']) # False + if is_scalar(loc): + return loc in self._engine + else: + # if self.categories is IntervalIndex, loc is an array + # check if any scalar of the array is in self._engine + return any(loc_ in self._engine for loc_ in loc) @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def contains(self, key): hash(key) - - if self.categories._defer_to_indexing: - return self.categories.contains(key) - - return key in self.values + return key in self def __array__(self, dtype=None): """ the array interface, return my values """