diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 5464e7cba22c3..41460eaf47699 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -193,3 +193,53 @@ def time_categorical_series_is_monotonic_increasing(self): def time_categorical_series_is_monotonic_decreasing(self): self.s.is_monotonic_decreasing + + +class Contains(object): + + params = ([ + "b", # in array + "d", # in categories but not in codes + "z", # nowhere + np.nan, + ], + [True, False], + ) + param_names = ["value", "has_nan"] + + def setup(self, value, has_nan): + n = 1 * 10 ** 4 + obj_values = list("a" * n + "b" * n + "c" * n) + if has_nan: + obj_values = [np.nan] + obj_values[:-2] + [np.nan] + + self.ci = pd.CategoricalIndex(obj_values, categories=list("abcd")) + self.cat = pd.Categorical(obj_values, categories=list("abcd")) + + def time_contains_index(self, value, has_nan): + value in self.ci + + def time_cat_isin(self, value, has_nan): + value in self.cat + + +class Indexing(object): + + params = (["a", "c"], [True, False]) + param_names = ["value", "has_nan"] + + def setup(self, value, has_nan): + n = 1 * 10 ** 4 + obj_values = list("a" * n + "b" * n + "c" * n) + if has_nan: + obj_values = [np.nan] + obj_values[:-2] + [np.nan] + + ci = pd.CategoricalIndex(obj_values, categories=list("abcd")) + self.df = pd.DataFrame(dict(A=range(n * 3)), index=ci) + self.ser = pd.Series(range(n * 3), index=ci) + + def time_loc_df(self, value, has_nan): + self.df.loc[value] + + def time_loc_ser(self, value, has_nan): + self.ser.loc[value] diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c69de149a0f35..4158edf683c89 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -64,7 +64,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) -- +- Improved performance of indexing on 
a Series/DataFrame with a ``CategoricalIndex`` (:issue:`21022`) .. _whatsnew_0240.docs: @@ -83,7 +83,7 @@ Bug Fixes Categorical ^^^^^^^^^^^ -- +- Fixed an issue where membership checks on a ``CategoricalIndex`` with interval values could return false positives (:issue:`21022`) - - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 30f9c56d24f02..3791f854ea7c2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1847,6 +1847,19 @@ def __iter__(self): """Returns an Iterator over the values of this Categorical.""" return iter(self.get_values().tolist()) + def __contains__(self, key): + """Returns True if `key` is in this Categorical.""" + hash(key) + if isna(key): + return self.isna().any() + elif self.categories._defer_to_indexing: # e.g. Interval values + loc = self.categories.get_loc(key) + return np.isin(self.codes, loc).any() + elif key in self.categories: + return self.categories.get_loc(key) in self._codes + else: + return False + def _tidy_repr(self, max_vals=10, footer=True): """ a short repr displaying only max_vals and an optional (but default footer) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 150eca32e229d..8722170ac41d4 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -323,20 +323,10 @@ def _reverse_indexer(self): @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): - hash(key) - - if self.categories._defer_to_indexing: - return key in self.categories - return key in self.values @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def contains(self, key): - hash(key) - - if self.categories._defer_to_indexing: - return self.categories.contains(key) - return key in self.values def __array__(self, dtype=None): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index a2a4170256088..88d76210da8ba 100644 --- 
a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -244,6 +244,17 @@ def test_contains(self): list('aabbca') + [np.nan], categories=list('cabdef')) assert np.nan in ci + ci = CategoricalIndex( + list('aaa'), categories=list('cabdef')) + assert 'f' not in ci + + def test_containst_defer_to_indexing(self): + intervals = pd.interval_range(1, 4) + cat = pd.CategoricalIndex(list(intervals[:-1]), categories=intervals) + assert intervals[0] in cat + assert intervals[1] in cat + assert intervals[2] not in cat + def test_min_max(self): ci = self.create_index(ordered=False)