diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 48f42621d183d..73e3933122628 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -202,7 +202,11 @@ class Contains(object):
     def setup(self):
         N = 10**5
         self.ci = tm.makeCategoricalIndex(N)
-        self.cat = self.ci.categories[0]
+        self.c = self.ci.values
+        self.key = self.ci.categories[0]
 
-    def time_contains(self):
-        self.cat in self.ci
+    def time_categorical_index_contains(self):
+        self.key in self.ci
+
+    def time_categorical_contains(self):
+        self.key in self.c
diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 0f2c9c4756987..5454dc9eca360 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -26,7 +26,7 @@ Performance Improvements
 
 - Improved performance of membership checks in :class:`CategoricalIndex`
   (i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains`
-  is likewise much faster (:issue:`21369`)
+  is likewise much faster (:issue:`21369`, :issue:`21508`)
 - Improved performance of :meth:`MultiIndex.is_unique` (:issue:`21522`)
 -
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index e22b0d626a218..7b3cce0f2585d 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -157,6 +157,57 @@ def _maybe_to_categorical(array):
     return array
 
 
+def contains(cat, key, container):
+    """
+    Helper for membership check for ``key`` in ``cat``.
+
+    This is a helper method for :meth:`Categorical.__contains__`
+    and :meth:`CategoricalIndex.__contains__`.
+
+    Returns True if ``key`` is in ``cat.categories`` and the
+    location of ``key`` in ``categories`` is in ``container``.
+
+    Parameters
+    ----------
+    cat : :class:`Categorical` or :class:`CategoricalIndex`
+    key : a hashable object
+        The key to check membership for.
+    container : Container (e.g. list-like or mapping)
+        The container to check for membership in.
+
+    Returns
+    -------
+    is_in : bool
+        True if ``key`` is in ``cat.categories`` and location of
+        ``key`` in ``categories`` is in ``container``, else False.
+
+    Notes
+    -----
+    This method does not check for NaN values. Do that separately
+    before calling this method.
+    """
+    hash(key)  # fail fast with TypeError on unhashable keys
+
+    # get location of key in categories.
+    # If a KeyError, the key isn't in categories, so logically
+    # can't be in container either.
+    try:
+        loc = cat.categories.get_loc(key)
+    except KeyError:
+        return False
+
+    # loc is the location of key in categories, but also the *value*
+    # for key in container. So, `key` may be in categories,
+    # but still not in `container`. Example ('b' in categories,
+    # but not in values):
+    # 'b' in Categorical(['a'], categories=['a', 'b'])  # False
+    if is_scalar(loc):
+        return loc in container
+    else:
+        # if categories is an IntervalIndex, loc is an array.
+        return any(loc_ in container for loc_ in loc)
+
+
 _codes_doc = """The category codes of this categorical.
 
 Level codes are an array if integer which are the positions of the real
@@ -1846,6 +1897,14 @@ def __iter__(self):
         """Returns an Iterator over the values of this Categorical."""
         return iter(self.get_values().tolist())
 
+    def __contains__(self, key):
+        """Returns True if `key` is in this Categorical."""
+        # Guard with is_scalar: isna(list-like) returns an array, not a bool.
+        if is_scalar(key) and isna(key):
+            return self.isna().any()
+
+        return contains(self, key, container=self._codes)
+
     def _tidy_repr(self, max_vals=10, footer=True):
         """ a short repr displaying only max_vals and an optional (but default
         footer)
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 0093d4940751e..fc669074758da 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -24,6 +24,7 @@
 import pandas.core.common as com
 import pandas.core.missing as missing
 import pandas.core.indexes.base as ibase
+from pandas.core.arrays.categorical import Categorical, contains
 
 _index_doc_kwargs = dict(ibase._index_doc_kwargs)
 _index_doc_kwargs.update(dict(target_klass='CategoricalIndex'))
@@ -125,7 +126,6 @@ def _create_from_codes(self, codes, categories=None, ordered=None,
         CategoricalIndex
         """
 
-        from pandas.core.arrays import Categorical
         if categories is None:
             categories = self.categories
         if ordered is None:
@@ -162,7 +162,6 @@ def _create_categorical(self, data, categories=None, ordered=None,
         if not isinstance(data, ABCCategorical):
             if ordered is None and dtype is None:
                 ordered = False
-            from pandas.core.arrays import Categorical
             data = Categorical(data, categories=categories,
                                ordered=ordered, dtype=dtype)
         else:
@@ -323,32 +322,14 @@ def _reverse_indexer(self):
 
     @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs)
     def __contains__(self, key):
-        hash(key)
-
-        if isna(key):  # if key is a NaN, check if any NaN is in self.
+        # Guard with is_scalar: isna(list-like) returns an array, not a bool.
+        if is_scalar(key) and isna(key):
             return self.hasnans
 
-        # is key in self.categories? Then get its location.
-        # If not (i.e. KeyError), it logically can't be in self either
-        try:
-            loc = self.categories.get_loc(key)
-        except KeyError:
-            return False
-
-        # loc is the location of key in self.categories, but also the value
-        # for key in self.codes and in self._engine. key may be in categories,
-        # but still not in self, check this. Example:
-        # 'b' in CategoricalIndex(['a'], categories=['a', 'b'])  # False
-        if is_scalar(loc):
-            return loc in self._engine
-        else:
-            # if self.categories is IntervalIndex, loc is an array
-            # check if any scalar of the array is in self._engine
-            return any(loc_ in self._engine for loc_ in loc)
+        return contains(self, key, container=self._engine)
 
     @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
     def contains(self, key):
-        hash(key)
         return key in self
 
     def __array__(self, dtype=None):
@@ -479,7 +460,6 @@ def where(self, cond, other=None):
             other = self._na_value
         values = np.where(cond, self.values, other)
 
-        from pandas.core.arrays import Categorical
         cat = Categorical(values,
                           categories=self.categories,
                           ordered=self.ordered)
@@ -862,7 +842,6 @@ def _delegate_method(self, name, *args, **kwargs):
     def _add_accessors(cls):
         """ add in Categorical accessor methods """
 
-        from pandas.core.arrays import Categorical
         CategoricalIndex._add_delegate_accessors(
             delegate=Categorical, accessors=["rename_categories",
                                              "reorder_categories",
diff --git a/pandas/tests/categorical/test_operators.py b/pandas/tests/categorical/test_operators.py
index fa8bb817616e4..a26de32d7446c 100644
--- a/pandas/tests/categorical/test_operators.py
+++ b/pandas/tests/categorical/test_operators.py
@@ -291,3 +291,23 @@ def test_numeric_like_ops(self):
 
         # invalid ufunc
         pytest.raises(TypeError, lambda: np.log(s))
+
+    def test_contains(self):
+        # GH21508
+        c = pd.Categorical(list('aabbca'), categories=list('cab'))
+
+        assert 'b' in c
+        assert 'z' not in c
+        assert np.nan not in c
+        with pytest.raises(TypeError):
+            assert [1] in c
+        with pytest.raises(TypeError):
+            # multi-element list-likes must not reach the NaN check
+            assert [1, 2] in c
+
+        # assert codes NOT in categorical
+        assert 0 not in c
+        assert 1 not in c
+
+        c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab'))
+        assert np.nan in c