From b44614bd20450e4cab7e79af35f87008235ef810 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 16 Jun 2018 15:27:57 +0100 Subject: [PATCH 1/5] add Categorical.__contains__ --- asv_bench/benchmarks/categoricals.py | 10 ++++--- pandas/core/arrays/categorical.py | 37 ++++++++++++++++++++++++++ pandas/core/indexes/category.py | 24 +++-------------- pandas/tests/categorical/test_algos.py | 17 ++++++++++++ 4 files changed, 64 insertions(+), 24 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 48f42621d183d..73e3933122628 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -202,7 +202,11 @@ class Contains(object): def setup(self): N = 10**5 self.ci = tm.makeCategoricalIndex(N) - self.cat = self.ci.categories[0] + self.c = self.ci.values + self.key = self.ci.categories[0] - def time_contains(self): - self.cat in self.ci + def time_categorical_index_contains(self): + self.key in self.ci + + def time_categorical_contains(self): + self.key in self.c diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e22b0d626a218..1465bb51633b5 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1846,6 +1846,43 @@ def __iter__(self): """Returns an Iterator over the values of this Categorical.""" return iter(self.get_values().tolist()) + @staticmethod + def _contains(key, categories, container): + """Returns True if `key` is in `categories` and the + location of `key` in `categories` is in `container`. + + This is a helper method used in :method:`Categorical.__contains__` + and in :class:`CategoricalIndex.__contains__`. + """ + + # is key in categories? Then get its location in categories. + # If not (i.e. KeyError), its location logically can't be in + # container either. 
+ try: + loc = categories.get_loc(key) + except KeyError: + return False + + # loc is the location of key in categories, but also the value + # for key in container. So, key may be in categories, + # but still not in container, this must be checked. Example: + # 'b' in Categorical(['a'], categories=['a', 'b']) # False + if is_scalar(loc): + return loc in container + else: + # if categories is an IntervalIndex, loc is an array. + # Check if any scalar of the array is in the container + return any(loc_ in container for loc_ in loc) + + def __contains__(self, key): + """Returns True if `key` is in this Categorical.""" + hash(key) + + if isna(key): # if key is a NaN, check if any NaN is in self. + return self.isna().any() + + return self._contains(key, self.categories, container=self._codes) + def _tidy_repr(self, max_vals=10, footer=True): """ a short repr displaying only max_vals and an optional (but default footer) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 0093d4940751e..18fe7ff79e804 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -24,6 +24,7 @@ import pandas.core.common as com import pandas.core.missing as missing import pandas.core.indexes.base as ibase +from pandas.core.arrays import Categorical _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) @@ -125,7 +126,6 @@ def _create_from_codes(self, codes, categories=None, ordered=None, CategoricalIndex """ - from pandas.core.arrays import Categorical if categories is None: categories = self.categories if ordered is None: @@ -162,7 +162,6 @@ def _create_categorical(self, data, categories=None, ordered=None, if not isinstance(data, ABCCategorical): if ordered is None and dtype is None: ordered = False - from pandas.core.arrays import Categorical data = Categorical(data, categories=categories, ordered=ordered, dtype=dtype) else: @@ -328,23 +327,8 @@ def __contains__(self, key): 
if isna(key): # if key is a NaN, check if any NaN is in self. return self.hasnans - # is key in self.categories? Then get its location. - # If not (i.e. KeyError), it logically can't be in self either - try: - loc = self.categories.get_loc(key) - except KeyError: - return False - - # loc is the location of key in self.categories, but also the value - # for key in self.codes and in self._engine. key may be in categories, - # but still not in self, check this. Example: - # 'b' in CategoricalIndex(['a'], categories=['a', 'b']) # False - if is_scalar(loc): - return loc in self._engine - else: - # if self.categories is IntervalIndex, loc is an array - # check if any scalar of the array is in self._engine - return any(loc_ in self._engine for loc_ in loc) + return Categorical._contains(key, categories=self.categories, + container=self._engine) @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def contains(self, key): @@ -479,7 +463,6 @@ def where(self, cond, other=None): other = self._na_value values = np.where(cond, self.values, other) - from pandas.core.arrays import Categorical cat = Categorical(values, categories=self.categories, ordered=self.ordered) @@ -862,7 +845,6 @@ def _delegate_method(self, name, *args, **kwargs): def _add_accessors(cls): """ add in Categorical accessor methods """ - from pandas.core.arrays import Categorical CategoricalIndex._add_delegate_accessors( delegate=Categorical, accessors=["rename_categories", "reorder_categories", diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py index dcf2081ae32fe..ee0c49f76b432 100644 --- a/pandas/tests/categorical/test_algos.py +++ b/pandas/tests/categorical/test_algos.py @@ -71,6 +71,23 @@ def test_isin_empty(empty): tm.assert_numpy_array_equal(expected, result) +def test_contains(): + + c = pd.Categorical(list('aabbca'), categories=list('cab')) + + assert 'b' in c + assert 'z' not in c + assert np.nan not in c + + # assert codes NOT in index + assert 0 not 
in c + assert 1 not in c + + c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab')) + + assert np.nan in c + + class TestTake(object): # https://github.com/pandas-dev/pandas/issues/20664 From f5fd77c44f4351e1da7ba3b74d069cabf2d45346 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 17 Jun 2018 10:04:19 +0100 Subject: [PATCH 2/5] Improve doc string of Categorical._contains --- pandas/core/arrays/categorical.py | 30 +++++++++++++++++++++++--- pandas/tests/categorical/test_algos.py | 3 +-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1465bb51633b5..7861a8e9f37c4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1848,11 +1848,35 @@ def __iter__(self): @staticmethod def _contains(key, categories, container): - """Returns True if `key` is in `categories` and the - location of `key` in `categories` is in `container`. + """ + Helper for membership check for ``key``. - This is a helper method used in :method:`Categorical.__contains__` + This helper method is used in :method:`Categorical.__contains__` and in :class:`CategoricalIndex.__contains__`. + + Returns True if ``key`` is in ``categories`` and the + location of ``key`` in ``categories`` is in ``container``. + + Parameters + ---------- + key : a hashable object + The key to check membership for. + categories : Sequence + The possible values for ``key``. The location for ``key`` + in ``categories`` is also its value in ``container`` + container : Container (e.g. list-like or mapping) + The container to check for membership in. + + Returns + ------- + is_in : bool + True if ``key`` is in ``categories`` and location of + ``key`` in ``categories`` is in ``container``, else False. + + Notes + ----- + This method does not check for Nan values. Do that separately + before calling this method. """ # is key in categories? Then get its location in categories. 
diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py index ee0c49f76b432..b7d8eeaa596ff 100644 --- a/pandas/tests/categorical/test_algos.py +++ b/pandas/tests/categorical/test_algos.py @@ -72,7 +72,7 @@ def test_isin_empty(empty): def test_contains(): - + # GH21508 c = pd.Categorical(list('aabbca'), categories=list('cab')) assert 'b' in c @@ -84,7 +84,6 @@ def test_contains(): assert 1 not in c c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab')) - assert np.nan in c From 913790deb9c37d55350cc68e0f1da14f1b5b0337 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 18 Jun 2018 13:36:27 +0100 Subject: [PATCH 3/5] Reimplement Categorical._contains --- pandas/core/arrays/categorical.py | 103 +++++++++++++++--------------- pandas/core/indexes/category.py | 5 +- 2 files changed, 52 insertions(+), 56 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7861a8e9f37c4..5d84d61aef2db 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -157,6 +157,55 @@ def _maybe_to_categorical(array): return array +def contains(cat, key, container): + """ + Helper for membership check for ``key`` in ``cat``. + + This is a helper method for :meth:`Categorical.__contains__` + and :meth:`CategoricalIndex.__contains__`. + + Returns True if ``key`` is in ``cat.categories`` and the + location of ``key`` in ``categories`` is in ``container``. + + Parameters + ---------- + cat : :class:`Categorical` or :class:`CategoricalIndex` + key : a hashable object + The key to check membership for. + container : Container (e.g. list-like or mapping) + The container to check for membership in. + + Returns + ------- + is_in : bool + True if ``key`` is in ``cat.categories`` and location of + ``key`` in ``categories`` is in ``container``, else False. + + Notes + ----- + This method does not check for Nan values. Do that separately + before calling this method.
+ """ + # get location of key in categories. + # If a KeyError, the key isn't in categories, so logically + # can't be in container either. + try: + loc = cat.categories.get_loc(key) + except KeyError: + return False + + # loc is the location of key in categories, but also the *value* + # for key in container. So, `key` may be in categories, + # but still not in `container`. Example ('b' in categories, + # but not in values): + # 'b' in Categorical(['a'], categories=['a', 'b']) # False + if is_scalar(loc): + return loc in container + else: + # if categories is an IntervalIndex, loc is an array. + return any(loc_ in container for loc_ in loc) + + _codes_doc = """The category codes of this categorical. Level codes are an array if integer which are the positions of the real @@ -1846,58 +1895,6 @@ def __iter__(self): """Returns an Iterator over the values of this Categorical.""" return iter(self.get_values().tolist()) - @staticmethod - def _contains(key, categories, container): - """ - Helper for membership check for ``key``. - - This helper method is used in :method:`Categorical.__contains__` - and in :class:`CategoricalIndex.__contains__`. - - Returns True if ``key`` is in ``categories`` and the - location of ``key`` in ``categories`` is in ``container``. - - Parameters - ---------- - key : a hashable object - The key to check membership for. - categories : Sequence - The possible values for ``key``. The location for ``key`` - in ``categories`` is also its value in ``container`` - container : Container (e.g. list-like or mapping) - The container to check for membership in. - - Returns - ------- - is_in : bool - True if ``key`` is in ``categories`` and location of - ``key`` in ``categories`` is in ``container``, else False. - - Notes - ----- - This method does not check for Nan values. Do that separately - before calling this method. - """ - - # is key in categories? Then get its location in categories. - # If not (i.e. 
KeyError), its location logically can't be in - # container either. - try: - loc = categories.get_loc(key) - except KeyError: - return False - - # loc is the location of key in categories, but also the value - # for key in container. So, key may be in categories, - # but still not in container, this must be checked. Example: - # 'b' in Categorical(['a'], categories=['a', 'b']) # False - if is_scalar(loc): - return loc in container - else: - # if categories is an IntervalIndex, loc is an array. - # Check if any scalar of the array is in the container - return any(loc_ in container for loc_ in loc) - def __contains__(self, key): """Returns True if `key` is in this Categorical.""" hash(key) @@ -1905,7 +1902,7 @@ def __contains__(self, key): if isna(key): # if key is a NaN, check if any NaN is in self. return self.isna().any() - return self._contains(key, self.categories, container=self._codes) + return contains(self, key, container=self._codes) def _tidy_repr(self, max_vals=10, footer=True): """ a short repr displaying only max_vals and an optional (but default diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 18fe7ff79e804..e7e3472e024ce 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -24,7 +24,7 @@ import pandas.core.common as com import pandas.core.missing as missing import pandas.core.indexes.base as ibase -from pandas.core.arrays import Categorical +from pandas.core.arrays.categorical import Categorical, contains _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) @@ -327,8 +327,7 @@ def __contains__(self, key): if isna(key): # if key is a NaN, check if any NaN is in self. 
return self.hasnans - return Categorical._contains(key, categories=self.categories, - container=self._engine) + return contains(self, key, container=self._engine) @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def contains(self, key): From b0f12ec1f3f57b941c0ed0cb7f70e89c44467eef Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 19 Jun 2018 16:45:52 +0100 Subject: [PATCH 4/5] changed according to comments --- pandas/core/arrays/categorical.py | 9 +++++---- pandas/core/indexes/category.py | 6 ++---- pandas/tests/categorical/test_algos.py | 16 ---------------- pandas/tests/categorical/test_operators.py | 17 +++++++++++++++++ 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5d84d61aef2db..7b3cce0f2585d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -183,9 +183,11 @@ def contains(cat, key, container): Notes ----- - This method does not check for Nan values. Do that separately + This method does not check for NaN values. Do that separately before calling this method. """ + hash(key) + # get location of key in categories. # If a KeyError, the key isn't in categories, so logically # can't be in container either. @@ -1897,9 +1899,8 @@ def __iter__(self): def __contains__(self, key): """Returns True if `key` is in this Categorical.""" - hash(key) - - if isna(key): # if key is a NaN, check if any NaN is in self. + # if key is a NaN, check if any NaN is in self. 
+ if isna(key): return self.isna().any() return contains(self, key, container=self._codes) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e7e3472e024ce..fc669074758da 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -322,16 +322,14 @@ def _reverse_indexer(self): @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): - hash(key) - - if isna(key): # if key is a NaN, check if any NaN is in self. + # if key is a NaN, check if any NaN is in self. + if isna(key): return self.hasnans return contains(self, key, container=self._engine) @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def contains(self, key): - hash(key) return key in self def __array__(self, dtype=None): diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py index b7d8eeaa596ff..dcf2081ae32fe 100644 --- a/pandas/tests/categorical/test_algos.py +++ b/pandas/tests/categorical/test_algos.py @@ -71,22 +71,6 @@ def test_isin_empty(empty): tm.assert_numpy_array_equal(expected, result) -def test_contains(): - # GH21508 - c = pd.Categorical(list('aabbca'), categories=list('cab')) - - assert 'b' in c - assert 'z' not in c - assert np.nan not in c - - # assert codes NOT in index - assert 0 not in c - assert 1 not in c - - c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab')) - assert np.nan in c - - class TestTake(object): # https://github.com/pandas-dev/pandas/issues/20664 diff --git a/pandas/tests/categorical/test_operators.py b/pandas/tests/categorical/test_operators.py index fa8bb817616e4..a26de32d7446c 100644 --- a/pandas/tests/categorical/test_operators.py +++ b/pandas/tests/categorical/test_operators.py @@ -291,3 +291,20 @@ def test_numeric_like_ops(self): # invalid ufunc pytest.raises(TypeError, lambda: np.log(s)) + + def test_contains(self): + # GH21508 + c = pd.Categorical(list('aabbca'), categories=list('cab')) + + assert 'b' 
in c + assert 'z' not in c + assert np.nan not in c + with pytest.raises(TypeError): + assert [1] in c + + # assert codes NOT in index + assert 0 not in c + assert 1 not in c + + c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab')) + assert np.nan in c From fd89a8a58d6f37b3acc8658d5a25e488b16fbcfd Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 20 Jun 2018 06:28:47 -0400 Subject: [PATCH 5/5] doc --- doc/source/whatsnew/v0.23.2.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 0f2c9c4756987..5454dc9eca360 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -26,7 +26,7 @@ Performance Improvements - Improved performance of membership checks in :class:`CategoricalIndex` (i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains` - is likewise much faster (:issue:`21369`) + is likewise much faster (:issue:`21369`, :issue:`21508`) - Improved performance of :meth:`MultiIndex.is_unique` (:issue:`21522`) -