Skip to content

Commit cc51575

Browse files
topper-123victor
authored and
victor
committed
PERF: add method Categorical.__contains__ (pandas-dev#21508)
1 parent 4d3f504 commit cc51575

File tree

5 files changed

+88
-29
lines changed

5 files changed

+88
-29
lines changed

asv_bench/benchmarks/categoricals.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,11 @@ class Contains(object):
202202
def setup(self):
203203
N = 10**5
204204
self.ci = tm.makeCategoricalIndex(N)
205-
self.cat = self.ci.categories[0]
205+
self.c = self.ci.values
206+
self.key = self.ci.categories[0]
206207

207-
def time_contains(self):
208-
self.cat in self.ci
208+
def time_categorical_index_contains(self):
209+
self.key in self.ci
210+
211+
def time_categorical_contains(self):
212+
self.key in self.c

doc/source/whatsnew/v0.23.2.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Performance Improvements
2626

2727
- Improved performance of membership checks in :class:`CategoricalIndex`
2828
(i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains`
29-
is likewise much faster (:issue:`21369`)
29+
is likewise much faster (:issue:`21369`, :issue:`21508`)
3030
- Improved performance of :meth:`MultiIndex.is_unique` (:issue:`21522`)
3131
-
3232

pandas/core/arrays/categorical.py

+59
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,57 @@ def _maybe_to_categorical(array):
157157
return array
158158

159159

160+
def contains(cat, key, container):
161+
"""
162+
Helper for membership check for ``key`` in ``cat``.
163+
164+
This is a helper method for :meth:`__contains__`
165+
and :meth:`CategoricalIndex.__contains__`.
166+
167+
Returns True if ``key`` is in ``cat.categories`` and the
168+
location of ``key`` in ``categories`` is in ``container``.
169+
170+
Parameters
171+
----------
172+
cat : :class:`Categorical` or :class:`CategoricalIndex`
173+
key : a hashable object
174+
The key to check membership for.
175+
container : Container (e.g. list-like or mapping)
176+
The container to check for membership in.
177+
178+
Returns
179+
-------
180+
is_in : bool
181+
True if ``key`` is in ``cat.categories`` and location of
182+
``key`` in ``categories`` is in ``container``, else False.
183+
184+
Notes
185+
-----
186+
This method does not check for NaN values. Do that separately
187+
before calling this method.
188+
"""
189+
hash(key)
190+
191+
# get location of key in categories.
192+
# If a KeyError, the key isn't in categories, so logically
193+
# can't be in container either.
194+
try:
195+
loc = cat.categories.get_loc(key)
196+
except KeyError:
197+
return False
198+
199+
# loc is the location of key in categories, but also the *value*
200+
# for key in container. So, `key` may be in categories,
201+
# but still not in `container`. Example ('b' in categories,
202+
# but not in values):
203+
# 'b' in Categorical(['a'], categories=['a', 'b']) # False
204+
if is_scalar(loc):
205+
return loc in container
206+
else:
207+
# if categories is an IntervalIndex, loc is an array.
208+
return any(loc_ in container for loc_ in loc)
209+
210+
160211
_codes_doc = """The category codes of this categorical.
161212
162213
Level codes are an array of integers which are the positions of the real
@@ -1846,6 +1897,14 @@ def __iter__(self):
18461897
"""Returns an Iterator over the values of this Categorical."""
18471898
return iter(self.get_values().tolist())
18481899

1900+
def __contains__(self, key):
1901+
"""Returns True if `key` is in this Categorical."""
1902+
# if key is a NaN, check if any NaN is in self.
1903+
if isna(key):
1904+
return self.isna().any()
1905+
1906+
return contains(self, key, container=self._codes)
1907+
18491908
def _tidy_repr(self, max_vals=10, footer=True):
18501909
""" a short repr displaying only max_vals and an optional (but default
18511910
footer)

pandas/core/indexes/category.py

+4-25
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import pandas.core.common as com
2525
import pandas.core.missing as missing
2626
import pandas.core.indexes.base as ibase
27+
from pandas.core.arrays.categorical import Categorical, contains
2728

2829
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
2930
_index_doc_kwargs.update(dict(target_klass='CategoricalIndex'))
@@ -125,7 +126,6 @@ def _create_from_codes(self, codes, categories=None, ordered=None,
125126
CategoricalIndex
126127
"""
127128

128-
from pandas.core.arrays import Categorical
129129
if categories is None:
130130
categories = self.categories
131131
if ordered is None:
@@ -162,7 +162,6 @@ def _create_categorical(self, data, categories=None, ordered=None,
162162
if not isinstance(data, ABCCategorical):
163163
if ordered is None and dtype is None:
164164
ordered = False
165-
from pandas.core.arrays import Categorical
166165
data = Categorical(data, categories=categories, ordered=ordered,
167166
dtype=dtype)
168167
else:
@@ -323,32 +322,14 @@ def _reverse_indexer(self):
323322

324323
@Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs)
325324
def __contains__(self, key):
326-
hash(key)
327-
328-
if isna(key): # if key is a NaN, check if any NaN is in self.
325+
# if key is a NaN, check if any NaN is in self.
326+
if isna(key):
329327
return self.hasnans
330328

331-
# is key in self.categories? Then get its location.
332-
# If not (i.e. KeyError), it logically can't be in self either
333-
try:
334-
loc = self.categories.get_loc(key)
335-
except KeyError:
336-
return False
337-
338-
# loc is the location of key in self.categories, but also the value
339-
# for key in self.codes and in self._engine. key may be in categories,
340-
# but still not in self, check this. Example:
341-
# 'b' in CategoricalIndex(['a'], categories=['a', 'b']) # False
342-
if is_scalar(loc):
343-
return loc in self._engine
344-
else:
345-
# if self.categories is IntervalIndex, loc is an array
346-
# check if any scalar of the array is in self._engine
347-
return any(loc_ in self._engine for loc_ in loc)
329+
return contains(self, key, container=self._engine)
348330

349331
@Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
350332
def contains(self, key):
351-
hash(key)
352333
return key in self
353334

354335
def __array__(self, dtype=None):
@@ -479,7 +460,6 @@ def where(self, cond, other=None):
479460
other = self._na_value
480461
values = np.where(cond, self.values, other)
481462

482-
from pandas.core.arrays import Categorical
483463
cat = Categorical(values,
484464
categories=self.categories,
485465
ordered=self.ordered)
@@ -862,7 +842,6 @@ def _delegate_method(self, name, *args, **kwargs):
862842
def _add_accessors(cls):
863843
""" add in Categorical accessor methods """
864844

865-
from pandas.core.arrays import Categorical
866845
CategoricalIndex._add_delegate_accessors(
867846
delegate=Categorical, accessors=["rename_categories",
868847
"reorder_categories",

pandas/tests/categorical/test_operators.py

+17
Original file line numberDiff line numberDiff line change
@@ -291,3 +291,20 @@ def test_numeric_like_ops(self):
291291

292292
# invalid ufunc
293293
pytest.raises(TypeError, lambda: np.log(s))
294+
295+
def test_contains(self):
296+
# GH21508
297+
c = pd.Categorical(list('aabbca'), categories=list('cab'))
298+
299+
assert 'b' in c
300+
assert 'z' not in c
301+
assert np.nan not in c
302+
with pytest.raises(TypeError):
303+
assert [1] in c
304+
305+
# assert codes NOT in categorical
306+
assert 0 not in c
307+
assert 1 not in c
308+
309+
c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab'))
310+
assert np.nan in c

0 commit comments

Comments
 (0)