Skip to content

Commit 3eb49bb

Browse files
author
tp
committed
add Categorical.__contains__
1 parent 9e982e1 commit 3eb49bb

File tree

5 files changed

+67
-24
lines changed

5 files changed

+67
-24
lines changed

asv_bench/benchmarks/categoricals.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,11 @@ class Contains(object):
202202
def setup(self):
203203
N = 10**5
204204
self.ci = tm.makeCategoricalIndex(N)
205-
self.cat = self.ci.categories[0]
205+
self.c = self.ci.values
206+
self.key = self.ci.categories[0]
206207

207-
def time_contains(self):
208-
self.cat in self.ci
208+
def time_categorical_index_contains(self):
209+
self.key in self.ci
210+
211+
def time_categorical_contains(self):
212+
self.key in self.c

doc/source/whatsnew/v0.23.2.txt

+3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ Performance Improvements
2727
- Improved performance of membership checks in :class:`CategoricalIndex`
2828
(i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains`
2929
is likewise much faster (:issue:`21369`)
30+
- Improved performance of membership checks in :class:`Categorical`
31+
(i.e. ``x in categorical``-style checks are much faster) (:issue:`21369`)
32+
3033
-
3134

3235
Documentation Changes

pandas/core/arrays/categorical.py

+37
Original file line numberDiff line numberDiff line change
@@ -1847,6 +1847,43 @@ def __iter__(self):
18471847
"""Returns an Iterator over the values of this Categorical."""
18481848
return iter(self.get_values().tolist())
18491849

1850+
@staticmethod
def _contains(key, categories, container):
    """
    Return True if `key` is a category whose code occurs in `container`.

    This is a helper shared by :meth:`Categorical.__contains__` and
    :meth:`CategoricalIndex.__contains__`.

    Parameters
    ----------
    key : hashable
        The value to look for.
    categories : Index
        The categories; queried with ``categories.get_loc(key)``.
    container : container of codes
        Any object supporting ``code in container``, e.g. the codes of a
        Categorical or the engine of a CategoricalIndex.

    Returns
    -------
    bool
        False if `key` is not in `categories`, or if it is a category
        but none of its locations occurs in `container`.
    """
    # Is key in categories? Then get its location in categories.
    # If not (i.e. KeyError), its location logically can't be in
    # container either.
    try:
        loc = categories.get_loc(key)
    except KeyError:
        return False

    # loc is the location of key in categories, but also the value
    # for key in container. So, key may be in categories,
    # but still not in container, this must be checked. Example:
    # 'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if is_scalar(loc):
        return loc in container

    # Non-scalar results (e.g. when categories is an IntervalIndex and
    # key matches several intervals) may be a slice, a boolean mask or
    # an array of integer positions. Normalise the first two to integer
    # positions so that every element compared below is a real code.
    if isinstance(loc, slice):
        loc = range(*loc.indices(len(categories)))
    elif getattr(loc, "dtype", None) == bool:
        loc = loc.nonzero()[0]

    # Check if any of the matching positions is in the container.
    return any(loc_ in container for loc_ in loc)
1877+
1878+
def __contains__(self, key):
    """
    Return True if `key` is one of the values of this Categorical.

    Raises
    ------
    TypeError
        If `key` is not hashable.
    """
    # The result is discarded on purpose: hash() raises TypeError for
    # unhashable keys before any lookup is attempted.
    hash(key)

    if not isna(key):
        # Ordinary key: it must be a category and its code must be in use.
        return self._contains(key, self.categories, container=self._codes)

    # key is a NaN, so check if any NaN is in self.
    return self.isna().any()
1886+
18501887
def _tidy_repr(self, max_vals=10, footer=True):
18511888
""" a short repr displaying only max_vals and an optional (but default
18521889
footer)

pandas/core/indexes/category.py

+3-21
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import pandas.core.common as com
2525
import pandas.core.missing as missing
2626
import pandas.core.indexes.base as ibase
27+
from pandas.core.arrays import Categorical
2728

2829
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
2930
_index_doc_kwargs.update(dict(target_klass='CategoricalIndex'))
@@ -125,7 +126,6 @@ def _create_from_codes(self, codes, categories=None, ordered=None,
125126
CategoricalIndex
126127
"""
127128

128-
from pandas.core.arrays import Categorical
129129
if categories is None:
130130
categories = self.categories
131131
if ordered is None:
@@ -162,7 +162,6 @@ def _create_categorical(self, data, categories=None, ordered=None,
162162
if not isinstance(data, ABCCategorical):
163163
if ordered is None and dtype is None:
164164
ordered = False
165-
from pandas.core.arrays import Categorical
166165
data = Categorical(data, categories=categories, ordered=ordered,
167166
dtype=dtype)
168167
else:
@@ -328,23 +327,8 @@ def __contains__(self, key):
328327
if isna(key): # if key is a NaN, check if any NaN is in self.
329328
return self.hasnans
330329

331-
# is key in self.categories? Then get its location.
332-
# If not (i.e. KeyError), it logically can't be in self either
333-
try:
334-
loc = self.categories.get_loc(key)
335-
except KeyError:
336-
return False
337-
338-
# loc is the location of key in self.categories, but also the value
339-
# for key in self.codes and in self._engine. key may be in categories,
340-
# but still not in self, check this. Example:
341-
# 'b' in CategoricalIndex(['a'], categories=['a', 'b']) # False
342-
if is_scalar(loc):
343-
return loc in self._engine
344-
else:
345-
# if self.categories is IntervalIndex, loc is an array
346-
# check if any scalar of the array is in self._engine
347-
return any(loc_ in self._engine for loc_ in loc)
330+
return Categorical._contains(key, categories=self.categories,
331+
container=self._engine)
348332

349333
@Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
350334
def contains(self, key):
@@ -479,7 +463,6 @@ def where(self, cond, other=None):
479463
other = self._na_value
480464
values = np.where(cond, self.values, other)
481465

482-
from pandas.core.arrays import Categorical
483466
cat = Categorical(values,
484467
categories=self.categories,
485468
ordered=self.ordered)
@@ -862,7 +845,6 @@ def _delegate_method(self, name, *args, **kwargs):
862845
def _add_accessors(cls):
863846
""" add in Categorical accessor methods """
864847

865-
from pandas.core.arrays import Categorical
866848
CategoricalIndex._add_delegate_accessors(
867849
delegate=Categorical, accessors=["rename_categories",
868850
"reorder_categories",

pandas/tests/categorical/test_algos.py

+17
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,23 @@ def test_isin_empty(empty):
7171
tm.assert_numpy_array_equal(expected, result)
7272

7373

74+
def test_contains():
    """Membership in a Categorical is decided by its values, not its codes."""
    c = pd.Categorical(list('aabbca'), categories=list('cab'))

    assert 'b' in c
    assert 'z' not in c
    assert np.nan not in c

    # the integer codes backing the values must NOT count as members
    assert 0 not in c
    assert 1 not in c

    # a declared category with no occurrence among the values is not a member
    c = pd.Categorical(list('aab'), categories=list('cab'))

    assert 'a' in c
    assert 'c' not in c

    # NaN is a member only when a missing value is actually present
    c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab'))

    assert np.nan in c
89+
90+
7491
class TestTake(object):
7592
# https://github.com/pandas-dev/pandas/issues/20664
7693

0 commit comments

Comments
 (0)