Skip to content

Commit 1ecce70

Browse files
author
tp
committed
add Categorical.__contains__
1 parent 9e982e1 commit 1ecce70

File tree

4 files changed

+52
-3
lines changed

4 files changed

+52
-3
lines changed

asv_bench/benchmarks/categoricals.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,11 @@ class Contains(object):
202202
def setup(self):
203203
N = 10**5
204204
self.ci = tm.makeCategoricalIndex(N)
205-
self.cat = self.ci.categories[0]
205+
self.c = self.ci.values
206+
self.key = self.ci.categories[0]
206207

207-
def time_contains(self):
208-
self.cat in self.ci
208+
def time_categorical_index_contains(self):
209+
self.key in self.ci
210+
211+
def time_categorical_contains(self):
212+
self.key in self.c

doc/source/whatsnew/v0.23.2.txt

+3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ Performance Improvements
2727
- Improved performance of membership checks in :class:`CategoricalIndex`
2828
(i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains`
2929
is likewise much faster (:issue:`21369`)
30+
- Improved performance of membership checks in :class:`Categorical`
31+
(i.e. ``x in categorical``-style checks are much faster) (:issue:`21369`)
32+
3033
-
3134

3235
Documentation Changes

pandas/core/arrays/categorical.py

+25
Original file line numberDiff line numberDiff line change
@@ -1847,6 +1847,31 @@ def __iter__(self):
18471847
"""Returns an Iterator over the values of this Categorical."""
18481848
return iter(self.get_values().tolist())
18491849

1850+
def __contains__(self, key):
    """Report whether `key` occurs among the values of this Categorical.

    Membership is decided by the values actually present, not by the
    declared categories: a category that no element uses is *not*
    contained.

    Parameters
    ----------
    key : hashable
        The value to look for.

    Returns
    -------
    bool

    Raises
    ------
    TypeError
        If `key` is not hashable.
    """
    # Reject unhashable keys up front; hash() raises TypeError for them.
    hash(key)

    # A missing-value key matches when any element of self is missing.
    if isna(key):
        return self.isna().any()

    # A key that is not even a category (KeyError) cannot be a value.
    try:
        position = self.categories.get_loc(key)
    except KeyError:
        return False

    # The position of key within self.categories doubles as the code that
    # represents key in self._codes. Being a category is not sufficient,
    # the code has to be in use as well. Example:
    # 'b' in Categorical(['a'], categories=['a', 'b'])  # False
    codes = self._codes
    if not is_scalar(position):
        # if self.categories is IntervalIndex, the lookup yields an array;
        # key is contained as soon as one of those codes is in use.
        return any(candidate in codes for candidate in position)
    return position in codes
1874+
18501875
def _tidy_repr(self, max_vals=10, footer=True):
18511876
""" a short repr displaying only max_vals and an optional (but default
18521877
footer)

pandas/tests/categorical/test_algos.py

+17
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,23 @@ def test_isin_empty(empty):
7171
tm.assert_numpy_array_equal(expected, result)
7272

7373

74+
def test_contains():
    """Check ``in`` membership semantics of :class:`Categorical`.

    This is a module-level test function, so it takes no ``self``
    argument: pytest resolves every parameter of a test function as a
    fixture, and a stray ``self`` would make the test error out instead
    of running.
    """
    c = pd.Categorical(list('aabbca'), categories=list('cab'))

    assert 'b' in c
    assert 'z' not in c
    assert np.nan not in c

    # the integer codes backing the values are NOT members of the Categorical
    assert 0 not in c
    assert 1 not in c

    # a missing value is a member only when one is actually present
    c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab'))
    assert np.nan in c

    # a category that is declared but used by no element is not a member
    c = pd.Categorical(['a'], categories=['a', 'b'])
    assert 'a' in c
    assert 'b' not in c
89+
90+
7491
class TestTake(object):
7592
# https://github.com/pandas-dev/pandas/issues/20664
7693

0 commit comments

Comments
 (0)