Skip to content

Commit bf1c3dc

Browse files
topper-123 authored and jreback committed
PERF: Add __contains__ to CategoricalIndex (#21369)
1 parent a8738ba commit bf1c3dc

File tree

3 files changed

+36
-9
lines changed

3 files changed

+36
-9
lines changed

asv_bench/benchmarks/categoricals.py

+13
Original file line number | Diff line number | Diff line change
@@ -193,3 +193,16 @@ def time_categorical_series_is_monotonic_increasing(self):
193193

194194
def time_categorical_series_is_monotonic_decreasing(self):
195195
self.s.is_monotonic_decreasing
196+
197+
198+
class Contains(object):
199+
200+
goal_time = 0.2
201+
202+
def setup(self):
203+
N = 10**5
204+
self.ci = tm.makeCategoricalIndex(N)
205+
self.cat = self.ci.categories[0]
206+
207+
def time_contains(self):
208+
self.cat in self.ci

doc/source/whatsnew/v0.23.2.txt

+3 -1
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,9 @@ Fixed Regressions
2424
Performance Improvements
2525
~~~~~~~~~~~~~~~~~~~~~~~~
2626

27-
-
27+
- Improved performance of membership checks in :class:`CategoricalIndex`
28+
(i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains`
29+
is likewise much faster (:issue:`21369`)
2830
-
2931

3032
Documentation Changes

pandas/core/indexes/category.py

+20 -8
Original file line number | Diff line number | Diff line change
@@ -325,19 +325,31 @@ def _reverse_indexer(self):
325325
def __contains__(self, key):
326326
hash(key)
327327

328-
if self.categories._defer_to_indexing:
329-
return key in self.categories
328+
if isna(key): # if key is a NaN, check if any NaN is in self.
329+
return self.isna().any()
330+
331+
# is key in self.categories? Then get its location.
332+
# If not (i.e. KeyError), it logically can't be in self either
333+
try:
334+
loc = self.categories.get_loc(key)
335+
except KeyError:
336+
return False
330337

331-
return key in self.values
338+
# loc is the location of key in self.categories, but also the value
339+
# for key in self.codes and in self._engine. key may be in categories,
340+
# but still not in self, check this. Example:
341+
# 'b' in CategoricalIndex(['a'], categories=['a', 'b']) # False
342+
if is_scalar(loc):
343+
return loc in self._engine
344+
else:
345+
# if self.categories is IntervalIndex, loc is an array
346+
# check if any scalar of the array is in self._engine
347+
return any(loc_ in self._engine for loc_ in loc)
332348

333349
@Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
334350
def contains(self, key):
335351
hash(key)
336-
337-
if self.categories._defer_to_indexing:
338-
return self.categories.contains(key)
339-
340-
return key in self.values
352+
return key in self
341353

342354
def __array__(self, dtype=None):
343355
""" the array interface, return my values """

0 commit comments

Comments
 (0)