Skip to content

Commit 4836a29

Browse files
committed
PERF: implement __contains__ for Categorical
1 parent 0c65c57 commit 4836a29

File tree

3 files changed

+52
-1
lines changed

3 files changed

+52
-1
lines changed

asv_bench/benchmarks/categoricals.py

+44
Original file line numberDiff line numberDiff line change
@@ -193,3 +193,47 @@ def time_categorical_series_is_monotonic_increasing(self):
193193

194194
# asv benchmark: times evaluating the ``is_monotonic_decreasing`` attribute of ``self.s``.
# NOTE(review): ``self.s`` is presumably a categorical Series built in this
# class's setup, which is not visible in this hunk -- confirm.
def time_categorical_series_is_monotonic_decreasing(self):
195195
self.s.is_monotonic_decreasing
196+
197+
198+
class Contains(object):
    """asv benchmarks for ``value in obj`` membership tests on categorical data.

    Every benchmark is parametrized over the looked-up ``value`` and over
    ``has_nan``, which controls whether the searched data starts and ends
    with a missing value.
    """

    param_names = ["value", "has_nan"]
    # "d" is a declared category that never occurs in the data and "z" is
    # not a category at all, so both "absent" flavours are exercised.
    params = (["a", "c", "d", "z", np.nan], [True, False])

    def setup(self, value, has_nan):
        """Build the CategoricalIndex and the Categorical that get searched."""
        repeats = 10 ** 4
        letters = ["a"] * repeats + ["b"] * repeats + ["c"] * repeats
        if has_nan:
            # Replace the last two entries by a single NaN and put another
            # NaN in front: the overall length stays at 3 * repeats.
            letters[-2:] = [np.nan]
            letters.insert(0, np.nan)

        self.ci = pd.CategoricalIndex(letters,
                                      categories=["a", "b", "c", "d"])
        self.cat = pd.Categorical(letters, categories=["a", "b", "c", "d"])

    def time_contains_index(self, value, has_nan):
        """Time the ``in`` operator on the CategoricalIndex."""
        value in self.ci

    def time_cat_isin(self, value, has_nan):
        """Time the ``in`` operator on the Categorical."""
        value in self.cat
217+
218+
219+
class Indexing(object):
    """asv benchmarks for ``.loc`` label lookups through a CategoricalIndex.

    Every benchmark is parametrized over the looked-up ``value`` and over
    ``has_nan``, which controls whether the index starts and ends with a
    missing value.
    """

    param_names = ["value", "has_nan"]
    params = (["a", "c"], [True, False])

    def setup(self, value, has_nan):
        """Build a DataFrame and a Series that share one CategoricalIndex."""
        repeats = 10 ** 4
        total = repeats * 3
        labels = ["a"] * repeats + ["b"] * repeats + ["c"] * repeats
        if has_nan:
            # Replace the last two labels by a single NaN and put another
            # NaN in front: the length stays at ``total`` so it still lines
            # up with ``range(total)`` below.
            labels[-2:] = [np.nan]
            labels.insert(0, np.nan)

        self.ci = pd.CategoricalIndex(labels,
                                      categories=["a", "b", "c", "d"])
        self.cat = pd.Categorical(labels, categories=["a", "b", "c", "d"])
        self.df = pd.DataFrame({"A": range(total)}, index=self.ci)
        self.ser = pd.Series(range(total), index=self.ci)

    def time_loc_df(self, value, has_nan):
        """Time a label lookup on the DataFrame."""
        self.df.loc[value]

    def time_loc_ser(self, value, has_nan):
        """Time a label lookup on the Series."""
        self.ser.loc[value]

doc/source/whatsnew/v0.24.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ Performance Improvements
6464
~~~~~~~~~~~~~~~~~~~~~~~~
6565

6666
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
67-
-
67+
- Improved performance of membership checks (``key in categorical``) on a :class:`Categorical` and of indexing on a Series/DataFrame with a :class:`CategoricalIndex`
6868

6969
.. _whatsnew_0240.docs:
7070

pandas/core/arrays/categorical.py

+7
Original file line numberDiff line numberDiff line change
@@ -1847,6 +1847,13 @@ def __iter__(self):
18471847
"""Returns an Iterator over the values of this Categorical."""
18481848
return iter(self.get_values().tolist())
18491849

1850+
def __contains__(self, key):
    """Return True if `key` is one of the values of this Categorical.

    A missing-value key (``None``, ``np.nan``, ...) is contained exactly
    when at least one element is missing. Any other key is contained when
    it is one of the categories *and* that category's code is used by at
    least one element; a declared but unused category is not contained.

    Parameters
    ----------
    key : object
        Value to look for.

    Returns
    -------
    bool
    """
    # Imported locally so this method does not depend on which names the
    # module happens to import at the top of the file.
    from pandas.core.dtypes.common import is_scalar
    from pandas.core.dtypes.missing import isna

    # Missing values are never listed in ``categories`` (they are encoded
    # with the sentinel code -1), so the category lookup below could never
    # find them. The ``is_scalar`` guard keeps ``isna`` from returning an
    # array for list-like keys.
    if is_scalar(key) and isna(key):
        return bool((self.codes == -1).any())

    categories = self.categories
    if key not in categories:
        return False
    # Search the integer codes instead of materialising the values.
    return categories.get_loc(key) in self.codes
1856+
18501857
def _tidy_repr(self, max_vals=10, footer=True):
18511858
""" a short repr displaying only max_vals and an optional (but default
18521859
footer)

0 commit comments

Comments
 (0)