Skip to content

Commit 90ca876

Browse files
topper-123 authored and proost committed
PERF: faster categorical ops for equal or larger than scalar (pandas-dev#29820)
1 parent c39f0c3 commit 90ca876

File tree

3 files changed

+35
-20
lines changed

3 files changed

+35
-20
lines changed

asv_bench/benchmarks/categoricals.py

+27-15
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,6 @@
1414
pass
1515

1616

17-
class Concat:
18-
def setup(self):
19-
N = 10 ** 5
20-
self.s = pd.Series(list("aabbcd") * N).astype("category")
21-
22-
self.a = pd.Categorical(list("aabbcd") * N)
23-
self.b = pd.Categorical(list("bbcdjk") * N)
24-
25-
def time_concat(self):
26-
pd.concat([self.s, self.s])
27-
28-
def time_union(self):
29-
union_categoricals([self.a, self.b])
30-
31-
3217
class Constructor:
3318
def setup(self):
3419
N = 10 ** 5
@@ -77,6 +62,33 @@ def time_existing_series(self):
7762
pd.Categorical(self.series)
7863

7964

65+
class CategoricalOps:
66+
params = ["__lt__", "__le__", "__eq__", "__ne__", "__ge__", "__gt__"]
67+
param_names = ["op"]
68+
69+
def setup(self, op):
70+
N = 10 ** 5
71+
self.cat = pd.Categorical(list("aabbcd") * N, ordered=True)
72+
73+
def time_categorical_op(self, op):
74+
getattr(self.cat, op)("b")
75+
76+
77+
class Concat:
78+
def setup(self):
79+
N = 10 ** 5
80+
self.s = pd.Series(list("aabbcd") * N).astype("category")
81+
82+
self.a = pd.Categorical(list("aabbcd") * N)
83+
self.b = pd.Categorical(list("bbcdjk") * N)
84+
85+
def time_concat(self):
86+
pd.concat([self.s, self.s])
87+
88+
def time_union(self):
89+
union_categoricals([self.a, self.b])
90+
91+
8092
class ValueCounts:
8193

8294
params = [True, False]

doc/source/whatsnew/v1.0.0.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,9 @@ Performance improvements
453453
- Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`)
454454
- Performance improvement in :meth:`DataFrame.select_dtypes` by using vectorization instead of iterating over a loop (:issue:`28317`)
455455
- Performance improvement in :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` (:issue:`28795`)
456-
- Performance improvement when comparing a :meth:`Categorical` with a scalar and the scalar is not found in the categories (:issue:`29750`)
456+
- Performance improvement when comparing a :class:`Categorical` with a scalar and the scalar is not found in the categories (:issue:`29750`)
457+
- Performance improvement when checking whether values in a :class:`Categorical` are equal to, greater than or equal to, or greater than a given scalar.
458+
The improvement is not present when checking whether the :class:`Categorical` is less than, or less than or equal to, the scalar (:issue:`29820`)
457459

458460
.. _whatsnew_1000.bug_fixes:
459461

pandas/core/arrays/categorical.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,9 @@ def func(self, other):
108108
else:
109109
other_codes = other._codes
110110

111-
mask = (self._codes == -1) | (other_codes == -1)
112111
f = getattr(self._codes, opname)
113112
ret = f(other_codes)
113+
mask = (self._codes == -1) | (other_codes == -1)
114114
if mask.any():
115115
# In other series, this leads to False, so do that here too
116116
ret[mask] = False
@@ -121,9 +121,10 @@ def func(self, other):
121121
i = self.categories.get_loc(other)
122122
ret = getattr(self._codes, opname)(i)
123123

124-
# check for NaN in self
125-
mask = self._codes == -1
126-
ret[mask] = False
124+
if opname not in {"__eq__", "__ge__", "__gt__"}:
125+
# the NaN check is only needed for ops other than ==, >= and >
126+
mask = self._codes == -1
127+
ret[mask] = False
127128
return ret
128129
else:
129130
if opname == "__eq__":

0 commit comments

Comments
 (0)