Skip to content

Commit c45bb0b

Browse files
topper-123 authored and jreback committed
PERF: avoid unneeded recoding of categoricals and reuse CategoricalDtypes for greater slicing speed (#21659)
1 parent 76ef7c4 commit c45bb0b

File tree

6 files changed

+97
-13
lines changed

6 files changed

+97
-13
lines changed

asv_bench/benchmarks/categoricals.py

+35
Original file line numberDiff line numberDiff line change
@@ -210,3 +210,38 @@ def time_categorical_index_contains(self):
210210

211211
def time_categorical_contains(self):
212212
self.key in self.c
213+
214+
215+
class CategoricalSlicing(object):
216+
217+
goal_time = 0.2
218+
params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
219+
param_names = ['index']
220+
221+
def setup(self, index):
222+
N = 10**6
223+
values = list('a' * N + 'b' * N + 'c' * N)
224+
indices = {
225+
'monotonic_incr': pd.Categorical(values),
226+
'monotonic_decr': pd.Categorical(reversed(values)),
227+
'non_monotonic': pd.Categorical(list('abc' * N))}
228+
self.data = indices[index]
229+
230+
self.scalar = 10000
231+
self.list = list(range(10000))
232+
self.cat_scalar = 'b'
233+
234+
def time_getitem_scalar(self, index):
235+
self.data[self.scalar]
236+
237+
def time_getitem_slice(self, index):
238+
self.data[:self.scalar]
239+
240+
def time_getitem_list_like(self, index):
241+
self.data[[self.scalar]]
242+
243+
def time_getitem_list(self, index):
244+
self.data[self.list]
245+
246+
def time_getitem_bool_array(self, index):
247+
self.data[self.data == self.cat_scalar]

asv_bench/benchmarks/indexing.py

+45-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
import numpy as np
44
import pandas.util.testing as tm
55
from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index,
6-
IntervalIndex, IndexSlice, concat, date_range)
6+
IntervalIndex, CategoricalIndex,
7+
IndexSlice, concat, date_range)
78
from .pandas_vb_common import setup, Panel # noqa
89

910

@@ -230,6 +231,49 @@ def time_loc_list(self, monotonic):
230231
monotonic.loc[80000:]
231232

232233

234+
class CategoricalIndexIndexing(object):
235+
236+
goal_time = 0.2
237+
params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
238+
param_names = ['index']
239+
240+
def setup(self, index):
241+
N = 10**5
242+
values = list('a' * N + 'b' * N + 'c' * N)
243+
indices = {
244+
'monotonic_incr': CategoricalIndex(values),
245+
'monotonic_decr': CategoricalIndex(reversed(values)),
246+
'non_monotonic': CategoricalIndex(list('abc' * N))}
247+
self.data = indices[index]
248+
249+
self.int_scalar = 10000
250+
self.int_list = list(range(10000))
251+
252+
self.cat_scalar = 'b'
253+
self.cat_list = ['a', 'c']
254+
255+
def time_getitem_scalar(self, index):
256+
self.data[self.int_scalar]
257+
258+
def time_getitem_slice(self, index):
259+
self.data[:self.int_scalar]
260+
261+
def time_getitem_list_like(self, index):
262+
self.data[[self.int_scalar]]
263+
264+
def time_getitem_list(self, index):
265+
self.data[self.int_list]
266+
267+
def time_getitem_bool_array(self, index):
268+
self.data[self.data == self.cat_scalar]
269+
270+
def time_get_loc_scalar(self, index):
271+
self.data.get_loc(self.cat_scalar)
272+
273+
def time_get_indexer_list(self, index):
274+
self.data.get_indexer(self.cat_list)
275+
276+
233277
class PanelIndexing(object):
234278

235279
goal_time = 0.2

doc/source/whatsnew/v0.24.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ Removal of prior version deprecations/changes
135135
Performance Improvements
136136
~~~~~~~~~~~~~~~~~~~~~~~~
137137

138+
- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`,
139+
both when indexing by label (using ``.loc``) and by position (using ``.iloc``).
140+
Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
138141
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
139142
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
140143
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)

pandas/core/arrays/categorical.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -2009,8 +2009,7 @@ def __getitem__(self, key):
20092009
return self.categories[i]
20102010
else:
20112011
return self._constructor(values=self._codes[key],
2012-
categories=self.categories,
2013-
ordered=self.ordered, fastpath=True)
2012+
dtype=self.dtype, fastpath=True)
20142013

20152014
def __setitem__(self, key, value):
20162015
""" Item assignment.

pandas/core/dtypes/dtypes.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -184,18 +184,20 @@ def __eq__(self, other):
184184
"""
185185
Rules for CDT equality:
186186
1) Any CDT is equal to the string 'category'
187-
2) Any CDT is equal to a CDT with categories=None regardless of ordered
188-
3) A CDT with ordered=True is only equal to another CDT with
187+
2) Any CDT is equal to itself
188+
3) Any CDT is equal to a CDT with categories=None regardless of ordered
189+
4) A CDT with ordered=True is only equal to another CDT with
189190
ordered=True and identical categories in the same order
190-
4) A CDT with ordered={False, None} is only equal to another CDT with
191+
5) A CDT with ordered={False, None} is only equal to another CDT with
191192
ordered={False, None} and identical categories, but same order is
192193
not required. There is no distinction between False/None.
193-
5) Any other comparison returns False
194+
6) Any other comparison returns False
194195
"""
195196
if isinstance(other, compat.string_types):
196197
return other == self.name
197-
198-
if not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
198+
elif other is self:
199+
return True
200+
elif not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
199201
return False
200202
elif self.categories is None or other.categories is None:
201203
# We're forced into a suboptimal corner thanks to math and
@@ -348,6 +350,8 @@ def update_dtype(self, dtype):
348350
msg = ('a CategoricalDtype must be passed to perform an update, '
349351
'got {dtype!r}').format(dtype=dtype)
350352
raise ValueError(msg)
353+
elif dtype.categories is not None and dtype.ordered is self.ordered:
354+
return dtype
351355

352356
# dtype is CDT: keep current categories/ordered if None
353357
new_categories = dtype.categories

pandas/core/indexes/category.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def _create_categorical(cls, data, categories=None, ordered=None,
169169
data = data.set_categories(categories, ordered=ordered)
170170
elif ordered is not None and ordered != data.ordered:
171171
data = data.set_ordered(ordered)
172-
if isinstance(dtype, CategoricalDtype):
172+
if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
173173
# we want to silently ignore dtype='category'
174174
data = data._set_dtype(dtype)
175175
return data
@@ -236,7 +236,7 @@ def _is_dtype_compat(self, other):
236236
if not is_list_like(values):
237237
values = [values]
238238
other = CategoricalIndex(self._create_categorical(
239-
other, categories=self.categories, ordered=self.ordered))
239+
other, dtype=self.dtype))
240240
if not other.isin(values).all():
241241
raise TypeError("cannot append a non-category item to a "
242242
"CategoricalIndex")
@@ -798,8 +798,7 @@ def _evaluate_compare(self, other):
798798
other = other._values
799799
elif isinstance(other, Index):
800800
other = self._create_categorical(
801-
other._values, categories=self.categories,
802-
ordered=self.ordered)
801+
other._values, dtype=self.dtype)
803802

804803
if isinstance(other, (ABCCategorical, np.ndarray,
805804
ABCSeries)):

0 commit comments

Comments
 (0)