PERF: avoid unneeded recoding of categoricals and reuse CategoricalDtypes for greater slicing speed (#21659)

topper-123 · jreback · commit c45bb0b5ae3b · 2018-06-28T20:36:05.000-04:00
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -210,3 +210,38 @@ def time_categorical_index_contains(self):
 
     def time_categorical_contains(self):
         self.key in self.c
+
+
+class CategoricalSlicing(object):
+    """Time ``__getitem__`` on a ``pd.Categorical`` for several key types."""
+    goal_time = 0.2
+    params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
+    param_names = ['index']
+
+    def setup(self, index):
+        N = 10**6  # repeats per letter; each Categorical holds 3 * N values
+        values = list('a' * N + 'b' * N + 'c' * N)
+        indices = {
+            'monotonic_incr': pd.Categorical(values),
+            'monotonic_decr': pd.Categorical(reversed(values)),
+            'non_monotonic': pd.Categorical(list('abc' * N))}
+        self.data = indices[index]  # keep only the parametrized variant
+
+        self.scalar = 10000  # integer position used as key / slice stop
+        self.list = list(range(10000))  # 10000 integer positions
+        self.cat_scalar = 'b'  # category label compared in the bool mask
+
+    def time_getitem_scalar(self, index):
+        self.data[self.scalar]  # single integer key
+
+    def time_getitem_slice(self, index):
+        self.data[:self.scalar]  # slice key
+
+    def time_getitem_list_like(self, index):
+        self.data[[self.scalar]]  # one-element list key
+
+    def time_getitem_list(self, index):
+        self.data[self.list]  # list of integer positions
+
+    def time_getitem_bool_array(self, index):
+        self.data[self.data == self.cat_scalar]  # mask built inside timing
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -3,7 +3,8 @@
 import numpy as np
 import pandas.util.testing as tm
 from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index,
-                    IntervalIndex, IndexSlice, concat, date_range)
+                    IntervalIndex, CategoricalIndex,
+                    IndexSlice, concat, date_range)
 from .pandas_vb_common import setup, Panel  # noqa
 
 
@@ -230,6 +231,49 @@ def time_loc_list(self, monotonic):
         monotonic.loc[80000:]
 
 
+class CategoricalIndexIndexing(object):
+    """Time ``__getitem__`` and label lookups on a ``CategoricalIndex``."""
+    goal_time = 0.2
+    params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
+    param_names = ['index']
+
+    def setup(self, index):
+        N = 10**5  # repeats per letter; each index holds 3 * N values
+        values = list('a' * N + 'b' * N + 'c' * N)
+        indices = {
+            'monotonic_incr': CategoricalIndex(values),
+            'monotonic_decr': CategoricalIndex(reversed(values)),
+            'non_monotonic': CategoricalIndex(list('abc' * N))}
+        self.data = indices[index]  # keep only the parametrized variant
+
+        self.int_scalar = 10000  # integer position used as key / slice stop
+        self.int_list = list(range(10000))  # 10000 integer positions
+
+        self.cat_scalar = 'b'  # label for get_loc and the bool mask
+        self.cat_list = ['a', 'c']  # labels passed to get_indexer
+
+    def time_getitem_scalar(self, index):
+        self.data[self.int_scalar]  # single integer key
+
+    def time_getitem_slice(self, index):
+        self.data[:self.int_scalar]  # slice key
+
+    def time_getitem_list_like(self, index):
+        self.data[[self.int_scalar]]  # one-element list key
+
+    def time_getitem_list(self, index):
+        self.data[self.int_list]  # list of integer positions
+
+    def time_getitem_bool_array(self, index):
+        self.data[self.data == self.cat_scalar]  # mask built inside timing
+
+    def time_get_loc_scalar(self, index):
+        self.data.get_loc(self.cat_scalar)  # lookup of one label
+
+    def time_get_indexer_list(self, index):
+        self.data.get_indexer(self.cat_list)  # lookup of a list of labels
+
+
 class PanelIndexing(object):
 
     goal_time = 0.2
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -135,6 +135,9 @@ Removal of prior version deprecations/changes
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
+- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`,
+  both when indexing by label (using ``.loc``) and by position (using ``.iloc``).
+  Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
 - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
 - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`,:issue:`21606`)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2009,8 +2009,7 @@ def __getitem__(self, key):
                 return self.categories[i]
         else:
             return self._constructor(values=self._codes[key],
-                                     categories=self.categories,
-                                     ordered=self.ordered, fastpath=True)
+                                     dtype=self.dtype, fastpath=True)
 
     def __setitem__(self, key, value):
         """ Item assignment.
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -184,18 +184,20 @@ def __eq__(self, other):
         """
         Rules for CDT equality:
         1) Any CDT is equal to the string 'category'
-        2) Any CDT is equal to a CDT with categories=None regardless of ordered
-        3) A CDT with ordered=True is only equal to another CDT with
+        2) Any CDT is equal to itself
+        3) Any CDT is equal to a CDT with categories=None regardless of ordered
+        4) A CDT with ordered=True is only equal to another CDT with
            ordered=True and identical categories in the same order
-        4) A CDT with ordered={False, None} is only equal to another CDT with
+        5) A CDT with ordered={False, None} is only equal to another CDT with
            ordered={False, None} and identical categories, but same order is
            not required. There is no distinction between False/None.
-        5) Any other comparison returns False
+        6) Any other comparison returns False
         """
         if isinstance(other, compat.string_types):
             return other == self.name
-
-        if not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
+        elif other is self:
+            return True
+        elif not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
             return False
         elif self.categories is None or other.categories is None:
             # We're forced into a suboptimal corner thanks to math and
@@ -348,6 +350,8 @@ def update_dtype(self, dtype):
             msg = ('a CategoricalDtype must be passed to perform an update, '
                    'got {dtype!r}').format(dtype=dtype)
             raise ValueError(msg)
+        elif dtype.categories is not None and dtype.ordered is self.ordered:
+            return dtype
 
         # dtype is CDT: keep current categories/ordered if None
         new_categories = dtype.categories
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -169,7 +169,7 @@ def _create_categorical(cls, data, categories=None, ordered=None,
                 data = data.set_categories(categories, ordered=ordered)
             elif ordered is not None and ordered != data.ordered:
                 data = data.set_ordered(ordered)
-            if isinstance(dtype, CategoricalDtype):
+            if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
                 # we want to silently ignore dtype='category'
                 data = data._set_dtype(dtype)
         return data
@@ -236,7 +236,7 @@ def _is_dtype_compat(self, other):
             if not is_list_like(values):
                 values = [values]
             other = CategoricalIndex(self._create_categorical(
-                other, categories=self.categories, ordered=self.ordered))
+                other, dtype=self.dtype))
             if not other.isin(values).all():
                 raise TypeError("cannot append a non-category item to a "
                                 "CategoricalIndex")
@@ -798,8 +798,7 @@ def _evaluate_compare(self, other):
                     other = other._values
                 elif isinstance(other, Index):
                     other = self._create_categorical(
-                        other._values, categories=self.categories,
-                        ordered=self.ordered)
+                        other._values, dtype=self.dtype)
 
                 if isinstance(other, (ABCCategorical, np.ndarray,
                                       ABCSeries)):