diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 73e3933122628..2a7717378c280 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -210,3 +210,39 @@ def time_categorical_index_contains(self):
 
     def time_categorical_contains(self):
         self.key in self.c
+
+
+class CategoricalSlicing(object):
+
+    goal_time = 0.2
+    params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
+    param_names = ['index']
+
+    def setup(self, index):
+        N = 10**6
+        values = list('a' * N + 'b' * N + 'c' * N)
+        # thunks: only the Categorical for the requested param gets built
+        indices = {
+            'monotonic_incr': lambda: pd.Categorical(values),
+            'monotonic_decr': lambda: pd.Categorical(values[::-1]),
+            'non_monotonic': lambda: pd.Categorical(list('abc' * N))}
+        self.data = indices[index]()
+
+        self.scalar = 10000
+        self.list = list(range(10000))
+        self.cat_scalar = 'b'
+
+    def time_getitem_scalar(self, index):
+        self.data[self.scalar]
+
+    def time_getitem_slice(self, index):
+        self.data[:self.scalar]
+
+    def time_getitem_list_like(self, index):
+        self.data[[self.scalar]]
+
+    def time_getitem_list(self, index):
+        self.data[self.list]
+
+    def time_getitem_bool_array(self, index):
+        self.data[self.data == self.cat_scalar]
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 77e013e1e4fb0..739ad6a3d278b 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -3,7 +3,8 @@
 import numpy as np
 import pandas.util.testing as tm
 from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index,
-                    IntervalIndex, IndexSlice, concat, date_range)
+                    IntervalIndex, CategoricalIndex,
+                    IndexSlice, concat, date_range)
 
 from .pandas_vb_common import setup, Panel  # noqa
 
@@ -230,6 +231,50 @@ def time_loc_list(self, monotonic):
         monotonic.loc[80000:]
 
 
+class CategoricalIndexIndexing(object):
+
+    goal_time = 0.2
+    params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
+    param_names = ['index']
+
+    def setup(self, index):
+        N = 10**5
+        values = list('a' * N + 'b' * N + 'c' * N)
+        # thunks: only the index for the requested param gets built
+        indices = {
+            'monotonic_incr': lambda: CategoricalIndex(values),
+            'monotonic_decr': lambda: CategoricalIndex(values[::-1]),
+            'non_monotonic': lambda: CategoricalIndex(list('abc' * N))}
+        self.data = indices[index]()
+
+        self.int_scalar = 10000
+        self.int_list = list(range(10000))
+
+        self.cat_scalar = 'b'
+        self.cat_list = ['a', 'c']
+
+    def time_getitem_scalar(self, index):
+        self.data[self.int_scalar]
+
+    def time_getitem_slice(self, index):
+        self.data[:self.int_scalar]
+
+    def time_getitem_list_like(self, index):
+        self.data[[self.int_scalar]]
+
+    def time_getitem_list(self, index):
+        self.data[self.int_list]
+
+    def time_getitem_bool_array(self, index):
+        self.data[self.data == self.cat_scalar]
+
+    def time_get_loc_scalar(self, index):
+        self.data.get_loc(self.cat_scalar)
+
+    def time_get_indexer_list(self, index):
+        self.data.get_indexer(self.cat_list)
+
+
 class PanelIndexing(object):
 
     goal_time = 0.2
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 406ca9ba045c9..14d9c03b477ed 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -134,6 +134,9 @@ Removal of prior version deprecations/changes
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
+- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`,
+  both when indexing by label (using ``.loc``) and by position (using ``.iloc``).
+  Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
 - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
 - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`,:issue:`21606`)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 7b3cce0f2585d..0252b5b52ae94 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2009,8 +2009,7 @@ def __getitem__(self, key):
                 return self.categories[i]
         else:
             return self._constructor(values=self._codes[key],
-                                     categories=self.categories,
-                                     ordered=self.ordered, fastpath=True)
+                                     dtype=self.dtype, fastpath=True)
 
     def __setitem__(self, key, value):
         """ Item assignment.
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 708f54f5ca75b..1e762c2be92a6 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -184,18 +184,20 @@ def __eq__(self, other):
         """
         Rules for CDT equality:
         1) Any CDT is equal to the string 'category'
-        2) Any CDT is equal to a CDT with categories=None regardless of ordered
-        3) A CDT with ordered=True is only equal to another CDT with
+        2) Any CDT is equal to itself
+        3) Any CDT is equal to a CDT with categories=None regardless of ordered
+        4) A CDT with ordered=True is only equal to another CDT with
            ordered=True and identical categories in the same order
-        4) A CDT with ordered={False, None} is only equal to another CDT with
+        5) A CDT with ordered={False, None} is only equal to another CDT with
            ordered={False, None} and identical categories, but same order is
            not required. There is no distinction between False/None.
-        5) Any other comparison returns False
+        6) Any other comparison returns False
         """
         if isinstance(other, compat.string_types):
             return other == self.name
-
-        if not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
+        elif other is self:
+            return True
+        elif not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
             return False
         elif self.categories is None or other.categories is None:
             # We're forced into a suboptimal corner thanks to math and
@@ -348,6 +350,8 @@ def update_dtype(self, dtype):
             msg = ('a CategoricalDtype must be passed to perform an update, '
                    'got {dtype!r}').format(dtype=dtype)
             raise ValueError(msg)
+        elif dtype.categories is not None and dtype.ordered is self.ordered:
+            return dtype
 
         # dtype is CDT: keep current categories/ordered if None
         new_categories = dtype.categories
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index a2efe2c49c747..8472d5fd49bd9 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -169,7 +169,7 @@ def _create_categorical(cls, data, categories=None, ordered=None,
                 data = data.set_categories(categories, ordered=ordered)
             elif ordered is not None and ordered != data.ordered:
                 data = data.set_ordered(ordered)
-            if isinstance(dtype, CategoricalDtype):
+            if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
                 # we want to silently ignore dtype='category'
                 data = data._set_dtype(dtype)
         return data
@@ -236,7 +236,7 @@ def _is_dtype_compat(self, other):
             if not is_list_like(values):
                 values = [values]
             other = CategoricalIndex(self._create_categorical(
-                other, categories=self.categories, ordered=self.ordered))
+                other, dtype=self.dtype))
             if not other.isin(values).all():
                 raise TypeError("cannot append a non-category item to a "
                                 "CategoricalIndex")
@@ -798,8 +798,7 @@ def _evaluate_compare(self, other):
                     other = other._values
                 elif isinstance(other, Index):
                     other = self._create_categorical(
-                        other._values, categories=self.categories,
-                        ordered=self.ordered)
+                        other._values, dtype=self.dtype)
 
                 if isinstance(other, (ABCCategorical, np.ndarray,
                                       ABCSeries)):