pandas-dev · topper-123 · Jul 1, 2018 · Jul 5, 2018 · Jul 24, 2018 · Aug 10, 2018
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -2,7 +2,8 @@
 
 import numpy as np
 import pandas.util.testing as tm
-from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index,
+from pandas import (Series, DataFrame, MultiIndex,
+                    Int64Index, UInt64Index, Float64Index,
                     IntervalIndex, CategoricalIndex,
                     IndexSlice, concat, date_range)
 from .pandas_vb_common import setup, Panel  # noqa
@@ -11,7 +12,7 @@
 class NumericSeriesIndexing(object):
 
     goal_time = 0.2
-    params = [Int64Index, Float64Index]
+    params = [Int64Index, UInt64Index, Float64Index]
     param = ['index']
 
     def setup(self, index):

diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
@@ -0,0 +1,60 @@
+import numpy as np
+
+from pandas._libs.index import (Int64Engine, Int32Engine,
+                                Int16Engine, Int8Engine,
+                                UInt64Engine, UInt32Engine,
+                                UInt16Engine, UInt8Engine,
+                                Float64Engine, Float32Engine,
+                                ObjectEngine,
+                                )
+
+
+class NumericEngineIndexing(object):
+    """Time ``get_loc`` on the numeric engines of ``pandas._libs.index``."""
+    goal_time = 0.2
+    params = [[Int64Engine, Int32Engine, Int16Engine, Int8Engine,
+               UInt64Engine, UInt32Engine, UInt16Engine, UInt8Engine,
+               Float64Engine, Float32Engine,
+               ],
+              ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
+              ]
+    param_names = ['engine', 'index_type']
+
+    def setup(self, engine, index_type):
+        N = 10**5
+        if index_type == 'non_monotonic':
+            values = [1, 2, 3] * N
+        else:
+            # an unknown index_type raises KeyError here
+            step = {'monotonic_incr': 1, 'monotonic_decr': -1}[index_type]
+            values = ([1] * N + [2] * N + [3] * N)[::step]
+        array_ = np.array(values, dtype=engine._dtype)
+        # engines are built from a callable returning the values and a length
+        self.data = engine(lambda: array_, len(array_))
+
+    def time_get_loc(self, engine, index_type):
+        self.data.get_loc(2)  # 2 occurs in every layout
+
+
+class ObjectEngineIndexing(object):
+    """Time ``ObjectEngine.get_loc`` over arrays of 'a', 'b' and 'c'."""
+    goal_time = 0.2
+    params = [[ObjectEngine],
+              ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
+              ]
+    param_names = ['engine', 'index_type']
+
+    def setup(self, engine, index_type):
+        N = 10**5
+        if index_type == 'non_monotonic':
+            values = list('abc') * N
+        else:
+            # an unknown index_type raises KeyError here
+            step = {'monotonic_incr': 1, 'monotonic_decr': -1}[index_type]
+            values = list('a' * N + 'b' * N + 'c' * N)[::step]
+        array_ = np.array(values, dtype=engine._dtype)
+        # engines are built from a callable returning the values and a length
+        self.data = engine(lambda: array_, len(array_))
+
+    def time_get_loc(self, engine, index_type):
+        self.data.get_loc('b')  # 'b' occurs in every layout
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -591,9 +591,10 @@ Removal of prior version deprecations/changes
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`,
-  both when indexing by label (using .loc) and position(.iloc).
-  Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
+- Slicing Series and DataFrames with a monotonically increasing :class:`CategoricalIndex`
+  is now very fast and has speed comparable to slicing with an ``Int64Index``.
+  The speed increase applies both when indexing by label (using .loc) and by position (using .iloc) (:issue:`20395`)
+- Slicing a ``CategoricalIndex`` itself (i.e. ``ci[1000:2000]``) shows similar speed improvements as above (:issue:`21659`)
 - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
 - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -10,11 +10,13 @@ from libc.math cimport fabs, sqrt
 import numpy as np
 cimport numpy as cnp
 from numpy cimport (ndarray,
-                    NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8,
-                    NPY_FLOAT32, NPY_FLOAT64,
+                    NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8,
+                    NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8,
+                    NPY_FLOAT64, NPY_FLOAT32,
                     NPY_OBJECT,
-                    int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
-                    uint32_t, uint64_t, float32_t, float64_t,
+                    int64_t, int32_t, int16_t, int8_t,
+                    uint64_t, uint32_t, uint16_t, uint8_t,
+                    float64_t, float32_t,
                     double_t)
 cnp.import_array()
 
@@ -359,9 +361,13 @@ ctypedef fused algos_t:
     float64_t
     float32_t
     object
-    int32_t
     int64_t
+    int32_t
+    int16_t
+    int8_t
     uint64_t
+    uint32_t
+    uint16_t
     uint8_t
 
 

diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in
@@ -133,11 +133,14 @@ def ensure_object(object arr):
 # name, c_type, dtype
 dtypes = [('float64', 'FLOAT64', 'float64'),
           ('float32', 'FLOAT32', 'float32'),
-          ('int8', 'INT8', 'int8'),
-          ('int16', 'INT16', 'int16'),
-          ('int32', 'INT32', 'int32'),
           ('int64', 'INT64', 'int64'),
+          ('int32', 'INT32', 'int32'),
+          ('int16', 'INT16', 'int16'),
+          ('int8', 'INT8', 'int8'),
           ('uint64', 'UINT64', 'uint64'),
+          ('uint32', 'UINT32', 'uint32'),
+          ('uint16', 'UINT16', 'uint16'),
+          ('uint8', 'UINT8', 'uint8'),
           # ('platform_int', 'INT', 'int_'),
           # ('object', 'OBJECT', 'object_'),
 ]

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -5,8 +5,11 @@ import cython
 
 import numpy as np
 cimport numpy as cnp
-from numpy cimport (ndarray, float64_t, int32_t,
-                    int64_t, uint8_t, uint64_t, intp_t,
+from numpy cimport (ndarray,
+                    float64_t, float32_t,
+                    int64_t, int32_t, int16_t, int8_t,
+                    uint64_t, uint32_t, uint16_t, uint8_t,
+                    intp_t,
                     # Note: NPY_DATETIME, NPY_TIMEDELTA are only available
                     # for cimport in cython>=0.27.3
                     NPY_DATETIME, NPY_TIMEDELTA)

diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in
@@ -12,18 +12,28 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 
 # name, dtype, ctype
 dtypes = [('Float64', 'float64', 'float64_t'),
-          ('UInt64', 'uint64', 'uint64_t'),
+          ('Float32', 'float32', 'float32_t'),
           ('Int64', 'int64', 'int64_t'),
-          ('Object', 'object', 'object')]
+          ('Int32', 'int32', 'int32_t'),
+          ('Int16', 'int16', 'int16_t'),
+          ('Int8', 'int8', 'int8_t'),
+          ('UInt64', 'uint64', 'uint64_t'),
+          ('UInt32', 'uint32', 'uint32_t'),
+          ('UInt16', 'uint16', 'uint16_t'),
+          ('UInt8', 'uint8', 'uint8_t'),
+          ('Object', 'object', 'object'),
+          ]
 }}
 
 {{for name, dtype, ctype in dtypes}}
 
 
 cdef class {{name}}Engine(IndexEngine):
 
+    _dtype = '{{dtype}}'
+
     def _call_monotonic(self, values):
-        return algos.is_monotonic_{{dtype}}(values, timelike=False)
+        return algos.is_monotonic(values, timelike=False)
 
     def get_backfill_indexer(self, other, limit=None):
         return algos.backfill_{{dtype}}(self._get_index_values(),
@@ -36,10 +46,33 @@ cdef class {{name}}Engine(IndexEngine):
     cdef _make_hash_table(self, n):
         {{if name == 'Object'}}
         return _hash.PyObjectHashTable(n)
+        {{elif name in {'Int8', 'Int16', 'Int32'} }}
+        # {{name}}HashTable is not available, so we use Int64HashTable
+        return _hash.Int64HashTable(n)
+        {{elif name in {'UInt8', 'UInt16', 'UInt32'} }}
+        # {{name}}HashTable is not available, so we use UInt64HashTable
+        return _hash.UInt64HashTable(n)
+        {{elif name in {'Float32'} }}
+        # {{name}}HashTable is not available, so we use Float64HashTable
+        return _hash.Float64HashTable(n)
         {{else}}
         return _hash.{{name}}HashTable(n)
         {{endif}}
 
+    {{if name in {'Int8', 'Int16', 'Int32'} }}
+    cpdef _call_map_locations(self, values):
+        # self.mapping is of type Int64HashTable, so convert dtype of values
+        self.mapping.map_locations(algos.ensure_int64(values))
+    {{elif name in {'UInt8', 'UInt16', 'UInt32'} }}
+    cpdef _call_map_locations(self, values):
+        # self.mapping is of type UInt64HashTable, so convert dtype of values
+        self.mapping.map_locations(algos.ensure_uint64(values))
+    {{elif name in {'Float32'} }}
+    cpdef _call_map_locations(self, values):
+        # self.mapping is of type Float64HashTable, so convert dtype of values
+        self.mapping.map_locations(algos.ensure_float64(values))
+    {{endif}}
+
     {{if name != 'Float64' and name != 'Object'}}
     cdef _check_type(self, object val):
         hash(val)
@@ -60,7 +93,7 @@ cdef class {{name}}Engine(IndexEngine):
             ndarray[{{ctype}}] values
             int count = 0
 
-        {{if name != 'Float64'}}
+        {{if name not in {'Float64', 'Float32'} }}
         if not util.is_integer_object(val):
             raise KeyError(val)
         {{endif}}

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -102,14 +102,29 @@ def ip():
 
 
 @pytest.fixture(params=[True, False, None])
-def observed(request):
+def _true_false_none(request):
+    """
+    Base fixture for fixtures that return True, False and None.
+    """
+    return request.param
+
+
+@pytest.fixture
+def observed(_true_false_none):
     """ pass in the observed keyword to groupby for [True, False]
     This indicates whether categoricals should return values for
     values which are not in the grouper [False / None], or only values which
     appear in the grouper [True]. [None] is supported for future compatiblity
     if we decide to change the default (and would need to warn if this
     parameter is not passed)"""
-    return request.param
+    return _true_false_none
+
+
+@pytest.fixture
+def ordered(_true_false_none):
+    """Return a valid Categorical/CategoricalIndex ``ordered`` value; tests
+    using this fixture run once each for True, False and None."""
+    return _true_false_none
 
 
 _all_arithmetic_operators = ['__add__', '__radd__',

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -3085,6 +3085,10 @@ def _get_unique_index(self, dropna=False):
         -------
         loc : int if unique index, slice if monotonic index, else mask
 
+        Raises
+        ------
+        KeyError : If key is not in self
+
         Examples
         ---------
         >>> unique_index = pd.Index(list('abc'))

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -83,7 +83,13 @@ class CategoricalIndex(Index, accessor.PandasDelegate):
     """
 
     _typ = 'categoricalindex'
-    _engine_type = libindex.Int64Engine
+
+    @property
+    def _engine_type(self):
+        """Return the libindex engine class matching ``self.codes.dtype``."""
+        # codes may be int8, int16, int32 or int64 -> Int8Engine, etc.
+        dtype_name = self.codes.dtype.name
+        return getattr(libindex, dtype_name.capitalize() + "Engine")
     _attributes = ['name']
 
     def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
@@ -377,7 +383,7 @@ def argsort(self, *args, **kwargs):
     def _engine(self):
 
         # we are going to look things up with the codes themselves
-        return self._engine_type(lambda: self.codes.astype('i8'), len(self))
+        return self._engine_type(lambda: self.codes, len(self))
 
     # introspection
     @cache_readonly
@@ -426,6 +432,10 @@ def get_loc(self, key, method=None):
         -------
         loc : int if unique index, slice if monotonic index, else mask
 
+        Raises
+        ------
+        KeyError : If key is not in self
+
         Examples
         ---------
         >>> unique_index = pd.CategoricalIndex(list('abc'))
@@ -440,10 +450,12 @@ def get_loc(self, key, method=None):
         >>> non_monotonic_index.get_loc('b')
         array([False,  True, False,  True], dtype=bool)
         """
-        codes = self.categories.get_loc(key)
-        if (codes == -1):
-            raise KeyError(key)
-        return self._engine.get_loc(codes)
+        code = self.categories.get_loc(key)
+
+        # dtype must be same as dtype for self.codes else searchsorted is slow
+        code = self.codes.dtype.type(code)
+
+        return self._engine.get_loc(code)
 
     def get_value(self, series, key):
         """

diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py
@@ -4,6 +4,7 @@
 
 import pandas.util.testing as tm
 from pandas.core.indexes.api import Index, MultiIndex
+from pandas._libs import index as li
 from pandas.compat import lzip, long
 
 
@@ -45,3 +46,15 @@ def zero(request):
     # For testing division by (or of) zero for Index with length 5, this
     # gives several scalar-zeros and length-5 vector-zeros
     return request.param
+
+
+@pytest.fixture(
+    params=[
+        'Int64', 'Int32', 'Int16', 'Int8',
+        'UInt64', 'UInt32', 'UInt16', 'UInt8',
+        'Float64', 'Float32',
+    ])
+def numeric_indexing_engine(request):
+    """Return the numeric engine class ``<param>Engine`` (``Int64Engine``,
+    etc.), looked up by name in ``pandas._libs.index``."""
+    return getattr(li, request.param + "Engine")
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
@@ -14,6 +14,7 @@
 from pandas import Categorical, IntervalIndex, compat
 from pandas.util.testing import assert_almost_equal
 import pandas.core.config as cf
+from pandas._libs import index as libindex
 import pandas as pd
 
 if PY3:
@@ -1117,3 +1118,33 @@ def test_take_invalid_kwargs(self):
         msg = "the 'mode' parameter is not supported"
         tm.assert_raises_regex(ValueError, msg, idx.take,
                                indices, mode='clip')
+
+
+class TestCategoricalIndexEngine(object):
+    """Check which indexing engine a CategoricalIndex uses for its codes."""
+
+    def setup_method(self):
+        # number of categories used to obtain each (non-int64) codes dtype
+        self.n_categories = {np.int8: 1, np.int16: 129, np.int32: 32769}
+
+        # engine class expected for each codes dtype
+        self.engines = {
+            np.int8: libindex.Int8Engine,
+            np.int16: libindex.Int16Engine,
+            np.int32: libindex.Int32Engine,
+            np.int64: libindex.Int64Engine,
+        }
+
+    @pytest.mark.parametrize('dtype', [np.int8, np.int16, np.int32, np.int64])
+    def test_engine_type(self, dtype):
+        """The engine type of a CategoricalIndex matches its codes dtype."""
+        if dtype == np.int64:
+            # an index with enough categories to need int64 codes is too
+            # memory-intensive to build, so set the codes dtype manually
+            index = CategoricalIndex(['a', 'b', 'c'])
+            index._values._codes = index._values._codes.astype('int64')
+        else:
+            index = CategoricalIndex(np.arange(self.n_categories[dtype]))
+
+        assert isinstance(index._engine, self.engines[dtype])
+        assert index.codes.dtype.type == dtype