diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 739ad6a3d278b..5af431dd9f3ae 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -2,7 +2,8 @@ import numpy as np import pandas.util.testing as tm -from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index, +from pandas import (Series, DataFrame, MultiIndex, + Int64Index, UInt64Index, Float64Index, IntervalIndex, CategoricalIndex, IndexSlice, concat, date_range) from .pandas_vb_common import setup, Panel # noqa @@ -11,7 +12,7 @@ class NumericSeriesIndexing(object): goal_time = 0.2 - params = [Int64Index, Float64Index] + params = [Int64Index, UInt64Index, Float64Index] param = ['index'] def setup(self, index): diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py new file mode 100644 index 0000000000000..b01639ca3580d --- /dev/null +++ b/asv_bench/benchmarks/indexing_engines.py @@ -0,0 +1,60 @@ +import numpy as np + +from pandas._libs.index import (Int64Engine, Int32Engine, + Int16Engine, Int8Engine, + UInt64Engine, UInt32Engine, + UInt16Engine, UInt8Engine, + Float64Engine, Float32Engine, + ObjectEngine, + ) + + +class NumericEngineIndexing(object): + + goal_time = 0.2 + params = [[Int64Engine, Int32Engine, Int16Engine, Int8Engine, + UInt64Engine, UInt32Engine, UInt16Engine, UInt8Engine, + Float64Engine, Float32Engine, + ], + ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] + ] + param_names = ['engine', 'index_type'] + + def setup(self, engine, index_type): + N = 10**5 + values = list([1] * N + [2] * N + [3] * N) + array_ = { + 'monotonic_incr': np.array(values, dtype=engine._dtype), + 'monotonic_decr': np.array(list(reversed(values)), + dtype=engine._dtype), + 'non_monotonic': np.array([1, 2, 3] * N, dtype=engine._dtype), + }[index_type] + + self.data = engine(lambda: array_, len(array_)) + + def time_get_loc(self, engine, index_type): + self.data.get_loc(2) + + +class 
ObjectEngineIndexing(object): + + goal_time = 0.2 + params = [[ObjectEngine], + ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] + ] + param_names = ['engine', 'index_type'] + + def setup(self, engine, index_type): + N = 10**5 + values = list('a' * N + 'b' * N + 'c' * N) + array_ = { + 'monotonic_incr': np.array(values, dtype=engine._dtype), + 'monotonic_decr': np.array(list(reversed(values)), + dtype=engine._dtype), + 'non_monotonic': np.array(list('abc') * N, dtype=engine._dtype), + }[index_type] + + self.data = engine(lambda: array_, len(array_)) + + def time_get_loc(self, engine, index_type): + self.data.get_loc('b') diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 9d559acfa59e7..280b622445e72 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -591,9 +591,10 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, - both when indexing by label (using .loc) and position(.iloc). - Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`) +- Slicing Series and DataFrame with a monotonically increasing :class:`CategoricalIndex` + is now very fast and has speed comparable to slicing with an ``Int64Index``. + The speed increase is both when indexing by label (using .loc) and position (.iloc) (:issue:`20395`) +- Slicing a ``CategoricalIndex`` itself (i.e. 
``ci[1000:2000]``) shows similar speed improvements as above (:issue:`21659`) - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d2914dc8ac751..c211f2b6aceb9 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -10,11 +10,13 @@ from libc.math cimport fabs, sqrt import numpy as np cimport numpy as cnp from numpy cimport (ndarray, - NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8, - NPY_FLOAT32, NPY_FLOAT64, + NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8, + NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8, + NPY_FLOAT64, NPY_FLOAT32, NPY_OBJECT, - int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t, + int64_t, int32_t, int16_t, int8_t, + uint64_t, uint32_t, uint16_t, uint8_t, + float64_t, float32_t, double_t) cnp.import_array() @@ -359,9 +361,13 @@ ctypedef fused algos_t: float64_t float32_t object - int32_t int64_t + int32_t + int16_t + int8_t uint64_t + uint32_t + uint16_t uint8_t diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 40b1b1a282670..544ebc9cb26e2 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -133,11 +133,14 @@ def ensure_object(object arr): # name, c_type, dtype dtypes = [('float64', 'FLOAT64', 'float64'), ('float32', 'FLOAT32', 'float32'), - ('int8', 'INT8', 'int8'), - ('int16', 'INT16', 'int16'), - ('int32', 'INT32', 'int32'), ('int64', 'INT64', 'int64'), + ('int32', 'INT32', 'int32'), + ('int16', 'INT16', 'int16'), + ('int8', 'INT8', 'int8'), ('uint64', 'UINT64', 'uint64'), + ('uint32', 'UINT32', 'uint32'), + ('uint16', 'UINT16', 
'uint16'), + ('uint8', 'UINT8', 'uint8'), # ('platform_int', 'INT', 'int_'), # ('object', 'OBJECT', 'object_'), ] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 562c1ba218141..c86b5fbba772f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -5,8 +5,11 @@ import cython import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, float64_t, int32_t, - int64_t, uint8_t, uint64_t, intp_t, +from numpy cimport (ndarray, + float64_t, float32_t, + int64_t, int32_t, int16_t, int8_t, + uint64_t, uint32_t, uint16_t, uint8_t, + intp_t, # Note: NPY_DATETIME, NPY_TIMEDELTA are only available # for cimport in cython>=0.27.3 NPY_DATETIME, NPY_TIMEDELTA) diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 4ea35da0626f3..1d5915e5b9afd 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -12,9 +12,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # name, dtype, ctype dtypes = [('Float64', 'float64', 'float64_t'), - ('UInt64', 'uint64', 'uint64_t'), + ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), - ('Object', 'object', 'object')] + ('Int32', 'int32', 'int32_t'), + ('Int16', 'int16', 'int16_t'), + ('Int8', 'int8', 'int8_t'), + ('UInt64', 'uint64', 'uint64_t'), + ('UInt32', 'uint32', 'uint32_t'), + ('UInt16', 'uint16', 'uint16_t'), + ('UInt8', 'uint8', 'uint8_t'), + ('Object', 'object', 'object'), + ] }} {{for name, dtype, ctype in dtypes}} @@ -22,8 +30,10 @@ dtypes = [('Float64', 'float64', 'float64_t'), cdef class {{name}}Engine(IndexEngine): + _dtype = '{{dtype}}' + def _call_monotonic(self, values): - return algos.is_monotonic_{{dtype}}(values, timelike=False) + return algos.is_monotonic(values, timelike=False) def get_backfill_indexer(self, other, limit=None): return algos.backfill_{{dtype}}(self._get_index_values(), @@ -36,10 +46,33 @@ cdef class {{name}}Engine(IndexEngine): cdef 
_make_hash_table(self, n): {{if name == 'Object'}} return _hash.PyObjectHashTable(n) + {{elif name in {'Int8', 'Int16', 'Int32'} }} + # {{name}}HashTable is not available, so we use Int64HashTable + return _hash.Int64HashTable(n) + {{elif name in {'UInt8', 'UInt16', 'UInt32'} }} + # {{name}}HashTable is not available, so we use UInt64HashTable + return _hash.UInt64HashTable(n) + {{elif name in {'Float32'} }} + # {{name}}HashTable is not available, so we use Float64HashTable + return _hash.Float64HashTable(n) {{else}} return _hash.{{name}}HashTable(n) {{endif}} + {{if name in {'Int8', 'Int16', 'Int32'} }} + cpdef _call_map_locations(self, values): + # self.mapping is of type Int64HashTable, so convert dtype of values + self.mapping.map_locations(algos.ensure_int64(values)) + {{elif name in {'UInt8', 'UInt16', 'UInt32'} }} + cpdef _call_map_locations(self, values): + # self.mapping is of type UInt64HashTable, so convert dtype of values + self.mapping.map_locations(algos.ensure_uint64(values)) + {{elif name in {'Float32'} }} + cpdef _call_map_locations(self, values): + # self.mapping is of type Float64HashTable, so convert dtype of values + self.mapping.map_locations(algos.ensure_float64(values)) + {{endif}} + {{if name != 'Float64' and name != 'Object'}} cdef _check_type(self, object val): hash(val) @@ -60,7 +93,7 @@ cdef class {{name}}Engine(IndexEngine): ndarray[{{ctype}}] values int count = 0 - {{if name != 'Float64'}} + {{if name not in {'Float64', 'Float32'} }} if not util.is_integer_object(val): raise KeyError(val) {{endif}} diff --git a/pandas/conftest.py b/pandas/conftest.py index 621de3ffd4b12..5671facdd0180 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -102,14 +102,29 @@ def ip(): @pytest.fixture(params=[True, False, None]) -def observed(request): +def _true_false_none(request): + """ + Base fixture for fixtures that return True, False and None. 
+ """ + return request.param + + +@pytest.fixture +def observed(_true_false_none): """ pass in the observed keyword to groupby for [True, False] This indicates whether categoricals should return values for values which are not in the grouper [False / None], or only values which appear in the grouper [True]. [None] is supported for future compatiblity if we decide to change the default (and would need to warn if this parameter is not passed)""" - return request.param + return _true_false_none + + +@pytest.fixture +def ordered(_true_false_none): + """Return the allowed parameters for Categorical/CategoricalIndex.ordered. + """ + return _true_false_none _all_arithmetic_operators = ['__add__', '__radd__', diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b42bbdafcab45..12a8b36d7ef48 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3085,6 +3085,10 @@ def _get_unique_index(self, dropna=False): ------- loc : int if unique index, slice if monotonic index, else mask + Raises + ------ + KeyError : If key is not in self + Examples --------- >>> unique_index = pd.Index(list('abc')) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 45703c220a4be..e91de21147833 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -83,7 +83,13 @@ class CategoricalIndex(Index, accessor.PandasDelegate): """ _typ = 'categoricalindex' - _engine_type = libindex.Int64Engine + + @property + def _engine_type(self): + # self.codes can have dtype int8, int16, int32 or int64, so we need + # to return the corresponding engine type (libindex.Int8Engine, etc.). 
+ engine_name = "{}Engine".format(self.codes.dtype.name.capitalize()) + return getattr(libindex, engine_name) _attributes = ['name'] def __new__(cls, data=None, categories=None, ordered=None, dtype=None, @@ -377,7 +383,7 @@ def argsort(self, *args, **kwargs): def _engine(self): # we are going to look things up with the codes themselves - return self._engine_type(lambda: self.codes.astype('i8'), len(self)) + return self._engine_type(lambda: self.codes, len(self)) # introspection @cache_readonly @@ -426,6 +432,10 @@ def get_loc(self, key, method=None): ------- loc : int if unique index, slice if monotonic index, else mask + Raises + ------ + KeyError : If key is not in self + Examples --------- >>> unique_index = pd.CategoricalIndex(list('abc')) @@ -440,10 +450,12 @@ def get_loc(self, key, method=None): >>> non_monotonic_index.get_loc('b') array([False, True, False, True], dtype=bool) """ - codes = self.categories.get_loc(key) - if (codes == -1): - raise KeyError(key) - return self._engine.get_loc(codes) + code = self.categories.get_loc(key) + + # dtype must be same as dtype for self.codes else searchsorted is slow + code = self.codes.dtype.type(code) + + return self._engine.get_loc(code) def get_value(self, series, key): """ diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 6d88ef0cfa6c5..8e889ed795408 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -4,6 +4,7 @@ import pandas.util.testing as tm from pandas.core.indexes.api import Index, MultiIndex +from pandas._libs import index as li from pandas.compat import lzip, long @@ -45,3 +46,15 @@ def zero(request): # For testing division by (or of) zero for Index with length 5, this # gives several scalar-zeros and length-5 vector-zeros return request.param + + +@pytest.fixture( + params=[ + 'Int64', 'Int32', 'Int16', 'Int8', + 'UInt64', 'UInt32', 'UInt16', 'UInt8', + 'Float64', 'Float32', + ]) +def numeric_indexing_engine(request): + """Return 
the various numeric indexing engines in pd._libs.index + """ + return getattr(li, "{}Engine".format(request.param)) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d49a6a6abc7c9..64fad2a87c364 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -14,6 +14,7 @@ from pandas import Categorical, IntervalIndex, compat from pandas.util.testing import assert_almost_equal import pandas.core.config as cf +from pandas._libs import index as libindex import pandas as pd if PY3: @@ -1117,3 +1118,33 @@ def test_take_invalid_kwargs(self): msg = "the 'mode' parameter is not supported" tm.assert_raises_regex(ValueError, msg, idx.take, indices, mode='clip') + + +class TestCategoricalIndexEngine(object): + + def setup_method(self): + self.n_categories = {np.int8: 1, np.int16: 129, np.int32: 32769} + + self.engines = {np.int8: libindex.Int8Engine, + np.int16: libindex.Int16Engine, + np.int32: libindex.Int32Engine, + np.int64: libindex.Int64Engine} + + @pytest.mark.parametrize('dtype', [np.int8, np.int16, np.int32, np.int64]) + def test_engine_type(self, dtype): + """ + Check that a CategoricalIndex has the correct engine type. 
+ """ + if dtype != np.int64: + n_categories = self.n_categories[dtype] + index = CategoricalIndex(np.arange(n_categories)) + else: + # having actual (2 ** 32) + 1 distinct categories is too + # memory-intensive, so we set codes.dtype manually + index = CategoricalIndex(['a', 'b', 'c']) + index._values._codes = index._values._codes.astype('int64') + + engine = self.engines[dtype] + + assert isinstance(index._engine, engine) + assert index.codes.dtype.type == dtype diff --git a/pandas/tests/indexes/test_engine.py b/pandas/tests/indexes/test_engine.py new file mode 100644 index 0000000000000..75a587a6c6a64 --- /dev/null +++ b/pandas/tests/indexes/test_engine.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pytest + +import pandas as pd +from pandas._libs.index import (Int64Engine, UInt64Engine, + Float64Engine, ObjectEngine) + + +class TestNumericEngine(object): + + def setup_class(cls): + cls.data = [1, 2, 3] + + def test_engine_type(self, numeric_indexing_engine): + index = pd.Index(self.data, dtype=numeric_indexing_engine._dtype) + if issubclass(index.dtype.type, np.signedinteger): + assert isinstance(index._engine, Int64Engine) + elif issubclass(index.dtype.type, np.unsignedinteger): + assert isinstance(index._engine, UInt64Engine) + elif issubclass(index.dtype.type, np.floating): + assert isinstance(index._engine, Float64Engine) + else: + raise TypeError("unexpected dtype {}".format(index.dtype)) + + def test_is_monotonic_ordered(self, numeric_indexing_engine): + codes = np.array(self.data, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) + + assert e.is_monotonic_increasing + assert not e.is_monotonic_decreasing + + # reverse sort order + reversed_data = list(reversed(self.data)) + codes = np.array(reversed_data, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) + + assert not e.is_monotonic_increasing + assert e.is_monotonic_decreasing + + def 
test_is_not_monotonic_ordered(self, numeric_indexing_engine): + data = [1, 0, 2] + codes = np.array(data, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) + + assert not e.is_monotonic_increasing + assert not e.is_monotonic_decreasing + + @pytest.mark.parametrize('values, expected', [ + ([1, 2, 3], True), + ([1, 1, 2], False), + ]) + def test_is_unique(self, values, expected, numeric_indexing_engine): + codes = np.array(values, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) + + assert e.is_unique is expected + + @pytest.mark.parametrize('values, value, expected', [ + ([1, 2, 3], 2, 1), + ([1, 2, 2, 3], 2, slice(1, 3)), + ([3, 2, 2, 1], 2, np.array([False, True, True, False])), + ([1, 2, 2, 1], 2, np.array([False, True, True, False])), + ([1, 3, 2], 2, 2), + ]) + def test_get_loc(self, values, value, expected, numeric_indexing_engine): + codes = np.array(values, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) + result = e.get_loc(value) + + if isinstance(expected, np.ndarray): + assert (result == expected).all() + else: + assert result == expected + + @pytest.mark.parametrize('values, value, error', [ + ([1, 2, 3], 4, KeyError), + ([1, 2, 3], '4', KeyError), + ]) + def test_get_loc_raises(self, values, value, error, + numeric_indexing_engine): + codes = np.array(values, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) + with pytest.raises(error): + e.get_loc(value) + + +class TestObjectEngine(object): + + def setup_class(cls): + cls.data = list('abc') + cls.dtype = object + cls.Engine = ObjectEngine + + def test_engine_type(self): + index = pd.Index(self.data) + assert isinstance(index._engine, self.Engine) + + def test_is_monotonic_ordered(self): + codes = np.array(self.data, dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + + assert e.is_monotonic_increasing + 
assert not e.is_monotonic_decreasing + + # reverse sort order + reversed_data = list(reversed(self.data)) + codes = np.array(reversed_data, dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + + assert not e.is_monotonic_increasing + assert e.is_monotonic_decreasing + + def test_is_not_monotonic_ordered(self): + codes = np.array(list('cab'), dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + + assert not e.is_monotonic_increasing + assert not e.is_monotonic_decreasing + + @pytest.mark.parametrize('values, expected', [ + (list('abc'), True), + (list('aab'), False), + ]) + def test_is_unique(self, values, expected): + codes = np.array(values, dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + + assert e.is_unique is expected + + @pytest.mark.parametrize('values, value, expected', [ + (list('abc'), 'b', 1), + (list('abbc'), 'b', slice(1, 3)), + (list('cbba'), 'b', np.array([False, True, True, False])), + (list('abba'), 'b', np.array([False, True, True, False])), + (list('acb'), 'b', 2), + ]) + def test_get_loc(self, values, value, expected): + codes = np.array(values, dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + result = e.get_loc(value) + + if isinstance(expected, np.ndarray): + assert (result == expected).all() + else: + assert result == expected + + @pytest.mark.parametrize('values, value, error', [ + (list('abc'), 'd', KeyError), + (list('abc'), 4, KeyError), + ]) + def test_get_loc_raises(self, values, value, error): + codes = np.array(values, dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + with pytest.raises(error): + e.get_loc(value)