From 297ec194977f011e7639965464b2e75cec913f86 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 1 Jul 2018 18:53:11 +0100 Subject: [PATCH 1/9] Proof of performance --- pandas/core/indexes/base.py | 4 ++++ pandas/core/indexes/category.py | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b42bbdafcab45..12a8b36d7ef48 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3085,6 +3085,10 @@ def _get_unique_index(self, dropna=False): ------- loc : int if unique index, slice if monotonic index, else mask + Raises + ------ + KeyError : If key is not in self + Examples --------- >>> unique_index = pd.Index(list('abc')) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 45703c220a4be..9e27f12d556b8 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -426,6 +426,10 @@ def get_loc(self, key, method=None): ------- loc : int if unique index, slice if monotonic index, else mask + Raises + ------ + KeyError : If key is not in self + Examples --------- >>> unique_index = pd.CategoricalIndex(list('abc')) @@ -443,6 +447,14 @@ def get_loc(self, key, method=None): codes = self.categories.get_loc(key) if (codes == -1): raise KeyError(key) + + if self.is_monotonic_increasing and not self.is_unique: + if codes not in self._engine: + raise KeyError(key) + codes = self.codes.dtype.type(codes) + lhs = self.codes.searchsorted(codes, side='left') + rhs = self.codes.searchsorted(codes, side='right') + return slice(lhs, rhs) return self._engine.get_loc(codes) def get_value(self, series, key): From 91ee55dbb0bbfd598d97c50cd458681d5703816c Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 5 Jul 2018 10:25:54 +0100 Subject: [PATCH 2/9] add Int8Index --- pandas/_libs/index_class_helper.pxi.in | 8 ++++++-- pandas/core/indexes/category.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git 
a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 4ea35da0626f3..28cb1bc2a1e46 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -12,9 +12,13 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # name, dtype, ctype dtypes = [('Float64', 'float64', 'float64_t'), - ('UInt64', 'uint64', 'uint64_t'), ('Int64', 'int64', 'int64_t'), - ('Object', 'object', 'object')] + ('Int32', 'int32', 'int32_t'), + ('Int16', 'int16', 'int16_t'), + ('Int8', 'int8', 'int8_t'), + ('UInt64', 'uint64', 'uint64_t'), + ('Object', 'object', 'object'), + ] }} {{for name, dtype, ctype in dtypes}} diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 9e27f12d556b8..24e3e608d8d9d 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -83,7 +83,11 @@ class CategoricalIndex(Index, accessor.PandasDelegate): """ _typ = 'categoricalindex' - _engine_type = libindex.Int64Engine + + @property + def _engine_type(self): + type_name = self.codes.dtype.name.capitalize() + return getattr(libindex, "{}Engine".format(type_name)) _attributes = ['name'] def __new__(cls, data=None, categories=None, ordered=None, dtype=None, @@ -377,7 +381,7 @@ def argsort(self, *args, **kwargs): def _engine(self): # we are going to look things up with the codes themselves - return self._engine_type(lambda: self.codes.astype('i8'), len(self)) + return self._engine_type(lambda: self.codes, len(self)) # introspection @cache_readonly From 4d0612ea82a828e4be8fc93b6f608d4b21f67354 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 24 Jul 2018 13:08:34 +0100 Subject: [PATCH 3/9] make Int8/16/32Engine work with Int64HashTable --- pandas/_libs/index.pyx | 6 +++++- pandas/_libs/index_class_helper.pxi.in | 2 ++ pandas/core/indexes/category.py | 18 ++++++------------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx 
index 562c1ba218141..ae417052907eb 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -6,7 +6,9 @@ import cython import numpy as np cimport numpy as cnp from numpy cimport (ndarray, float64_t, int32_t, - int64_t, uint8_t, uint64_t, intp_t, + int8_t, int16_t, int32_t, int64_t, + uint8_t, uint64_t, + intp_t, # Note: NPY_DATETIME, NPY_TIMEDELTA are only available # for cimport in cython>=0.27.3 NPY_DATETIME, NPY_TIMEDELTA) @@ -242,6 +244,8 @@ cdef class IndexEngine: if not self.is_mapping_populated: values = self._get_index_values() + if values.dtype in {'int8', 'int16', 'int32'}: + values = algos.ensure_int64(values) self.mapping = self._make_hash_table(len(values)) self._call_map_locations(values) diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 28cb1bc2a1e46..0e750f8c47481 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -40,6 +40,8 @@ cdef class {{name}}Engine(IndexEngine): cdef _make_hash_table(self, n): {{if name == 'Object'}} return _hash.PyObjectHashTable(n) + {{elif name in {'Int8', 'Int16', 'Int32'} }} + return _hash.Int64HashTable(n) {{else}} return _hash.{{name}}HashTable(n) {{endif}} diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 24e3e608d8d9d..d2ae10773b334 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -448,18 +448,12 @@ def get_loc(self, key, method=None): >>> non_monotonic_index.get_loc('b') array([False, True, False, True], dtype=bool) """ - codes = self.categories.get_loc(key) - if (codes == -1): - raise KeyError(key) - - if self.is_monotonic_increasing and not self.is_unique: - if codes not in self._engine: - raise KeyError(key) - codes = self.codes.dtype.type(codes) - lhs = self.codes.searchsorted(codes, side='left') - rhs = self.codes.searchsorted(codes, side='right') - return slice(lhs, rhs) - return self._engine.get_loc(codes) + code = 
self.categories.get_loc(key) + + # dtype must be same as dtype for self.codes else searchsorted is slow + code = self.codes.dtype.type(code) + + return self._engine.get_loc(code) def get_value(self, series, key): """ From 5a89a51c7db40119bde65dc12ff56993afff84cb Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 10 Aug 2018 05:31:51 +0100 Subject: [PATCH 4/9] move int8/16/32 type conversions to index_class_helper.pxi.in --- pandas/_libs/index.pyx | 9 ++++----- pandas/_libs/index_class_helper.pxi.in | 17 +++++++++++++++++ pandas/core/indexes/category.py | 6 ++++-- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ae417052907eb..dc4d25041b657 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -5,9 +5,10 @@ import cython import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, float64_t, int32_t, - int8_t, int16_t, int32_t, int64_t, - uint8_t, uint64_t, +from numpy cimport (ndarray, + float64_t, float32_t, + int64_t,int32_t, int16_t, int8_t, + uint64_t, uint32_t, uint16_t, uint8_t, intp_t, # Note: NPY_DATETIME, NPY_TIMEDELTA are only available # for cimport in cython>=0.27.3 @@ -244,8 +245,6 @@ cdef class IndexEngine: if not self.is_mapping_populated: values = self._get_index_values() - if values.dtype in {'int8', 'int16', 'int32'}: - values = algos.ensure_int64(values) self.mapping = self._make_hash_table(len(values)) self._call_map_locations(values) diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 0e750f8c47481..367216e630dae 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -12,11 +12,15 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # name, dtype, ctype dtypes = [('Float64', 'float64', 'float64_t'), + ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), ('Int32', 'int32', 'int32_t'), ('Int16', 'int16', 'int16_t'), ('Int8', 'int8', 
'int8_t'), ('UInt64', 'uint64', 'uint64_t'), + ('UInt32', 'uint32', 'uint32_t'), + ('UInt16', 'uint16', 'uint16_t'), + ('UInt8', 'uint8', 'uint8_t'), ('Object', 'object', 'object'), ] }} @@ -41,11 +45,24 @@ cdef class {{name}}Engine(IndexEngine): {{if name == 'Object'}} return _hash.PyObjectHashTable(n) {{elif name in {'Int8', 'Int16', 'Int32'} }} + # {{name}}HashTable is not available, so we use Int64HashTable return _hash.Int64HashTable(n) + {{elif name in {'UInt8', 'UInt16', 'UInt32'} }} + # {{name}}HashTable is not available, so we use UInt64HashTable + return _hash.UInt64HashTable(n) + {{elif name in {'Float32'} }} + # {{name}}HashTable is not available, so we use Float64HashTable + return _hash.Float64HashTable(n) {{else}} return _hash.{{name}}HashTable(n) {{endif}} + {{if name in {'Int8', 'Int16', 'Int32'} }} + cpdef _call_map_locations(self, values): + # self.mapping is of type Int64HashTable, so convert dtype of values + self.mapping.map_locations(algos.ensure_int64(values)) + {{endif}} + {{if name != 'Float64' and name != 'Object'}} cdef _check_type(self, object val): hash(val) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d2ae10773b334..e91de21147833 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -86,8 +86,10 @@ class CategoricalIndex(Index, accessor.PandasDelegate): @property def _engine_type(self): - type_name = self.codes.dtype.name.capitalize() - return getattr(libindex, "{}Engine".format(type_name)) + # self.codes can have dtype int8, int16, int 32 or int64, so we need + # to return the corresponding engine type (libindex.Int8Engine, etc.). 
+ engine_name = "{}Engine".format(self.codes.dtype.name.capitalize()) + return getattr(libindex, engine_name) _attributes = ['name'] def __new__(cls, data=None, categories=None, ordered=None, dtype=None, From 5575e93782601f3f2822a0e0380a98512c43cd28 Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 10 Aug 2018 20:11:05 +0100 Subject: [PATCH 5/9] changed according to comments --- pandas/_libs/algos.pyx | 10 +-- pandas/_libs/algos_common_helper.pxi.in | 9 ++- pandas/_libs/index.pyx | 2 +- pandas/_libs/index_class_helper.pxi.in | 12 +++- pandas/conftest.py | 19 ++++- pandas/tests/indexes/conftest.py | 13 ++++ pandas/tests/indexes/test_category.py | 25 +++++++ pandas/tests/indexes/test_engine.py | 94 +++++++++++++++++++++++++ 8 files changed, 173 insertions(+), 11 deletions(-) create mode 100644 pandas/tests/indexes/test_engine.py diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d2914dc8ac751..86c6ee8599cb5 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -10,11 +10,13 @@ from libc.math cimport fabs, sqrt import numpy as np cimport numpy as cnp from numpy cimport (ndarray, - NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8, - NPY_FLOAT32, NPY_FLOAT64, + NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8, + NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8, + NPY_FLOAT64, NPY_FLOAT32, NPY_OBJECT, - int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t, + int64_t, int32_t, int16_t, int8_t, + uint64_t, uint32_t, uint16_t, uint8_t, + float64_t, float32_t, double_t) cnp.import_array() diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 40b1b1a282670..544ebc9cb26e2 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -133,11 +133,14 @@ def ensure_object(object arr): # name, c_type, dtype dtypes = [('float64', 'FLOAT64', 'float64'), ('float32', 'FLOAT32', 'float32'), - ('int8', 'INT8', 'int8'), - ('int16', 
'INT16', 'int16'), - ('int32', 'INT32', 'int32'), ('int64', 'INT64', 'int64'), + ('int32', 'INT32', 'int32'), + ('int16', 'INT16', 'int16'), + ('int8', 'INT8', 'int8'), ('uint64', 'UINT64', 'uint64'), + ('uint32', 'UINT32', 'uint32'), + ('uint16', 'UINT16', 'uint16'), + ('uint8', 'UINT8', 'uint8'), # ('platform_int', 'INT', 'int_'), # ('object', 'OBJECT', 'object_'), ] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index dc4d25041b657..c86b5fbba772f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -7,7 +7,7 @@ import numpy as np cimport numpy as cnp from numpy cimport (ndarray, float64_t, float32_t, - int64_t,int32_t, int16_t, int8_t, + int64_t, int32_t, int16_t, int8_t, uint64_t, uint32_t, uint16_t, uint8_t, intp_t, # Note: NPY_DATETIME, NPY_TIMEDELTA are only available diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 367216e630dae..eef162f3802c7 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -30,6 +30,8 @@ dtypes = [('Float64', 'float64', 'float64_t'), cdef class {{name}}Engine(IndexEngine): + _dtype = '{{dtype}}' + def _call_monotonic(self, values): return algos.is_monotonic_{{dtype}}(values, timelike=False) @@ -61,6 +63,14 @@ cdef class {{name}}Engine(IndexEngine): cpdef _call_map_locations(self, values): # self.mapping is of type Int64HashTable, so convert dtype of values self.mapping.map_locations(algos.ensure_int64(values)) + {{elif name in {'UInt8', 'UInt16', 'UInt32'} }} + cpdef _call_map_locations(self, values): + # self.mapping is of type UInt64HashTable, so convert dtype of values + self.mapping.map_locations(algos.ensure_uint64(values)) + {{elif name in {'Float32'} }} + cpdef _call_map_locations(self, values): + # self.mapping is of type Float64HashTable, so convert dtype of values + self.mapping.map_locations(algos.ensure_float64(values)) {{endif}} {{if name != 'Float64' and name != 'Object'}} @@ -83,7 +93,7 @@ cdef 
class {{name}}Engine(IndexEngine): ndarray[{{ctype}}] values int count = 0 - {{if name != 'Float64'}} + {{if name not in {'Float64', 'Float32'} }} if not util.is_integer_object(val): raise KeyError(val) {{endif}} diff --git a/pandas/conftest.py b/pandas/conftest.py index 621de3ffd4b12..5671facdd0180 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -102,14 +102,29 @@ def ip(): @pytest.fixture(params=[True, False, None]) -def observed(request): +def _true_false_none(request): + """ + Base fixture for fixtures that return True, False and None. + """ + return request.param + + +@pytest.fixture +def observed(_true_false_none): """ pass in the observed keyword to groupby for [True, False] This indicates whether categoricals should return values for values which are not in the grouper [False / None], or only values which appear in the grouper [True]. [None] is supported for future compatiblity if we decide to change the default (and would need to warn if this parameter is not passed)""" - return request.param + return _true_false_none + + +@pytest.fixture +def ordered(_true_false_none): + """Return the allowed parameters for Categorical/CategoricalIndex.ordered. 
+ """ + return _true_false_none _all_arithmetic_operators = ['__add__', '__radd__', diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 6d88ef0cfa6c5..55701cbfe11d5 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -4,6 +4,7 @@ import pandas.util.testing as tm from pandas.core.indexes.api import Index, MultiIndex +from pandas._libs import index as li from pandas.compat import lzip, long @@ -45,3 +46,15 @@ def zero(request): # For testing division by (or of) zero for Index with length 5, this # gives several scalar-zeros and length-5 vector-zeros return request.param + + +@pytest.fixture( + params=[ + 'Int64', 'Int32', 'Int16', 'Int8', + 'UInt64', 'UInt32', 'UInt16', 'UInt8', + 'Float64', 'Float32', + ]) +def num_engine(request): + """Return the various numeric engines in pd._libs.index + """ + return getattr(li, "{}Engine".format(request.param)) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d49a6a6abc7c9..609bf172b5fa2 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -14,6 +14,7 @@ from pandas import Categorical, IntervalIndex, compat from pandas.util.testing import assert_almost_equal import pandas.core.config as cf +from pandas._libs import index as li import pandas as pd if PY3: @@ -1117,3 +1118,27 @@ def test_take_invalid_kwargs(self): msg = "the 'mode' parameter is not supported" tm.assert_raises_regex(ValueError, msg, idx.take, indices, mode='clip') + + +class TestCategoricalIndexEngine(object): + + @pytest.mark.parametrize('nbits', [8, 16, 32, 64]) + def test_engine_type(self, nbits): + """Check that a CategoricalIndex has the correct engine type. 
+ """ + if nbits < 64: + ncategories = int(2 ** (nbits / 2) / 2) # 128 for nbits==16 etc + index = CategoricalIndex(range(ncategories)) + else: + index = CategoricalIndex(['a', 'b', 'c']) + # having actual 2 ** (64 / 2) / 2 categories is too + # memory-intensive, so we set codes.dtype manually + index._values._codes = index._values._codes.astype('int64') + + dtype = {8: np.int8, 16: np.int16, + 32: np.int32, 64: np.int64}[nbits] + engine = {8: li.Int8Engine, 16: li.Int16Engine, + 32: li.Int32Engine, 64: li.Int64Engine}[nbits] + + assert isinstance(index._engine, engine) + assert issubclass(index.codes.dtype.type, dtype) diff --git a/pandas/tests/indexes/test_engine.py b/pandas/tests/indexes/test_engine.py new file mode 100644 index 0000000000000..a089e48479f2f --- /dev/null +++ b/pandas/tests/indexes/test_engine.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pytest + +import pandas as pd +from pandas._libs.index import (Int64Engine, UInt64Engine, + Float64Engine, ObjectEngine) + + +class TestNumericEngine(object): + + @pytest.mark.parametrize('data', [[0, 1, 2]]) + def test_engine_type(self, data, num_engine): + index = pd.Index(data, dtype=num_engine._dtype) + if issubclass(index.dtype.type, np.signedinteger): + assert isinstance(index._engine, Int64Engine) + elif issubclass(index.dtype.type, np.unsignedinteger): + assert isinstance(index._engine, UInt64Engine) + elif issubclass(index.dtype.type, np.floating): + assert isinstance(index._engine, Float64Engine) + else: + raise TypeError("unexpected dtype {}".format(index.dtype)) + + @pytest.mark.parametrize('data', [[0, 1, 2]]) + def test_is_monotonic_ordered(self, data, num_engine): + codes = np.array(data, dtype=num_engine._dtype) + e = num_engine(lambda: codes, len(codes)) + assert e.is_monotonic_increasing + assert not e.is_monotonic_decreasing + + # reverse sort order + codes = np.array(list(reversed(data)), dtype=num_engine._dtype) + e = num_engine(lambda: codes, len(codes)) + 
assert not e.is_monotonic_increasing + assert e.is_monotonic_decreasing + + @pytest.mark.parametrize('data', [[1, 0, 2]]) + def test_is_not_monotonic_ordered(self, data, num_engine): + codes = np.array(data, dtype=num_engine._dtype) + e = num_engine(lambda: codes, len(codes)) + assert not e.is_monotonic_increasing + assert not e.is_monotonic_decreasing + + @pytest.mark.parametrize('values, expected', [ + ([1, 2, 3], True), + ([1, 1, 2], False), + ]) + def test_is_unique(self, values, expected, num_engine): + + codes = np.array(values, dtype=num_engine._dtype) + e = num_engine(lambda: codes, len(codes)) + assert e.is_unique is expected + + +class TestObjectEngine(object): + + def setup_class(cls): + cls.Engine = ObjectEngine + cls.dtype = object + + @pytest.mark.parametrize('data', [['a', 'b', 'c']]) + def test_engine_type(self, data): + index = pd.Index(data) + assert isinstance(index._engine, self.Engine) + + @pytest.mark.parametrize('data', [['a', 'b', 'c']]) + def test_is_monotonic_ordered(self, data): + codes = np.array(data, dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + assert e.is_monotonic_increasing + assert not e.is_monotonic_decreasing + + # reverse sort order + codes = np.array(list(reversed(data)), dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + assert not e.is_monotonic_increasing + assert e.is_monotonic_decreasing + + @pytest.mark.parametrize('data', [['a', 'c', 'b']]) + def test_is_not_monotonic_ordered(self, data): + codes = np.array(data, dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + assert not e.is_monotonic_increasing + assert not e.is_monotonic_decreasing + + @pytest.mark.parametrize('values, expected', [ + (['a', 'b', 'c'], True), + (['a', 'a', 'b'], False), + ]) + def test_is_unique(self, values, expected): + codes = np.array(values, dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + assert e.is_unique is expected From 2fa526f0df043880cd88bb5ee99832db87955a8c Mon Sep 17 
00:00:00 2001 From: tp Date: Sat, 11 Aug 2018 11:47:15 +0100 Subject: [PATCH 6/9] correct ncategories size + add tests --- asv_bench/benchmarks/indexing.py | 5 ++- pandas/tests/indexes/test_category.py | 10 ++--- pandas/tests/indexes/test_engine.py | 58 ++++++++++++++++++++++++++- 3 files changed, 65 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 739ad6a3d278b..5af431dd9f3ae 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -2,7 +2,8 @@ import numpy as np import pandas.util.testing as tm -from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index, +from pandas import (Series, DataFrame, MultiIndex, + Int64Index, UInt64Index, Float64Index, IntervalIndex, CategoricalIndex, IndexSlice, concat, date_range) from .pandas_vb_common import setup, Panel # noqa @@ -11,7 +12,7 @@ class NumericSeriesIndexing(object): goal_time = 0.2 - params = [Int64Index, Float64Index] + params = [Int64Index, UInt64Index, Float64Index] param = ['index'] def setup(self, index): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 609bf172b5fa2..e8b5e799e8bdb 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -14,7 +14,7 @@ from pandas import Categorical, IntervalIndex, compat from pandas.util.testing import assert_almost_equal import pandas.core.config as cf -from pandas._libs import index as li +from pandas._libs import index as libindex import pandas as pd if PY3: @@ -1127,18 +1127,18 @@ def test_engine_type(self, nbits): """Check that a CategoricalIndex has the correct engine type. 
""" if nbits < 64: - ncategories = int(2 ** (nbits / 2) / 2) # 128 for nbits==16 etc + ncategories = int(2 ** (nbits / 2) / 2 + 1) # 129 if nbits==16 etc index = CategoricalIndex(range(ncategories)) else: index = CategoricalIndex(['a', 'b', 'c']) - # having actual 2 ** (64 / 2) / 2 categories is too + # having actual 2 ** (64 / 2) / 2 + 1 categories is too # memory-intensive, so we set codes.dtype manually index._values._codes = index._values._codes.astype('int64') dtype = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}[nbits] - engine = {8: li.Int8Engine, 16: li.Int16Engine, - 32: li.Int32Engine, 64: li.Int64Engine}[nbits] + engine = {8: libindex.Int8Engine, 16: libindex.Int16Engine, + 32: libindex.Int32Engine, 64: libindex.Int64Engine}[nbits] assert isinstance(index._engine, engine) assert issubclass(index.codes.dtype.type, dtype) diff --git a/pandas/tests/indexes/test_engine.py b/pandas/tests/indexes/test_engine.py index a089e48479f2f..ec3b4ccb83a72 100644 --- a/pandas/tests/indexes/test_engine.py +++ b/pandas/tests/indexes/test_engine.py @@ -6,6 +6,8 @@ import pandas as pd from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine, ObjectEngine) +from pandas._libs.lib import is_scalar +import pandas.util.testing as tm class TestNumericEngine(object): @@ -52,12 +54,39 @@ def test_is_unique(self, values, expected, num_engine): e = num_engine(lambda: codes, len(codes)) assert e.is_unique is expected + @pytest.mark.parametrize('values, value, expected', [ + ([1, 2, 3], 2, 1), + ([1, 2, 2, 3], 2, slice(1, 3)), + ([3, 2, 2, 1], 2, np.array([False, True, True, False])), + ([1, 2, 2, 1], 2, np.array([False, True, True, False])), + ([1, 3, 2], 2, 2), + ]) + def test_get_loc(self, values, value, expected, num_engine): + codes = np.array(values, dtype=num_engine._dtype) + e = num_engine(lambda: codes, len(codes)) + result = e.get_loc(value) + + if isinstance(expected, np.ndarray): + assert (result == expected).all() + else: + assert result == expected 
+ + @pytest.mark.parametrize('values, value, error', [ + ([1, 2, 3], 4, KeyError), + ([1, 2, 3], '4', KeyError), + ]) + def test_get_loc_raises(self, values, value, error, num_engine): + codes = np.array(values, dtype=num_engine._dtype) + e = num_engine(lambda: codes, len(codes)) + with pytest.raises(error): + e.get_loc(value) + class TestObjectEngine(object): def setup_class(cls): - cls.Engine = ObjectEngine cls.dtype = object + cls.Engine = ObjectEngine @pytest.mark.parametrize('data', [['a', 'b', 'c']]) def test_engine_type(self, data): @@ -92,3 +121,30 @@ def test_is_unique(self, values, expected): codes = np.array(values, dtype=self.dtype) e = self.Engine(lambda: codes, len(codes)) assert e.is_unique is expected + + @pytest.mark.parametrize('values, value, expected', [ + (list('abc'), 'b', 1), + (list('abbc'), 'b', slice(1, 3)), + (list('cbba'), 'b', np.array([False, True, True, False])), + (list('abba'), 'b', np.array([False, True, True, False])), + (list('acb'), 'b', 2), + ]) + def test_get_loc(self, values, value, expected): + codes = np.array(values, dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + result = e.get_loc(value) + + if isinstance(expected, np.ndarray): + assert (result == expected).all() + else: + assert result == expected + + @pytest.mark.parametrize('values, value, error', [ + (list('abc'), 'd', KeyError), + (list('abc'), 4, KeyError), + ]) + def test_get_loc_raises(self, values, value, error): + codes = np.array(values, dtype=self.dtype) + e = self.Engine(lambda: codes, len(codes)) + with pytest.raises(error): + e.get_loc(value) From 4bc74f5545cf467c5625fbfb87345958b8027b76 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 12 Aug 2018 03:35:35 +0100 Subject: [PATCH 7/9] add ASVs --- asv_bench/benchmarks/indexing_engines.py | 63 ++++++++++++++++++++++++ pandas/tests/indexes/test_engine.py | 10 ++-- 2 files changed, 67 insertions(+), 6 deletions(-) create mode 100644 asv_bench/benchmarks/indexing_engines.py diff --git 
a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py new file mode 100644 index 0000000000000..a727c6625f183 --- /dev/null +++ b/asv_bench/benchmarks/indexing_engines.py @@ -0,0 +1,63 @@ +import numpy as np + +import pandas.util.testing as tm +from pandas._libs.index import (Int64Engine, Int32Engine, + Int16Engine, Int8Engine, + UInt64Engine, UInt32Engine, + UInt16Engine, UInt8Engine, + Float64Engine, Float32Engine, + ObjectEngine, + ) + + +class NumericEngineIndexing(object): + + goal_time = 0.2 + params = [[Int64Engine, Int32Engine, Int16Engine, Int8Engine, + UInt64Engine, UInt32Engine, UInt16Engine, UInt8Engine, + Float64Engine, Float32Engine, + ], + ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] + ] + param_names = ['engine', 'index_type'] + + def setup(self, engine, index_type): + N = 10**5 + values = list([1] * N + [2] * N + [3] * N) + array_ = { + 'monotonic_incr': np.array(values, dtype=engine._dtype), + 'monotonic_decr': np.array(list(reversed(values)), + dtype=engine._dtype), + 'non_monotonic': np.array([1, 2, 3] * N, dtype=engine._dtype), + }[index_type] + + self.data = engine(lambda: array_, len(array_)) + self.int_scalar = 2 + + def time_get_loc(self, engine, index_type): + self.data.get_loc(self.int_scalar) + + +class ObjectEngineIndexing(object): + + goal_time = 0.2 + params = [[ObjectEngine], + ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] + ] + param_names = ['engine', 'index_type'] + + def setup(self, engine, index_type): + N = 10**5 + values = list('a' * N + 'b' * N + 'c' * N) + array_ = { + 'monotonic_incr': np.array(values, dtype=engine._dtype), + 'monotonic_decr': np.array(list(reversed(values)), + dtype=engine._dtype), + 'non_monotonic': np.array(list('abc') * N, dtype=engine._dtype), + }[index_type] + + self.data = engine(lambda: array_, len(array_)) + self.int_scalar = 'b' + + def time_get_loc(self, engine, index_type): + self.data.get_loc(self.int_scalar) diff --git 
a/pandas/tests/indexes/test_engine.py b/pandas/tests/indexes/test_engine.py index ec3b4ccb83a72..63d51587e1e29 100644 --- a/pandas/tests/indexes/test_engine.py +++ b/pandas/tests/indexes/test_engine.py @@ -6,8 +6,6 @@ import pandas as pd from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine, ObjectEngine) -from pandas._libs.lib import is_scalar -import pandas.util.testing as tm class TestNumericEngine(object): @@ -57,8 +55,8 @@ def test_is_unique(self, values, expected, num_engine): @pytest.mark.parametrize('values, value, expected', [ ([1, 2, 3], 2, 1), ([1, 2, 2, 3], 2, slice(1, 3)), - ([3, 2, 2, 1], 2, np.array([False, True, True, False])), - ([1, 2, 2, 1], 2, np.array([False, True, True, False])), + ([3, 2, 2, 1], 2, np.array([False, True, True, False])), + ([1, 2, 2, 1], 2, np.array([False, True, True, False])), ([1, 3, 2], 2, 2), ]) def test_get_loc(self, values, value, expected, num_engine): @@ -125,8 +123,8 @@ def test_is_unique(self, values, expected): @pytest.mark.parametrize('values, value, expected', [ (list('abc'), 'b', 1), (list('abbc'), 'b', slice(1, 3)), - (list('cbba'), 'b', np.array([False, True, True, False])), - (list('abba'), 'b', np.array([False, True, True, False])), + (list('cbba'), 'b', np.array([False, True, True, False])), + (list('abba'), 'b', np.array([False, True, True, False])), (list('acb'), 'b', 2), ]) def test_get_loc(self, values, value, expected): From c9f1166584082e3668850abb5f36b7987ebf5bbd Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 17 Aug 2018 09:29:15 +0100 Subject: [PATCH 8/9] clean up ASV code a bit --- asv_bench/benchmarks/indexing_engines.py | 7 +-- pandas/tests/indexes/conftest.py | 4 +- pandas/tests/indexes/test_category.py | 32 ++++++---- pandas/tests/indexes/test_engine.py | 77 +++++++++++++----------- 4 files changed, 66 insertions(+), 54 deletions(-) diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index a727c6625f183..b01639ca3580d 100644 --- 
a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -1,6 +1,5 @@ import numpy as np -import pandas.util.testing as tm from pandas._libs.index import (Int64Engine, Int32Engine, Int16Engine, Int8Engine, UInt64Engine, UInt32Engine, @@ -32,10 +31,9 @@ def setup(self, engine, index_type): }[index_type] self.data = engine(lambda: array_, len(array_)) - self.int_scalar = 2 def time_get_loc(self, engine, index_type): - self.data.get_loc(self.int_scalar) + self.data.get_loc(2) class ObjectEngineIndexing(object): @@ -57,7 +55,6 @@ def setup(self, engine, index_type): }[index_type] self.data = engine(lambda: array_, len(array_)) - self.int_scalar = 'b' def time_get_loc(self, engine, index_type): - self.data.get_loc(self.int_scalar) + self.data.get_loc(2) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 55701cbfe11d5..8e889ed795408 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -54,7 +54,7 @@ def zero(request): 'UInt64', 'UInt32', 'UInt16', 'UInt8', 'Float64', 'Float32', ]) -def num_engine(request): - """Return the various numeric engines in pd._libs.index +def numeric_indexing_engine(request): + """Return the various numeric indexing engines in pd._libs.index """ return getattr(li, "{}Engine".format(request.param)) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index e8b5e799e8bdb..64fad2a87c364 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -1122,23 +1122,29 @@ def test_take_invalid_kwargs(self): class TestCategoricalIndexEngine(object): - @pytest.mark.parametrize('nbits', [8, 16, 32, 64]) - def test_engine_type(self, nbits): - """Check that a CategoricalIndex has the correct engine type. 
+ def setup_method(self): + self.n_categories = {np.int8: 1, np.int16: 129, np.int32: 32769} + + self.engines = {np.int8: libindex.Int8Engine, + np.int16: libindex.Int16Engine, + np.int32: libindex.Int32Engine, + np.int64: libindex.Int64Engine} + + @pytest.mark.parametrize('dtype', [np.int8, np.int16, np.int32, np.int64]) + def test_engine_type(self, dtype): + """ + Check that a CategoricalIndex has the correct engine type. """ - if nbits < 64: - ncategories = int(2 ** (nbits / 2) / 2 + 1) # 129 if nbits==16 etc - index = CategoricalIndex(range(ncategories)) + if dtype != np.int64: + n_categories = self.n_categories[dtype] + index = CategoricalIndex(np.arange(n_categories)) else: - index = CategoricalIndex(['a', 'b', 'c']) - # having actual 2 ** (64 / 2) / 2 + 1 categories is too + # having actual (2 ** 32) + 1 distinct categories is too # memory-intensive, so we set codes.dtype manually + index = CategoricalIndex(['a', 'b', 'c']) index._values._codes = index._values._codes.astype('int64') - dtype = {8: np.int8, 16: np.int16, - 32: np.int32, 64: np.int64}[nbits] - engine = {8: libindex.Int8Engine, 16: libindex.Int16Engine, - 32: libindex.Int32Engine, 64: libindex.Int64Engine}[nbits] + engine = self.engines[dtype] assert isinstance(index._engine, engine) - assert issubclass(index.codes.dtype.type, dtype) + assert index.codes.dtype.type == dtype diff --git a/pandas/tests/indexes/test_engine.py b/pandas/tests/indexes/test_engine.py index 63d51587e1e29..75a587a6c6a64 100644 --- a/pandas/tests/indexes/test_engine.py +++ b/pandas/tests/indexes/test_engine.py @@ -10,9 +10,11 @@ class TestNumericEngine(object): - @pytest.mark.parametrize('data', [[0, 1, 2]]) - def test_engine_type(self, data, num_engine): - index = pd.Index(data, dtype=num_engine._dtype) + def setup_class(cls): + cls.data = [1, 2, 3] + + def test_engine_type(self, numeric_indexing_engine): + index = pd.Index(self.data, dtype=numeric_indexing_engine._dtype) if issubclass(index.dtype.type, np.signedinteger): 
assert isinstance(index._engine, Int64Engine) elif issubclass(index.dtype.type, np.unsignedinteger): @@ -22,23 +24,26 @@ def test_engine_type(self, data, num_engine): else: raise TypeError("unexpected dtype {}".format(index.dtype)) - @pytest.mark.parametrize('data', [[0, 1, 2]]) - def test_is_monotonic_ordered(self, data, num_engine): - codes = np.array(data, dtype=num_engine._dtype) - e = num_engine(lambda: codes, len(codes)) + def test_is_monotonic_ordered(self, numeric_indexing_engine): + codes = np.array(self.data, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) + assert e.is_monotonic_increasing assert not e.is_monotonic_decreasing # reverse sort order - codes = np.array(list(reversed(data)), dtype=num_engine._dtype) - e = num_engine(lambda: codes, len(codes)) + reversed_data = list(reversed(self.data)) + codes = np.array(reversed_data, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) + assert not e.is_monotonic_increasing assert e.is_monotonic_decreasing - @pytest.mark.parametrize('data', [[1, 0, 2]]) - def test_is_not_monotonic_ordered(self, data, num_engine): - codes = np.array(data, dtype=num_engine._dtype) - e = num_engine(lambda: codes, len(codes)) + def test_is_not_monotonic_ordered(self, numeric_indexing_engine): + data = [1, 0, 2] + codes = np.array(data, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) + assert not e.is_monotonic_increasing assert not e.is_monotonic_decreasing @@ -46,10 +51,10 @@ def test_is_not_monotonic_ordered(self, data, num_engine): ([1, 2, 3], True), ([1, 1, 2], False), ]) - def test_is_unique(self, values, expected, num_engine): + def test_is_unique(self, values, expected, numeric_indexing_engine): + codes = np.array(values, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) - codes = np.array(values, dtype=num_engine._dtype) - e = 
num_engine(lambda: codes, len(codes)) assert e.is_unique is expected @pytest.mark.parametrize('values, value, expected', [ @@ -59,9 +64,9 @@ def test_is_unique(self, values, expected, num_engine): ([1, 2, 2, 1], 2, np.array([False, True, True, False])), ([1, 3, 2], 2, 2), ]) - def test_get_loc(self, values, value, expected, num_engine): - codes = np.array(values, dtype=num_engine._dtype) - e = num_engine(lambda: codes, len(codes)) + def test_get_loc(self, values, value, expected, numeric_indexing_engine): + codes = np.array(values, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) result = e.get_loc(value) if isinstance(expected, np.ndarray): @@ -73,9 +78,10 @@ def test_get_loc(self, values, value, expected, num_engine): ([1, 2, 3], 4, KeyError), ([1, 2, 3], '4', KeyError), ]) - def test_get_loc_raises(self, values, value, error, num_engine): - codes = np.array(values, dtype=num_engine._dtype) - e = num_engine(lambda: codes, len(codes)) + def test_get_loc_raises(self, values, value, error, + numeric_indexing_engine): + codes = np.array(values, dtype=numeric_indexing_engine._dtype) + e = numeric_indexing_engine(lambda: codes, len(codes)) with pytest.raises(error): e.get_loc(value) @@ -83,41 +89,44 @@ def test_get_loc_raises(self, values, value, error, num_engine): class TestObjectEngine(object): def setup_class(cls): + cls.data = list('abc') cls.dtype = object cls.Engine = ObjectEngine - @pytest.mark.parametrize('data', [['a', 'b', 'c']]) - def test_engine_type(self, data): - index = pd.Index(data) + def test_engine_type(self): + index = pd.Index(self.data) assert isinstance(index._engine, self.Engine) - @pytest.mark.parametrize('data', [['a', 'b', 'c']]) - def test_is_monotonic_ordered(self, data): - codes = np.array(data, dtype=self.dtype) + def test_is_monotonic_ordered(self): + codes = np.array(self.data, dtype=self.dtype) e = self.Engine(lambda: codes, len(codes)) + assert e.is_monotonic_increasing assert not 
e.is_monotonic_decreasing # reverse sort order - codes = np.array(list(reversed(data)), dtype=self.dtype) + reversed_data = list(reversed(self.data)) + codes = np.array(reversed_data, dtype=self.dtype) e = self.Engine(lambda: codes, len(codes)) + assert not e.is_monotonic_increasing assert e.is_monotonic_decreasing - @pytest.mark.parametrize('data', [['a', 'c', 'b']]) - def test_is_not_monotonic_ordered(self, data): - codes = np.array(data, dtype=self.dtype) + def test_is_not_monotonic_ordered(self): + codes = np.array(list('cab'), dtype=self.dtype) e = self.Engine(lambda: codes, len(codes)) + assert not e.is_monotonic_increasing assert not e.is_monotonic_decreasing @pytest.mark.parametrize('values, expected', [ - (['a', 'b', 'c'], True), - (['a', 'a', 'b'], False), + (list('abc'), True), + (list('aab'), False), ]) def test_is_unique(self, values, expected): codes = np.array(values, dtype=self.dtype) e = self.Engine(lambda: codes, len(codes)) + assert e.is_unique is expected @pytest.mark.parametrize('values, value, expected', [ From 6aa94e980ae05683ec4e04fe1c8a18707b452f2c Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 24 Sep 2018 19:21:50 +0100 Subject: [PATCH 9/9] adjust for that is_monotonic uses fused types --- doc/source/whatsnew/v0.24.0.txt | 7 ++++--- pandas/_libs/algos.pyx | 6 +++++- pandas/_libs/index_class_helper.pxi.in | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 9d559acfa59e7..280b622445e72 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -591,9 +591,10 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, - both when indexing by label (using .loc) and position(.iloc). - Likewise, slicing a ``CategoricalIndex`` itself (i.e. 
``ci[100:200]``) shows similar speed improvements (:issue:`21659`) +- Slicing Series and DataFrame objects with a monotonically increasing :class:`CategoricalIndex` + is now very fast and has speed comparable to slicing with an ``Int64Index``. + The speed increase applies both when indexing by label (using .loc) and by position (using .iloc) (:issue:`20395`) +- Slicing a ``CategoricalIndex`` itself (i.e. ``ci[1000:2000]``) shows similar speed improvements as above (:issue:`21659`) - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 86c6ee8599cb5..c211f2b6aceb9 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -361,9 +361,13 @@ ctypedef fused algos_t: float64_t float32_t object - int32_t int64_t + int32_t + int16_t + int8_t uint64_t + uint32_t + uint16_t uint8_t diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index eef162f3802c7..1d5915e5b9afd 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -33,7 +33,7 @@ cdef class {{name}}Engine(IndexEngine): _dtype = '{{dtype}}' def _call_monotonic(self, values): - return algos.is_monotonic_{{dtype}}(values, timelike=False) + return algos.is_monotonic(values, timelike=False) def get_backfill_indexer(self, other, limit=None): return algos.backfill_{{dtype}}(self._get_index_values(),