Skip to content

PERF: CategoricalIndex.get_loc should avoid expensive cast of .codes to int64 #21699

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import numpy as np
import pandas.util.testing as tm
from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index,
from pandas import (Series, DataFrame, MultiIndex,
Int64Index, UInt64Index, Float64Index,
IntervalIndex, CategoricalIndex,
IndexSlice, concat, date_range)
from .pandas_vb_common import setup, Panel # noqa
Expand All @@ -11,7 +12,7 @@
class NumericSeriesIndexing(object):

goal_time = 0.2
params = [Int64Index, Float64Index]
params = [Int64Index, UInt64Index, Float64Index]
param = ['index']

def setup(self, index):
Expand Down
60 changes: 60 additions & 0 deletions asv_bench/benchmarks/indexing_engines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import numpy as np

from pandas._libs.index import (Int64Engine, Int32Engine,
Int16Engine, Int8Engine,
UInt64Engine, UInt32Engine,
UInt16Engine, UInt8Engine,
Float64Engine, Float32Engine,
ObjectEngine,
)


class NumericEngineIndexing(object):

goal_time = 0.2
params = [[Int64Engine, Int32Engine, Int16Engine, Int8Engine,
UInt64Engine, UInt32Engine, UInt16Engine, UInt8Engine,
Float64Engine, Float32Engine,
],
['monotonic_incr', 'monotonic_decr', 'non_monotonic']
]
param_names = ['engine', 'index_type']

def setup(self, engine, index_type):
N = 10**5
values = list([1] * N + [2] * N + [3] * N)
array_ = {
'monotonic_incr': np.array(values, dtype=engine._dtype),
'monotonic_decr': np.array(list(reversed(values)),
dtype=engine._dtype),
'non_monotonic': np.array([1, 2, 3] * N, dtype=engine._dtype),
}[index_type]

self.data = engine(lambda: array_, len(array_))

def time_get_loc(self, engine, index_type):
self.data.get_loc(2)


class ObjectEngineIndexing(object):

goal_time = 0.2
params = [[ObjectEngine],
['monotonic_incr', 'monotonic_decr', 'non_monotonic']
]
param_names = ['engine', 'index_type']

def setup(self, engine, index_type):
N = 10**5
values = list('a' * N + 'b' * N + 'c' * N)
array_ = {
'monotonic_incr': np.array(values, dtype=engine._dtype),
'monotonic_decr': np.array(list(reversed(values)),
dtype=engine._dtype),
'non_monotonic': np.array(list('abc') * N, dtype=engine._dtype),
}[index_type]

self.data = engine(lambda: array_, len(array_))

def time_get_loc(self, engine, index_type):
self.data.get_loc(2)
7 changes: 4 additions & 3 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -591,9 +591,10 @@ Removal of prior version deprecations/changes
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`,
both when indexing by label (using .loc) and position(.iloc).
Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
- Slicing Series and Dataframe with an monotonically increasing :class:`CategoricalIndex`
is now very fast and has speed comparable to slicing with an ``Int64Index``.
The speed increase is both when indexing by label (using .loc) and position(.iloc) (:issue:`20395`)
- Slicing a ``CategoricalIndex`` itself (i.e. ``ci[1000:2000]``) shows similar speed improvements as above (:issue:`21659`)
- Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)
Expand Down
16 changes: 11 additions & 5 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@ from libc.math cimport fabs, sqrt
import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray,
NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8,
NPY_FLOAT32, NPY_FLOAT64,
NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8,
NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8,
NPY_FLOAT64, NPY_FLOAT32,
NPY_OBJECT,
int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
uint32_t, uint64_t, float32_t, float64_t,
int64_t, int32_t, int16_t, int8_t,
uint64_t, uint32_t, uint16_t, uint8_t,
float64_t, float32_t,
double_t)
cnp.import_array()

Expand Down Expand Up @@ -359,9 +361,13 @@ ctypedef fused algos_t:
float64_t
float32_t
object
int32_t
int64_t
int32_t
int16_t
int8_t
uint64_t
uint32_t
uint16_t
uint8_t


Expand Down
9 changes: 6 additions & 3 deletions pandas/_libs/algos_common_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,14 @@ def ensure_object(object arr):
# name, c_type, dtype
dtypes = [('float64', 'FLOAT64', 'float64'),
('float32', 'FLOAT32', 'float32'),
('int8', 'INT8', 'int8'),
('int16', 'INT16', 'int16'),
('int32', 'INT32', 'int32'),
('int64', 'INT64', 'int64'),
('int32', 'INT32', 'int32'),
('int16', 'INT16', 'int16'),
('int8', 'INT8', 'int8'),
('uint64', 'UINT64', 'uint64'),
('uint32', 'UINT32', 'uint32'),
('uint16', 'UINT16', 'uint16'),
('uint8', 'UINT8', 'uint8'),
# ('platform_int', 'INT', 'int_'),
# ('object', 'OBJECT', 'object_'),
]
Expand Down
7 changes: 5 additions & 2 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ import cython

import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray, float64_t, int32_t,
int64_t, uint8_t, uint64_t, intp_t,
from numpy cimport (ndarray,
float64_t, float32_t,
int64_t, int32_t, int16_t, int8_t,
uint64_t, uint32_t, uint16_t, uint8_t,
intp_t,
# Note: NPY_DATETIME, NPY_TIMEDELTA are only available
# for cimport in cython>=0.27.3
NPY_DATETIME, NPY_TIMEDELTA)
Expand Down
41 changes: 37 additions & 4 deletions pandas/_libs/index_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,28 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

# name, dtype, ctype
dtypes = [('Float64', 'float64', 'float64_t'),
('UInt64', 'uint64', 'uint64_t'),
('Float32', 'float32', 'float32_t'),
('Int64', 'int64', 'int64_t'),
('Object', 'object', 'object')]
('Int32', 'int32', 'int32_t'),
('Int16', 'int16', 'int16_t'),
('Int8', 'int8', 'int8_t'),
('UInt64', 'uint64', 'uint64_t'),
('UInt32', 'uint32', 'uint32_t'),
('UInt16', 'uint16', 'uint16_t'),
('UInt8', 'uint8', 'uint8_t'),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the lower Uint engines actually used somewhere in the code? (I think you only changed things in Categorical, and those only use the Int.. engines?)

So if they are not used, should we actually add them?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The lower UInt engine are not used in the code ATM. I added them by suggestion from @jreback. I don't have a super strong preference on whether to add them now, but I can see benefits to aving them, e.g.:

  • if someone want to make specialized indexes based on UInts they'd be nice to have.
  • We don't only add UInt8/16/32 engines but also various UInt8/16/32 algorithms, that could be beneficial to have around for when the need is there.

all in all I'm weakly positive on adding them now, but will let you core devs decide on it.

('Object', 'object', 'object'),
]
}}

{{for name, dtype, ctype in dtypes}}


cdef class {{name}}Engine(IndexEngine):

_dtype = '{{dtype}}'
Copy link
Contributor Author

@topper-123 topper-123 Aug 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For testing purposes (see test_engine.py), we need the dtype that is compatible with this engine + it's useful information to have in general.

EDIT: For specialized engines (DatetimeEngine etc.), there is an internal method _get_box_dtype, that does the same thing. Because it's internal, it's not publicly accessable, and because it's a method, it's only avalable on instances (if I understand Cython correctly).

Should I not consolidate these, so we have them publicly accessable also? For example, so we have DatetimeEngine._dtype = 'M8[ns]'?


def _call_monotonic(self, values):
return algos.is_monotonic_{{dtype}}(values, timelike=False)
return algos.is_monotonic(values, timelike=False)

def get_backfill_indexer(self, other, limit=None):
return algos.backfill_{{dtype}}(self._get_index_values(),
Expand All @@ -36,10 +46,33 @@ cdef class {{name}}Engine(IndexEngine):
cdef _make_hash_table(self, n):
{{if name == 'Object'}}
return _hash.PyObjectHashTable(n)
{{elif name in {'Int8', 'Int16', 'Int32'} }}
# {{name}}HashTable is not available, so we use Int64HashTable
return _hash.Int64HashTable(n)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since all the Int8Engine, Int16Engine, .. are actually still using the same Int64HashTable, is it actually needed to create those different engines? Can the Int64Engine (or IntEngine) not simply always do the ensure_int64 in _call_map_locations? (which seems to be the only difference between Int8Engine and Int64Engine, from a quick skim, didn't look in detail)

Copy link
Contributor Author

@topper-123 topper-123 Aug 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The requirement needed to achieve the speedup is avoiding type conversion. I'm pretty sure that ensure_int64 will convert a int8 array to int64, so lose speeed?

I'm not at the coding computer ATM, but I can double check how ensure_int64 works tonight.

EDIT:
The ensure_{{dtype}} part on algos_common_helper.pxi is:

cpdef ensure_{{name}}(object arr, copy=True):
    if util.is_array(arr):
        if (<ndarray> arr).descr.type_num == NPY_{{c_type}}:
            return arr
        else:
            return arr.astype(np.{{dtype}}, copy=copy)
    else:
        return np.array(arr, dtype=np.{{dtype}})

Which translated is the same as arr.astype(np.int64) if arr is of dtype int8, so the array is casted to int64 which means the speed penalty occurs if we do that.

Copy link
Member

@jorisvandenbossche jorisvandenbossche Aug 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure that ensure_int64 will convert a int8 array to int64, so lose speeed?

But the Int8Engine, Int16Engine, .. are actually calling ensure_int64 in the current PR ? (not ensure_int8, ...) So this conversion already happens?

Copy link
Contributor Author

@topper-123 topper-123 Aug 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the self.mapping.map_locations(algos.ensure_int64(values)) is only converting values, i.e. typically a scalar, so it's very fast.

It would be a different and slower matter if we had to convert the array, but we don't do that...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, OK, I see in _get_index_values it is ensure_{{dtype}} that is used.

{{elif name in {'UInt8', 'UInt16', 'UInt32'} }}
# {{name}}HashTable is not available, so we use UInt64HashTable
return _hash.UInt64HashTable(n)
{{elif name in {'Float32'} }}
# {{name}}HashTable is not available, so we use Float64HashTable
return _hash.Float64HashTable(n)
{{else}}
return _hash.{{name}}HashTable(n)
{{endif}}

{{if name in {'Int8', 'Int16', 'Int32'} }}
cpdef _call_map_locations(self, values):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok can you add a bit comment here on what is going on

# self.mapping is of type Int64HashTable, so convert dtype of values
self.mapping.map_locations(algos.ensure_int64(values))
{{elif name in {'UInt8', 'UInt16', 'UInt32'} }}
cpdef _call_map_locations(self, values):
# self.mapping is of type UInt64HashTable, so convert dtype of values
self.mapping.map_locations(algos.ensure_uint64(values))
{{elif name in {'Float32'} }}
cpdef _call_map_locations(self, values):
# self.mapping is of type Float64HashTable, so convert dtype of values
self.mapping.map_locations(algos.ensure_float64(values))
{{endif}}

{{if name != 'Float64' and name != 'Object'}}
cdef _check_type(self, object val):
hash(val)
Expand All @@ -60,7 +93,7 @@ cdef class {{name}}Engine(IndexEngine):
ndarray[{{ctype}}] values
int count = 0

{{if name != 'Float64'}}
{{if name not in {'Float64', 'Float32'} }}
if not util.is_integer_object(val):
raise KeyError(val)
{{endif}}
Expand Down
19 changes: 17 additions & 2 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,29 @@ def ip():


@pytest.fixture(params=[True, False, None])
def observed(request):
def _true_false_none(request):
"""
Base fixture for fixtures that return True, False and None.
"""
return request.param


@pytest.fixture
def observed(_true_false_none):
""" pass in the observed keyword to groupby for [True, False]
This indicates whether categoricals should return values for
values which are not in the grouper [False / None], or only values which
appear in the grouper [True]. [None] is supported for future compatiblity
if we decide to change the default (and would need to warn if this
parameter is not passed)"""
return request.param
return _true_false_none


@pytest.fixture
def ordered(_true_false_none):
"""Return the allowed parameters for Categorical/CategoricalIndex.ordered.
"""
return _true_false_none


_all_arithmetic_operators = ['__add__', '__radd__',
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3085,6 +3085,10 @@ def _get_unique_index(self, dropna=False):
-------
loc : int if unique index, slice if monotonic index, else mask

Raises
------
KeyError : If key is not in self

Examples
---------
>>> unique_index = pd.Index(list('abc'))
Expand Down
24 changes: 18 additions & 6 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,13 @@ class CategoricalIndex(Index, accessor.PandasDelegate):
"""

_typ = 'categoricalindex'
_engine_type = libindex.Int64Engine

@property
def _engine_type(self):
# self.codes can have dtype int8, int16, int 32 or int64, so we need
# to return the corresponding engine type (libindex.Int8Engine, etc.).
engine_name = "{}Engine".format(self.codes.dtype.name.capitalize())
return getattr(libindex, engine_name)
_attributes = ['name']

def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
Expand Down Expand Up @@ -377,7 +383,7 @@ def argsort(self, *args, **kwargs):
def _engine(self):

# we are going to look things up with the codes themselves
return self._engine_type(lambda: self.codes.astype('i8'), len(self))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you need any changes in cython if you simply so: .astype('i8', copy=False), which is the cause of the perf issue, no?

its still not as nice as actually using type specific hashtables though (which this PR is not addressing)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried changing that. Still is slow, 14 ms.

In index_class_helper.pxi.in, I also tried changing

    cdef _get_index_values(self):
        return algos.ensure_{{dtype}}(self.vgetter())

to

    cdef _get_index_values(self):
        return self.vgetter()

But also slow, 14 ms.

I agree that type specific hash tables would be nicer, but I've tried and I failed making it work. If someone could contribute those, I could change this PR to use type specific hash tables.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then you have something else going on

Copy link
Contributor Author

@topper-123 topper-123 Aug 6, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The numpy docs say about the copy parameter of array.astype:

By default, astype always returns a newly allocated array. If this is set
to false, and the dtype, order, and subok requirements are satisfied,
the input array is returned instead of a copy.

So, calling astype(..., copy=False) will only avoid returning a copy when the dtype of codes is int64, i.e. in practice never for Categoricals.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so try using algos.ensure_int64(values) I find this extremely odd that you have to do the if inside cython.

return self._engine_type(lambda: self.codes, len(self))

# introspection
@cache_readonly
Expand Down Expand Up @@ -426,6 +432,10 @@ def get_loc(self, key, method=None):
-------
loc : int if unique index, slice if monotonic index, else mask

Raises
------
KeyError : If key is not in self

Examples
---------
>>> unique_index = pd.CategoricalIndex(list('abc'))
Expand All @@ -440,10 +450,12 @@ def get_loc(self, key, method=None):
>>> non_monotonic_index.get_loc('b')
array([False, True, False, True], dtype=bool)
"""
codes = self.categories.get_loc(key)
if (codes == -1):
raise KeyError(key)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

codes can never be -1. If key is not in self.categories, a KeyError was always raised.

return self._engine.get_loc(codes)
code = self.categories.get_loc(key)

# dtype must be same as dtype for self.codes else searchsorted is slow
code = self.codes.dtype.type(code)

return self._engine.get_loc(code)

def get_value(self, series, key):
"""
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/indexes/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import pandas.util.testing as tm
from pandas.core.indexes.api import Index, MultiIndex
from pandas._libs import index as li
from pandas.compat import lzip, long


Expand Down Expand Up @@ -45,3 +46,15 @@ def zero(request):
# For testing division by (or of) zero for Index with length 5, this
# gives several scalar-zeros and length-5 vector-zeros
return request.param


@pytest.fixture(
params=[
'Int64', 'Int32', 'Int16', 'Int8',
'UInt64', 'UInt32', 'UInt16', 'UInt8',
'Float64', 'Float32',
])
def numeric_indexing_engine(request):
"""Return the various numeric indexing engines in pd._libs.index
"""
return getattr(li, "{}Engine".format(request.param))
31 changes: 31 additions & 0 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pandas import Categorical, IntervalIndex, compat
from pandas.util.testing import assert_almost_equal
import pandas.core.config as cf
from pandas._libs import index as libindex
import pandas as pd

if PY3:
Expand Down Expand Up @@ -1117,3 +1118,33 @@ def test_take_invalid_kwargs(self):
msg = "the 'mode' parameter is not supported"
tm.assert_raises_regex(ValueError, msg, idx.take,
indices, mode='clip')


class TestCategoricalIndexEngine(object):

def setup_method(self):
self.n_categories = {np.int8: 1, np.int16: 129, np.int32: 32769}

self.engines = {np.int8: libindex.Int8Engine,
np.int16: libindex.Int16Engine,
np.int32: libindex.Int32Engine,
np.int64: libindex.Int64Engine}

@pytest.mark.parametrize('dtype', [np.int8, np.int16, np.int32, np.int64])
def test_engine_type(self, dtype):
"""
Check that a CategoricalIndex has the correct engine type.
"""
if dtype != np.int64:
n_categories = self.n_categories[dtype]
index = CategoricalIndex(np.arange(n_categories))
else:
# having actual (2 ** 32) + 1 distinct categories is too
# memory-intensive, so we set codes.dtype manually
index = CategoricalIndex(['a', 'b', 'c'])
index._values._codes = index._values._codes.astype('int64')

engine = self.engines[dtype]

assert isinstance(index._engine, engine)
assert index.codes.dtype.type == dtype
Loading