PERF: CategoricalIndex.get_loc should avoid expensive cast of .codes to int64 #21699
Changes from all commits: 297ec19, 91ee55d, 4d0612e, 5a89a51, 5575e93, 2fa526f, 4bc74f5, c9f1166, 6aa94e9
New file (asv benchmark for the index engines):

@@ -0,0 +1,60 @@
import numpy as np

from pandas._libs.index import (Int64Engine, Int32Engine,
                                Int16Engine, Int8Engine,
                                UInt64Engine, UInt32Engine,
                                UInt16Engine, UInt8Engine,
                                Float64Engine, Float32Engine,
                                ObjectEngine,
                                )


class NumericEngineIndexing(object):

    goal_time = 0.2
    params = [[Int64Engine, Int32Engine, Int16Engine, Int8Engine,
               UInt64Engine, UInt32Engine, UInt16Engine, UInt8Engine,
               Float64Engine, Float32Engine,
               ],
              ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
              ]
    param_names = ['engine', 'index_type']

    def setup(self, engine, index_type):
        N = 10**5
        values = list([1] * N + [2] * N + [3] * N)
        array_ = {
            'monotonic_incr': np.array(values, dtype=engine._dtype),
            'monotonic_decr': np.array(list(reversed(values)),
                                       dtype=engine._dtype),
            'non_monotonic': np.array([1, 2, 3] * N, dtype=engine._dtype),
        }[index_type]

        self.data = engine(lambda: array_, len(array_))

    def time_get_loc(self, engine, index_type):
        self.data.get_loc(2)


class ObjectEngineIndexing(object):

    goal_time = 0.2
    params = [[ObjectEngine],
              ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
              ]
    param_names = ['engine', 'index_type']

    def setup(self, engine, index_type):
        N = 10**5
        values = list('a' * N + 'b' * N + 'c' * N)
        array_ = {
            'monotonic_incr': np.array(values, dtype=engine._dtype),
            'monotonic_decr': np.array(list(reversed(values)),
                                       dtype=engine._dtype),
            'non_monotonic': np.array(list('abc') * N, dtype=engine._dtype),
        }[index_type]

        self.data = engine(lambda: array_, len(array_))

    def time_get_loc(self, engine, index_type):
        self.data.get_loc('b')
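For context, here is a minimal sketch of how these engines can be exercised directly, outside asv. It assumes a pandas build that exposes the sized engines under pandas._libs.index (as the imports above do) and the constructor signature used in this PR, i.e. a callable returning the values plus their length; this is internal API and has changed in later pandas versions.

    import numpy as np
    from pandas._libs.index import Int8Engine

    # Same shape of data as the benchmark, just smaller: monotonic and non-unique.
    arr = np.array([1] * 5 + [2] * 5 + [3] * 5, dtype='int8')
    engine = Int8Engine(lambda: arr, len(arr))

    # On a monotonic, non-unique index the engine answers with a slice.
    print(engine.get_loc(2))  # a slice covering positions 5..10, where the 2s sit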
@@ -12,18 +12,28 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

 # name, dtype, ctype
 dtypes = [('Float64', 'float64', 'float64_t'),
-          ('UInt64', 'uint64', 'uint64_t'),
           ('Float32', 'float32', 'float32_t'),
           ('Int64', 'int64', 'int64_t'),
-          ('Object', 'object', 'object')]
+          ('Int32', 'int32', 'int32_t'),
+          ('Int16', 'int16', 'int16_t'),
+          ('Int8', 'int8', 'int8_t'),
+          ('UInt64', 'uint64', 'uint64_t'),
+          ('UInt32', 'uint32', 'uint32_t'),
+          ('UInt16', 'uint16', 'uint16_t'),
+          ('UInt8', 'uint8', 'uint8_t'),
+          ('Object', 'object', 'object'),
+          ]
 }}

 {{for name, dtype, ctype in dtypes}}


 cdef class {{name}}Engine(IndexEngine):

+    _dtype = '{{dtype}}'
Author comment: For testing purposes (see test_engine.py), we need the dtype that is compatible with this engine, plus it's useful information to have in general. EDIT: For specialized engines (DatetimeEngine etc.), there is an internal method […]. Should I not consolidate these, so we have them publicly accessible also? For example, so we have DatetimeEngine._dtype = 'M8[ns]'?

     def _call_monotonic(self, values):
-        return algos.is_monotonic_{{dtype}}(values, timelike=False)
+        return algos.is_monotonic(values, timelike=False)

     def get_backfill_indexer(self, other, limit=None):
         return algos.backfill_{{dtype}}(self._get_index_values(),
@@ -36,10 +46,33 @@ cdef class {{name}}Engine(IndexEngine):
     cdef _make_hash_table(self, n):
         {{if name == 'Object'}}
         return _hash.PyObjectHashTable(n)
+        {{elif name in {'Int8', 'Int16', 'Int32'} }}
+        # {{name}}HashTable is not available, so we use Int64HashTable
+        return _hash.Int64HashTable(n)
Review comment: Since all the Int8Engine, Int16Engine, .. are actually still using the same Int64HashTable, is it actually needed to create those different engines? Can the […]

Author reply: The requirement needed to achieve the speedup is avoiding type conversion. I'm pretty sure that […]. I'm not at the coding computer ATM, but I can double check how […]. EDIT:

    cpdef ensure_{{name}}(object arr, copy=True):
        if util.is_array(arr):
            if (<ndarray> arr).descr.type_num == NPY_{{c_type}}:
                return arr
            else:
                return arr.astype(np.{{dtype}}, copy=copy)
        else:
            return np.array(arr, dtype=np.{{dtype}})

which, translated, is the same as […]

Review comment: But the Int8Engine, Int16Engine, .. are actually calling […]

Author reply: The […] It would be a different and slower matter if we had to convert the array, but we don't do that...

Review comment: Ah, OK, I see in […]
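To make the no-conversion point concrete, here is a small hedged sketch. The generated ensure_* helpers are internal and their exact location varies between pandas versions, so the equivalent numpy behaviour is shown directly:

    import numpy as np

    codes64 = np.arange(10, dtype='int64')
    codes8 = np.arange(10, dtype='int8')

    # This is what the generated ensure_int64 boils down to: when the dtype
    # already matches, the input array is handed back unchanged (no copy) ...
    print(codes64.astype(np.int64, copy=False) is codes64)  # True

    # ... and only a genuine dtype mismatch forces a new allocation.
    print(codes8.astype(np.int64, copy=False) is codes8)    # False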
+        {{elif name in {'UInt8', 'UInt16', 'UInt32'} }}
+        # {{name}}HashTable is not available, so we use UInt64HashTable
+        return _hash.UInt64HashTable(n)
+        {{elif name in {'Float32'} }}
+        # {{name}}HashTable is not available, so we use Float64HashTable
+        return _hash.Float64HashTable(n)
         {{else}}
         return _hash.{{name}}HashTable(n)
         {{endif}}

+    {{if name in {'Int8', 'Int16', 'Int32'} }}
+    cpdef _call_map_locations(self, values):
Review comment: ok, can you add a bit of a comment here on what is going on?
+        # self.mapping is of type Int64HashTable, so convert dtype of values
+        self.mapping.map_locations(algos.ensure_int64(values))
+    {{elif name in {'UInt8', 'UInt16', 'UInt32'} }}
+    cpdef _call_map_locations(self, values):
+        # self.mapping is of type UInt64HashTable, so convert dtype of values
+        self.mapping.map_locations(algos.ensure_uint64(values))
+    {{elif name in {'Float32'} }}
+    cpdef _call_map_locations(self, values):
+        # self.mapping is of type Float64HashTable, so convert dtype of values
+        self.mapping.map_locations(algos.ensure_float64(values))
+    {{endif}}
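For reference, a rough sketch of what this fallback mapping amounts to, using the (internal, version-dependent) hashtable API directly; Int64HashTable, map_locations and get_item are assumed to behave as in pandas around the time of this PR:

    import numpy as np
    from pandas._libs.hashtable import Int64HashTable

    codes = np.array([0, 1, 2, 1], dtype='int8')

    # There is no Int8HashTable, so the Int8Engine's mapping is an
    # Int64HashTable; _call_map_locations therefore widens the codes to int64
    # once, when the mapping is built.
    table = Int64HashTable()
    table.map_locations(codes.astype('int64'))

    print(table.get_item(2))  # 2 -- the position of the value 2 in the codes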

     {{if name != 'Float64' and name != 'Object'}}
     cdef _check_type(self, object val):
         hash(val)
@@ -60,7 +93,7 @@ cdef class {{name}}Engine(IndexEngine):
             ndarray[{{ctype}}] values
             int count = 0

-        {{if name != 'Float64'}}
+        {{if name not in {'Float64', 'Float32'} }}
         if not util.is_integer_object(val):
             raise KeyError(val)
         {{endif}}

@@ -83,7 +83,13 @@ class CategoricalIndex(Index, accessor.PandasDelegate):
""" | ||
|
||
_typ = 'categoricalindex' | ||
_engine_type = libindex.Int64Engine | ||
|
||
@property | ||
def _engine_type(self): | ||
# self.codes can have dtype int8, int16, int 32 or int64, so we need | ||
# to return the corresponding engine type (libindex.Int8Engine, etc.). | ||
engine_name = "{}Engine".format(self.codes.dtype.name.capitalize()) | ||
return getattr(libindex, engine_name) | ||
_attributes = ['name'] | ||
|
||
def __new__(cls, data=None, categories=None, ordered=None, dtype=None, | ||
|
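To illustrate what the new property resolves to in practice (a sketch assuming a pandas version that ships the sized engines added in this PR):

    import pandas as pd
    from pandas._libs import index as libindex

    ci = pd.CategoricalIndex(list('abc'))
    print(ci.codes.dtype)  # int8 -- a handful of categories only needs 8-bit codes

    # The property builds the engine class name from the codes dtype ...
    engine_name = "{}Engine".format(ci.codes.dtype.name.capitalize())
    print(engine_name)  # 'Int8Engine'

    # ... and looks it up on the compiled index module.
    print(getattr(libindex, engine_name))  # the Int8Engine class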
@@ -377,7 +383,7 @@ def argsort(self, *args, **kwargs):
     def _engine(self):

         # we are going to look things up with the codes themselves
-        return self._engine_type(lambda: self.codes.astype('i8'), len(self))
Review comment: I don't think you need any changes in cython if you simply do: […] its still not as nice as actually using type specific hashtables though (which this PR is not addressing)

Author reply: I tried changing that. Still is slow, 14 ms. In index_class_helper.pxi.in, I also tried changing

    cdef _get_index_values(self):
        return algos.ensure_{{dtype}}(self.vgetter())

to

    cdef _get_index_values(self):
        return self.vgetter()

but that is also slow, 14 ms. I agree that type specific hash tables would be nicer, but I've tried and failed making it work. If someone could contribute those, I could change this PR to use type specific hash tables.

Review comment: then you have something else going on

Author reply: The numpy docs say about the copy argument of astype: […] So, calling astype(..., copy=False) will only avoid returning a copy when the dtype of codes is int64, i.e. in practice never for Categoricals.

Review comment: so try using […]
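A quick hedged check of that last claim: a categorical with only a few categories stores int8 codes, so astype('i8', copy=False) still has to allocate a new array.

    import pandas as pd

    ci = pd.CategoricalIndex(list('abca') * 1000)
    codes = ci.codes
    print(codes.dtype)  # int8, not int64

    # copy=False only helps when the dtype already matches, which for
    # categorical codes it essentially never does.
    converted = codes.astype('i8', copy=False)
    print(converted is codes)  # False -- a new int64 array was allocated
    print(converted.dtype)     # int64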
+        return self._engine_type(lambda: self.codes, len(self))

     # introspection
     @cache_readonly
@@ -426,6 +432,10 @@ def get_loc(self, key, method=None):
         -------
         loc : int if unique index, slice if monotonic index, else mask

+        Raises
+        ------
+        KeyError : If key is not in self

         Examples
         ---------
         >>> unique_index = pd.CategoricalIndex(list('abc'))
@@ -440,10 +450,12 @@
         >>> non_monotonic_index.get_loc('b')
         array([False, True, False, True], dtype=bool)
         """
-        codes = self.categories.get_loc(key)
-        if (codes == -1):
-            raise KeyError(key)
Author comment: codes can never be -1. If key is not in self.categories, a KeyError was always raised.

-        return self._engine.get_loc(codes)
+        code = self.categories.get_loc(key)
+
+        # dtype must be same as dtype for self.codes else searchsorted is slow
+        code = self.codes.dtype.type(code)
+
+        return self._engine.get_loc(code)

     def get_value(self, series, key):
         """
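A short sketch of what the new cast does with a concrete index (the names mirror the diff above; the exact integer type returned by categories.get_loc may vary between versions):

    import pandas as pd

    ci = pd.CategoricalIndex(list('aabcb'))

    code = ci.categories.get_loc('b')  # position of 'b' among the categories
    print(code)  # 1, as a plain integer

    # codes.dtype.type is the matching numpy scalar type (np.int8 here), so the
    # looked-up position is converted to the same dtype as the codes array
    # before it is handed to the engine.
    code = ci.codes.dtype.type(code)
    print(code, code.dtype)  # 1 int8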
Review comment: Are the lower UInt engines actually used somewhere in the code? (I think you only changed things in Categorical, and those only use the Int.. engines?) So if they are not used, should we actually add them?

Author reply: The lower UInt engines are not used in the code ATM. I added them at the suggestion of @jreback. I don't have a super strong preference on whether to add them now, but I can see benefits to having them, e.g. […]. All in all I'm weakly positive on adding them now, but will let you core devs decide on it.