Skip to content

Commit b08a96a

Browse files
committed
PERF: improved performance of small multiindexes
closes pandas-dev#16319
1 parent 0607e03 commit b08a96a

File tree

6 files changed

+68
-20
lines changed

6 files changed

+68
-20
lines changed

asv_bench/benchmarks/indexing.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -193,10 +193,18 @@ def setup(self):
193193
np.arange(1000)], names=['one', 'two'])
194194

195195
import string
196-
self.mistring = MultiIndex.from_product(
196+
197+
self.mi_large = MultiIndex.from_product(
197198
[np.arange(1000),
198199
np.arange(20), list(string.ascii_letters)],
199200
names=['one', 'two', 'three'])
201+
self.mi_med = MultiIndex.from_product(
202+
[np.arange(1000),
203+
np.arange(10), list('A')],
204+
names=['one', 'two', 'three'])
205+
self.mi_small = MultiIndex.from_product(
206+
[np.arange(100), list('A'), list('A')],
207+
names=['one', 'two', 'three'])
200208

201209
def time_series_xs_mi_ix(self):
202210
self.s.ix[999]
@@ -218,8 +226,14 @@ def time_multiindex_get_indexer(self):
218226
(0, 16), (0, 17), (0, 18),
219227
(0, 19)], dtype=object))
220228

229+
def time_multiindex_large_get_loc(self):
230+
self.mi_large.get_loc((999, 19, 'Z'))
231+
232+
def time_multiindex_med_get_loc(self):
233+
self.mi_med.get_loc((999, 9, 'A'))
234+
221235
def time_multiindex_string_get_loc(self):
222-
self.mistring.get_loc((999, 19, 'Z'))
236+
self.mi_small.get_loc((99, 'A'))
223237

224238
def time_is_monotonic(self):
225239
self.miint.is_monotonic

doc/source/whatsnew/v0.20.2.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ Performance Improvements
2727
~~~~~~~~~~~~~~~~~~~~~~~~
2828

2929
- Performance regression fix when indexing with a list-like (:issue:`16285`)
30-
30+
- Performance regression fix for small MultiIndexes (:issue:`16319`)
3131

3232
.. _whatsnew_0202.bug_fixes:
3333

pandas/_libs/index.pyx

+32-1
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,34 @@ cdef inline bint _is_utc(object tz):
553553
return tz is UTC or isinstance(tz, _du_utc)
554554

555555

556-
cdef class MultiIndexEngine(IndexEngine):
556+
cdef class MultiIndexObjectEngine(ObjectEngine):
557+
"""
558+
provide the same interface as the MultiIndexEngine
559+
but use the IndexEngine for computation
560+
561+
This provides good performance with smaller MI's
562+
"""
563+
def get_indexer(self, values):
564+
# convert a MI to an ndarray
565+
if hasattr(values, 'values'):
566+
values = values.values
567+
return super(MultiIndexObjectEngine, self).get_indexer(values)
568+
569+
cpdef get_loc(self, object val):
570+
571+
# convert a MI to an ndarray
572+
if hasattr(val, 'values'):
573+
val = val.values
574+
return super(MultiIndexObjectEngine, self).get_loc(val)
575+
576+
577+
cdef class MultiIndexEngine(MultiIndexObjectEngine):
578+
"""
579+
Use a hashing based MultiIndex impl
580+
but use the IndexEngine for computation
581+
582+
This provides good performance with larger MI's
583+
"""
557584

558585
def _call_monotonic(self, object mi):
559586
# defer these back to the mi itself
@@ -584,6 +611,10 @@ cdef class MultiIndexEngine(IndexEngine):
584611
except TypeError:
585612
raise KeyError(val)
586613

614+
def get_indexer(self, values):
615+
self._ensure_mapping_populated()
616+
return self.mapping.lookup(values)
617+
587618
cdef _make_hash_table(self, n):
588619
return _hash.MultiIndexHashTable(n)
589620

pandas/core/dtypes/dtypes.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,12 @@ def is_dtype(cls, dtype):
8888
"""
8989
if hasattr(dtype, 'dtype'):
9090
dtype = dtype.dtype
91-
if isinstance(dtype, cls):
92-
return True
93-
elif isinstance(dtype, np.dtype):
91+
if isinstance(dtype, np.dtype):
9492
return False
9593
elif dtype is None:
9694
return False
95+
elif isinstance(dtype, cls):
96+
return True
9797
try:
9898
return cls.construct_from_string(dtype) is not None
9999
except:

pandas/core/indexes/multi.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ class MultiIndex(Index):
7575
_levels = FrozenList()
7676
_labels = FrozenList()
7777
_comparables = ['names']
78-
_engine_type = libindex.MultiIndexEngine
7978
rename = Index.set_names
8079

8180
def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
@@ -629,7 +628,15 @@ def _get_level_number(self, level):
629628

630629
@cache_readonly
631630
def _engine(self):
632-
return self._engine_type(lambda: self, len(self))
631+
632+
# choose our engine based on our size
633+
# the hashing based MultiIndex for larger
634+
# sizes, and the MultiIndexObjectEngine for smaller
635+
l = len(self)
636+
if l > 10000:
637+
return libindex.MultiIndexEngine(lambda: self, l)
638+
639+
return libindex.MultiIndexObjectEngine(lambda: self.values, l)
633640

634641
@property
635642
def values(self):

pandas/core/util/hashing.py

+7-11
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,13 @@
55

66
import numpy as np
77
from pandas._libs import hashing
8-
from pandas._libs.lib import is_bool_array
98
from pandas.core.dtypes.generic import (
109
ABCMultiIndex,
1110
ABCIndexClass,
1211
ABCSeries,
1312
ABCDataFrame)
1413
from pandas.core.dtypes.common import (
15-
is_categorical_dtype, is_numeric_dtype,
16-
is_datetime64_dtype, is_timedelta64_dtype,
17-
is_list_like)
14+
is_categorical_dtype, is_list_like)
1815

1916
# 16 byte long hashing key
2017
_default_hash_key = '0123456789123456'
@@ -136,7 +133,6 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
136133
-------
137134
ndarray of hashed values array
138135
"""
139-
140136
is_tuple = False
141137
if isinstance(vals, tuple):
142138
vals = [vals]
@@ -231,29 +227,29 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
231227

232228
if not hasattr(vals, 'dtype'):
233229
raise TypeError("must pass a ndarray-like")
230+
dtype = vals.dtype
234231

235232
if hash_key is None:
236233
hash_key = _default_hash_key
237234

238235
# For categoricals, we hash the categories, then remap the codes to the
239236
# hash values. (This check is above the complex check so that we don't ask
240237
# numpy if categorical is a subdtype of complex, as it will choke.)
241-
if is_categorical_dtype(vals.dtype):
238+
if is_categorical_dtype(dtype):
242239
return _hash_categorical(vals, encoding, hash_key)
243240

244241
# we'll be working with everything as 64-bit values, so handle this
245242
# 128-bit value early
246-
if np.issubdtype(vals.dtype, np.complex128):
243+
elif np.issubdtype(dtype, np.complex128):
247244
return hash_array(vals.real) + 23 * hash_array(vals.imag)
248245

249246
# First, turn whatever array this is into unsigned 64-bit ints, if we can
250247
# manage it.
251-
if is_bool_array(vals):
248+
elif isinstance(dtype, np.bool):
252249
vals = vals.astype('u8')
253-
elif (is_datetime64_dtype(vals) or
254-
is_timedelta64_dtype(vals)):
250+
elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
255251
vals = vals.view('i8').astype('u8', copy=False)
256-
elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
252+
elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
257253
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
258254
else:
259255
# With repeated values, it's MUCH faster to categorize object dtypes,

0 commit comments

Comments
 (0)