PERF: improved performance of small multiindexes (pandas-dev#16324)

jreback · stangirala · commit 067f2573287d · 2017-06-11T13:59:42.000-07:00
closes pandas-dev#16319
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -193,9 +193,15 @@ def setup(self):
              np.arange(1000)], names=['one', 'two'])
 
         import string
-        self.mistring = MultiIndex.from_product(
-            [np.arange(1000),
-             np.arange(20), list(string.ascii_letters)],
+
+        self.mi_large = MultiIndex.from_product(
+            [np.arange(1000), np.arange(20), list(string.ascii_letters)],
+            names=['one', 'two', 'three'])
+        self.mi_med = MultiIndex.from_product(
+            [np.arange(1000), np.arange(10), list('A')],
+            names=['one', 'two', 'three'])
+        self.mi_small = MultiIndex.from_product(
+            [np.arange(100), list('A'), list('A')],
             names=['one', 'two', 'three'])
 
     def time_series_xs_mi_ix(self):
@@ -218,8 +224,14 @@ def time_multiindex_get_indexer(self):
                       (0, 16), (0, 17), (0, 18),
                       (0, 19)], dtype=object))
 
+    def time_multiindex_large_get_loc(self):
+        self.mi_large.get_loc((999, 19, 'Z'))
+
+    def time_multiindex_med_get_loc(self):
+        self.mi_med.get_loc((999, 9, 'A'))
+
     def time_multiindex_string_get_loc(self):
-        self.mistring.get_loc((999, 19, 'Z'))
+        self.mi_small.get_loc((99, 'A', 'A'))
 
     def time_is_monotonic(self):
         self.miint.is_monotonic
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
@@ -27,7 +27,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Performance regression fix when indexing with a list-like (:issue:`16285`)
-
+- Performance regression fix for small MultiIndexes (:issue:`16319`)
 
 .. _whatsnew_0202.bug_fixes:
 
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -553,7 +553,34 @@ cdef inline bint _is_utc(object tz):
     return tz is UTC or isinstance(tz, _du_utc)
 
 
-cdef class MultiIndexEngine(IndexEngine):
+cdef class MultiIndexObjectEngine(ObjectEngine):
+    """
+    provide the same interface as the MultiIndexEngine
+    but use the IndexEngine for computation
+
+    This provides good performance with smaller MI's
+    """
+    def get_indexer(self, values):
+        # convert a MI to an ndarray
+        if hasattr(values, 'values'):
+            values = values.values
+        return super(MultiIndexObjectEngine, self).get_indexer(values)
+
+    cpdef get_loc(self, object val):
+
+        # convert a MI to an ndarray
+        if hasattr(val, 'values'):
+            val = val.values
+        return super(MultiIndexObjectEngine, self).get_loc(val)
+
+
+cdef class MultiIndexHashEngine(ObjectEngine):
+    """
+    Use a hashing based MultiIndex impl
+    but use the IndexEngine for computation
+
+    This provides good performance with larger MI's
+    """
 
     def _call_monotonic(self, object mi):
         # defer these back to the mi iteself
@@ -584,6 +611,10 @@ cdef class MultiIndexEngine(IndexEngine):
         except TypeError:
             raise KeyError(val)
 
+    def get_indexer(self, values):
+        self._ensure_mapping_populated()
+        return self.mapping.lookup(values)
+
     cdef _make_hash_table(self, n):
         return _hash.MultiIndexHashTable(n)
 
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -88,12 +88,12 @@ def is_dtype(cls, dtype):
         """
         if hasattr(dtype, 'dtype'):
             dtype = dtype.dtype
-        if isinstance(dtype, cls):
-            return True
-        elif isinstance(dtype, np.dtype):
+        if isinstance(dtype, np.dtype):
             return False
         elif dtype is None:
             return False
+        elif isinstance(dtype, cls):
+            return True
         try:
             return cls.construct_from_string(dtype) is not None
         except:
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -75,7 +75,6 @@ class MultiIndex(Index):
     _levels = FrozenList()
     _labels = FrozenList()
     _comparables = ['names']
-    _engine_type = libindex.MultiIndexEngine
     rename = Index.set_names
 
     def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
@@ -629,7 +628,16 @@ def _get_level_number(self, level):
 
     @cache_readonly
     def _engine(self):
-        return self._engine_type(lambda: self, len(self))
+
+        # choose our engine based on our size
+        # the hashing based MultiIndex for larger
+        # sizes, and the MultiIndexObject for smaller
+        # xref: https://github.com/pandas-dev/pandas/pull/16324
+        l = len(self)
+        if l > 10000:
+            return libindex.MultiIndexHashEngine(lambda: self, l)
+
+        return libindex.MultiIndexObjectEngine(lambda: self.values, l)
 
     @property
     def values(self):
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
@@ -5,16 +5,13 @@
 
 import numpy as np
 from pandas._libs import hashing
-from pandas._libs.lib import is_bool_array
 from pandas.core.dtypes.generic import (
     ABCMultiIndex,
     ABCIndexClass,
     ABCSeries,
     ABCDataFrame)
 from pandas.core.dtypes.common import (
-    is_categorical_dtype, is_numeric_dtype,
-    is_datetime64_dtype, is_timedelta64_dtype,
-    is_list_like)
+    is_categorical_dtype, is_list_like)
 
 # 16 byte long hashing key
 _default_hash_key = '0123456789123456'
@@ -136,7 +133,6 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
     -------
     ndarray of hashed values array
     """
-
     is_tuple = False
     if isinstance(vals, tuple):
         vals = [vals]
@@ -231,29 +227,29 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
 
     if not hasattr(vals, 'dtype'):
         raise TypeError("must pass a ndarray-like")
+    dtype = vals.dtype
 
     if hash_key is None:
         hash_key = _default_hash_key
 
     # For categoricals, we hash the categories, then remap the codes to the
     # hash values. (This check is above the complex check so that we don't ask
     # numpy if categorical is a subdtype of complex, as it will choke.
-    if is_categorical_dtype(vals.dtype):
+    if is_categorical_dtype(dtype):
         return _hash_categorical(vals, encoding, hash_key)
 
     # we'll be working with everything as 64-bit values, so handle this
     # 128-bit value early
-    if np.issubdtype(vals.dtype, np.complex128):
+    elif np.issubdtype(dtype, np.complex128):
         return hash_array(vals.real) + 23 * hash_array(vals.imag)
 
     # First, turn whatever array this is into unsigned 64-bit ints, if we can
     # manage it.
-    if is_bool_array(vals):
+    elif isinstance(dtype, np.bool):
         vals = vals.astype('u8')
-    elif (is_datetime64_dtype(vals) or
-          is_timedelta64_dtype(vals)):
+    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
         vals = vals.view('i8').astype('u8', copy=False)
-    elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
+    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
         vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
     else:
         # With repeated values, its MUCH faster to categorize object dtypes,