BUG: implement new engine for codes-based MultiIndex indexing

toobaz · toobaz · commit 5e44452c71e8 · 2018-01-11T19:00:36.000+01:00
closes #18519 closes #18818 closes #18520 closes #18485 closes #15994 closes #19086
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -388,9 +388,6 @@ Indexing
 
 - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`)
 - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`)
-- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
-- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
-- Bug in :func:`MultiIndex.from_tuples`` which would fail to take zipped tuples in python3 (:issue:`18434`)
 - Bug in :class:`Index` construction from list of mixed type tuples (:issue:`18505`)
 - Bug in :func:`Index.drop` when passing a list of both tuples and non-tuples (:issue:`18304`)
 - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)
@@ -406,6 +403,18 @@ Indexing
 - Bug in :func:`MultiIndex.set_labels` which would cause casting (and potentially clipping) of the new labels if the ``level`` argument is not 0 or a list like [0, 1, ... ]  (:issue:`19057`)
 - Bug in ``str.extractall`` when there were no matches empty :class:`Index` was returned instead of appropriate :class:`MultiIndex` (:issue:`19034`)
 
+MultiIndex
+^^^^^^^^^^
+
+- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
+- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
+- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`)
+- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
+- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
+- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing NaN (:issue:`18485`)
+- Bug in :func:`MultiIndex.get_loc` in large :class:`MultiIndex` which would fail when levels had different dtypes (:issue:`18520`)
+
+
 I/O
 ^^^
 
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -20,6 +20,7 @@ from hashtable cimport HashTable
 from pandas._libs import algos, hashtable as _hash
 from pandas._libs.tslibs import period as periodlib
 from pandas._libs.tslib import Timestamp, Timedelta
+from pandas._libs.missing import checknull
 from datetime import datetime, timedelta, date
 
 from cpython cimport PyTuple_Check, PyList_Check
@@ -599,6 +600,93 @@ cpdef convert_scalar(ndarray arr, object value):
     return value
 
 
+cdef class BaseMultiIndexCodesEngine(object):
+    """
+    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
+    represent each label in a MultiIndex as an integer, by juxtaposing the bits
+    encoding each level, with appropriate offsets.
+    Consequently, labels are located by first locating each component on the
+    respective level, then locating the integer representation of its codes.
+    """
+    def __init__(self, object levels, object labels,
+                 ndarray[uint64_t, ndim=1] offsets):
+        self.levels = levels
+        self.offsets = offsets
+
+        # Transform labels in a single array, and add 1 so that we are working
+        # with non-negative integers (-1 for NaN becomes 0):
+        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64')
+        # Map each codes combination in the index to an integer unambiguously
+        # (no collisions possible), based on the "offsets", which describe the
+        # number of bits by which to shift the codes of each level:
+        lab_ints = self.codes_to_ints(codes)
+
+        # Initialize underlying index
+        self._base.__init__(self, lambda: lab_ints, len(lab_ints))
+
+    def get_indexer(self, object target, object method=None,
+                    object limit=None):
+        """Return an indexer of ``target`` into the labels of this engine."""
+        level_codes = [self.levels[lev].get_indexer(codes, method=method) + 1
+                       for lev, codes in enumerate(zip(*target))]
+
+        keys_int = self.codes_to_ints(np.array(level_codes, dtype='uint64').T)
+
+        # All methods (exact, backfill, pad) directly map to the respective
+        # methods of the underlying (integers) index...
+        if method is None:
+            return self._base.get_indexer(self, keys_int)
+
+        # ... but the underlying backfill and pad methods require index and
+        # keys to be sorted. The index is expected to be sorted already
+        # (TODO confirm where that is checked), so sort only the keys:
+        order = np.argsort(keys_int)
+        sup_meth = getattr(self._base, 'get_{}_indexer'.format(method))
+        indexer = sup_meth(self, keys_int[order], limit=limit)
+        # The result follows the sorted keys: restore the order of target
+        # with the inverse permutation (argsort of order), not order itself:
+        return indexer[np.argsort(order)]
+
+    def get_loc(self, object key):
+        """Locate ``key``, which must be a tuple of level values."""
+        if is_definitely_invalid_key(key):
+            raise TypeError("'{key}' is an invalid key".format(key=key))
+        if not isinstance(key, tuple):
+            raise KeyError(key)
+        # Missing values map to code 0, found values to their position + 1:
+        try:
+            indices = [0 if checknull(v) else self.levels[l].get_loc(v) + 1
+                       for l, v in enumerate(key)]
+        except KeyError:
+            raise KeyError(key)
+
+        # ndmin=2 because codes_to_ints expects multiple labels:
+        indices = np.array(indices, ndmin=2, dtype='uint64')
+        # ... and returns a (length 1, in this case) array of integers:
+        key_int = self.codes_to_ints(indices)[0]
+
+        return self._base.get_loc(self, key_int)
+
+    def get_indexer_non_unique(self, object target):
+        # This needs to be overridden just because the default one works on
+        # target._values, and target can be itself a MultiIndex.
+
+        level_codes = [self.levels[lev].get_indexer(codes) + 1
+                       for lev, codes in enumerate(zip(*target))]
+        codes = np.array(level_codes, dtype='uint64').T
+        keys_int = self.codes_to_ints(codes)
+
+        return self._base.get_indexer_non_unique(self, keys_int)
+
+    def __contains__(self, object val):
+        try:
+            self.get_loc(val)
+            return True
+        except (KeyError, TypeError, ValueError):
+            return False
+
+
+
 cdef class MultiIndexObjectEngine(ObjectEngine):
     """
     provide the same interface as the MultiIndexEngine
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -50,6 +50,43 @@
          target_klass='MultiIndex or list of tuples'))
 
 
+class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine,
+                           libindex.UInt64Engine):
+    """
+    Engine mapping each label combination of a MultiIndex to a uint64.
+    """
+    _base = libindex.UInt64Engine
+
+    def codes_to_ints(self, codes):
+        # In-place left shift: every column (level) of "codes" is moved by its
+        # pre-calculated number of bits ("codes" itself is modified):
+        np.left_shift(codes, self.offsets, out=codes)
+        # The shifted levels occupy disjoint groups of bits, so OR-ing (or,
+        # equivalently, summing) the columns of a row composes them into a
+        # single integer per row:
+        return np.bitwise_or.reduce(codes, axis=1)
+
+
+class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
+                            libindex.ObjectEngine):
+    """
+    Engine for the (extreme) case in which the label combinations of a
+    MultiIndex need more than 64 bits: they are stored as Python integers in
+    an ObjectEngine.
+    """
+    _base = libindex.ObjectEngine
+
+    def codes_to_ints(self, codes):
+        # Shifting could overflow uint64, so switch to Python integers (object
+        # dtype) before moving each level by its pre-calculated bit offset:
+        py_codes = codes.astype('object')
+        shifted = py_codes << self.offsets
+        # The shifted levels occupy disjoint groups of bits, so OR-ing (or,
+        # equivalently, summing) the columns of a row composes them into a
+        # single integer per row:
+        return np.bitwise_or.reduce(shifted, axis=1)
+
+
 class MultiIndex(Index):
     """
     A multi-level, or hierarchical, index object for pandas objects
@@ -692,16 +729,22 @@ def _get_level_number(self, level):
 
     @cache_readonly
     def _engine(self):
-
-        # choose our engine based on our size
-        # the hashing based MultiIndex for larger
-        # sizes, and the MultiIndexOjbect for smaller
-        # xref: https://github.com/pandas-dev/pandas/pull/16324
-        l = len(self)
-        if l > 10000:
-            return libindex.MultiIndexHashEngine(lambda: self, l)
-
-        return libindex.MultiIndexObjectEngine(lambda: self.values, l)
+        # Number of bits needed to represent the codes of each level:
+        # ceil(log2(size + 1)), the + 1 accounting for the NaN code (-1):
+        sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels]))
+        # Sum bit counts, starting from the _right_...
+        lev_bits = np.cumsum(sizes[::-1])[::-1]
+        # ... in order to obtain offsets such that sorting the combination of
+        # shifted codes (one for each level, resulting in a unique integer) is
+        # equivalent to sorting lexicographically the codes themselves. Each
+        # level is shifted by the total number of bits used by the levels to
+        # its _right_ (so the last level is not shifted at all):
+        offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64')
+        # lev_bits[0] is the total number of bits of the representation:
+        if lev_bits[0] > 64:
+            # The levels would overflow a 64 bit uint - use Python integers:
+            return MultiIndexPyIntEngine(self.levels, self.labels, offsets)
+        return MultiIndexUIntEngine(self.levels, self.labels, offsets)
 
     @property
     def values(self):
@@ -1890,7 +1933,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             if tolerance is not None:
                 raise NotImplementedError("tolerance not implemented yet "
                                           'for MultiIndex')
-            indexer = self._get_fill_indexer(target, method, limit)
+            indexer = self._engine.get_indexer(target, method, limit)
         elif method == 'nearest':
             raise NotImplementedError("method='nearest' not implemented yet "
                                       'for MultiIndex; see GitHub issue 9365')