Skip to content

Commit 5e44452

Browse files
committed
BUG: implement new engine for codes-based MultiIndex indexing
closes #18519 closes #18818 closes #18520 closes #18485 closes #15994 closes #19086
1 parent 8acdf80 commit 5e44452

File tree

3 files changed

+154
-14
lines changed

3 files changed

+154
-14
lines changed

doc/source/whatsnew/v0.23.0.txt

+12-3
Original file line numberDiff line numberDiff line change
@@ -388,9 +388,6 @@ Indexing
388388

389389
- Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`)
390390
- Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`)
391-
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
392-
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
393-
- Bug in :func:`MultiIndex.from_tuples`` which would fail to take zipped tuples in python3 (:issue:`18434`)
394391
- Bug in :class:`Index` construction from list of mixed type tuples (:issue:`18505`)
395392
- Bug in :func:`Index.drop` when passing a list of both tuples and non-tuples (:issue:`18304`)
396393
- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)
@@ -406,6 +403,18 @@ Indexing
406403
- Bug in :func:`MultiIndex.set_labels` which would cause casting (and potentially clipping) of the new labels if the ``level`` argument is not 0 or a list like [0, 1, ... ] (:issue:`19057`)
407404
- Bug in ``str.extractall`` where, when there were no matches, an empty :class:`Index` was returned instead of the appropriate :class:`MultiIndex` (:issue:`19034`)
408405

406+
MultiIndex
407+
^^^^^^^^^^
408+
409+
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
410+
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
411+
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in Python 3 (:issue:`18434`)
412+
- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
413+
- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
414+
- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing NaN (:issue:`18485`)
415+
- Bug in :func:`MultiIndex.get_loc` in large :class:`MultiIndex` which would fail when levels had different dtypes (:issue:`18520`)
416+
417+
409418
I/O
410419
^^^
411420

pandas/_libs/index.pyx

+88
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ from hashtable cimport HashTable
2020
from pandas._libs import algos, hashtable as _hash
2121
from pandas._libs.tslibs import period as periodlib
2222
from pandas._libs.tslib import Timestamp, Timedelta
23+
from pandas._libs.missing import checknull
2324
from datetime import datetime, timedelta, date
2425

2526
from cpython cimport PyTuple_Check, PyList_Check
@@ -599,6 +600,93 @@ cpdef convert_scalar(ndarray arr, object value):
599600
return value
600601

601602

603+
cdef class BaseMultiIndexCodesEngine(object):
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    Consequently, a label is located by first locating each of its components
    on the respective level, then looking up the integer representation of the
    resulting codes in the underlying engine.

    This class uses ``self._base`` (the engine class storing the integers) and
    ``self.codes_to_ints`` (the codes -> integers mapping) without defining
    them: both must be provided by subclasses.
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : sequence of per-level indexes
            ``levels[i]`` must provide ``get_loc`` and ``get_indexer``.
        labels : array-like of integer codes, one sequence per level
            -1 denotes a missing value (NaN).
        offsets : ndarray[uint64]
            Number of bits by which the codes of each level are shifted;
            consumed by ``codes_to_ints``.
        """
        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 1 so that we are working
        # with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64')
        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self.codes_to_ints(codes)

        # Initialize underlying index
        self._base.__init__(self, lambda: lab_ints, len(lab_ints))

    def get_indexer(self, object target, object method=None,
                    object limit=None):
        """
        Compute an indexer for the labels in ``target``.

        Parameters
        ----------
        target : iterable of tuples, one component per level
        method : str, optional
            When given, the lookup is delegated to the
            ``get_<method>_indexer`` method of the underlying engine.
        limit : optional
            Forwarded to the fill method; not used when ``method`` is None.

        Returns
        -------
        The indexer returned by the underlying engine, in the same order as
        ``target``.
        """
        # NOTE(review): ``method`` is also applied to each level separately;
        # whether per-level filling always agrees with filling on the full
        # tuple should be confirmed.
        level_codes = [self.levels[lev].get_indexer(codes, method=method) + 1
                       for lev, codes in enumerate(zip(*target))]

        keys_int = self.codes_to_ints(np.array(level_codes, dtype='uint64').T)

        # All methods (exact, backfill, pad) directly map to the respective
        # methods of the underlying (integers) index...
        if method is not None:
            # but underlying backfill and pad methods require index and keys
            # to be sorted. The index already is (checked in
            # Index._get_fill_indexer), so sort keys (integer representations):
            order = np.argsort(keys_int)
            sup_meth = getattr(self._base, 'get_{}_indexer'.format(method))
            sorted_indexer = sup_meth(self, keys_int[order], limit=limit)
            # sorted_indexer[i] answers the key originally at position
            # order[i]; restoring the order of ``target`` therefore needs the
            # _inverse_ permutation (indexing with ``order`` again would
            # scramble the result unless the keys were already sorted):
            indexer = sorted_indexer[np.argsort(order)]
        else:
            indexer = self._base.get_indexer(self, keys_int)

        return indexer

    def get_loc(self, object key):
        """
        Locate a single, complete key.

        Parameters
        ----------
        key : tuple with one component per level
            Null components (as detected by ``checknull``) are mapped to
            code 0, i.e. the code reserved for missing values.

        Raises
        ------
        TypeError
            If ``key`` is rejected by ``is_definitely_invalid_key``.
        KeyError
            If ``key`` is not a tuple, or a component is missing from its
            level.
        """
        if is_definitely_invalid_key(key):
            raise TypeError("'{key}' is an invalid key".format(key=key))
        if not isinstance(key, tuple):
            raise KeyError(key)
        try:
            indices = [0 if checknull(v) else self.levels[lev].get_loc(v) + 1
                       for lev, v in enumerate(key)]
        except KeyError:
            # Report the full key rather than the single failing component:
            raise KeyError(key)

        # ndmin=2 because codes_to_ints expects multiple labels:
        indices = np.array(indices, ndmin=2, dtype='uint64')
        # ... and returns a (length 1, in this case) array of integers:
        key_int = self.codes_to_ints(indices)[0]

        return self._base.get_loc(self, key_int)

    def get_indexer_non_unique(self, object target):
        """
        Same as ``get_indexer`` (without fill method), but delegating to
        ``get_indexer_non_unique`` of the underlying engine.
        """
        # This needs to be overridden just because the default one works on
        # target._values, and target can be itself a MultiIndex.

        level_codes = [self.levels[lev].get_indexer(codes) + 1
                       for lev, codes in enumerate(zip(*target))]
        codes = np.array(level_codes, dtype='uint64').T
        keys_int = self.codes_to_ints(codes)

        indexer = self._base.get_indexer_non_unique(self, keys_int)

        return indexer

    def __contains__(self, object val):
        """Return True if ``get_loc(val)`` succeeds, False otherwise."""
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
687+
688+
689+
602690
cdef class MultiIndexObjectEngine(ObjectEngine):
603691
"""
604692
provide the same interface as the MultiIndexEngine

pandas/core/indexes/multi.py

+54-11
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,43 @@
5050
target_klass='MultiIndex or list of tuples'))
5151

5252

53+
class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine,
                           libindex.UInt64Engine):
    """
    Engine for a MultiIndex in which every combination of labels is encoded
    as a single positive integer, stored in a UInt64Engine.
    """
    _base = libindex.UInt64Engine

    def codes_to_ints(self, codes):
        """
        Collapse each row of ``codes`` (one column per level) into a single
        integer. ``codes`` is shifted in place.
        """
        # Move every column to its own range of bits, as pre-calculated in
        # "offsets" (the result is written back into "codes"):
        np.left_shift(codes, self.offsets, out=codes)
        # The shifted columns occupy disjoint bits, so OR-ing them (which here
        # is the same as summing them) yields one positive integer per row:
        combined = np.bitwise_or.reduce(codes, axis=1)
        return combined
68+
69+
70+
class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
                            libindex.ObjectEngine):
    """
    Fallback engine for those (extreme) cases in which the possible label
    combinations do not fit in 64 bit integers: the combinations are stored
    as Python integers in an ObjectEngine.
    """
    _base = libindex.ObjectEngine

    def codes_to_ints(self, codes):
        """
        Collapse each row of ``codes`` (one column per level) into a single
        Python integer.
        """
        # Shifting can overflow uint64, so switch to Python integers before
        # moving every column to its own range of bits:
        py_codes = codes.astype('object')
        shifted = py_codes << self.offsets
        # The shifted columns occupy disjoint bits, so OR-ing them (which here
        # is the same as summing them) yields one positive integer per row:
        return np.bitwise_or.reduce(shifted, axis=1)
88+
89+
5390
class MultiIndex(Index):
5491
"""
5592
A multi-level, or hierarchical, index object for pandas objects
@@ -692,16 +729,22 @@ def _get_level_number(self, level):
692729

693730
@cache_readonly
def _engine(self):
    """
    Build the codes-based engine for this index: MultiIndexUIntEngine when
    all levels fit together in 64 bits, MultiIndexPyIntEngine otherwise.
    """
    # Number of bits needed to represent the labels of each level: log2 of
    # the level size, counting one extra slot for -1 (NaN):
    level_sizes = [len(level) + 1 for level in self.levels]
    bits_per_level = np.ceil(np.log2(level_sizes))
    # Accumulate the bit counts starting from the _right_ ...
    lev_bits = np.cumsum(bits_per_level[::-1])[::-1]
    # ... so that each level is shifted by the bits needed by the levels to
    # its right (the last level is not shifted at all). This way, sorting the
    # resulting integers is equivalent to sorting the codes
    # lexicographically:
    offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64')
    # lev_bits[0] is the total number of bits of the representation; beyond
    # 64 it would overflow a uint64, so fall back to Python integers:
    if lev_bits[0] > 64:
        engine_klass = MultiIndexPyIntEngine
    else:
        engine_klass = MultiIndexUIntEngine
    return engine_klass(self.levels, self.labels, offsets)
705748

706749
@property
707750
def values(self):
@@ -1890,7 +1933,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
18901933
if tolerance is not None:
18911934
raise NotImplementedError("tolerance not implemented yet "
18921935
'for MultiIndex')
1893-
indexer = self._get_fill_indexer(target, method, limit)
1936+
indexer = self._engine.get_indexer(target, method, limit)
18941937
elif method == 'nearest':
18951938
raise NotImplementedError("method='nearest' not implemented yet "
18961939
'for MultiIndex; see GitHub issue 9365')

0 commit comments

Comments
 (0)