Skip to content

Commit 034ba25

Browse files
committed
BUG: implement new engine for codes-based MultiIndex indexing
closes pandas-dev#18519 closes pandas-dev#18818 closes pandas-dev#18520 closes pandas-dev#18485 closes pandas-dev#15994 closes pandas-dev#19086
1 parent 860c99c commit 034ba25

File tree

3 files changed

+233
-12
lines changed

3 files changed

+233
-12
lines changed

doc/source/whatsnew/v0.23.0.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,7 @@ Performance Improvements
380380
- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`)
381381
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
382382
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
383+
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
383384

384385

385386
.. _whatsnew_0230.docs:
@@ -476,7 +477,11 @@ MultiIndex
476477
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
477478
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
478479
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in Python 3 (:issue:`18434`)
479-
-
480+
- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
481+
- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
482+
- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`)
483+
- Bug in :func:`MultiIndex.get_loc` which, in a large :class:`MultiIndex`, would fail when levels had different dtypes (:issue:`18520`)
484+
480485

481486
I/O
482487
^^^

pandas/_libs/index.pyx

+136
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ from hashtable cimport HashTable
2626
from pandas._libs import algos, hashtable as _hash
2727
from pandas._libs.tslibs import period as periodlib
2828
from pandas._libs.tslib import Timestamp, Timedelta
29+
from pandas._libs.missing import checknull
2930

3031
cdef int64_t iNaT = util.get_nat()
3132

@@ -585,6 +586,141 @@ cpdef convert_scalar(ndarray arr, object value):
585586
return value
586587

587588

589+
cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ _____ _ __ __ __ __ __ __
    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    The illustration numbers offsets starting from the first level. The
    offsets actually used are whatever is passed to ``__init__``:
    ``MultiIndex._engine`` gives the *first* level the largest offset, so that
    ordering the integers is equivalent to ordering the codes
    lexicographically.

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints (provided by the concrete subclasses, together with the
    ``_base`` engine class all lookups are delegated to).

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of numpy arrays
            Levels of the MultiIndex
        labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex
        offsets : numpy array of uint64 dtype
            Pre-calculated offsets, one for each level of the index
        """

        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 1 so that we are working
        # with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
                                                               copy=False)

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lambda: lab_ints, len(lab_ints))

    def _extract_level_codes(self, object target, object method=None):
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : list-like of keys
            Each key is a tuple, with a label for each level of the index.
        method : object, optional
            Accepted for signature compatibility; not used here.

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """

        # Locate each component in its own level; components which are not
        # found yield -1, which the "+ 1" shifts to 0 (same slot as NaN):
        level_codes = [lev.get_indexer(codes) + 1 for lev, codes
                       in zip(self.levels, zip(*target))]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, object target, object method=None,
                    object limit=None):
        """
        Compute, for each key in target, its position in the index.

        Parameters
        ----------
        target : list-like of keys
            Each key is a tuple, with a label for each level of the index.
        method : str, optional
            If given, the lookup is delegated to the method
            ``get_<method>_indexer`` of the underlying integer engine
            (e.g. 'pad', 'backfill'); otherwise exact matching is used.
        limit : object, optional
            Forwarded to the fill indexer (only used when method is given).

        Returns
        -------
        indexer : 1-dimensional array
            One entry per key of target, in the order of target.
        """
        lab_ints = self._extract_level_codes(target)

        # All methods (exact, backfill, pad) directly map to the respective
        # methods of the underlying (integers) index...
        if method is None:
            return self._base.get_indexer(self, lab_ints)

        # ... but underlying backfill and pad methods require index and keys
        # to be sorted. The index already is (checked in
        # Index._get_fill_indexer), sort (integer representations of) keys:
        order = np.argsort(lab_ints)
        fill_indexer = getattr(self._base, 'get_{}_indexer'.format(method))
        sorted_indexer = fill_indexer(self, lab_ints[order], limit=limit)

        # Undo the sort: sorted_indexer[i] is the answer for the key at
        # position order[i] of target, so scatter it back there. (Indexing
        # with ``order`` again would apply the permutation twice rather than
        # invert it.)
        indexer = np.empty_like(sorted_indexer)
        indexer[order] = sorted_indexer

        return indexer

    def get_loc(self, object key):
        """
        Locate a single, complete key.

        Parameters
        ----------
        key : tuple
            One label for each level of the index.

        Returns
        -------
        Whatever ``get_loc`` of the underlying integer engine returns for the
        integer representation of key.

        Raises
        ------
        TypeError
            If key is reported as invalid by is_definitely_invalid_key.
        KeyError
            If key is not a tuple, or any of its components is not found in
            the respective level.
        """
        if is_definitely_invalid_key(key):
            raise TypeError("'{key}' is an invalid key".format(key=key))
        if not PyTuple_Check(key):
            raise KeyError(key)
        try:
            # Null components map to code 0, i.e. label -1 shifted by one:
            indices = [0 if checknull(v) else lev.get_loc(v) + 1
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            # Report the full key rather than the single missing component:
            raise KeyError(key)

        # ndmin=2 because codes_to_ints expects multiple labels:
        indices = np.array(indices, ndmin=2, dtype='uint64')
        # ... and returns a (length 1, in this case) array of integers:
        lab_int = self._codes_to_ints(indices)[0]

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, object target):
        """
        Delegate to ``get_indexer_non_unique`` of the underlying integer
        engine, after mapping the keys in target to their integer
        representations.
        """
        # This needs to be overridden just because the default one works on
        # target._values, and target can be itself a MultiIndex.

        lab_ints = self._extract_level_codes(target)
        indexer = self._base.get_indexer_non_unique(self, lab_ints)

        return indexer

    def __contains__(self, object val):
        """
        Return True if val can be located with get_loc, False otherwise.
        """
        # Default __contains__ looks in the underlying mapping, which in this
        # case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
722+
723+
588724
cdef class MultiIndexObjectEngine(ObjectEngine):
589725
"""
590726
provide the same interface as the MultiIndexEngine

pandas/core/indexes/multi.py

+91-11
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,77 @@
4545
target_klass='MultiIndex or list of tuples'))
4646

4747

48+
class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine,
                           libindex.UInt64Engine):
    """
    Engine handling a MultiIndex by encoding every combination of labels as a
    single positive (uint64) integer.
    """
    _base = libindex.UInt64Engine

    def _codes_to_ints(self, codes):
        """
        Collapse each row of a 2d uint64 array into one uint64, preserving
        the lexicographic order of the rows (strictly monotonic mapping): see
        BaseMultiIndexCodesEngine documentation.

        Note that ``codes`` is shifted in place, i.e. the array passed in is
        modified.

        Parameters
        ----------
        codes : 2-dimensional array of dtype uint64
            Combinations of integers (one per row)

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64
            Integers representing one combination each
        """
        # Move every column to its own bit range, as given by the
        # pre-calculated per-level offsets (writing into "codes" itself):
        np.left_shift(codes, self.offsets, out=codes)

        # The bit ranges of different levels never overlap, hence OR-ing
        # (equivalently: summing) the columns composes them into a single
        # positive integer per row:
        int_keys = np.bitwise_or.reduce(codes, axis=1)
        return int_keys
80+
81+
82+
class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
                            libindex.ObjectEngine):
    """
    Engine for the (extreme) situations where the combinations of labels do
    not fit in a 64 bits integer: combinations are stored as Python integers
    in an ObjectEngine.
    """
    _base = libindex.ObjectEngine

    def _codes_to_ints(self, codes):
        """
        Collapse each row of a 2d uint64 array into one Python integer,
        preserving the lexicographic order of the rows (strictly monotonic
        mapping): see BaseMultiIndexCodesEngine documentation.

        Parameters
        ----------
        codes : 2-dimensional array of dtype uint64
            Combinations of integers (one per row)

        Returns
        -------
        int_keys : 1-dimensional array of dtype object
            Integers representing one combination each
        """
        # Shifting could overflow uint64, so switch to (unbounded) Python
        # integers before moving each column by its pre-calculated offset:
        py_codes = codes.astype('object')
        shifted = np.left_shift(py_codes, self.offsets)

        # The bit ranges of different levels never overlap, hence OR-ing
        # (equivalently: summing) the columns composes them into a single
        # positive integer per row:
        return np.bitwise_or.reduce(shifted, axis=1)
117+
118+
48119
class MultiIndex(Index):
49120
"""
50121
A multi-level, or hierarchical, index object for pandas objects
@@ -687,16 +758,25 @@ def _get_level_number(self, level):
687758

688759
@cache_readonly
def _engine(self):
    """
    Build the codes-based engine used to locate keys in this MultiIndex.

    Returns a MultiIndexUIntEngine when all label combinations can be
    encoded in 64 bits, a MultiIndexPyIntEngine otherwise.
    """
    # Number of bits required by each level: log2 of its size, counting
    # one extra slot for the -1 code (NaN):
    level_sizes = [len(level) + 1 for level in self.levels]
    bits_per_level = np.ceil(np.log2(level_sizes))

    # Accumulate the bit counts starting from the _right_: entry i is the
    # number of bits needed by level i and by all the levels after it...
    cumulative_bits = np.cumsum(bits_per_level[::-1])[::-1]

    # ... so that dropping the first entry and appending 0 gives, for each
    # level, the number of bits taken by the levels on its right. Shifting
    # by these offsets makes the order of the resulting (unique) integers
    # coincide with the lexicographic order of the codes themselves:
    offsets = np.concatenate([cumulative_bits[1:], [0]]).astype('uint64')

    # The first entry is the total number of bits of the representation:
    total_bits = cumulative_bits[0]
    if total_bits > 64:
        # A 64 bit uint would overflow - fall back to Python integers:
        return MultiIndexPyIntEngine(self.levels, self.labels, offsets)
    return MultiIndexUIntEngine(self.levels, self.labels, offsets)
700780

701781
@property
702782
def values(self):
@@ -1885,7 +1965,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
18851965
if tolerance is not None:
18861966
raise NotImplementedError("tolerance not implemented yet "
18871967
'for MultiIndex')
1888-
indexer = self._get_fill_indexer(target, method, limit)
1968+
indexer = self._engine.get_indexer(target, method, limit)
18891969
elif method == 'nearest':
18901970
raise NotImplementedError("method='nearest' not implemented yet "
18911971
'for MultiIndex; see GitHub issue 9365')

0 commit comments

Comments
 (0)