Skip to content

Commit 11627cd

Browse files
committed
BUG: implement new engine for codes-based MultiIndex indexing
closes pandas-dev#18519 closes pandas-dev#18818 closes pandas-dev#18520 closes pandas-dev#18485 closes pandas-dev#15994 closes pandas-dev#19086
1 parent 8acdf80 commit 11627cd

File tree

3 files changed

+224
-14
lines changed

3 files changed

+224
-14
lines changed

doc/source/whatsnew/v0.23.0.txt

+12-3
Original file line numberDiff line numberDiff line change
@@ -388,9 +388,6 @@ Indexing
388388

389389
- Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`)
390390
- Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`)
391-
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
392-
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
393-
- Bug in :func:`MultiIndex.from_tuples`` which would fail to take zipped tuples in python3 (:issue:`18434`)
394391
- Bug in :class:`Index` construction from list of mixed type tuples (:issue:`18505`)
395392
- Bug in :func:`Index.drop` when passing a list of both tuples and non-tuples (:issue:`18304`)
396393
- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)
@@ -406,6 +403,18 @@ Indexing
406403
- Bug in :func:`MultiIndex.set_labels` which would cause casting (and potentially clipping) of the new labels if the ``level`` argument is not 0 or a list like [0, 1, ... ] (:issue:`19057`)
407404
- Bug in ``str.extractall`` when there were no matches empty :class:`Index` was returned instead of appropriate :class:`MultiIndex` (:issue:`19034`)
408405

406+
MultiIndex
407+
^^^^^^^^^^
408+
409+
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on a level of ints with missing values (:issue:`17924`)
410+
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
411+
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`)
412+
- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
413+
- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
414+
- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`)
415+
- Bug in :func:`MultiIndex.get_loc` which would fail in a large :class:`MultiIndex` when levels had different dtypes (:issue:`18520`)
416+
417+
409418
I/O
410419
^^^
411420

pandas/_libs/index.pyx

+121
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ from hashtable cimport HashTable
2020
from pandas._libs import algos, hashtable as _hash
2121
from pandas._libs.tslibs import period as periodlib
2222
from pandas._libs.tslib import Timestamp, Timedelta
23+
from pandas._libs.missing import checknull
2324
from datetime import datetime, timedelta, date
2425

2526
from cpython cimport PyTuple_Check, PyList_Check
@@ -599,6 +600,126 @@ cpdef convert_scalar(ndarray arr, object value):
599600
return value
600601

601602

603+
cdef class BaseMultiIndexCodesEngine(object):
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits
    (code 0 of every level is reserved for NaN), as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ ____ _ __ __ __ __ __ __
    |0|0|0| ...|0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    The picture only illustrates the packing: the offsets themselves are
    received by the constructor, so which level ends up in the low-order bits
    is decided by the caller. Labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.

    Subclasses are expected to provide:

    - ``_base``: the engine class holding the integer representations, whose
      ``__init__``, ``get_loc``, ``get_indexer``, ``get_indexer_non_unique``
      and ``get_<method>_indexer`` are invoked with ``self`` passed
      explicitly;
    - ``_codes_to_ints(codes)``: maps a 2-dimensional array of codes (one
      combination per row) to a 1-dimensional array of integers.
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of indexes
            One per level; each must support ``get_loc`` and ``get_indexer``
        labels : list-like of integer arrays
            Codes of each level, with -1 marking missing values
        offsets : 1-dimensional array of dtype uint64
            Number of bits by which the codes of each level are shifted
        """
        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 1 so that we are working
        # with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64')

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lambda: lab_ints, len(lab_ints))

    def _extract_level_codes(self, object target, object method=None):
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : iterable of tuples
            Keys to look up, one component per level
        method : object, optional
            Unused; kept so that existing callers passing it keep working

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """
        # zip(*target) regroups the keys by level, so that every level indexes
        # all of its components at once. The result is shifted by 1 like the
        # codes stored at construction (presumably get_indexer reports
        # components absent from the level as -1, which then become 0, the
        # code used for NaN - TODO confirm).
        level_codes = [lev.get_indexer(codes) + 1 for lev, codes
                       in zip(self.levels, zip(*target))]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, object target, object method=None,
                    object limit=None):
        """
        Locate each key of target in the underlying integer index.

        Parameters
        ----------
        target : iterable of tuples
            Keys to look up, one component per level
        method : str, optional
            Name of the fill method; it selects ``get_<method>_indexer`` of
            the underlying engine. None requests exact matches.
        limit : object, optional
            Forwarded to the fill indexer; ignored when method is None

        Returns
        -------
        indexer : 1-dimensional array
            Result of the underlying engine, aligned with the order of the
            keys in target
        """
        lab_ints = self._extract_level_codes(target)

        # All methods (exact, backfill, pad) directly map to the respective
        # methods of the underlying (integers) index...
        if method is None:
            return self._base.get_indexer(self, lab_ints)

        # ... but underlying backfill and pad methods require index and keys
        # to be sorted. The index already is (checked in
        # Index._get_fill_indexer), sort (integer representations of) keys:
        order = np.argsort(lab_ints)
        fill_indexer = getattr(self._base, 'get_{}_indexer'.format(method))
        sorted_result = fill_indexer(self, lab_ints[order], limit=limit)

        # Undo the sorting: position i of the sorted result belongs to the
        # key originally at position order[i]. Scattering inverts the
        # permutation, whereas indexing with "order" would apply it twice.
        indexer = np.empty_like(sorted_result)
        indexer[order] = sorted_result
        return indexer

    def get_loc(self, object key):
        """
        Locate a single, complete key.

        Parameters
        ----------
        key : tuple
            One component per level; null components are looked up as NaN

        Returns
        -------
        loc : object
            Whatever ``get_loc`` of the underlying engine returns for the
            integer representation of key

        Raises
        ------
        TypeError
            If key is definitely invalid
        KeyError
            If key is not a tuple, or a component is missing from its level
        """
        if is_definitely_invalid_key(key):
            raise TypeError("'{key}' is an invalid key".format(key=key))
        if not PyTuple_Check(key):
            raise KeyError(key)
        try:
            # Code 0 is reserved for missing values, hence the shift by 1 of
            # the positions found in each level:
            indices = [0 if checknull(v) else lev.get_loc(v) + 1
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            # Report the whole key rather than the failing component:
            raise KeyError(key)

        # ndmin=2 because codes_to_ints expects multiple labels:
        indices = np.array(indices, ndmin=2, dtype='uint64')
        # ... and returns a (length 1, in this case) array of integers:
        lab_int = self._codes_to_ints(indices)[0]

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, object target):
        """
        Locate each key of target, allowing for duplicates in the index.

        Parameters
        ----------
        target : iterable of tuples
            Keys to look up, one component per level

        Returns
        -------
        indexer : object
            Whatever ``get_indexer_non_unique`` of the underlying engine
            returns for the integer representations of the keys
        """
        # This needs to be overridden just because the default one works on
        # target._values, and target can be itself a MultiIndex.

        lab_ints = self._extract_level_codes(target)
        indexer = self._base.get_indexer_non_unique(self, lab_ints)

        return indexer

    def __contains__(self, object val):
        """
        Return True if val can be located by get_loc, False otherwise.
        """
        # Default __contains__ looks in the underlying mapping, which in this
        # case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
721+
722+
602723
cdef class MultiIndexObjectEngine(ObjectEngine):
603724
"""
604725
provide the same interface as the MultiIndexEngine

pandas/core/indexes/multi.py

+91-11
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,77 @@
5050
target_klass='MultiIndex or list of tuples'))
5151

5252

53+
class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine,
                           libindex.UInt64Engine):
    """
    Engine for a MultiIndex whose label combinations are each packed into a
    single unsigned 64 bit integer.
    """
    _base = libindex.UInt64Engine

    def _codes_to_ints(self, codes):
        """
        Pack each row of a 2d array of uint64 codes into one uint64, shifting
        every column by the corresponding entry of ``self.offsets`` and
        combining the shifted columns: see BaseMultiIndexCodesEngine
        documentation.

        Note that ``codes`` is shifted in place, so the array passed in is
        modified.

        Parameters
        ----------
        codes : 2-dimensional array of dtype uint64
            Combinations of integers (one per row)

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64
            Integers representing one combination each
        """
        # Move the bits of every level to the position reserved for it,
        # reusing the input buffer rather than allocating a new array:
        np.left_shift(codes, self.offsets, out=codes)

        # The shifted columns occupy disjoint bits, so OR-ing them along each
        # row (which here is the same as summing them) yields one positive
        # integer per combination:
        packed = np.bitwise_or.reduce(codes, axis=1)
        return packed
85+
86+
87+
class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
                            libindex.ObjectEngine):
    """
    Engine for those (extreme) MultiIndexes whose label combinations do not
    fit in 64 bits: combinations are stored as Python integers in an
    ObjectEngine.
    """
    _base = libindex.ObjectEngine

    def _codes_to_ints(self, codes):
        """
        Pack each row of a 2d array of uint64 codes into one Python integer,
        shifting every column by the corresponding entry of ``self.offsets``
        and combining the shifted columns: see BaseMultiIndexCodesEngine
        documentation.

        Parameters
        ----------
        codes : 2-dimensional array of dtype uint64
            Combinations of integers (one per row)

        Returns
        -------
        int_keys : 1-dimensional array of dtype object
            Integers representing one combination each
        """
        # The shifted values may not fit in a uint64, so switch to Python
        # integers (object dtype) before moving the bits of each level to the
        # position reserved for it:
        py_codes = codes.astype('object')
        shifted = np.left_shift(py_codes, self.offsets)

        # The shifted columns occupy disjoint bits, so OR-ing them along each
        # row (which here is the same as summing them) yields one positive
        # integer per combination:
        return np.bitwise_or.reduce(shifted, axis=1)
122+
123+
53124
class MultiIndex(Index):
54125
"""
55126
A multi-level, or hierarchical, index object for pandas objects
@@ -692,16 +763,25 @@ def _get_level_number(self, level):
692763

693764
@cache_readonly
def _engine(self):
    """
    Build the codes-based engine used to locate keys in this MultiIndex.

    Returns a MultiIndexUIntEngine when all levels together need at most 64
    bits, and a MultiIndexPyIntEngine otherwise.
    """
    # Bits needed by each level: log2 of its number of values, counting one
    # extra value for NaN (label -1), rounded up.
    level_sizes = [len(level) + 1 for level in self.levels]
    bits_per_level = np.ceil(np.log2(level_sizes))

    # Accumulate from the _right_: entry i is the number of bits taken by
    # level i together with all the levels that follow it.
    cumulative_bits = np.cumsum(bits_per_level[::-1])[::-1]

    # Each level is shifted by the bits taken by the levels after it (none
    # for the last one), so that sorting the resulting integers is
    # equivalent to sorting the combinations of codes lexicographically.
    offsets = np.concatenate([cumulative_bits[1:], [0]]).astype('uint64')

    # The first entry is the total number of bits of the representation:
    # beyond 64 it would overflow a uint64, so use Python integers instead.
    if cumulative_bits[0] > 64:
        engine_klass = MultiIndexPyIntEngine
    else:
        engine_klass = MultiIndexUIntEngine
    return engine_klass(self.levels, self.labels, offsets)
705785

706786
@property
707787
def values(self):
@@ -1890,7 +1970,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
18901970
if tolerance is not None:
18911971
raise NotImplementedError("tolerance not implemented yet "
18921972
'for MultiIndex')
1893-
indexer = self._get_fill_indexer(target, method, limit)
1973+
indexer = self._engine.get_indexer(target, method, limit)
18941974
elif method == 'nearest':
18951975
raise NotImplementedError("method='nearest' not implemented yet "
18961976
'for MultiIndex; see GitHub issue 9365')

0 commit comments

Comments
 (0)