Skip to content

Commit cf02b08

Browse files
committed
BUG: implement new engine for codes-based MultiIndex indexing
closes pandas-dev#18519 closes pandas-dev#18818 closes pandas-dev#18520 closes pandas-dev#18485 closes pandas-dev#15994
1 parent 6552718 commit cf02b08

File tree

2 files changed

+115
-10
lines changed

2 files changed

+115
-10
lines changed

doc/source/whatsnew/v0.23.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,9 @@ Indexing
314314
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
315315
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
316316
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`)
317+
- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
318+
- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing NaN (:issue:`18485`)
319+
- Bug in :func:`MultiIndex.get_loc` in large :class:`MultiIndex` which would fail when levels had different dtypes (:issue:`18520`)
317320
- Bug in :class:`Index` construction from list of mixed type tuples (:issue:`18505`)
318321
- Bug in :func:`Index.drop` when passing a list of both tuples and non-tuples (:issue:`18304`)
319322
- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)

pandas/core/indexes/multi.py

+112-10
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,109 @@
5050
target_klass='MultiIndex or list of tuples'))
5151

5252

53+
def is_definitely_invalid_key(val):
    """
    Tell whether ``val`` can be rejected as a lookup key without searching.

    Parameters
    ----------
    val : object
        Candidate key.

    Returns
    -------
    bool
        True for slices, numpy arrays, lists, tuples that cannot be hashed,
        and any object exposing a ``_data`` attribute; False otherwise.
    """
    if isinstance(val, (slice, np.ndarray, list)):
        return True

    if isinstance(val, tuple):
        # A tuple is only usable as a key if all of its content is hashable
        try:
            hash(val)
        except TypeError:
            return True

    # we have a _data, means we are a NDFrame
    return hasattr(val, '_data')
62+
63+
64+
class BaseMultiIndexCodesEngine(object):
    """
    Base class for engines indexing a MultiIndex through its integer codes.

    Every combination of per-level codes is packed into one integer: each
    code is incremented by one (so that the -1 standing for NaN becomes 0),
    shifted left by the offset of its level, and the shifted values are
    OR-ed together.  The packed integers are then handed to the engine
    class referenced by ``self._base``.

    Subclasses must provide two class attributes:

    _base : engine class whose ``__init__``, ``get_loc``, ``get_indexer``,
        ``get_indexer_non_unique`` and ``get_<method>_indexer`` methods are
        called (unbound, passing ``self``) with the packed integers
    _type : dtype the incremented codes are cast to before being shifted
    """

    def __init__(self, levels, labels, offsets, **kwargs):
        """
        Parameters
        ----------
        levels : sequence
            One entry per level; each entry must offer ``get_loc`` and
            ``get_indexer``.
        labels : array-like
            Codes, one row per level (it is transposed before packing).
        offsets : array-like of unsigned integers
            Number of bits each level's codes are shifted left by.
        **kwargs
            Forwarded untouched to ``self._base.__init__``.
        """
        self._levels = levels
        self._labels = labels
        self._offsets = offsets

        # Map each combination to an integer
        lab_ints = self._labs_to_ints(labels)

        # Initialize underlying index
        self._base.__init__(self, lambda: lab_ints, len(lab_ints), **kwargs)

    def _labs_to_ints(self, labels):
        """
        Pack codes (one row per level) into one integer per combination.

        Parameters
        ----------
        labels : array-like
            Codes, one row per level.

        Returns
        -------
        numpy.ndarray
            One packed integer per column of ``labels``.
        """
        # Add 1 so that -1 (NaN) becomes 0
        codes = (np.asarray(labels).T + 1).astype(self._type)
        # Shift:
        rot_codes = codes << self._offsets
        # Now sum and OR are in fact interchangeable:
        return np.bitwise_or.reduce(rot_codes, axis=1)

    def _extract_level_codes(self, target, method=None):
        """
        Translate an iterable of key tuples into packed integers.

        Each level looks up its own component of the keys through
        ``get_indexer`` (forwarding ``method``), then the per-level results
        are packed with ``_labs_to_ints``.
        """
        level_codes = [self._levels[lev].get_indexer(codes, method=method)
                       for lev, codes in enumerate(zip(*target))]
        return self._labs_to_ints(level_codes)

    def get_indexer(self, target, method=None, limit=None):
        """
        Return, for every key in ``target``, its position in the engine.

        Parameters
        ----------
        target : iterable of tuples
            Keys to look up, one component per level.
        method : str, optional
            When given, ``self._base.get_<method>_indexer`` is used instead
            of the exact ``self._base.get_indexer``.
        limit : int, optional
            Forwarded to the fill indexer; unused when ``method`` is None.

        Returns
        -------
        numpy.ndarray
            Indexer aligned with the order of ``target``.
        """
        keys_int = self._extract_level_codes(target, method=method)

        if method is None:
            return self._base.get_indexer(self, keys_int)

        # keys must be sorted - the engine already is
        order = np.argsort(keys_int)
        fill_indexer = getattr(self._base, 'get_{}_indexer'.format(method))
        sorted_indexer = fill_indexer(self, keys_int[order], limit=limit)

        # sorted_indexer[i] is the answer for keys_int[order[i]]: scatter the
        # results back so that they line up with target again.  Indexing
        # with ``order`` a second time would permute, not restore, them.
        indexer = np.empty_like(sorted_indexer)
        indexer[order] = sorted_indexer
        return indexer

    def get_loc(self, key):
        """
        Return the location of a single key.

        Parameters
        ----------
        key : tuple
            One component per level; NaN components are encoded as -1.

        Returns
        -------
        Whatever ``self._base.get_loc`` returns for the packed key.

        Raises
        ------
        TypeError
            If ``key`` is rejected by ``is_definitely_invalid_key``.
        KeyError
            If ``key`` is not a tuple, or one of its components is missing
            from the corresponding level.
        """
        if is_definitely_invalid_key(key):
            raise TypeError("'{key}' is an invalid key".format(key=key))
        if not isinstance(key, tuple):
            raise KeyError(key)
        try:
            idces = [-1 if isna(val) else self._levels[lev].get_loc(val)
                     for lev, val in enumerate(key)]
        except KeyError:
            # Report the whole key rather than the single failing component
            raise KeyError(key)
        idces = np.array(idces, ndmin=2).T

        key_int = self._labs_to_ints(idces)[0]

        return self._base.get_loc(self, key_int)

    def get_indexer_non_unique(self, target):
        """
        Look up ``target`` through ``self._base.get_indexer_non_unique``.

        Parameters
        ----------
        target : iterable of tuples
            Keys to look up, one component per level.

        Returns
        -------
        Whatever ``self._base.get_indexer_non_unique`` returns for the
        packed keys.
        """
        # This needs to be overridden just because the default one works on
        # target._values, and target can be itself a MultiIndex.
        keys_int = self._extract_level_codes(target)
        return self._base.get_indexer_non_unique(self, keys_int)

    def __contains__(self, val):
        """
        Return True if ``get_loc`` can locate ``val``, False otherwise.

        Only lookup-style failures are treated as "not contained":
        LookupError (KeyError, IndexError), TypeError for invalid keys and
        ValueError (e.g. a key whose shape cannot be packed).  Anything
        else, such as KeyboardInterrupt, propagates to the caller.
        """
        try:
            self.get_loc(val)
        except (LookupError, TypeError, ValueError):
            return False
        return True
136+
137+
138+
class MultiIndexUIntEngine(BaseMultiIndexCodesEngine, libindex.UInt64Engine):
    """
    MultiIndex engine which represents every combination of labels as a
    single positive integer, stored as ``uint64``.
    """
    _type = 'uint64'
    _base = libindex.UInt64Engine
144+
145+
146+
class MultiIndexPyIntEngine(BaseMultiIndexCodesEngine, libindex.ObjectEngine):
    """
    MultiIndex engine for the (extreme) cases where the number of possible
    label combinations does not fit in a 64 bit integer: combinations are
    kept as Python integers inside an ObjectEngine.
    """
    _type = 'object'
    _base = libindex.ObjectEngine
154+
155+
53156
class MultiIndex(Index):
54157
"""
55158
A multi-level, or hierarchical, index object for pandas objects
@@ -691,16 +794,15 @@ def _get_level_number(self, level):
691794

692795
@cache_readonly
693796
def _engine(self):
797+
# Find powers of 2 which dominate level sizes - including -1 for NaN:
798+
lev_bits = np.cumsum(np.ceil(np.log2([len(l) + 1 for l in
799+
self.levels[::-1]])))[::-1]
800+
offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint')
694801

695-
# choose our engine based on our size
696-
# the hashing based MultiIndex for larger
697-
# sizes, and the MultiIndexOjbect for smaller
698-
# xref: https://github.com/pandas-dev/pandas/pull/16324
699-
l = len(self)
700-
if l > 10000:
701-
return libindex.MultiIndexHashEngine(lambda: self, l)
702-
703-
return libindex.MultiIndexObjectEngine(lambda: self.values, l)
802+
if lev_bits[0] > 64:
803+
# The levels would overflow a 64 bit integer - use Python integers:
804+
return MultiIndexPyIntEngine(self.levels, self.labels, offsets)
805+
return MultiIndexUIntEngine(self.levels, self.labels, offsets)
704806

705807
@property
706808
def values(self):
@@ -1889,7 +1991,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
18891991
if tolerance is not None:
18901992
raise NotImplementedError("tolerance not implemented yet "
18911993
'for MultiIndex')
1892-
indexer = self._get_fill_indexer(target, method, limit)
1994+
indexer = self._engine.get_indexer(target, method, limit)
18931995
elif method == 'nearest':
18941996
raise NotImplementedError("method='nearest' not implemented yet "
18951997
'for MultiIndex; see GitHub issue 9365')

0 commit comments

Comments
 (0)