Skip to content

Commit 6301f06

Browse files
committed
Merge branch 'mi-key-loc' of https://github.com/behzadnouri/pandas into behzadnouri-mi-key-loc
2 parents f1b270f + d0861e8 commit 6301f06

File tree

4 files changed

+185
-29
lines changed

4 files changed

+185
-29
lines changed

doc/source/whatsnew/v0.15.2.txt

+22
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,26 @@ users upgrade to this version.
1919

2020
API changes
2121
~~~~~~~~~~~
22+
- Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though
23+
a lexically sorted index will have better performance. (:issue:`2646`)
24+
25+
.. ipython:: python
26+
27+
df = pd.DataFrame({'jim':[0, 0, 1, 1],
28+
'joe':['x', 'x', 'z', 'y'],
29+
'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
30+
df
31+
df.index.lexsort_depth
32+
33+
# in prior versions this would raise a KeyError
34+
# will now show a PerformanceWarning
35+
df.loc[(1, 'z')]
36+
37+
# lexically sorting
38+
df2 = df.sortlevel()
39+
df2
40+
df2.index.lexsort_depth
41+
df2.loc[(1,'z')]
2242

2343
- Bug in concat of Series with ``category`` dtype, which was coercing to ``object``. (:issue:`8641`)
2444

@@ -129,3 +149,5 @@ Bug Fixes
129149

130150
- Bugs when trying to stack multiple columns, when some (or all)
131151
of the level names are numbers (:issue:`8584`).
152+
- Bug in ``MultiIndex`` where ``__contains__`` returns the wrong result if the index is
153+
not lexically sorted or unique (:issue:`7724`)

pandas/core/index.py

+79-24
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from pandas.core.common import (_values_from_object, is_float, is_integer,
2222
ABCSeries, _ensure_object, _ensure_int64)
2323
from pandas.core.config import get_option
24+
from pandas.io.common import PerformanceWarning
2425

2526
# simplify
2627
default_pprint = lambda x: com.pprint_thing(x, escape_chars=('\t', '\r', '\n'),
@@ -4027,30 +4028,83 @@ def _partial_tup_index(self, tup, side='left'):
40274028

40284029
def get_loc(self, key):
40294030
"""
4030-
Get integer location slice for requested label or tuple
4031+
Get integer location, slice or boolean mask for requested label or tuple
4032+
If the key is past the lexsort depth, the return may be a boolean mask
4033+
array, otherwise it is always a slice or int.
40314034
40324035
Parameters
40334036
----------
40344037
key : label or tuple
40354038
40364039
Returns
40374040
-------
4038-
loc : int or slice object
4039-
"""
4040-
if isinstance(key, tuple):
4041-
if len(key) == self.nlevels:
4042-
if self.is_unique:
4043-
return self._engine.get_loc(_values_from_object(key))
4044-
else:
4045-
return slice(*self.slice_locs(key, key))
4046-
else:
4047-
# partial selection
4048-
result = slice(*self.slice_locs(key, key))
4049-
if result.start == result.stop:
4050-
raise KeyError(key)
4051-
return result
4052-
else:
4053-
return self._get_level_indexer(key, level=0)
4041+
loc : int, slice object or boolean mask
4042+
"""
4043+
def _maybe_to_slice(loc):
4044+
'''convert integer indexer to boolean mask or slice if possible'''
4045+
if not isinstance(loc, np.ndarray) or loc.dtype != 'int64':
4046+
return loc
4047+
4048+
loc = lib.maybe_indices_to_slice(loc)
4049+
if isinstance(loc, slice):
4050+
return loc
4051+
4052+
mask = np.empty(len(self), dtype='bool')
4053+
mask.fill(False)
4054+
mask[loc] = True
4055+
return mask
4056+
4057+
if not isinstance(key, tuple):
4058+
loc = self._get_level_indexer(key, level=0)
4059+
return _maybe_to_slice(loc)
4060+
4061+
keylen = len(key)
4062+
if self.nlevels < keylen:
4063+
raise KeyError('Key length ({0}) exceeds index depth ({1})'
4064+
''.format(keylen, self.nlevels))
4065+
4066+
if keylen == self.nlevels and self.is_unique:
4067+
def _maybe_str_to_time_stamp(key, lev):
4068+
if lev.is_all_dates and not isinstance(key, Timestamp):
4069+
try:
4070+
return Timestamp(key, tz=getattr(lev, 'tz', None))
4071+
except Exception:
4072+
pass
4073+
return key
4074+
key = _values_from_object(key)
4075+
key = tuple(map(_maybe_str_to_time_stamp, key, self.levels))
4076+
return self._engine.get_loc(key)
4077+
4078+
# -- partial selection or non-unique index
4079+
# break the key into 2 parts based on the lexsort_depth of the index;
4080+
# the first part returns a continuous slice of the index; the 2nd part
4081+
# needs linear search within the slice
4082+
i = self.lexsort_depth
4083+
lead_key, follow_key = key[:i], key[i:]
4084+
start, stop = self.slice_locs(lead_key, lead_key) \
4085+
if lead_key else (0, len(self))
4086+
4087+
if start == stop:
4088+
raise KeyError(key)
4089+
4090+
if not follow_key:
4091+
return slice(start, stop)
4092+
4093+
warnings.warn('indexing past lexsort depth may impact performance.',
4094+
PerformanceWarning)
4095+
4096+
loc = np.arange(start, stop, dtype='int64')
4097+
4098+
for i, k in enumerate(follow_key, len(lead_key)):
4099+
mask = self.labels[i][loc] == self.levels[i].get_loc(k)
4100+
if not mask.all():
4101+
loc = loc[mask]
4102+
if not len(loc):
4103+
raise KeyError(key)
4104+
4105+
return _maybe_to_slice(loc) \
4106+
if len(loc) != stop - start \
4107+
else slice(start, stop)
40544108

40554109
def get_loc_level(self, key, level=0, drop_level=True):
40564110
"""
@@ -4115,10 +4169,10 @@ def _maybe_drop_levels(indexer, levels, drop_level):
41154169
if not any(isinstance(k, slice) for k in key):
41164170

41174171
# partial selection
4118-
def partial_selection(key):
4119-
indexer = slice(*self.slice_locs(key, key))
4120-
if indexer.start == indexer.stop:
4121-
raise KeyError(key)
4172+
# optionally get indexer to avoid re-calculation
4173+
def partial_selection(key, indexer=None):
4174+
if indexer is None:
4175+
indexer = self.get_loc(key)
41224176
ilevels = [i for i in range(len(key))
41234177
if key[i] != slice(None, None)]
41244178
return indexer, _maybe_drop_levels(indexer, ilevels,
@@ -4139,11 +4193,12 @@ def partial_selection(key):
41394193
if any([
41404194
l.is_all_dates for k, l in zip(key, self.levels)
41414195
]) and not can_index_exactly:
4142-
indexer = slice(*self.slice_locs(key, key))
4196+
indexer = self.get_loc(key)
41434197

41444198
# we have a multiple selection here
4145-
if not indexer.stop - indexer.start == 1:
4146-
return partial_selection(key)
4199+
if not isinstance(indexer, slice) \
4200+
or indexer.stop - indexer.start != 1:
4201+
return partial_selection(key, indexer)
41474202

41484203
key = tuple(self[indexer].tolist()[0])
41494204

pandas/core/internals.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -3257,7 +3257,9 @@ def take(self, indexer, axis=1, verify=True, convert=True):
32573257
Take items along any axis.
32583258
"""
32593259
self._consolidate_inplace()
3260-
indexer = np.asanyarray(indexer, dtype=np.int_)
3260+
indexer = np.arange(indexer.start, indexer.stop, indexer.step,
3261+
dtype='int64') if isinstance(indexer, slice) \
3262+
else np.asanyarray(indexer, dtype='int64')
32613263

32623264
n = self.shape[axis]
32633265
if convert:

pandas/tests/test_indexing.py

+81-4
Original file line numberDiff line numberDiff line change
@@ -1488,6 +1488,86 @@ def test_loc_multiindex(self):
14881488
result = s.loc[2:4:2, 'a':'c']
14891489
assert_series_equal(result, expected)
14901490

1491+
def test_multiindex_perf_warn(self):
1492+
import sys
1493+
from pandas.io.common import PerformanceWarning
1494+
1495+
if sys.version_info < (2, 7):
1496+
raise nose.SkipTest('python version < 2.7')
1497+
1498+
df = DataFrame({'jim':[0, 0, 1, 1],
1499+
'joe':['x', 'x', 'z', 'y'],
1500+
'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
1501+
1502+
with tm.assert_produces_warning(PerformanceWarning):
1503+
_ = df.loc[(1, 'z')]
1504+
1505+
df = df.iloc[[2,1,3,0]]
1506+
with tm.assert_produces_warning(PerformanceWarning):
1507+
_ = df.loc[(0,)]
1508+
1509+
def test_multiindex_get_loc(self): # GH7724, GH2646
1510+
# test indexing into a multi-index before & past the lexsort depth
1511+
from numpy.random import randint, choice, randn
1512+
cols = ['jim', 'joe', 'jolie', 'joline', 'jolia']
1513+
1514+
def validate(mi, df, key):
1515+
mask = np.ones(len(df)).astype('bool')
1516+
1517+
# test for all partials of this key
1518+
for i, k in enumerate(key):
1519+
mask &= df.iloc[:, i] == k
1520+
1521+
if not mask.any():
1522+
self.assertNotIn(key[:i+1], mi.index)
1523+
continue
1524+
1525+
self.assertIn(key[:i+1], mi.index)
1526+
right = df[mask].copy()
1527+
1528+
if i + 1 != len(key): # partial key
1529+
right.drop(cols[:i+1], axis=1, inplace=True)
1530+
right.set_index(cols[i+1:-1], inplace=True)
1531+
assert_frame_equal(mi.loc[key[:i+1]], right)
1532+
1533+
else: # full key
1534+
right.set_index(cols[:-1], inplace=True)
1535+
if len(right) == 1: # single hit
1536+
right = Series(right['jolia'].values,
1537+
name=right.index[0], index=['jolia'])
1538+
assert_series_equal(mi.loc[key[:i+1]], right)
1539+
else: # multi hit
1540+
assert_frame_equal(mi.loc[key[:i+1]], right)
1541+
1542+
def loop(mi, df, keys):
1543+
for key in keys:
1544+
validate(mi, df, key)
1545+
1546+
n, m = 1000, 50
1547+
1548+
vals = [randint(0, 10, n), choice(list('abcdefghij'), n),
1549+
choice(pd.date_range('20141009', periods=10).tolist(), n),
1550+
choice(list('ZYXWVUTSRQ'), n), randn(n)]
1551+
vals = list(map(tuple, zip(*vals)))
1552+
1553+
# bunch of keys for testing
1554+
keys = [randint(0, 11, m), choice(list('abcdefghijk'), m),
1555+
choice(pd.date_range('20141009', periods=11).tolist(), m),
1556+
choice(list('ZYXWVUTSRQP'), m)]
1557+
keys = list(map(tuple, zip(*keys)))
1558+
keys += list(map(lambda t: t[:-1], vals[::n//m]))
1559+
1560+
# covers both unique index and non-unique index
1561+
df = pd.DataFrame(vals, columns=cols)
1562+
a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1])
1563+
1564+
for frame in a, b:
1565+
for i in range(5): # lexsort depth
1566+
df = frame.copy() if i == 0 else frame.sort(columns=cols[:i])
1567+
mi = df.set_index(cols[:-1])
1568+
assert not mi.index.lexsort_depth < i
1569+
loop(mi, df, keys)
1570+
14911571
def test_series_getitem_multiindex(self):
14921572

14931573
# GH 6018
@@ -1541,10 +1621,7 @@ def test_ix_general(self):
15411621
'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}}
15421622
df = DataFrame(data).set_index(keys=['col', 'year'])
15431623
key = 4.0, 2012
1544-
1545-
# this should raise correct error
1546-
with tm.assertRaises(KeyError):
1547-
df.ix[key]
1624+
tm.assert_frame_equal(df.ix[key], df.iloc[2:])
15481625

15491626
# this is ok
15501627
df.sortlevel(inplace=True)

0 commit comments

Comments
 (0)