Skip to content

Commit 565ee0c

Browse files
committed
Merge pull request #4148 from jreback/mi2
BUG: (GH4145/4146) Fixed bugs in multi-index selection with column multi index duplicates
2 parents 1bd8f57 + 3fd7a7a commit 565ee0c

File tree

4 files changed

+91
-48
lines changed

4 files changed

+91
-48
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,8 @@ pandas 0.12
318318
iterated over when regex=False (:issue:`4115`)
319319
- Fixed bug in ``convert_objects(convert_numeric=True)`` where a mixed numeric and
320320
object Series/Frame was not converting properly (:issue:`4119`)
321+
- Fixed bugs in multi-index selection with column multi-index and duplicates
322+
(:issue:`4145`, :issue:`4146`)
321323

322324

323325
pandas 0.11.0

pandas/core/indexing.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,7 @@ def _convert_to_indexer(self, obj, axis=0):
608608
mask = check == -1
609609
if mask.any():
610610
raise KeyError('%s not in index' % objarr[mask])
611-
611+
612612
return indexer
613613

614614
else:
@@ -1100,9 +1100,14 @@ def _check_slice_bounds(slobj, values):
11001100

11011101
def _maybe_droplevels(index, key):
11021102
# drop levels
1103+
original_index = index
11031104
if isinstance(key, tuple):
11041105
for _ in key:
1105-
index = index.droplevel(0)
1106+
try:
1107+
index = index.droplevel(0)
1108+
except:
1109+
# we have dropped too much, so back out
1110+
return original_index
11061111
else:
11071112
index = index.droplevel(0)
11081113

pandas/core/internals.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -1660,18 +1660,23 @@ def get(self, item):
16601660

16611661
# duplicate index but only a single result
16621662
if com.is_integer(indexer):
1663+
16631664
b, loc = ref_locs[indexer]
1664-
return b.iget(loc)
1665+
values = [ b.iget(loc) ]
1666+
index = Index([ self.items[indexer] ])
1667+
1668+
# we have a multiple result, potentially across blocks
16651669
else:
16661670

1667-
# we have a multiple result, potentially across blocks
16681671
values = [ block.iget(i) for block, i in ref_locs[indexer] ]
16691672
index = self.items[indexer]
1670-
axes = [ index ] + self.axes[1:]
1671-
blocks = form_blocks(values, index, axes)
1672-
mgr = BlockManager(blocks, axes)
1673-
mgr._consolidate_inplace()
1674-
return mgr
1673+
1674+
# create and return a new block manager
1675+
axes = [ index ] + self.axes[1:]
1676+
blocks = form_blocks(values, index, axes)
1677+
mgr = BlockManager(blocks, axes)
1678+
mgr._consolidate_inplace()
1679+
return mgr
16751680

16761681
def iget(self, i):
16771682
item = self.items[i]

pandas/tests/test_indexing.py

+70-39
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import unittest
33
import nose
44
import itertools
5+
from StringIO import StringIO
56

67
from numpy import random, nan
78
from numpy.random import randn
@@ -45,7 +46,7 @@ def _get_value(f, i, values=False):
4546
# check against values
4647
if values:
4748
return f.values[i]
48-
49+
4950
# this is equiv of f[col][row].....
5051
#v = f
5152
#for a in reversed(i):
@@ -70,7 +71,7 @@ def _get_result(obj, method, key, axis):
7071
xp = getattr(obj, method).__getitem__(_axify(obj,key,axis))
7172
except:
7273
xp = getattr(obj, method).__getitem__(key)
73-
74+
7475
return xp
7576

7677
def _axify(obj, key, axis):
@@ -127,11 +128,11 @@ def setUp(self):
127128
setattr(self,o,d)
128129

129130
def check_values(self, f, func, values = False):
130-
131+
131132
if f is None: return
132133
axes = f.axes
133134
indicies = itertools.product(*axes)
134-
135+
135136
for i in indicies:
136137
result = getattr(f,func)[i]
137138

@@ -194,7 +195,7 @@ def _print(result, error = None):
194195
if fails is True:
195196
if result == 'fail':
196197
result = 'ok (fail)'
197-
198+
198199
if not result.startswith('ok'):
199200
raise AssertionError(_print(result))
200201

@@ -212,7 +213,7 @@ def _print(result, error = None):
212213
result = 'ok (%s)' % type(detail).__name__
213214
_print(result)
214215
return
215-
216+
216217
result = type(detail).__name__
217218
raise AssertionError(_print(result, error = detail))
218219

@@ -244,14 +245,14 @@ def _print(result, error = None):
244245
obj = d[t]
245246
if obj is not None:
246247
obj = obj.copy()
247-
248+
248249
k2 = key2
249250
_eq(t, o, a, obj, key1, k2)
250251

251252
def test_at_and_iat_get(self):
252253

253254
def _check(f, func, values = False):
254-
255+
255256
if f is not None:
256257
indicies = _generate_indices(f, values)
257258
for i in indicies:
@@ -260,7 +261,7 @@ def _check(f, func, values = False):
260261
assert_almost_equal(result, expected)
261262

262263
for o in self._objs:
263-
264+
264265
d = getattr(self,o)
265266

266267
# iat
@@ -274,11 +275,11 @@ def _check(f, func, values = False):
274275
_check(d['labels'],'at')
275276
_check(d['ts'], 'at')
276277
_check(d['floats'],'at')
277-
278+
278279
def test_at_and_iat_set(self):
279280

280281
def _check(f, func, values = False):
281-
282+
282283
if f is not None:
283284
indicies = _generate_indices(f, values)
284285
for i in indicies:
@@ -287,7 +288,7 @@ def _check(f, func, values = False):
287288
assert_almost_equal(expected, 1)
288289

289290
for t in self._objs:
290-
291+
291292
d = getattr(self,t)
292293

293294
_check(d['ints'],'iat',values=True)
@@ -302,12 +303,12 @@ def _check(f, func, values = False):
302303
_check(d['floats'],'at')
303304

304305
def test_at_timestamp(self):
305-
306+
306307
# as timestamp is not a tuple!
307308
dates = date_range('1/1/2000', periods=8)
308309
df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
309310
s = df['A']
310-
311+
311312
result = s.at[dates[5]]
312313
xp = s.values[5]
313314
self.assert_(result == xp)
@@ -320,7 +321,7 @@ def test_iloc_getitem_int(self):
320321
# integer
321322
self.check_result('integer', 'iloc', 2, 'ix', { 0 : 4, 1: 6, 2: 8 }, typs = ['ints'])
322323
self.check_result('integer', 'iloc', 2, 'indexer', 2, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError)
323-
324+
324325
def test_iloc_getitem_neg_int(self):
325326

326327
# neg integer
@@ -332,7 +333,7 @@ def test_iloc_getitem_list_int(self):
332333
# list of ints
333334
self.check_result('list int', 'iloc', [0,1,2], 'ix', { 0 : [0,2,4], 1 : [0,3,6], 2: [0,4,8] }, typs = ['ints'])
334335
self.check_result('list int', 'iloc', [0,1,2], 'indexer', [0,1,2], typs = ['labels','mixed','ts','floats','empty'], fails = IndexError)
335-
336+
336337
def test_iloc_getitem_dups(self):
337338

338339
# no dups in panel (bug?)
@@ -378,7 +379,7 @@ def test_iloc_setitem(self):
378379
assert_frame_equal(result, expected)
379380

380381
def test_iloc_multiindex(self):
381-
df = DataFrame(np.random.randn(3, 3),
382+
df = DataFrame(np.random.randn(3, 3),
382383
columns=[[2,2,4],[6,8,10]],
383384
index=[[4,4,8],[8,10,12]])
384385

@@ -415,7 +416,7 @@ def test_loc_getitem_label_out_of_range(self):
415416

416417
# out of range label
417418
self.check_result('label range', 'loc', 'f', 'ix', 'f', typs = ['ints','labels','mixed','ts','floats'], fails=KeyError)
418-
419+
419420
def test_loc_getitem_label_list(self):
420421

421422
# list of labels
@@ -426,15 +427,15 @@ def test_loc_getitem_label_list(self):
426427
self.check_result('list lbl', 'loc', ['A','B','C'], 'ix', ['A','B','C'], typs = ['labels'], axes=1)
427428
self.check_result('list lbl', 'loc', ['Z','Y','W'], 'ix', ['Z','Y','W'], typs = ['labels'], axes=2)
428429
self.check_result('list lbl', 'loc', [2,8,'null'], 'ix', [2,8,'null'], typs = ['mixed'], axes=0)
429-
self.check_result('list lbl', 'loc', [Timestamp('20130102'),Timestamp('20130103')], 'ix',
430+
self.check_result('list lbl', 'loc', [Timestamp('20130102'),Timestamp('20130103')], 'ix',
430431
[Timestamp('20130102'),Timestamp('20130103')], typs = ['ts'], axes=0)
431432

432433
# fails
433434
self.check_result('list lbl', 'loc', [0,1,2], 'indexer', [0,1,2], typs = ['empty'], fails = KeyError)
434435
self.check_result('list lbl', 'loc', [0,2,3], 'ix', [0,2,3], typs = ['ints'], axes=0, fails = KeyError)
435436
self.check_result('list lbl', 'loc', [3,6,7], 'ix', [3,6,9], typs = ['ints'], axes=1, fails = KeyError)
436437
self.check_result('list lbl', 'loc', [4,8,10], 'ix', [4,8,12], typs = ['ints'], axes=2, fails = KeyError)
437-
438+
438439
# array like
439440
self.check_result('array like', 'loc', Series(index=[0,2,4]).index, 'ix', [0,2,4], typs = ['ints'], axes=0)
440441
self.check_result('array like', 'loc', Series(index=[3,6,9]).index, 'ix', [3,6,9], typs = ['ints'], axes=1)
@@ -449,10 +450,10 @@ def test_loc_getitem_bool(self):
449450

450451
def test_loc_getitem_int_slice(self):
451452

452-
# int slices in int
453+
# int slices in int
453454
self.check_result('int slice1', 'loc', slice(2,4), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints'], fails=KeyError)
454455

455-
# ok
456+
# ok
456457
self.check_result('int slice2', 'loc', slice(2,4), 'ix', [2,4], typs = ['ints'], axes = 0)
457458
self.check_result('int slice2', 'loc', slice(3,6), 'ix', [3,6], typs = ['ints'], axes = 1)
458459
self.check_result('int slice2', 'loc', slice(4,8), 'ix', [4,8], typs = ['ints'], axes = 2)
@@ -589,7 +590,7 @@ def test_iloc_getitem_frame(self):
589590
result = df.iloc[s.index]
590591
expected = df.ix[[2,4,6,8]]
591592
assert_frame_equal(result, expected)
592-
593+
593594
# out-of-bounds slice
594595
self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(1,5,None)]))
595596
self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(-5,3,None)]))
@@ -648,7 +649,7 @@ def test_iloc_multiindex(self):
648649
['A', 'A', 'B']],
649650
index=[['i', 'i', 'j', 'k'], ['X', 'X', 'Y','Y']])
650651

651-
mi_int = DataFrame(np.random.randn(3, 3),
652+
mi_int = DataFrame(np.random.randn(3, 3),
652653
columns=[[2,2,4],[6,8,10]],
653654
index=[[4,4,8],[8,10,12]])
654655

@@ -679,7 +680,7 @@ def test_loc_multiindex(self):
679680
['A', 'A', 'B']],
680681
index=[['i', 'i', 'j'], ['X', 'X', 'Y']])
681682

682-
mi_int = DataFrame(np.random.randn(3, 3),
683+
mi_int = DataFrame(np.random.randn(3, 3),
683684
columns=[[2,2,4],[6,8,10]],
684685
index=[[4,4,8],[8,10,12]])
685686

@@ -749,7 +750,7 @@ def test_xs_multiindex(self):
749750
assert_frame_equal(result, expected)
750751

751752
def test_setitem_dtype_upcast(self):
752-
753+
753754
# GH3216
754755
df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
755756
df['c'] = np.nan
@@ -761,7 +762,7 @@ def test_setitem_dtype_upcast(self):
761762

762763
def test_setitem_iloc(self):
763764

764-
765+
765766
# setitem with an iloc list
766767
df = DataFrame(np.arange(9).reshape((3, 3)), index=["A", "B", "C"], columns=["A", "B", "C"])
767768
df.iloc[[0,1],[1,2]]
@@ -830,20 +831,20 @@ def test_indexing_mixed_frame_bug(self):
830831
self.assert_(df.iloc[0,2] == '-----')
831832

832833
#if I look at df, then element [0,2] equals '_'. If instead I type df.ix[idx,'test'], I get '-----', finally by typing df.iloc[0,2] I get '_'.
833-
834+
834835

835836
def test_set_index_nan(self):
836837

837838
# GH 3586
838-
df = DataFrame({'PRuid': {17: 'nonQC', 18: 'nonQC', 19: 'nonQC', 20: '10', 21: '11', 22: '12', 23: '13',
839-
24: '24', 25: '35', 26: '46', 27: '47', 28: '48', 29: '59', 30: '10'},
840-
'QC': {17: 0.0, 18: 0.0, 19: 0.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: 1.0, 25: nan,
841-
26: nan, 27: nan, 28: nan, 29: nan, 30: nan},
842-
'data': {17: 7.9544899999999998, 18: 8.0142609999999994, 19: 7.8591520000000008, 20: 0.86140349999999999,
839+
df = DataFrame({'PRuid': {17: 'nonQC', 18: 'nonQC', 19: 'nonQC', 20: '10', 21: '11', 22: '12', 23: '13',
840+
24: '24', 25: '35', 26: '46', 27: '47', 28: '48', 29: '59', 30: '10'},
841+
'QC': {17: 0.0, 18: 0.0, 19: 0.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: 1.0, 25: nan,
842+
26: nan, 27: nan, 28: nan, 29: nan, 30: nan},
843+
'data': {17: 7.9544899999999998, 18: 8.0142609999999994, 19: 7.8591520000000008, 20: 0.86140349999999999,
843844
21: 0.87853110000000001, 22: 0.8427041999999999, 23: 0.78587700000000005, 24: 0.73062459999999996,
844-
25: 0.81668560000000001, 26: 0.81927080000000008, 27: 0.80705009999999999, 28: 0.81440240000000008,
845-
29: 0.80140849999999997, 30: 0.81307740000000006},
846-
'year': {17: 2006, 18: 2007, 19: 2008, 20: 1985, 21: 1985, 22: 1985, 23: 1985,
845+
25: 0.81668560000000001, 26: 0.81927080000000008, 27: 0.80705009999999999, 28: 0.81440240000000008,
846+
29: 0.80140849999999997, 30: 0.81307740000000006},
847+
'year': {17: 2006, 18: 2007, 19: 2008, 20: 1985, 21: 1985, 22: 1985, 23: 1985,
847848
24: 1985, 25: 1985, 26: 1985, 27: 1985, 28: 1985, 29: 1985, 30: 1986}}).reset_index()
848849

849850
result = df.set_index(['year','PRuid','QC']).reset_index().reindex(columns=df.columns)
@@ -871,7 +872,7 @@ def test_iloc_panel_issue(self):
871872
self.assert_(p.iloc[1, 1, :3].shape == (3,))
872873
self.assert_(p.iloc[1, :3, 1].shape == (3,))
873874
self.assert_(p.iloc[:3, 1, 1].shape == (3,))
874-
875+
875876
def test_multi_assign(self):
876877

877878
# GH 3626, an assignment of a sub-df to a df
@@ -892,7 +893,7 @@ def test_multi_assign(self):
892893
'PF':[0,0,0,0,1,1],
893894
'col1':Series([0,1,4,6,8,10]),
894895
'col2':[12,7,16,np.nan,20,22]})
895-
896+
896897

897898
# frame on rhs
898899
df2.ix[mask, cols]= dft.ix[mask, cols]
@@ -1006,7 +1007,7 @@ def test_non_unique_loc(self):
10061007
## https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs
10071008

10081009
# these are going to raise because we are non-monotonic
1009-
df = DataFrame({'A' : [1,2,3,4,5,6], 'B' : [3,4,5,6,7,8]}, index = [0,1,0,1,2,3])
1010+
df = DataFrame({'A' : [1,2,3,4,5,6], 'B' : [3,4,5,6,7,8]}, index = [0,1,0,1,2,3])
10101011
self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1,None)]))
10111012
self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,None)]))
10121013
self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1,2)]))
@@ -1066,6 +1067,36 @@ def test_iloc_non_unique_indexing(self):
10661067
result = df2.loc[idx]
10671068
assert_frame_equal(result, expected)
10681069

1070+
def test_mi_access(self):
1071+
1072+
# GH 4145
1073+
data = """h1 main h3 sub h5
1074+
0 a A 1 A1 1
1075+
1 b B 2 B1 2
1076+
2 c B 3 A1 3
1077+
3 d A 4 B2 4
1078+
4 e A 5 B2 5
1079+
5 f B 6 A2 6
1080+
"""
1081+
1082+
df = pd.read_csv(StringIO(data),sep='\s+',index_col=0)
1083+
df2 = df.set_index(['main', 'sub']).T.sort_index(1)
1084+
index = Index(['h1','h3','h5'])
1085+
columns = MultiIndex.from_tuples([('A','A1')],names=['main','sub'])
1086+
expected = DataFrame([['a',1,1]],index=columns,columns=index).T
1087+
1088+
result = df2.loc[:,('A','A1')]
1089+
assert_frame_equal(result,expected)
1090+
1091+
result = df2[('A','A1')]
1092+
assert_frame_equal(result,expected)
1093+
1094+
# GH 4146, not returning a block manager when selecting a unique index
1095+
# from a duplicate index
1096+
expected = DataFrame([['a',1,1]],index=['A1'],columns=['h1','h3','h5'],).T
1097+
df3 = df2['A']
1098+
result = df3['A1']
1099+
assert_frame_equal(result,expected)
10691100

10701101
if __name__ == '__main__':
10711102
import nose

0 commit comments

Comments
 (0)