Skip to content

Commit 0c38215

Browse files
committed
WIP: add multi-multi index support to join
1 parent ab27073 commit 0c38215

File tree

2 files changed

+73
-9
lines changed

2 files changed

+73
-9
lines changed

pandas/core/index.py

+69-1
Original file line numberDiff line numberDiff line change
@@ -1372,6 +1372,7 @@ def _join_multi(self, other, how, return_indexers=True):
13721372
raise ValueError("cannot join with no level specified and no overlapping names")
13731373
if len(overlap) > 1:
13741374
raise NotImplementedError("merging with more than one level overlap on a multi-index is not implemented")
1375+
13751376
jl = overlap[0]
13761377

13771378
# make the indices into MultiIndexes that match
@@ -1392,7 +1393,74 @@ def _join_multi(self, other, how, return_indexers=True):
13921393
return result
13931394

13941395
# both indexes are MultiIndexes
1395-
raise NotImplementedError("merging with both multi-indexes is not implemented")
1396+
left_values = other.get_level_values(jl)
1397+
left_joined, left_lidx, left_ridx = self._join_level(left_values, jl, how=how,
1398+
return_indexers=True)
1399+
right_values = self.get_level_values(jl)
1400+
right_joined, right_lidx, right_ridx = other._join_level(right_values, jl, how=how,
1401+
return_indexers=True)
1402+
1403+
# new levels
1404+
levels = list(left_joined.levels)
1405+
levels_names = set([ l.name for l in levels ])
1406+
levels += [ l for l in right_joined.levels if l.name not in levels_names ]
1407+
1408+
# number of label repetitions: len(left_joined) * len(right_joined)
1409+
l = len(left_joined)*len(right_joined)
1410+
1411+
# new labels
1412+
lidx = com._ensure_int64(left_lidx)
1413+
ridx = com._ensure_int64(right_lidx)
1414+
labels = []
1415+
indexers = []
1416+
1417+
def _get_labels(joined, name, indexer):
1418+
ln = joined._get_level_number(name)
1419+
lev_labels = np.tile(joined.labels[ln],l/len(joined.labels[ln]))
1420+
rev_indexer = lib.get_reverse_indexer(indexer,l)
1421+
new_labels = com.take_nd(rev_indexer, lev_labels,
1422+
allow_fill=False)
1423+
omit_mask = new_labels != -1
1424+
new_indexer = np.arange(len(lev_labels))[omit_mask]
1425+
1426+
return new_labels, new_indexer
1427+
1428+
for level in levels:
1429+
1430+
name = level.name
1431+
1432+
in_left = name in left_joined.names
1433+
in_right = name in right_joined.names
1434+
1435+
if not in_left and in_right:
1436+
new_labels, new_indexer = _get_labels(right_joined, name, ridx)
1437+
else:
1438+
# level comes from the left index (which includes the join level)
1439+
new_labels, new_indexer = _get_labels(left_joined, name, lidx)
1440+
1441+
labels.append(new_labels)
1442+
indexers.append(new_indexer)
1443+
1444+
import pdb; pdb.set_trace()
1445+
1446+
return MultiIndex(labels=labels,levels=levels), left_lidx, right_ridx
1447+
1448+
#sjl = self._get_level_number(jl)
1449+
#left_indexer = np.array(sorted(list(set(np.arange(len(self.levels)))-set([sjl]))))
1450+
#ojl = other._get_level_number(jl)
1451+
#right_indexer = np.array(sorted(list(set(np.arange(len(other.levels)))-set([ojl]))))
1452+
1453+
#tuples = []
1454+
#for left, right in zip(self.take(lidx).values, other.take(ridx).values):
1455+
# def _create_tuple(left, right, i):
1456+
# t = []
1457+
# t.extend(list(np.array(left).take(left_indexer)))
1458+
# t.append(left[i])
1459+
# t.extend(list(np.array(right).take(right_indexer)))
1460+
# return tuple(t)
1461+
1462+
#tuples = [ _create_tuple(left, right, sjl)
1463+
#return MultiIndex.from_tuples(tuples,names=_create_tuple(self.names,other.names,sjl)), lidx, ridx
13961464

13971465
def _join_non_unique(self, other, how='left', return_indexers=False):
13981466
from pandas.tools.merge import _get_join_indexers

pandas/tools/tests/test_merge.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -1049,8 +1049,6 @@ def test_join_multi_levels(self):
10491049
).set_index(['household_id','asset_id']).reindex(columns=['male','wealth','name','share'])
10501050
assert_frame_equal(result,expected)
10511051

1052-
assert_frame_equal(result,expected)
1053-
10541052
# equivalency
10551053
result2 = merge(household.reset_index(),portfolio.reset_index(),on=['household_id'],how='inner').set_index(['household_id','asset_id'])
10561054
assert_frame_equal(result2,expected)
@@ -1097,9 +1095,8 @@ def test_join_multi_levels2(self):
10971095
log_return = [.09604978, -.06524096, .03532373, .09604978, -.06524096, .03532373, .03025441, .036997]
10981096
)).set_index(["household_id", "asset_id", "t"]).reindex(columns=['share','log_return'])
10991097

1100-
def f():
1101-
household.join(log_return, how='inner')
1102-
self.assertRaises(NotImplementedError, f)
1098+
result = household.join(log_return, how='inner')
1099+
assert_frame_equal(result,expected)
11031100

11041101
# this is the equivalency
11051102
result = merge(household.reset_index(),log_return.reset_index(),on=['asset_id'],how='inner').set_index(['household_id','asset_id','t'])
@@ -1113,9 +1110,8 @@ def f():
11131110
log_return = [None, None, .09604978, -.06524096, .03532373, .09604978, -.06524096, .03532373, .03025441, .036997, None, None]
11141111
)).set_index(["household_id", "asset_id", "t"])
11151112

1116-
def f():
1117-
household.join(log_return, how='outer')
1118-
self.assertRaises(NotImplementedError, f)
1113+
result = household.join(log_return, how='outer')
1114+
assert_frame_equal(result,expected)
11191115

11201116
def _check_join(left, right, result, join_col, how='left',
11211117
lsuffix='_x', rsuffix='_y'):

0 commit comments

Comments
 (0)