Skip to content

Commit 47bb4fe

Browse files
harisbalharisbal
authored and
harisbal
committed
Review
1 parent 4d4acc5 commit 47bb4fe

File tree

4 files changed

+118
-85
lines changed

4 files changed

+118
-85
lines changed

doc/source/whatsnew/v0.24.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ This is the same behavior as ``Series.values`` for categorical data. See
169169
Joining with two multi-indexes
170170
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
171171

172-
As of Pandas 0.24.0 the :func:`Dataframe.join` can be used to join multi-indexed ``Dataframe`` instances on the overlaping index levels (:issue:`20356`)
172+
As of Pandas 0.24.0, :meth:`DataFrame.join` can be used to join multi-indexed ``DataFrame`` instances on the overlapping index levels (:issue:`6360`)
173173

174174
See the :ref:`Merge, join, and concatenate
175175
<merging.Join_with_two_multi_indexes>` documentation section.

pandas/core/indexes/base.py

+19-16
Original file line numberDiff line numberDiff line change
@@ -3912,7 +3912,7 @@ def join(self, other, how='left', level=None, return_indexers=False,
39123912

39133913
def _join_multi(self, other, how, return_indexers=True):
39143914
from .multi import MultiIndex
3915-
from pandas.core.reshape.merge import _complete_multilevel_join
3915+
from pandas.core.reshape.merge import _restore_dropped_levels_multijoin
39163916

39173917
# figure out join names
39183918
self_names = set(com._not_none(*self.names))
@@ -3928,27 +3928,30 @@ def _join_multi(self, other, how, return_indexers=True):
39283928

39293929
if self_is_mi and other_is_mi:
39303930

3931-
# Drop the non matching levels
3932-
ldrop_levels = list(set(self_names) - set(overlap))
3933-
rdrop_levels = list(set(other_names) - set(overlap))
3931+
# Drop the non-matching levels from left and right respectively
3932+
ldrop_names = list(set(self_names) - set(overlap))
3933+
rdrop_names = list(set(other_names) - set(overlap))
39343934

3935-
self_jnlevels = self.droplevel(ldrop_levels)
3936-
other_jnlevels = other.droplevel(rdrop_levels)
3937-
3938-
if not (self_jnlevels.is_unique and other_jnlevels.is_unique):
3939-
raise ValueError("Join on level between two MultiIndex objects"
3940-
"is ambiguous")
3941-
3942-
dropped_levels = ldrop_levels + rdrop_levels
3935+
self_jnlevels = self.droplevel(ldrop_names)
3936+
other_jnlevels = other.droplevel(rdrop_names)
39433937

3938+
# Join left and right
3939+
# Joining two multi-indexes that share the same levels is supported
39443940
join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
39453941
return_indexers=True)
39463942

3947-
levels, labels, names = _complete_multilevel_join(self, other, how,
3948-
dropped_levels,
3949-
join_idx,
3950-
lidx, ridx)
3943+
# Restore the dropped levels
3944+
# Returned index level order is
3945+
# common levels, ldrop_names, rdrop_names
3946+
dropped_names = ldrop_names + rdrop_names
3947+
3948+
levels, labels, names = (
3949+
_restore_dropped_levels_multijoin(self, other,
3950+
dropped_names,
3951+
join_idx,
3952+
lidx, ridx))
39513953

3954+
# Re-create the multi-index
39523955
multi_join_idx = MultiIndex(levels=levels, labels=labels,
39533956
names=names, verify_integrity=False)
39543957

pandas/core/reshape/merge.py

+24-18
Original file line numberDiff line numberDiff line change
@@ -1141,12 +1141,12 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
11411141
return join_func(lkey, rkey, count, **kwargs)
11421142

11431143

1144-
def _complete_multilevel_join(left, right, how, dropped_levels,
1145-
join_idx, lidx, ridx):
1144+
def _restore_dropped_levels_multijoin(left, right, dropped_level_names,
1145+
join_idx, lidx, ridx):
11461146
"""
11471147
*this is an internal non-public method*
11481148
1149-
Returns the levels, labels and names of a multilevel to multilevel join
1149+
Returns the levels, labels and names of a multi-index to multi-index join.
11501150
Depending on the type of join, this method restores the appropriate
11511151
dropped levels of the joined multi-index. The method relies on lidx, ridx
11521152
which hold the index positions of left and right, where a join was feasible
@@ -1157,19 +1157,18 @@ def _complete_multilevel_join(left, right, how, dropped_levels,
11571157
left index
11581158
right : Index
11591159
right index
1160+
dropped_level_names : str array
1161+
list of non-common levels
11601162
join_idx : Index
11611163
the index of the join between the common levels of left and right
1162-
how : {'left', 'right', 'outer', 'inner'}
11631164
lidx : intp array
11641165
left indexer
11651166
ridx : intp array
11661167
right indexer
1167-
dropped_levels : str array
1168-
list of non-common levels
11691168
11701169
Returns
11711170
-------
1172-
levels : intp array
1171+
levels : list of array-like
11731172
levels of combined multiindexes
11741173
labels : list of int arrays
11751174
labels of combined multiindexes
@@ -1178,12 +1177,20 @@ def _complete_multilevel_join(left, right, how, dropped_levels,
11781177
11791178
"""
11801179

1180+
# Convert join_idx to a 1-level multi-index if it is not already a MultiIndex
1181+
if not isinstance(join_idx, MultiIndex):
1182+
levels = [join_idx.values]
1183+
labels = [list(range(0, len(join_idx)))]
1184+
names = [join_idx.name]
1185+
join_idx = MultiIndex(levels=levels, labels=labels,
1186+
names=names, verify_integrity=False)
1187+
11811188
join_levels = join_idx.levels
11821189
join_labels = join_idx.labels
11831190
join_names = join_idx.names
11841191

11851192
# lidx and ridx hold the indexes where the join occurred
1186-
# for left and right respectively. If left (right) is None it means that
1193+
# for left and right respectively. If lidx (ridx) is None then
11871194
# the join occurred on all indices of left (right)
11881195
if lidx is None:
11891196
lidx = range(0, len(left))
@@ -1192,27 +1199,26 @@ def _complete_multilevel_join(left, right, how, dropped_levels,
11921199
ridx = range(0, len(right))
11931200

11941201
# Iterate through the levels that must be restored
1195-
for dl in dropped_levels:
1196-
if dl in left.names:
1202+
for dropped_level_name in dropped_level_names:
1203+
if dropped_level_name in left.names:
11971204
idx = left
11981205
indexer = lidx
11991206
else:
12001207
idx = right
12011208
indexer = ridx
12021209

12031210
# The index of the level name to be restored
1204-
name_idx = idx.names.index(dl)
1211+
name_idx = idx.names.index(dropped_level_name)
12051212

12061213
restore_levels = idx.levels[name_idx].values
1207-
restore_labels = idx.labels[name_idx]
1208-
1209-
join_levels = join_levels.__add__([restore_levels])
1210-
join_names = join_names.__add__([dl])
1211-
12121214
# Inject -1 in the labels list where a join was not possible
12131215
# IOW indexer[i]=-1
1214-
labels = [restore_labels[i] if i != -1 else -1 for i in indexer]
1215-
join_labels = join_labels.__add__([labels])
1216+
labels = idx.labels[name_idx]
1217+
restore_labels = [labels[i] if i != -1 else -1 for i in indexer]
1218+
1219+
join_levels = join_levels.__add__([restore_levels])
1220+
join_labels = join_labels.__add__([restore_labels])
1221+
join_names = join_names.__add__([dropped_level_name])
12161222

12171223
return join_levels, join_labels, join_names
12181224

pandas/tests/reshape/merge/test_multi.py

+74-50
Original file line numberDiff line numberDiff line change
@@ -13,47 +13,50 @@
1313
from pandas.compat import lzip
1414
from pandas.core.reshape.concat import concat
1515
from pandas.core.reshape.merge import merge
16-
from pandas.util.testing import assert_frame_equal
16+
17+
18+
@pytest.fixture
19+
def left():
20+
# a little relevant example with NAs
21+
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
22+
'qux', 'snap']
23+
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
24+
'three', 'one']
25+
26+
data = np.random.randn(len(key1))
27+
return DataFrame({'key1': key1, 'key2': key2, 'data': data})
28+
29+
30+
@pytest.fixture
31+
def right():
32+
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
33+
['one', 'two', 'three']],
34+
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
35+
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
36+
names=['first', 'second'])
37+
38+
return DataFrame(np.random.randn(10, 3), index=index,
39+
columns=['j_one', 'j_two', 'j_three'])
1740

1841

1942
class TestMergeMulti(object):
2043

21-
def setup_method(self):
22-
self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
23-
['one', 'two', 'three']],
24-
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
25-
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
26-
names=['first', 'second'])
27-
self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
28-
columns=['j_one', 'j_two', 'j_three'])
29-
30-
# a little relevant example with NAs
31-
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
32-
'qux', 'snap']
33-
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
34-
'three', 'one']
35-
36-
data = np.random.randn(len(key1))
37-
self.data = DataFrame({'key1': key1, 'key2': key2,
38-
'data': data})
39-
40-
def test_merge_on_multikey(self):
41-
joined = self.data.join(self.to_join, on=['key1', 'key2'])
42-
43-
join_key = Index(lzip(self.data['key1'], self.data['key2']))
44-
indexer = self.to_join.index.get_indexer(join_key)
45-
ex_values = self.to_join.values.take(indexer, axis=0)
44+
def test_merge_on_multikey(self, left, right):
45+
joined = left.join(right, on=['key1', 'key2'])
46+
47+
join_key = Index(lzip(left['key1'], left['key2']))
48+
indexer = right.index.get_indexer(join_key)
49+
ex_values = right.values.take(indexer, axis=0)
4650
ex_values[indexer == -1] = np.nan
47-
expected = self.data.join(DataFrame(ex_values,
48-
columns=self.to_join.columns))
51+
expected = left.join(DataFrame(ex_values, columns=right.columns))
4952

5053
# TODO: columns aren't in the same order yet
51-
assert_frame_equal(joined, expected.loc[:, joined.columns])
54+
tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
5255

53-
left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True)
56+
left = left.join(right, on=['key1', 'key2'], sort=True)
5457
right = expected.loc[:, joined.columns].sort_values(['key1', 'key2'],
5558
kind='mergesort')
56-
assert_frame_equal(left, right)
59+
tm.assert_frame_equal(left, right)
5760

5861
def test_left_join_multi_index(self):
5962
icols = ['1st', '2nd', '3rd']
@@ -119,18 +122,18 @@ def run_asserts(left, right):
119122

120123
run_asserts(left, right)
121124

122-
def test_merge_right_vs_left(self):
125+
def test_merge_right_vs_left(self, left, right):
123126
# compare left vs right merge with multikey
124127
for sort in [False, True]:
125-
merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
126-
right_index=True, how='left', sort=sort)
128+
merged1 = left.merge(right, left_on=['key1', 'key2'],
129+
right_index=True, how='left', sort=sort)
127130

128-
merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
129-
left_index=True, how='right',
130-
sort=sort)
131+
merged2 = right.merge(left, right_on=['key1', 'key2'],
132+
left_index=True, how='right',
133+
sort=sort)
131134

132135
merged2 = merged2.loc[:, merged1.columns]
133-
assert_frame_equal(merged1, merged2)
136+
tm.assert_frame_equal(merged1, merged2)
134137

135138
def test_compress_group_combinations(self):
136139

@@ -393,15 +396,13 @@ def test_join_multi_levels(self):
393396
'nl0000289965']))
394397
.set_index(['household_id', 'asset_id'])
395398
.reindex(columns=['male', 'wealth', 'name', 'share']))
396-
assert_frame_equal(result, expected)
397-
398-
assert_frame_equal(result, expected)
399+
tm.assert_frame_equal(result, expected)
399400

400401
# equivalency
401-
result2 = (merge(household.reset_index(), portfolio.reset_index(),
402+
result = (merge(household.reset_index(), portfolio.reset_index(),
402403
on=['household_id'], how='inner')
403404
.set_index(['household_id', 'asset_id']))
404-
assert_frame_equal(result2, expected)
405+
tm.assert_frame_equal(result, expected)
405406

406407
result = household.join(portfolio, how='outer')
407408
expected = (concat([
@@ -412,7 +413,7 @@ def test_join_multi_levels(self):
412413
[(4, np.nan)],
413414
names=['household_id', 'asset_id'])))
414415
], axis=0, sort=True).reindex(columns=expected.columns))
415-
assert_frame_equal(result, expected)
416+
tm.assert_frame_equal(result, expected)
416417

417418
# invalid cases
418419
household.index.name = 'foo'
@@ -471,7 +472,7 @@ def test_join_multi_levels2(self):
471472
result = (merge(household.reset_index(), log_return.reset_index(),
472473
on=['asset_id'], how='inner')
473474
.set_index(['household_id', 'asset_id', 't']))
474-
assert_frame_equal(result, expected)
475+
tm.assert_frame_equal(result, expected)
475476

476477
expected = (
477478
DataFrame(dict(
@@ -496,7 +497,7 @@ def test_join_multi_levels2(self):
496497
on=['asset_id'], how='outer')
497498
.set_index(['household_id', 'asset_id', 't']))
498499

499-
assert_frame_equal(result, expected)
500+
tm.assert_frame_equal(result, expected)
500501

501502

502503
@pytest.fixture
@@ -564,17 +565,17 @@ def test_join_multi_empty_frames(self, left_multi, right_multi, join_type,
564565
result = left_multi.join(right_multi, how=join_type).sort_index()
565566
tm.assert_frame_equal(result, expected)
566567

567-
@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
568-
def test_merge_datetime_index(self, klass):
568+
@pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
569+
def test_merge_datetime_index(self, box):
569570
# see gh-19038
570571
df = DataFrame([1, 2, 3],
571572
["2016-01-01", "2017-01-01", "2018-01-01"],
572573
columns=["a"])
573574
df.index = pd.to_datetime(df.index)
574575
on_vector = df.index.year
575576

576-
if klass is not None:
577-
on_vector = klass(on_vector)
577+
if box is not None:
578+
on_vector = box(on_vector)
578579

579580
expected = DataFrame(
580581
OrderedDict([
@@ -596,3 +597,26 @@ def test_merge_datetime_index(self, klass):
596597

597598
result = df.merge(df, on=[df.index.year], how="inner")
598599
tm.assert_frame_equal(result, expected)
600+
601+
def test_single_common_level(self):
602+
index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
603+
('K1', 'X2')],
604+
names=['key', 'X'])
605+
606+
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
607+
'B': ['B0', 'B1', 'B2']},
608+
index=index_left)
609+
610+
index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
611+
('K2', 'Y2'), ('K2', 'Y3')],
612+
names=['key', 'Y'])
613+
614+
right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
615+
'D': ['D0', 'D1', 'D2', 'D3']},
616+
index=index_right)
617+
618+
result = left.join(right)
619+
expected = pd.merge(left.reset_index(), right.reset_index(),
620+
on=['key'], how='inner').set_index(['key', 'X', 'Y'])
621+
622+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)