Skip to content

Merge multi-index with a multi-index #15980

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 12 commits into from
82 changes: 67 additions & 15 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2896,27 +2896,83 @@ def join(self, other, how='left', level=None, return_indexers=False):

def _join_multi(self, other, how, return_indexers=True):
from .multi import MultiIndex
self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)

def _complete_join():
    """Append the non-overlapping levels to the joined index.

    Reads ``join_index``, ``not_overlap``, ``self_names``, ``other_names``,
    ``lidx`` and ``ridx`` (plus ``self`` and ``other``) from the enclosing
    scope. For every name in ``not_overlap`` the level values and labels
    are taken from whichever index owns that name, and the labels are
    re-ordered through the matching indexer.

    Returns
    -------
    tuple
        ``(levels, labels, names)`` of ``join_index`` extended by one
        entry per non-overlapping level.
    """
    levels = join_index.levels
    labels = join_index.labels
    names = join_index.names

    for name in not_overlap:
        # Pick the index that owns this level, together with its indexer.
        if name in self_names:
            source, source_names, indexer = self, self_names, lidx
        else:
            source, source_names, indexer = other, other_names, ridx

        position = source_names.index(name)
        level_values = source.levels[position].values
        source_labels = source.labels[position]

        # An indexer entry of -1 is propagated as label -1 (presumably a
        # row with no match on that side -- confirm against ``join``).
        # NOTE(review): this iterates the indexer directly; verify that
        # ``join`` cannot hand back ``None`` for ``lidx``/``ridx`` here.
        remapped = [-1 if i == -1 else source_labels[i] for i in indexer]

        levels = levels.union([level_values])
        labels = labels.union([remapped])
        names = names.union([name])

    return levels, labels, names

# figure out join names
self_names = [n for n in self.names if n is not None]
other_names = [n for n in other.names if n is not None]
overlap = list(set(self_names) & set(other_names))

# Drop the non matching levels
ldrop_levels = [l for l in self_names if l not in overlap]
rdrop_levels = [l for l in other_names if l not in overlap]

self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)

# need at least 1 in common, but not more than 1
if not len(overlap):
raise ValueError("cannot join with no level specified and no "
"overlapping names")
if len(overlap) > 1:
raise NotImplementedError("merging with more than one level "
"overlap on a multi-index is not "
"implemented")
jl = overlap[0]
raise ValueError("cannot join with no overlapping index names")

if self_is_mi and other_is_mi:
self_tmp = self.droplevel(ldrop_levels)
other_tmp = other.droplevel(rdrop_levels)

if not (other_tmp.is_unique and self_tmp.is_unique):
raise TypeError(" The index resulting from the overlapping "
"levels is not unique")

join_index, lidx, ridx = self_tmp.join(other_tmp, how=how,
return_indexers=True)

# Append to the returned Index the non-overlapping levels
not_overlap = ldrop_levels + rdrop_levels

if how == 'left':
join_index = self
elif how == 'right':
join_index = other
else:
join_index = join_index

if how == 'outer':
new_levels, new_labels, new_names = _complete_join()
else:
new_levels = join_index.levels
new_labels = join_index.labels
new_names = join_index.names

join_index = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names, verify_integrity=False)

return join_index, lidx, ridx

# make the indices into mi's that match
if not (self_is_mi and other_is_mi):
else:
jl = overlap[0]

# make the indices into mi's that match
flip_order = False
if self_is_mi:
self, other = other, self
Expand All @@ -2933,10 +2989,6 @@ def _join_multi(self, other, how, return_indexers=True):
return result[0], result[2], result[1]
return result

# 2 multi-indexes
raise NotImplementedError("merging with both multi-indexes is not "
"implemented")

def _join_non_unique(self, other, how='left', return_indexers=False):
from pandas.tools.merge import _get_join_indexers

Expand Down
183 changes: 174 additions & 9 deletions pandas/tests/tools/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,11 +1211,7 @@ def test_join_multi_levels2(self):
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=['share', 'log_return']))

def f():
household.join(log_return, how='inner')
self.assertRaises(NotImplementedError, f)

# this is the equivalency
# this is the equivalency
result = (merge(household.reset_index(), log_return.reset_index(),
on=['asset_id'], how='inner')
.set_index(['household_id', 'asset_id', 't']))
Expand All @@ -1224,7 +1220,7 @@ def f():
expected = (
DataFrame(dict(
household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why would this change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a typo in the original test. I am quite sure because the deleted ID (nl0000289783) does not appear in any DataFrame in the `test_join_multi_levels2` function.

asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
"gb00b03mlx29", "gb00b03mlx29",
"gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
"lu0197800237", "lu0197800237",
Expand All @@ -1237,8 +1233,177 @@ def f():
.09604978, -.06524096, .03532373,
.03025441, .036997, None, None]
))
.set_index(["household_id", "asset_id", "t"]))
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=['share', 'log_return']))

result = (merge(household.reset_index(), log_return.reset_index(),
on=['asset_id'], how='outer')
.set_index(['household_id', 'asset_id', 't']))

assert_frame_equal(result, expected)

def test_join_multi_levels3(self):
# Multi-index join tests
# Self join
matrix = (
pd.DataFrame(
dict(Origin=[1, 1, 2, 2, 3],
Destination=[1, 2, 1, 3, 1],
Period=['AM','PM','IP','AM','OP'],
TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
Trips=[1987, 3647, 2470, 4296, 4444]),
columns=['Origin', 'Destination', 'Period',
'TripPurp', 'Trips'])
.set_index(['Origin', 'Destination', 'Period', 'TripPurp']))

distances = (
pd.DataFrame(
dict(Origin= [1, 1, 2, 2, 3, 3, 5],
Destination=[1, 2, 1, 2, 1, 2, 6],
Period=['AM','PM','IP','AM','OP','IP', 'AM'],
LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
Distance=[100, 80, 90, 80, 75, 35, 55]),
columns=['Origin', 'Destination', 'Period',
'LinkType', 'Distance'])
.set_index(['Origin', 'Destination','Period', 'LinkType']))

expected = (
pd.DataFrame(
dict(Origin=[1, 1, 2, 2, 3],
Destination=[1, 2, 1, 3, 1],
Period=['AM','PM','IP','AM','OP'],
TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
Trips=[1987, 3647, 2470, 4296, 4444],
Trips_joined=[1987, 3647, 2470, 4296, 4444]),
columns=['Origin', 'Destination', 'Period',
'TripPurp', 'Trips', 'Trips_joined'])
.set_index(['Origin', 'Destination', 'Period', 'TripPurp']))

result = matrix.join(matrix, how='inner', rsuffix='_joined')
assert_frame_equal(result, expected)

#Left join
expected = (
pd.DataFrame(
dict(Origin= [1, 1, 2, 2, 3],
Destination=[1, 2, 1, 3, 1],
Period=['AM','PM','IP', 'AM', 'OP'],
TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
Trips=[1987, 3647, 2470, 4296, 4444],
Distance=[100, 80, 90, np.nan, 75]),
columns=['Origin', 'Destination', 'Period', 'TripPurp',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so the very first test you should have is a self-join.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean `result = matrix.join(matrix, how='inner', rsuffix='_joined')`?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @jreback, I've implemented the requested changes, but I'm not sure they show up because I force-pushed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nothing seems to have been pushed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nevertheless, the current commit includes all the requested changes. You can review them whenever possible.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it had been pushed, it would show up.
There are 2 commits; the last one is from April 12.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it should be fine now. My changes were pushed in a second commit, but I have now merged them. Really sorry about the confusion, I'm still getting to know Git.

'Trips', 'Distance'])
.set_index(['Origin', 'Destination', 'Period', 'TripPurp']))

result = matrix.join(distances, how='left')
assert_frame_equal(result, expected)

#Right join
expected = (
pd.DataFrame(
dict(Origin= [1, 1, 2, 2, 3, 3, 5],
Destination=[1, 2, 1, 2, 1, 2, 6],
Period=['AM','PM','IP','AM','OP','IP', 'AM'],
LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
Trips=[1987, 3647, 2470, np.nan, 4444, np.nan, np.nan],
Distance=[100, 80, 90, 80, 75, 35, 55]),
columns=['Origin', 'Destination', 'Period',
'LinkType', 'Trips', 'Distance'])
.set_index(['Origin', 'Destination','Period', 'LinkType']))

result = matrix.join(distances, how='right')
assert_frame_equal(result, expected)

#Inner join
expected = (
pd.DataFrame(
dict(Origin= [1, 1, 2, 3],
Destination=[1, 2, 1, 1],
Period=['AM','PM','IP', 'OP'],
Trips=[1987, 3647, 2470, 4444],
Distance=[100, 80, 90, 75]),
columns=['Origin', 'Destination', 'Period', 'Trips', 'Distance'])
.set_index(['Origin', 'Destination', 'Period']))

result = matrix.join(distances, how='inner')
assert_frame_equal(result, expected)

#Outer join
expected = (
pd.DataFrame(
dict(Origin= [1, 1, 2, 2, 2, 3, 3, 5],
Destination=[1, 2, 1, 2, 3, 1, 2, 6],
Period=['AM','PM','IP', 'AM', 'AM', 'OP', 'IP', 'AM'],
TripPurp=['hbw', 'nhb', 'hbo', np.nan, 'nhb',
'hbw', np.nan, np.nan],
LinkType=['a', 'a', 'c', 'b', np.nan, 'a', 'b', 'a'],
Trips=[1987, 3647, 2470, np.nan, 4296, 4444, np.nan, np.nan],
Distance=[100, 80, 90, 80, np.nan, 75, 35, 55]),
columns=['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType',
'Trips', 'Distance'])
.set_index(['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType']))


result = matrix.join(distances, how='outer')
assert_frame_equal(result, expected)

#Non-unique resulting index
distances2 = (
pd.DataFrame(
dict(Origin= [1, 1, 2],
Destination=[1, 1, 1],
Period=['AM','AM', 'PM'],
LinkType=['a', 'b', 'a'],
Distance=[100, 110, 120]),
columns=['Origin', 'Destination', 'Period',
'LinkType', 'Distance'])
.set_index(['Origin', 'Destination','Period', 'LinkType']))

def f():
matrix.join(distances2, how='left')
self.assertRaises(TypeError, f)

#No-overlapping level names
distances2 = (
pd.DataFrame(
dict(Orig= [1, 1, 2, 2, 3, 3, 5],
Dest=[1, 2, 1, 2, 1, 2, 6],
Per=['AM','PM','IP','AM','OP','IP', 'AM'],
LinkTyp=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
Dist=[100, 80, 90, 80, 75, 35, 55]),
columns=['Orig', 'Dest', 'Per',
'LinkTyp', 'Dist'])
.set_index(['Orig', 'Dest','Per', 'LinkTyp']))

def f():
household.join(log_return, how='outer')
self.assertRaises(NotImplementedError, f)
matrix.join(distances2, how='left')
self.assertRaises(ValueError, f)

# Empty Level

distances2 = (
pd.DataFrame(
dict(Origin=[1, 1, 2, 2, 3, 3, 5],
Destination=[1, 2, 1, 2, 1, 2, 6],
Period=[np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan],
LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
Distance=[100, 80, 90, 80, 75, 35, 55]),
columns=['Origin', 'Destination', 'Period',
'LinkType', 'Distance'])
.set_index(['Origin', 'Destination','Period', 'LinkType']))


expected = (
pd.DataFrame(
dict(Origin=[1, 1, 2, 2, 3],
Destination=[1, 2, 1, 3, 1],
Period=['AM','PM','IP','AM','OP'],
TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
Trips=[1987, 3647, 2470, 4296, 4444],
Distance=[np.nan, np.nan, np.nan, np.nan, np.nan]),
columns=['Origin', 'Destination', 'Period',
'TripPurp', 'Trips', 'Distance'])
.set_index(['Origin', 'Destination', 'Period', 'TripPurp']))

result = matrix.join(distances2, how='left')
assert_frame_equal(result, expected)