Skip to content

merge multi-index with a multi-index #15978

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,5 @@ doc/source/index.rst
doc/build/html/index.html
# Windows specific leftover:
doc/tmp.sv

Untitled\.ipynb
118 changes: 105 additions & 13 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2896,27 +2896,78 @@ def join(self, other, how='left', level=None, return_indexers=False):

def _join_multi(self, other, how, return_indexers=True):
from .multi import MultiIndex
self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)

def _complete_join(new_lvls, new_lbls, new_nms):
for n in not_overlap:
if n in self_names:
idx = lidx
lvls = self.levels[self_names.index(n)].values
lbls = self.labels[self_names.index(n)]
else:
idx = ridx
lvls = other.levels[other_names.index(n)].values
lbls = other.labels[other_names.index(n)]

new_lvls = new_levels.union([lvls])
l = [lbls[i] if i!=-1 else -1 for i in idx]
new_lbls = new_lbls.union([l])

new_nms = new_nms.union([n])

return new_lvls, new_lbls, new_nms

# figure out join names
self_names = [n for n in self.names if n is not None]
other_names = [n for n in other.names if n is not None]
overlap = list(set(self_names) & set(other_names))

# Drop the non matching levels
ldrop_levels = [l for l in self_names if l not in overlap]
rdrop_levels = [l for l in other_names if l not in overlap]

self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)

# need at least 1 in common, but not more than 1
if not len(overlap):
raise ValueError("cannot join with no level specified and no "
"overlapping names")
if len(overlap) > 1:
raise NotImplementedError("merging with more than one level "
"overlap on a multi-index is not "
"implemented")
jl = overlap[0]

# make the indices into mi's that match
if not (self_is_mi and other_is_mi):
if self_is_mi and other_is_mi:
self_tmp = self.droplevel(ldrop_levels)
other_tmp = other.droplevel(rdrop_levels)

join_index, lidx, ridx = self_tmp.join(other_tmp, how=how,
return_indexers=True)

# Append to the returned Index the non-overlapping levels
not_overlap = (set(self_names) ^ set(other_names))

if how == 'left':
ji = self
elif how == 'right':
ji = other
else:
ji = join_index

if how == 'outer':
new_levels, new_labels, new_names = _complete_join(ji.levels,
ji.labels,
ji.names)
else:
new_levels = ji.levels
new_labels = ji.labels
new_names = ji.names

join_index = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names, verify_integrity=False)

return join_index, lidx, ridx

else:
jl = overlap[0]

# make the indices into mi's that match
flip_order = False
if self_is_mi:
self, other = other, self
Expand All @@ -2933,10 +2984,6 @@ def _join_multi(self, other, how, return_indexers=True):
return result[0], result[2], result[1]
return result

# 2 multi-indexes
raise NotImplementedError("merging with both multi-indexes is not "
"implemented")

def _join_non_unique(self, other, how='left', return_indexers=False):
from pandas.tools.merge import _get_join_indexers

Expand Down Expand Up @@ -3868,3 +3915,48 @@ def _trim_front(strings):
def _validate_join_method(method):
if method not in ['left', 'right', 'inner', 'outer']:
raise ValueError('do not recognize join method %s' % method)

if __name__ == '__main__':
    # Ad-hoc demonstration of joining two frames whose MultiIndexes share
    # the 'Origin', 'Destination' and 'Period' levels. It prints both
    # inputs, then the hand-written expected frame and the actual join
    # result so they can be compared by eye.
    import numpy as np  # needed for np.nan in the expected frame below
    import pandas as pd

    matrix = (
        pd.DataFrame(
            dict(Origin=[1, 1, 2, 2, 3],
                 Destination=[1, 2, 1, 3, 1],
                 Period=['AM', 'PM', 'IP', 'AM', 'OP'],
                 TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
                 Trips=[1987, 3647, 2470, 4296, 4444]),
            columns=['Origin', 'Destination', 'Period', 'TripPurp', 'Trips'])
        .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))

    distances = (
        pd.DataFrame(
            dict(Origin=[1, 1, 2, 2, 3, 3, 5],
                 Destination=[1, 2, 1, 2, 1, 2, 6],
                 Period=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'],
                 LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
                 Distance=[100, 80, 90, 80, 75, 35, 55]),
            columns=['Origin', 'Destination', 'Period', 'LinkType',
                     'Distance'])
        .set_index(['Origin', 'Destination', 'Period', 'LinkType']))

    # The (2, 3, 'AM') row of ``matrix`` has no counterpart in
    # ``distances``, hence the NaN distance.
    expected = (
        pd.DataFrame(
            dict(Origin=[1, 1, 2, 2, 3],
                 Destination=[1, 2, 1, 3, 1],
                 Period=['AM', 'PM', 'IP', 'AM', 'OP'],
                 TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
                 Trips=[1987, 3647, 2470, 4296, 4444],
                 Distance=[100, 80, 90, np.nan, 75]),
            columns=['Origin', 'Destination', 'Period', 'TripPurp',
                     'Trips', 'Distance'])
        .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))

    print(matrix)
    print(distances)

    result = matrix.join(distances, how='left')

    print(expected)

    print(result)
59 changes: 49 additions & 10 deletions pandas/tests/tools/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,11 +1211,7 @@ def test_join_multi_levels2(self):
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=['share', 'log_return']))

def f():
household.join(log_return, how='inner')
self.assertRaises(NotImplementedError, f)

# this is the equivalency
# this is the equivalent result computed via merge
result = (merge(household.reset_index(), log_return.reset_index(),
on=['asset_id'], how='inner')
.set_index(['household_id', 'asset_id', 't']))
Expand All @@ -1224,7 +1220,7 @@ def f():
expected = (
DataFrame(dict(
household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
"gb00b03mlx29", "gb00b03mlx29",
"gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
"lu0197800237", "lu0197800237",
Expand All @@ -1237,8 +1233,51 @@ def f():
.09604978, -.06524096, .03532373,
.03025441, .036997, None, None]
))
.set_index(["household_id", "asset_id", "t"]))
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=['share', 'log_return']))

def f():
household.join(log_return, how='outer')
self.assertRaises(NotImplementedError, f)
result = (merge(household.reset_index(), log_return.reset_index(),
on=['asset_id'], how='outer')
.set_index(['household_id', 'asset_id', 't']))

assert_frame_equal(result, expected)

def test_join_multi_levels3(self):
    """
    Left-join two frames whose MultiIndexes overlap on several levels.

    ``matrix`` is indexed by Origin/Destination/Period/TripPurp and
    ``distances`` by Origin/Destination/Period/LinkType. The expected
    frame keeps the index of ``matrix`` and gains a ``Distance`` column,
    which is NaN for the one row without a match in ``distances``.
    """
    shared_levels = ['Origin', 'Destination', 'Period']
    left_levels = shared_levels + ['TripPurp']
    right_levels = shared_levels + ['LinkType']

    matrix_data = {
        'Origin': [1, 1, 2, 2, 3],
        'Destination': [1, 2, 1, 3, 1],
        'Period': ['AM', 'PM', 'IP', 'AM', 'OP'],
        'TripPurp': ['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
        'Trips': [1987, 3647, 2470, 4296, 4444],
    }
    matrix = pd.DataFrame(matrix_data, columns=left_levels + ['Trips'])
    matrix = matrix.set_index(left_levels)

    distances_data = {
        'Origin': [1, 1, 2, 2, 3, 3, 5],
        'Destination': [1, 2, 1, 2, 1, 2, 6],
        'Period': ['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'],
        'LinkType': ['a', 'a', 'c', 'b', 'a', 'b', 'a'],
        'Distance': [100, 80, 90, 80, 75, 35, 55],
    }
    distances = pd.DataFrame(distances_data,
                             columns=right_levels + ['Distance'])
    distances = distances.set_index(right_levels)

    expected_data = {
        'Origin': [1, 1, 2, 2, 3],
        'Destination': [1, 2, 1, 3, 1],
        'Period': ['AM', 'PM', 'IP', 'AM', 'OP'],
        'TripPurp': ['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
        'Trips': [1987, 3647, 2470, 4296, 4444],
        'Distance': [100, 80, 90, np.nan, 75],
    }
    expected = pd.DataFrame(expected_data,
                            columns=left_levels + ['Trips', 'Distance'])
    expected = expected.set_index(left_levels)

    result = matrix.join(distances, how='left')
    assert_frame_equal(result, expected)