diff --git a/.gitignore b/.gitignore
index a509fcf736ea8..8445932cd68c0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,3 +103,5 @@ doc/source/index.rst
 doc/build/html/index.html
 # Windows specific leftover:
 doc/tmp.sv
+
+Untitled\.ipynb
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index 5d43d2d32af67..025d02ff5dc07 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2896,27 +2896,82 @@ def join(self, other, how='left', level=None, return_indexers=False):
 
     def _join_multi(self, other, how, return_indexers=True):
         from .multi import MultiIndex
-        self_is_mi = isinstance(self, MultiIndex)
-        other_is_mi = isinstance(other, MultiIndex)
+
+        def _complete_join(new_lvls, new_lbls, new_nms):
+            # Append every non-overlapping level (values, labels and name)
+            # to the given pieces.  Labels are re-aligned through the join
+            # indexers; an indexer entry of -1 is carried over as label -1.
+            for n in not_overlap:
+                if n in self_names:
+                    idx = lidx
+                    lvls = self.levels[self_names.index(n)].values
+                    lbls = self.labels[self_names.index(n)]
+                else:
+                    idx = ridx
+                    lvls = other.levels[other_names.index(n)].values
+                    lbls = other.labels[other_names.index(n)]
+
+                new_lvls = new_lvls.union([lvls])
+                taken = [lbls[i] if i != -1 else -1 for i in idx]
+                new_lbls = new_lbls.union([taken])
+
+                new_nms = new_nms.union([n])
+
+            return new_lvls, new_lbls, new_nms
 
         # figure out join names
         self_names = [n for n in self.names if n is not None]
         other_names = [n for n in other.names if n is not None]
         overlap = list(set(self_names) & set(other_names))
 
-        # need at least 1 in common, but not more than 1
+        # Drop the non matching levels
+        ldrop_levels = [n for n in self_names if n not in overlap]
+        rdrop_levels = [n for n in other_names if n not in overlap]
+
+        self_is_mi = isinstance(self, MultiIndex)
+        other_is_mi = isinstance(other, MultiIndex)
+
+        # need at least 1 level name in common
         if not len(overlap):
             raise ValueError("cannot join with no level specified and no "
                              "overlapping names")
-        if len(overlap) > 1:
-            raise NotImplementedError("merging with more than one level "
-                                      "overlap on a multi-index is not "
-                                      "implemented")
-        jl = overlap[0]
 
-        # make the indices into mi's that match
-        if not (self_is_mi and other_is_mi):
+        if self_is_mi and other_is_mi:
+            self_tmp = self.droplevel(ldrop_levels)
+            other_tmp = other.droplevel(rdrop_levels)
+
+            join_index, lidx, ridx = self_tmp.join(other_tmp, how=how,
+                                                   return_indexers=True)
+
+            # Append to the returned Index the non-overlapping levels
+            # (a list, not a set, so the appended level order is stable)
+            not_overlap = ldrop_levels + rdrop_levels
+
+            if how == 'left':
+                ji = self
+            elif how == 'right':
+                ji = other
+            else:
+                ji = join_index
+
+            if how == 'outer':
+                new_levels, new_labels, new_names = _complete_join(ji.levels,
+                                                                   ji.labels,
+                                                                   ji.names)
+            else:
+                new_levels = ji.levels
+                new_labels = ji.labels
+                new_names = ji.names
+
+            join_index = MultiIndex(levels=new_levels, labels=new_labels,
+                                    names=new_names, verify_integrity=False)
+
+            return join_index, lidx, ridx
+
+        else:
+            jl = overlap[0]
+            # make the indices into mi's that match
 
             flip_order = False
             if self_is_mi:
                 self, other = other, self
@@ -2933,10 +2988,6 @@ def _join_multi(self, other, how, return_indexers=True):
                 return result[0], result[2], result[1]
             return result
 
-        # 2 multi-indexes
-        raise NotImplementedError("merging with both multi-indexes is not "
-                                  "implemented")
-
     def _join_non_unique(self, other, how='left', return_indexers=False):
         from pandas.tools.merge import _get_join_indexers
 
diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py
index b3b5e7e29319b..040a1ac619a9d 100644
--- a/pandas/tests/tools/test_merge.py
+++ b/pandas/tests/tools/test_merge.py
@@ -1211,11 +1211,7 @@ def test_join_multi_levels2(self):
             .set_index(["household_id", "asset_id", "t"])
             .reindex(columns=['share', 'log_return']))
 
-        def f():
-            household.join(log_return, how='inner')
-        self.assertRaises(NotImplementedError, f)
-
         # this is the equivalency
         result = (merge(household.reset_index(), log_return.reset_index(),
                         on=['asset_id'], how='inner')
                   .set_index(['household_id', 'asset_id', 't']))
@@ -1224,7 +1220,7 @@ def f():
         expected = (
             DataFrame(dict(
                 household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
-                asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
+                asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
                           "gb00b03mlx29", "gb00b03mlx29",
                           "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                           "lu0197800237", "lu0197800237",
@@ -1237,8 +1233,50 @@ def f():
                             .09604978, -.06524096, .03532373,
                             .03025441, .036997, None, None]
             ))
-            .set_index(["household_id", "asset_id", "t"]))
+            .set_index(["household_id", "asset_id", "t"])
+            .reindex(columns=['share', 'log_return']))
 
-        def f():
-            household.join(log_return, how='outer')
-        self.assertRaises(NotImplementedError, f)
+        result = (merge(household.reset_index(), log_return.reset_index(),
+                        on=['asset_id'], how='outer')
+                  .set_index(['household_id', 'asset_id', 't']))
+
+        assert_frame_equal(result, expected)
+
+    def test_join_multi_levels3(self):
+        # left join of two MultiIndex frames sharing 3 of their 4 levels
+        matrix = (
+            pd.DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3],
+                     Destination=[1, 2, 1, 3, 1],
+                     Period=['AM', 'PM', 'IP', 'AM', 'OP'],
+                     TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+                     Trips=[1987, 3647, 2470, 4296, 4444]),
+                columns=['Origin', 'Destination', 'Period',
+                         'TripPurp', 'Trips'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+        distances = (
+            pd.DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3, 3, 5],
+                     Destination=[1, 2, 1, 2, 1, 2, 6],
+                     Period=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'],
+                     LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
+                     Distance=[100, 80, 90, 80, 75, 35, 55]),
+                columns=['Origin', 'Destination', 'Period',
+                         'LinkType', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'LinkType']))
+
+        expected = (
+            pd.DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3],
+                     Destination=[1, 2, 1, 3, 1],
+                     Period=['AM', 'PM', 'IP', 'AM', 'OP'],
+                     TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+                     Trips=[1987, 3647, 2470, 4296, 4444],
+                     Distance=[100, 80, 90, np.nan, 75]),
+                columns=['Origin', 'Destination', 'Period', 'TripPurp',
+                         'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+        result = matrix.join(distances, how='left')
+        assert_frame_equal(result, expected)