pandas-dev · harisbal · Apr 12, 2017 · Apr 12, 2017 · Apr 12, 2017 · Apr 12, 2017
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -2896,27 +2896,83 @@ def join(self, other, how='left', level=None, return_indexers=False):
 
     def _join_multi(self, other, how, return_indexers=True):
         from .multi import MultiIndex
-        self_is_mi = isinstance(self, MultiIndex)
-        other_is_mi = isinstance(other, MultiIndex)
 
+        def _complete_join():
+            new_lvls = join_index.levels
+            new_lbls = join_index.labels
+            new_nms = join_index.names
+
+            for n in not_overlap:
+                if n in self_names:
+                    idx = lidx
+                    lvls = self.levels[self_names.index(n)].values
+                    lbls = self.labels[self_names.index(n)]
+                else:
+                    idx = ridx
+                    lvls = other.levels[other_names.index(n)].values
+                    lbls = other.labels[other_names.index(n)]
+
+                new_lvls = new_lvls.union([lvls])                    
+                l = [lbls[i] if i!=-1 else -1 for i in idx]  
+                new_lbls = new_lbls.union([l])
+
+                new_nms = new_nms.union([n])
+
+            return  new_lvls, new_lbls, new_nms
+
         # figure out join names
         self_names = [n for n in self.names if n is not None]
         other_names = [n for n in other.names if n is not None]
         overlap = list(set(self_names) & set(other_names))
 
+        # Collect the non-matching levels that must be dropped
+        ldrop_levels = [l for l in self_names if l not in overlap]
+        rdrop_levels = [l for l in other_names if l not in overlap]
+
+        self_is_mi = isinstance(self, MultiIndex)
+        other_is_mi = isinstance(other, MultiIndex)
+
         # need at least 1 in common, but not more than 1
         if not len(overlap):
-            raise ValueError("cannot join with no level specified and no "
-                             "overlapping names")
-        if len(overlap) > 1:
-            raise NotImplementedError("merging with more than one level "
-                                      "overlap on a multi-index is not "
-                                      "implemented")
-        jl = overlap[0]
+            raise ValueError("cannot join with no overlapping index names")
+
+        if self_is_mi and other_is_mi:
+            self_tmp = self.droplevel(ldrop_levels)
+            other_tmp = other.droplevel(rdrop_levels)
+
+            if not (other_tmp.is_unique and self_tmp.is_unique):
+                raise TypeError(" The index resulting from the overlapping "
+                                 "levels is not unique")
+
+            join_index, lidx, ridx = self_tmp.join(other_tmp, how=how,
+                                                   return_indexers=True)
+
+            # Append the non-overlapping levels to the returned Index
+            not_overlap = ldrop_levels + rdrop_levels
+
+            if how == 'left':
+                join_index = self
+            elif how == 'right':
+                join_index = other
+            else:
+                join_index = join_index
+
+            if how == 'outer':
+                new_levels, new_labels, new_names = _complete_join()
+            else:
+                new_levels = join_index.levels
+                new_labels = join_index.labels
+                new_names = join_index.names
+
+            join_index = MultiIndex(levels=new_levels, labels=new_labels,
+                                    names=new_names, verify_integrity=False)
+
+            return join_index, lidx, ridx
 
-        # make the indices into mi's that match
-        if not (self_is_mi and other_is_mi):
+        else:
+            jl = overlap[0]
 
+            # make the indices into mi's that match
             flip_order = False
             if self_is_mi:
                 self, other = other, self
@@ -2933,10 +2989,6 @@ def _join_multi(self, other, how, return_indexers=True):
                     return result[0], result[2], result[1]
             return result
 
-        # 2 multi-indexes
-        raise NotImplementedError("merging with both multi-indexes is not "
-                                  "implemented")
-
     def _join_non_unique(self, other, how='left', return_indexers=False):
         from pandas.tools.merge import _get_join_indexers
 

diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py
@@ -1211,11 +1211,7 @@ def test_join_multi_levels2(self):
             .set_index(["household_id", "asset_id", "t"])
             .reindex(columns=['share', 'log_return']))
 
-        def f():
-            household.join(log_return, how='inner')
-        self.assertRaises(NotImplementedError, f)
-
-        # this is the equivalency
+        # this is the equivalency
         result = (merge(household.reset_index(), log_return.reset_index(),
                         on=['asset_id'], how='inner')
                   .set_index(['household_id', 'asset_id', 't']))
@@ -1224,7 +1220,7 @@ def f():
         expected = (
             DataFrame(dict(
                 household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
-                asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
+                asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
                           "gb00b03mlx29", "gb00b03mlx29",
                           "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                           "lu0197800237", "lu0197800237",
@@ -1237,8 +1233,177 @@ def f():
                             .09604978, -.06524096, .03532373,
                             .03025441, .036997, None, None]
             ))
-            .set_index(["household_id", "asset_id", "t"]))
+            .set_index(["household_id", "asset_id", "t"])
+            .reindex(columns=['share', 'log_return']))
+
+        result = (merge(household.reset_index(), log_return.reset_index(),
+                on=['asset_id'], how='outer')
+          .set_index(['household_id', 'asset_id', 't']))
+
+        assert_frame_equal(result, expected)
+
+    def test_join_multi_levels3(self):
+        # Multi-index join tests
+        # Self join
+        matrix = (
+            pd.DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3],
+                     Destination=[1, 2, 1, 3, 1],
+                     Period=['AM','PM','IP','AM','OP'],
+                     TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+                     Trips=[1987, 3647, 2470, 4296, 4444]),
+                columns=['Origin', 'Destination', 'Period',
+                         'TripPurp', 'Trips'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+        distances = (
+            pd.DataFrame(
+                dict(Origin=     [1, 1, 2, 2, 3, 3, 5],
+                     Destination=[1, 2, 1, 2, 1, 2, 6],
+                     Period=['AM','PM','IP','AM','OP','IP', 'AM'],
+                     LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
+                     Distance=[100, 80, 90, 80, 75, 35, 55]),
+                columns=['Origin', 'Destination', 'Period', 
+                         'LinkType', 'Distance'])
+            .set_index(['Origin', 'Destination','Period', 'LinkType']))
+
+        expected = (
+            pd.DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3],
+                     Destination=[1, 2, 1, 3, 1],
+                     Period=['AM','PM','IP','AM','OP'],
+                     TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+                     Trips=[1987, 3647, 2470, 4296, 4444],
+                     Trips_joined=[1987, 3647, 2470, 4296, 4444]),
+                columns=['Origin', 'Destination', 'Period',
+                         'TripPurp', 'Trips', 'Trips_joined'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+        result = matrix.join(matrix, how='inner', rsuffix='_joined') 
+        assert_frame_equal(result, expected)
+
+        #Left join
+        expected = (
+            pd.DataFrame(
+                dict(Origin=     [1, 1, 2, 2, 3],
+                     Destination=[1, 2, 1, 3, 1],
+                     Period=['AM','PM','IP', 'AM', 'OP'],
+                     TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+                     Trips=[1987, 3647, 2470, 4296, 4444],
+                     Distance=[100, 80, 90, np.nan, 75]),
+                columns=['Origin', 'Destination', 'Period', 'TripPurp', 
+                         'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+        result = matrix.join(distances, how='left')
+        assert_frame_equal(result, expected)
+
+        #Right join
+        expected = (
+            pd.DataFrame(
+                dict(Origin=     [1, 1, 2, 2, 3, 3, 5],
+                     Destination=[1, 2, 1, 2, 1, 2, 6],
+                     Period=['AM','PM','IP','AM','OP','IP', 'AM'],
+                     LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
+                     Trips=[1987, 3647, 2470, np.nan, 4444, np.nan, np.nan],
+                     Distance=[100, 80, 90, 80, 75, 35, 55]),
+                columns=['Origin', 'Destination', 'Period', 
+                         'LinkType', 'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination','Period', 'LinkType']))
+
+        result = matrix.join(distances, how='right')
+        assert_frame_equal(result, expected)
+
+        #Inner join
+        expected = (
+            pd.DataFrame(
+                dict(Origin=     [1, 1, 2, 3],
+                     Destination=[1, 2, 1, 1],
+                     Period=['AM','PM','IP', 'OP'],
+                     Trips=[1987, 3647, 2470, 4444],
+                     Distance=[100, 80, 90, 75]),
+                columns=['Origin', 'Destination', 'Period', 'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period']))
+
+        result = matrix.join(distances, how='inner')
+        assert_frame_equal(result, expected)
 
+        #Outer join
+        expected = (
+            pd.DataFrame(
+                dict(Origin=     [1, 1, 2, 2, 2, 3, 3, 5],
+                     Destination=[1, 2, 1, 2, 3, 1, 2, 6],
+                     Period=['AM','PM','IP', 'AM', 'AM', 'OP', 'IP', 'AM'],
+                     TripPurp=['hbw', 'nhb', 'hbo', np.nan, 'nhb',
+                               'hbw', np.nan, np.nan],
+                     LinkType=['a', 'a', 'c', 'b', np.nan, 'a', 'b', 'a'],
+                     Trips=[1987, 3647, 2470, np.nan, 4296, 4444, np.nan, np.nan],
+                     Distance=[100, 80, 90, 80, np.nan, 75, 35, 55]),
+                columns=['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType', 
+                         'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType']))
+
+
+        result = matrix.join(distances, how='outer')
+        assert_frame_equal(result, expected)
+
+        # Non-unique resulting index
+        distances2 = (
+            pd.DataFrame(
+                dict(Origin=     [1, 1, 2],
+                     Destination=[1, 1, 1],
+                     Period=['AM','AM', 'PM'],
+                     LinkType=['a', 'b', 'a'],
+                     Distance=[100, 110, 120]),
+                columns=['Origin', 'Destination', 'Period', 
+                         'LinkType', 'Distance'])
+            .set_index(['Origin', 'Destination','Period', 'LinkType']))
+
+        def f():
+            matrix.join(distances2, how='left')
+        self.assertRaises(TypeError, f)
+
+        # No overlapping level names
+        distances2 = (
+            pd.DataFrame(
+                dict(Orig=     [1, 1, 2, 2, 3, 3, 5],
+                     Dest=[1, 2, 1, 2, 1, 2, 6],
+                     Per=['AM','PM','IP','AM','OP','IP', 'AM'],
+                     LinkTyp=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
+                     Dist=[100, 80, 90, 80, 75, 35, 55]),
+                columns=['Orig', 'Dest', 'Per', 
+                         'LinkTyp', 'Dist'])
+            .set_index(['Orig', 'Dest','Per', 'LinkTyp']))
+
         def f():
-            household.join(log_return, how='outer')
-        self.assertRaises(NotImplementedError, f)
+            matrix.join(distances2, how='left')
+        self.assertRaises(ValueError, f)
+
+        # Empty Level
+
+        distances2 = (
+            pd.DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3, 3, 5],
+                     Destination=[1, 2, 1, 2, 1, 2, 6],
+                     Period=[np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan],
+                     LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
+                     Distance=[100, 80, 90, 80, 75, 35, 55]),
+                columns=['Origin', 'Destination', 'Period', 
+                         'LinkType', 'Distance'])
+            .set_index(['Origin', 'Destination','Period', 'LinkType']))
+
+
+        expected = (
+            pd.DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3],
+                     Destination=[1, 2, 1, 3, 1],
+                     Period=['AM','PM','IP','AM','OP'],
+                     TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+                     Trips=[1987, 3647, 2470, 4296, 4444],
+                     Distance=[np.nan, np.nan, np.nan, np.nan, np.nan]),
+                columns=['Origin', 'Destination', 'Period',
+                         'TripPurp', 'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+        result = matrix.join(distances2, how='left')
+        assert_frame_equal(result, expected)