BUG: left join on index with multiple matches (GH5391)

behzadnouri · jreback · commit 4411ab65942c · 2014-08-21T18:07:28.000-04:00
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -565,6 +565,7 @@ Bug Fixes
 
 - Bug in ``DatetimeIndex.value_counts`` doesn't preserve tz  (:issue:`7735`)
 - Bug in ``PeriodIndex.value_counts`` results in ``Int64Index`` (:issue:`7735`)
+- Bug in ``DataFrame.join`` when doing a left join on index where a key has multiple matches (:issue:`5391`)
 
 
 
diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx
@@ -103,12 +103,18 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
     left_indexer = _get_result_indexer(left_sorter, left_indexer)
     right_indexer = _get_result_indexer(right_sorter, right_indexer)
 
-    if not sort:
-        if left_sorter.dtype != np.int_:
-            left_sorter = left_sorter.astype(np.int_)
-
-        rev = np.empty(len(left), dtype=np.int_)
-        rev.put(left_sorter, np.arange(len(left)))
+    if not sort:  # if not asked to sort, revert to original order
+        if len(left) == len(left_indexer):
+            # no multiple matches for any row on the left
+            # this is a short-cut to avoid np.argsort;
+            # otherwise, the `else` path also works in this case
+            if left_sorter.dtype != np.int_:
+                left_sorter = left_sorter.astype(np.int_)
+
+            rev = np.empty(len(left), dtype=np.int_)
+            rev.put(left_sorter, np.arange(len(left)))
+        else:
+            rev = np.argsort(left_indexer)
 
         right_indexer = right_indexer.take(rev)
         left_indexer = left_indexer.take(rev)
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -238,7 +238,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                         key_col.put(na_indexer, com.take_1d(self.left_join_keys[i],
                                                             left_na_indexer))
 
-            elif left_indexer is not None:
+            elif left_indexer is not None \
+                    and isinstance(self.left_join_keys[i], np.ndarray):
+
                 if name is None:
                     name = 'key_%d' % i
 
@@ -566,9 +568,6 @@ def _get_single_indexer(join_key, index, sort=False):
 
 
 def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
-    join_index = left_ax
-    left_indexer = None
-
     if len(join_keys) > 1:
         if not ((isinstance(right_ax, MultiIndex) and
                  len(join_keys) == right_ax.nlevels)):
@@ -577,22 +576,21 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
                                  "number of join keys must be the number of "
                                  "levels in right_ax")
 
-        left_tmp, right_indexer = \
-            _get_multiindex_indexer(join_keys, right_ax,
-                                    sort=sort)
-        if sort:
-            left_indexer = left_tmp
-            join_index = left_ax.take(left_indexer)
+        left_indexer, right_indexer = \
+            _get_multiindex_indexer(join_keys, right_ax, sort=sort)
     else:
         jkey = join_keys[0]
-        if sort:
-            left_indexer, right_indexer = \
-                _get_single_indexer(jkey, right_ax, sort=sort)
-            join_index = left_ax.take(left_indexer)
-        else:
-            right_indexer = right_ax.get_indexer(jkey)
 
-    return join_index, left_indexer, right_indexer
+        left_indexer, right_indexer = \
+            _get_single_indexer(jkey, right_ax, sort=sort)
+
+    if sort or len(left_ax) != len(left_indexer):
+        # if asked to sort or there are 1-to-many matches
+        join_index = left_ax.take(left_indexer)
+        return join_index, left_indexer, right_indexer
+    else:
+        # left frame preserves order & length of its index
+        return left_ax, None, right_indexer
 
 
 def _right_outer_join(x, y, max_groups):
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
@@ -967,6 +967,98 @@ def test_left_join_index_preserve_order(self):
                        right_on=['k1', 'k2'], how='right')
         tm.assert_frame_equal(joined.ix[:, expected.columns], expected)
 
+    def test_left_join_index_multi_match_multiindex(self):
+        left = DataFrame([
+            ['X', 'Y', 'C', 'a'],
+            ['W', 'Y', 'C', 'e'],
+            ['V', 'Q', 'A', 'h'],
+            ['V', 'R', 'D', 'i'],
+            ['X', 'Y', 'D', 'b'],
+            ['X', 'Y', 'A', 'c'],
+            ['W', 'Q', 'B', 'f'],
+            ['W', 'R', 'C', 'g'],
+            ['V', 'Y', 'C', 'j'],
+            ['X', 'Y', 'B', 'd']],
+            columns=['cola', 'colb', 'colc', 'tag'],
+            index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8])
+
+        right = DataFrame([
+            ['W', 'R', 'C',  0],
+            ['W', 'Q', 'B',  3],
+            ['W', 'Q', 'B',  8],
+            ['X', 'Y', 'A',  1],
+            ['X', 'Y', 'A',  4],
+            ['X', 'Y', 'B',  5],
+            ['X', 'Y', 'C',  6],
+            ['X', 'Y', 'C',  9],
+            ['X', 'Q', 'C', -6],
+            ['X', 'R', 'C', -9],
+            ['V', 'Y', 'C',  7],
+            ['V', 'R', 'D',  2],
+            ['V', 'R', 'D', -1],
+            ['V', 'Q', 'A', -3]],
+            columns=['col1', 'col2', 'col3', 'val'])
+
+        right.set_index(['col1', 'col2', 'col3'], inplace=True)
+        result = left.join(right, on=['cola', 'colb', 'colc'], how='left')
+
+        expected = DataFrame([
+            ['X', 'Y', 'C', 'a',   6],
+            ['X', 'Y', 'C', 'a',   9],
+            ['W', 'Y', 'C', 'e', nan],
+            ['V', 'Q', 'A', 'h',  -3],
+            ['V', 'R', 'D', 'i',   2],
+            ['V', 'R', 'D', 'i',  -1],
+            ['X', 'Y', 'D', 'b', nan],
+            ['X', 'Y', 'A', 'c',   1],
+            ['X', 'Y', 'A', 'c',   4],
+            ['W', 'Q', 'B', 'f',   3],
+            ['W', 'Q', 'B', 'f',   8],
+            ['W', 'R', 'C', 'g',   0],
+            ['V', 'Y', 'C', 'j',   7],
+            ['X', 'Y', 'B', 'd',   5]],
+            columns=['cola', 'colb', 'colc', 'tag', 'val'],
+            index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_left_join_index_multi_match(self):
+        left = DataFrame([
+            ['c', 0],
+            ['b', 1],
+            ['a', 2],
+            ['b', 3]],
+            columns=['tag', 'val'],
+            index=[2, 0, 1, 3])
+
+        right = DataFrame([
+            ['a', 'v'],
+            ['c', 'w'],
+            ['c', 'x'],
+            ['d', 'y'],
+            ['a', 'z'],
+            ['c', 'r'],
+            ['e', 'q'],
+            ['c', 's']],
+            columns=['tag', 'char'])
+
+        right.set_index('tag', inplace=True)
+        result = left.join(right, on='tag', how='left')
+
+        expected = DataFrame([
+            ['c', 0, 'w'],
+            ['c', 0, 'x'],
+            ['c', 0, 'r'],
+            ['c', 0, 's'],
+            ['b', 1, nan],
+            ['a', 2, 'v'],
+            ['a', 2, 'z'],
+            ['b', 3, nan]],
+            columns=['tag', 'val', 'char'],
+            index=[2, 2, 2, 2, 0, 1, 1, 3])
+
+        tm.assert_frame_equal(result, expected)
+
     def test_join_multi_dtypes(self):
 
         # test with multi dtypes in the join index
diff --git a/setup.py b/setup.py
@@ -448,7 +448,8 @@ def pxd(name):
            'sources': ['pandas/src/datetime/np_datetime.c',
                        'pandas/src/datetime/np_datetime_strings.c']},
     algos={'pyxfile': 'algos',
-           'depends': [srcpath('generated', suffix='.pyx')]},
+           'depends': [srcpath('generated', suffix='.pyx'),
+                       srcpath('join', suffix='.pyx')]},
     parser=dict(pyxfile='parser',
                 depends=['pandas/src/parser/tokenizer.h',
                          'pandas/src/parser/io.h',

Original file line number	Diff line number	Diff line change
`@@ -565,6 +565,7 @@ Bug Fixes`
`565`	`565`
`566`	`566`	- Bug in ``DatetimeIndex.value_counts`` doesn't preserve tz (:issue:`7735`)
`567`	`567`	- Bug in ``PeriodIndex.value_counts`` results in ``Int64Index`` (:issue:`7735`)
	`568`	+- Bug in ``DataFrame.join`` when doing a left join on index where a key has multiple matches (:issue:`5391`)
`568`	`569`
`569`	`570`
`570`	`571`