Skip to content

Commit 4411ab6

Browse files
behzadnouri authored and jreback committed
BUG: left join on index with multiple matches (GH5391)
1 parent aa5e55e commit 4411ab6

File tree

5 files changed

+122
-24
lines changed

5 files changed

+122
-24
lines changed

doc/source/v0.15.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -565,6 +565,7 @@ Bug Fixes
565565

566566
- Bug in ``DatetimeIndex.value_counts`` doesn't preserve tz (:issue:`7735`)
567567
- Bug in ``PeriodIndex.value_counts`` results in ``Int64Index`` (:issue:`7735`)
568+
- Bug in ``DataFrame.join`` when doing left join on index and there are multiple matches (:issue:`5391`)
568569

569570

570571

pandas/src/join.pyx

+12 -6
Original file line number | Diff line number | Diff line change
@@ -103,12 +103,18 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
103103
left_indexer = _get_result_indexer(left_sorter, left_indexer)
104104
right_indexer = _get_result_indexer(right_sorter, right_indexer)
105105

106-
if not sort:
107-
if left_sorter.dtype != np.int_:
108-
left_sorter = left_sorter.astype(np.int_)
109-
110-
rev = np.empty(len(left), dtype=np.int_)
111-
rev.put(left_sorter, np.arange(len(left)))
106+
if not sort: # if not asked to sort, revert to original order
107+
if len(left) == len(left_indexer):
108+
# no multiple matches for any row on the left
109+
# this is a short-cut to avoid np.argsort;
110+
# otherwise, the `else` path also works in this case
111+
if left_sorter.dtype != np.int_:
112+
left_sorter = left_sorter.astype(np.int_)
113+
114+
rev = np.empty(len(left), dtype=np.int_)
115+
rev.put(left_sorter, np.arange(len(left)))
116+
else:
117+
rev = np.argsort(left_indexer)
112118

113119
right_indexer = right_indexer.take(rev)
114120
left_indexer = left_indexer.take(rev)

pandas/tools/merge.py

+15 -17
Original file line number | Diff line number | Diff line change
@@ -238,7 +238,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
238238
key_col.put(na_indexer, com.take_1d(self.left_join_keys[i],
239239
left_na_indexer))
240240

241-
elif left_indexer is not None:
241+
elif left_indexer is not None \
242+
and isinstance(self.left_join_keys[i], np.ndarray):
243+
242244
if name is None:
243245
name = 'key_%d' % i
244246

@@ -566,9 +568,6 @@ def _get_single_indexer(join_key, index, sort=False):
566568

567569

568570
def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
569-
join_index = left_ax
570-
left_indexer = None
571-
572571
if len(join_keys) > 1:
573572
if not ((isinstance(right_ax, MultiIndex) and
574573
len(join_keys) == right_ax.nlevels)):
@@ -577,22 +576,21 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
577576
"number of join keys must be the number of "
578577
"levels in right_ax")
579578

580-
left_tmp, right_indexer = \
581-
_get_multiindex_indexer(join_keys, right_ax,
582-
sort=sort)
583-
if sort:
584-
left_indexer = left_tmp
585-
join_index = left_ax.take(left_indexer)
579+
left_indexer, right_indexer = \
580+
_get_multiindex_indexer(join_keys, right_ax, sort=sort)
586581
else:
587582
jkey = join_keys[0]
588-
if sort:
589-
left_indexer, right_indexer = \
590-
_get_single_indexer(jkey, right_ax, sort=sort)
591-
join_index = left_ax.take(left_indexer)
592-
else:
593-
right_indexer = right_ax.get_indexer(jkey)
594583

595-
return join_index, left_indexer, right_indexer
584+
left_indexer, right_indexer = \
585+
_get_single_indexer(jkey, right_ax, sort=sort)
586+
587+
if sort or len(left_ax) != len(left_indexer):
588+
# if asked to sort or there are 1-to-many matches
589+
join_index = left_ax.take(left_indexer)
590+
return join_index, left_indexer, right_indexer
591+
else:
592+
# left frame preserves order & length of its index
593+
return left_ax, None, right_indexer
596594

597595

598596
def _right_outer_join(x, y, max_groups):

pandas/tools/tests/test_merge.py

+92
Original file line number | Diff line number | Diff line change
@@ -967,6 +967,98 @@ def test_left_join_index_preserve_order(self):
967967
right_on=['k1', 'k2'], how='right')
968968
tm.assert_frame_equal(joined.ix[:, expected.columns], expected)
969969

970+
def test_left_join_index_multi_match_multiindex(self):
971+
left = DataFrame([
972+
['X', 'Y', 'C', 'a'],
973+
['W', 'Y', 'C', 'e'],
974+
['V', 'Q', 'A', 'h'],
975+
['V', 'R', 'D', 'i'],
976+
['X', 'Y', 'D', 'b'],
977+
['X', 'Y', 'A', 'c'],
978+
['W', 'Q', 'B', 'f'],
979+
['W', 'R', 'C', 'g'],
980+
['V', 'Y', 'C', 'j'],
981+
['X', 'Y', 'B', 'd']],
982+
columns=['cola', 'colb', 'colc', 'tag'],
983+
index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8])
984+
985+
right = DataFrame([
986+
['W', 'R', 'C', 0],
987+
['W', 'Q', 'B', 3],
988+
['W', 'Q', 'B', 8],
989+
['X', 'Y', 'A', 1],
990+
['X', 'Y', 'A', 4],
991+
['X', 'Y', 'B', 5],
992+
['X', 'Y', 'C', 6],
993+
['X', 'Y', 'C', 9],
994+
['X', 'Q', 'C', -6],
995+
['X', 'R', 'C', -9],
996+
['V', 'Y', 'C', 7],
997+
['V', 'R', 'D', 2],
998+
['V', 'R', 'D', -1],
999+
['V', 'Q', 'A', -3]],
1000+
columns=['col1', 'col2', 'col3', 'val'])
1001+
1002+
right.set_index(['col1', 'col2', 'col3'], inplace=True)
1003+
result = left.join(right, on=['cola', 'colb', 'colc'], how='left')
1004+
1005+
expected = DataFrame([
1006+
['X', 'Y', 'C', 'a', 6],
1007+
['X', 'Y', 'C', 'a', 9],
1008+
['W', 'Y', 'C', 'e', nan],
1009+
['V', 'Q', 'A', 'h', -3],
1010+
['V', 'R', 'D', 'i', 2],
1011+
['V', 'R', 'D', 'i', -1],
1012+
['X', 'Y', 'D', 'b', nan],
1013+
['X', 'Y', 'A', 'c', 1],
1014+
['X', 'Y', 'A', 'c', 4],
1015+
['W', 'Q', 'B', 'f', 3],
1016+
['W', 'Q', 'B', 'f', 8],
1017+
['W', 'R', 'C', 'g', 0],
1018+
['V', 'Y', 'C', 'j', 7],
1019+
['X', 'Y', 'B', 'd', 5]],
1020+
columns=['cola', 'colb', 'colc', 'tag', 'val'],
1021+
index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8])
1022+
1023+
tm.assert_frame_equal(result, expected)
1024+
1025+
def test_left_join_index_multi_match(self):
1026+
left = DataFrame([
1027+
['c', 0],
1028+
['b', 1],
1029+
['a', 2],
1030+
['b', 3]],
1031+
columns=['tag', 'val'],
1032+
index=[2, 0, 1, 3])
1033+
1034+
right = DataFrame([
1035+
['a', 'v'],
1036+
['c', 'w'],
1037+
['c', 'x'],
1038+
['d', 'y'],
1039+
['a', 'z'],
1040+
['c', 'r'],
1041+
['e', 'q'],
1042+
['c', 's']],
1043+
columns=['tag', 'char'])
1044+
1045+
right.set_index('tag', inplace=True)
1046+
result = left.join(right, on='tag', how='left')
1047+
1048+
expected = DataFrame([
1049+
['c', 0, 'w'],
1050+
['c', 0, 'x'],
1051+
['c', 0, 'r'],
1052+
['c', 0, 's'],
1053+
['b', 1, nan],
1054+
['a', 2, 'v'],
1055+
['a', 2, 'z'],
1056+
['b', 3, nan]],
1057+
columns=['tag', 'val', 'char'],
1058+
index=[2, 2, 2, 2, 0, 1, 1, 3])
1059+
1060+
tm.assert_frame_equal(result, expected)
1061+
9701062
def test_join_multi_dtypes(self):
9711063

9721064
# test with multi dtypes in the join index

setup.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -448,7 +448,8 @@ def pxd(name):
448448
'sources': ['pandas/src/datetime/np_datetime.c',
449449
'pandas/src/datetime/np_datetime_strings.c']},
450450
algos={'pyxfile': 'algos',
451-
'depends': [srcpath('generated', suffix='.pyx')]},
451+
'depends': [srcpath('generated', suffix='.pyx'),
452+
srcpath('join', suffix='.pyx')]},
452453
parser=dict(pyxfile='parser',
453454
depends=['pandas/src/parser/tokenizer.h',
454455
'pandas/src/parser/io.h',

0 commit comments

Comments
 (0)