Skip to content

Commit 21308b2

Browse files
committed
BUG: DataFrame.join on keys produces wrong result and does not preserve order
1 parent bb654f8 commit 21308b2

File tree

4 files changed

+58
-5
lines changed

4 files changed

+58
-5
lines changed

pandas/src/join.pyx

+12-3
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def inner_join(ndarray[int32_t] left, ndarray[int32_t] right,
5151
_get_result_indexer(right_sorter, right_indexer))
5252

5353
def left_outer_join(ndarray[int32_t] left, ndarray[int32_t] right,
54-
Py_ssize_t max_groups):
54+
Py_ssize_t max_groups, sort=True):
5555
cdef:
5656
Py_ssize_t i, j, k, count = 0
5757
ndarray[int32_t] left_count, right_count, left_sorter, right_sorter
@@ -101,8 +101,17 @@ def left_outer_join(ndarray[int32_t] left, ndarray[int32_t] right,
101101
left_pos += lc
102102
right_pos += rc
103103

104-
return (_get_result_indexer(left_sorter, left_indexer),
105-
_get_result_indexer(right_sorter, right_indexer))
104+
left_indexer = _get_result_indexer(left_sorter, left_indexer)
105+
right_indexer = _get_result_indexer(right_sorter, right_indexer)
106+
107+
if not sort:
108+
rev = np.empty(len(left), dtype='i4')
109+
rev.put(left_sorter, np.arange(len(left)))
110+
111+
right_indexer = right_indexer.take(rev)
112+
left_indexer = left_indexer.take(rev)
113+
114+
return left_indexer, right_indexer
106115

107116

108117
def full_outer_join(ndarray[int32_t] left, ndarray[int32_t] right,

pandas/tests/test_tseries.py

+21
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,27 @@ def test_left_join_indexer():
8080
expected = np.array([1, 1, 2, 3, 3], dtype='i4')
8181
assert(np.array_equal(result, expected))
8282

83+
def test_left_outer_join_bug():
84+
left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3,
85+
2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1,
86+
3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0,
87+
3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3,
88+
2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0,
89+
3, 1, 2, 0, 2], dtype=np.int32)
90+
91+
right = np.array([3, 1], dtype=np.int32)
92+
max_groups = 4
93+
94+
lidx, ridx = lib.left_outer_join(left, right, max_groups, sort=False)
95+
96+
exp_lidx = np.arange(len(left))
97+
exp_ridx = -np.ones(len(left))
98+
exp_ridx[left == 1] = 1
99+
exp_ridx[left == 3] = 0
100+
101+
assert(np.array_equal(lidx, exp_lidx))
102+
assert(np.array_equal(ridx, exp_ridx))
103+
83104
def test_inner_join_indexer():
84105
a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
85106
b = np.array([0, 3, 5, 7, 9], dtype=np.int64)

pandas/tools/merge.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -319,12 +319,12 @@ def _get_multiindex_indexer(join_keys, index, sort=True):
319319

320320
left_group_key, right_group_key, max_groups = \
321321
_factorize_int64(left_group_key, right_group_key,
322-
sort=sort)
322+
sort=False)
323323

324324
left_indexer, right_indexer = \
325325
lib.left_outer_join(left_group_key.astype('i4'),
326326
right_group_key.astype('i4'),
327-
max_groups)
327+
max_groups, sort=False)
328328

329329
return right_indexer
330330

pandas/tools/tests/test_merge.py

+23
Original file line numberDiff line numberDiff line change
@@ -1042,6 +1042,29 @@ def test_concat_single_with_key(self):
10421042
expected = concat([df, df], keys=['foo', 'bar'])
10431043
tm.assert_frame_equal(result, expected[:10])
10441044

1045+
def test_left_join_index_preserve_order(self):
1046+
1047+
left = DataFrame({'k1' : [0, 1, 2] * 4,
1048+
'k2' : ['foo', 'bar'] * 6,
1049+
'v' : np.arange(12)})
1050+
1051+
index = MultiIndex.from_tuples([(2, 'bar')])
1052+
right = DataFrame({'v2' : 5}, index=index)
1053+
1054+
result = left.join(right, on=['k1', 'k2'])
1055+
1056+
expected = left.copy()
1057+
expected['v2'] = np.nan
1058+
expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
1059+
1060+
tm.assert_frame_equal(result, expected)
1061+
1062+
# do a right join for an extra test
1063+
joined = merge(right, left, left_index=True,
1064+
right_on=['k1', 'k2'], how='right')
1065+
tm.assert_frame_equal(joined.ix[:, expected.columns], expected)
1066+
1067+
10451068
if __name__ == '__main__':
10461069
import nose
10471070
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)