Skip to content

Commit ff090f4

Browse files
committed
Merge pull request #9210 from behzadnouri/lji
BUG: bug in left join on multi-index with sort=True or nulls
2 parents 22f1486 + 6ca893f commit ff090f4

File tree

3 files changed

+117
-24
lines changed

3 files changed

+117
-24
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ Bug Fixes
101101

102102
- Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`, :issue:`5873`)
103103
- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`7466`)
104+
- Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
104105

105106

106107

pandas/tools/merge.py

+29-18
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from pandas.core.categorical import Categorical
1010
from pandas.core.frame import DataFrame, _merge_doc
1111
from pandas.core.generic import NDFrame
12-
from pandas.core.groupby import get_group_index
1312
from pandas.core.series import Series
1413
from pandas.core.index import (Index, MultiIndex, _get_combined_index,
1514
_ensure_index, _get_consensus_names,
@@ -525,27 +524,39 @@ def get_result(self):
525524
return result
526525

527526

528-
def _get_multiindex_indexer(join_keys, index, sort=False):
529-
shape = []
530-
labels = []
531-
for level, key in zip(index.levels, join_keys):
532-
llab, rlab, count = _factorize_keys(level, key, sort=False)
533-
labels.append(rlab)
534-
shape.append(count)
527+
def _get_multiindex_indexer(join_keys, index, sort):
528+
from functools import partial
535529

536-
left_group_key = get_group_index(labels, shape)
537-
right_group_key = get_group_index(index.labels, shape)
530+
# bind `sort` argument
531+
fkeys = partial(_factorize_keys, sort=sort)
538532

539-
left_group_key, right_group_key, max_groups = \
540-
_factorize_keys(left_group_key, right_group_key,
541-
sort=False)
533+
# left & right join labels and num. of levels at each location
534+
rlab, llab, shape = map(list, zip( * map(fkeys, index.levels, join_keys)))
535+
if sort:
536+
rlab = list(map(np.take, rlab, index.labels))
537+
else:
538+
i8copy = lambda a: a.astype('i8', subok=False, copy=True)
539+
rlab = list(map(i8copy, index.labels))
542540

543-
left_indexer, right_indexer = \
544-
algos.left_outer_join(com._ensure_int64(left_group_key),
545-
com._ensure_int64(right_group_key),
546-
max_groups, sort=False)
541+
# fix right labels if there were any nulls
542+
for i in range(len(join_keys)):
543+
mask = index.labels[i] == -1
544+
if mask.any():
545+
# check if there already was any nulls at this location
546+
# if there was, it is factorized to `shape[i] - 1`
547+
a = join_keys[i][llab[i] == shape[i] - 1]
548+
if a.size == 0 or not a[0] != a[0]:
549+
shape[i] += 1
547550

548-
return left_indexer, right_indexer
551+
rlab[i][mask] = shape[i] - 1
552+
553+
# get flat i8 join keys
554+
lkey, rkey = _get_join_keys(llab, rlab, shape, sort)
555+
556+
# factorize keys to a dense i8 space
557+
lkey, rkey, count = fkeys(lkey, rkey)
558+
559+
return algos.left_outer_join(lkey, rkey, count, sort=sort)
549560

550561

551562
def _get_single_indexer(join_key, index, sort=False):

pandas/tools/tests/test_merge.py

+87-6
Original file line numberDiff line numberDiff line change
@@ -901,14 +901,78 @@ def test_merge_on_multikey(self):
901901
# TODO: columns aren't in the same order yet
902902
assert_frame_equal(joined, expected.ix[:, joined.columns])
903903

904+
left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True)
905+
right = expected.ix[:, joined.columns].sort(['key1', 'key2'],
906+
kind='mergesort')
907+
assert_frame_equal(left, right)
908+
909+
def test_left_join_multi_index(self):
910+
icols = ['1st', '2nd', '3rd']
911+
912+
def bind_cols(df):
913+
iord = lambda a: 0 if a != a else ord(a)
914+
f = lambda ts: ts.map(iord) - ord('a')
915+
return f(df['1st']) + f(df['3rd'])* 1e2 + df['2nd'].fillna(0) * 1e4
916+
917+
def run_asserts(left, right):
918+
for sort in [False, True]:
919+
res = left.join(right, on=icols, how='left', sort=sort)
920+
921+
self.assertTrue(len(left) < len(res) + 1)
922+
self.assertFalse(res['4th'].isnull().any())
923+
self.assertFalse(res['5th'].isnull().any())
924+
925+
tm.assert_series_equal(res['4th'], - res['5th'])
926+
tm.assert_series_equal(res['4th'], bind_cols(res.iloc[:, :-2]))
927+
928+
if sort:
929+
tm.assert_frame_equal(res,
930+
res.sort(icols, kind='mergesort'))
931+
932+
out = merge(left, right.reset_index(), on=icols,
933+
sort=sort, how='left')
934+
935+
res.index = np.arange(len(res))
936+
tm.assert_frame_equal(out, res)
937+
938+
lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
939+
left = DataFrame(np.random.choice(lc, (5000, 2)),
940+
columns=['1st', '3rd'])
941+
left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))
942+
943+
i = np.random.permutation(len(left))
944+
right = left.iloc[i].copy()
945+
946+
left['4th'] = bind_cols(left)
947+
right['5th'] = - bind_cols(right)
948+
right.set_index(icols, inplace=True)
949+
950+
run_asserts(left, right)
951+
952+
# inject some nulls
953+
left.loc[1::23, '1st'] = np.nan
954+
left.loc[2::37, '2nd'] = np.nan
955+
left.loc[3::43, '3rd'] = np.nan
956+
left['4th'] = bind_cols(left)
957+
958+
i = np.random.permutation(len(left))
959+
right = left.iloc[i, :-1]
960+
right['5th'] = - bind_cols(right)
961+
right.set_index(icols, inplace=True)
962+
963+
run_asserts(left, right)
964+
904965
def test_merge_right_vs_left(self):
905966
# compare left vs right merge with multikey
906-
merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
907-
right_index=True, how='left')
908-
merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
909-
left_index=True, how='right')
910-
merged2 = merged2.ix[:, merged1.columns]
911-
assert_frame_equal(merged1, merged2)
967+
for sort in [False, True]:
968+
merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
969+
right_index=True, how='left', sort=sort)
970+
971+
merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
972+
left_index=True, how='right', sort=sort)
973+
974+
merged2 = merged2.ix[:, merged1.columns]
975+
assert_frame_equal(merged1, merged2)
912976

913977
def test_compress_group_combinations(self):
914978

@@ -943,6 +1007,8 @@ def test_left_join_index_preserve_order(self):
9431007
expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'),'v2'] = 7
9441008

9451009
tm.assert_frame_equal(result, expected)
1010+
tm.assert_frame_equal(result.sort(['k1', 'k2'], kind='mergesort'),
1011+
left.join(right, on=['k1', 'k2'], sort=True))
9461012

9471013
# test join with multi dtypes blocks
9481014
left = DataFrame({'k1': [0, 1, 2] * 8,
@@ -961,6 +1027,8 @@ def test_left_join_index_preserve_order(self):
9611027
expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'),'v2'] = 7
9621028

9631029
tm.assert_frame_equal(result, expected)
1030+
tm.assert_frame_equal(result.sort(['k1', 'k2'], kind='mergesort'),
1031+
left.join(right, on=['k1', 'k2'], sort=True))
9641032

9651033
# do a right join for an extra test
9661034
joined = merge(right, left, left_index=True,
@@ -1022,6 +1090,12 @@ def test_left_join_index_multi_match_multiindex(self):
10221090

10231091
tm.assert_frame_equal(result, expected)
10241092

1093+
result = left.join(right, on=['cola', 'colb', 'colc'],
1094+
how='left', sort=True)
1095+
1096+
tm.assert_frame_equal(result,
1097+
expected.sort(['cola', 'colb', 'colc'], kind='mergesort'))
1098+
10251099
# GH7331 - maintain left frame order in left merge
10261100
right.reset_index(inplace=True)
10271101
right.columns = left.columns[:3].tolist() + right.columns[-1:].tolist()
@@ -1066,6 +1140,9 @@ def test_left_join_index_multi_match(self):
10661140

10671141
tm.assert_frame_equal(result, expected)
10681142

1143+
result = left.join(right, on='tag', how='left', sort=True)
1144+
tm.assert_frame_equal(result, expected.sort('tag', kind='mergesort'))
1145+
10691146
# GH7331 - maintain left frame order in left merge
10701147
result = merge(left, right.reset_index(), how='left', on='tag')
10711148
expected.index = np.arange(len(expected))
@@ -1094,6 +1171,10 @@ def _test(dtype1,dtype2):
10941171

10951172
tm.assert_frame_equal(result, expected)
10961173

1174+
result = left.join(right, on=['k1', 'k2'], sort=True)
1175+
expected.sort(['k1', 'k2'], kind='mergesort', inplace=True)
1176+
tm.assert_frame_equal(result, expected)
1177+
10971178
for d1 in [np.int64,np.int32,np.int16,np.int8,np.uint8]:
10981179
for d2 in [np.int64,np.float64,np.float32,np.float16]:
10991180
_test(np.dtype(d1),np.dtype(d2))

0 commit comments

Comments
 (0)