Skip to content

Commit 5689f0a

Browse files
committed
Merge remote-tracking branch 'upstream/master' into multi-index-join
# Conflicts:
#	doc/source/whatsnew/v0.24.0.txt
#	pandas/core/indexes/base.py
#	pandas/core/reshape/merge.py
#	pandas/tests/reshape/merge/test_merge.py
2 parents b581789 + 40dfadd commit 5689f0a

File tree

1 file changed

+112
-10
lines changed

1 file changed

+112
-10
lines changed

pandas/tests/reshape/merge/test_merge.py

+112-10
Original file line number | Diff line number | Diff line change
@@ -896,17 +896,119 @@ def _check_merge(x, y):
896896
assert_frame_equal(result, expected, check_names=False)
897897

898898

899-
class TestMergeDtypes(object):
899+
class TestMergeMulti(object):
900900

901-
@pytest.mark.parametrize('right_vals', [
902-
['foo', 'bar'],
903-
Series(['foo', 'bar']).astype('category'),
904-
[1, 2],
905-
[1.0, 2.0],
906-
Series([1, 2], dtype='uint64'),
907-
Series([1, 2], dtype='int32')
908-
])
909-
def test_different(self, right_vals):
901+
def setup_method(self, method):
902+
self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
903+
['one', 'two', 'three']],
904+
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
905+
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
906+
names=['first', 'second'])
907+
self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
908+
columns=['j_one', 'j_two', 'j_three'])
909+
910+
# a little relevant example with NAs
911+
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
912+
'qux', 'snap']
913+
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
914+
'three', 'one']
915+
916+
data = np.random.randn(len(key1))
917+
self.data = DataFrame({'key1': key1, 'key2': key2,
918+
'data': data})
919+
920+
def test_merge_on_multikey(self):
921+
joined = self.data.join(self.to_join, on=['key1', 'key2'])
922+
923+
join_key = Index(lzip(self.data['key1'], self.data['key2']))
924+
indexer = self.to_join.index.get_indexer(join_key)
925+
ex_values = self.to_join.values.take(indexer, axis=0)
926+
ex_values[indexer == -1] = np.nan
927+
expected = self.data.join(DataFrame(ex_values,
928+
columns=self.to_join.columns))
929+
930+
# TODO: columns aren't in the same order yet
931+
assert_frame_equal(joined, expected.loc[:, joined.columns])
932+
933+
left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True)
934+
right = expected.loc[:, joined.columns].sort_values(['key1', 'key2'],
935+
kind='mergesort')
936+
assert_frame_equal(left, right)
937+
938+
def test_left_join_multi_index(self):
939+
icols = ['1st', '2nd', '3rd']
940+
941+
def bind_cols(df):
942+
iord = lambda a: 0 if a != a else ord(a)
943+
f = lambda ts: ts.map(iord) - ord('a')
944+
return (f(df['1st']) + f(df['3rd']) * 1e2 +
945+
df['2nd'].fillna(0) * 1e4)
946+
947+
def run_asserts(left, right):
948+
for sort in [False, True]:
949+
res = left.join(right, on=icols, how='left', sort=sort)
950+
951+
assert len(left) < len(res) + 1
952+
assert not res['4th'].isna().any()
953+
assert not res['5th'].isna().any()
954+
955+
tm.assert_series_equal(
956+
res['4th'], - res['5th'], check_names=False)
957+
result = bind_cols(res.iloc[:, :-2])
958+
tm.assert_series_equal(res['4th'], result, check_names=False)
959+
assert result.name is None
960+
961+
if sort:
962+
tm.assert_frame_equal(
963+
res, res.sort_values(icols, kind='mergesort'))
964+
965+
out = merge(left, right.reset_index(), on=icols,
966+
sort=sort, how='left')
967+
968+
res.index = np.arange(len(res))
969+
tm.assert_frame_equal(out, res)
970+
971+
lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
972+
left = DataFrame(np.random.choice(lc, (5000, 2)),
973+
columns=['1st', '3rd'])
974+
left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))
975+
976+
i = np.random.permutation(len(left))
977+
right = left.iloc[i].copy()
978+
979+
left['4th'] = bind_cols(left)
980+
right['5th'] = - bind_cols(right)
981+
right.set_index(icols, inplace=True)
982+
983+
run_asserts(left, right)
984+
985+
# inject some nulls
986+
left.loc[1::23, '1st'] = np.nan
987+
left.loc[2::37, '2nd'] = np.nan
988+
left.loc[3::43, '3rd'] = np.nan
989+
left['4th'] = bind_cols(left)
990+
991+
i = np.random.permutation(len(left))
992+
right = left.iloc[i, :-1]
993+
right['5th'] = - bind_cols(right)
994+
right.set_index(icols, inplace=True)
995+
996+
run_asserts(left, right)
997+
998+
def test_merge_right_vs_left(self):
999+
# compare left vs right merge with multikey
1000+
for sort in [False, True]:
1001+
merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
1002+
right_index=True, how='left', sort=sort)
1003+
1004+
merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
1005+
left_index=True, how='right',
1006+
sort=sort)
1007+
1008+
merged2 = merged2.loc[:, merged1.columns]
1009+
assert_frame_equal(merged1, merged2)
1010+
1011+
def test_compress_group_combinations(self):
9101012

9111013
# ~ 40000000 possible unique groups
9121014
key1 = tm.rands_array(10, 10000)

0 commit comments

Comments
 (0)