@@ -896,17 +896,119 @@ def _check_merge(x, y):
896
896
assert_frame_equal (result , expected , check_names = False )
897
897
898
898
899
- class TestMergeDtypes (object ):
899
+ class TestMergeMulti (object ):
900
900
901
- @pytest .mark .parametrize ('right_vals' , [
902
- ['foo' , 'bar' ],
903
- Series (['foo' , 'bar' ]).astype ('category' ),
904
- [1 , 2 ],
905
- [1.0 , 2.0 ],
906
- Series ([1 , 2 ], dtype = 'uint64' ),
907
- Series ([1 , 2 ], dtype = 'int32' )
908
- ])
909
- def test_different (self , right_vals ):
901
+ def setup_method (self , method ):
902
+ self .index = MultiIndex (levels = [['foo' , 'bar' , 'baz' , 'qux' ],
903
+ ['one' , 'two' , 'three' ]],
904
+ labels = [[0 , 0 , 0 , 1 , 1 , 2 , 2 , 3 , 3 , 3 ],
905
+ [0 , 1 , 2 , 0 , 1 , 1 , 2 , 0 , 1 , 2 ]],
906
+ names = ['first' , 'second' ])
907
+ self .to_join = DataFrame (np .random .randn (10 , 3 ), index = self .index ,
908
+ columns = ['j_one' , 'j_two' , 'j_three' ])
909
+
910
+ # a little relevant example with NAs
911
+ key1 = ['bar' , 'bar' , 'bar' , 'foo' , 'foo' , 'baz' , 'baz' , 'qux' ,
912
+ 'qux' , 'snap' ]
913
+ key2 = ['two' , 'one' , 'three' , 'one' , 'two' , 'one' , 'two' , 'two' ,
914
+ 'three' , 'one' ]
915
+
916
+ data = np .random .randn (len (key1 ))
917
+ self .data = DataFrame ({'key1' : key1 , 'key2' : key2 ,
918
+ 'data' : data })
919
+
920
def test_merge_on_multikey(self):
    """Join ``self.data`` onto ``self.to_join`` by two key columns.

    The expected frame is assembled by hand: each (key1, key2) pair is
    looked up in the MultiIndex, the matching rows are taken, and rows
    with no match are filled with NaN.  The check is then repeated with
    ``sort=True`` against a stable sort of the same expected frame.
    """
    keys = ['key1', 'key2']
    joined = self.data.join(self.to_join, on=keys)

    # Positional lookup of every key pair; -1 marks "not in the index".
    join_key = Index(lzip(self.data['key1'], self.data['key2']))
    positions = self.to_join.index.get_indexer(join_key)
    taken = self.to_join.values.take(positions, axis=0)
    not_found = positions == -1
    taken[not_found] = np.nan
    looked_up = DataFrame(taken, columns=self.to_join.columns)
    expected = self.data.join(looked_up)

    # TODO: columns aren't in the same order yet
    expected = expected.loc[:, joined.columns]
    assert_frame_equal(joined, expected)

    sorted_join = self.data.join(self.to_join, on=keys, sort=True)
    sorted_expected = expected.sort_values(keys, kind='mergesort')
    assert_frame_equal(sorted_join, sorted_expected)
937
+
938
def test_left_join_multi_index(self):
    """Left-join a flat frame onto a MultiIndexed frame by three keys.

    ``left`` carries a ``4th`` column computed from its three key columns
    and ``right`` carries ``5th``, the negation of the same quantity, so
    every joined row must satisfy ``4th == -5th``.  The scenario runs
    twice: once on clean keys and once after NaNs are injected into each
    key column.  Each run is checked for ``sort=False`` and ``sort=True``
    and compared against the equivalent ``merge`` call.
    """
    icols = ['1st', '2nd', '3rd']

    def letter_offset(ts):
        # NaN is the only value for which ``a != a``; map it to ordinal 0
        # before shifting everything by ord('a').
        return ts.map(lambda a: 0 if a != a else ord(a)) - ord('a')

    def bind_cols(df):
        # Fold the three key columns into one float per row.
        return (letter_offset(df['1st']) + letter_offset(df['3rd']) * 1e2 +
                df['2nd'].fillna(0) * 1e4)

    def run_asserts(left, right):
        for sort in [False, True]:
            res = left.join(right, on=icols, how='left', sort=sort)

            # A left join never drops rows, and every left key has a
            # partner in right, so neither value column may hold NaN.
            assert len(left) <= len(res)
            assert not res['4th'].isna().any()
            assert not res['5th'].isna().any()

            tm.assert_series_equal(
                res['4th'], - res['5th'], check_names=False)
            result = bind_cols(res.iloc[:, :-2])
            tm.assert_series_equal(res['4th'], result, check_names=False)
            assert result.name is None

            if sort:
                tm.assert_frame_equal(
                    res, res.sort_values(icols, kind='mergesort'))

            out = merge(left, right.reset_index(), on=icols,
                        sort=sort, how='left')

            # merge() returns a fresh 0..n-1 index; align before comparing.
            res.index = np.arange(len(res))
            tm.assert_frame_equal(out, res)

    lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
    left = DataFrame(np.random.choice(lc, (5000, 2)),
                     columns=['1st', '3rd'])
    left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))

    i = np.random.permutation(len(left))
    right = left.iloc[i].copy()

    left['4th'] = bind_cols(left)
    right['5th'] = - bind_cols(right)
    right.set_index(icols, inplace=True)

    run_asserts(left, right)

    # inject some nulls
    left.loc[1::23, '1st'] = np.nan
    left.loc[2::37, '2nd'] = np.nan
    left.loc[3::43, '3rd'] = np.nan
    left['4th'] = bind_cols(left)

    i = np.random.permutation(len(left))
    # Explicit copy, as in the first scenario: ``right`` is assigned into
    # and re-indexed in place below and must not alias ``left``.
    right = left.iloc[i, :-1].copy()
    right['5th'] = - bind_cols(right)
    right.set_index(icols, inplace=True)

    run_asserts(left, right)
997
+
998
def test_merge_right_vs_left(self):
    """A left merge and the mirrored right merge must give the same frame.

    ``self.data`` is merged onto ``self.to_join`` by its two key columns
    with ``how='left'``, then the operands are swapped and merged with
    ``how='right'``.  After reordering columns the two results are
    compared, for both ``sort=False`` and ``sort=True``.
    """
    # compare left vs right merge with multikey
    keys = ['key1', 'key2']
    for sort in (False, True):
        via_left = self.data.merge(self.to_join, left_on=keys,
                                   right_index=True, how='left',
                                   sort=sort)

        via_right = self.to_join.merge(self.data, right_on=keys,
                                       left_index=True, how='right',
                                       sort=sort)

        # Column order differs between the two calls; align it first.
        via_right = via_right.loc[:, via_left.columns]
        assert_frame_equal(via_left, via_right)
1010
+
1011
+ def test_compress_group_combinations (self ):
910
1012
911
1013
# ~ 40000000 possible unique groups
912
1014
key1 = tm .rands_array (10 , 10000 )
0 commit comments