@@ -2520,139 +2520,6 @@ def test_combineMult(self):
2520
2520
comb = self .empty .combineMult (self .frame )
2521
2521
assert_frame_equal (comb , self .frame )
2522
2522
2523
def test_join_on(self):
    # Covers DataFrame.join(other, on=<column>): values of the named column
    # in the calling frame are looked up in the index of ``other``.
    index, data = tm.getMixedTypeDict()
    target = DataFrame(data, index=index)

    # Join on string value
    source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
                       index=data['C'])
    merged = target.join(source, on='C')
    self.assertTrue(np.array_equal(merged['MergedA'], target['A']))
    self.assertTrue(np.array_equal(merged['MergedD'], target['D']))

    # join with duplicates (fix regression from DataFrame/Matrix merge)
    df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
    df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
    joined = df.join(df2, on='key')
    expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
                          'value': [0, 0, 1, 1, 2]})
    assert_frame_equal(joined, expected)

    # Test when some are missing: row 'c' holds key 3, which is absent from
    # the indexes of both df_b and df_c, so its joined cells must be NaN.
    df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                     columns=['one'])
    df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                     columns=['two'])
    df_c = DataFrame([[1], [2]], index=[1, 2],
                     columns=['three'])
    joined = df_a.join(df_b, on='one')
    joined = joined.join(df_c, on='one')
    self.assertTrue(np.isnan(joined['two']['c']))
    self.assertTrue(np.isnan(joined['three']['c']))

    # merge column not present
    self.assertRaises(Exception, target.join, source, on='E')

    # nothing to merge: smoke check only, the result is not inspected
    target.join(source.reindex([]), on='C')

    # overlap
    source_copy = source.copy()
    source_copy['A'] = 0
    self.assertRaises(Exception, target.join, source_copy, on='A')

    # can't specify how
    self.assertRaises(Exception, target.join, source, on='C',
                      how='left')
2568
-
2569
- def test_join_on_singlekey_list (self ):
2570
- df = DataFrame ({'key' : ['a' , 'a' , 'b' , 'b' , 'c' ]})
2571
- df2 = DataFrame ({'value' : [0 , 1 , 2 ]}, index = ['a' , 'b' , 'c' ])
2572
-
2573
- # corner cases
2574
- joined = df .join (df2 , on = ['key' ])
2575
- expected = df .join (df2 , on = 'key' )
2576
-
2577
- assert_frame_equal (joined , expected )
2578
-
2579
def test_join_on_multikey(self):
    # Joins on two key columns against a frame with a two-level MultiIndex
    # and compares with an expected frame assembled by hand via get_indexer.
    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                               [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    to_join = DataFrame(np.random.randn(10, 3), index=index,
                        columns=['j_one', 'j_two', 'j_three'])

    # a little relevant example with NAs: 'snap' is not one of the first
    # level's values, so at least that row has no match in ``to_join``
    key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
            'qux', 'snap']
    key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
            'three', 'one']

    values = np.random.randn(len(key1))
    data = DataFrame({'key1': key1, 'key2': key2,
                      'data': values})

    joined = data.join(to_join, on=['key1', 'key2'])

    # Materialise the pairs: on Python 3 ``zip`` returns a lazy iterator,
    # which is not a sequence Index can be built from reliably.
    join_key = Index(list(zip(key1, key2)))
    indexer = to_join.index.get_indexer(join_key)
    ex_values = to_join.values.take(indexer, axis=0)
    # take() with -1 silently picks the last row; blank those rows out
    ex_values[indexer == -1] = np.nan
    expected = data.join(DataFrame(ex_values, columns=to_join.columns))

    # TODO: columns aren't in the same order yet
    assert_frame_equal(joined, expected.ix[:, joined.columns])
2608
-
2609
def test_join_index_mixed(self):
    # Index-on-index joins between frames holding mixed dtypes, checked
    # against the module's _join_by_hand reference helper.
    df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                    index=np.arange(10),
                    columns=['A', 'B', 'C', 'D'])
    # sanity-check the fixture: scalar int/bool inputs keep their dtypes
    self.assertEqual(df1['B'].dtype, np.int_)
    self.assertEqual(df1['D'].dtype, np.bool_)

    df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                    index=np.arange(0, 10, 2),
                    columns=['A', 'B', 'C', 'D'])

    # overlap: identical column names on both sides, resolved by suffixes
    joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
    expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
                        'A_two', 'B_two', 'C_two', 'D_two']
    # rename the inputs to the suffixed names before building the expected
    # frame by hand
    df1.columns = expected_columns[:4]
    df2.columns = expected_columns[4:]
    expected = _join_by_hand(df1, df2)
    assert_frame_equal(joined, expected)

    # no overlapping blocks
    df1 = DataFrame(index=np.arange(10))
    df1['bool'] = True
    df1['string'] = 'foo'

    df2 = DataFrame(index=np.arange(5, 15))
    df2['int'] = 1
    df2['float'] = 1.

    # every join type, in both directions
    for kind in JOIN_TYPES:
        joined = df1.join(df2, how=kind)
        expected = _join_by_hand(df1, df2, how=kind)
        assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=kind)
        expected = _join_by_hand(df2, df1, how=kind)
        assert_frame_equal(joined, expected)
2647
-
2648
- def test_join_on_series (self ):
2649
- pass
2650
-
2651
- def test_join_empty_bug (self ):
2652
- # generated an exception in 0.4.3
2653
- x = DataFrame ()
2654
- x .join (DataFrame ([3 ], index = [0 ], columns = ['A' ]), how = 'outer' )
2655
-
2656
2523
def test_clip (self ):
2657
2524
median = self .frame .median ().median ()
2658
2525
@@ -3184,6 +3051,160 @@ def test_series_put_names(self):
3184
3051
for k , v in series .iteritems ():
3185
3052
self .assertEqual (v .name , k )
3186
3053
3054
+
3055
+
3056
class TestDataFrameJoin(unittest.TestCase):
    """Tests for ``DataFrame.join``, both on key columns and on the index."""

    def setUp(self):
        # Shared fixture: ``source`` carries copies of target's 'A' and 'D'
        # data under new names and is indexed by the values of target's 'C'
        # data, so it can be joined back onto ``target`` with on='C'.
        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
                                index=data['C'])

    def test_join_on(self):
        target = self.target
        source = self.source

        # Join on string value
        merged = target.join(source, on='C')
        self.assertTrue(np.array_equal(merged['MergedA'], target['A']))
        self.assertTrue(np.array_equal(merged['MergedD'], target['D']))

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
                              'value': [0, 0, 1, 1, 2]})
        assert_frame_equal(joined, expected)

        # Test when some are missing: row 'c' holds key 3, which is absent
        # from the indexes of both df_b and df_c, so its cells must be NaN.
        df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                         columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2],
                         columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        self.assertTrue(np.isnan(joined['two']['c']))
        self.assertTrue(np.isnan(joined['three']['c']))

        # merge column not present
        self.assertRaises(Exception, target.join, source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        self.assertRaises(Exception, target.join, source_copy, on='A')

    def test_join_with_len0(self):
        # nothing to merge: joining a zero-row frame must still add its
        # columns, filled entirely with nulls
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            self.assertTrue(col in merged)
            self.assertTrue(merged[col].isnull().all())

    def test_join_on_inner(self):
        # 'd' and 'c' have no entry in df2's index
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        # Expected result: the default join with unmatched rows dropped.
        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notnull()]
        self.assertTrue(np.array_equal(joined['key'], expected['key']))
        self.assertTrue(np.array_equal(joined['value'], expected['value']))
        self.assertTrue(joined.index.equals(expected.index))

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner case: a one-element list must act like the bare column name
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_multikey(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        # a little relevant example with NAs: 'snap' is not one of the first
        # level's values, so at least that row has no match in ``to_join``
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']

        values = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2,
                          'data': values})

        joined = data.join(to_join, on=['key1', 'key2'])

        # Materialise the pairs: on Python 3 ``zip`` returns a lazy iterator,
        # which is not a sequence Index can be built from reliably.
        join_key = Index(list(zip(key1, key2)))
        indexer = to_join.index.get_indexer(join_key)
        ex_values = to_join.values.take(indexer, axis=0)
        # take() with -1 silently picks the last row; blank those rows out
        ex_values[indexer == -1] = np.nan
        expected = data.join(DataFrame(ex_values, columns=to_join.columns))

        # TODO: columns aren't in the same order yet
        assert_frame_equal(joined, expected.ix[:, joined.columns])

    def test_join_index_mixed(self):
        df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        # sanity-check the fixture: scalar int/bool inputs keep their dtypes
        self.assertEqual(df1['B'].dtype, np.int_)
        self.assertEqual(df1['D'].dtype, np.bool_)

        df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])

        # overlap: identical column names on both sides, resolved by suffixes
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
                            'A_two', 'B_two', 'C_two', 'D_two']
        # rename the inputs to the suffixed names before building the
        # expected frame by hand
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        # every join type, in both directions
        for kind in JOIN_TYPES:
            joined = df1.join(df2, how=kind)
            expected = _join_by_hand(df1, df2, how=kind)
            assert_frame_equal(joined, expected)

            joined = df2.join(df1, how=kind)
            expected = _join_by_hand(df2, df1, how=kind)
            assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        # Placeholder: nothing is exercised here yet.
        pass

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3; passing just means no exception
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')
3207
+
3187
3208
def _join_by_hand (a , b , how = 'left' ):
3188
3209
join_index = a .index .join (b .index , how = how )
3189
3210
0 commit comments