11
11
import pandas ._tseries as lib
12
12
from pandas ._sandbox import Factorizer
13
13
14
- def merge (left , right , how = 'inner ' , cols = None , left_cols = None , right_cols = None ,
14
+ def merge (left , right , how = 'left ' , on = None , left_on = None , right_on = None ,
15
15
left_index = False , right_index = False , sort = True ,
16
16
suffixes = ('.x' , '.y' ), copy = True ):
17
17
"""
@@ -25,17 +25,25 @@ def merge(left, right, how='inner', cols=None, left_cols=None, right_cols=None,
25
25
how : {'left', 'right', 'outer', 'inner'}
26
26
How to handle indexes of the two objects. Default: 'left'
27
27
for joining on index, None otherwise
28
- * left: use only keys from left frame
29
- * right: use only keys from right frame
30
- * outer: use union of keys from both frames
31
- * inner: use intersection of keys from both frames
32
- cols
33
- left_cols
34
- right_cols
35
- left_index
36
- right_index
37
- sort
38
- suffixes
28
+ * left: use only keys from left frame (SQL: left outer join)
29
+ * right: use only keys from right frame (SQL: right outer join)
30
+ * outer: use union of keys from both frames (SQL: full outer join)
31
+ * inner: use intersection of keys from both frames (SQL: inner join)
32
+ on : label or list
33
+
34
+ left_on : label or list
35
+
36
+ right_on : label or list
37
+
38
+ left_index : boolean, default False
39
+
40
+ right_index : boolean, default False
41
+
42
+ sort : boolean, default True
43
+
44
+ suffixes : 2-length sequence (tuple, list, ...)
45
+ Suffix to apply to overlapping column names in the left and right
46
+ side, respectively
39
47
copy : boolean, default True
40
48
If False, do not copy data unnecessarily
41
49
@@ -46,48 +54,153 @@ def merge(left, right, how='inner', cols=None, left_cols=None, right_cols=None,
46
54
-------
47
55
merged : DataFrame
48
56
"""
49
- left_join_keys , right_join_keys = _get_merge_keys (left , right , cols ,
50
- left_cols , right_cols ,
51
- left_index , right_index )
52
-
53
- # max groups = largest possible number of distinct groups
54
- left_key , right_key , max_groups = _get_group_keys (left_join_keys ,
55
- right_join_keys )
57
+ op = _MergeOperation (left , right , how = how , on = on , left_on = left_on ,
58
+ right_on = right_on , left_index = left_index ,
59
+ right_index = right_index , sort = sort , suffixes = suffixes ,
60
+ copy = copy )
61
+ return op .get_result ()
56
62
57
- join_func = _join_functions [how ]
58
- left_indexer , right_indexer = join_func (left_key , right_key , max_groups )
59
- new_axis = Index (np .arange (len (left_indexer )))
60
63
61
- join_op = _JoinOperation (left , right , new_axis , left_indexer ,
62
- right_indexer , axis = 1 )
63
- result_data = join_op .get_result (copy = copy )
64
- return DataFrame (result_data )
64
+ # TODO: shortcuts with MultiIndex labels already computed
65
+ # TODO: NA group handling
66
+ # TODO: DONE group column names in result
67
+ # TODO: transformations??
68
+ # TODO: only copy DataFrames when modification necessary
65
69
66
70
class _MergeOperation(object):
    """
    Holds the arguments of a single merge of two DataFrame-like objects and
    carries it out in get_result
    """

    def __init__(self, left, right, how='inner', on=None,
                 left_on=None, right_on=None,
                 left_index=False, right_index=False, sort=True,
                 suffixes=('.x', '.y'), copy=True):
        """
        Store the merge arguments

        Parameters
        ----------
        left, right : the two objects to merge
        how : key into the module-level _join_functions table
        on, left_on, right_on : label, list / tuple of labels, or None
            A lone label is wrapped in a list by _maybe_make_list
        left_index, right_index : boolean, default False
            Use the index of that side as its join key
        sort : boolean, default True
            Passed through to _get_group_keys
        suffixes : 2-length sequence
            Suffixes for overlapping column names (left, right)
        copy : boolean, default True
            Passed through to the join operation's get_result
        """
        self.left = left
        self.right = right
        self.how = how

        self.on = _maybe_make_list(on)
        self.left_on = _maybe_make_list(left_on)
        self.right_on = _maybe_make_list(right_on)

        self.copy = copy

        self.suffixes = suffixes

        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

    def get_result(self):
        """
        Run the merge and return the result wrapped in a DataFrame

        Note: has side effects, since _get_merge_keys may replace self.left
        and self.right with modified copies
        """
        # must run first: may rebind self.left / self.right, which
        # _get_merge_data reads afterwards
        left_join_keys, right_join_keys, join_names = self._get_merge_keys()

        # this is a bit kludgy
        ldata, rdata = self._get_merge_data(join_names)

        # max groups = largest possible number of distinct groups
        left_key, right_key, max_groups = \
            _get_group_keys(left_join_keys, right_join_keys, sort=self.sort)

        join_func = _join_functions[self.how]
        left_indexer, right_indexer = join_func(left_key.astype('i4'),
                                                right_key.astype('i4'),
                                                max_groups)

        # the result gets a fresh 0..n-1 axis, one entry per joined row
        new_axis = Index(np.arange(len(left_indexer)))

        join_op = _JoinOperation(ldata, rdata, new_axis,
                                 left_indexer, right_indexer, axis=1)

        result_data = join_op.get_result(copy=self.copy)
        return DataFrame(result_data)

    def _get_merge_data(self, join_names):
        """
        Handles overlapping column names etc.

        Parameters
        ----------
        join_names : list
            Names passed as ``exclude`` to the rename step

        Returns
        -------
        ldata, rdata : the (possibly renamed) internal data of left / right
        """
        ldata, rdata = self.left._data, self.right._data
        lsuf, rsuf = self.suffixes

        # basically by construction the column names are stored in
        # left_on...for now
        ldata, rdata = ldata._maybe_rename_join(rdata, lsuf, rsuf,
                                                exclude=join_names,
                                                copydata=False)

        return ldata, rdata

    def _get_merge_keys(self):
        """
        Work out the key arrays to join on for each side

        Note: has side effects (copy/delete key columns). self.left,
        self.right, self.left_on and self.right_on may all be rebound.

        Returns
        -------
        left_keys, right_keys, join_names

        Raises
        ------
        Exception
            If only one of left_index / right_index is set and no column
            keys are given, or if ``on`` is combined with left_on / right_on
        """
        # Hm, any way to make this logic less complicated??
        left_keys = []
        right_keys = []
        join_names = []

        need_set_names = False
        pop_right = False

        if (self.on is None and self.left_on is None
                and self.right_on is None):

            if self.left_index and self.right_index:
                left_keys.append(self.left.index.values)
                right_keys.append(self.right.index.values)

                need_set_names = True
                # XXX something better than this
                join_names.append('join_key')
            elif self.left_index:
                # NOTE(review): right_on is always None inside this outer
                # branch, so this path can only raise -- confirm intent
                left_keys.append(self.left.index.values)
                if self.right_on is None:
                    raise Exception('Must pass right_on or right_index=True')
            elif self.right_index:
                # NOTE(review): same as above, left_on is always None here
                right_keys.append(self.right.index.values)
                if self.left_on is None:
                    raise Exception('Must pass left_on or left_index=True')
            else:
                # use the common columns
                common_cols = self.left.columns.intersection(self.right.columns)
                self.left_on = self.right_on = common_cols
                pop_right = True
        elif self.on is not None:
            if self.left_on is not None or self.right_on is not None:
                raise Exception('Can only pass on OR left_on and '
                                'right_on')
            self.left_on = self.right_on = self.on
            pop_right = True

        if self.right_on is not None:
            # this is a touch kludgy, but accomplishes the goal
            if pop_right:
                # key columns are shared with the left side: remove them from
                # a copy of right so they are not duplicated in the result
                right = self.right.copy()
                right_keys.extend([right.pop(k) for k in self.right_on])
                self.right = right
            else:
                # read from self.right: no local ``right`` exists on this
                # path, so the bare name raised NameError before
                right_keys.extend([self.right[k] for k in self.right_on])

        if need_set_names:
            # index-on-index join: materialize the key as a leading column
            self.left = self.left.copy()
            for i, (lkey, name) in enumerate(zip(left_keys, join_names)):
                self.left.insert(i, name, lkey)

        if self.left_on is not None:
            left_keys.extend([self.left[k] for k in self.left_on])

        return left_keys, right_keys, join_names
202
+
203
+ def _get_group_keys (left_keys , right_keys , sort = True ):
91
204
"""
92
205
93
206
Parameters
@@ -111,9 +224,21 @@ def _get_group_keys(left_keys, right_keys):
111
224
llab , _ = rizer .factorize (lk .astype ('O' ))
112
225
rlab , _ = rizer .factorize (rk .astype ('O' ))
113
226
227
+ count = rizer .get_count ()
228
+
229
+ if sort :
230
+ sorter = Index (rizer .uniques ).argsort ()
231
+ reverse_indexer = np .empty (len (sorter ), dtype = np .int32 )
232
+ reverse_indexer .put (sorter , np .arange (len (sorter )))
233
+
234
+ llab = reverse_indexer .take (llab )
235
+ rlab = reverse_indexer .take (rlab )
236
+
237
+ # TODO: na handling
238
+
114
239
left_labels .append (llab )
115
240
right_labels .append (rlab )
116
- group_sizes .append (rizer . get_count () )
241
+ group_sizes .append (count )
117
242
118
243
left_group_key = get_group_index (left_labels , group_sizes )
119
244
right_group_key = get_group_index (right_labels , group_sizes )
@@ -123,6 +248,11 @@ def _get_group_keys(left_keys, right_keys):
123
248
124
249
import pandas ._sandbox as sbx
125
250
251
def _maybe_make_list(obj):
    """
    Wrap a lone value in a one-element list

    None, tuples and lists are handed back unchanged (same object); anything
    else is returned as ``[obj]``
    """
    if obj is None or isinstance(obj, (tuple, list)):
        return obj
    return [obj]
255
+
126
256
def _right_outer_join(x, y):
    """
    Right outer join expressed through sbx.left_outer_join

    The arguments are passed in swapped order (y, x) and the two returned
    indexers are swapped back, so the result is ordered (for x, for y)
    """
    # NOTE(review): _MergeOperation.get_result calls its join function with a
    # third max_groups argument; if this function is registered in
    # _join_functions the two-argument signature would not match -- confirm
    indexer_for_y, indexer_for_x = sbx.left_outer_join(y, x)
    return indexer_for_x, indexer_for_y
0 commit comments