@@ -64,22 +64,26 @@ def merge(left, right, how='left', on=None, left_on=None, right_on=None,
64
64
# TODO: transformations??
65
65
# TODO: only copy DataFrames when modification necessary
66
66
67
- def join_managers (left , right , axis = 1 , how = 'left' , copy = True ):
68
- join_index , left_indexer , right_indexer = \
69
- left .axes [axis ].join (right .axes [axis ], how = how , return_indexers = True )
70
- op = _JoinOperation (left , right , join_index , left_indexer ,
71
- right_indexer , axis = axis )
72
- return op .get_result (copy = copy )
67
+ # def join_managers(left, right, axis=1, how='left', copy=True):
68
+ # join_index, left_indexer, right_indexer = \
69
+ # left.axes[axis].join(right.axes[axis], how=how, return_indexers=True)
70
+ # op = _BlockJoinOperation (left, right, join_index, left_indexer,
71
+ # right_indexer, axis=axis)
72
+ # return op.get_result(copy=copy)
73
73
74
74
class _MergeOperation (object ):
75
+ """
76
+
77
+ """
75
78
76
79
def __init__ (self , left , right , how = 'inner' , on = None ,
77
- left_on = None , right_on = None ,
80
+ left_on = None , right_on = None , axis = 1 ,
78
81
left_index = False , right_index = False , sort = True ,
79
82
suffixes = ('.x' , '.y' ), copy = True ):
80
83
self .left = self .orig_left = left
81
84
self .right = self .orig_right = right
82
85
self .how = how
86
+ self .axis = axis
83
87
84
88
self .on = _maybe_make_list (on )
85
89
self .left_on = _maybe_make_list (left_on )
@@ -100,15 +104,14 @@ def __init__(self, left, right, how='inner', on=None,
100
104
self .join_names ) = self ._get_merge_keys ()
101
105
102
106
def get_result (self ):
103
- left_indexer , right_indexer = self ._get_join_indexers ()
104
- new_axis = self ._get_new_axis (left_indexer )
107
+ join_index , left_indexer , right_indexer = self ._get_join_info ()
105
108
106
109
# this is a bit kludgy
107
110
ldata , rdata = self ._get_merge_data (self .join_names )
108
111
109
112
# TODO: more efficiently handle group keys to avoid extra consolidation!
110
- join_op = _JoinOperation (ldata , rdata , new_axis ,
111
- left_indexer , right_indexer , axis = 1 )
113
+ join_op = _BlockJoinOperation (ldata , rdata , join_index ,
114
+ left_indexer , right_indexer , axis = 1 )
112
115
113
116
result_data = join_op .get_result (copy = self .copy )
114
117
result = DataFrame (result_data )
@@ -118,6 +121,10 @@ def get_result(self):
118
121
return result
119
122
120
123
def _maybe_add_join_keys (self , result , left_indexer , right_indexer ):
124
+ if self .left_index or self .right_index :
125
+ # do nothing, already found in one of the DataFrames
126
+ return
127
+
121
128
# insert group keys
122
129
for i , name in enumerate (self .join_names ):
123
130
# a faster way?
@@ -128,37 +135,63 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
128
135
right_na_indexer ))
129
136
result .insert (i , name , key_col )
130
137
131
def _get_join_info(self):
    """
    Work out the axis of the merged result together with the indexers
    used to align the left and right operands to it.

    Four situations are distinguished:

    * both ``left_index`` and ``right_index`` are set: the two axes are
      joined directly via ``left_ax.join(..., return_indexers=True)``
    * ``right_index`` with ``how == 'left'``: the left axis is kept as
      the result axis, the left indexer is ``None`` and the left join
      keys are looked up in the right axis with ``get_indexer``
    * ``left_index`` with ``how == 'right'``: the mirror image of the
      previous case
    * anything else: the join keys are converted to group labels by
      ``_get_group_keys`` and passed to the function registered for
      ``self.how`` in ``_join_functions``

    Returns
    -------
    join_index, left_indexer, right_indexer
    """
    left_ax = self.left._data.axes[self.axis]
    right_ax = self.right._data.axes[self.axis]

    def _single_lookup_key(key_arrays):
        # several key arrays are combined with lib.fast_zip so that one
        # get_indexer call can be made; a lone key array is used as is
        if len(key_arrays) > 1:
            return lib.fast_zip(key_arrays)
        return key_arrays[0]

    if self.left_index and self.right_index:
        join_index, left_indexer, right_indexer = left_ax.join(
            right_ax, how=self.how, return_indexers=True)
        return join_index, left_indexer, right_indexer

    if self.right_index and self.how == 'left':
        lookup = _single_lookup_key(self.left_join_keys)
        return left_ax, None, right_ax.get_indexer(lookup)

    if self.left_index and self.how == 'right':
        lookup = _single_lookup_key(self.right_join_keys)
        return right_ax, left_ax.get_indexer(lookup), None

    # general case
    # max groups = largest possible number of distinct groups
    left_key, right_key, max_groups = _get_group_keys(
        self.left_join_keys, self.right_join_keys, sort=self.sort)

    joiner = _join_functions[self.how]
    left_indexer, right_indexer = joiner(left_key.astype('i4'),
                                         right_key.astype('i4'),
                                         max_groups)

    # only one of the index flags can still be set at this point; the
    # result axis is then taken from the *other* frame's index
    if self.right_index:
        join_index = self.left.index.take(left_indexer)
    elif self.left_index:
        join_index = self.right.index.take(right_indexer)
    else:
        join_index = Index(np.arange(len(left_indexer)))

    return join_index, left_indexer, right_indexer
150
185
151
186
def _get_merge_data(self, join_names):
    """
    Fetch the internal data of both frames, renaming overlapping
    column names with ``self.suffixes``.

    Parameters
    ----------
    join_names : list
        Names of the join keys; ``None`` entries are skipped, all other
        entries are passed as ``exclude`` to ``_maybe_rename_join``

    Returns
    -------
    ldata, rdata
    """
    left_blocks, right_blocks = self.left._data, self.right._data
    left_suffix, right_suffix = self.suffixes

    # None entries carry no name to exclude, so they are filtered out
    named_keys = []
    for name in join_names:
        if name is not None:
            named_keys.append(name)

    left_blocks, right_blocks = left_blocks._maybe_rename_join(
        right_blocks, left_suffix, right_suffix,
        exclude=named_keys, copydata=False)

    return left_blocks, right_blocks
@@ -178,63 +211,69 @@ def _get_merge_keys(self):
178
211
left_keys, right_keys
179
212
"""
180
213
# Hm, any way to make this logic less complicated??
181
- left_keys = []
182
- right_keys = []
183
214
join_names = []
184
215
185
- # need_set_names = False
186
- # pop_right = False
216
+ drop = False
187
217
188
218
if (self .on is None and self .left_on is None
189
219
and self .right_on is None ):
190
220
191
221
if self .left_index and self .right_index :
192
- left_keys .append (self .left .index .values )
193
- right_keys .append (self .right .index .values )
194
-
195
- # need_set_names = True
196
-
197
- # XXX something better than this
198
- join_names .append ('join_key' )
222
+ pass
199
223
elif self .left_index :
200
- left_keys .append (self .left .index .values )
201
224
if self .right_on is None :
202
225
raise Exception ('Must pass right_on or right_index=True' )
203
226
elif self .right_index :
204
- right_keys .append (self .right .index .values )
205
227
if self .left_on is None :
206
228
raise Exception ('Must pass left_on or left_index=True' )
207
229
else :
208
230
# use the common columns
209
231
common_cols = self .left .columns .intersection (self .right .columns )
210
232
self .left_on = self .right_on = common_cols
211
-
212
- # pop_right = True
233
+ drop = True
213
234
214
235
elif self .on is not None :
215
236
if self .left_on is not None or self .right_on is not None :
216
237
raise Exception ('Can only pass on OR left_on and '
217
238
'right_on' )
218
239
self .left_on = self .right_on = self .on
219
-
220
- # pop_right = True
240
+ drop = True
221
241
222
242
# this is a touch kludgy, but accomplishes the goal
223
243
if self .right_on is not None :
224
- right = self .right .copy ()
225
- right_keys .extend ([right .pop (k ) for k in self .right_on ])
226
- self .right = right
244
+ self .right , right_keys , right_names = \
245
+ _get_keys (self .right , self .right_on , drop = drop )
246
+ join_names = right_names
247
+ else :
248
+ right_keys = [self .right .index .values ]
227
249
228
250
if self .left_on is not None :
229
- left = self .left .copy ()
230
- left_keys .extend ([left .pop (k ) for k in self .left_on ])
231
- self .left = left
232
-
233
- # TODO: something else?
234
- join_names = self .left_on
251
+ self .left , left_keys , left_names = \
252
+ _get_keys (self .left , self .left_on , drop = drop )
253
+ join_names = left_names
254
+ else :
255
+ left_keys = [self .left .index .values ]
235
256
236
257
return left_keys , right_keys , join_names
237
258
259
+ def _get_keys (frame , on , drop = False ):
260
+ to_drop = []
261
+ keys = []
262
+ names = []
263
+ for k in on :
264
+ if isinstance (k , np .ndarray ) and len (k ) == len (frame ):
265
+ keys .append (k )
266
+ names .append (None ) # super kludge-tastic
267
+ else :
268
+ to_drop .append (k )
269
+ keys .append (frame [k ].values )
270
+ names .append (k )
271
+
272
+ if drop :
273
+ frame = frame .drop (to_drop , axis = 1 )
274
+
275
+ return frame , keys , names
276
+
238
277
def _get_group_keys (left_keys , right_keys , sort = True ):
239
278
"""
240
279
@@ -326,7 +365,7 @@ def _sort_labels(uniques, left, right):
326
365
return reverse_indexer .take (left ), reverse_indexer .take (right )
327
366
328
367
329
- class _JoinOperation (object ):
368
+ class _BlockJoinOperation (object ):
330
369
"""
331
370
Object responsible for orchestrating efficient join operation between two
332
371
BlockManager data structures
0 commit comments