Skip to content

Commit 65de156

Browse files
committed
ENH: test suite passes with unified merge operation!! #249
1 parent 8d2c2a8 commit 65de156

File tree

3 files changed

+129
-98
lines changed

3 files changed

+129
-98
lines changed

pandas/core/frame.py

+32 -40
Original file line number | Diff line number | Diff line change
@@ -2767,51 +2767,43 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
27672767
rsuffix=rsuffix)
27682768

27692769
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix=''):
2770-
# from pandas.tools.merge import merge
2771-
2772-
# if isinstance(other, Series):
2773-
# assert(other.name is not None)
2774-
# other = DataFrame({other.name : other})
2775-
2776-
# return merge(self, other, left_on=on, left_index=on is None,
2777-
# right_index=True, suffixes=(lsuffix, rsuffix))
2770+
from pandas.tools.merge import merge
27782771

27792772
if isinstance(other, Series):
27802773
assert(other.name is not None)
27812774
other = DataFrame({other.name : other})
27822775

2783-
if on is not None:
2784-
return self._join_on(other, on, how, lsuffix, rsuffix)
2785-
else:
2786-
return self._join_index(other, how, lsuffix, rsuffix)
2787-
2788-
def _join_on(self, other, on, how, lsuffix, rsuffix):
2789-
if how not in ('left', 'inner'): # pragma: no cover
2790-
raise Exception('Only inner / left joins currently supported')
2791-
2792-
if isinstance(on, (list, tuple)):
2793-
if len(on) == 1:
2794-
join_key = self[on[0]].values
2795-
else:
2796-
join_key = lib.fast_zip([self[k] for k in on])
2797-
elif isinstance(on, np.ndarray) and len(on) == len(self):
2798-
join_key = on
2799-
else:
2800-
join_key = self[on].values
2801-
2802-
new_data = self._data.join_on(other._data, join_key, how=how, axis=1,
2803-
lsuffix=lsuffix, rsuffix=rsuffix)
2804-
return self._constructor(new_data)
2805-
2806-
def _join_index(self, other, how, lsuffix, rsuffix):
2807-
from pandas.tools.merge import join_managers
2808-
2809-
thisdata, otherdata = self._data._maybe_rename_join(
2810-
other._data, lsuffix, rsuffix, copydata=False)
2811-
2812-
# this will always ensure copied data
2813-
merged_data = join_managers(thisdata, otherdata, axis=1, how=how)
2814-
return self._constructor(merged_data)
2776+
return merge(self, other, left_on=on, how=how,
2777+
left_index=on is None, right_index=True,
2778+
suffixes=(lsuffix, rsuffix))
2779+
2780+
# def _join_on(self, other, on, how, lsuffix, rsuffix):
2781+
# if how not in ('left', 'inner'): # pragma: no cover
2782+
# raise Exception('Only inner / left joins currently supported')
2783+
2784+
# if isinstance(on, (list, tuple)):
2785+
# if len(on) == 1:
2786+
# join_key = self[on[0]].values
2787+
# else:
2788+
# join_key = lib.fast_zip([self[k] for k in on])
2789+
# elif isinstance(on, np.ndarray) and len(on) == len(self):
2790+
# join_key = on
2791+
# else:
2792+
# join_key = self[on].values
2793+
2794+
# new_data = self._data.join_on(other._data, join_key, how=how, axis=1,
2795+
# lsuffix=lsuffix, rsuffix=rsuffix)
2796+
# return self._constructor(new_data)
2797+
2798+
# def _join_index(self, other, how, lsuffix, rsuffix):
2799+
# from pandas.tools.merge import join_managers
2800+
2801+
# thisdata, otherdata = self._data._maybe_rename_join(
2802+
# other._data, lsuffix, rsuffix, copydata=False)
2803+
2804+
# # this will always ensure copied data
2805+
# merged_data = join_managers(thisdata, otherdata, axis=1, how=how)
2806+
# return self._constructor(merged_data)
28152807

28162808
#----------------------------------------------------------------------
28172809
# Statistical methods, etc.

pandas/core/internals.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -790,7 +790,7 @@ def merge(self, other, lsuffix=None, rsuffix=None):
790790
def _maybe_rename_join(self, other, lsuffix, rsuffix, exclude=None,
791791
copydata=True):
792792
to_rename = self.items.intersection(other.items)
793-
if exclude is not None:
793+
if exclude is not None and len(exclude) > 0:
794794
to_rename = to_rename - exclude
795795

796796
if len(to_rename) > 0:

pandas/tools/merge.py

+96 -57
Original file line number | Diff line number | Diff line change
@@ -64,22 +64,26 @@ def merge(left, right, how='left', on=None, left_on=None, right_on=None,
6464
# TODO: transformations??
6565
# TODO: only copy DataFrames when modification necessary
6666

67-
def join_managers(left, right, axis=1, how='left', copy=True):
68-
join_index, left_indexer, right_indexer = \
69-
left.axes[axis].join(right.axes[axis], how=how, return_indexers=True)
70-
op = _JoinOperation(left, right, join_index, left_indexer,
71-
right_indexer, axis=axis)
72-
return op.get_result(copy=copy)
67+
# def join_managers(left, right, axis=1, how='left', copy=True):
68+
# join_index, left_indexer, right_indexer = \
69+
# left.axes[axis].join(right.axes[axis], how=how, return_indexers=True)
70+
# op = _BlockJoinOperation(left, right, join_index, left_indexer,
71+
# right_indexer, axis=axis)
72+
# return op.get_result(copy=copy)
7373

7474
class _MergeOperation(object):
75+
"""
76+
77+
"""
7578

7679
def __init__(self, left, right, how='inner', on=None,
77-
left_on=None, right_on=None,
80+
left_on=None, right_on=None, axis=1,
7881
left_index=False, right_index=False, sort=True,
7982
suffixes=('.x', '.y'), copy=True):
8083
self.left = self.orig_left = left
8184
self.right = self.orig_right = right
8285
self.how = how
86+
self.axis = axis
8387

8488
self.on = _maybe_make_list(on)
8589
self.left_on = _maybe_make_list(left_on)
@@ -100,15 +104,14 @@ def __init__(self, left, right, how='inner', on=None,
100104
self.join_names) = self._get_merge_keys()
101105

102106
def get_result(self):
103-
left_indexer, right_indexer = self._get_join_indexers()
104-
new_axis = self._get_new_axis(left_indexer)
107+
join_index, left_indexer, right_indexer = self._get_join_info()
105108

106109
# this is a bit kludgy
107110
ldata, rdata = self._get_merge_data(self.join_names)
108111

109112
# TODO: more efficiently handle group keys to avoid extra consolidation!
110-
join_op = _JoinOperation(ldata, rdata, new_axis,
111-
left_indexer, right_indexer, axis=1)
113+
join_op = _BlockJoinOperation(ldata, rdata, join_index,
114+
left_indexer, right_indexer, axis=1)
112115

113116
result_data = join_op.get_result(copy=self.copy)
114117
result = DataFrame(result_data)
@@ -118,6 +121,10 @@ def get_result(self):
118121
return result
119122

120123
def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
124+
if self.left_index or self.right_index:
125+
# do nothing, already found in one of the DataFrames
126+
return
127+
121128
# insert group keys
122129
for i, name in enumerate(self.join_names):
123130
# a faster way?
@@ -128,37 +135,63 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
128135
right_na_indexer))
129136
result.insert(i, name, key_col)
130137

131-
def _get_join_indexers(self):
132-
# max groups = largest possible number of distinct groups
133-
left_key, right_key, max_groups = \
134-
_get_group_keys(self.left_join_keys, self.right_join_keys,
135-
sort=self.sort)
138+
def _get_join_info(self):
139+
left_ax = self.left._data.axes[self.axis]
140+
right_ax = self.right._data.axes[self.axis]
141+
if self.left_index and self.right_index:
142+
join_index, left_indexer, right_indexer = \
143+
left_ax.join(right_ax, how=self.how, return_indexers=True)
144+
elif self.right_index and self.how == 'left':
145+
join_index = left_ax
146+
left_indexer = None
147+
148+
# oh this is odious
149+
if len(self.left_join_keys) > 1:
150+
join_key = lib.fast_zip(self.left_join_keys)
151+
else:
152+
join_key = self.left_join_keys[0]
136153

137-
join_func = _join_functions[self.how]
138-
left_indexer, right_indexer = join_func(left_key.astype('i4'),
139-
right_key.astype('i4'),
140-
max_groups)
154+
right_indexer = right_ax.get_indexer(join_key)
155+
elif self.left_index and self.how == 'right':
156+
join_index = right_ax
157+
right_indexer = None
141158

142-
return left_indexer, right_indexer
159+
# oh this is odious
160+
if len(self.right_join_keys) > 1:
161+
join_key = lib.fast_zip(self.right_join_keys)
162+
else:
163+
join_key = self.right_join_keys[0]
143164

144-
def _get_new_axis(self, left_indexer):
145-
if left_indexer is None:
146-
new_axis = self.left.index
165+
left_indexer = left_ax.get_indexer(join_key)
147166
else:
148-
new_axis = Index(np.arange(len(left_indexer)))
149-
return new_axis
167+
# max groups = largest possible number of distinct groups
168+
left_key, right_key, max_groups = \
169+
_get_group_keys(self.left_join_keys, self.right_join_keys,
170+
sort=self.sort)
171+
172+
join_func = _join_functions[self.how]
173+
left_indexer, right_indexer = join_func(left_key.astype('i4'),
174+
right_key.astype('i4'),
175+
max_groups)
176+
177+
if self.right_index:
178+
join_index = self.left.index.take(left_indexer)
179+
elif self.left_index:
180+
join_index = self.right.index.take(right_indexer)
181+
else:
182+
join_index = Index(np.arange(len(left_indexer)))
183+
184+
return join_index, left_indexer, right_indexer
150185

151186
def _get_merge_data(self, join_names):
152187
"""
153188
Handles overlapping column names etc.
154189
"""
155190
ldata, rdata = self.left._data, self.right._data
156191
lsuf, rsuf = self.suffixes
157-
158-
# basically by construction the column names are stored in
159-
# left_on...for now
192+
exclude_names = [x for x in join_names if x is not None]
160193
ldata, rdata = ldata._maybe_rename_join(rdata, lsuf, rsuf,
161-
exclude=join_names,
194+
exclude=exclude_names,
162195
copydata=False)
163196

164197
return ldata, rdata
@@ -178,63 +211,69 @@ def _get_merge_keys(self):
178211
left_keys, right_keys
179212
"""
180213
# Hm, any way to make this logic less complicated??
181-
left_keys = []
182-
right_keys = []
183214
join_names = []
184215

185-
# need_set_names = False
186-
# pop_right = False
216+
drop = False
187217

188218
if (self.on is None and self.left_on is None
189219
and self.right_on is None):
190220

191221
if self.left_index and self.right_index:
192-
left_keys.append(self.left.index.values)
193-
right_keys.append(self.right.index.values)
194-
195-
# need_set_names = True
196-
197-
# XXX something better than this
198-
join_names.append('join_key')
222+
pass
199223
elif self.left_index:
200-
left_keys.append(self.left.index.values)
201224
if self.right_on is None:
202225
raise Exception('Must pass right_on or right_index=True')
203226
elif self.right_index:
204-
right_keys.append(self.right.index.values)
205227
if self.left_on is None:
206228
raise Exception('Must pass left_on or left_index=True')
207229
else:
208230
# use the common columns
209231
common_cols = self.left.columns.intersection(self.right.columns)
210232
self.left_on = self.right_on = common_cols
211-
212-
# pop_right = True
233+
drop = True
213234

214235
elif self.on is not None:
215236
if self.left_on is not None or self.right_on is not None:
216237
raise Exception('Can only pass on OR left_on and '
217238
'right_on')
218239
self.left_on = self.right_on = self.on
219-
220-
# pop_right = True
240+
drop = True
221241

222242
# this is a touch kludgy, but accomplishes the goal
223243
if self.right_on is not None:
224-
right = self.right.copy()
225-
right_keys.extend([right.pop(k) for k in self.right_on])
226-
self.right = right
244+
self.right, right_keys, right_names = \
245+
_get_keys(self.right, self.right_on, drop=drop)
246+
join_names = right_names
247+
else:
248+
right_keys = [self.right.index.values]
227249

228250
if self.left_on is not None:
229-
left = self.left.copy()
230-
left_keys.extend([left.pop(k) for k in self.left_on])
231-
self.left = left
232-
233-
# TODO: something else?
234-
join_names = self.left_on
251+
self.left, left_keys, left_names = \
252+
_get_keys(self.left, self.left_on, drop=drop)
253+
join_names = left_names
254+
else:
255+
left_keys = [self.left.index.values]
235256

236257
return left_keys, right_keys, join_names
237258

259+
def _get_keys(frame, on, drop=False):
260+
to_drop = []
261+
keys = []
262+
names = []
263+
for k in on:
264+
if isinstance(k, np.ndarray) and len(k) == len(frame):
265+
keys.append(k)
266+
names.append(None) # super kludge-tastic
267+
else:
268+
to_drop.append(k)
269+
keys.append(frame[k].values)
270+
names.append(k)
271+
272+
if drop:
273+
frame = frame.drop(to_drop, axis=1)
274+
275+
return frame, keys, names
276+
238277
def _get_group_keys(left_keys, right_keys, sort=True):
239278
"""
240279
@@ -326,7 +365,7 @@ def _sort_labels(uniques, left, right):
326365
return reverse_indexer.take(left), reverse_indexer.take(right)
327366

328367

329-
class _JoinOperation(object):
368+
class _BlockJoinOperation(object):
330369
"""
331370
Object responsible for orchestrating efficient join operation between two
332371
BlockManager data structures

0 commit comments

Comments
 (0)