Skip to content

Commit 30075f8

Browse files
committed
ENH: be a bit more clever to avoid consolidation with join keys, GH #733
1 parent 3cb301e commit 30075f8

File tree

3 files changed: 98 additions and 53 deletions.

pandas/core/internals.py

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -802,12 +802,8 @@ def merge(self, other, lsuffix=None, rsuffix=None):
802802

803803
return BlockManager(consolidated, new_axes)
804804

805-
def _maybe_rename_join(self, other, lsuffix, rsuffix, exclude=None,
806-
copydata=True):
805+
def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
807806
to_rename = self.items.intersection(other.items)
808-
if exclude is not None and len(exclude) > 0:
809-
to_rename = to_rename - exclude
810-
811807
if len(to_rename) > 0:
812808
if not lsuffix and not rsuffix:
813809
raise Exception('columns overlap: %s' % to_rename)
@@ -822,7 +818,6 @@ def rrenamer(x):
822818
return '%s%s' % (x, rsuffix)
823819
return x
824820

825-
# XXX: COPIES DATA!
826821
this = self.rename_items(lrenamer, copydata=copydata)
827822
other = other.rename_items(rrenamer, copydata=copydata)
828823
else:

pandas/tools/merge.py

Lines changed: 75 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -97,13 +97,27 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
9797

9898
# insert group keys
9999
for i, name in enumerate(self.join_names):
100-
# a faster way?
101-
key_col = com.take_1d(self.left_join_keys[i], left_indexer)
102-
na_indexer = (left_indexer == -1).nonzero()[0]
103-
right_na_indexer = right_indexer.take(na_indexer)
104-
key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
105-
right_na_indexer))
106-
result.insert(i, name, key_col)
100+
if name in result:
    # The join key is already a column of the joined result, so patch
    # its missing entries in place rather than inserting a new column
    # (GH #733: avoid handing back an unconsolidated DataFrame).
    key_col = result[name]

    if name in self.left:
        # Column originates from the left frame: rows without a left
        # match (left_indexer == -1) get their key from the right keys.
        na_indexer = (left_indexer == -1).nonzero()[0]
        right_na_indexer = right_indexer.take(na_indexer)
        key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                            right_na_indexer))
    else:
        # Column originates from the right frame: rows without a right
        # match (right_indexer == -1) get their key from the left keys.
        # The lookup positions must come from left_indexer; taking them
        # from right_indexer would yield only -1 values, because
        # na_indexer selects exactly the slots where right_indexer is -1.
        na_indexer = (right_indexer == -1).nonzero()[0]
        left_na_indexer = left_indexer.take(na_indexer)
        key_col.put(na_indexer, com.take_1d(self.left_join_keys[i],
                                            left_na_indexer))
else:
    # Key is not a column of the result: build it from the left keys,
    # fill rows missing on the left from the right keys, then insert it
    # at position i.
    # a faster way?
    key_col = com.take_1d(self.left_join_keys[i], left_indexer)
    na_indexer = (left_indexer == -1).nonzero()[0]
    right_na_indexer = right_indexer.take(na_indexer)
    key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                        right_na_indexer))
    result.insert(i, name, key_col)
107121

108122
def _get_join_info(self):
109123
left_ax = self.left._data.axes[self.axis]
@@ -144,17 +158,8 @@ def _get_merge_data(self):
144158
"""
145159
ldata, rdata = self.left._data, self.right._data
146160
lsuf, rsuf = self.suffixes
147-
exclude_names = set(x for x in self.join_names if x is not None)
148-
if self.left_on is not None:
149-
exclude_names -= set(c.name if hasattr(c, 'name') else c
150-
for c in self.left_on)
151-
if self.right_on is not None:
152-
exclude_names -= set(c.name if hasattr(c, 'name') else c
153-
for c in self.right_on)
154161
ldata, rdata = ldata._maybe_rename_join(rdata, lsuf, rsuf,
155-
exclude=exclude_names,
156162
copydata=False)
157-
158163
return ldata, rdata
159164

160165
def _get_merge_keys(self):
@@ -172,8 +177,6 @@ def _get_merge_keys(self):
172177
left_keys, right_keys
173178
"""
174179
# Hm, any way to make this logic less complicated??
175-
join_names = []
176-
177180
if (self.on is None and self.left_on is None
178181
and self.right_on is None):
179182

@@ -198,18 +201,56 @@ def _get_merge_keys(self):
198201
self.left_on = self.right_on = self.on
199202
self.drop_keys = True
200203

201-
# this is a touch kludgy, but accomplishes the goal
202-
left_keys = None
203-
if self.left_on is not None:
204-
self.left, left_keys, left_names = \
205-
_get_keys(self.left, self.left_on, drop=self.drop_keys)
206-
join_names = left_names
207-
208-
right_keys = None
209-
if self.right_on is not None:
210-
self.right, right_keys, right_names = \
211-
_get_keys(self.right, self.right_on, drop=self.drop_keys)
212-
join_names = right_names
204+
left_keys = []
205+
right_keys = []
206+
join_names = []
207+
left_drop, right_drop = [], []
208+
left, right = self.left, self.right
209+
210+
is_lkey = lambda x: isinstance(x, np.ndarray) and len(x) == len(left)
211+
is_rkey = lambda x: isinstance(x, np.ndarray) and len(x) == len(right)
212+
213+
# ugh, spaghetti re #733
214+
if _any(self.left_on) and _any(self.right_on):
215+
for lk, rk in zip(self.left_on, self.right_on):
216+
if is_lkey(lk):
217+
left_keys.append(lk)
218+
if is_rkey(rk):
219+
right_keys.append(rk)
220+
join_names.append(None) # what to do?
221+
else:
222+
right_keys.append(right[rk].values)
223+
join_names.append(rk)
224+
else:
225+
if not is_rkey(rk):
226+
right_keys.append(right[rk].values)
227+
if lk == rk:
228+
right_drop.append(rk)
229+
else:
230+
right_keys.append(rk)
231+
left_keys.append(left[lk].values)
232+
join_names.append(lk)
233+
elif _any(self.left_on):
234+
for k in self.left_on:
235+
if is_lkey(k):
236+
left_keys.append(k)
237+
join_names.append(None)
238+
else:
239+
left_keys.append(left[k].values)
240+
join_names.append(k)
241+
elif _any(self.right_on):
242+
for k in self.right_on:
243+
if is_rkey(k):
244+
right_keys.append(k)
245+
join_names.append(None)
246+
else:
247+
right_keys.append(right[k].values)
248+
join_names.append(k)
249+
250+
if right_drop:
251+
self.right = self.right.drop(right_drop, axis=1)
252+
if left_drop:
253+
self.left = self.left.drop(left_drop, axis=1)
213254

214255
return left_keys, right_keys, join_names
215256

@@ -271,7 +312,8 @@ def _get_group_keys(self):
271312
sort=self.sort)
272313
return left_group_key, right_group_key, max_groups
273314

274-
def _get_keys(frame, on, drop=False):
315+
316+
def _get_join_keys(left, right, left_on, right_on, drop=False):
275317
to_drop = []
276318
keys = []
277319
names = []
@@ -993,3 +1035,5 @@ def _all_indexes_same(indexes):
9931035
return False
9941036
return True
9951037

1038+
def _any(x):
1039+
return x is not None and len(x) > 0

pandas/tools/tests/test_merge.py

Lines changed: 22 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -180,25 +180,9 @@ def test_handle_overlap_arbitrary_key(self):
180180
joined = merge(self.df, self.df2,
181181
left_on='key2', right_on='key1',
182182
suffixes=['.foo', '.bar'])
183-
184183
self.assert_('key1.foo' in joined)
185184
self.assert_('key2.bar' in joined)
186185

187-
# result = merge(self.df, self.df2, on='key1')
188-
# left_on = self.df['key2'].copy()
189-
# left_on.name = 'baz'
190-
# right_on = self.df2['key1'].copy()
191-
# right_on.name = 'baz'
192-
193-
# grouped = self.df2.groupby('key1').mean()
194-
# self.assert_('key2' in grouped)
195-
196-
# joined = merge(self.df, grouped, left_on='key1',
197-
# right_index=True, suffixes=['.foo', '.bar'])
198-
# foo
199-
# self.assert_('key2.foo' in joined)
200-
# self.assert_('key2.bar' in joined)
201-
202186
def test_merge_common(self):
203187
joined = merge(self.df, self.df2)
204188
exp = merge(self.df, self.df2, on=['key1', 'key2'])
@@ -524,6 +508,28 @@ def test_join_sort(self):
524508
index=[1, 2, 0, 3])
525509
assert_frame_equal(joined, expected)
526510

511+
# smoke test
512+
joined = left.join(right, on='key', sort=False)
513+
self.assert_(np.array_equal(joined.index, range(4)))
514+
515+
def test_intelligently_handle_join_key(self):
    # GH #733: an outer merge on a shared key column should hand back a
    # consolidated DataFrame with the key column kept in place

    lhs = DataFrame({'key' : [1, 1, 2, 2, 3],
                     'value' : range(5)}, columns=['value', 'key'])
    rhs = DataFrame({'key' : [1, 1, 2, 3, 4, 5],
                     'rvalue' : range(6)})

    result = merge(lhs, rhs, on='key', how='outer')

    # keys 4 and 5 exist only on the right, so 'value' is NaN for them
    expected_value = np.array([0, 0, 1, 1, 2, 3, 4,
                               np.nan, np.nan])
    expected_rvalue = np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])
    expected = DataFrame({'key' : [1, 1, 1, 1, 2, 2, 3, 4, 5.],
                          'value' : expected_value,
                          'rvalue' : expected_rvalue},
                         columns=['value', 'key', 'rvalue'])

    assert_frame_equal(result, expected)
    self.assert_(result._data.is_consolidated())
532+
527533
class TestMergeMulti(unittest.TestCase):
528534

529535
def setUp(self):

0 commit comments

Comments
 (0)