Skip to content

Commit fc4ca8d

Browse files
committed
ENH: merge ops taking shape, much testing needed, GH #249
1 parent 07f3914 commit fc4ca8d

File tree

10 files changed

+228
-79
lines changed

10 files changed

+228
-79
lines changed

TODO.rst

-5
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
Join methods todo
2-
-----------------
3-
- Joint factorizer
4-
- NA group handling
5-
61
DONE
72
----
83
- SparseSeries name integration + tests

bench/bench_merge.py

+5-9
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import random
33

44
N = 10000
5-
ngroups = 3
5+
ngroups = 10
66

77
def get_test_data(ngroups=100, n=N):
88
unique_groups = range(ngroups)
@@ -21,16 +21,12 @@ def get_test_data(ngroups=100, n=N):
2121
'data1' : np.random.randn(N),
2222
'data2' : np.random.randn(N)})
2323

24-
df2 = DataFrame({'key1' : [0, 1, 2, 0, 1, 2],
25-
'key2' : [0, 1, 2, 0, 1, 2],
26-
'value' : list('abcdef')})
24+
df2 = DataFrame({'key1' : get_test_data(ngroups=ngroups, n=N//10),
25+
'key2' : get_test_data(ngroups=ngroups//2, n=N//10),
26+
'value' : np.random.randn(N // 10)})
2727

2828

2929
import pandas.tools.merge as merge
3030
reload(merge)
3131

32-
left, right = merge._get_group_keys([df['key1'], df['key2']],
33-
[df2['key1'], df2['key2']])
34-
35-
left, right = merge._get_group_keys([df['key1']], [df2['key1']])
36-
32+
result = merge.merge(df, df2, on='key2')

pandas/core/internals.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -789,20 +789,23 @@ def merge(self, other, lsuffix=None, rsuffix=None):
789789

790790
return BlockManager(consolidated, new_axes)
791791

792-
def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
793-
intersection = self.items.intersection(other.items)
792+
def _maybe_rename_join(self, other, lsuffix, rsuffix, exclude=None,
793+
copydata=True):
794+
to_rename = self.items.intersection(other.items)
795+
if exclude is not None:
796+
to_rename = to_rename - exclude
794797

795-
if len(intersection) > 0:
798+
if len(to_rename) > 0:
796799
if not lsuffix and not rsuffix:
797-
raise Exception('columns overlap: %s' % intersection)
800+
raise Exception('columns overlap: %s' % to_rename)
798801

799802
def lrenamer(x):
800-
if x in intersection:
803+
if x in to_rename:
801804
return '%s%s' % (x, lsuffix)
802805
return x
803806

804807
def rrenamer(x):
805-
if x in intersection:
808+
if x in to_rename:
806809
return '%s%s' % (x, rsuffix)
807810
return x
808811

pandas/src/hashtable.pyx

+5-5
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,7 @@ cdef class PyObjectHashTable:
464464
labels, counts = self.get_labels(values, reverse, 0)
465465
return reverse, labels, counts
466466

467-
cpdef get_labels(self, ndarray[object] values, dict reverse,
467+
cpdef get_labels(self, ndarray[object] values, list uniques,
468468
Py_ssize_t count_prior):
469469
cdef:
470470
Py_ssize_t i, n = len(values)
@@ -488,7 +488,7 @@ cdef class PyObjectHashTable:
488488
else:
489489
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
490490
self.table.vals[k] = count
491-
reverse[count] = val
491+
uniques.append(val)
492492
labels[i] = count
493493
counts[count] = 1
494494
count += 1
@@ -499,19 +499,19 @@ cdef class Factorizer:
499499

500500
cdef public:
501501
PyObjectHashTable table
502-
dict id_table
502+
list uniques
503503
Py_ssize_t count
504504

505505
def __init__(self, size_hint):
    """
    Set up an empty factorizer.

    Parameters
    ----------
    size_hint : passed straight through to the PyObjectHashTable constructor
    """
    # nothing has been factorized yet: no labels handed out, no uniques seen
    self.count = 0
    self.uniques = []
    self.table = PyObjectHashTable(size_hint)
509509

510510
def get_count(self):
    """
    Return the running ``count`` attribute, as last updated by ``factorize``.
    """
    return self.count
512512

513513
def factorize(self, ndarray[object] values):
    """
    Label ``values`` using the shared hash table.

    The table is handed the ``uniques`` list and the current ``count`` so
    that successive calls keep extending the same labelling.

    Returns
    -------
    labels, counts : as produced by ``PyObjectHashTable.get_labels``
    """
    labels, counts = self.table.get_labels(values, self.uniques, self.count)

    # the length of counts becomes the starting count for the next call
    n_groups = len(counts)
    self.count = n_groups
    return labels, counts

pandas/src/join.pyx

+7-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,13 @@ def left_outer_join(ndarray[int32_t] left, ndarray[int32_t] right,
5858
right_pos += rc
5959
position += lc * rc
6060

61-
return left_sorter, left_indexer, right_sorter, right_indexer
61+
res_left = left_sorter.take(left_indexer)
62+
np.putmask(res_left, left_indexer == -1, -1)
63+
64+
res_right = right_sorter.take(right_indexer)
65+
np.putmask(res_right, right_indexer == -1, -1)
66+
67+
return res_left, res_right
6268

6369

6470
def full_outer_join(ndarray[int32_t] left, ndarray[int32_t] right):

pandas/src/sandbox.pyx

+1
Original file line numberDiff line numberDiff line change
@@ -175,4 +175,5 @@ def roll_median(ndarray[float64_t] arg, int win, int minp):
175175
return output
176176

177177
include "hashtable.pyx"
178+
178179
include "join.pyx"

pandas/tools/merge.py

+173-43
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import pandas._tseries as lib
1212
from pandas._sandbox import Factorizer
1313

14-
def merge(left, right, how='inner', cols=None, left_cols=None, right_cols=None,
14+
def merge(left, right, how='left', on=None, left_on=None, right_on=None,
1515
left_index=False, right_index=False, sort=True,
1616
suffixes=('.x', '.y'), copy=True):
1717
"""
@@ -25,17 +25,25 @@ def merge(left, right, how='inner', cols=None, left_cols=None, right_cols=None,
2525
how : {'left', 'right', 'outer', 'inner'}
2626
How to handle the join keys of the two objects. Default: 'left'
2727
whether joining on columns or on the index
28-
* left: use only keys from left frame
29-
* right: use only keys from right frame
30-
* outer: use union of keys from both frames
31-
* inner: use intersection of keys from both frames
32-
cols
33-
left_cols
34-
right_cols
35-
left_index
36-
right_index
37-
sort
38-
suffixes
28+
* left: use only keys from left frame (SQL: left outer join)
29+
* right: use only keys from right frame (SQL: right outer join)
30+
* outer: use union of keys from both frames (SQL: full outer join)
31+
* inner: use intersection of keys from both frames (SQL: inner join)
32+
on : label or list
33+
34+
left_on : label or list
35+
36+
right_on : label or list
37+
38+
left_index : boolean, default True
39+
40+
right_index : boolean, default True
41+
42+
sort : boolean, default True
43+
44+
suffixes : 2-length sequence (tuple, list, ...)
45+
Suffix to apply to overlapping column names in the left and right
46+
side, respectively
3947
copy : boolean, default True
4048
If False, do not copy data unnecessarily
4149
@@ -46,48 +54,153 @@ def merge(left, right, how='inner', cols=None, left_cols=None, right_cols=None,
4654
-------
4755
merged : DataFrame
4856
"""
49-
left_join_keys, right_join_keys = _get_merge_keys(left, right, cols,
50-
left_cols, right_cols,
51-
left_index, right_index)
52-
53-
# max groups = largest possible number of distinct groups
54-
left_key, right_key, max_groups = _get_group_keys(left_join_keys,
55-
right_join_keys)
57+
op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
58+
right_on=right_on, left_index=left_index,
59+
right_index=right_index, sort=sort, suffixes=suffixes,
60+
copy=copy)
61+
return op.get_result()
5662

57-
join_func = _join_functions[how]
58-
left_indexer, right_indexer = join_func(left_key, right_key, max_groups)
59-
new_axis = Index(np.arange(len(left_indexer)))
6063

61-
join_op = _JoinOperation(left, right, new_axis, left_indexer,
62-
right_indexer, axis=1)
63-
result_data = join_op.get_result(copy=copy)
64-
return DataFrame(result_data)
64+
# TODO: shortcuts with MultiIndex labels already computed
65+
# TODO: NA group handling
66+
# TODO: DONE group column names in result
67+
# TODO: transformations??
68+
# TODO: only copy DataFrames when modification necessary
6569

6670
class _MergeOperation(object):
    """
    Perform a database-style merge of two DataFrame objects.

    The constructor only records the merge options. ``get_result`` works out
    the join keys, renames overlapping columns, computes the row indexers
    with the join routine selected by ``how`` and builds the merged frame.
    """

    def __init__(self, left, right, how='inner', on=None,
                 left_on=None, right_on=None,
                 left_index=False, right_index=False, sort=True,
                 suffixes=('.x', '.y'), copy=True):
        self.left = left
        self.right = right
        self.how = how

        # a lone label is wrapped in a list so key specs can be iterated
        self.on = _maybe_make_list(on)
        self.left_on = _maybe_make_list(left_on)
        self.right_on = _maybe_make_list(right_on)

        self.copy = copy
        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

    def get_result(self):
        """
        Run the merge and return the merged DataFrame.

        Note: has side effects. ``_get_merge_keys`` may replace ``self.left``
        and ``self.right`` with modified copies and may fill in
        ``self.left_on`` / ``self.right_on``.
        """
        left_join_keys, right_join_keys, join_names = self._get_merge_keys()

        # this is a bit kludgy
        ldata, rdata = self._get_merge_data(join_names)

        # max groups = largest possible number of distinct groups
        left_key, right_key, max_groups = \
            _get_group_keys(left_join_keys, right_join_keys, sort=self.sort)

        join_func = _join_functions[self.how]
        left_indexer, right_indexer = join_func(left_key.astype('i4'),
                                                right_key.astype('i4'),
                                                max_groups)

        new_axis = Index(np.arange(len(left_indexer)))

        join_op = _JoinOperation(ldata, rdata, new_axis,
                                 left_indexer, right_indexer, axis=1)

        result_data = join_op.get_result(copy=self.copy)
        return DataFrame(result_data)

    def _get_merge_data(self, join_names):
        """
        Handles overlapping column names etc.

        Columns present on both sides are renamed with ``self.suffixes``,
        except for the names listed in ``join_names``.
        """
        ldata, rdata = self.left._data, self.right._data
        lsuf, rsuf = self.suffixes

        # basically by construction the column names are stored in
        # left_on...for now
        ldata, rdata = ldata._maybe_rename_join(rdata, lsuf, rsuf,
                                                exclude=join_names,
                                                copydata=False)

        return ldata, rdata

    def _get_merge_keys(self):
        """
        Work out the key arrays to join on for each side.

        Note: has side effects (copy/delete key columns). ``self.left`` and
        ``self.right`` may be replaced by modified copies, and
        ``self.left_on`` / ``self.right_on`` are filled in when the keys are
        derived from ``on`` or from the common columns.

        Returns
        -------
        left_keys, right_keys, join_names

        Raises
        ------
        Exception
            If ``on`` is combined with ``left_on`` / ``right_on``, or if keys
            are given for one side only and the other side is not joined on
            its index.
        """
        left_keys = []
        right_keys = []
        join_names = []

        need_set_names = False
        pop_right = False

        if self.on is not None:
            if self.left_on is not None or self.right_on is not None:
                raise Exception('Can only pass on OR left_on and '
                                'right_on')
            self.left_on = self.right_on = self.on
            pop_right = True
        elif self.left_on is None and self.right_on is None:
            if self.left_index and self.right_index:
                left_keys.append(self.left.index.values)
                right_keys.append(self.right.index.values)

                need_set_names = True
                # XXX something better than this
                join_names.append('join_key')
            elif self.left_index:
                raise Exception('Must pass right_on or right_index=True')
            elif self.right_index:
                raise Exception('Must pass left_on or left_index=True')
            else:
                # use the common columns
                left_cols = self.left.columns
                common_cols = left_cols.intersection(self.right.columns)
                self.left_on = self.right_on = common_cols
                pop_right = True
        else:
            # Columns were named for at least one side. A side without
            # column keys has to be joined on its index; otherwise the two
            # key lists would silently end up with different lengths.
            if self.left_on is None:
                if not self.left_index:
                    raise Exception('Must pass left_on or left_index=True')
                left_keys.append(self.left.index.values)
            if self.right_on is None:
                if not self.right_index:
                    raise Exception('Must pass right_on or right_index=True')
                right_keys.append(self.right.index.values)

        if self.right_on is not None:
            # this is a touch kludgy, but accomplishes the goal
            if pop_right:
                # the key columns are shared with the left side, so drop
                # them from a copy of the right frame
                right = self.right.copy()
                right_keys.extend([right.pop(k) for k in self.right_on])
                self.right = right
            else:
                # read from self.right: no local copy exists on this path
                right_keys.extend([self.right[k] for k in self.right_on])

        if need_set_names:
            self.left = self.left.copy()
            for i, (lkey, name) in enumerate(zip(left_keys, join_names)):
                self.left.insert(i, name, lkey)

        if self.left_on is not None:
            left_keys.extend([self.left[k] for k in self.left_on])

        return left_keys, right_keys, join_names
202+
203+
def _get_group_keys(left_keys, right_keys, sort=True):
91204
"""
92205
93206
Parameters
@@ -111,9 +224,21 @@ def _get_group_keys(left_keys, right_keys):
111224
llab, _ = rizer.factorize(lk.astype('O'))
112225
rlab, _ = rizer.factorize(rk.astype('O'))
113226

227+
count = rizer.get_count()
228+
229+
if sort:
230+
sorter = Index(rizer.uniques).argsort()
231+
reverse_indexer = np.empty(len(sorter), dtype=np.int32)
232+
reverse_indexer.put(sorter, np.arange(len(sorter)))
233+
234+
llab = reverse_indexer.take(llab)
235+
rlab = reverse_indexer.take(rlab)
236+
237+
# TODO: na handling
238+
114239
left_labels.append(llab)
115240
right_labels.append(rlab)
116-
group_sizes.append(rizer.get_count())
241+
group_sizes.append(count)
117242

118243
left_group_key = get_group_index(left_labels, group_sizes)
119244
right_group_key = get_group_index(right_labels, group_sizes)
@@ -123,6 +248,11 @@ def _get_group_keys(left_keys, right_keys):
123248

124249
import pandas._sandbox as sbx
125250

251+
def _maybe_make_list(obj):
    """
    Wrap a lone value in a one-element list.

    None, tuples and lists are handed back untouched.
    """
    if obj is None or isinstance(obj, (tuple, list)):
        return obj
    return [obj]
255+
126256
def _right_outer_join(x, y, *args):
    """
    Right outer join, done as a left outer join with the operands swapped.

    Parameters
    ----------
    x, y : left and right group keys
    *args : extra positional arguments forwarded unchanged to
        ``sbx.left_outer_join``. This lets the function be called the same
        way as the other join routines.
        NOTE(review): presumably this is ``max_groups``; confirm against the
        ``left_outer_join`` signature.

    Returns
    -------
    left_indexer, right_indexer
    """
    # the swapped call yields (indexer for y, indexer for x); flip them back
    right_indexer, left_indexer = sbx.left_outer_join(y, x, *args)
    return left_indexer, right_indexer

0 commit comments

Comments
 (0)