Skip to content

Commit 3cb301e

Browse files
committed
ENH: add sort option to DataFrame.join + vbench, GH #731
1 parent 5d55410 commit 3cb301e

File tree

6 files changed

+82
-35
lines changed

6 files changed

+82
-35
lines changed

RELEASE.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ pandas 0.7.0
8888
- Add ``isin`` method to Index objects, works just like ``Series.isin`` (GH
8989
#657)
9090
- Implement array interface on Panel so that ufuncs work (re: #740)
91+
- Add ``sort`` option to ``DataFrame.join`` (GH #731)
9192

9293
**API Changes**
9394

pandas/core/frame.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3000,7 +3000,8 @@ def append(self, other, ignore_index=False, verify_integrity=True):
30003000
return concat(to_concat, ignore_index=ignore_index,
30013001
verify_integrity=verify_integrity)
30023002

3003-
def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
3003+
def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
3004+
sort=False):
30043005
"""
30053006
Join columns with other DataFrame either on index or on a key
30063007
column. Efficiently Join multiple DataFrame objects by index at once by
@@ -3028,6 +3029,9 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
30283029
Suffix to use from left frame's overlapping columns
30293030
rsuffix : string
30303031
Suffix to use from right frame's overlapping columns
3032+
sort : boolean, default False
3033+
Order result DataFrame lexicographically by the join key. If False,
3034+
preserves the index order of the calling (left) DataFrame
30313035
30323036
Notes
30333037
-----
@@ -3040,9 +3044,10 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
30403044
"""
30413045
# For SparseDataFrame's benefit
30423046
return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
3043-
rsuffix=rsuffix)
3047+
rsuffix=rsuffix, sort=sort)
30443048

3045-
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix=''):
3049+
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
3050+
sort=False):
30463051
from pandas.tools.merge import merge, concat
30473052

30483053
if isinstance(other, Series):
@@ -3052,7 +3057,7 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix=''):
30523057
if isinstance(other, DataFrame):
30533058
return merge(self, other, left_on=on, how=how,
30543059
left_index=on is None, right_index=True,
3055-
suffixes=(lsuffix, rsuffix), sort=False)
3060+
suffixes=(lsuffix, rsuffix), sort=sort)
30563061
else:
30573062
if on is not None:
30583063
raise ValueError('Joining multiple DataFrames only supported'

pandas/sparse/frame.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,8 @@ def add_suffix(self, suffix):
589589
f = ('%s' + ('%s' % suffix)).__mod__
590590
return self.rename(columns=f)
591591

592-
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix=''):
592+
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
593+
sort=False):
593594
if on is not None:
594595
raise NotImplementedError
595596
else:

pandas/tools/merge.py

Lines changed: 43 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -112,29 +112,14 @@ def _get_join_info(self):
112112
join_index, left_indexer, right_indexer = \
113113
left_ax.join(right_ax, how=self.how, return_indexers=True)
114114
elif self.right_index and self.how == 'left':
115-
join_index = left_ax
116-
left_indexer = None
117-
118-
if len(self.left_join_keys) > 1:
119-
assert(isinstance(right_ax, MultiIndex) and
120-
len(self.left_join_keys) == right_ax.nlevels)
121-
122-
right_indexer = _get_multiindex_indexer(self.left_join_keys,
123-
right_ax, sort=False)
124-
else:
125-
right_indexer = right_ax.get_indexer(self.left_join_keys[0])
115+
join_index, left_indexer, right_indexer = \
116+
_left_join_on_index(left_ax, right_ax, self.left_join_keys,
117+
sort=self.sort)
126118

127119
elif self.left_index and self.how == 'right':
128-
join_index = right_ax
129-
right_indexer = None
130-
131-
if len(self.right_join_keys) > 1:
132-
assert(isinstance(left_ax, MultiIndex) and
133-
len(self.right_join_keys) == left_ax.nlevels)
134-
left_indexer = _get_multiindex_indexer(self.right_join_keys,
135-
left_ax, sort=False)
136-
else:
137-
left_indexer = left_ax.get_indexer(self.right_join_keys[0])
120+
join_index, right_indexer, left_indexer = \
121+
_left_join_on_index(right_ax, left_ax, self.right_join_keys,
122+
sort=self.sort)
138123
else:
139124
# max groups = largest possible number of distinct groups
140125
left_key, right_key, max_groups = self._get_group_keys()
@@ -307,16 +292,16 @@ def _get_keys(frame, on, drop=False):
307292
return frame, keys, names
308293

309294

310-
def _get_multiindex_indexer(join_keys, index, sort=True):
295+
def _get_multiindex_indexer(join_keys, index, sort=False):
311296
shape = []
312297
labels = []
313298
for level, key in zip(index.levels, join_keys):
314299
llab, rlab, count = _factorize_objects(level, key, sort=False)
315300
labels.append(rlab)
316301
shape.append(count)
317302

318-
left_group_key = get_group_index(labels, shape) #.astype('i4')
319-
right_group_key = get_group_index(index.labels, shape) #.astype('i4')
303+
left_group_key = get_group_index(labels, shape)
304+
right_group_key = get_group_index(index.labels, shape)
320305

321306
left_group_key, right_group_key, max_groups = \
322307
_factorize_int64(left_group_key, right_group_key,
@@ -327,17 +312,47 @@ def _get_multiindex_indexer(join_keys, index, sort=True):
327312
right_group_key.astype('i4'),
328313
max_groups, sort=False)
329314

330-
return right_indexer
315+
return left_indexer, right_indexer
316+
317+
def _get_single_indexer(join_key, index, sort=False):
    """
    Compute left-outer-join indexers for one join key against an index

    Parameters
    ----------
    join_key : array-like
        Key values coming from the left-hand side of the join
    index : array-like
        Values being joined on (the right-hand side)
    sort : boolean, default False
        Passed through unchanged to both the factorization step and
        ``lib.left_outer_join``

    Returns
    -------
    (left_indexer, right_indexer) : tuple
        The two indexers produced by ``lib.left_outer_join``
    """
    # Factorize both sides together; the third value is the group count
    # handed on to the join routine
    lkey, rkey, ngroups = _factorize_objects(join_key, index, sort=sort)

    # Labels are cast to 32-bit integers before being handed to the
    # join routine
    lkey32 = lkey.astype('i4')
    rkey32 = rkey.astype('i4')
    lidx, ridx = lib.left_outer_join(lkey32, rkey32, ngroups, sort=sort)

    return lidx, ridx
336325

337326
def _right_outer_join(x, y, max_groups):
    """
    Right outer join expressed through ``lib.left_outer_join``

    The operands are swapped on the way in and the resulting indexers are
    swapped back on the way out, so the return value is ordered as
    (indexer for x, indexer for y).
    """
    # Join with y in the "left" position ...
    y_indexer, x_indexer = lib.left_outer_join(y, x, max_groups)
    # ... and hand the indexers back in the caller's (x, y) order
    return x_indexer, y_indexer
340329

330+
def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
331+
join_index = left_ax
332+
left_indexer = None
333+
334+
if len(join_keys) > 1:
335+
assert(isinstance(right_ax, MultiIndex) and
336+
len(join_keys) == right_ax.nlevels)
337+
338+
left_tmp, right_indexer = \
339+
_get_multiindex_indexer(join_keys, right_ax,
340+
sort=sort)
341+
if sort:
342+
left_indexer = left_tmp
343+
join_index = left_ax.take(left_indexer)
344+
else:
345+
jkey = join_keys[0]
346+
if sort:
347+
left_indexer, right_indexer = \
348+
_get_single_indexer(jkey, right_ax, sort=sort)
349+
join_index = left_ax.take(left_indexer)
350+
else:
351+
right_indexer = right_ax.get_indexer(jkey)
352+
353+
return join_index, left_indexer, right_indexer
354+
355+
341356
_join_functions = {
342357
'inner' : lib.inner_join,
343358
'left' : lib.left_outer_join,

pandas/tools/tests/test_merge.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -438,9 +438,15 @@ def test_merge_index_singlekey_right_vs_left(self):
438438
index=['d', 'b', 'c', 'a'])
439439

440440
merged1 = merge(left, right, left_on='key',
441-
right_index=True, how='left')
441+
right_index=True, how='left', sort=False)
442442
merged2 = merge(right, left, right_on='key',
443-
left_index=True, how='right')
443+
left_index=True, how='right', sort=False)
444+
assert_frame_equal(merged1, merged2.ix[:, merged1.columns])
445+
446+
merged1 = merge(left, right, left_on='key',
447+
right_index=True, how='left', sort=True)
448+
merged2 = merge(right, left, right_on='key',
449+
left_index=True, how='right', sort=True)
444450
assert_frame_equal(merged1, merged2.ix[:, merged1.columns])
445451

446452
def test_merge_index_singlekey_inner(self):
@@ -505,6 +511,18 @@ def test_merge_nocopy(self):
505511
merged['d'] = 'peekaboo'
506512
self.assert_((right['d'] == 'peekaboo').all())
507513

514+
def test_join_sort(self):
    # Joining on a column with sort=True should order the rows by the
    # join key while each row keeps its original index label
    left = DataFrame({'key' : ['foo', 'bar', 'baz', 'foo'],
                      'value' : [1, 2, 3, 4]})
    right = DataFrame({'value2' : ['a', 'b', 'c']},
                      index=['bar', 'baz', 'foo'])

    expected_index = [1, 2, 0, 3]
    expected_data = {'key' : ['bar', 'baz', 'foo', 'foo'],
                     'value' : [2, 3, 1, 4],
                     'value2' : ['a', 'b', 'c', 'c']}
    expected = DataFrame(expected_data, index=expected_index)

    result = left.join(right, on='key', sort=True)
    assert_frame_equal(result, expected)
508526

509527
class TestMergeMulti(unittest.TestCase):
510528

vb_suite/join_merge.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
columns=['A', 'B', 'C', 'D'])
4242
df_key2 = DataFrame(np.random.randn(len(level2), 4), index=level2,
4343
columns=['A', 'B', 'C', 'D'])
44+
45+
df_shuf = df.reindex(df.index[shuf])
4446
"""
4547

4648
#----------------------------------------------------------------------
@@ -54,6 +56,11 @@
5456
Benchmark("df.join(df_key2, on='key2')", setup,
5557
name='join_dataframe_index_single_key_bigger')
5658

59+
# Sorted variant of the single-key join benchmark, run on the shuffled
# frame. It carries its own ``name`` so it does not share an identifier
# with the unsorted 'join_dataframe_index_single_key_bigger' benchmark.
join_dataframe_index_single_key_bigger_sort = \
    Benchmark("df_shuf.join(df_key2, on='key2', sort=True)", setup,
              name='join_dataframe_index_single_key_bigger_sort',
              start_date=datetime(2012, 2, 5))
63+
5764
join_dataframe_index_multi = \
5865
Benchmark("df.join(df_multi, on=['key1', 'key2'])", setup,
5966
name='join_dataframe_index_multi',

0 commit comments

Comments
 (0)