Skip to content

Commit 7800290

Browse files
committed
Merge pull request #8172 from behzadnouri/lj-cnt-sort
counting sort instead of np.argsort in left outer join
2 parents 3ecb760 + f4768cc commit 7800290

File tree

2 files changed

+15
-2
lines changed

2 files changed

+15
-2
lines changed

pandas/src/join.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -106,15 +106,15 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
106106
if not sort: # if not asked to sort, revert to original order
107107
if len(left) == len(left_indexer):
108108
# no multiple matches for any row on the left
109-
# this is a short-cut to avoid np.argsort;
109+
# this is a short-cut to avoid groupsort_indexer
110110
# otherwise, the `else` path also works in this case
111111
if left_sorter.dtype != np.int_:
112112
left_sorter = left_sorter.astype(np.int_)
113113

114114
rev = np.empty(len(left), dtype=np.int_)
115115
rev.put(left_sorter, np.arange(len(left)))
116116
else:
117-
rev = np.argsort(left_indexer)
117+
rev, _ = groupsort_indexer(left_indexer, len(left))
118118

119119
right_indexer = right_indexer.take(rev)
120120
left_indexer = left_indexer.take(rev)

vb_suite/join_merge.py

+13
Original file line numberDiff line numberDiff line change
@@ -237,3 +237,16 @@ def sample(values, k):
237237
join_non_unique_equal = Benchmark('fracofday * temp[fracofday.index]', setup,
238238
start_date=datetime(2013, 1, 1))
239239

240+
241+
setup = common_setup + '''
242+
np.random.seed(2718281)
243+
n = 50000
244+
245+
left = DataFrame(np.random.randint(1, n/500, (n, 2)),
246+
columns=['jim', 'joe'])
247+
248+
right = DataFrame(np.random.randint(1, n/500, (n, 2)),
249+
columns=['jolie', 'jolia']).set_index('jolie')
250+
'''
251+
252+
left_outer_join_index = Benchmark("left.join(right, on='jim')", setup)

0 commit comments

Comments
 (0)