diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx
index 4c32aa902d64d..f8eef55a86361 100644
--- a/pandas/src/join.pyx
+++ b/pandas/src/join.pyx
@@ -106,7 +106,7 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
     if not sort:  # if not asked to sort, revert to original order
         if len(left) == len(left_indexer):
             # no multiple matches for any row on the left
-            # this is a short-cut to avoid np.argsort;
+            # this is a short-cut to avoid groupsort_indexer
             # otherwise, the `else` path also works in this case
             if left_sorter.dtype != np.int_:
                 left_sorter = left_sorter.astype(np.int_)
@@ -114,7 +114,7 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
             rev = np.empty(len(left), dtype=np.int_)
             rev.put(left_sorter, np.arange(len(left)))
         else:
-            rev = np.argsort(left_indexer)
+            rev, _ = groupsort_indexer(left_indexer, len(left))
 
         right_indexer = right_indexer.take(rev)
         left_indexer = left_indexer.take(rev)
diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py
index f91f45f661af0..eb0608f12a8cb 100644
--- a/vb_suite/join_merge.py
+++ b/vb_suite/join_merge.py
@@ -237,3 +237,19 @@ def sample(values, k):
 
 join_non_unique_equal = Benchmark('fracofday * temp[fracofday.index]', setup,
                                   start_date=datetime(2013, 1, 1))
+
+# Join benchmark with heavily duplicated keys on both sides: 50000 rows
+# drawn from fewer than 100 distinct values, so each left row matches many
+# right rows (presumably the multiple-match path of left_outer_join).
+setup = common_setup + '''
+np.random.seed(2718281)
+n = 50000
+
+left = DataFrame(np.random.randint(1, n // 500, (n, 2)),
+                 columns=['jim', 'joe'])
+
+right = DataFrame(np.random.randint(1, n // 500, (n, 2)),
+                  columns=['jolie', 'jolia']).set_index('jolie')
+'''
+
+left_outer_join_index = Benchmark("left.join(right, on='jim')", setup)