
Severe performance problems with multi-index joins #14775


Closed
SReicheltPTV opened this issue Nov 30, 2016 · 5 comments
Labels
MultiIndex · Performance (Memory or execution speed) · Reshaping (Concat, Merge/Join, Stack/Unstack, Explode)

Comments

@SReicheltPTV

Code Sample (microbenchmark)

from __future__ import division

import pandas as pd
import numpy as np
import functools
import operator as op
import time


def join_raw(df1, df2, *args, **kwargs):
    start_time = time.time()
    result = df1.join(df2, *args, **kwargs)
    end_time = time.time()
    return result, end_time - start_time


def join_with_workaround(df1, df2, *args, **kwargs):
    index1 = list(df1.index.names)
    index2 = list(df2.index.names)
    index1remove = [level for level in index1 if level not in index2]
    index2remove = [level for level in index2 if level not in index1]

    if index1remove:
        df1 = df1.reset_index(index1remove)
    if index2remove:
        df2 = df2.reset_index(index2remove)

    # The indices should be the same now, except that levels might be in a
    # different order, which causes "join" to fail as well.
    if df1.index.names != df2.index.names:
        df2 = df2.reorder_levels(df1.index.names)

    start_time = time.time()
    result = df1.join(df2, *args, **kwargs)
    end_time = time.time()

    # Add all removed levels to the index of the result.
    if index1remove:
        result.set_index(index1remove, append=True, inplace=True)
        # Make sure the levels are ordered the same way as in df1.
        # Additional levels only appearing in df2 will be added at the
        # end, though.
        if result.index.names != index1:
            result = result.reorder_levels(index1)
    if index2remove:
        result.set_index(index2remove, append=True, inplace=True)

    return result, end_time - start_time


def join_with_reset_index(df1, df2, *args, **kwargs):
    index1 = list(df1.index.names)
    index2 = list(df2.index.names)
    index2common = [level for level in index2 if level in index1]
    index2remove = [level for level in index2 if level not in index1]

    df1 = df1.reset_index()
    if index2remove:
        df2 = df2.reset_index(index2remove)

    start_time = time.time()
    result = df1.join(df2, on=index2common, *args, **kwargs)
    end_time = time.time()

    result.set_index(index1 + index2remove, inplace=True)

    return result, end_time - start_time


def join_via_concat(df1, df2, how='left', lsuffix='', rsuffix=''):
    index1 = list(df1.index.names)
    index2 = list(df2.index.names)
    index1remove = [level for level in index1 if level not in index2]
    index2remove = [level for level in index2 if level not in index1]

    # Remove levels from both indices as necessary.
    if index1remove:
        df1 = df1.reset_index(index1remove)
    if index2remove:
        df2 = df2.reset_index(index2remove)

    # The indices should be the same now, except that levels might be in a
    # different order.
    if df1.index.names != df2.index.names:
        df2 = df2.reorder_levels(df1.index.names)

    if lsuffix:
        df1 = df1.rename(columns=lambda c: c + lsuffix if c in df2.columns.values else c, copy=False)
    if rsuffix:
        df2 = df2.rename(columns=lambda c: c + rsuffix if c in df1.columns.values else c, copy=False)
    overlapping_columns = [column for column in df1.columns.values if column in df2.columns.values]
    if overlapping_columns:
        raise Exception('Overlapping column names when joining data frames: %s (specify lsuffix or rsuffix)' % overlapping_columns)

    # Use concat to simulate join.
    start_time = time.time()
    if how == 'left':
        result = pd.concat([df1, df2], axis=1, join_axes=[df1.index])
    elif how == 'right':
        result = pd.concat([df1, df2], axis=1, join_axes=[df2.index])
    else:
        result = pd.concat([df1, df2], axis=1, join=how)
    end_time = time.time()

    # Add all removed levels to the index of the result.
    if index1remove:
        result.set_index(index1remove, append=True, inplace=True)
        # Make sure the levels are ordered the same way as in df1.
        # Additional levels only appearing in df2 will be added at the
        # end, though.
        if result.index.names != index1:
            result = result.reorder_levels(index1)
    if index2remove:
        result.set_index(index2remove, append=True, inplace=True)

    return result, end_time - start_time


result_row_count = 10000000

random_numbers = np.random.randn(result_row_count)

def create_dataframe(index_columns, index_sizes):
    index_ranges = [list(range(size)) for size in index_sizes]
    index = pd.MultiIndex.from_product(index_ranges, names=index_columns)
    row_count = functools.reduce(op.mul, index_sizes, 1)
    return pd.DataFrame(random_numbers[:row_count], columns=['V'], index=index)


def test_join_fn(fn, index1, index2, index_sizes):
    df1 = create_dataframe(index1, [index_sizes[c] for c in index1])
    df2 = create_dataframe(index2, [index_sizes[c] for c in index2])
    try:
        start_time = time.time()
        result, main_time_1 = fn(df1, df2, lsuffix='l', rsuffix='r')
        mid_time = time.time()
        result, main_time_2 = fn(df1, df2, lsuffix='l', rsuffix='r')
        end_time = time.time()
        print("    %.3f (%.3f), then %.3f (%.3f)" % (mid_time - start_time, main_time_1, end_time - mid_time, main_time_2))
        return True
    except Exception:
        print("    ERROR")
        return False


def test_joins(*args):
    print("  Raw join on index:")
    if not test_join_fn(join_raw, *args):
        print("  Join on index with workaround (partial reset_index):")
        test_join_fn(join_with_workaround, *args)
    print("  Join after reset_index:")
    test_join_fn(join_with_reset_index, *args)
    print("  Join via concat:")
    test_join_fn(join_via_concat, *args)
    print("")


def test_index_combination(index1, index2):
    result_index = index1 + [c for c in index2 if c not in index1]
    print("%s + %s -> %s:" % (index1, index2, result_index))
    remaining_row_count = result_row_count
    index_sizes = {}
    for c in result_index[:-1]:
        if remaining_row_count >= 10:
            index_sizes[c] = 10
            remaining_row_count = (remaining_row_count + 9) // 10
        else:
            index_sizes[c] = 1
    index_sizes[result_index[-1]] = remaining_row_count
    test_joins(index1, index2, index_sizes)


test_index_combination(list('A'), list('A'))
test_index_combination(list('AB'), list('AB'))
test_index_combination(list('ABC'), list('ABC'))
test_index_combination(list('ABCDE'), list('ABCDE'))
test_index_combination(list('ABCDEFG'), list('ABCDEFG'))
test_index_combination(list('A'), list('AB'))
test_index_combination(list('B'), list('AB'))
test_index_combination(list('A'), list('ABC'))
test_index_combination(list('B'), list('ABC'))
test_index_combination(list('AB'), list('A'))
test_index_combination(list('AB'), list('B'))
test_index_combination(list('ABC'), list('A'))
test_index_combination(list('ABC'), list('B'))
test_index_combination(list('ABC'), list('AB'))
test_index_combination(list('ABCDEF'), list('ABCDE'))
test_index_combination(list('AB'), list('ABC'))
test_index_combination(list('ABCDE'), list('ABCDEF'))
test_index_combination(list('ABC'), list('ABD'))
test_index_combination(list('ABC'), list('BC'))
test_index_combination(list('BC'), list('ABC'))
test_index_combination(list('ABC'), list('AC'))
test_index_combination(list('AC'), list('ABC'))
test_index_combination(list('AB'), list('BA'))
test_index_combination(list('ABCEF'), list('ACDBF'))

Problem description

We are getting severe performance bottlenecks in our code due to join operations on multiple index levels.
To analyze and illustrate the problem, we have created a microbenchmark that executes joins on data frames with different indices and one data column. The benchmark is constructed so that the resulting data frame always has the same number of rows (10,000,000 in this case, though we also have to deal with larger data sets). We would therefore expect all join operations to take a similar amount of time, perhaps roughly linear in the number of index levels. Instead, we see up to 200x longer execution times in some cases (see output).

The benchmark works as follows:

  • First, we create two data frames with unique, hierarchically sorted multi-indices. The data frames are built so that after they are joined, all index levels except the last have 10 values each, and the last level has however many values are necessary to get 10000000 rows.
  • Then these data frames are joined twice in a row, measuring the time each join operation takes.
  • Moreover, we perform the same join in several different ways that (as far as we can tell) yield the same result but are sometimes faster, and/or work in cases where the native join operation fails with a "currently not implemented" exception. In particular, using concat instead of join sometimes helps a lot. Unfortunately, even the fastest method is often too slow for our purposes.
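
A minimal sketch of the concat-as-join equivalence used in the benchmark (toy frames, not from the benchmark itself; `reindex` stands in for `join_axes`, which later pandas versions removed):

```python
import numpy as np
import pandas as pd

# Toy frames sharing the same MultiIndex structure (illustrative only).
idx = pd.MultiIndex.from_product([range(3), range(2)], names=['A', 'B'])
left = pd.DataFrame({'x': np.arange(6)}, index=idx)
right = pd.DataFrame({'y': np.arange(6) * 10}, index=idx).iloc[:4]

# A left join on the index...
via_join = left.join(right, how='left')
# ...matches column-wise concat after aligning right to left's index.
via_concat = pd.concat([left, right.reindex(left.index)], axis=1)
assert via_join.equals(via_concat)
```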

The numbers show the times of both joins executed on the same data frames. Interestingly, the first join often takes much longer than the second. For the alternative implementations, the numbers in parentheses show the seconds spent in the main inner operation (either join or concat).

We would be grateful for answers to any of the following questions:

  • What is causing these large differences in performance?
  • Is there anything we can do on our side to speed things up (other than selecting the fastest option in each case, which we already do)?
  • Are there any plans to improve the performance of joins in pandas in general?
  • Could we assist with this, e.g. with a bug bounty?

Output

test_join_results.txt

Expected Output

We would expect all execution times to be less than a second, usually much less.

Output of pd.show_versions()

INSTALLED VERSIONS
------------------
commit: None
python: 3.5.2.final.0
python-bits: 64
OS: Windows
OS-release: 7
machine: AMD64
processor: Intel64 Family 6 Model 58 Stepping 9, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None

pandas: 0.19.1
nose: None
pip: 9.0.1
setuptools: 20.10.1
Cython: None
numpy: 1.11.2
scipy: None
statsmodels: None
xarray: None
IPython: None
sphinx: None
patsy: None
dateutil: 2.6.0
pytz: 2016.7
blosc: None
bottleneck: None
tables: None
numexpr: None
matplotlib: None
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: None
httplib2: None
apiclient: None
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: None
boto: None
pandas_datareader: None

@jreback
Contributor

jreback commented Nov 30, 2016

After you reset, you need to .sort_index() to ensure lexsorted-ness. I believe a performance warning is issued if that is not the case (though I don't really remember).
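
A minimal sketch of the point (toy frames with illustrative names, not from the issue): dropping a level via reset_index can leave the remaining index unsorted, and sort_index restores sortedness before joining.

```python
import numpy as np
import pandas as pd

# Deliberately unsorted two-level index (illustrative only).
idx = pd.MultiIndex.from_product([[2, 1], ['b', 'a']], names=['A', 'B'])
df = pd.DataFrame({'V': np.arange(4)}, index=idx)

partial = df.reset_index('B')        # keep only level 'A' in the index
assert not partial.index.is_monotonic_increasing

restored = partial.sort_index()      # restore sortedness before joining
assert restored.index.is_monotonic_increasing
```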

@chris-b1
Contributor

So, I think one problem is here:

If the levels are the same, we try and do a 'simple' join, but it involves populating a huge and expensive hash table of tuples, xref #13904. I think that could be easily fixed (the join case - not the general issue). This would explain why the joins on non-identical levels are faster.

@chris-b1 chris-b1 added the MultiIndex and Performance (Memory or execution speed) labels Nov 30, 2016
@SReicheltPTV
Author

after you reset, you need to .sort_index() to assure lexsorted-ness.

Thanks for this info; that sounds very plausible. (I didn't get any performance warnings, though.)
Here are the updated results:
test_join_results_sorted.txt
Unfortunately, it doesn't usually seem to help much, and the sorting itself takes even longer than the inefficient join.

If the levels are the same, we try and do a 'simple' join, but it involves populating a huge and expensive hash table of tuples, xref #13904.

Ah, so I suppose the large differences between subsequent tries are also due to the calculation of this hash table.

I think that could be easily fixed (the join case - not the general issue). This would explain why the joins on non-identical levels are faster.

Wouldn't we then run into the problem that "merging with more than one level overlap is not implemented", though?

@kongscn

kongscn commented Nov 7, 2017

Any update? I'm facing the same problem with multi-index joins on 0.21.0, with about 6M rows in each dataframe.
@jreback is right about sorting. But by using pd.merge I can get roughly the same performance as joining two dataframes with sorted indexes (even slightly better).
prun confirms that a significant amount of time is spent on index-ordering tasks, which are not really inherent to the join itself. Though this is a one-time job and future operations may benefit, the cost is just too high for a join.

1755 function calls (1726 primitive calls) in 16.401 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2   10.127    5.063   10.127    5.063 {built-in method numpy.core.multiarray.lexsort}
        2    4.343    2.172    4.343    2.172 {pandas._libs.lib.fast_zip}
        1    0.837    0.837    0.837    0.837 {pandas._libs.join.left_join_indexer_unique_object}

pd.merge takes only about 2 seconds for the same job, comparable to joins on sorted indexes.
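
A hedged sketch of the pd.merge alternative described above: reset both indices, merge on the shared level names, then rebuild the index. Frame sizes and column names here are illustrative, not the 6M-row data from the comment.

```python
import numpy as np
import pandas as pd

# Two frames sharing a two-level index (illustrative shapes).
n = 1000
idx = pd.MultiIndex.from_product([range(10), range(100)], names=['A', 'B'])
left = pd.DataFrame({'x': np.random.randn(n)}, index=idx)
right = pd.DataFrame({'y': np.random.randn(n)}, index=idx)

# Merge on columns instead of joining on the MultiIndex,
# then restore the index on the result.
merged = pd.merge(left.reset_index(), right.reset_index(),
                  on=['A', 'B'], how='left').set_index(['A', 'B'])
assert len(merged) == n
assert list(merged.columns) == ['x', 'y']
```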

@mroeschke mroeschke added the Reshaping (Concat, Merge/Join, Stack/Unstack, Explode) label May 2, 2021
@mroeschke
Member

Looks like these benchmarks use a fairly old version of pandas. We can reopen if a simplified benchmark shows this is still an issue in a modern version of pandas.
