pandas-dev · chancyk · Jun 9, 2014 · Jun 11, 2014 · Jun 24, 2014 · Sep 15, 2014
diff --git a/bench/bench_existence.py b/bench/bench_existence.py
@@ -0,0 +1,141 @@
+from timeit import Timer
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+
+
+class Benchmarks(object):
+
+    def removed_time_py_list(look_for, look_in):
+        l = range(look_in)
+        df = pd.DataFrame(range(look_for))
+
+        def time_this():
+            df[[x in l for x in df.index.values]]
+
+        return time_this
+
+    def time_py_dict(look_for, look_in):
+        l = range(look_in)
+        l_dict = dict(zip(l, l))
+        df = pd.DataFrame(range(look_for))
+
+        def time_this():
+            df[[x in l_dict for x in df.index.values]]
+
+        return time_this
+
+
+    def time_isin_list(look_for, look_in):
+        l = range(look_in)
+        df = pd.DataFrame(range(look_for))
+
+        def time_this():
+            df[df.index.isin(l)]
+
+        return time_this
+
+
+    def time_isin_dict(look_for, look_in):
+        l = range(look_in)
+        l_dict = dict(zip(l, l))
+        df = pd.DataFrame(range(look_for))
+
+        def time_this():
+            df[df.index.isin(l_dict)]
+
+        return time_this
+
+
+    def time_isin_series(look_for, look_in):
+        l = range(look_in)
+        l_series = pd.Series(l)
+        df = pd.DataFrame(range(look_for))
+
+        def time_this():
+            df[df.index.isin(l_series.index)]
+
+        return time_this
+
+
+    def time_join(look_for, look_in):
+        l = range(look_in)
+        l_series = pd.Series(l)
+        l_series.name = 'data'
+        df = pd.DataFrame(range(look_for))
+
+        def time_this():
+            df.join(l_series, how='inner')
+
+        return time_this
+
+    # Removed. This functionality might be a bug in query('.. == ..').
+    # def time_query_eqeq(look_for, look_in):
+        # l = range(look_in)
+        # s = pd.Series(l)
+        # s.name = 'data'
+        # df = pd.DataFrame(range(look_for))
+
+        # def time_this():
+            # l_series = s
+            # df.query('index == @l_series')
+
+        # return time_this
+
+    def time_query_in(look_for, look_in):
+        l = range(look_in)
+        s = pd.Series(l)
+        s.name = 'data'
+        df = pd.DataFrame(range(look_for))
+
+        def time_this():
+            l_series = s
+            df.query('index in @l_series')
+
+        return time_this
+
+
+def run_bench(to_time, repeat, look_in, num_look_for_rows, y_limit, filename):
+    func_results = []
+    plt.figure()
+
+    for time_func_name in to_time:
+        plot_results = []
+        for look_for in num_look_for_rows:
+            func = Benchmarks.__dict__[time_func_name](look_for, look_in)
+            t = Timer(func)
+            elapsed = t.timeit(number=repeat) / repeat
+            name = time_func_name.replace('time_', '')
+            func_results.append((name, look_for, look_in, elapsed))
+            plot_results.append(elapsed)
+        plt.plot(num_look_for_rows, plot_results, label=name)
+
+    plt.axes().set_xscale('log')
+    x1,x2,y1,y2 = plt.axis()
+    plt.axis((x1, x2, 0, y_limit))
+
+    plt.legend(loc=2, prop={'size':8})
+    plt.title('Look in %s Rows' % look_in)
+    plt.xlabel('Look For X Rows')
+    plt.ylabel('Time(s)')
+    plt.savefig(filename)
+    plt.clf()
+
+
+if __name__ == '__main__':
+
+    pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
+    static_path = os.path.join(pandas_dir, 'doc', 'source', '_static')
+
+    join = lambda p: os.path.join(static_path, p)
+
+    to_time = [key for key in Benchmarks.__dict__ if key.startswith('time_')]
+
+    num_look_for_rows = [10 * 2**i for i in range(1, 21)]
+
+    filename = join('existence-perf-small.png')
+    run_bench(to_time, 10, 5000, num_look_for_rows[0:len(num_look_for_rows)/2], 0.004, filename)
+
+    filename = join('existence-perf-large.png')
+    run_bench(to_time, 3, 5000000, num_look_for_rows[len(num_look_for_rows)/2:], 10, filename)
+
diff --git a/doc/source/_static/existence-perf-large.png b/doc/source/_static/existence-perf-large.png
diff --git a/doc/source/_static/existence-perf-small.png b/doc/source/_static/existence-perf-small.png
diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
@@ -668,3 +668,171 @@ In general, :meth:`DataFrame.query`/:func:`pandas.eval` will
 evaluate the subexpressions that *can* be evaluated by ``numexpr`` and those
 that must be evaluated in Python space transparently to the user. This is done
 by inferring the result type of an expression from its arguments and operators.
+
+Existence (IsIn, Inner Join, Dict/Hash, Query)
+----------------------------------------------------
+
+An existence test checks whether an item is present in a collection of items. In the
+case of a DataFrame, we're testing each value of a column (or of the index) for
+existence in another collection of items.
+
+There are a number of different ways to test for existence using pandas and the 
+following methods are a few of those. The comments correspond to the legend
+in the plots further down.
+
+
+:meth:`Index.isin`
+
+.. code-block:: python
+
+    # isin_list
+    df[df.index.isin(lst)]
+    # isin_dict
+    df[df.index.isin(dct)]
+    # isin_series
+    df[df.index.isin(series)] 
+
+
+
+:meth:`DataFrame.query`
+
+.. code-block:: python
+
+    # The '@' symbol is used with `query` to reference local variables. Names
+    # without '@' will reference the DataFrame's columns or index.
+
+    # query_in list
+    df.query('index in @lst')
+    # query_in Series
+    df.query('index in @series')
+
+    # A list can be used with `query('.. == ..')` to test for existence
+    # but other data structures such as the `pandas.Series` have
+    # a different behaviour.
+
+    df.query('index == @lst')
+
+
+:meth:`Series.apply`
+
+.. code-block:: python
+
+    # ``Index`` objects have no ``apply`` method, so test a column instead
+    df[df.ID.apply(lambda x: x in lst)]
+
+
+:meth:`DataFrame.join`
+
+.. code-block:: python
+
+    # join (the right-hand side must be a named Series, not a list)
+    df.join(series, how='inner')
+
+    # py_dict: this can actually be fast for small DataFrames
+    df[[x in dct for x in df.index]]
+
+    # py_dict, isin_list, isin_dict, isin_series, join and query_in (Series)
+    # are included in the plots below.
+
+
+As seen below, using a ``Series`` is generally better than using pure Python data
+structures for anything larger than very small datasets of around 1000 records.
+The fastest is ``join(series)``:
+
+.. code-block:: python
+
+    lst = range(1000000)
+    series = Series(lst, name='data')
+
+    df = DataFrame(lst, columns=['ID'])
+
+    df.join(series, how='inner')
+    # 100 loops, best of 3: 19.2 ms per loop
+
+list vs Series:
+
+.. code-block:: python
+
+    df[df.index.isin(lst)]
+    # 1 loops, best of 3: 1.06 s per loop
+
+    df[df.index.isin(series)]
+    # 1 loops, best of 3: 477 ms per loop
+
+df.index vs df.column doesn't make a difference here:
+
+.. code-block:: python
+
+    df[df.ID.isin(series)]
+    # 1 loops, best of 3: 474 ms per loop
+
+    df[df.index.isin(series)]
+    # 1 loops, best of 3: 475 ms per loop
+
+The ``query`` 'in' syntax has the same performance as ``isin``.
+
+.. code-block:: python
+
+    df.query('index in @lst')
+    # 1 loops, best of 3: 1.04 s per loop
+
+    df.query('index in @series')
+    # 1 loops, best of 3: 451 ms per loop
+
+    df.query('index == @lst')
+    # 1 loops, best of 3: 1.03 s per loop
+
+
+For ``join``, the data must be the index in the ``DataFrame`` and the index in the ``Series``
+for the best performance. The ``Series`` must also have a ``name``. ``join`` defaults to a
+left join so we need to specify 'inner' for existence.
+
+.. code-block:: python
+
+    df.join(series, how='inner')
+    # 100 loops, best of 3: 19.7 ms per loop
+
+Smaller datasets:
+
+.. code-block:: python
+
+    df = DataFrame([1,2,3,4], columns=['ID'])
+    lst = range(10000)
+    dct = dict(zip(lst, lst))
+    series = Series(lst, name='data')
+
+    df.join(series, how='inner')
+    # 1000 loops, best of 3: 866 us per loop
+
+    df[df.ID.isin(dct)]
+    # 1000 loops, best of 3: 809 us per loop
+
+    df[df.ID.isin(lst)]
+    # 1000 loops, best of 3: 853 us per loop
+
+    df[df.ID.isin(series)]
+    # 100 loops, best of 3: 2.22 ms per loop
+
+It's actually faster to use ``apply`` or a list comprehension for these small cases.
+
+.. code-block:: python
+
+    df[[x in dct for x in df.ID]]
+    # 1000 loops, best of 3: 266 us per loop
+
+    df[df.ID.apply(lambda x: x in dct)]
+    # 1000 loops, best of 3: 364 us per loop
+
+
+Here is a visualization of some of the benchmarks above. You can see that, except for
+very small datasets, ``isin(Series)`` and ``join(Series)`` quickly become faster than the
+pure Python data structures.
+
+.. image:: _static/existence-perf-small.png
+
+However, the run time of ``isin(Series)`` still grows steeply with the number of rows (note the
+logarithmic x axis), whereas ``join`` remains quite fast for large datasets. There is some overhead
+involved in making your data the index of both the left and right datasets, but that time should be
+clearly outweighed by the gains of the join itself. For extremely large datasets, you may start
+running into memory limits since ``join`` does not perform any disk chunking.
+
+.. image:: _static/existence-perf-large.png