DOC: existence docs and benchmarks.

chancyk · chancyk · commit a7abb0e37209 · 2014-06-11T01:24:45.000-05:00
diff --git a/bench/bench_existence.py b/bench/bench_existence.py
@@ -0,0 +1,140 @@
+from timeit import Timer
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+
+
+class Benchmarks(object):
+    """Namespace of factories that build the callables timed by ``run_bench``.
+
+    Every function takes ``look_for`` (the number of rows in the DataFrame
+    being filtered) and ``look_in`` (the size of the collection that
+    membership is tested against). It performs the setup once and returns a
+    zero-argument ``time_this`` closure holding only the statement to time.
+
+    The functions take no ``self``: ``run_bench`` fetches them from
+    ``Benchmarks.__dict__`` and calls them as plain functions. The
+    ``__main__`` block only collects names that start with ``time_``.
+    """
+
+    # Not collected by the ``__main__`` block: the name does not start with
+    # ``time_``. Tests each index value with ``in`` against ``range(look_in)``.
+    def removed_time_py_list(look_for, look_in):
+        l = range(look_in)
+        df = pd.DataFrame(range(look_for))
+
+        def time_this():
+            df[[x in l for x in df.index.values]]
+            
+        return time_this
+
+    def time_py_dict(look_for, look_in):
+        """List comprehension testing each index value against a dict."""
+        l = range(look_in)
+        l_dict = dict(zip(l, l))
+        df = pd.DataFrame(range(look_for))
+
+        def time_this():
+            df[[x in l_dict for x in df.index.values]]
+            
+        return time_this
+        
+        
+    def time_isin_list(look_for, look_in):
+        """``df.index.isin`` against the result of ``range(look_in)``."""
+        l = range(look_in)
+        df = pd.DataFrame(range(look_for))
+        
+        def time_this():
+            df[df.index.isin(l)]
+            
+        return time_this
+        
+        
+    def time_isin_dict(look_for, look_in):
+        """``df.index.isin`` against a dict keyed by ``range(look_in)``."""
+        l = range(look_in)
+        l_dict = dict(zip(l, l))
+        df = pd.DataFrame(range(look_for))
+        
+        def time_this():
+            df[df.index.isin(l_dict)]
+            
+        return time_this
+        
+        
+    def time_isin_series(look_for, look_in):
+        """``df.index.isin`` against the index of a Series."""
+        l = range(look_in)
+        l_series = pd.Series(l)
+        df = pd.DataFrame(range(look_for))
+        
+        def time_this():
+            # Note: the Series' index is passed here, not the Series itself.
+            df[df.index.isin(l_series.index)]
+            
+        return time_this
+        
+        
+    def time_join(look_for, look_in):
+        """Inner ``join`` of the frame with a named Series."""
+        l = range(look_in)
+        l_series = pd.Series(l)
+        # The Series is given a name before being joined.
+        l_series.name = 'data'
+        df = pd.DataFrame(range(look_for))
+        
+        def time_this():
+            df.join(l_series, how='inner')
+            
+        return time_this
+        
+    def time_query_eqeq(look_for, look_in):
+        """``df.query`` comparing the index to a Series with ``==``."""
+        l = range(look_in)
+        s = pd.Series(l)
+        s.name = 'data'
+        df = pd.DataFrame(range(look_for))
+        
+        def time_this():
+            # NOTE(review): presumably rebound so that ``@l_series`` resolves
+            # as a local of the frame calling ``query`` -- confirm.
+            l_series = s
+            df.query('index == @l_series')
+    
+        return time_this
+        
+    def time_query_in(look_for, look_in):
+        """``df.query`` testing the index against a Series with ``in``."""
+        l = range(look_in)
+        s = pd.Series(l)
+        s.name = 'data'
+        df = pd.DataFrame(range(look_for))
+        
+        def time_this():
+            # NOTE(review): presumably rebound so that ``@l_series`` resolves
+            # as a local of the frame calling ``query`` -- confirm.
+            l_series = s
+            df.query('index in @l_series')
+    
+        return time_this
+        
+    
+def run_bench(to_time, repeat, look_in, num_look_for_rows, y_limit, filename):
+    func_results = []
+    plt.figure()
+    
+    for time_func_name in to_time:
+        plot_results = []
+        for look_for in num_look_for_rows:
+            func = Benchmarks.__dict__[time_func_name](look_for, look_in)
+            t = Timer(func)
+            elapsed = t.timeit(number=repeat) / repeat
+            name = time_func_name.replace('time_', '')
+            func_results.append((name, look_for, look_in, elapsed))
+            plot_results.append(elapsed)
+        plt.plot(num_look_for_rows, plot_results, label=name)
+        
+    plt.axes().set_xscale('log')
+    x1,x2,y1,y2 = plt.axis()
+    plt.axis((x1, x2, 0, y_limit))
+        
+    plt.legend(loc=2, prop={'size':8})
+    plt.title('Look in %s Rows' % look_in)
+    plt.xlabel('Look For X Rows')
+    plt.ylabel('Time(s)')
+    plt.savefig(filename)
+    plt.clf()
+            
+
+if __name__ == '__main__':
+
+    pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
+    static_path = os.path.join(pandas_dir, 'doc', 'source', '_static')
+    
+    join = lambda p: os.path.join(static_path, p)
+    
+    to_time = [key for key in Benchmarks.__dict__ if key.startswith('time_')]
+        
+    num_look_for_rows = [10 * 2**i for i in range(1, 21)]
+        
+    filename = join('existence-perf-small.png')
+    run_bench(to_time, 10, 5000, num_look_for_rows[0:len(num_look_for_rows)/2], 0.004, filename)
+    
+    filename = join('existence-perf-large.png')
+    run_bench(to_time, 3, 5000000, num_look_for_rows[len(num_look_for_rows)/2:], 10, filename)
+    
diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
@@ -668,3 +668,180 @@ In general, :meth:`DataFrame.query`/:func:`pandas.eval` will
 evaluate the subexpressions that *can* be evaluated by ``numexpr`` and those
 that must be evaluated in Python space transparently to the user. This is done
 by inferring the result type of an expression from its arguments and operators.
+
+Existence (IsIn, Inner Join, Dict/Hash, Query)
+----------------------------------------------------
+
+There are a number of different ways to test for existence (membership) using pandas.
+Each of the following methods can be used to perform an existence test. The comments in
+the code blocks correspond to the legend entries in the plots further down.
+
+
+:meth:`DataFrame.isin`
+
+.. code-block:: python
+
+    # isin_list
+    df[df.index.isin(lst)]
+    # isin_dict
+    df[df.index.isin(dct)]
+    # isin_series
+    df[df.index.isin(series)] 
+    
+    
+    
+:meth:`DataFrame.query`
+
+.. code-block:: python
+    
+    # query_in list
+    df.query('index in @lst')
+    # query_in Series
+    df.query('index in @series')
+    # query_in dict
+    df.query('index in @dct')
+    
+    # query_eqeq list
+    df.query('index == @lst')
+    # query_eqeq Series
+    df.query('index == @series')
+    
+    # dict actually throws an error with '=='
+    
+    
+    
+:meth:`Series.apply`
+
+.. code-block:: python
+    
+    df[df.ID.apply(lambda x: x in lst)]
+    
+    
+    
+:meth:`DataFrame.join`
+
+.. code-block:: python
+
+    # join
+    df.join(series, how='inner')
+    
+    # py_dict: this can actually be fast for small DataFrames
+    df[[x in dct for x in df.index]]
+    
+    # isin_series, query_eqeq Series, query_in Series, py_dict,
+    # join and isin_list are included in the plots below.
+    
+
+As seen below, using a ``Series`` is generally better than using pure Python data
+structures for anything larger than very small datasets (around 1000 records).
+The two fastest methods are ``query('col == @series')`` and ``join(series)``:
+
+.. code-block:: python
+
+    lst = range(1000000)
+    series = Series(lst, name='data')
+
+    df = DataFrame(lst, columns=['ID'])
+    
+    df.query('index == @series')
+    # 10 loops, best of 3: 82.9 ms per loop
+    
+    df.join(series, how='inner')
+    # 100 loops, best of 3: 19.2 ms per loop
+    
+list vs Series:
+
+.. code-block:: python
+
+    df[df.index.isin(lst)]
+    # 1 loops, best of 3: 1.06 s per loop
+    
+    df[df.index.isin(series)]
+    # 1 loops, best of 3: 477 ms per loop
+
+Using ``df.index`` versus a column of ``df`` doesn't make a difference here:
+
+.. code-block:: python
+
+    df[df.ID.isin(series)]
+    # 1 loops, best of 3: 474 ms per loop
+    
+    df[df.index.isin(series)]
+    # 1 loops, best of 3: 475 ms per loop
+
+The ``query`` 'in' and '==' syntaxes have the same performance as ``isin``, except
+when '==' is used with a ``Series``:
+
+.. code-block:: python
+
+    df.query('index in @lst')
+    # 1 loops, best of 3: 1.04 s per loop
+    
+    df.query('index in @series')
+    # 1 loops, best of 3: 451 ms per loop
+    
+    df.query('index == @lst')
+    # 1 loops, best of 3: 1.03 s per loop
+    
+'==' is actually quite a bit faster than 'in' when used against a ``Series``,
+but not as fast as ``join``:
+
+.. code-block:: python
+
+    df.query('index == @series')
+    # 10 loops, best of 3: 80.5 ms per loop
+
+For the best ``join`` performance, the data being matched must be the index of both the
+``DataFrame`` and the ``Series``. The ``Series`` must also have a ``name``. ``join`` defaults
+to a left join, so we need to specify ``how='inner'`` for an existence test.
+
+.. code-block:: python
+
+    df.join(series, how='inner')
+    # 100 loops, best of 3: 19.7 ms per loop
+
+Smaller datasets:
+
+.. code-block:: python
+
+    df = DataFrame([1,2,3,4], columns=['ID'])
+    lst = range(10000)
+    dct = dict(zip(lst, lst))
+    series = Series(lst, name='data')
+
+    df.join(series, how='inner')
+    # 1000 loops, best of 3: 866 us per loop
+    
+    df[df.ID.isin(dct)]
+    # 1000 loops, best of 3: 809 us per loop
+    
+    df[df.ID.isin(lst)]
+    # 1000 loops, best of 3: 853 us per loop
+    
+    df[df.ID.isin(series)]
+    # 100 loops, best of 3: 2.22 ms per loop
+
+It's actually faster to use ``apply`` or a list comprehension for these small cases:
+
+.. code-block:: python
+
+    df[[x in dct for x in df.ID]]
+    # 1000 loops, best of 3: 266 us per loop
+    
+    df[df.ID.apply(lambda x: x in dct)]
+    # 1000 loops, best of 3: 364 us per loop
+
+    
+Here is a visualization of some of the benchmarks above. You can see that, except for
+very small datasets, ``isin(Series)``, ``join(Series)``, and ``query('col == Series')``
+quickly become faster than the pure Python data structures.
+
+.. image:: _static/existence-perf-small.png
+
+However, ``isin(Series)`` still presents fairly poor exponential performance, whereas ``join`` is quite
+fast for large datasets. There is some overhead involved in ensuring that your data is the index
+in both your left and right datasets, but that time should be clearly outweighed by the gains of
+the join itself. For extremely large datasets, you may start bumping into memory limits, since ``join``
+does not perform any disk chunking.
+
+.. image:: _static/existence-perf-large.png