pandas-dev
diff --git a/‎bench/bench_with_subset.py
+100-21 b/‎bench/bench_with_subset.py
+100-21
diff --git a/‎doc/source/_static/eval-perf-small.png
24.7 KB b/‎doc/source/_static/eval-perf-small.png
24.7 KB
diff --git a/‎doc/source/_static/eval-perf.png
18.2 KB b/‎doc/source/_static/eval-perf.png
18.2 KB
diff --git a/‎doc/source/_static/query-perf-small.png
25.1 KB b/‎doc/source/_static/query-perf-small.png
25.1 KB
diff --git a/‎doc/source/_static/query-perf.png
19.9 KB b/‎doc/source/_static/query-perf.png
19.9 KB
diff --git a/‎doc/source/enhancingperf.rst
+9-3 b/‎doc/source/enhancingperf.rst
+9-3
diff --git a/‎doc/source/indexing.rst
+21-2 b/‎doc/source/indexing.rst
+21-2
diff --git a/‎doc/source/io.rst
+1-1 b/‎doc/source/io.rst
+1-1
diff --git a/‎pandas/computation/pytables.py
+7-1 b/‎pandas/computation/pytables.py
+7-1
diff --git a/‎pandas/io/tests/test_pytables.py
+11-21 b/‎pandas/io/tests/test_pytables.py
+11-21
@@ -5,33 +5,112 @@
 """
 
 from __future__ import print_function
-from timeit import timeit
+import numpy as np
+from numpy import array
+from timeit import repeat as timeit
+from pandas.compat import range, zip
+from pandas import DataFrame
 
 
-def bench_with(n=1e7, times=10, repeat=3):
-    setup = "from pandas import DataFrame\n"
-    setup += "from numpy.random import randn\n"
-    setup += "df = DataFrame(randn(%d, 3), columns=list('abc'))\n" % n
-    setup += "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
-    print('DataFrame.eval:')
-    print(timeit('df.eval(s)', setup=setup, repeat=repeat, number=times))
+setup_common = """from pandas import DataFrame
+from numpy.random import randn
+df = DataFrame(randn(%d, 3), columns=list('abc'))
+%s"""  # %d: number of rows; %s: extra setup line that defines `s`
 
 
-def bench_subset(n=1e7, times=10, repeat=3):
-    setup = "from pandas import DataFrame\n"
-    setup += "from numpy.random import randn\n"
-    setup += "df = DataFrame(randn(%d, 3), columns=list('abc'))\n" % n
-    setup += "s = 'a <= b <= (c ** 2 + b ** 2 - a) and b > c'"
-    print('DataFrame.query:')
-    print(timeit('df.query(s)', setup=setup, repeat=repeat, number=times))
-    print('DataFrame.__getitem__:')
-    print(timeit('df[s]', setup=setup, repeat=repeat, number=times))
+setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"  # eval expr
 
 
-def bench():
-    bench_with()
-    bench_subset()
+def bench_with(n, times=10, repeat=3, engine='numexpr'):  # time df.eval(s)
+    return np.array(timeit('df.eval(s, engine=%r)' % engine,  # timeit.repeat
+                           setup=setup_common % (n, setup_with),  # n-row df
+                           repeat=repeat, number=times)) / times  # per call
+
+
+setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"  # query expr
+
+
+def bench_subset(n, times=10, repeat=3, engine='numexpr'):  # time df.query(s)
+    return np.array(timeit('df.query(s, engine=%r)' % engine,  # timeit.repeat
+                           setup=setup_common % (n, setup_subset),  # n rows
+                           repeat=repeat, number=times)) / times  # per call
+
+
+def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False):
+    r = np.logspace(mn, mx, num=num).round().astype(int)  # log-spaced sizes
+
+    ev = DataFrame(np.empty((num, len(engines))), columns=engines)
+    qu = ev.copy(deep=True)  # same layout as ev, filled with query timings
+
+    ev['size'] = qu['size'] = r  # row count used for each timing row
+
+    for engine in engines:  # one timing column per engine
+        for i, n in enumerate(r):
+            if verbose:
+                print('engine: %r, i == %d' % (engine, i))
+            ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine)
+            qu.loc[i, engine] = bench_subset(n, times=1, repeat=1,
+                                             engine=engine)
+
+    return ev, qu  # (eval timings, query timings), one row per frame size
+
+
+def plot_perf(df, engines, title, filename=None):  # one line per engine
+    from matplotlib.pyplot import figure, rc
+
+    try:
+        from mpltools import style
+    except ImportError:
+        pass  # mpltools is optional; keep matplotlib's current style
+    else:
+        style.use('ggplot')
+
+    rc('text', usetex=True)
+
+    fig = figure(figsize=(4, 3), dpi=100)
+    ax = fig.add_subplot(111)
+
+    for engine in engines:  # x: 'size' column; df.size is an element count
+        ax.plot(df['size'], df[engine], label=engine, lw=2)
+
+    ax.set_xlabel('Number of Rows')
+    ax.set_ylabel('Time (s)')
+    ax.set_title(title)
+    ax.legend(loc='best')
+    ax.tick_params(top=False, right=False)
+
+    fig.tight_layout()
+
+    if filename is not None:  # only write a file when a path was given
+        fig.savefig(filename)
 
 
 if __name__ == '__main__':
-    bench()
+    import os
+    import pandas as pd
+
+    pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
+    static_path = os.path.join(pandas_dir, 'doc', 'source', '_static')
+
+    join = lambda p: os.path.join(static_path, p)
+
+    fn = join('eval-query-perf-data.h5')
+
+    engines = 'python', 'numexpr'
+
+    if not os.path.exists(fn):  # benchmark once, then reuse the stored data
+        ev, qu = bench(verbose=True)
+        ev.to_hdf(fn, 'eval')
+        qu.to_hdf(fn, 'query')
+    else:
+        ev = pd.read_hdf(fn, 'eval')
+        qu = pd.read_hdf(fn, 'query')
+
+    plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png'))
+    plot_perf(qu, engines, 'DataFrame.query()',
+              filename=join('query-perf.png'))
+
+    plot_perf(ev[ev['size'] <= 50000], engines, 'DataFrame.eval()',
+              filename=join('eval-perf-small.png'))  # ev['size'], not ev.size
+    plot_perf(qu[qu['size'] <= 100000], engines, 'DataFrame.query()',
+              filename=join('query-perf-small.png'))  # filter on the column
@@ -526,7 +526,13 @@ different engines.
 .. image:: _static/eval-perf.png
 
 
-Note that operations with smallish objects (around 15,000 rows) are faster
-using plain Python:
+.. note::
+
+   Operations with smallish objects (around 15k-20k rows) are faster using
+   plain Python:
+
+       .. image:: _static/eval-perf-small.png
+
 
-.. image:: _static/eval-perf-intersect.png
+This plot was created using a ``DataFrame`` with 3 columns each containing
+floating point values generated using ``numpy.random.randn()``.
@@ -1190,12 +1190,12 @@ The ``in`` and ``not in`` operators
    df['a in b']
 
    # How you'd do it in pure Python
-   df[df.b.isin(df.a)]
+   df[df.a.isin(df.b)]
 
    df['a not in b']
 
    # pure Python
-   df[~df.b.isin(df.a)]
+   df[~df.a.isin(df.b)]
 
 
 You can, of course, combine this with other expressions for very succinct
@@ -1288,6 +1288,25 @@ Of course, expressions can be arbitrarily complex too
    del old_d
 
 
+Performance of ``DataFrame.query()``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for
+large frames:
+
+.. image:: _static/query-perf.png
+
+.. note::
+
+   You will only see the performance benefits of using the ``numexpr`` engine
+   with ``DataFrame.query()`` if your frame has more than approximately 50,000
+   rows:
+
+      .. image:: _static/query-perf-small.png
+
+This plot was created using a ``DataFrame`` with 3 columns each containing
+floating point values generated using ``numpy.random.randn()``.
+
 .. _indexing.class:
 
 Index objects
 
@@ -2109,7 +2109,7 @@ specified in the format: ``<float>(<unit>)``, where float may be signed (and fra
    dftd['C'] = dftd['A']-dftd['B']
    dftd
    store.append('dftd',dftd,data_columns=True)
-   store.select('dftd',Term("C","<","-3.5D"))
+   store.select('dftd',"C<'-3.5D'")
 
 Indexing
 ~~~~~~~~
 
@@ -14,7 +14,7 @@
 from pandas.computation.ops import is_term
 from pandas.computation.expr import BaseExprVisitor
 from pandas.computation.common import _ensure_decoded
-
+from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type
 
 class Scope(expr.Scope):
     __slots__ = 'globals', 'locals', 'queryables'
@@ -79,6 +79,9 @@ def __init__(self, op, lhs, rhs, queryables, encoding):
         self.filter = None
         self.condition = None
 
+    def _disallow_scalar_only_bool_ops(self):
+        pass  # no-op; presumably overrides a base-class check (confirm)
+
     def prune(self, klass):
 
         def pr(left, right):
@@ -177,6 +180,9 @@ def stringify(value):
         elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u('date'):
             v = time.mktime(v.timetuple())
             return TermValue(v, pd.Timestamp(v), kind)
+        elif kind == u('timedelta64') or kind == u('timedelta'):
+            v = _coerce_scalar_to_timedelta_type(v,unit='s').item()
+            return TermValue(int(v), v, kind)
         elif kind == u('integer'):
             v = int(float(v))
             return TermValue(v, v, kind)
 
@@ -1864,16 +1864,16 @@ def test_append_with_timedelta(self):
             result = store.select('df',Term("C","<",-3*86400))
             assert_frame_equal(result,df.iloc[3:])
 
-            result = store.select('df',Term("C","<",'-3D'))
+            result = store.select('df',"C<'-3D'")
             assert_frame_equal(result,df.iloc[3:])
 
             # a bit hacky here as we don't really deal with the NaT properly
 
-            result = store.select('df',Term("C","<",'-500000s'))
+            result = store.select('df',"C<'-500000s'")
             result = result.dropna(subset=['C'])
             assert_frame_equal(result,df.iloc[6:])
 
-            result = store.select('df',Term("C","<",'-3.5D'))
+            result = store.select('df',"C<'-3.5D'")
             result = result.iloc[1:]
             assert_frame_equal(result,df.iloc[4:])
 
@@ -2039,14 +2039,6 @@ def test_invalid_terms(self):
             self.assertRaises(ValueError, store.select, 'wp', "minor=['A', 'B']")
             self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114']"])
             self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114', '20121114']"])
-
-            # deprecations
-            with tm.assert_produces_warning(expected_warning=DeprecationWarning):
-                Term('index','==')
-
-            with tm.assert_produces_warning(expected_warning=DeprecationWarning):
-                Term('index', '>', 5)
-
             self.assertRaises(TypeError, Term)
 
             # more invalid
@@ -2086,11 +2078,10 @@ def test_terms(self):
             assert_panel_equal(result, expected)
 
             # with deprecation
-            with tm.assert_produces_warning(expected_warning=DeprecationWarning):
-                result = store.select('wp', [Term(
-                    'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")])
-                expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
-                tm.assert_panel_equal(result, expected)
+            result = store.select('wp', [Term(
+                'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")])
+            expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
+            tm.assert_panel_equal(result, expected)
 
             # p4d
             result = store.select('p4d', [Term('major_axis<"20000108"'),
@@ -2147,11 +2138,10 @@ def test_term_compat(self):
                        minor_axis=['A', 'B', 'C', 'D'])
             store.append('wp',wp)
 
-            with tm.assert_produces_warning(expected_warning=DeprecationWarning):
-                result = store.select('wp', [Term('major_axis>20000102'),
-                                                Term('minor_axis', '=', ['A','B']) ])
-                expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']]
-                assert_panel_equal(result, expected)
+            result = store.select('wp', [Term('major_axis>20000102'),
+                                         Term('minor_axis', '=', ['A','B']) ])
+            expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']]
+            assert_panel_equal(result, expected)
 
             store.remove('wp', Term('major_axis>20000103'))
             result = store.select('wp')