ENH: add expression evaluation functionality via eval #4164

Merged: 16 commits, Sep 16, 2013
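
This PR adds expression evaluation to pandas: a top-level pandas.eval function and the DataFrame.eval and DataFrame.query methods, each accepting an engine keyword ('numexpr' or 'python'). The benchmarks and documentation changes below exercise that API. A minimal sketch of the intended usage (the frame and column names here are illustrative, not taken from the PR):

import numpy as np
import pandas as pd

# illustrative frame with the same column names the benchmarks below use
df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc'))

# evaluate an arithmetic expression over the columns
df.eval('a + b * (c ** 2 + b ** 2 - a)')

# filter rows with a boolean expression (chained comparisons are supported)
df.query('a <= b <= c ** 2 + b ** 2 - a and b > c')

# the top-level function accepts the same kind of expression
pd.eval('df.a + df.b', engine='numexpr')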
53 changes: 53 additions & 0 deletions bench/bench_with_subset.R
@@ -0,0 +1,53 @@
library(microbenchmark)
library(data.table)


data.frame.subset.bench <- function (n=1e7, times=30) {
    df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n))
    print(microbenchmark(subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c),
                         times=times))
}


# data.table allows something very similar to query with an expression
# but we have chained comparisons AND we're faster BOO YAH!
data.table.subset.expression.bench <- function (n=1e7, times=30) {
    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
    print(microbenchmark(dt[, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c],
                         times=times))
}


# compare against subset with data.table for good measure
data.table.subset.bench <- function (n=1e7, times=30) {
    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
    print(microbenchmark(subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c),
                         times=times))
}


data.frame.with.bench <- function (n=1e7, times=30) {
    df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n))

    print(microbenchmark(with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3),
                         times=times))
}


data.table.with.bench <- function (n=1e7, times=30) {
    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
    print(microbenchmark(with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3),
                         times=times))
}


bench <- function () {
    data.frame.subset.bench()
    data.table.subset.expression.bench()
    data.table.subset.bench()
    data.frame.with.bench()
    data.table.with.bench()
}


bench()
116 changes: 116 additions & 0 deletions bench/bench_with_subset.py
@@ -0,0 +1,116 @@
#!/usr/bin/env python

"""
Microbenchmarks for comparison with R's "with" and "subset" functions
"""

from __future__ import print_function
import numpy as np
from numpy import array
from timeit import repeat as timeit
from pandas.compat import range, zip
from pandas import DataFrame


setup_common = """from pandas import DataFrame
from numpy.random import randn
df = DataFrame(randn(%d, 3), columns=list('abc'))
%s"""


setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"


def bench_with(n, times=10, repeat=3, engine='numexpr'):
    return np.array(timeit('df.eval(s, engine=%r)' % engine,
                           setup=setup_common % (n, setup_with),
                           repeat=repeat, number=times)) / times


setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"


def bench_subset(n, times=10, repeat=3, engine='numexpr'):
    return np.array(timeit('df.query(s, engine=%r)' % engine,
                           setup=setup_common % (n, setup_subset),
                           repeat=repeat, number=times)) / times


def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False):
    r = np.logspace(mn, mx, num=num).round().astype(int)

    ev = DataFrame(np.empty((num, len(engines))), columns=engines)
    qu = ev.copy(deep=True)

    ev['size'] = qu['size'] = r

    for engine in engines:
        for i, n in enumerate(r):
            if verbose:
                print('engine: %r, i == %d' % (engine, i))
            ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine)
            qu.loc[i, engine] = bench_subset(n, times=1, repeat=1,
                                             engine=engine)

    return ev, qu


def plot_perf(df, engines, title, filename=None):
    from matplotlib.pyplot import figure, rc

    try:
        from mpltools import style
    except ImportError:
        pass
    else:
        style.use('ggplot')

    rc('text', usetex=True)

    fig = figure(figsize=(4, 3), dpi=100)
    ax = fig.add_subplot(111)

    for engine in engines:
        ax.plot(df.size, df[engine], label=engine, lw=2)

    ax.set_xlabel('Number of Rows')
    ax.set_ylabel('Time (s)')
    ax.set_title(title)
    ax.legend(loc='best')
    ax.tick_params(top=False, right=False)

    fig.tight_layout()

    if filename is not None:
        fig.savefig(filename)


if __name__ == '__main__':
    import os
    import pandas as pd

    pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
    static_path = os.path.join(pandas_dir, 'doc', 'source', '_static')

    join = lambda p: os.path.join(static_path, p)

    fn = join('eval-query-perf-data.h5')

    engines = 'python', 'numexpr'

    if not os.path.exists(fn):
        ev, qu = bench(verbose=True)
        ev.to_hdf(fn, 'eval')
        qu.to_hdf(fn, 'query')
    else:
        ev = pd.read_hdf(fn, 'eval')
        qu = pd.read_hdf(fn, 'query')

    plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png'))
    plot_perf(qu, engines, 'DataFrame.query()',
              filename=join('query-perf.png'))

    plot_perf(ev[ev.size <= 50000], engines, 'DataFrame.eval()',
              filename=join('eval-perf-small.png'))
    plot_perf(qu[qu.size <= 100000], engines, 'DataFrame.query()',
              filename=join('query-perf-small.png'))
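
The __main__ block above regenerates the performance figures used in the docs and caches the timings in doc/source/_static/eval-query-perf-data.h5. For a quick interactive run that does not touch that cache, the helpers can be driven directly; a rough sketch, assuming it is executed from the bench/ directory (the small mn/mx/num values are only to keep the run short):

from bench_with_subset import bench, plot_perf

# a handful of frame sizes between 10**2 and 10**5 rows, both engines
ev, qu = bench(mn=2, mx=5, num=5, verbose=True)

# plot without saving to disk (omit filename)
plot_perf(ev, ('python', 'numexpr'), 'DataFrame.eval()')
plot_perf(qu, ('python', 'numexpr'), 'DataFrame.query()')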
Binary file added doc/source/_static/eval-perf-small.png
Binary file added doc/source/_static/eval-perf.png
Binary file added doc/source/_static/query-perf-small.png
Binary file added doc/source/_static/query-perf.png
13 changes: 13 additions & 0 deletions doc/source/api.rst
@@ -155,6 +155,17 @@ Top-level dealing with datetimes
to_datetime


Top-level evaluation
~~~~~~~~~~~~~~~~~~~~

.. currentmodule:: pandas

.. autosummary::
:toctree: generated/

eval


Standard moving window functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -452,6 +463,7 @@ Indexing, iteration
DataFrame.tail
DataFrame.xs
DataFrame.isin
DataFrame.query

Binary operator functions
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -502,6 +514,7 @@ Computations / Descriptive Stats
DataFrame.cumsum
DataFrame.describe
DataFrame.diff
DataFrame.eval
DataFrame.kurt
DataFrame.mad
DataFrame.max
95 changes: 80 additions & 15 deletions doc/source/comparison_with_r.rst
@@ -1,28 +1,87 @@
.. currentmodule:: pandas
.. _compare_with_r:

*******************************
Comparison with R / R libraries
*******************************

Since ``pandas`` aims to provide a lot of the data manipulation and analysis
functionality that people use `R <http://www.r-project.org/>`__ for, this page
was started to provide a more detailed look at the `R language
<http://en.wikipedia.org/wiki/R_(programming_language)>`__ and its many third
party libraries as they relate to ``pandas``. In comparisons with R and CRAN
libraries, we care about the following things:

- **Functionality / flexibility**: what can/cannot be done with each tool
- **Performance**: how fast are operations. Hard numbers/benchmarks are
  preferable
- **Ease-of-use**: Is one tool easier/harder to use (you may have to be
  the judge of this, given side-by-side code comparisons)

This page is also here to offer a bit of a translation guide for users of these
R packages.

Base R
------

|subset|_
~~~~~~~~~~

.. versionadded:: 0.13

The :meth:`~pandas.DataFrame.query` method is similar to the base R ``subset``
function. In R you might want to get the rows of a ``data.frame`` where one
column's values are less than another column's values:

.. code-block:: r

df <- data.frame(a=rnorm(10), b=rnorm(10))
subset(df, a <= b)
df[df$a <= df$b,] # note the comma

In ``pandas``, there are a few ways to perform subsetting. You can use
:meth:`~pandas.DataFrame.query` or pass an expression as if it were an
index/slice as well as standard boolean indexing:

.. ipython:: python

from pandas import DataFrame
from numpy.random import randn

df = DataFrame({'a': randn(10), 'b': randn(10)})
df.query('a <= b')
df[df.a <= df.b]
df.loc[df.a <= df.b]

For more details and examples see :ref:`the query documentation
<indexing.query>`.

data.frame
----------

|with|_
~~~~~~~~

.. versionadded:: 0.13

An expression using a data.frame called ``df`` in R with the columns ``a`` and
``b`` would be evaluated using ``with`` like so:

.. code-block:: r

df <- data.frame(a=rnorm(10), b=rnorm(10))
with(df, a + b)
df$a + df$b # same as the previous expression

In ``pandas`` the equivalent expression, using the
:meth:`~pandas.DataFrame.eval` method, would be:

.. ipython:: python

df = DataFrame({'a': randn(10), 'b': randn(10)})
df.eval('a + b')
df.a + df.b # same as the previous expression

In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than
evaluation in pure Python. For more details and examples see :ref:`the eval
documentation <enhancingperf.eval>`.
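
One way to see the difference on your own machine is to time the same
expression both ways; a rough sketch using the standard library ``timeit``
(the frame size is illustrative, and the speedup depends on the data and on
whether ``numexpr`` is installed):

.. code-block:: python

   from timeit import repeat

   setup = ("from pandas import DataFrame; from numpy.random import randn; "
            "df = DataFrame(randn(100000, 3), columns=list('abc'))")

   # plain Python / NumPy evaluation of the expression
   min(repeat("df['a'] + df['b'] * (df['c'] ** 2 + df['b'] ** 2 - df['a'])",
              setup=setup, number=100))

   # the same expression through DataFrame.eval
   min(repeat("df.eval('a + b * (c ** 2 + b ** 2 - a)')",
              setup=setup, number=100))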

zoo
---
@@ -36,3 +95,9 @@ plyr
reshape / reshape2
------------------


.. |with| replace:: ``with``
.. _with: http://finzi.psych.upenn.edu/R/library/base/html/with.html

.. |subset| replace:: ``subset``
.. _subset: http://finzi.psych.upenn.edu/R/library/base/html/subset.html