
Commit bf1b8ca

Merge pull request #4164 from pydata/eval-3393

ENH: add expression evaluation functionality via eval

2 parents ba9c143 + ab60f4b


46 files changed: +6681, -570 lines (large commit; only a subset of the changed files is shown below)
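Before the file-by-file diff, a minimal sketch of the API this merge introduces may help orient readers. The frame, column names, and expressions below are invented for illustration, and assume a pandas build that includes this change (with numexpr installed for the default engine):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc'))

    # Evaluate an arithmetic expression against the frame's columns.
    df.eval('a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3')

    # Filter rows with a boolean expression; chained comparisons are supported.
    df.query('a <= b <= c ** 2 + b ** 2 - a and b > c')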

bench/bench_with_subset.R (new file, +53 lines)

@@ -0,0 +1,53 @@
library(microbenchmark)
library(data.table)


data.frame.subset.bench <- function (n=1e7, times=30) {
    df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n))
    print(microbenchmark(subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c),
                         times=times))
}


# data.table allows something very similar to query with an expression
# but we have chained comparisons AND we're faster BOO YAH!
data.table.subset.expression.bench <- function (n=1e7, times=30) {
    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
    print(microbenchmark(dt[, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c],
                         times=times))
}


# compare against subset with data.table for good measure
data.table.subset.bench <- function (n=1e7, times=30) {
    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
    print(microbenchmark(subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c),
                         times=times))
}


data.frame.with.bench <- function (n=1e7, times=30) {
    df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n))

    print(microbenchmark(with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3),
                         times=times))
}


data.table.with.bench <- function (n=1e7, times=30) {
    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
    print(microbenchmark(with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3),
                         times=times))
}


bench <- function () {
    data.frame.subset.bench()
    data.table.subset.expression.bench()
    data.table.subset.bench()
    data.frame.with.bench()
    data.table.with.bench()
}


bench()
bench/bench_with_subset.py (new file, +116 lines)

@@ -0,0 +1,116 @@
#!/usr/bin/env python

"""
Microbenchmarks for comparison with R's "with" and "subset" functions
"""

from __future__ import print_function
import numpy as np
from numpy import array
from timeit import repeat as timeit
from pandas.compat import range, zip
from pandas import DataFrame


setup_common = """from pandas import DataFrame
from numpy.random import randn
df = DataFrame(randn(%d, 3), columns=list('abc'))
%s"""


setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"


def bench_with(n, times=10, repeat=3, engine='numexpr'):
    return np.array(timeit('df.eval(s, engine=%r)' % engine,
                           setup=setup_common % (n, setup_with),
                           repeat=repeat, number=times)) / times


setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"


def bench_subset(n, times=10, repeat=3, engine='numexpr'):
    return np.array(timeit('df.query(s, engine=%r)' % engine,
                           setup=setup_common % (n, setup_subset),
                           repeat=repeat, number=times)) / times


def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False):
    r = np.logspace(mn, mx, num=num).round().astype(int)

    ev = DataFrame(np.empty((num, len(engines))), columns=engines)
    qu = ev.copy(deep=True)

    ev['size'] = qu['size'] = r

    for engine in engines:
        for i, n in enumerate(r):
            if verbose:
                print('engine: %r, i == %d' % (engine, i))
            ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine)
            qu.loc[i, engine] = bench_subset(n, times=1, repeat=1,
                                             engine=engine)

    return ev, qu


def plot_perf(df, engines, title, filename=None):
    from matplotlib.pyplot import figure, rc

    try:
        from mpltools import style
    except ImportError:
        pass
    else:
        style.use('ggplot')

    rc('text', usetex=True)

    fig = figure(figsize=(4, 3), dpi=100)
    ax = fig.add_subplot(111)

    for engine in engines:
        ax.plot(df.size, df[engine], label=engine, lw=2)

    ax.set_xlabel('Number of Rows')
    ax.set_ylabel('Time (s)')
    ax.set_title(title)
    ax.legend(loc='best')
    ax.tick_params(top=False, right=False)

    fig.tight_layout()

    if filename is not None:
        fig.savefig(filename)


if __name__ == '__main__':
    import os
    import pandas as pd

    pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
    static_path = os.path.join(pandas_dir, 'doc', 'source', '_static')

    join = lambda p: os.path.join(static_path, p)

    fn = join('eval-query-perf-data.h5')

    engines = 'python', 'numexpr'

    if not os.path.exists(fn):
        ev, qu = bench(verbose=True)
        ev.to_hdf(fn, 'eval')
        qu.to_hdf(fn, 'query')
    else:
        ev = pd.read_hdf(fn, 'eval')
        qu = pd.read_hdf(fn, 'query')

    plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png'))
    plot_perf(qu, engines, 'DataFrame.query()',
              filename=join('query-perf.png'))

    plot_perf(ev[ev.size <= 50000], engines, 'DataFrame.eval()',
              filename=join('eval-perf-small.png'))
    plot_perf(qu[qu.size <= 100000], engines, 'DataFrame.query()',
              filename=join('query-perf-small.png'))
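A quick interactive run of these helpers might look like the sketch below; the sizes and repeat counts are arbitrary, and it assumes the script is importable as bench_with_subset with a contemporaneous pandas and numexpr available:

    # Assumes this script is importable as bench_with_subset and that
    # numexpr is installed for the 'numexpr' engine.
    from bench_with_subset import bench, bench_subset, bench_with

    print(bench_with(100000, times=3, repeat=3, engine='numexpr'))   # seconds per call
    print(bench_subset(100000, times=3, repeat=3, engine='python'))

    # Sweep a few sizes with both engines; returns two DataFrames of timings.
    ev, qu = bench(mn=3, mx=5, num=5, verbose=True)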
Binary files (benchmark plot images) added under doc/source/_static/:

doc/source/_static/eval-perf.png (18.2 KB)
doc/source/_static/query-perf.png (19.9 KB)
(two additional plot images, 24.7 KB and 25.1 KB, whose names are not shown in this view)

doc/source/api.rst (+13 lines)

@@ -155,6 +155,17 @@ Top-level dealing with datetimes
    to_datetime


+Top-level evaluation
+~~~~~~~~~~~~~~~~~~~~
+
+.. currentmodule:: pandas
+
+.. autosummary::
+   :toctree: generated/
+
+   eval
+
+
 Standard moving window functions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -452,6 +463,7 @@ Indexing, iteration
    DataFrame.tail
    DataFrame.xs
    DataFrame.isin
+   DataFrame.query

 Binary operator functions
 ~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -502,6 +514,7 @@ Computations / Descriptive Stats
    DataFrame.cumsum
    DataFrame.describe
    DataFrame.diff
+   DataFrame.eval
    DataFrame.kurt
    DataFrame.mad
    DataFrame.max
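The new "Top-level evaluation" entry documents pandas.eval. A minimal sketch of how it differs from the DataFrame method (the frames below are made up for the example):

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame(np.random.randn(5, 2), columns=['a', 'b'])
    df2 = pd.DataFrame(np.random.randn(5, 2), columns=['a', 'b'])

    # pandas.eval resolves names from the calling scope, so whole frames
    # (not just one frame's columns) can appear in the expression string.
    pd.eval('df1 + df2')     # element-wise sum of the two frames
    pd.eval('df1 < df2')     # element-wise comparison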

doc/source/comparison_with_r.rst (+80, -15 lines)

@@ -1,28 +1,87 @@
 .. currentmodule:: pandas
 .. _compare_with_r:

-*******************************
 Comparison with R / R libraries
 *******************************

-Since pandas aims to provide a lot of the data manipulation and analysis
-functionality that people use R for, this page was started to provide a more
-detailed look at the R language and it's many 3rd party libraries as they
-relate to pandas. In offering comparisons with R and CRAN libraries, we care
-about the following things:
+Since ``pandas`` aims to provide a lot of the data manipulation and analysis
+functionality that people use `R <http://www.r-project.org/>`__ for, this page
+was started to provide a more detailed look at the `R language
+<http://en.wikipedia.org/wiki/R_(programming_language)>`__ and its many third
+party libraries as they relate to ``pandas``. In comparisons with R and CRAN
+libraries, we care about the following things:

-  - **Functionality / flexibility**: what can / cannot be done with each tool
-  - **Performance**: how fast are operations. Hard numbers / benchmarks are
+  - **Functionality / flexibility**: what can/cannot be done with each tool
+  - **Performance**: how fast are operations. Hard numbers/benchmarks are
     preferable
-  - **Ease-of-use**: is one tool easier or harder to use (you may have to be
-    the judge of this given side-by-side code comparisons)
+  - **Ease-of-use**: Is one tool easier/harder to use (you may have to be
+    the judge of this, given side-by-side code comparisons)
+
+This page is also here to offer a bit of a translation guide for users of these
+R packages.
+
+Base R
+------
+
+|subset|_
+~~~~~~~~~~
+
+.. versionadded:: 0.13
+
+The :meth:`~pandas.DataFrame.query` method is similar to the base R ``subset``
+function. In R you might want to get the rows of a ``data.frame`` where one
+column's values are less than another column's values:
+
+.. code-block:: r
+
+   df <- data.frame(a=rnorm(10), b=rnorm(10))
+   subset(df, a <= b)
+   df[df$a <= df$b,]  # note the comma
+
+In ``pandas``, there are a few ways to perform subsetting. You can use
+:meth:`~pandas.DataFrame.query` or pass an expression as if it were an
+index/slice as well as standard boolean indexing:
+
+.. ipython:: python
+
+   from pandas import DataFrame
+   from numpy.random import randn
+
+   df = DataFrame({'a': randn(10), 'b': randn(10)})
+   df.query('a <= b')
+   df[df.a <= df.b]
+   df.loc[df.a <= df.b]

-As I do not have an encyclopedic knowledge of R packages, feel free to suggest
-additional CRAN packages to add to this list. This is also here to offer a big
-of a translation guide for users of these R packages.
+For more details and examples see :ref:`the query documentation
+<indexing.query>`.

-data.frame
-----------
+
+|with|_
+~~~~~~~~
+
+.. versionadded:: 0.13
+
+An expression using a data.frame called ``df`` in R with the columns ``a`` and
+``b`` would be evaluated using ``with`` like so:
+
+.. code-block:: r
+
+   df <- data.frame(a=rnorm(10), b=rnorm(10))
+   with(df, a + b)
+   df$a + df$b  # same as the previous expression
+
+In ``pandas`` the equivalent expression, using the
+:meth:`~pandas.DataFrame.eval` method, would be:
+
+.. ipython:: python
+
+   df = DataFrame({'a': randn(10), 'b': randn(10)})
+   df.eval('a + b')
+   df.a + df.b  # same as the previous expression
+
+In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than
+evaluation in pure Python. For more details and examples see :ref:`the eval
+documentation <enhancingperf.eval>`.

 zoo
 ---

@@ -36,3 +95,9 @@ plyr
 reshape / reshape2
 ------------------

+
+.. |with| replace:: ``with``
+.. _with: http://finzi.psych.upenn.edu/R/library/base/html/with.html
+
+.. |subset| replace:: ``subset``
+.. _subset: http://finzi.psych.upenn.edu/R/library/base/html/subset.html
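As a small sanity check of the equivalences the new documentation section claims (the frame below is arbitrary), the different spellings can be compared directly:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)})

    # The three subsetting spellings from the docs select the same rows.
    q = df.query('a <= b')
    assert q.equals(df[df.a <= df.b])
    assert q.equals(df.loc[df.a <= df.b])

    # eval('a + b') matches the explicit column arithmetic.
    assert np.allclose(df.eval('a + b'), df.a + df.b)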
