Skip to content

Commit 0469fe4

Browse files
jrebackcpcloud
authored andcommitted
BUG: parsing of timedelta selection syntax needed correction
1 parent 0d8997a commit 0469fe4

File tree

10 files changed

+149
-49
lines changed

10 files changed

+149
-49
lines changed

bench/bench_with_subset.py

+100-21
Original file line numberDiff line numberDiff line change
@@ -5,33 +5,112 @@
55
"""
66

77
from __future__ import print_function
8-
from timeit import timeit
8+
import numpy as np
9+
from numpy import array
10+
from timeit import repeat as timeit
11+
from pandas.compat import range, zip
12+
from pandas import DataFrame
913

1014

11-
def bench_with(n=1e7, times=10, repeat=3):
12-
setup = "from pandas import DataFrame\n"
13-
setup += "from numpy.random import randn\n"
14-
setup += "df = DataFrame(randn(%d, 3), columns=list('abc'))\n" % n
15-
setup += "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
16-
print('DataFrame.eval:')
17-
print(timeit('df.eval(s)', setup=setup, repeat=repeat, number=times))
15+
setup_common = """from pandas import DataFrame
16+
from numpy.random import randn
17+
df = DataFrame(randn(%d, 3), columns=list('abc'))
18+
%s"""
1819

1920

20-
def bench_subset(n=1e7, times=10, repeat=3):
21-
setup = "from pandas import DataFrame\n"
22-
setup += "from numpy.random import randn\n"
23-
setup += "df = DataFrame(randn(%d, 3), columns=list('abc'))\n" % n
24-
setup += "s = 'a <= b <= (c ** 2 + b ** 2 - a) and b > c'"
25-
print('DataFrame.query:')
26-
print(timeit('df.query(s)', setup=setup, repeat=repeat, number=times))
27-
print('DataFrame.__getitem__:')
28-
print(timeit('df[s]', setup=setup, repeat=repeat, number=times))
21+
setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
2922

3023

31-
def bench():
32-
bench_with()
33-
bench_subset()
24+
def bench_with(n, times=10, repeat=3, engine='numexpr'):
25+
return np.array(timeit('df.eval(s, engine=%r)' % engine,
26+
setup=setup_common % (n, setup_with),
27+
repeat=repeat, number=times)) / times
28+
29+
30+
setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"
31+
32+
33+
def bench_subset(n, times=10, repeat=3, engine='numexpr'):
34+
return np.array(timeit('df.query(s, engine=%r)' % engine,
35+
setup=setup_common % (n, setup_subset),
36+
repeat=repeat, number=times)) / times
37+
38+
39+
def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False):
40+
r = np.logspace(mn, mx, num=num).round().astype(int)
41+
42+
ev = DataFrame(np.empty((num, len(engines))), columns=engines)
43+
qu = ev.copy(deep=True)
44+
45+
ev['size'] = qu['size'] = r
46+
47+
for engine in engines:
48+
for i, n in enumerate(r):
49+
if verbose:
50+
print('engine: %r, i == %d' % (engine, i))
51+
ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine)
52+
qu.loc[i, engine] = bench_subset(n, times=1, repeat=1,
53+
engine=engine)
54+
55+
return ev, qu
56+
57+
58+
def plot_perf(df, engines, title, filename=None):
59+
from matplotlib.pyplot import figure, rc
60+
61+
try:
62+
from mpltools import style
63+
except ImportError:
64+
pass
65+
else:
66+
style.use('ggplot')
67+
68+
rc('text', usetex=True)
69+
70+
fig = figure(figsize=(4, 3), dpi=100)
71+
ax = fig.add_subplot(111)
72+
73+
for engine in engines:
74+
ax.plot(df.size, df[engine], label=engine, lw=2)
75+
76+
ax.set_xlabel('Number of Rows')
77+
ax.set_ylabel('Time (s)')
78+
ax.set_title(title)
79+
ax.legend(loc='best')
80+
ax.tick_params(top=False, right=False)
81+
82+
fig.tight_layout()
83+
84+
if filename is not None:
85+
fig.savefig(filename)
3486

3587

3688
if __name__ == '__main__':
37-
bench()
89+
import os
90+
import pandas as pd
91+
92+
pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
93+
static_path = os.path.join(pandas_dir, 'doc', 'source', '_static')
94+
95+
join = lambda p: os.path.join(static_path, p)
96+
97+
fn = join('eval-query-perf-data.h5')
98+
99+
engines = 'python', 'numexpr'
100+
101+
if not os.path.exists(fn):
102+
ev, qu = bench(verbose=True)
103+
ev.to_hdf(fn, 'eval')
104+
qu.to_hdf(fn, 'query')
105+
else:
106+
ev = pd.read_hdf(fn, 'eval')
107+
qu = pd.read_hdf(fn, 'query')
108+
109+
plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png'))
110+
plot_perf(qu, engines, 'DataFrame.query()',
111+
filename=join('query-perf.png'))
112+
113+
plot_perf(ev[ev.size <= 50000], engines, 'DataFrame.eval()',
114+
filename=join('eval-perf-small.png'))
115+
plot_perf(qu[qu.size <= 100000], engines, 'DataFrame.query()',
116+
filename=join('query-perf-small.png'))
24.7 KB
Loading

doc/source/_static/eval-perf.png

18.2 KB
Loading
25.1 KB
Loading

doc/source/_static/query-perf.png

19.9 KB
Loading

doc/source/enhancingperf.rst

+9-3
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,13 @@ different engines.
526526
.. image:: _static/eval-perf.png
527527

528528

529-
Note that operations with smallish objects (around 15,000 rows) are faster
530-
using plain Python:
529+
.. note::
530+
531+
Operations with smallish objects (around 15k-20k rows) are faster using
532+
plain Python:
533+
534+
.. image:: _static/eval-perf-small.png
535+
531536

532-
.. image:: _static/eval-perf-intersect.png
537+
This plot was created using a ``DataFrame`` with 3 columns each containing
538+
floating point values generated using ``numpy.random.randn()``.

doc/source/indexing.rst

+21-2
Original file line numberDiff line numberDiff line change
@@ -1190,12 +1190,12 @@ The ``in`` and ``not in`` operators
11901190
df['a in b']
11911191
11921192
# How you'd do it in pure Python
1193-
df[df.b.isin(df.a)]
1193+
df[df.a.isin(df.b)]
11941194
11951195
df['a not in b']
11961196
11971197
# pure Python
1198-
df[~df.b.isin(df.a)]
1198+
df[~df.a.isin(df.b)]
11991199
12001200
12011201
You can, of course, combine this with other expressions for very succinct
@@ -1288,6 +1288,25 @@ Of course, expressions can be arbitrarily complex too
12881288
del old_d
12891289
12901290
1291+
Performance of ``DataFrame.query()``
1292+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1293+
1294+
``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for
1295+
large frames:
1296+
1297+
.. image:: _static/query-perf.png
1298+
1299+
.. note::
1300+
1301+
You will only see the performance benefits of using the ``numexpr`` engine
1302+
with ``DataFrame.query()`` if your frame has more than approximately 50,000
1303+
rows.
1304+
1305+
.. image:: _static/query-perf-small.png
1306+
1307+
This plot was created using a ``DataFrame`` with 3 columns each containing
1308+
floating point values generated using ``numpy.random.randn()``.
1309+
12911310
.. _indexing.class:
12921311

12931312
Index objects

doc/source/io.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -2109,7 +2109,7 @@ specified in the format: ``<float>(<unit>)``, where float may be signed (and fra
21092109
dftd['C'] = dftd['A']-dftd['B']
21102110
dftd
21112111
store.append('dftd',dftd,data_columns=True)
2112-
store.select('dftd',Term("C","<","-3.5D"))
2112+
store.select('dftd',"C<'-3.5D'")
21132113
21142114
Indexing
21152115
~~~~~~~~

pandas/computation/pytables.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from pandas.computation.ops import is_term
1515
from pandas.computation.expr import BaseExprVisitor
1616
from pandas.computation.common import _ensure_decoded
17-
17+
from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type
1818

1919
class Scope(expr.Scope):
2020
__slots__ = 'globals', 'locals', 'queryables'
@@ -79,6 +79,9 @@ def __init__(self, op, lhs, rhs, queryables, encoding):
7979
self.filter = None
8080
self.condition = None
8181

82+
def _disallow_scalar_only_bool_ops(self):
83+
pass
84+
8285
def prune(self, klass):
8386

8487
def pr(left, right):
@@ -177,6 +180,9 @@ def stringify(value):
177180
elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u('date'):
178181
v = time.mktime(v.timetuple())
179182
return TermValue(v, pd.Timestamp(v), kind)
183+
elif kind == u('timedelta64') or kind == u('timedelta'):
184+
v = _coerce_scalar_to_timedelta_type(v,unit='s').item()
185+
return TermValue(int(v), v, kind)
180186
elif kind == u('integer'):
181187
v = int(float(v))
182188
return TermValue(v, v, kind)

pandas/io/tests/test_pytables.py

+11-21
Original file line numberDiff line numberDiff line change
@@ -1864,16 +1864,16 @@ def test_append_with_timedelta(self):
18641864
result = store.select('df',Term("C","<",-3*86400))
18651865
assert_frame_equal(result,df.iloc[3:])
18661866

1867-
result = store.select('df',Term("C","<",'-3D'))
1867+
result = store.select('df',"C<'-3D'")
18681868
assert_frame_equal(result,df.iloc[3:])
18691869

18701870
# a bit hacky here as we don't really deal with the NaT properly
18711871

1872-
result = store.select('df',Term("C","<",'-500000s'))
1872+
result = store.select('df',"C<'-500000s'")
18731873
result = result.dropna(subset=['C'])
18741874
assert_frame_equal(result,df.iloc[6:])
18751875

1876-
result = store.select('df',Term("C","<",'-3.5D'))
1876+
result = store.select('df',"C<'-3.5D'")
18771877
result = result.iloc[1:]
18781878
assert_frame_equal(result,df.iloc[4:])
18791879

@@ -2039,14 +2039,6 @@ def test_invalid_terms(self):
20392039
self.assertRaises(ValueError, store.select, 'wp', "minor=['A', 'B']")
20402040
self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114']"])
20412041
self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114', '20121114']"])
2042-
2043-
# deprecations
2044-
with tm.assert_produces_warning(expected_warning=DeprecationWarning):
2045-
Term('index','==')
2046-
2047-
with tm.assert_produces_warning(expected_warning=DeprecationWarning):
2048-
Term('index', '>', 5)
2049-
20502042
self.assertRaises(TypeError, Term)
20512043

20522044
# more invalid
@@ -2086,11 +2078,10 @@ def test_terms(self):
20862078
assert_panel_equal(result, expected)
20872079

20882080
# with deprecation
2089-
with tm.assert_produces_warning(expected_warning=DeprecationWarning):
2090-
result = store.select('wp', [Term(
2091-
'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")])
2092-
expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
2093-
tm.assert_panel_equal(result, expected)
2081+
result = store.select('wp', [Term(
2082+
'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")])
2083+
expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
2084+
tm.assert_panel_equal(result, expected)
20942085

20952086
# p4d
20962087
result = store.select('p4d', [Term('major_axis<"20000108"'),
@@ -2147,11 +2138,10 @@ def test_term_compat(self):
21472138
minor_axis=['A', 'B', 'C', 'D'])
21482139
store.append('wp',wp)
21492140

2150-
with tm.assert_produces_warning(expected_warning=DeprecationWarning):
2151-
result = store.select('wp', [Term('major_axis>20000102'),
2152-
Term('minor_axis', '=', ['A','B']) ])
2153-
expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']]
2154-
assert_panel_equal(result, expected)
2141+
result = store.select('wp', [Term('major_axis>20000102'),
2142+
Term('minor_axis', '=', ['A','B']) ])
2143+
expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']]
2144+
assert_panel_equal(result, expected)
21552145

21562146
store.remove('wp', Term('major_axis>20000103'))
21572147
result = store.select('wp')

0 commit comments

Comments
 (0)