Skip to content

Commit a544e9e

Browse files
pv authored and jreback committed
Update asv config + fix some broken benchmarks
- Enable platform-dependent config in asv (needs asv git version for it to do something) - Enable wheel cache in asv (in asv git version) - Fix a few easily fixed broken benchmarks Author: Pauli Virtanen <[email protected]> Closes #12563 from pv/asv-update and squashes the following commits: 8cba84d [Pauli Virtanen] DOC: contributing: explain how to tell asv which environment to use in more detail 65db647 [Pauli Virtanen] CLN: more precise asv_bench ignores in .gitignore 448b36a [Pauli Virtanen] PERF: fix easily fixed issues in asv benchmarks e083c01 [Pauli Virtanen] PERF: update asv.conf.json to work with both conda and virtualenv
1 parent bb9b9c5 commit a544e9e

File tree

7 files changed

+131
-199
lines changed

7 files changed

+131
-199
lines changed

.gitignore

+4-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,10 @@ scikits
8383

8484
# Performance Testing #
8585
#######################
86-
asv_bench/
86+
asv_bench/env/
87+
asv_bench/html/
88+
asv_bench/results/
89+
asv_bench/pandas/
8790

8891
# Documentation generated files #
8992
#################################

asv_bench/asv.conf.json

+62-7
Original file line numberDiff line numberDiff line change
@@ -30,24 +30,62 @@
3030

3131
// The matrix of dependencies to test. Each key is the name of a
3232
// package (in PyPI) and the values are version numbers. An empty
33-
// list indicates to just test against the default (latest)
34-
// version.
33+
// list or empty string indicates to just test against the default
34+
// (latest) version. null indicates that the package is to not be
35+
// installed. If the package to be tested is only available from
36+
// PyPi, and the 'environment_type' is conda, then you can preface
37+
// the package name by 'pip+', and the package will be installed via
38+
// pip (with all the conda available packages installed first,
39+
// followed by the pip installed packages).
3540
"matrix": {
36-
// To run against multiple versions, replace with
37-
// "numpy": ["1.7", "1.9"],
3841
"numpy": [],
3942
"Cython": [],
4043
"matplotlib": [],
4144
"sqlalchemy": [],
4245
"scipy": [],
4346
"numexpr": [],
44-
"pytables": [],
47+
"pytables": [null, ""], // platform dependent, see excludes below
48+
"tables": [null, ""],
49+
"libpython": [null, ""],
4550
"openpyxl": [],
4651
"xlsxwriter": [],
4752
"xlrd": [],
4853
"xlwt": []
4954
},
5055

56+
// Combinations of libraries/python versions can be excluded/included
57+
// from the set to test. Each entry is a dictionary containing additional
58+
// key-value pairs to include/exclude.
59+
//
60+
// An exclude entry excludes entries where all values match. The
61+
// values are regexps that should match the whole string.
62+
//
63+
// An include entry adds an environment. Only the packages listed
64+
// are installed. The 'python' key is required. The exclude rules
65+
// do not apply to includes.
66+
//
67+
// In addition to package names, the following keys are available:
68+
//
69+
// - python
70+
// Python version, as in the *pythons* variable above.
71+
// - environment_type
72+
// Environment type, as above.
73+
// - sys_platform
74+
// Platform, as in sys.platform. Possible values for the common
75+
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
76+
"exclude": [
77+
// On conda install pytables, otherwise tables
78+
{"environment_type": "conda", "tables": ""},
79+
{"environment_type": "conda", "pytables": null},
80+
{"environment_type": "virtualenv", "tables": null},
81+
{"environment_type": "virtualenv", "pytables": ""},
82+
// On conda&win32, install libpython
83+
{"sys_platform": "(?!win32).*", "libpython": ""},
84+
{"sys_platform": "win32", "libpython": null},
85+
{"environment_type": "(?!conda).*", "libpython": ""}
86+
],
87+
"include": [],
88+
5189
// The directory (relative to the current directory) that benchmarks are
5290
// stored in. If not provided, defaults to "benchmarks"
5391
// "benchmark_dir": "benchmarks",
@@ -56,7 +94,6 @@
5694
// environments in. If not provided, defaults to "env"
5795
// "env_dir": "env",
5896

59-
6097
// The directory (relative to the current directory) that raw benchmark
6198
// results are stored in. If not provided, defaults to "results".
6299
// "results_dir": "results",
@@ -66,5 +103,23 @@
66103
// "html_dir": "html",
67104

68105
// The number of characters to retain in the commit hashes.
69-
// "hash_length": 8
106+
// "hash_length": 8,
107+
108+
// `asv` will cache wheels of the recent builds in each
109+
// environment, making them faster to install next time. This is
110+
// number of builds to keep, per environment.
111+
"wheel_cache_size": 8,
112+
113+
// The commits after which the regression search in `asv publish`
114+
// should start looking for regressions. Dictionary whose keys are
115+
// regexps matching to benchmark names, and values corresponding to
116+
// the commit (exclusive) after which to start looking for
117+
// regressions. The default is to start from the first commit
118+
// with results. If the commit is `null`, regression detection is
119+
// skipped for the matching benchmark.
120+
//
121+
// "regressions_first_commits": {
122+
// "some_benchmark": "352cdf", // Consider regressions only after this commit
123+
// "another_benchmark": null, // Skip regression detection altogether
124+
// }
70125
}

asv_bench/benchmarks/eval.py

+22-175
Original file line numberDiff line numberDiff line change
@@ -3,192 +3,36 @@
33
import pandas.computation.expressions as expr
44

55

6-
class eval_frame_add_all_threads(object):
6+
class eval_frame(object):
77
goal_time = 0.2
88

9-
def setup(self):
10-
self.df = DataFrame(np.random.randn(20000, 100))
11-
self.df2 = DataFrame(np.random.randn(20000, 100))
12-
self.df3 = DataFrame(np.random.randn(20000, 100))
13-
self.df4 = DataFrame(np.random.randn(20000, 100))
14-
15-
def time_eval_frame_add_all_threads(self):
16-
pd.eval('df + df2 + df3 + df4')
17-
18-
19-
class eval_frame_add_one_thread(object):
20-
goal_time = 0.2
21-
22-
def setup(self):
23-
self.df = DataFrame(np.random.randn(20000, 100))
24-
self.df2 = DataFrame(np.random.randn(20000, 100))
25-
self.df3 = DataFrame(np.random.randn(20000, 100))
26-
self.df4 = DataFrame(np.random.randn(20000, 100))
27-
expr.set_numexpr_threads(1)
28-
29-
def time_eval_frame_add_one_thread(self):
30-
pd.eval('df + df2 + df3 + df4')
31-
32-
33-
class eval_frame_add_python(object):
34-
goal_time = 0.2
35-
36-
def setup(self):
37-
self.df = DataFrame(np.random.randn(20000, 100))
38-
self.df2 = DataFrame(np.random.randn(20000, 100))
39-
self.df3 = DataFrame(np.random.randn(20000, 100))
40-
self.df4 = DataFrame(np.random.randn(20000, 100))
41-
42-
def time_eval_frame_add_python(self):
43-
pd.eval('df + df2 + df3 + df4', engine='python')
44-
45-
46-
class eval_frame_add_python_one_thread(object):
47-
goal_time = 0.2
48-
49-
def setup(self):
50-
self.df = DataFrame(np.random.randn(20000, 100))
51-
self.df2 = DataFrame(np.random.randn(20000, 100))
52-
self.df3 = DataFrame(np.random.randn(20000, 100))
53-
self.df4 = DataFrame(np.random.randn(20000, 100))
54-
expr.set_numexpr_threads(1)
55-
56-
def time_eval_frame_add_python_one_thread(self):
57-
pd.eval('df + df2 + df3 + df4', engine='python')
58-
59-
60-
class eval_frame_and_all_threads(object):
61-
goal_time = 0.2
62-
63-
def setup(self):
64-
self.df = DataFrame(np.random.randn(20000, 100))
65-
self.df2 = DataFrame(np.random.randn(20000, 100))
66-
self.df3 = DataFrame(np.random.randn(20000, 100))
67-
self.df4 = DataFrame(np.random.randn(20000, 100))
68-
69-
def time_eval_frame_and_all_threads(self):
70-
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')
71-
72-
73-
class eval_frame_and_python_one_thread(object):
74-
goal_time = 0.2
75-
76-
def setup(self):
77-
self.df = DataFrame(np.random.randn(20000, 100))
78-
self.df2 = DataFrame(np.random.randn(20000, 100))
79-
self.df3 = DataFrame(np.random.randn(20000, 100))
80-
self.df4 = DataFrame(np.random.randn(20000, 100))
81-
expr.set_numexpr_threads(1)
82-
83-
def time_eval_frame_and_python_one_thread(self):
84-
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
85-
86-
87-
class eval_frame_and_python(object):
88-
goal_time = 0.2
89-
90-
def setup(self):
91-
self.df = DataFrame(np.random.randn(20000, 100))
92-
self.df2 = DataFrame(np.random.randn(20000, 100))
93-
self.df3 = DataFrame(np.random.randn(20000, 100))
94-
self.df4 = DataFrame(np.random.randn(20000, 100))
95-
96-
def time_eval_frame_and_python(self):
97-
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
98-
99-
100-
class eval_frame_chained_cmp_all_threads(object):
101-
goal_time = 0.2
102-
103-
def setup(self):
104-
self.df = DataFrame(np.random.randn(20000, 100))
105-
self.df2 = DataFrame(np.random.randn(20000, 100))
106-
self.df3 = DataFrame(np.random.randn(20000, 100))
107-
self.df4 = DataFrame(np.random.randn(20000, 100))
108-
109-
def time_eval_frame_chained_cmp_all_threads(self):
110-
pd.eval('df < df2 < df3 < df4')
111-
112-
113-
class eval_frame_chained_cmp_python_one_thread(object):
114-
goal_time = 0.2
9+
params = [['numexpr', 'python'], [1, 'all']]
10+
param_names = ['engine', 'threads']
11511

116-
def setup(self):
117-
self.df = DataFrame(np.random.randn(20000, 100))
118-
self.df2 = DataFrame(np.random.randn(20000, 100))
119-
self.df3 = DataFrame(np.random.randn(20000, 100))
120-
self.df4 = DataFrame(np.random.randn(20000, 100))
121-
expr.set_numexpr_threads(1)
122-
123-
def time_eval_frame_chained_cmp_python_one_thread(self):
124-
pd.eval('df < df2 < df3 < df4', engine='python')
125-
126-
127-
class eval_frame_chained_cmp_python(object):
128-
goal_time = 0.2
129-
130-
def setup(self):
12+
def setup(self, engine, threads):
13113
self.df = DataFrame(np.random.randn(20000, 100))
13214
self.df2 = DataFrame(np.random.randn(20000, 100))
13315
self.df3 = DataFrame(np.random.randn(20000, 100))
13416
self.df4 = DataFrame(np.random.randn(20000, 100))
13517

136-
def time_eval_frame_chained_cmp_python(self):
137-
pd.eval('df < df2 < df3 < df4', engine='python')
18+
if threads == 1:
19+
expr.set_numexpr_threads(1)
13820

21+
def time_add(self, engine, threads):
22+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
23+
pd.eval('df + df2 + df3 + df4', engine=engine)
13924

140-
class eval_frame_mult_all_threads(object):
141-
goal_time = 0.2
25+
def time_and(self, engine, threads):
26+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
27+
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine=engine)
14228

143-
def setup(self):
144-
self.df = DataFrame(np.random.randn(20000, 100))
145-
self.df2 = DataFrame(np.random.randn(20000, 100))
146-
self.df3 = DataFrame(np.random.randn(20000, 100))
147-
self.df4 = DataFrame(np.random.randn(20000, 100))
148-
149-
def time_eval_frame_mult_all_threads(self):
150-
pd.eval('df * df2 * df3 * df4')
151-
152-
153-
class eval_frame_mult_one_thread(object):
154-
goal_time = 0.2
155-
156-
def setup(self):
157-
self.df = DataFrame(np.random.randn(20000, 100))
158-
self.df2 = DataFrame(np.random.randn(20000, 100))
159-
self.df3 = DataFrame(np.random.randn(20000, 100))
160-
self.df4 = DataFrame(np.random.randn(20000, 100))
161-
expr.set_numexpr_threads(1)
162-
163-
def time_eval_frame_mult_one_thread(self):
164-
pd.eval('df * df2 * df3 * df4')
165-
166-
167-
class eval_frame_mult_python(object):
168-
goal_time = 0.2
169-
170-
def setup(self):
171-
self.df = DataFrame(np.random.randn(20000, 100))
172-
self.df2 = DataFrame(np.random.randn(20000, 100))
173-
self.df3 = DataFrame(np.random.randn(20000, 100))
174-
self.df4 = DataFrame(np.random.randn(20000, 100))
175-
176-
def time_eval_frame_mult_python(self):
177-
pd.eval('df * df2 * df3 * df4', engine='python')
178-
179-
180-
class eval_frame_mult_python_one_thread(object):
181-
goal_time = 0.2
182-
183-
def setup(self):
184-
self.df = DataFrame(np.random.randn(20000, 100))
185-
self.df2 = DataFrame(np.random.randn(20000, 100))
186-
self.df3 = DataFrame(np.random.randn(20000, 100))
187-
self.df4 = DataFrame(np.random.randn(20000, 100))
188-
expr.set_numexpr_threads(1)
29+
def time_chained_cmp(self, engine, threads):
30+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
31+
pd.eval('df < df2 < df3 < df4', engine=engine)
18932

190-
def time_eval_frame_mult_python_one_thread(self):
191-
pd.eval('df * df2 * df3 * df4', engine='python')
33+
def time_mult(self, engine, threads):
34+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
35+
pd.eval('df * df2 * df3 * df4', engine=engine)
19236

19337

19438
class query_datetime_index(object):
@@ -203,6 +47,7 @@ def setup(self):
20347
self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index)
20448

20549
def time_query_datetime_index(self):
50+
ts = self.ts
20651
self.df.query('index < @ts')
20752

20853

@@ -218,6 +63,7 @@ def setup(self):
21863
self.df = DataFrame({'dates': self.s.values, })
21964

22065
def time_query_datetime_series(self):
66+
ts = self.ts
22167
self.df.query('dates < @ts')
22268

22369

@@ -236,4 +82,5 @@ def setup(self):
23682
self.max_val = self.df['a'].max()
23783

23884
def time_query_with_boolean_selection(self):
239-
self.df.query('(a >= @min_val) & (a <= @max_val)')
85+
min_val, max_val = self.min_val, self.max_val
86+
self.df.query('(a >= @min_val) & (a <= @max_val)')

asv_bench/benchmarks/groupby.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ def setup(self):
254254
self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat')
255255
self.value2 = np.random.randn(self.n)
256256
self.value2[(np.random.rand(self.n) > 0.5)] = np.nan
257-
self.obj = tm.choice(list('ab'), size=self.n).astype(object)
257+
self.obj = np.random.choice(list('ab'), size=self.n).astype(object)
258258
self.obj[(np.random.randn(self.n) > 0.5)] = np.nan
259259
self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n),
260260
'key2': np.random.randint(0, 100, size=self.n),
@@ -651,7 +651,7 @@ class groupby_sum_multiindex(object):
651651

652652
def setup(self):
653653
self.N = 50
654-
self.df = DataFrame({'A': (range(self.N) * 2), 'B': range((self.N * 2)), 'C': 1, }).set_index(['A', 'B'])
654+
self.df = DataFrame({'A': (list(range(self.N)) * 2), 'B': list(range((self.N * 2))), 'C': 1, }).set_index(['A', 'B'])
655655

656656
def time_groupby_sum_multiindex(self):
657657
self.df.groupby(level=[0, 1]).sum()
@@ -673,9 +673,9 @@ def setup(self):
673673
self.secid_min = int('10000000', 16)
674674
self.secid_max = int('F0000000', 16)
675675
self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1))
676-
self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step))
676+
self.security_ids = map((lambda x: hex(x)[2:10].upper()), list(range(self.secid_min, (self.secid_max + 1), self.step)))
677677
self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids],
678-
labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (range(self.n_securities) * self.n_dates)],
678+
labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (list(range(self.n_securities)) * self.n_dates)],
679679
names=['date', 'security_id'])
680680
self.n_data = len(self.data_index)
681681
self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))])

asv_bench/benchmarks/packers.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,9 @@ def remove(self, f):
321321
class packers_read_sas7bdat(object):
322322

323323
def setup(self):
324-
self.f = 'data/test1.sas7bdat'
324+
self.f = os.path.join(os.path.dirname(__file__), '..', '..',
325+
'pandas', 'io', 'tests', 'sas', 'data',
326+
'test1.sas7bdat')
325327

326328
def time_packers_read_sas7bdat(self):
327329
pd.read_sas(self.f, format='sas7bdat')
@@ -330,7 +332,9 @@ def time_packers_read_sas7bdat(self):
330332
class packers_read_xport(object):
331333

332334
def setup(self):
333-
self.f = 'data/paxraw_d_short.xpt'
335+
self.f = os.path.join(os.path.dirname(__file__), '..', '..',
336+
'pandas', 'io', 'tests', 'sas', 'data',
337+
'paxraw_d_short.xpt')
334338

335339
def time_packers_read_xport(self):
336340
pd.read_sas(self.f, format='xport')

0 commit comments

Comments (0)