Skip to content

Commit a544e9e

Browse files
pv authored and jreback committed
Update asv config + fix some broken benchmarks
- Enable platform-dependent config in asv (needs asv git version for it to do something) - Enable wheel cache in asv (in asv git version) - Fix a few easily fixed broken benchmarks Author: Pauli Virtanen <[email protected]> Closes #12563 from pv/asv-update and squashes the following commits: 8cba84d [Pauli Virtanen] DOC: contributing: explain how to tell asv which environment to use in more detail 65db647 [Pauli Virtanen] CLN: more precise asv_bench ignores in .gitignore 448b36a [Pauli Virtanen] PERF: fix easily fixed issues in asv benchmarks e083c01 [Pauli Virtanen] PERF: update asv.conf.json to work with both conda and virtualenv
1 parent bb9b9c5 commit a544e9e

File tree

7 files changed

+131
-199
lines changed

7 files changed

+131
-199
lines changed

.gitignore

+4-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,10 @@ scikits
8383

8484
# Performance Testing #
8585
#######################
86-
asv_bench/
86+
asv_bench/env/
87+
asv_bench/html/
88+
asv_bench/results/
89+
asv_bench/pandas/
8790

8891
# Documentation generated files #
8992
#################################

asv_bench/asv.conf.json

+62-7
Original file line numberDiff line numberDiff line change
@@ -30,24 +30,62 @@
3030

3131
// The matrix of dependencies to test. Each key is the name of a
3232
// package (in PyPI) and the values are version numbers. An empty
33-
// list indicates to just test against the default (latest)
34-
// version.
33+
// list or empty string indicates to just test against the default
34+
// (latest) version. null indicates that the package is to not be
35+
// installed. If the package to be tested is only available from
36+
// PyPi, and the 'environment_type' is conda, then you can preface
37+
// the package name by 'pip+', and the package will be installed via
38+
// pip (with all the conda available packages installed first,
39+
// followed by the pip installed packages).
3540
"matrix": {
36-
// To run against multiple versions, replace with
37-
// "numpy": ["1.7", "1.9"],
3841
"numpy": [],
3942
"Cython": [],
4043
"matplotlib": [],
4144
"sqlalchemy": [],
4245
"scipy": [],
4346
"numexpr": [],
44-
"pytables": [],
47+
"pytables": [null, ""], // platform dependent, see excludes below
48+
"tables": [null, ""],
49+
"libpython": [null, ""],
4550
"openpyxl": [],
4651
"xlsxwriter": [],
4752
"xlrd": [],
4853
"xlwt": []
4954
},
5055

56+
// Combinations of libraries/python versions can be excluded/included
57+
// from the set to test. Each entry is a dictionary containing additional
58+
// key-value pairs to include/exclude.
59+
//
60+
// An exclude entry excludes entries where all values match. The
61+
// values are regexps that should match the whole string.
62+
//
63+
// An include entry adds an environment. Only the packages listed
64+
// are installed. The 'python' key is required. The exclude rules
65+
// do not apply to includes.
66+
//
67+
// In addition to package names, the following keys are available:
68+
//
69+
// - python
70+
// Python version, as in the *pythons* variable above.
71+
// - environment_type
72+
// Environment type, as above.
73+
// - sys_platform
74+
// Platform, as in sys.platform. Possible values for the common
75+
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
76+
"exclude": [
77+
// On conda install pytables, otherwise tables
78+
{"environment_type": "conda", "tables": ""},
79+
{"environment_type": "conda", "pytables": null},
80+
{"environment_type": "virtualenv", "tables": null},
81+
{"environment_type": "virtualenv", "pytables": ""},
82+
// On conda&win32, install libpython
83+
{"sys_platform": "(?!win32).*", "libpython": ""},
84+
{"sys_platform": "win32", "libpython": null},
85+
{"environment_type": "(?!conda).*", "libpython": ""}
86+
],
87+
"include": [],
88+
5189
// The directory (relative to the current directory) that benchmarks are
5290
// stored in. If not provided, defaults to "benchmarks"
5391
// "benchmark_dir": "benchmarks",
@@ -56,7 +94,6 @@
5694
// environments in. If not provided, defaults to "env"
5795
// "env_dir": "env",
5896

59-
6097
// The directory (relative to the current directory) that raw benchmark
6198
// results are stored in. If not provided, defaults to "results".
6299
// "results_dir": "results",
@@ -66,5 +103,23 @@
66103
// "html_dir": "html",
67104

68105
// The number of characters to retain in the commit hashes.
69-
// "hash_length": 8
106+
// "hash_length": 8,
107+
108+
// `asv` will cache wheels of the recent builds in each
109+
// environment, making them faster to install next time. This is
110+
// number of builds to keep, per environment.
111+
"wheel_cache_size": 8,
112+
113+
// The commits after which the regression search in `asv publish`
114+
// should start looking for regressions. Dictionary whose keys are
115+
// regexps matching to benchmark names, and values corresponding to
116+
// the commit (exclusive) after which to start looking for
117+
// regressions. The default is to start from the first commit
118+
// with results. If the commit is `null`, regression detection is
119+
// skipped for the matching benchmark.
120+
//
121+
// "regressions_first_commits": {
122+
// "some_benchmark": "352cdf", // Consider regressions only after this commit
123+
// "another_benchmark": null, // Skip regression detection altogether
124+
// }
70125
}

asv_bench/benchmarks/eval.py

+22-175
Original file line numberDiff line numberDiff line change
@@ -3,192 +3,36 @@
33
import pandas.computation.expressions as expr
44

55

6-
class eval_frame_add_all_threads(object):
6+
class eval_frame(object):
77
goal_time = 0.2
88

9-
def setup(self):
10-
self.df = DataFrame(np.random.randn(20000, 100))
11-
self.df2 = DataFrame(np.random.randn(20000, 100))
12-
self.df3 = DataFrame(np.random.randn(20000, 100))
13-
self.df4 = DataFrame(np.random.randn(20000, 100))
14-
15-
def time_eval_frame_add_all_threads(self):
16-
pd.eval('df + df2 + df3 + df4')
17-
18-
19-
class eval_frame_add_one_thread(object):
20-
goal_time = 0.2
21-
22-
def setup(self):
23-
self.df = DataFrame(np.random.randn(20000, 100))
24-
self.df2 = DataFrame(np.random.randn(20000, 100))
25-
self.df3 = DataFrame(np.random.randn(20000, 100))
26-
self.df4 = DataFrame(np.random.randn(20000, 100))
27-
expr.set_numexpr_threads(1)
28-
29-
def time_eval_frame_add_one_thread(self):
30-
pd.eval('df + df2 + df3 + df4')
31-
32-
33-
class eval_frame_add_python(object):
34-
goal_time = 0.2
35-
36-
def setup(self):
37-
self.df = DataFrame(np.random.randn(20000, 100))
38-
self.df2 = DataFrame(np.random.randn(20000, 100))
39-
self.df3 = DataFrame(np.random.randn(20000, 100))
40-
self.df4 = DataFrame(np.random.randn(20000, 100))
41-
42-
def time_eval_frame_add_python(self):
43-
pd.eval('df + df2 + df3 + df4', engine='python')
44-
45-
46-
class eval_frame_add_python_one_thread(object):
47-
goal_time = 0.2
48-
49-
def setup(self):
50-
self.df = DataFrame(np.random.randn(20000, 100))
51-
self.df2 = DataFrame(np.random.randn(20000, 100))
52-
self.df3 = DataFrame(np.random.randn(20000, 100))
53-
self.df4 = DataFrame(np.random.randn(20000, 100))
54-
expr.set_numexpr_threads(1)
55-
56-
def time_eval_frame_add_python_one_thread(self):
57-
pd.eval('df + df2 + df3 + df4', engine='python')
58-
59-
60-
class eval_frame_and_all_threads(object):
61-
goal_time = 0.2
62-
63-
def setup(self):
64-
self.df = DataFrame(np.random.randn(20000, 100))
65-
self.df2 = DataFrame(np.random.randn(20000, 100))
66-
self.df3 = DataFrame(np.random.randn(20000, 100))
67-
self.df4 = DataFrame(np.random.randn(20000, 100))
68-
69-
def time_eval_frame_and_all_threads(self):
70-
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')
71-
72-
73-
class eval_frame_and_python_one_thread(object):
74-
goal_time = 0.2
75-
76-
def setup(self):
77-
self.df = DataFrame(np.random.randn(20000, 100))
78-
self.df2 = DataFrame(np.random.randn(20000, 100))
79-
self.df3 = DataFrame(np.random.randn(20000, 100))
80-
self.df4 = DataFrame(np.random.randn(20000, 100))
81-
expr.set_numexpr_threads(1)
82-
83-
def time_eval_frame_and_python_one_thread(self):
84-
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
85-
86-
87-
class eval_frame_and_python(object):
88-
goal_time = 0.2
89-
90-
def setup(self):
91-
self.df = DataFrame(np.random.randn(20000, 100))
92-
self.df2 = DataFrame(np.random.randn(20000, 100))
93-
self.df3 = DataFrame(np.random.randn(20000, 100))
94-
self.df4 = DataFrame(np.random.randn(20000, 100))
95-
96-
def time_eval_frame_and_python(self):
97-
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
98-
99-
100-
class eval_frame_chained_cmp_all_threads(object):
101-
goal_time = 0.2
102-
103-
def setup(self):
104-
self.df = DataFrame(np.random.randn(20000, 100))
105-
self.df2 = DataFrame(np.random.randn(20000, 100))
106-
self.df3 = DataFrame(np.random.randn(20000, 100))
107-
self.df4 = DataFrame(np.random.randn(20000, 100))
108-
109-
def time_eval_frame_chained_cmp_all_threads(self):
110-
pd.eval('df < df2 < df3 < df4')
111-
112-
113-
class eval_frame_chained_cmp_python_one_thread(object):
114-
goal_time = 0.2
9+
params = [['numexpr', 'python'], [1, 'all']]
10+
param_names = ['engine', 'threads']
11511

116-
def setup(self):
117-
self.df = DataFrame(np.random.randn(20000, 100))
118-
self.df2 = DataFrame(np.random.randn(20000, 100))
119-
self.df3 = DataFrame(np.random.randn(20000, 100))
120-
self.df4 = DataFrame(np.random.randn(20000, 100))
121-
expr.set_numexpr_threads(1)
122-
123-
def time_eval_frame_chained_cmp_python_one_thread(self):
124-
pd.eval('df < df2 < df3 < df4', engine='python')
125-
126-
127-
class eval_frame_chained_cmp_python(object):
128-
goal_time = 0.2
129-
130-
def setup(self):
12+
def setup(self, engine, threads):
13113
self.df = DataFrame(np.random.randn(20000, 100))
13214
self.df2 = DataFrame(np.random.randn(20000, 100))
13315
self.df3 = DataFrame(np.random.randn(20000, 100))
13416
self.df4 = DataFrame(np.random.randn(20000, 100))
13517

136-
def time_eval_frame_chained_cmp_python(self):
137-
pd.eval('df < df2 < df3 < df4', engine='python')
18+
if threads == 1:
19+
expr.set_numexpr_threads(1)
13820

21+
def time_add(self, engine, threads):
22+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
23+
pd.eval('df + df2 + df3 + df4', engine=engine)
13924

140-
class eval_frame_mult_all_threads(object):
141-
goal_time = 0.2
25+
def time_and(self, engine, threads):
26+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
27+
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine=engine)
14228

143-
def setup(self):
144-
self.df = DataFrame(np.random.randn(20000, 100))
145-
self.df2 = DataFrame(np.random.randn(20000, 100))
146-
self.df3 = DataFrame(np.random.randn(20000, 100))
147-
self.df4 = DataFrame(np.random.randn(20000, 100))
148-
149-
def time_eval_frame_mult_all_threads(self):
150-
pd.eval('df * df2 * df3 * df4')
151-
152-
153-
class eval_frame_mult_one_thread(object):
154-
goal_time = 0.2
155-
156-
def setup(self):
157-
self.df = DataFrame(np.random.randn(20000, 100))
158-
self.df2 = DataFrame(np.random.randn(20000, 100))
159-
self.df3 = DataFrame(np.random.randn(20000, 100))
160-
self.df4 = DataFrame(np.random.randn(20000, 100))
161-
expr.set_numexpr_threads(1)
162-
163-
def time_eval_frame_mult_one_thread(self):
164-
pd.eval('df * df2 * df3 * df4')
165-
166-
167-
class eval_frame_mult_python(object):
168-
goal_time = 0.2
169-
170-
def setup(self):
171-
self.df = DataFrame(np.random.randn(20000, 100))
172-
self.df2 = DataFrame(np.random.randn(20000, 100))
173-
self.df3 = DataFrame(np.random.randn(20000, 100))
174-
self.df4 = DataFrame(np.random.randn(20000, 100))
175-
176-
def time_eval_frame_mult_python(self):
177-
pd.eval('df * df2 * df3 * df4', engine='python')
178-
179-
180-
class eval_frame_mult_python_one_thread(object):
181-
goal_time = 0.2
182-
183-
def setup(self):
184-
self.df = DataFrame(np.random.randn(20000, 100))
185-
self.df2 = DataFrame(np.random.randn(20000, 100))
186-
self.df3 = DataFrame(np.random.randn(20000, 100))
187-
self.df4 = DataFrame(np.random.randn(20000, 100))
188-
expr.set_numexpr_threads(1)
29+
def time_chained_cmp(self, engine, threads):
30+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
31+
pd.eval('df < df2 < df3 < df4', engine=engine)
18932

190-
def time_eval_frame_mult_python_one_thread(self):
191-
pd.eval('df * df2 * df3 * df4', engine='python')
33+
def time_mult(self, engine, threads):
34+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
35+
pd.eval('df * df2 * df3 * df4', engine=engine)
19236

19337

19438
class query_datetime_index(object):
@@ -203,6 +47,7 @@ def setup(self):
20347
self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index)
20448

20549
def time_query_datetime_index(self):
50+
ts = self.ts
20651
self.df.query('index < @ts')
20752

20853

@@ -218,6 +63,7 @@ def setup(self):
21863
self.df = DataFrame({'dates': self.s.values, })
21964

22065
def time_query_datetime_series(self):
66+
ts = self.ts
22167
self.df.query('dates < @ts')
22268

22369

@@ -236,4 +82,5 @@ def setup(self):
23682
self.max_val = self.df['a'].max()
23783

23884
def time_query_with_boolean_selection(self):
239-
self.df.query('(a >= @min_val) & (a <= @max_val)')
85+
min_val, max_val = self.min_val, self.max_val
86+
self.df.query('(a >= @min_val) & (a <= @max_val)')

asv_bench/benchmarks/groupby.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ def setup(self):
254254
self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat')
255255
self.value2 = np.random.randn(self.n)
256256
self.value2[(np.random.rand(self.n) > 0.5)] = np.nan
257-
self.obj = tm.choice(list('ab'), size=self.n).astype(object)
257+
self.obj = np.random.choice(list('ab'), size=self.n).astype(object)
258258
self.obj[(np.random.randn(self.n) > 0.5)] = np.nan
259259
self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n),
260260
'key2': np.random.randint(0, 100, size=self.n),
@@ -651,7 +651,7 @@ class groupby_sum_multiindex(object):
651651

652652
def setup(self):
653653
self.N = 50
654-
self.df = DataFrame({'A': (range(self.N) * 2), 'B': range((self.N * 2)), 'C': 1, }).set_index(['A', 'B'])
654+
self.df = DataFrame({'A': (list(range(self.N)) * 2), 'B': list(range((self.N * 2))), 'C': 1, }).set_index(['A', 'B'])
655655

656656
def time_groupby_sum_multiindex(self):
657657
self.df.groupby(level=[0, 1]).sum()
@@ -673,9 +673,9 @@ def setup(self):
673673
self.secid_min = int('10000000', 16)
674674
self.secid_max = int('F0000000', 16)
675675
self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1))
676-
self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step))
676+
self.security_ids = map((lambda x: hex(x)[2:10].upper()), list(range(self.secid_min, (self.secid_max + 1), self.step)))
677677
self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids],
678-
labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (range(self.n_securities) * self.n_dates)],
678+
labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (list(range(self.n_securities)) * self.n_dates)],
679679
names=['date', 'security_id'])
680680
self.n_data = len(self.data_index)
681681
self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))])

asv_bench/benchmarks/packers.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,9 @@ def remove(self, f):
321321
class packers_read_sas7bdat(object):
322322

323323
def setup(self):
324-
self.f = 'data/test1.sas7bdat'
324+
self.f = os.path.join(os.path.dirname(__file__), '..', '..',
325+
'pandas', 'io', 'tests', 'sas', 'data',
326+
'test1.sas7bdat')
325327

326328
def time_packers_read_sas7bdat(self):
327329
pd.read_sas(self.f, format='sas7bdat')
@@ -330,7 +332,9 @@ def time_packers_read_sas7bdat(self):
330332
class packers_read_xport(object):
331333

332334
def setup(self):
333-
self.f = 'data/paxraw_d_short.xpt'
335+
self.f = os.path.join(os.path.dirname(__file__), '..', '..',
336+
'pandas', 'io', 'tests', 'sas', 'data',
337+
'paxraw_d_short.xpt')
334338

335339
def time_packers_read_xport(self):
336340
pd.read_sas(self.f, format='xport')

0 commit comments

Comments (0)