pandas-dev · Aug 22, 2013 · Aug 22, 2013 · Aug 22, 2013
diff --git a/vb_suite/test_perf.py b/vb_suite/test_perf.py
@@ -26,6 +26,21 @@
 
 """
 
+# IMPORTANT NOTE
+#
+# This script should run on pandas versions at least as far back as 0.9.1.
+# Devs should be able to use the latest version of this script with
+# any dusty old commit and expect it to "just work".
+# One way in which this is useful is when collecting historical data,
+# where writing some logic around this script may prove easier
+# in some cases than running vbench directly (think perf bisection).
+#
+# *Please*, when you modify this script for whatever reason,
+# make sure you do not break its functionality when running under older
+# pandas versions.
+# Note that deprecation warnings are turned off in main(), so there's
+# no need to change the actual code to suppress such warnings.
+
 import shutil
 import os
 import sys
@@ -37,14 +52,20 @@
 import random
 import numpy as np
 
+import pandas as pd
 from pandas import DataFrame, Series
 
 from suite import REPO_PATH
-
+VB_DIR = os.path.dirname(os.path.abspath(__file__))
 DEFAULT_MIN_DURATION = 0.01
 HEAD_COL="head[ms]"
 BASE_COL="base[ms]"
 
+try:
+    import git # gitpython; used below to open the repo and look up commits
+except Exception:  # any failure while importing is reported as "not installed"
+    print("Error: Please install the `gitpython` package\n")
+    sys.exit(1)  # the script cannot continue without it
 
 class RevParseAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
@@ -66,6 +87,14 @@ def __call__(self, parser, namespace, values, option_string=None):
 parser.add_argument('-t', '--target-commit',
                     help='The commit to compare against the baseline (default: HEAD).',
                     type=str, action=RevParseAction)
+parser.add_argument('--base-pickle',
+                    help='name of pickle file with timings data generated by a former `-H -d FILE` run. '\
+                    'filename must be of the form <hash>-*.* or specify --base-commit separately',
+                    type=str)  # baseline side of a pickle-vs-pickle comparison
+parser.add_argument('--target-pickle',
+                    help='name of pickle file with timings data generated by a former `-H -d FILE` run. '\
+                    'filename must be of the form <hash>-*.* or specify --target-commit separately',
+                    type=str)  # target side of a pickle-vs-pickle comparison
 parser.add_argument('-m', '--min-duration',
                     help='Minimum duration (in ms) of baseline test for inclusion in report (default: %.3f).' % DEFAULT_MIN_DURATION,
                     type=float,
@@ -213,30 +242,82 @@ def profile_comparative(benchmarks):
 
         head_res = get_results_df(db, h_head)
         baseline_res = get_results_df(db, h_baseline)
-        ratio = head_res['timing'] / baseline_res['timing']
-        totals = DataFrame({HEAD_COL:head_res['timing'],
-                                BASE_COL:baseline_res['timing'],
-                                'ratio':ratio,
-                                'name':baseline_res.name},
-                                columns=[HEAD_COL, BASE_COL, "ratio", "name"])
-        totals = totals.ix[totals[HEAD_COL] > args.min_duration]
-            # ignore below threshold
-        totals = totals.dropna(
-        ).sort("ratio").set_index('name')  # sort in ascending order
-
-        h_msg =   repo.messages.get(h_head, "")
-        b_msg =   repo.messages.get(h_baseline, "")
-
-        print_report(totals,h_head=h_head,h_msg=h_msg,
-                     h_baseline=h_baseline,b_msg=b_msg)
-
-        if args.outdf:
-            prprint("The results DataFrame was written to '%s'\n" %  args.outdf)
-            totals.save(args.outdf)
+
+        report_comparative(head_res,baseline_res)
+
     finally:
         #        print("Disposing of TMP_DIR: %s" % TMP_DIR)
         shutil.rmtree(TMP_DIR)
 
+def prep_pickle_for_total(df, agg_name='median'):
+    """
+    Accepts a dataframe resulting from invocation with -H -d o.pickle
+    If multiple data columns are present (-N was used), the
+    `agg_name` attr of the dataframe will be used to reduce
+    them to a single value per vbench, df.median is used by
+    default.
+
+    Returns a dataframe of the form expected by prep_totals
+    """
+    def prep(df):
+        agg = getattr(df,agg_name)  # bound aggregation method, e.g. df.median
+        df = DataFrame(agg(1))  # 1 is passed positionally; for median that is axis=1
+        cols = list(df.columns)
+        cols[0]='timing'  # rename the first (aggregated) column
+        df.columns=cols
+        df['name'] = list(df.index)  # copy the index into a 'name' column
+        return df
+
+    return prep(df)
+
+def prep_totals(head_res, baseline_res):
+    """
+    Each argument should be a dataframe with 'timing' and 'name' columns
+    where name is the name of the vbench.
+
+    Returns a 'totals' dataframe indexed by name and sorted by ratio, suitable as input for print_report.
+    """
+    head_res, baseline_res = head_res.align(baseline_res)  # align the two frames with each other before dividing
+    ratio = head_res['timing'] / baseline_res['timing']  # head/base
+    totals = DataFrame({HEAD_COL:head_res['timing'],
+                        BASE_COL:baseline_res['timing'],
+                        'ratio':ratio,
+                        'name':baseline_res.name},
+                        columns=[HEAD_COL, BASE_COL, "ratio", "name"])
+    totals = totals.ix[totals[HEAD_COL] > args.min_duration]
+    # ignore below threshold (NOTE(review): filters on the head timing, while the -m help text speaks of the baseline -- confirm)
+    totals = totals.dropna(
+    ).sort("ratio").set_index('name')  # sort in ascending order
+    return totals
+
+def report_comparative(head_res,baseline_res):
+    """Print the head-vs-baseline report for two timing dataframes; also save it when args.outdf is set."""
+    try:
+        r=git.Repo(VB_DIR)
+    except Exception:  # the commit lookups below need `r`; fail cleanly rather than continue with it unbound
+        print("Error: could not open the git repository at '%s', aborting.\n" % VB_DIR)
+        sys.exit(1)
+
+    totals = prep_totals(head_res,baseline_res)
+
+    h_head = args.target_commit
+    h_baseline = args.base_commit
+    h_msg = b_msg = "Unknown"  # fallback when a commit cannot be found in the repo
+    try:
+        h_msg =  r.commit(h_head).message.strip()
+    except git.exc.BadObject:
+        pass
+    try:
+        b_msg =  r.commit(h_baseline).message.strip()
+    except git.exc.BadObject:
+        pass
+
+    print_report(totals,h_head=h_head,h_msg=h_msg,
+             h_baseline=h_baseline,b_msg=b_msg)
+
+    if args.outdf:
+        prprint("The results DataFrame was written to '%s'\n" %  args.outdf)
+        totals.save(args.outdf)
 
 def profile_head_single(benchmark):
     import gc
@@ -395,38 +476,22 @@ def main():
     random.seed(args.seed)
     np.random.seed(args.seed)
 
-    affinity_set = False
+    if args.base_pickle and args.target_pickle:
+        baseline_res = prep_pickle_for_total(pd.load(args.base_pickle))
+        target_res = prep_pickle_for_total(pd.load(args.target_pickle))
 
-    # try psutil first since it is more commonly present and better
-    # maintained.  Some people experienced problems with affinity package
-    # (see https://code.google.com/p/psutil/issues/detail?id=238 for more references)
-    try:
-        import psutil
-        if hasattr(psutil.Process, 'set_cpu_affinity'):
-            psutil.Process(os.getpid()).set_cpu_affinity([args.affinity])
-            affinity_set = True
-    except ImportError:
-        pass
+        report_comparative(target_res, baseline_res)
+        sys.exit(0)
 
-    if not affinity_set:
-        try:
-            import affinity
-            affinity.set_process_affinity_mask(0, args.affinity)
-            assert affinity.get_process_affinity_mask(0) == args.affinity
-            affinity_set = True
+    if args.affinity is not None:
+        try: # use psutil rather then stale affinity module. Thanks @yarikoptic
+            import psutil
+            if hasattr(psutil.Process, 'set_cpu_affinity'):
+                psutil.Process(os.getpid()).set_cpu_affinity([args.affinity])
+                print("CPU affinity set to %d" % args.affinity)
         except ImportError:
-            pass
-
-    if not affinity_set:
-        import warnings
-        warnings.warn("\n\n"
-              "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
-              "The 'affinity' or 'psutil' >= 0.5.0 modules are not available, results may be unreliable\n"
-              "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n"
-            )
-        time.sleep(2)
-    else:
-        print("CPU affinity set to %d" % args.affinity)
+            print("-a/--affinity specified, but the 'psutil' module is not available, aborting.\n")
+            sys.exit(1)
 
     print("\n")
     prprint("LOG_FILE = %s" % args.log_file)
@@ -508,10 +573,39 @@ def inner(repo_path):
 
 if __name__ == '__main__':
     args = parser.parse_args()
-    if not args.head and (not args.base_commit and not args.target_commit):
+    if (not args.head
+        and not (args.base_commit and args.target_commit)
+        and not (args.base_pickle and args.target_pickle)):
         parser.print_help()
-    else:
-        import warnings
-        warnings.filterwarnings('ignore',category=FutureWarning)
-        warnings.filterwarnings('ignore',category=DeprecationWarning)
-        main()
+        sys.exit(1)
+    elif ((args.base_pickle or args.target_pickle) and not
+        (args.base_pickle and args.target_pickle)):
+        print("Must specify Both --base-pickle and --target-pickle.")
+        sys.exit(1)
+    # fill in commit hashes missing from the command line using the pickle file names
+    if ((args.base_pickle or args.target_pickle) and not
+        (args.base_commit and args.target_commit)):
+        if not args.base_commit:
+            print("base_commit not specified, Assuming base_pickle is named <commit>-foo.*")
+            args.base_commit = os.path.basename(args.base_pickle).split('-')[0]  # basename: keep any directory part out of the hash
+        if not args.target_commit:
+            print("target_commit not specified, Assuming target_pickle is named <commit>-foo.*")
+            args.target_commit = os.path.basename(args.target_pickle).split('-')[0]  # basename: keep any directory part out of the hash
+
+    import warnings
+    warnings.filterwarnings('ignore',category=FutureWarning)
+    warnings.filterwarnings('ignore',category=DeprecationWarning)
+
+    if args.base_commit and args.target_commit:
+        print("Verifying specified commits exist in repo...")
+        r=git.Repo(VB_DIR)
+        for c in [ args.base_commit, args.target_commit ]:
+            try:
+                msg =  r.commit(c).message.strip()
+            except git.BadObject:
+                print("The commit '%s' was not found, aborting..." % c)
+                sys.exit(1)
+            else:
+                print("%s: %s" % (c,msg))
+
+    main()