Merge pull request diffblue#471 from diffblue/stage_1_performance_over_all_apps

marek-trtik · web-flow · commit 921844ccb5ec · 2018-07-04T16:51:31.000+01:00
SEC-499: Making time & memory performance scatter plots of Stage 1 over many apps.
diff --git a/scripts/collect_stats.py b/scripts/collect_stats.py
@@ -0,0 +1,72 @@
+import argparse
+import os
+import json
+
+
+def _parse_cmd_line():
+    """Parse the command line of the statistics collector.
+
+    Returns:
+        The ``argparse.Namespace`` with two string attributes: ``input`` (the
+        root directory holding results of the web apps) and ``output`` (the
+        path-name of the JSON file to write).
+    """
+    parser = argparse.ArgumentParser(
+        description="Searches result directories computed by security-analyser for web apps and "
+                    "collects performance data. The script provides an input to the script "
+                    "'make_performance_scatter_plots.py'.")
+    parser.add_argument("input", type=str,
+                        help="A root directory under which are stored results of the web apps.")
+    parser.add_argument("output", type=str,
+                        help="A path-name of the output JSON file.")
+    return parser.parse_args()
+
+
+def _main(cmdline):
+    """Collect performance data of all analysed web apps into one JSON file.
+
+    Walks the tree under ``cmdline.input`` and processes every file
+    ``<benchmark>/RESULTS/statistics/JSON/statistics_security_analyser.json``.
+    For each such file it sums "num-locations" over all functions of all
+    records in "table-files" and sums all values of "table-phases". The record
+    is stored under the name of the ``<benchmark>`` directory and the whole
+    dictionary is written to ``cmdline.output``. Memory consumption is not
+    read from the statistics; it is always recorded as 0.
+
+    Args:
+        cmdline: Parsed command line providing ``input`` and ``output``.
+
+    Raises:
+        AssertionError: If two benchmarks share a name, or a statistics file
+            lacks one of the expected keys.
+    """
+    if not os.path.isdir(cmdline.input):
+        print("ERROR: The input path is not an existing directory.")
+        return
+    # These benchmarks are reported together under the single category
+    # "WebGoat". The collection is loop-invariant, so it is built only once.
+    webgoat_lessons = frozenset([
+        "Assignment5",
+        "Assignment6",
+        "BlindSendFileAssignment",
+        "CrossSiteScriptingLesson5a",
+        "SimpleXXE",
+        "SqlInjectionChallenge",
+        "SqlInjectionLesson12a",
+        "SqlInjectionLesson5a",
+        "SqlInjectionLesson5b",
+        "SqlInjectionLesson6a",
+    ])
+    # Directory names a result path must end with, innermost first.
+    expected_tail = ("JSON", "statistics", "RESULTS")
+    result = {}
+    for root, _, _ in os.walk(cmdline.input):
+        # Strip the expected tail; what remains is the benchmark directory.
+        xroot = root
+        is_other = False
+        for dirname in expected_tail:
+            xroot, tail = os.path.split(xroot)
+            if tail != dirname:
+                is_other = True
+                break
+        if is_other:
+            continue
+        full_pathname = os.path.abspath(os.path.join(root, "statistics_security_analyser.json"))
+        if not os.path.isfile(full_pathname):
+            continue
+
+        benchmark_name = os.path.basename(xroot)
+        assert benchmark_name not in result, \
+            "Duplicate benchmark name '" + benchmark_name + "' found at " + full_pathname
+        with open(full_pathname, "r") as ifile:
+            stats = json.load(ifile)
+
+        assert "table-files" in stats, "Missing key 'table-files' in " + full_pathname
+        num_locations = 0
+        for record in stats["table-files"]:
+            assert "functions" in record, "Missing key 'functions' in " + full_pathname
+            for func in record["functions"]:
+                assert "num-locations" in func, "Missing key 'num-locations' in " + full_pathname
+                num_locations += func["num-locations"]
+
+        assert "table-phases" in stats, "Missing key 'table-phases' in " + full_pathname
+        total_time = sum(stats["table-phases"].values(), 0.0)
+
+        result[benchmark_name] = {
+            "category": "WebGoat" if benchmark_name in webgoat_lessons else benchmark_name,
+            "num_goto_program_locations": num_locations,
+            "time_in_seconds": total_time,
+            "memory_in_mega_bytes": 0,
+        }
+    with open(cmdline.output, "w") as ofile:
+        ofile.write(json.dumps(result, sort_keys=True, indent=4))
+
+
+# Script entry point: parse the command line and run the collection.
+if __name__ == "__main__":
+    _main(_parse_cmd_line())
diff --git a/scripts/make_performance_scatter_plots.py b/scripts/make_performance_scatter_plots.py
@@ -0,0 +1,159 @@
+import argparse
+import os
+import json
+import matplotlib.pyplot as plt
+import matplotlib.lines as mlines
+import numpy
+import random
+
+
+def _parse_cmd_line():
+    """Parse the command line of the scatter-plot generator.
+
+    Returns:
+        The ``argparse.Namespace`` with the attributes ``input`` (path-name of
+        the JSON file with the data), ``output`` (directory for the plots),
+        ``format`` (output format of the plots, "svg" by default) and
+        ``version`` (boolean flag).
+    """
+    parser = argparse.ArgumentParser(
+        description="Makes scatter plots from performance data collected by the script 'collect_stats.py'.")
+    parser.add_argument("-V", "--version", action="store_true",
+                        help="Prints a version string.")
+    parser.add_argument("input", type=str,
+                        help="A path-name of a JSON file with input data (i.e. output from the script 'collect_stats.py').")
+    parser.add_argument("output", type=str,
+                        help="A directory under which the plots will be stored.")
+    parser.add_argument("-F", "--format", type=str, default="svg",
+                        help="An output format of the plots; it is also used as the extension of the "
+                             "generated files. Possible values are: svg, png, pdf, ps, eps.")
+    return parser.parse_args()
+
+
+def get_predefined_colour_names():
+    """Return a new list with the names of the predefined colours, in the order they are handed out."""
+    palette = (
+        "blue", "green", "red", "cyan",
+        "magenta", "orange", "black", "brown",
+        "navy", "khaki", "olive", "pink",
+        "violet", "purple", "yellow", "salmon",
+    )
+    return list(palette)
+
+
+def choose_colour(colour_index=0):
+    """Pick a colour for the given index and return it with the index to use next.
+
+    While ``colour_index`` is below the number of predefined colours, the
+    result is the colour name at that index paired with ``colour_index + 1``.
+    Otherwise the result is a random RGB triple, each channel drawn from
+    ``random.uniform(0.0, 0.75)``, paired with the unchanged index.
+    """
+    names = get_predefined_colour_names()
+    if colour_index >= len(names):
+        rgb = tuple(random.uniform(0.0, 0.75) for _ in range(3))
+        return rgb, colour_index
+    return names[colour_index], colour_index + 1
+
+
+def make_scatter_plot(
+        pathname,
+        format,
+        point_groups,
+        title=None,
+        xaxis_name=None,
+        faxis_name=None,
+        xaxis_log=False,
+        faxis_log=False,
+        draw_diagonal=False,
+        draw_fitline=False,
+        add_legend=False,
+        size_xy=None,
+        dpi=None
+        ):
+    """Draw groups of 2D points into one scatter plot and save it to a file.
+
+    Args:
+        pathname: Non-empty path-name of the image to write. Missing parent
+            directories are created.
+        format: Output format passed to ``Figure.savefig``.
+        point_groups: Dictionary mapping a group label to an iterable of
+            ``(x, y)`` pairs. Groups are drawn in sorted order of their labels
+            and each gets its own colour.
+        title: Optional title of the plot.
+        xaxis_name: Optional label of the x axis.
+        faxis_name: Optional label of the y axis.
+        xaxis_log: Use a logarithmic scale on the x axis.
+        faxis_log: Use a 'symlog' scale on the y axis.
+        draw_diagonal: Draw a line between the lower-left and the upper-right
+            corner of the axes (blue when the fit line is requested too,
+            red otherwise).
+        draw_fitline: Draw a dotted black line fitted (polynomial of degree 1)
+            to the points of all groups. The line is skipped when the points
+            have fewer than two distinct x coordinates, since no meaningful
+            line can be fitted then.
+        add_legend: Add a legend listing the group labels.
+        size_xy: ``None`` or a 2-tuple passed as ``figsize`` of the figure.
+        dpi: ``None`` (meaning 100) or an integer resolution of the figure.
+
+    Raises:
+        AssertionError: If an argument has an unexpected type or ``pathname``
+            is empty.
+    """
+    assert isinstance(pathname, str) and len(pathname) > 0
+    assert title is None or isinstance(title, str)
+    assert xaxis_name is None or isinstance(xaxis_name, str)
+    assert faxis_name is None or isinstance(faxis_name, str)
+    assert size_xy is None or (isinstance(size_xy, tuple) and len(size_xy) == 2)
+    assert dpi is None or isinstance(dpi, int)
+    if dpi is None:
+        dpi = 100
+    # A bare file name has an empty directory part, which os.makedirs rejects.
+    out_dir = os.path.dirname(pathname)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    fig = plt.figure(figsize=size_xy, dpi=dpi)
+    try:
+        ax = fig.gca()
+        if title:
+            ax.set_title(title)
+        if xaxis_name:
+            ax.set_xlabel(xaxis_name)
+        if faxis_name:
+            ax.set_ylabel(faxis_name)
+        if xaxis_log:
+            ax.set_xscale('log')
+        if faxis_log:
+            ax.set_yscale('symlog')
+        ax.grid(True, linestyle='dotted')
+        all_xs = []
+        all_ys = []
+        idx = 0
+        for group in sorted(point_groups.keys()):
+            colour, idx = choose_colour(idx)
+            xs = []
+            ys = []
+            for x, y in point_groups[group]:
+                xs.append(x)
+                ys.append(y)
+            ax.scatter(xs, ys, marker="o", color=colour, label=group)
+            all_xs += xs
+            all_ys += ys
+        if add_legend:
+            ax.legend()
+        if draw_diagonal:
+            line = mlines.Line2D([0, 1], [0, 1], color=("blue" if draw_fitline else "red"))
+            # Axes coordinates: (0, 0) and (1, 1) are the corners of the plot area.
+            line.set_transform(ax.transAxes)
+            ax.add_line(line)
+        if draw_fitline and len(set(all_xs)) >= 2:
+            slope, intercept = numpy.polyfit(all_xs, all_ys, 1)
+            x_lo = min(all_xs)
+            x_hi = max(all_xs)
+            n_steps = 1000
+            dx = (x_hi - x_lo) / n_steps
+            # Sample the line densely (and at every data point) rather than
+            # only at its two ends.
+            lxs = sorted(all_xs + [x_lo + t * dx for t in range(n_steps + 1)])
+            lys = [slope * x + intercept for x in lxs]
+            ax.plot(lxs, lys, "k:")
+        fig.savefig(pathname, bbox_inches='tight', format=format)
+    finally:
+        # pyplot keeps a reference to every figure until it is closed explicitly.
+        plt.close(fig)
+
+
+def _main(cmdline):
+    """Load the collected statistics and produce the time and memory scatter plots.
+
+    Reads the JSON file ``cmdline.input``, groups its records by their
+    "category" (records without one fall into the group ""), and plots time
+    and memory against the number of goto-program locations. Both plots are
+    stored under the directory ``cmdline.output`` in the format
+    ``cmdline.format``.
+    """
+    with open(cmdline.input, "r") as ifile:
+        stats = json.load(ifile)
+
+    time_points = {}
+    memory_points = {}
+    for data in stats.values():
+        category = data["category"] if "category" in data else ""
+        size = data["num_goto_program_locations"]
+        time_points.setdefault(category, []).append((size, data["time_in_seconds"]))
+        memory_points.setdefault(category, []).append((size, data["memory_in_mega_bytes"]))
+
+    fname_prefix = "security-analyser_stage1_"
+
+    # One row per plot: file-name stem, points, title, label of the y axis.
+    plots = (
+        ("time_perf.", time_points,
+         "Time performance of Stage 1 of the security-analyser", "seconds"),
+        ("memory_perf.", memory_points,
+         "Memory performance of Stage 1 of the security-analyser", "MB"),
+    )
+    for stem, points, title, yaxis_name in plots:
+        make_scatter_plot(
+            os.path.join(cmdline.output, fname_prefix + stem + cmdline.format),
+            cmdline.format,
+            points,
+            title,
+            "goto-program locations",
+            yaxis_name,
+            draw_fitline=True,
+            add_legend=True
+            )
+
+
+# Script entry point: parse the command line and generate the plots.
+if __name__ == "__main__":
+    _main(_parse_cmd_line())