Adds Python method to run DI model generation

JohnDumbell · JohnDumbell · commit cf246386329f · 2018-08-28T16:03:58.000+01:00
diff --git a/driver/mkbench.py b/driver/mkbench.py
@@ -1,4 +1,7 @@
 import os
+import re
+import subprocess
+
 import analyser
 import filecmp
 import shutil
@@ -7,6 +10,8 @@
 import utility
 import subprocess
 
+import xml.dom.minidom
+
 class CollectedJavaBinaries:
     def __init__(self):
         self.class_files = []
@@ -70,6 +75,88 @@ def _read_info_of_class_files(class_files, configuration, cmdline):
     return classes_info, java_class_info_call_duration
 
 
+def run_di_model_generation(configuration, cmdline):
+    """
+    Runs DI overlay model generation. This attempts to work out what Spring DI modelling is being used by the
+    application under analysis and then create fake Java code that will simulate what the DI would do at run-time.
+    At this point we should have a configuration file from java-class-info that will drive generation.
+
+    :param configuration: mapping read for "diConfigurationPath" and "detectedEntryPointsPath".
+    :param cmdline: parsed commandline; libraries, skip_di_generation and common_dir are read.
+    :return: path to the built DI-models.jar, or None if generation was skipped or failed.
+    """
+    from xml.sax.saxutils import quoteattr  # Escapes the paths embedded in the Ant build XML below.
+
+    if cmdline.skip_di_generation:
+        print("DI generation turned off by --skip-di-generation, skipping DI generation.")
+        return
+    # If we don't have anything vaguely looking like a spring library being used, just skip generation.
+    if not any(re.search(r"spring-[a-zA-Z]*-[0-9]*\.[0-9]*", lib) for lib in cmdline.libraries):
+        print("No Spring Framework libraries passed in, skipping DI generation.")
+        return
+
+    di_output_file_path = configuration["diConfigurationPath"]
+    detected_entry_points_path = configuration["detectedEntryPointsPath"]
+    di_starting_script = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "env-model-generator", "built", "env-model-generator.js"))
+
+    # Right now we're only using our metadata generated files, but this should also include XML.
+    spring_files = []
+    if os.path.isfile(di_output_file_path):
+        spring_files.append(di_output_file_path)
+
+    # Clear down any existing files. rmtree (not os.rmdir) because a previous run leaves this folder populated.
+    generated_source_file_path = os.path.join(cmdline.common_dir, "GENERATED_SOURCE")
+    if os.path.exists(generated_source_file_path):
+        shutil.rmtree(generated_source_file_path)
+    os.mkdir(generated_source_file_path)
+
+    # NOTE(review): with at most one spring file, "--input-file" is always given "" - confirm the generator expects that.
+    di_generation_commandline = ["node", di_starting_script, "".join(spring_files[:1]), "--input-file", " ".join(spring_files[1:]), "--output-path", generated_source_file_path, "--entry-points-input-file", detected_entry_points_path]
+    print("Running commandline: " + " ".join(di_generation_commandline))
+    di_generation_result = subprocess.run(di_generation_commandline, stderr=subprocess.STDOUT)
+    if di_generation_result.returncode != 0:
+        print("DI generation failed.")
+        return
+
+    # Quick check to see if we have any output files.
+    if not any(files for _, _, files in os.walk(generated_source_file_path)):
+        print("No Java source files emitted by DI generation.")
+        return
+
+    # Clear down yet more existing files.
+    java_binaries_path = os.path.join(cmdline.common_dir, "GENERATED_BINARIES")
+    if os.path.exists(java_binaries_path):
+        shutil.rmtree(java_binaries_path)
+    os.mkdir(java_binaries_path)
+
+    # Combine the incoming libraries with our collected classes, this should hold every reference we need to compile.
+    class_paths = cmdline.libraries + [os.path.join(cmdline.common_dir, "collected_classes.jar")]
+    generated_jar_path = os.path.join(cmdline.common_dir, "DI-models.jar")
+    ant_build_file = ('<project default="compile">'
+                          '<target name="compile">'
+                              '<javac includeantruntime="true" srcdir=' + quoteattr(generated_source_file_path) + ' destdir=' + quoteattr(java_binaries_path) + '>'
+                              + '<classpath>' + "".join(["<pathelement location=" + quoteattr(path) + "/>" for path in class_paths]) + '</classpath>' +
+                              '</javac>'
+                              '<jar destfile=' + quoteattr(generated_jar_path) + ' basedir=' + quoteattr(java_binaries_path) + '/>'
+                          '</target>'
+                      '</project>')
+
+    # Attempt to prettify output a little.
+    ant_build_file = xml.dom.minidom.parseString(ant_build_file).toprettyxml()
+    ant_build_path = os.path.join(generated_source_file_path, "build.xml")
+    with open(ant_build_path, 'x') as ant_xml_file:
+        ant_xml_file.write(ant_build_file)
+
+    javac_commandline = ["ant", "-file", ant_build_path, "-lib", os.pathsep.join(class_paths)]
+    print("Running commandline: " + " ".join(javac_commandline))
+    javac_result = subprocess.run(javac_commandline, stderr=subprocess.STDOUT)
+    if javac_result.returncode != 0:
+        print("Java compilation of DI-generated source failed.")
+        return
+
+    return generated_jar_path
+
+
 def collect_java_binaries(cmdline):
     with open(cmdline.prepare_scan) as config_file:
         configuration = json.load(config_file)
@@ -157,9 +244,6 @@ def collect_java_binaries(cmdline):
         print("WARNING: Unable to find any detected entry points. No analysis will be run.")
         return
 
-    with open(entry_points_file) as ep_config_file:
-        ep_config = json.load(ep_config_file)
-
     # Copy the current commandline and transpose into a dictionary.
     copied_command_line = {key.replace("_", "-"): val for key, val in vars(cmdline).items()}
 
@@ -170,14 +254,29 @@ def collect_java_binaries(cmdline):
                    java_libraries.classpath_jar_files +
                    library_directories)
 
+    di_output_path = run_di_model_generation(configuration, cmdline)
+
+    # If DI generation has run we want to target its output jar as our starting point.
+    target_binary = java_binaries.jar_file
+    if di_output_path:
+        class_paths.append(java_binaries.jar_file)
+        class_paths.append(di_output_path) # Appended because we need overlay classes too.
+        target_binary = di_output_path
+
+    with open(entry_points_file) as ep_config_file:
+        ep_config = json.load(ep_config_file)
+
     previously_created_folders = set()
     for ep_data in ep_config["entryPoints"]:
 
         method_data = ep_data["method"]
 
         # We don't add the descriptor here as it's not recognized (and makes folder names too long).
         friendly_method_name = ep_data["className"] + "." + method_data["name"]
-        raw_method_name = ep_data["className"] + "." + method_data["signature"]
+
+        actual_entry_point = \
+            "com.diffblue.security.SyntheticEntryPoints." + method_data["syntheticMethodName"] if di_output_path \
+            else ep_data["className"] + "." + method_data["signature"]
 
         # Try to make sure the folder name isn't invalid.
         folder_name = friendly_method_name.replace(os.path.sep, '.')
@@ -199,12 +298,12 @@ def collect_java_binaries(cmdline):
         generated_temp_folder = os.path.join(cmdline.temp_dir, folder_name)
 
         program_json = {
-            "jar": java_binaries.jar_file,
+            "jar": target_binary,
             "classpath": class_paths,
             # The file should not exist yet. Here we only record the prefered/desired location
             #  of the file on disk. Analyser will create it, if it does not exist (in the first run)
             "gbf": os.path.join(generated_temp_folder, "input_program.gbf"),
-            "entry-point": raw_method_name
+            "entry-point": actual_entry_point
         }
 
         program_file = os.path.join(output_folder, "program.json")
@@ -219,7 +318,7 @@ def collect_java_binaries(cmdline):
             "output-dir": output_folder,
             "results-dir": output_folder,
             "temp-dir": generated_temp_folder,
-            "entry-point": raw_method_name
+            "entry-point": actual_entry_point
         })
 
         # Save the commandline file.
diff --git a/driver/run.py b/driver/run.py
@@ -195,6 +195,8 @@ def create_parser():
                         help="Path to the configuration file used to do the pre-analysis stage")
     parser.add_argument("--run-scan", type=str, const='', nargs="?",
                         help="Flag for whether a security scan should be run. Target will be the results folder.")
+    parser.add_argument("--skip-di-generation", action='store_true',
+                        help="Turns off DI overlay generation. This overrides the automatic detection of this feature.")
 
     return parser
 
@@ -432,6 +434,15 @@ def __main():
               analyser.get_missing_binary_error_message())
         return
 
+    # If we've been passed a path with * on the end, unpack the .jar files in
+    # that directory (and children).
+    detected_jar_files = [os.path.join(root, file)
+                          for path in cmdline.libraries if not os.path.isdir(path) and path.endswith("*") and os.path.isdir(path.strip('*'))
+                          for root, directories, files in os.walk(path.strip('*'))
+                          for file in files if re.search(".*\.jar$", file)]
+
+    cmdline.libraries = detected_jar_files + [path for path in cmdline.libraries if not (not os.path.isdir(path) and path.endswith("*"))]
+
     common_libraries = _get_common_libraries(cmdline.models_library_location)
 
     if cmdline.use_models_library: