verilog-to-routing · vaughnbetz · Jun 14, 2021 · Jun 2, 2021 · Jun 2, 2021 · Jun 2, 2021
diff --git a/.github/kokoro/presubmit/nightly_test4.cfg b/.github/kokoro/presubmit/nightly_test4.cfg
@@ -0,0 +1,67 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+build_file: "vtr-verilog-to-routing/.github/kokoro/run-vtr.sh"
+
+# 72 hours
+timeout_mins: 4320
+
+action {
+  define_artifacts {
+    # File types
+    regex: "**/*.out"
+    regex: "**/vpr_stdout.log"
+    regex: "**/parse_results.txt"
+    regex: "**/qor_results.txt"
+    regex: "**/pack.log"
+    regex: "**/place.log"
+    regex: "**/route.log"
+    regex: "**/*_qor.csv"
+    regex: "**/*.out.gz"
+    regex: "**/vpr_stdout.log.gz"
+    regex: "**/parse_results.txt.gz"
+    regex: "**/qor_results.txt.gz"
+    regex: "**/pack.log.gz"
+    regex: "**/place.log.gz"
+    regex: "**/route.log.gz"
+    regex: "**/*_qor.csv.gz"
+    strip_prefix: "github/vtr-verilog-to-routing/"
+  }
+}
+
+env_vars {
+  key: "KOKORO_TYPE"
+  value: "presubmit"
+}
+
+env_vars {
+  key: "KOKORO_DIR"
+  value: "vtr-verilog-to-routing"
+}
+
+env_vars {
+  key: "VTR_DIR"
+  value: "vtr-verilog-to-routing"
+}
+
+#Use default build configuration
+env_vars {
+  key: "VTR_CMAKE_PARAMS"
+  value: ""
+}
+
+env_vars {
+  key: "VTR_TEST"
+  value: "vtr_reg_nightly_test4"
+}
+
+#Options for run_reg_test.py
+# -show_failures: show tool failures in main log output
+env_vars {
+  key: "VTR_TEST_OPTIONS"
+  value: "-show_failures"
+}
+
+env_vars {
+  key: "NUM_CORES"
+  value: "8"
+}
diff --git a/.github/kokoro/steps/vtr-test.sh b/.github/kokoro/steps/vtr-test.sh
@@ -72,7 +72,7 @@ find . -type f -regex ".*\.tar\.\(gz\|xz\)" -delete
 find vtr_flow/tasks/regression_tests/vtr_reg_nightly_test1/ -type f -print0 | xargs -0 -P $(nproc) gzip
 find vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/ -type f -print0 | xargs -0 -P $(nproc) gzip
 find vtr_flow/tasks/regression_tests/vtr_reg_nightly_test3/ -type f -print0 | xargs -0 -P $(nproc) gzip
-
+find vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/ -type f -print0 | xargs -0 -P $(nproc) gzip
 
 # Make sure working directory doesn't exceed disk space limit!
 echo "Working directory size: $(du -sh)"

diff --git a/ODIN_II/regression_test/benchmark/suite/heavy_suite/task_list.conf b/ODIN_II/regression_test/benchmark/suite/heavy_suite/task_list.conf
@@ -1,3 +1,4 @@
 regression_test/benchmark/task/full
 regression_test/benchmark/task/large
+regression_test/benchmark/task/koios
 
diff --git a/ODIN_II/regression_test/benchmark/task/koios/synthesis_result.json b/ODIN_II/regression_test/benchmark/task/koios/synthesis_result.json
diff --git a/ODIN_II/regression_test/benchmark/task/koios/task.conf b/ODIN_II/regression_test/benchmark/task/koios/task.conf
@@ -0,0 +1,31 @@
+########################
+# koios benchmarks config
+########################
+
+regression_params=--disable_simulation --disable_parallel_jobs --verbose
+script_synthesis_params=--limit_ressource --time_limit 14400s
+script_simulation_params=--limit_ressource --time_limit 14400s
+
+# setup the architecture
+archs_dir=../vtr_flow/arch/COFFE_22nm
+
+# one arch allows it to run faster given it is single threaded
+arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml
+
+circuits_dir=../../../../vtr_flow/benchmarks/verilog/koios
+
+# explicitly list the koios benchmarks to run (each circuit once, to prevent duplicate run)
+circuit_list_add=tpu_like.small.v
+circuit_list_add=dla_like.small.v
+circuit_list_add=bnn.v
+circuit_list_add=attention_layer.v
+circuit_list_add=conv_layer_hls.v
+circuit_list_add=conv_layer.v
+circuit_list_add=gemm_layer.v
+circuit_list_add=eltwise_layer.v
+circuit_list_add=robot_rl.v
+circuit_list_add=reduction_layer.v
+circuit_list_add=spmv.v
+circuit_list_add=softmax.v
+
+synthesis_parse_file=regression_test/parse_result/conf/synth.toml
diff --git a/ODIN_II/regression_test/benchmark/task/large/synthesis_result.json b/ODIN_II/regression_test/benchmark/task/large/synthesis_result.json
@@ -1161,89 +1161,6 @@
         "Longest Path": 2,
         "Average Path": 2
     },
-    "large/tpu.16x16.int8/k6_frac_N10_frac_chain_mem32K_40nm": {
-        "test_name": "large/tpu.16x16.int8/k6_frac_N10_frac_chain_mem32K_40nm",
-        "architecture": "k6_frac_N10_frac_chain_mem32K_40nm.xml",
-        "verilog": "tpu.16x16.int8.v",
-        "warnings": [
-            "tpu.16x16.int8.v:2298:7 [AST] Odin does not handle signed REG (counter)"
-        ],
-        "max_rss(MiB)": 241.4,
-        "exec_time(ms)": 2140.9,
-        "synthesis_time(ms)": 2135.1,
-        "Latch Drivers": 1,
-        "Pi": 354,
-        "Po": 289,
-        "logic element": 59049,
-        "latch": 22362,
-        "Adder": 4988,
-        "Multiplier": 288,
-        "Memory": 256,
-        "generic logic size": 4,
-        "Longest Path": 1596,
-        "Average Path": 4,
-        "Estimated LUTs": 70497,
-        "Total Node": 86944
-    },
-    "large/tpu.32x32.int8/k6_frac_N10_frac_chain_mem32K_40nm": {
-        "test_name": "large/tpu.32x32.int8/k6_frac_N10_frac_chain_mem32K_40nm",
-        "architecture": "k6_frac_N10_frac_chain_mem32K_40nm.xml",
-        "verilog": "tpu.32x32.int8.v",
-        "warnings": [
-            "tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[0] is unused in module systolic_data_setup",
-            "tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[1] is unused in module systolic_data_setup",
-            "tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[2] is unused in module systolic_data_setup",
-            "tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[3] is unused in module systolic_data_setup",
-            "tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[4] is unused in module systolic_data_setup",
-            "tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[5] is unused in module systolic_data_setup",
-            "tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[6] is unused in module systolic_data_setup",
-            "tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[7] is unused in module systolic_data_setup",
-            "tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[0] is unused in module output_logic",
-            "tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[1] is unused in module output_logic",
-            "tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[2] is unused in module output_logic",
-            "tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[3] is unused in module output_logic",
-            "tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[4] is unused in module output_logic",
-            "tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[5] is unused in module output_logic",
-            "tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[6] is unused in module output_logic",
-            "tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[7] is unused in module output_logic",
-            "tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[0] is unused in module matmul_32x32_systolic",
-            "tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[1] is unused in module matmul_32x32_systolic",
-            "tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[2] is unused in module matmul_32x32_systolic",
-            "tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[3] is unused in module matmul_32x32_systolic",
-            "tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[4] is unused in module matmul_32x32_systolic",
-            "tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[5] is unused in module matmul_32x32_systolic",
-            "tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[6] is unused in module matmul_32x32_systolic",
-            "tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[7] is unused in module matmul_32x32_systolic"
-        ],
-        "max_rss(MiB)": 811.1,
-        "exec_time(ms)": 8389.9,
-        "synthesis_time(ms)": 8384.1,
-        "Latch Drivers": 1,
-        "Pi": 642,
-        "Po": 545,
-        "logic element": 190121,
-        "latch": 85146,
-        "Adder": 18297,
-        "Multiplier": 1088,
-        "Memory": 512,
-        "generic logic size": 4,
-        "Longest Path": 3164,
-        "Average Path": 4,
-        "Estimated LUTs": 209209,
-        "Total Node": 295165
-    },
-    "large/matmul_8x8_fp16/k6_frac_N10_frac_chain_mem32K_40nm": {
-        "test_name": "large/matmul_8x8_fp16/k6_frac_N10_frac_chain_mem32K_40nm",
-        "architecture": "k6_frac_N10_frac_chain_mem32K_40nm.xml",
-        "verilog": "matmul_8x8_fp16.v",
-        "exit": 134,
-        "errors": [
-            "matmul_8x8_fp16.v:1648:1 [AST] Can't find module name mac_fp"
-        ],
-        "warnings": [
-            "matmul_8x8_fp16.v:1212:7 [AST] Odin does not handle signed REG (counter)"
-        ]
-    },
     "DEFAULT": {
         "test_name": "n/a",
         "architecture": "n/a",

diff --git a/doc/src/vtr/benchmarks.rst b/doc/src/vtr/benchmarks.rst
@@ -38,9 +38,7 @@ They are suitable for FPGA architecture research and medium-scale CAD research.
     stereovision0       Computer Vision
     stereovision1       Computer Vision
     stereovision2       Computer Vision
-    stereovision3       Computer Vision
-    tpu.32x32.int8      Deep Learning
-    tpu.16x16.int8      Deep Learning
+    stereovision3       Computer Vision    
     ================    =================
 
 The VTR benchmarks are provided as Verilog under: ::
@@ -66,6 +64,51 @@ The Titan benchmarks are suitable for large-scale FPGA CAD research, and FPGA ar
 
 .. seealso:: :ref:`titan_benchmarks_tutorial`
 
+Koios Benchmarks
+-----------------
+The Koios benchmarks :cite:`koios_benchmarks` are a set of Deep Learning (DL) benchmarks. 
+They are suitable for DL related architecture and CAD research.
+There are 19 designs that include several medium-sized benchmarks and some large benchmarks.
+The designs target different network types (CNNs, RNNs, MLPs, RL) and layer types (fully-connected, convolution, activation, softmax, reduction, eltwise).
+Some of the designs are generated from HLS tools as well.
+These designs use many precisions including binary, different fixed point types int8/16/32, brain floating point (bfloat16), and IEEE half-precision floating point (fp16).
+
+.. _table_koios_benchmarks:
+
+.. table:: The Koios Benchmarks.  
+
+    =================   ======================================
+    Benchmark           Description
+    =================   ======================================
+    clstm_like          CLSTM-like accelerator
+    dla_like            Intel-DLA-like accelerator
+    lstm                LSTM engine
+    tpu_like            Google-TPU-v1-like accelerator
+    bnn                 4-layer binary neural network
+    tiny_darknet_like   Accelerator for Tiny Darknet    
+    gemm_layer          20x20 matrix multiplication engine
+    attention_layer     Transformer self-attention layer
+    conv_layer          GEMM based convolution
+    spmv                Sparse matrix vector multiplication
+    robot_rl            Robot+maze application     
+    reduction_layer     Add/max/min reduction tree
+    softmax             Softmax classification layer
+    conv_layer_hls      Sliding window convolution
+    eltwise_layer       Matrix elementwise add/sub/mult  
+    =================   ======================================
+
+Koios benchmarks are fully compatible with the full VTR flow. Some Koios benchmarks use advanced DSP features that are available in only a few FPGA architectures provided with VTR. This is because they instantiate DSP macros to implement native FP16 multiplications or use the hard dedicated chains, and these are architecture-specific. If users want to use a different FPGA architecture file, they can replace the macro instantiations in the benchmarks with their equivalents from the FPGA architectures they wish to use.
+
+Alternatively, users can disable these advanced features. The macro ``complex_dsp`` can be used for this purpose. If complex_dsp is defined in a benchmark file (using ```define complex_dsp`` at the beginning of the benchmark file), then the advanced DSP features mentioned above will be used. If a user wants to run a Koios benchmark with FPGA architectures that don't have these advanced DSP features (for example, the flagship architectures: ``$VTR_ROOT/vtr_flow/arch/timing/k6_frac_N10_*_mem32K_40nm*``), then they can remove the line defining the complex_dsp macro. This enables the same functionality with behavioral Verilog that is mapped to the FPGA soft logic when an architecture without the required macro definitions is used.
+
+The Koios benchmarks are provided as Verilog (enabling full flexibility to modify and change how the designs are implemented) under: ::
+
+    $VTR_ROOT/vtr_flow/benchmarks/verilog/koios
+
+The FPGA architectures with advanced DSP that work out-of-the-box with Koios benchmarks are available here: ::
+
+    $VTR_ROOT/vtr_flow/arch/COFFE_22nm/k6FracN10LB_mem20K_complexDSP_customSB_22nm.*
+
 MCNC20 Benchmarks
 -----------------
 The MCNC benchmarks :cite:`mcnc_benchmarks` are a set of small and old (circa 1991) benchmarks.
@@ -114,3 +157,4 @@ where :math:`K=` ``<#>``.
     spla        2278
     tseng       1583
     =========   ========================================
+
diff --git a/doc/src/z_references.bib b/doc/src/z_references.bib
@@ -415,3 +415,11 @@ @ARTICLE{murray_micro_symbiflow
 	number={},
 	pages={1-1}
 }
+
+@inproceedings{koios_benchmarks,
+  title={Koios: A Deep Learning Benchmark Suite for FPGA Architecture and CAD Research},
+  author={Arora, Aman and Boutros, Andrew and Rauch, Daniel and Rajen, Aishwarya and Borda, Aatman and Damghani, Seyed A. and Mehta, Samidh and Kate, Sangram and Patel, Pragnesh and Kent, Kenneth B. and Betz, Vaughn and John, Lizy K.},
+  booktitle={International Conference on Field Programmable Logic and Applications (FPL)},
+  year={2021}
+}
+