diff --git a/ODIN_II/regression_test/benchmark/task/koios/task.conf b/ODIN_II/regression_test/benchmark/task/koios/task.conf index f0bea875027..57f03554c4b 100644 --- a/ODIN_II/regression_test/benchmark/task/koios/task.conf +++ b/ODIN_II/regression_test/benchmark/task/koios/task.conf @@ -1,20 +1,29 @@ ######################## -# large benchmarks config +# Koios benchmarks config ######################## regression_params=--disable_simulation --disable_parallel_jobs --verbose script_synthesis_params=--limit_ressource --time_limit 14400s script_simulation_params=--limit_ressource --time_limit 14400s -# setup the architecture +#------------------------------------------------------- +# specify the directory to look for architecture file in +#------------------------------------------------------- archs_dir=../vtr_flow/arch/COFFE_22nm -# one arch allows it to run faster given it is single threaded +#------------------------------------------------------- +# specify the architecture file +#------------------------------------------------------- arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml +#------------------------------------------------------- +# specify the directory to look for benchmarks in +#------------------------------------------------------- circuits_dir=../../../../vtr_flow/benchmarks/verilog/koios -# glob the large benchmark and the vtr ones to prevent duplicate run +#------------------------------------------------------- +# specify the benchmarks +#------------------------------------------------------- circuit_list_add=tpu_like.small.v circuit_list_add=dla_like.small.v circuit_list_add=bnn.v @@ -28,4 +37,18 @@ circuit_list_add=reduction_layer.v circuit_list_add=spmv.v circuit_list_add=softmax.v +#------------------------------------------------------- +# specify the directory to look for include file in +#------------------------------------------------------- +includes_dir=../../../../vtr_flow/benchmarks/verilog/koios + 
+#------------------------------------------------------- +# specify the include files +#------------------------------------------------------- +# Some benchmarks instantiate complex dsp blocks to implement features +# like native floating point math, cascade chains, etc. This functionality +# is guarded under the `complex_dsp` macro. The complex_dsp_include.v file +# defines this macro, thereby enabling instantiations of the complex dsp. +include_list_add=complex_dsp_include.v + synthesis_parse_file=regression_test/parse_result/conf/synth.toml diff --git a/README.developers.md b/README.developers.md index 110f7c0d080..2c77340d932 100644 --- a/README.developers.md +++ b/README.developers.md @@ -626,6 +626,48 @@ stratixiv_arch.timing.xml stereo_vision_stratixiv_arch_timing.blif 0208312 stratixiv_arch.timing.xml cholesky_mc_stratixiv_arch_timing.blif 0208312 success 140214 108592 67410 5444 121 90 -1 111 151 -1 -1 5221059 8.16972 -454610 -8.16972 1518597 15 0 0 2.38657e+08 21915.3 9.34704 -531231 -9.34704 0 0 211.12 364.32 490.24 6356252 -1 -1 ``` +### Example: Koios Benchmarks QoR Measurement + +The [Koios benchmarks](https://github.com/verilog-to-routing/vtr-verilog-to-routing/tree/master/vtr_flow/benchmarks/verilog/koios) are a group of Deep Learning benchmark circuits distributed with the VTR project. +They are provided as synthesizable Verilog and can be re-mapped to VTR supported architectures. They consist mostly of medium to large sized circuits from Deep Learning (DL). +They can be used for FPGA architecture exploration for DL and also for tuning CAD tools. + +A typical approach to evaluating an algorithm change would be to run the `koios` (or `koios_no_complex_dsp`) task from the nightly regression test (vtr_reg_nightly_test4) and the `koios` (or `koios_no_complex_dsp`) task from the weekly regression test (vtr_reg_weekly). The nightly test contains smaller benchmarks, whereas the large designs are in the weekly regression test.
To measure QoR for the entire benchmark suite, both nightly and weekly tests should be run and the results should be concatenated. + +The `koios` regression task runs these benchmarks with complex_dsp functionality enabled, whereas the `koios_no_complex_dsp` regression task runs these benchmarks without complex_dsp functionality. Normally, only the `koios` tasks should be enough for QoR. + +The following steps show a sequence of commands to run the `koios` tasks on the Koios benchmarks from both nightly and weekly regressions: + +```shell +#From the VTR root +$ cd vtr_flow/tasks + +#Run the Koios benchmarks +$ ../scripts/run_vtr_task.py regression_tests/vtr_reg_nightly_test4/koios & +$ ../scripts/run_vtr_task.py regression_tests/vtr_reg_weekly/koios & + +#Several hours later... they complete + +#Parse the results +$ ../scripts/python_libs/vtr/parse_vtr_task.py regression_tests/vtr_reg_nightly_test4/koios +$ ../scripts/python_libs/vtr/parse_vtr_task.py regression_tests/vtr_reg_weekly/koios + +#The run directory should now contain a summary parse_results.txt file +$ head -5 regression_tests/vtr_reg_nightly_test4/koios/<latest_run_dir>/parse_results.txt +arch circuit script_params vtr_flow_elapsed_time error odin_synth_time max_odin_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_time placed_wirelength_est place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width
routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml tpu_like.small.v common 2871.10 9.36 235096 5 619.21 -1 -1 159760 -1 -1 1119 355 14 -1 success v8.0.0-4161-g8f4b3e9ca release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-05-28T23:09:34 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 2568860 355 289 50215 41827 2 23224 2053 136 136 18496 dsp_top auto 1233.72 457725 91.70 0.38 7.24742 -105267 -7.24742 2.59789 14.13 0.101267 0.0738583 24.91 18.6865 -1 561916 17 5.92627e+08 1.03195e+08 4.09037e+08 22114.9 16.37 32.3744 25.1979 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml dla_like.small.v common 7527.41 42.24 729876 5 3941.31 -1 -1 630244 -1 -1 5545 194 828 -1 success v8.0.0-4161-g8f4b3e9ca release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-05-28T23:09:34 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 4409476 194 13 217044 174718 1 91037 6708 164 164 26896 memory auto 1604.22 969627 663.41 2.84 5.61569 -424718 -5.61569 5.61569 21.49 0.584073 0.385993 104.796 73.1698 -1 1450542 14 8.6211e+08 3.01197e+08 5.93540e+08 22068.0 53.97 132.203 96.049 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 
-1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml bnn.v common 2028.52 40.37 577472 3 240.94 -1 -1 513656 -1 -1 5695 260 0 -1 success v8.0.0-4161-g8f4b3e9ca release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-05-28T23:09:34 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 2195980 260 122 231647 179602 1 86181 6140 83 83 6889 clb auto 613.32 940951 503.35 2.87 6.4402 -131403 -6.4402 6.4402 5.41 0.753268 0.564332 85.331 60.8639 -1 1224690 16 2.13666e+08 1.74902e+08 1.51359e+08 21971.1 50.49 114.382 84.8538 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml attention_layer.v common 1330.99 11.83 1095592 7 59.16 -1 -1 560612 -1 -1 1248 1058 161 -1 success v8.0.0-4161-g8f4b3e9ca release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-05-28T23:09:34 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 1180420 1058 16 47407 39134 1 26605 2588 86 86 7396 dsp_top auto 728.70 234151 118.11 0.71 5.89837 -78343.6 -5.89837 5.89837 6.64 0.181478 0.146942 31.9659 24.5807 -1 366899 17 2.32446e+08 8.36361e+07 1.62201e+08 21930.9 16.25 40.6352 32.1556 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 + +$ head -5 regression_tests/vtr_reg_weekly/koios/<latest_run_dir>/parse_results.txt +arch circuit script_params vtr_flow_elapsed_time error odin_synth_time max_odin_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_time placed_wirelength_est place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est
placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml clstm_like.small.v common 19316.21 162.86 2395612 3 15651.23 -1 -1 1337084 -1 -1 9309 293 407 -1 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 4793656 293 290 374045 333664 1 99618 10661 152 152 23104 dsp_top auto 780.11 1482046 931.17 6.18 6.93257 -474705 -6.93257 6.93257 27.96 1.30491 1.10886 193.621 147.02 -1 1762769 16 7.41832e+08 4.07655e+08 5.09972e+08 22072.9 106.48 263.016 205.56 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml clstm_like.medium.v common 66156.51 483.84 4490632 3 60586.75 -1 -1 2480684 -1 -1 17641 293 784 -1 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 8819072 293 578 696105 624609 1 181497 19958 206 206
42436 dsp_top auto 1057.61 3181171 1581.42 7.89 8.18417 -1.24303e+06 -8.18417 8.18417 38.05 1.40496 0.993779 233.672 173.509 -1 3532357 18 1.36407e+09 7.68183e+08 9.34572e+08 22023.1 165.78 317.679 246.098 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml clstm_like.large.v common 101695.32 717.44 6547568 3 94597.42 -1 -1 3590044 -1 -1 25995 293 1161 -1 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 13040176 293 866 1018158 915547 1 263328 29277 248 248 61504 dsp_top auto 1503.62 5185716 1951.96 10.18 8.86387 -1.97516e+06 -8.86387 8.86387 46.08 1.71937 1.21506 288.67 213.718 -1 5523775 18 1.98856e+09 1.12933e+09 1.35238e+09 21988.4 224.14 391.303 303.49 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml lstm.v common 38901.96 35.76 651868 7 30532.88 -1 -1 606240 -1 -1 6626 17 305 -1 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 6036204 17 19 252939 204226 1 121211 7577 200 200 40000 dsp_top auto 4576.24 1453809 1136.46 3.95 8.38544 -386636 -8.38544 8.38544 54.87 0.944433 0.763732 237.758 176.282 -1 1876011 15 1.28987e+09 3.81683e+08 8.80433e+08 22010.8 77.53 284.979 214.902 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +``` + ## Comparing QoR Measurements Once you have two (or more) sets of QoR measurements they now need to be compared.
diff --git a/doc/src/vtr/benchmarks.rst b/doc/src/vtr/benchmarks.rst index 93bd6864c78..d3fe85f3622 100644 --- a/doc/src/vtr/benchmarks.rst +++ b/doc/src/vtr/benchmarks.rst @@ -97,17 +97,12 @@ These designs use many precisions including binary, different fixed point types eltwise_layer Matrix elementwise add/sub/mult ================= ====================================== -Koios benchmarks are fully compatible with the full VTR flow. Some Koios benchmarks use advanced DSP features that are available in only a few FPGA architectures provided with VTR. This is because they instantiate DSP macros to implement native FP16 multiplications or use the hard dedicated chains, and these are architecture-specific. If users want to use a different FPGA architecture file, they can replace the macro instantiations in the benchmarks with their equivalents from the FPGA architectures they wish to use. - -Alternatively, users can disable these advanced features. The macro ``complex_dsp`` can be used for this purpose. If complex_dsp is defined in a benchmark file (using ```define complex_dsp`` in the beginning of the benchmark file), then advanced DSP features mentioned above will be used. If a user wants to run a Koios benchmark with FPGA architectures that don't have these advanced DSP features (for example, the flagship architectures: ``$VTR_ROOT/vtr_flow/arch/timing/k6_frac_N10_*_mem32K_40nm*``), then they can remove the line defining the complex_dsp macro. This enables the same functionality with behavioral Verilog that is mapped to the FPGA soft logic when an architecture without the required macro definitions is used. 
- The VTR benchmarks are provided as Verilog (enabling full flexibility to modify and change how the designs are implemented) under: :: $VTR_ROOT/vtr_flow/benchmarks/verilog/koios -The FPGA architectures with advanced DSP that work out-of-the-box with Koios benchmarks are available here: :: +To use these benchmarks, please see the documentation in the README file at: https://github.com/verilog-to-routing/vtr-verilog-to-routing/tree/master/vtr_flow/benchmarks/verilog/koios - $VTR_ROOT/vtr_flow/arch/COFFE_22nm/k6FracN10LB_mem20K_complexDSP_customSB_22nm.* MCNC20 Benchmarks ----------------- diff --git a/vtr_flow/benchmarks/verilog/koios/README.md b/vtr_flow/benchmarks/verilog/koios/README.md index 9532b35d04d..531f9d86c46 100644 --- a/vtr_flow/benchmarks/verilog/koios/README.md +++ b/vtr_flow/benchmarks/verilog/koios/README.md @@ -6,6 +6,39 @@ Koios benchmarks are a set of Deep Learning (DL) benchmarks for FPGA architectur ## Documentation A brief documentation of Koios benchmarks is available [here](https://docs.verilogtorouting.org/en/latest/vtr/benchmarks/#koios-benchmarks). +## How to Use +Koios benchmarks are fully compatible with the full VTR flow. They can be used using the standard VTR flow described [here](https://docs.verilogtorouting.org/en/latest/vtr/running_vtr/). + +Some Koios benchmarks use advanced DSP features that are available in only a few FPGA architectures provided with VTR. These benchmarks instantiate DSP macros to implement native FP16 or BF16 multiplications or use the hard dedicated chains, and these are architecture-specific. However, these advanced/complex DSP features can be enabled or disabled. The macro ``complex_dsp`` can be used for this purpose. If `complex_dsp` is defined in a benchmark file (using `` `define complex_dsp``), then advanced DSP features mentioned above will be used. If `complex_dsp` is not defined, then equivalent functionality is obtained through behavioral Verilog that gets mapped to the FPGA soft logic. 
+ +From a flow perspective, a feature was recently added in VTR (June 2021) that makes it easy to enable/disable a macro (like `complex_dsp`). The feature provides for specifying a separate Verilog header file while running a flow/task, so a benchmark's Verilog file doesn't have to be modified. For `run_vtr_flow` users, `-include <filename>` needs to be added. For `run_vtr_task` users, `includes_dir` and `include_list_add` need to be specified in the task file. An example task file can be seen [here](https://github.com/verilog-to-routing/vtr-verilog-to-routing/blob/master/vtr_flow/tasks/regression_tests/vtr_reg_basic/hdl_include/config/config.txt). + +Using such advanced DSP features is common in modern designs used with contemporary FPGAs. When using these benchmarks and enabling these advanced features, an FPGA architecture that supports these features must be provided. Supporting these features implies that the architecture XML file provided to VTR must describe such features (e.g. by defining a hard block macro DSP slice). We provide such architectures with Koios. The FPGA architectures with advanced DSP that work out-of-the-box with Koios benchmarks are available here: + + $VTR_ROOT/vtr_flow/arch/COFFE_22nm/k6FracN10LB_mem20K_complexDSP_customSB_22nm.* + + +When disabling these advanced features (by not defining `complex_dsp` as mentioned above), users can run these benchmarks with FPGA architectures that don't have these advanced DSP features. That is, an architecture XML file without the required hard macro definitions can be used. For example, the flagship architectures available here: + + $VTR_ROOT/vtr_flow/arch/timing/k6_frac_N10_*_mem32K_40nm* + +If users want to use a different FPGA architecture file, they can replace the macro instantiations in the benchmarks with their equivalents from the FPGA architectures they wish to use.
+ +## Regressions +Koios benchmarks are tested by the following regression tests in VTR: +| Suite |Test Description | Config file | Wall-clock time | +|---------------|----------------------|---------------|-------------------| +| Strong | A test circuit. Goal is to check the architecture files. | tasks/regression_tests/vtr_reg_strong/koios | 6 seconds | +| Strong | Same test circuit without enabling complex dsp features | tasks/regression_tests/vtr_reg_strong/koios_no_complex_dsp | 6 seconds| +| Nightly | Small-to-medium sized designs from Koios run with one arch file | tasks/regression_tests/vtr_reg_nightly_test4/koios | 2 hours with -j3 | +| Nightly | Small-to-medium sized designs from Koios run with an arch file without enabling complex dsp features | tasks/regression_tests/vtr_reg_nightly_test4/koios_no_complex_dsp | 2 hours with -j3 | +| Nightly | A small design from Koios run with various flavors of the arch file that enables complex dsp features | tasks/regression_tests/vtr_reg_nightly_test4/koios_multi_arch | 2 hours with -j3 | +| Weekly | Large designs from Koios run with one arch file | tasks/regression_tests/vtr_reg_weekly/koios | a little over 24 hours with -j4 | +| Weekly | Large designs from Koios run with an arch file without enabling complex dsp features | tasks/regression_tests/vtr_reg_weekly/koios_no_complex_dsp | a little over 24 hours with -j4 | + +## Collecting QoR measurements +For collecting QoR measurements on Koios benchmarks, follow the instructions [here](https://docs.verilogtorouting.org/en/latest/dev/developing/#collecting-qor-measurements). 
+ ## How to Cite The following paper may be used as a citation for Koios: diff --git a/vtr_flow/benchmarks/verilog/koios/attention_layer.v b/vtr_flow/benchmarks/verilog/koios/attention_layer.v index 46019c0c26f..dfb339513ec 100644 --- a/vtr_flow/benchmarks/verilog/koios/attention_layer.v +++ b/vtr_flow/benchmarks/verilog/koios/attention_layer.v @@ -4,7 +4,7 @@ //`define SIMULATION_MEMORY //`define SIMULATION_addfp -`define complex_dsp + `define VECTOR_DEPTH 64 //Q,K,V vector size `define DATA_WIDTH 16 `define VECTOR_BITS 1024 // 16 bit each (16x64) diff --git a/vtr_flow/benchmarks/verilog/koios/complex_dsp_include.v b/vtr_flow/benchmarks/verilog/koios/complex_dsp_include.v new file mode 100644 index 00000000000..da4576883e7 --- /dev/null +++ b/vtr_flow/benchmarks/verilog/koios/complex_dsp_include.v @@ -0,0 +1 @@ +`define complex_dsp diff --git a/vtr_flow/benchmarks/verilog/koios/conv_layer_hls.v b/vtr_flow/benchmarks/verilog/koios/conv_layer_hls.v index fb87a4e6ff7..28a37ef4272 100644 --- a/vtr_flow/benchmarks/verilog/koios/conv_layer_hls.v +++ b/vtr_flow/benchmarks/verilog/koios/conv_layer_hls.v @@ -18,7 +18,7 @@ // Abridged for VTR by: Daniel Rauch ////////////////////////////////////////////////////////////////////////////// -`define complex_dsp + module dpram (     clk, diff --git a/vtr_flow/benchmarks/verilog/koios/dla_like.medium.v b/vtr_flow/benchmarks/verilog/koios/dla_like.medium.v index 160e8f540bb..926810e5481 100644 --- a/vtr_flow/benchmarks/verilog/koios/dla_like.medium.v +++ b/vtr_flow/benchmarks/verilog/koios/dla_like.medium.v @@ -15,7 +15,7 @@ //4. Double-buffering after each layer. 
/////////////////////////////////////////////////////////////////////////////// -`define complex_dsp + module DLA ( input clk, input i_reset, diff --git a/vtr_flow/benchmarks/verilog/koios/dla_like.small.v b/vtr_flow/benchmarks/verilog/koios/dla_like.small.v index e7ff1dce971..d766ee6fbdc 100644 --- a/vtr_flow/benchmarks/verilog/koios/dla_like.small.v +++ b/vtr_flow/benchmarks/verilog/koios/dla_like.small.v @@ -15,7 +15,7 @@ //4. Double-buffering after each layer. /////////////////////////////////////////////////////////////////////////////// -`define complex_dsp + module DLA ( input clk, input i_reset, diff --git a/vtr_flow/benchmarks/verilog/koios/eltwise_layer.v b/vtr_flow/benchmarks/verilog/koios/eltwise_layer.v index 0b745ea8b0b..7fbd9a0be1e 100644 --- a/vtr_flow/benchmarks/verilog/koios/eltwise_layer.v +++ b/vtr_flow/benchmarks/verilog/koios/eltwise_layer.v @@ -56,7 +56,7 @@ //section by section. The number of rows will be programmed //in the "iterations" register in the design. -`define complex_dsp + `define BFLOAT16 // IEEE Half Precision => EXPONENT = 5, MANTISSA = 10 diff --git a/vtr_flow/benchmarks/verilog/koios/gemm_layer.v b/vtr_flow/benchmarks/verilog/koios/gemm_layer.v index 70911f25983..57258791ea1 100644 --- a/vtr_flow/benchmarks/verilog/koios/gemm_layer.v +++ b/vtr_flow/benchmarks/verilog/koios/gemm_layer.v @@ -19,7 +19,7 @@ // with a simpler DSP (just a fixed point multiplier) like in the // flagship arch timing/k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml ///////////////////////////////////////////////////////////////////////// -`define complex_dsp + `define BFLOAT16 // IEEE Half Precision => EXPONENT = 5, MANTISSA = 10 diff --git a/vtr_flow/benchmarks/verilog/koios/softmax.v b/vtr_flow/benchmarks/verilog/koios/softmax.v index 6a27414fd09..3d1bea1f55d 100644 --- a/vtr_flow/benchmarks/verilog/koios/softmax.v +++ b/vtr_flow/benchmarks/verilog/koios/softmax.v @@ -14,7 +14,7 @@ 
////////////////////////////////////////////////////////////////////////////// //softmax_p8_smem_rfloat16_alut_v512_b2_-0.1_0.1.v -`define complex_dsp + `ifndef DEFINES_DONE `define DEFINES_DONE `define EXPONENT 5 diff --git a/vtr_flow/benchmarks/verilog/koios/test.v b/vtr_flow/benchmarks/verilog/koios/test.v index 0764c7b135a..2c46ed784d9 100644 --- a/vtr_flow/benchmarks/verilog/koios/test.v +++ b/vtr_flow/benchmarks/verilog/koios/test.v @@ -12,7 +12,6 @@ ///////////////////////////////////////////////////////// -`define complex_dsp `define BFLOAT16 // IEEE Half Precision => EXPONENT = 5, MANTISSA = 10 diff --git a/vtr_flow/benchmarks/verilog/koios/tiny_darknet_like.small.v b/vtr_flow/benchmarks/verilog/koios/tiny_darknet_like.small.v index 2b27c3e044c..d8a2b98b22e 100644 --- a/vtr_flow/benchmarks/verilog/koios/tiny_darknet_like.small.v +++ b/vtr_flow/benchmarks/verilog/koios/tiny_darknet_like.small.v @@ -16,7 +16,7 @@ ////////////////////////////////////////////////////////////////////////////// `timescale 1 ns / 1 ps -`define complex_dsp + module td_fused_top_Block_entry_proc_proc392 ( ap_clk, ap_rst, diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios/config/config.txt index 10bd5b8a3b1..7f44d68f0c0 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios/config/config.txt @@ -9,6 +9,9 @@ circuits_dir=benchmarks/verilog/koios # Path to directory of architectures to use archs_dir=arch/COFFE_22nm +# Directory containing the verilog includes file(s) +includes_dir=benchmarks/verilog/koios + # Add circuits to list to sweep circuit_list_add=tpu_like.small.v circuit_list_add=dla_like.small.v @@ -26,6 +29,13 @@ circuit_list_add=softmax.v # Add architectures to list to sweep arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml +# Add include files to the list. 
+# Some benchmarks instantiate complex dsp blocks to implement features +# like native floating point math, cascade chains, etc. This functionality +# is guarded under the `complex_dsp` macro. The complex_dsp_include.v file +# defines this macro, thereby enabling instantiations of the complex dsp. +include_list_add=complex_dsp_include.v + # Parse info and how to parse parse_file=vpr_standard.txt diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios_multi_arch/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios_multi_arch/config/config.txt index b2e37c5acdb..6a860519f36 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios_multi_arch/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios_multi_arch/config/config.txt @@ -9,6 +9,9 @@ circuits_dir=benchmarks/verilog/koios # Path to directory of architectures to use archs_dir=arch/COFFE_22nm +# Directory containing the verilog includes file(s) +includes_dir=benchmarks/verilog/koios + # Add circuits to list to sweep circuit_list_add=conv_layer.v @@ -25,6 +28,13 @@ arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.clustered.xml arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.clustered.densest.xml arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.clustered.denser.xml +# Add include files to the list. +# Some benchmarks instantiate complex dsp blocks to implement features +# like native floating point math, cascade chains, etc. This functionality +# is guarded under the `complex_dsp` macro. The complex_dsp_include.v file +# defines this macro, thereby enabling instantiations of the complex dsp. 
+include_list_add=complex_dsp_include.v + # Parse info and how to parse parse_file=vpr_standard.txt diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios_no_complex_dsp/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios_no_complex_dsp/config/config.txt new file mode 100644 index 00000000000..a74f2b38090 --- /dev/null +++ b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios_no_complex_dsp/config/config.txt @@ -0,0 +1,43 @@ +# +############################################ +# Configuration file for running experiments +############################################## + +# Path to directory of circuits to use +circuits_dir=benchmarks/verilog/koios + +# Path to directory of architectures to use +archs_dir=arch/timing + +# Add circuits to list to sweep. +# Some of these benchmarks are designs with complex_dsp blocks +# but in this task, we're running them without enabling these +# blocks (that is, the macro `complex_dsp` is not defined). +# Equivalent functionality is obtained through behavioral logic +# that gets mapped to soft logic. 
+circuit_list_add=tpu_like.small.v +circuit_list_add=dla_like.small.v +circuit_list_add=bnn.v +circuit_list_add=attention_layer.v +circuit_list_add=conv_layer_hls.v +circuit_list_add=conv_layer.v +circuit_list_add=eltwise_layer.v +circuit_list_add=robot_rl.v +circuit_list_add=reduction_layer.v +circuit_list_add=spmv.v +circuit_list_add=softmax.v + +# Add architectures to list to sweep +arch_list_add=k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml + +# Parse info and how to parse +parse_file=vpr_standard.txt + +# How to parse QoR info +qor_parse_file=qor_standard.txt + +# Pass requirements +pass_requirements_file=pass_requirements.txt + +#Script parameters +script_params=-track_memory_usage -crit_path_router_iterations 100 --route_chan_width 300 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios_no_complex_dsp/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios_no_complex_dsp/config/golden_results.txt new file mode 100644 index 00000000000..e1830981aa7 --- /dev/null +++ b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/koios_no_complex_dsp/config/golden_results.txt @@ -0,0 +1,12 @@ +arch circuit script_params vtr_flow_elapsed_time error odin_synth_time max_odin_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_time placed_wirelength_est place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time 
place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml tpu_like.small.v common 849.39 7.94 237496 5 472.04 -1 -1 168564 -1 -1 1415 355 8 276 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 1539976 355 289 50183 41795 2 23394 2343 96 96 9216 mult_36 auto 37.35 342671 75.50 0.39 7.35345 -98715.8 -7.35345 7.35345 5.80 0.0984607 0.0720276 18.7385 13.6354 -1 415473 17 5.62531e+08 1.89934e+08 1.76662e+08 19169.0 35.06 24.3552 18.4839 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml dla_like.small.v common 4146.37 45.50 678656 5 2947.67 -1 -1 615152 -1 -1 5933 194 420 128 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 3963000 194 13 224191 176897 1 82017 6688 146 146 21316 memory auto 156.04 752361 358.07 2.42 8.99314 -211861 -8.99314 8.99314 16.32 0.279894 0.221952 47.1808 34.387 -1 1069662 14 1.33143e+09 6.00581e+08 4.13415e+08 19394.6 121.58 64.1925 49.1802 -1 -1 -1 -1 -1 -1 
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml bnn.v common 1730.24 37.56 578396 3 285.56 -1 -1 517032 -1 -1 6110 260 0 63 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 2282864 260 122 231646 179601 1 81379 6555 93 93 8649 clb auto 378.96 939392 501.29 2.68 8.31531 -186585 -8.31531 8.31531 6.89 0.426506 0.344701 80.581 56.3638 -1 1170654 15 5.27943e+08 3.54222e+08 1.65793e+08 19169.0 50.10 103.678 75.7731 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml attention_layer.v common 768.30 14.93 1108436 7 36.79 -1 -1 564864 -1 -1 1353 1058 99 104 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 879172 1058 16 47201 38874 1 26103 2630 68 68 4624 memory auto 193.05 209559 131.46 0.90 6.72901 -105268 -6.72901 6.72901 4.37 0.189956 0.156694 30.7649 24.059 -1 313048 15 2.79226e+08 1.68351e+08 8.76559e+07 18956.7 80.08 38.8565 31.1229 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml conv_layer_hls.v common 1781.99 182.88 9217232 7 196.70 -1 -1 4563828 -1 -1 1868 1016 21 2 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 1681248 1016 2283 15881 16829 1 8739 5190 106 106 11236 io auto 47.11 79645 123.60 0.74 11.8372 -24843 -11.8372 11.8372 13.33 0.148091 0.13741 32.2291 29.3036 -1 111172 15 6.92108e+08 1.12977e+08 2.16447e+08 19263.7 11.15 38.4259 35.0858 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml conv_layer.v common 821.23 15.23 229576 4 
514.20 -1 -1 127936 -1 -1 1190 91 28 84 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 672144 91 65 45573 38071 2 19420 1458 56 56 3136 mult_36 auto 38.79 186387 87.99 0.37 5.47591 -127457 -5.47591 5.39861 2.68 0.133663 0.0989283 24.0914 17.7419 -1 249207 18 1.8697e+08 1.12744e+08 5.89565e+07 18799.9 49.29 31.121 23.7521 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml eltwise_layer.v common 339.66 47.20 436556 5 67.93 -1 -1 89892 -1 -1 835 152 36 6 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 422952 152 97 24680 20968 2 10728 1126 44 44 1936 memory auto 38.72 126968 61.18 0.59 5.08142 -68449.5 -5.08142 4.47836 1.74 0.143398 0.0998815 17.7283 12.7572 -1 171182 15 1.12988e+08 6.71067e+07 3.58735e+07 18529.7 50.09 24.0332 18.066 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml robot_rl.v common 842.08 423.14 366520 15 98.92 -1 -1 155680 -1 -1 1358 3 48 18 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 550096 3 384 29480 27404 1 14788 1811 50 50 2500 memory auto 37.95 140977 132.63 0.84 9.47507 -46908.5 -9.47507 9.47507 2.57 0.133717 0.110025 20.8378 15.2052 -1 228094 16 1.47946e+08 1.06623e+08 4.69174e+07 18767.0 26.00 28.7586 21.8829 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml reduction_layer.v common 215.12 2.00 90856 6 47.78 -1 -1 71424 -1 -1 808 37 32 0 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 
2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 398092 37 17 18215 15970 1 9626 894 44 44 1936 memory auto 27.94 122488 52.35 0.42 6.72068 -43919 -6.72068 6.72068 1.07 0.0888394 0.0637534 10.6471 7.70209 -1 188691 15 1.12988e+08 6.10834e+07 3.58735e+07 18529.7 23.40 14.4615 10.9793 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml spmv.v common 812.93 27.07 1508296 6 72.44 -1 -1 654316 -1 -1 506 2 238 32 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 1727652 2 17 16149 14237 1 8674 795 108 108 11664 memory auto 23.75 208470 35.28 0.19 7.12111 -55062.3 -7.12111 7.12111 12.36 0.0785605 0.0648064 13.5422 9.99265 -1 251155 13 7.15542e+08 1.70364e+08 2.24153e+08 19217.5 196.53 16.4832 12.5461 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_depop50_mem32K_40nm.xml softmax.v common 351.18 30.38 453032 10 125.60 -1 -1 104052 -1 -1 1043 402 0 8 success v8.0.0-4470-ge625fdfe9 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-18T14:02:07 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 397940 402 150 25654 23042 1 11116 1603 40 40 1600 clb auto 37.33 76216 75.35 0.50 7.79097 -15966.3 -7.79097 7.79097 1.32 0.111692 0.0831882 15.8057 11.6749 -1 123818 16 9.16046e+07 5.93804e+07 2.94502e+07 18406.4 8.96 21.8675 16.8152 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/task_list.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/task_list.txt index bcd82a1de86..3b29483a800 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/task_list.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/task_list.txt @@ -1,2 +1,3 @@ 
regression_tests/vtr_reg_nightly_test4/koios_multi_arch regression_tests/vtr_reg_nightly_test4/koios +regression_tests/vtr_reg_nightly_test4/koios_no_complex_dsp diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/config.txt index b00f3eff89c..7bb68bed48e 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/config.txt @@ -9,12 +9,22 @@ circuits_dir=benchmarks/verilog/koios # Path to directory of architectures to use archs_dir=arch/COFFE_22nm +# Directory containing the verilog includes file(s) +includes_dir=benchmarks/verilog/koios + # Add circuits to list to sweep circuit_list_add=test.v # Add architectures to list to sweep arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml +# Add include files to the list. +# Some benchmarks instantiate complex dsp blocks to implement features +# like native floating point math, cascade chains, etc. This functionality +# is guarded under the `complex_dsp` macro. The complex_dsp_include.v file +# defines this macro, thereby enabling instantiations of the complex dsp. 
+include_list_add=complex_dsp_include.v + # Parse info and how to parse parse_file=vpr_standard.txt diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios_no_complex_dsp/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios_no_complex_dsp/config/config.txt new file mode 100644 index 00000000000..ffd5f1f96b3 --- /dev/null +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios_no_complex_dsp/config/config.txt @@ -0,0 +1,33 @@ +# +############################################ +# Configuration file for running experiments +############################################## + +# Path to directory of circuits to use +circuits_dir=benchmarks/verilog/koios + +# Path to directory of architectures to use +archs_dir=arch/COFFE_22nm + +# Add circuits to list to sweep. +# Some of these benchmarks are designs with complex_dsp blocks +# but in this task, we're running them without enabling these +# blocks (that is, the macro `complex_dsp` is not defined). +# Equivalent functionality is obtained through behavioral logic +# that gets mapped to soft logic. 
+circuit_list_add=test.v + +# Add architectures to list to sweep +arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml + +# Parse info and how to parse +parse_file=vpr_standard.txt + +# How to parse QoR info +qor_parse_file=qor_standard.txt + +# Pass requirements +pass_requirements_file=pass_requirements.txt + +#Script parameters +script_params=-track_memory_usage diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios_no_complex_dsp/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios_no_complex_dsp/config/golden_results.txt new file mode 100644 index 00000000000..dcd45e2e33b --- /dev/null +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios_no_complex_dsp/config/golden_results.txt @@ -0,0 +1,2 @@ +arch circuit script_params vtr_flow_elapsed_time error odin_synth_time max_odin_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_time placed_wirelength_est place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed 
crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml test.v common 13.10 0.14 10892 1 0.09 -1 -1 38376 -1 -1 23 130 0 -1 success v8.0.0-3991-g2d0e717e7 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-23T17:20:33 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 60536 130 40 1203 1030 1 587 196 14 14 196 dsp_top auto 0.68 2534 0.53 0.00 5.32806 -555.517 -5.32806 5.32806 0.65 0.00143487 0.00122056 0.226575 0.190161 118 4958 20 4.93594e+06 1.40315e+06 1.68070e+06 8574.99 7.71 0.968945 0.831906 4601 19 1835 1910 311629 85505 6.80282 6.80282 -732.396 -6.80282 0 0 2.17123e+06 11077.7 0.42 0.17 0.0875814 0.0796773 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt index 21b84db1e2c..058abb05c9c 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt @@ -78,3 +78,5 @@ regression_tests/vtr_reg_strong/strong_unroute_analysis regression_tests/vtr_reg_strong/strong_verify_rr_graph regression_tests/vtr_reg_strong/strong_verify_rr_graph_bin regression_tests/vtr_reg_strong/strong_verify_rr_graph_titan +regression_tests/vtr_reg_strong/koios +regression_tests/vtr_reg_strong/koios_no_complex_dsp diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_weekly/koios/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_weekly/koios/config/config.txt index a0f8fd84fd5..9a12a9c8169 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_weekly/koios/config/config.txt +++ 
b/vtr_flow/tasks/regression_tests/vtr_reg_weekly/koios/config/config.txt @@ -9,6 +9,9 @@ circuits_dir=benchmarks/verilog/koios # Path to directory of architectures to use archs_dir=arch/COFFE_22nm +# Directory containing the verilog includes file(s) +includes_dir=benchmarks/verilog/koios + # Add circuits to list to sweep circuit_list_add=clstm_like.small.v circuit_list_add=clstm_like.medium.v @@ -22,6 +25,13 @@ circuit_list_add=dla_like.medium.v # Add architectures to list to sweep arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml +# Add include files to the list +# Some benchmarks instantiate complex dsp blocks to implement features +# like native floating point math, cascade chains, etc. This functionality +# is guarded under the `complex_dsp` macro. The complex_dsp_include.v file +# defines this macro, thereby enabling instantiations of the complex dsp. +include_list_add=complex_dsp_include.v + # Parse info and how to parse parse_file=vpr_standard.txt diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_weekly/koios_no_complex_dsp/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_weekly/koios_no_complex_dsp/config/config.txt new file mode 100644 index 00000000000..41c0fa33c44 --- /dev/null +++ b/vtr_flow/tasks/regression_tests/vtr_reg_weekly/koios_no_complex_dsp/config/config.txt @@ -0,0 +1,42 @@ +# +############################################ +# Configuration file for running experiments +############################################## + +# Path to directory of circuits to use +circuits_dir=benchmarks/verilog/koios + +# Path to directory of architectures to use +archs_dir=arch/COFFE_22nm + +# Add circuits to list to sweep. +# Some of these benchmarks are designs with complex_dsp blocks +# but in this task, we're running them without enabling these +# blocks (that is, the macro `complex_dsp` is not defined). +# Equivalent functionality is obtained through behavioral logic +# that gets mapped to soft logic. 
+circuit_list_add=clstm_like.small.v +circuit_list_add=clstm_like.medium.v +circuit_list_add=clstm_like.large.v +circuit_list_add=lstm.v +circuit_list_add=gemm_layer.v +circuit_list_add=tpu_like.medium.v +circuit_list_add=tiny_darknet_like.small.v +circuit_list_add=tiny_darknet_like.medium.v +circuit_list_add=dla_like.medium.v + +# Add architectures to list to sweep +arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml + +# Parse info and how to parse +parse_file=vpr_standard.txt + +# How to parse QoR info +qor_parse_file=qor_standard.txt + +# Pass requirements +pass_requirements_file=pass_requirements.txt + +#Script parameters +script_params=-track_memory_usage -crit_path_router_iterations 100 --route_chan_width 300 + diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_weekly/koios_no_complex_dsp/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_weekly/koios_no_complex_dsp/config/golden_results.txt new file mode 100644 index 00000000000..fe0e7ab373d --- /dev/null +++ b/vtr_flow/tasks/regression_tests/vtr_reg_weekly/koios_no_complex_dsp/config/golden_results.txt @@ -0,0 +1,10 @@ +arch circuit script_params vtr_flow_elapsed_time error odin_synth_time max_odin_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_time placed_wirelength_est place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width 
routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml clstm_like.small.v common 10473.59 150.75 2395672 3 8182.20 -1 -1 1336976 -1 -1 9309 293 407 -1 success v8.0.0-3991-g2d0e717e7 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-23T17:20:33 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 4793228 293 290 374045 333664 1 99618 10661 152 152 23104 dsp_top auto 531.60 1482046 550.12 3.20 6.93257 -474705 -6.93257 6.93257 15.50 0.610639 0.487079 106.558 80.4796 -1 1762769 16 7.41832e+08 4.07655e+08 5.09972e+08 22072.9 60.75 145.301 114.481 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml clstm_like.medium.v common 36368.03 351.33 4490844 3 31776.79 -1 -1 2480920 -1 -1 17641 293 784 -1 success v8.0.0-3991-g2d0e717e7 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-23T17:20:33 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 8877148 293 578 696105 624609 1 181497 19958 206 206 42436 dsp_top auto 995.79 3181171 1298.67 6.75 8.18417 -1.24303e+06 -8.18417 8.18417 32.57 1.2458 0.887226 207.902 154.969 -1 3532357 18 1.36407e+09 7.68183e+08 9.34572e+08 22023.1 140.23 282.605 219.984 -1 -1 -1 -1 -1 
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml clstm_like.large.v common 72872.35 611.13 6547740 3 66326.53 -1 -1 3589816 -1 -1 25995 293 1161 -1 success v8.0.0-3991-g2d0e717e7 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-23T17:20:33 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 13040044 293 866 1018158 915547 1 263328 29277 248 248 61504 dsp_top auto 1508.75 5185716 1821.05 9.62 8.86387 -1.97516e+06 -8.86387 8.86387 45.35 1.6894 1.19345 284.61 210.364 -1 5523775 18 1.98856e+09 1.12933e+09 1.35238e+09 21988.4 212.84 385.873 298.995 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml lstm.v common 21238.37 27.93 651712 7 15848.23 -1 -1 606560 -1 -1 6626 17 305 -1 success v8.0.0-3991-g2d0e717e7 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-23T17:20:33 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 6036064 17 19 252939 204226 1 121211 7577 200 200 40000 dsp_top auto 3284.75 1453809 643.98 2.31 8.38544 -386636 -8.38544 8.38544 31.65 0.479418 0.401282 131.276 98.2184 -1 1876011 15 1.28987e+09 3.81683e+08 8.80433e+08 22010.8 45.39 157.548 120.347 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml gemm_layer.v common 40198.18 3319.49 6711328 7 30574.71 -1 -1 1656804 -1 -1 23255 691 0 -1 success v8.0.0-3991-g2d0e717e7 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-23T17:20:33 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 7282816 691 1088 596478 520644 1 254252 25134 166 166 27556 clb auto 869.46 3551107 3053.62 15.54 9.92515 -521505 -9.92515 9.92515 20.55 1.44248 1.01705 203.131 146.626 -1 4615823 16 8.82626e+08 6.74286e+08 6.08100e+08 22067.8 1033.61 281.361 214.104 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 
+k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml tpu_like.medium.v common 11217.72 65.65 780508 5 7039.87 -1 -1 539548 -1 -1 4254 643 26 -1 success v8.0.0-3991-g2d0e717e7 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-23T17:20:33 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 9361264 643 545 177234 145386 2 82338 6532 264 264 69696 dsp_top auto 1277.07 2123426 288.43 1.65 10.1627 -557095 -10.1627 2.91086 65.86 0.307098 0.229864 52.5813 39.2266 -1 2472234 18 2.25492e+09 3.92304e+08 1.53224e+09 21984.7 46.43 71.2002 55.3506 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml tiny_darknet_like.small.v common 19291.69 885.90 2129832 6 10683.53 -1 -1 832868 -1 -1 7899 21 3978 -1 success v8.0.0-3991-g2d0e717e7 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-23T17:20:33 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 16748348 21 25 165154 154791 1 86126 11939 356 356 126736 memory auto 1308.76 2056119 1178.65 4.64 13.4713 -3.30107e+06 -13.4713 13.4713 137.56 0.900336 0.717 240.284 176.223 -1 2923550 15 4.10542e+09 7.72103e+08 2.78633e+09 21985.3 97.58 286.944 216.379 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml tiny_darknet_like.medium.v common 88040.81 3262.26 6220320 7 74014.64 -1 -1 1973912 -1 -1 17934 21 4400 -1 success v8.0.0-3991-g2d0e717e7 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-23T17:20:33 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 19902688 21 25 451230 416088 1 208255 22447 372 372 138384 memory auto 3210.96 3308175 2155.12 10.03 12.3106 -3.53919e+06 -12.3106 12.3106 134.29 1.57866 1.09798 285.817 203.077 -1 4740870 18 4.48577e+09 1.12316e+09 3.04235e+09 21984.9 148.78 370.733 276.178 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 
+k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml dla_like.medium.v common 21886.90 206.58 1720396 5 14797.99 -1 -1 1472172 -1 -1 12726 386 1008 -1 success v8.0.0-3991-g2d0e717e7 release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-124-generic x86_64 2021-06-23T17:20:33 jupiter0 /export/aman/vtr_aman/vtr-verilog-to-routing/vtr_flow/tasks 6446588 386 25 546898 438864 1 225712 14545 180 180 32400 memory auto 3306.64 3003810 1604.66 9.73 9.99656 -697281 -9.99656 9.99656 23.75 0.941921 0.755598 165.793 121.101 -1 3919972 19 1.04034e+09 5.95383e+08 7.14256e+08 22044.9 609.62 226.353 172.676 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_weekly/task_list.txt b/vtr_flow/tasks/regression_tests/vtr_reg_weekly/task_list.txt index 1edf2c54d63..0504eeb4f1d 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_weekly/task_list.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_weekly/task_list.txt @@ -4,4 +4,5 @@ regression_tests/vtr_reg_weekly/vtr_reg_qor_chain_predictor_off regression_tests/vtr_reg_weekly/vtr_reg_fpu_hard_block_arch regression_tests/vtr_reg_weekly/vtr_reg_fpu_soft_logic_arch regression_tests/vtr_reg_weekly/vpr_ispd -regression_tests/vtr_reg_weekly/koios \ No newline at end of file +regression_tests/vtr_reg_weekly/koios +regression_tests/vtr_reg_weekly/koios_no_complex_dsp