Skip to content

Commit 35f90c9

Browse files
authored
Merge pull request #1753 from aman26kbm/koios_benchmarks
Submitting Koios benchmarks
2 parents 4bfba43 + 5d75317 commit 35f90c9

File tree

53 files changed

+565535
-1818
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+565535
-1818
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Format: //devtools/kokoro/config/proto/build.proto
2+
3+
build_file: "vtr-verilog-to-routing/.github/kokoro/run-vtr.sh"
4+
5+
# 72 hours
6+
timeout_mins: 4320
7+
8+
action {
9+
define_artifacts {
10+
# File types
11+
regex: "**/*.out"
12+
regex: "**/vpr_stdout.log"
13+
regex: "**/parse_results.txt"
14+
regex: "**/qor_results.txt"
15+
regex: "**/pack.log"
16+
regex: "**/place.log"
17+
regex: "**/route.log"
18+
regex: "**/*_qor.csv"
19+
regex: "**/*.out.gz"
20+
regex: "**/vpr_stdout.log.gz"
21+
regex: "**/parse_results.txt.gz"
22+
regex: "**/qor_results.txt.gz"
23+
regex: "**/pack.log.gz"
24+
regex: "**/place.log.gz"
25+
regex: "**/route.log.gz"
26+
regex: "**/*_qor.csv.gz"
27+
strip_prefix: "github/vtr-verilog-to-routing/"
28+
}
29+
}
30+
31+
env_vars {
32+
key: "KOKORO_TYPE"
33+
value: "presubmit"
34+
}
35+
36+
env_vars {
37+
key: "KOKORO_DIR"
38+
value: "vtr-verilog-to-routing"
39+
}
40+
41+
env_vars {
42+
key: "VTR_DIR"
43+
value: "vtr-verilog-to-routing"
44+
}
45+
46+
#Use default build configuration
47+
env_vars {
48+
key: "VTR_CMAKE_PARAMS"
49+
value: ""
50+
}
51+
52+
env_vars {
53+
key: "VTR_TEST"
54+
value: "vtr_reg_nightly_test4"
55+
}
56+
57+
#Options for run_reg_test.py
58+
# -show_failures: show tool failures in main log output
59+
env_vars {
60+
key: "VTR_TEST_OPTIONS"
61+
value: "-show_failures"
62+
}
63+
64+
env_vars {
65+
key: "NUM_CORES"
66+
value: "8"
67+
}

.github/kokoro/steps/vtr-test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ find . -type f -regex ".*\.tar\.\(gz\|xz\)" -delete
7272
find vtr_flow/tasks/regression_tests/vtr_reg_nightly_test1/ -type f -print0 | xargs -0 -P $(nproc) gzip
7373
find vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/ -type f -print0 | xargs -0 -P $(nproc) gzip
7474
find vtr_flow/tasks/regression_tests/vtr_reg_nightly_test3/ -type f -print0 | xargs -0 -P $(nproc) gzip
75-
75+
find vtr_flow/tasks/regression_tests/vtr_reg_nightly_test4/ -type f -print0 | xargs -0 -P $(nproc) gzip
7676

7777
# Make sure working directory doesn't exceed disk space limit!
7878
echo "Working directory size: $(du -sh)"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
regression_test/benchmark/task/full
22
regression_test/benchmark/task/large
3+
regression_test/benchmark/task/koios
34

ODIN_II/regression_test/benchmark/task/koios/synthesis_result.json

Lines changed: 15674 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
########################
2+
# koios benchmarks config
3+
########################
4+
5+
regression_params=--disable_simulation --disable_parallel_jobs --verbose
6+
script_synthesis_params=--limit_ressource --time_limit 14400s
7+
script_simulation_params=--limit_ressource --time_limit 14400s
8+
9+
# setup the architecture
10+
archs_dir=../vtr_flow/arch/COFFE_22nm
11+
12+
# one arch allows it to run faster given it is single threaded
13+
arch_list_add=k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml
14+
15+
circuits_dir=../../../../vtr_flow/benchmarks/verilog/koios
16+
17+
# Koios benchmark circuits to run (listed explicitly)
18+
circuit_list_add=tpu_like.small.v
19+
circuit_list_add=dla_like.small.v
20+
circuit_list_add=bnn.v
21+
circuit_list_add=attention_layer.v
22+
circuit_list_add=conv_layer_hls.v
23+
circuit_list_add=conv_layer.v
24+
circuit_list_add=gemm_layer.v
25+
circuit_list_add=eltwise_layer.v
26+
circuit_list_add=robot_rl.v
27+
circuit_list_add=reduction_layer.v
28+
circuit_list_add=spmv.v
29+
circuit_list_add=softmax.v
30+
31+
synthesis_parse_file=regression_test/parse_result/conf/synth.toml

ODIN_II/regression_test/benchmark/task/large/synthesis_result.json

Lines changed: 0 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1161,89 +1161,6 @@
11611161
"Longest Path": 2,
11621162
"Average Path": 2
11631163
},
1164-
"large/tpu.16x16.int8/k6_frac_N10_frac_chain_mem32K_40nm": {
1165-
"test_name": "large/tpu.16x16.int8/k6_frac_N10_frac_chain_mem32K_40nm",
1166-
"architecture": "k6_frac_N10_frac_chain_mem32K_40nm.xml",
1167-
"verilog": "tpu.16x16.int8.v",
1168-
"warnings": [
1169-
"tpu.16x16.int8.v:2298:7 [AST] Odin does not handle signed REG (counter)"
1170-
],
1171-
"max_rss(MiB)": 241.4,
1172-
"exec_time(ms)": 2140.9,
1173-
"synthesis_time(ms)": 2135.1,
1174-
"Latch Drivers": 1,
1175-
"Pi": 354,
1176-
"Po": 289,
1177-
"logic element": 59049,
1178-
"latch": 22362,
1179-
"Adder": 4988,
1180-
"Multiplier": 288,
1181-
"Memory": 256,
1182-
"generic logic size": 4,
1183-
"Longest Path": 1596,
1184-
"Average Path": 4,
1185-
"Estimated LUTs": 70497,
1186-
"Total Node": 86944
1187-
},
1188-
"large/tpu.32x32.int8/k6_frac_N10_frac_chain_mem32K_40nm": {
1189-
"test_name": "large/tpu.32x32.int8/k6_frac_N10_frac_chain_mem32K_40nm",
1190-
"architecture": "k6_frac_N10_frac_chain_mem32K_40nm.xml",
1191-
"verilog": "tpu.32x32.int8.v",
1192-
"warnings": [
1193-
"tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[0] is unused in module systolic_data_setup",
1194-
"tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[1] is unused in module systolic_data_setup",
1195-
"tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[2] is unused in module systolic_data_setup",
1196-
"tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[3] is unused in module systolic_data_setup",
1197-
"tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[4] is unused in module systolic_data_setup",
1198-
"tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[5] is unused in module systolic_data_setup",
1199-
"tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[6] is unused in module systolic_data_setup",
1200-
"tpu.32x32.int8.v:1569:18 [NETLIST] This module port u_systolic_data_setup.final_mat_mul_size[7] is unused in module systolic_data_setup",
1201-
"tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[0] is unused in module output_logic",
1202-
"tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[1] is unused in module output_logic",
1203-
"tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[2] is unused in module output_logic",
1204-
"tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[3] is unused in module output_logic",
1205-
"tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[4] is unused in module output_logic",
1206-
"tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[5] is unused in module output_logic",
1207-
"tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[6] is unused in module output_logic",
1208-
"tpu.32x32.int8.v:2881:18 [NETLIST] This module port u_output_logic.final_mat_mul_size[7] is unused in module output_logic",
1209-
"tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[0] is unused in module matmul_32x32_systolic",
1210-
"tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[1] is unused in module matmul_32x32_systolic",
1211-
"tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[2] is unused in module matmul_32x32_systolic",
1212-
"tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[3] is unused in module matmul_32x32_systolic",
1213-
"tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[4] is unused in module matmul_32x32_systolic",
1214-
"tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[5] is unused in module matmul_32x32_systolic",
1215-
"tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[6] is unused in module matmul_32x32_systolic",
1216-
"tpu.32x32.int8.v:15361:1 [NETLIST] This module port u_matmul.final_mat_mul_size[7] is unused in module matmul_32x32_systolic"
1217-
],
1218-
"max_rss(MiB)": 811.1,
1219-
"exec_time(ms)": 8389.9,
1220-
"synthesis_time(ms)": 8384.1,
1221-
"Latch Drivers": 1,
1222-
"Pi": 642,
1223-
"Po": 545,
1224-
"logic element": 190121,
1225-
"latch": 85146,
1226-
"Adder": 18297,
1227-
"Multiplier": 1088,
1228-
"Memory": 512,
1229-
"generic logic size": 4,
1230-
"Longest Path": 3164,
1231-
"Average Path": 4,
1232-
"Estimated LUTs": 209209,
1233-
"Total Node": 295165
1234-
},
1235-
"large/matmul_8x8_fp16/k6_frac_N10_frac_chain_mem32K_40nm": {
1236-
"test_name": "large/matmul_8x8_fp16/k6_frac_N10_frac_chain_mem32K_40nm",
1237-
"architecture": "k6_frac_N10_frac_chain_mem32K_40nm.xml",
1238-
"verilog": "matmul_8x8_fp16.v",
1239-
"exit": 134,
1240-
"errors": [
1241-
"matmul_8x8_fp16.v:1648:1 [AST] Can't find module name mac_fp"
1242-
],
1243-
"warnings": [
1244-
"matmul_8x8_fp16.v:1212:7 [AST] Odin does not handle signed REG (counter)"
1245-
]
1246-
},
12471164
"DEFAULT": {
12481165
"test_name": "n/a",
12491166
"architecture": "n/a",

doc/src/vtr/benchmarks.rst

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@ They are suitable for FPGA architecture research and medium-scale CAD research.
3838
stereovision0 Computer Vision
3939
stereovision1 Computer Vision
4040
stereovision2 Computer Vision
41-
stereovision3 Computer Vision
42-
tpu.32x32.int8 Deep Learning
43-
tpu.16x16.int8 Deep Learning
41+
stereovision3 Computer Vision
4442
================ =================
4543

4644
The VTR benchmarks are provided as Verilog under: ::
@@ -66,6 +64,51 @@ The Titan benchmarks are suitable for large-scale FPGA CAD research, and FPGA ar
6664

6765
.. seealso:: :ref:`titan_benchmarks_tutorial`
6866

67+
Koios Benchmarks
68+
-----------------
69+
The Koios benchmarks :cite:`koios_benchmarks` are a set of Deep Learning (DL) benchmarks.
70+
They are suitable for DL related architecture and CAD research.
71+
There are 19 designs that include several medium-sized benchmarks and some large benchmarks.
72+
The designs target different network types (CNNs, RNNs, MLPs, RL) and layer types (fully-connected, convolution, activation, softmax, reduction, eltwise).
73+
Some of the designs are generated from HLS tools as well.
74+
These designs use many precisions including binary, different fixed point types int8/16/32, brain floating point (bfloat16), and IEEE half-precision floating point (fp16).
75+
76+
.. _table_koios_benchmarks:
77+
78+
.. table:: The Koios Benchmarks.
79+
80+
================= ======================================
81+
Benchmark Description
82+
================= ======================================
83+
clstm_like CLSTM-like accelerator
84+
dla_like Intel-DLA-like accelerator
85+
lstm LSTM engine
86+
tpu_like Google-TPU-v1-like accelerator
87+
bnn 4-layer binary neural network
88+
tiny_darknet_like Accelerator for Tiny Darknet
89+
gemm_layer 20x20 matrix multiplication engine
90+
attention_layer Transformer self-attention layer
91+
conv_layer GEMM based convolution
92+
spmv Sparse matrix vector multiplication
93+
robot_rl Robot+maze application
94+
reduction_layer Add/max/min reduction tree
95+
softmax Softmax classification layer
96+
conv_layer_hls Sliding window convolution
97+
eltwise_layer Matrix elementwise add/sub/mult
98+
================= ======================================
99+
100+
Koios benchmarks are fully compatible with the full VTR flow. Some Koios benchmarks use advanced DSP features that are available in only a few FPGA architectures provided with VTR. This is because they instantiate DSP macros to implement native FP16 multiplications or use the hard dedicated chains, and these are architecture-specific. If users want to use a different FPGA architecture file, they can replace the macro instantiations in the benchmarks with their equivalents from the FPGA architectures they wish to use.
101+
102+
Alternatively, users can disable these advanced features. The macro ``complex_dsp`` can be used for this purpose. If complex_dsp is defined in a benchmark file (using ```define complex_dsp`` at the beginning of the benchmark file), then advanced DSP features mentioned above will be used. If a user wants to run a Koios benchmark with FPGA architectures that don't have these advanced DSP features (for example, the flagship architectures: ``$VTR_ROOT/vtr_flow/arch/timing/k6_frac_N10_*_mem32K_40nm*``), then they can remove the line defining the complex_dsp macro. This enables the same functionality with behavioral Verilog that is mapped to the FPGA soft logic when an architecture without the required macro definitions is used.
103+
104+
The Koios benchmarks are provided as Verilog (enabling full flexibility to modify and change how the designs are implemented) under: ::
105+
106+
$VTR_ROOT/vtr_flow/benchmarks/verilog/koios
107+
108+
The FPGA architectures with advanced DSP that work out-of-the-box with Koios benchmarks are available here: ::
109+
110+
$VTR_ROOT/vtr_flow/arch/COFFE_22nm/k6FracN10LB_mem20K_complexDSP_customSB_22nm.*
111+
69112
MCNC20 Benchmarks
70113
-----------------
71114
The MCNC benchmarks :cite:`mcnc_benchmarks` are a set of small and old (circa 1991) benchmarks.
@@ -114,3 +157,4 @@ where :math:`K=` ``<#>``.
114157
spla 2278
115158
tseng 1583
116159
========= ========================================
160+

doc/src/z_references.bib

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,3 +415,11 @@ @ARTICLE{murray_micro_symbiflow
415415
number={},
416416
pages={1-1}
417417
}
418+
419+
@inproceedings{koios_benchmarks,
420+
title={Koios: A Deep Learning Benchmark Suite for FPGA Architecture and CAD Research},
421+
author={Arora, Aman and Boutros, Andrew and Rauch, Daniel and Rajen, Aishwarya and Borda, Aatman and Damghani, Seyed A. and Mehta, Samidh and Kate, Sangram and Patel, Pragnesh and Kent, Kenneth B. and Betz, Vaughn and John, Lizy K.},
422+
booktitle={International Conference on Field Programmable Logic and Applications (FPL)},
423+
year={2021}
424+
}
425+

0 commit comments

Comments
 (0)