Skip to content

Commit 6555900

Browse files
committed
Somehow these changes got left out in the previous commit
1 parent f34efa8 commit 6555900

File tree

3 files changed

+58
-71
lines changed

3 files changed

+58
-71
lines changed

doc/src/vtr/benchmarks.rst

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@ They are suitable for FPGA architecture research and medium-scale CAD research.
3838
stereovision0 Computer Vision
3939
stereovision1 Computer Vision
4040
stereovision2 Computer Vision
41-
stereovision3 Computer Vision
42-
tpu.32x32.int8 Deep Learning
43-
tpu.16x16.int8 Deep Learning
41+
stereovision3 Computer Vision
4442
================ =================
4543

4644
The VTR benchmarks are provided as Verilog under: ::
@@ -66,6 +64,51 @@ The Titan benchmarks are suitable for large-scale FPGA CAD research, and FPGA ar
6664

6765
.. seealso:: :ref:`titan_benchmarks_tutorial`
6866

67+
Koios Benchmarks
68+
-----------------
69+
The Koios benchmarks :cite:`koios_benchmarks` are a set of Deep Learning (DL) benchmarks.
70+
They are suitable for DL related architecture and CAD research.
71+
There are 19 designs that include several medium-sized benchmarks and some large benchmarks.
72+
The designs target different network types (CNNs, RNNs, MLPs, RL) and layer types (fully-connected, convolution, activation, softmax, reduction, eltwise).
73+
Some of the designs are generated from HLS tools as well.
74+
These designs use many precisions including binary, different fixed point types int8/16/32, brain floating point (bfloat16), and IEEE half-precision floating point (fp16).
75+
76+
.. _table_koios_benchmarks:
77+
78+
.. table:: The Koios Benchmarks.
79+
80+
================= ======================================
81+
Benchmark Description
82+
================= ======================================
83+
clstm_like CLSTM-like accelerator
84+
dla_like Intel-DLA-like accelerator
85+
lstm LSTM engine
86+
tpu_like Google-TPU-v1-like accelerator
87+
bnn 4-layer binary neural network
88+
tiny_darknet_like Accelerator for Tiny Darknet
89+
gemm_layer 20x20 matrix multiplication engine
90+
attention_layer Transformer self-attention layer
91+
conv_layer GEMM based convolution
92+
spmv Sparse matrix vector multiplication
93+
robot_rl Robot+maze application
94+
reduction_layer Add/max/min reduction tree
95+
softmax Softmax classification layer
96+
conv_layer_hls Sliding window convolution
97+
eltwise_layer Matrix elementwise add/sub/mult
98+
================= ======================================
99+
100+
Koios benchmarks are fully compatible with the full VTR flow. Some Koios benchmarks use advanced DSP features that are available in only a few FPGA architectures provided with VTR. This is because they instantiate DSP macros to implement native FP16 multiplications or use the hard dedicated chains, and these are architecture-specific. If users want to use a different FPGA architecture file, they can replace the macro instantiations in the benchmarks with their equivalents from the FPGA architectures they wish to use.
101+
102+
Alternatively, users can disable these advanced features. The macro ``complex_dsp`` can be used for this purpose. If ``complex_dsp`` is defined in a benchmark file (using ```define complex_dsp`` at the beginning of the benchmark file), then the advanced DSP features mentioned above will be used. If a user wants to run a Koios benchmark with FPGA architectures that don't have these advanced DSP features (for example, the flagship architectures: ``$VTR_ROOT/vtr_flow/arch/timing/k6_frac_N10_*_mem32K_40nm*``), then they can remove the line defining the ``complex_dsp`` macro. This enables the same functionality with behavioral Verilog that is mapped to the FPGA soft logic when an architecture without the required macro definitions is used.
103+
104+
The Koios benchmarks are provided as Verilog (enabling full flexibility to modify and change how the designs are implemented) under: ::
105+
106+
$VTR_ROOT/vtr_flow/benchmarks/verilog/koios
107+
108+
The FPGA architectures with advanced DSP that work out-of-the-box with Koios benchmarks are available here: ::
109+
110+
$VTR_ROOT/vtr_flow/arch/COFFE_22nm/k6FracN10LB_mem20K_complexDSP_customSB_22nm.*
111+
69112
MCNC20 Benchmarks
70113
-----------------
71114
The MCNC benchmarks :cite:`mcnc_benchmarks` are a set of small and old (circa 1991) benchmarks.

vtr_flow/benchmarks/verilog/tpu.32x32.int8.v renamed to vtr_flow/benchmarks/verilog/koios/tpu_like.medium.v

Lines changed: 6 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
//////////////////////////////////////////////////////////////////////////////
2+
// Author: Aman Arora
3+
//////////////////////////////////////////////////////////////////////////////
4+
15
`timescale 1ns / 1ps
26

37
///////////////////////////////////
@@ -58,6 +62,7 @@
5862
// Logic area (used): 1.72408e+08 MWTAs
5963
// Resource usage: 5033 LBs, 26 RAMs, 1072 Multipliers
6064
// Runtime (on Intel Xeon E5-2430 2.5GHz with single thread): 12500 sec
65+
// 3. 22nm architectures generated from COFFE. Example: arch/COFFE_22nm/k6n10LB_mem20K_complexDSP_customSB_22nm*
6166

6267
//////////////////////////////////////
6368
// Parameters
@@ -288,26 +293,6 @@
288293
// Matrix multiplication unit
289294
////////////////////////////////////
290295

291-
//////////////////////////////////////////////////////////////////////////////////
292-
// Company:
293-
// Engineer:
294-
//
295-
// Create Date: 2020-09-27 21:12:45.762386
296-
// Design Name:
297-
// Module Name: matmul_32x32_systolic
298-
// Project Name:
299-
// Target Devices:
300-
// Tool Versions:
301-
// Description:
302-
//
303-
// Dependencies:
304-
//
305-
// Revision:
306-
// Revision 0.01 - File Created
307-
// Additional Comments:
308-
//
309-
//////////////////////////////////////////////////////////////////////////////////
310-
311296
module matmul_32x32_systolic(
312297
clk,
313298
reset,
@@ -14174,36 +14159,23 @@ wire [2*`DWIDTH-1:0] mul_out_temp;
1417414159
reg [2*`DWIDTH-1:0] mul_out_temp_reg;
1417514160

1417614161
always @(posedge clk) begin
14177-
if (reset) begin
14178-
a_flopped <= 0;
14179-
b_flopped <= 0;
14180-
end else begin
1418114162
a_flopped <= a;
1418214163
b_flopped <= b;
14183-
end
1418414164
end
1418514165

1418614166
//assign mul_out = a * b;
1418714167
qmult mult_u1(.i_multiplicand(a_flopped), .i_multiplier(b_flopped), .o_result(mul_out_temp));
1418814168

1418914169
always @(posedge clk) begin
14190-
if (reset) begin
14191-
mul_out_temp_reg <= 0;
14192-
end else begin
1419314170
mul_out_temp_reg <= mul_out_temp;
14194-
end
1419514171
end
1419614172

1419714173
//we just truncate the higher bits of the product
1419814174
//assign add_out = mul_out + out;
1419914175
qadd add_u1(.a(out_temp), .b(mul_out_temp_reg), .c(add_out));
1420014176

1420114177
always @(posedge clk) begin
14202-
if (reset) begin
14203-
out_temp <= 0;
14204-
end else begin
1420514178
out_temp <= add_out;
14206-
end
1420714179
end
1420814180

1420914181
//down cast the result
@@ -14638,7 +14610,7 @@ input clk;
1463814610

1463914611
`ifdef SIMULATION
1464014612

14641-
reg [7:0] ram[((1<<`AWIDTH)-1):0];
14613+
reg [`DWIDTH-1:0] ram[((1<<`AWIDTH)-1):0];
1464214614
reg [31:0] i;
1464314615

1464414616
always @(posedge clk)

vtr_flow/benchmarks/verilog/tpu.16x16.int8.v renamed to vtr_flow/benchmarks/verilog/koios/tpu_like.small.v

Lines changed: 6 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
//////////////////////////////////////////////////////////////////////////////
2+
// Author: Aman Arora
3+
//////////////////////////////////////////////////////////////////////////////
4+
15
`timescale 1ns / 1ps
26

37
///////////////////////////////////
@@ -58,6 +62,7 @@
5862
// Logic area (used): 4.95598e+07 MWTAs
5963
// Resource usage: 1477 LBs, 14 RAMs, 280 Multipliers
6064
// Runtime (on Intel Xeon E5-2430 2.5GHz with single thread): 3400 sec
65+
// 3. 22nm architectures generated from COFFE. Example: arch/COFFE_22nm/k6n10LB_mem20K_complexDSP_customSB_22nm*
6166

6267
//////////////////////////////////////
6368
// Parameters
@@ -288,26 +293,6 @@
288293
// Matrix multiplication unit
289294
////////////////////////////////////
290295

291-
//////////////////////////////////////////////////////////////////////////////////
292-
// Company:
293-
// Engineer:
294-
//
295-
// Create Date: 2020-09-27 21:12:45.762386
296-
// Design Name:
297-
// Module Name: matmul_16x16_systolic
298-
// Project Name:
299-
// Target Devices:
300-
// Tool Versions:
301-
// Description:
302-
//
303-
// Dependencies:
304-
//
305-
// Revision:
306-
// Revision 0.01 - File Created
307-
// Additional Comments:
308-
//
309-
//////////////////////////////////////////////////////////////////////////////////
310-
311296
module matmul_16x16_systolic(
312297
clk,
313298
reset,
@@ -4448,36 +4433,23 @@ wire [2*`DWIDTH-1:0] mul_out_temp;
44484433
reg [2*`DWIDTH-1:0] mul_out_temp_reg;
44494434

44504435
always @(posedge clk) begin
4451-
if (reset) begin
4452-
a_flopped <= 0;
4453-
b_flopped <= 0;
4454-
end else begin
44554436
a_flopped <= a;
44564437
b_flopped <= b;
4457-
end
44584438
end
44594439

44604440
//assign mul_out = a * b;
44614441
qmult mult_u1(.i_multiplicand(a_flopped), .i_multiplier(b_flopped), .o_result(mul_out_temp));
44624442

44634443
always @(posedge clk) begin
4464-
if (reset) begin
4465-
mul_out_temp_reg <= 0;
4466-
end else begin
44674444
mul_out_temp_reg <= mul_out_temp;
4468-
end
44694445
end
44704446

44714447
//we just truncate the higher bits of the product
44724448
//assign add_out = mul_out + out;
44734449
qadd add_u1(.a(out_temp), .b(mul_out_temp_reg), .c(add_out));
44744450

44754451
always @(posedge clk) begin
4476-
if (reset) begin
4477-
out_temp <= 0;
4478-
end else begin
44794452
out_temp <= add_out;
4480-
end
44814453
end
44824454

44834455
//down cast the result
@@ -4912,7 +4884,7 @@ input clk;
49124884

49134885
`ifdef SIMULATION
49144886

4915-
reg [7:0] ram[((1<<`AWIDTH)-1):0];
4887+
reg [`DWIDTH-1:0] ram[((1<<`AWIDTH)-1):0];
49164888
reg [31:0] i;
49174889

49184890
always @(posedge clk)

0 commit comments

Comments
 (0)