Somehow these changes got left out in the previous commit

aman26kbm · aman26kbm · commit 655590091feb · 2021-06-01T21:25:30.000-05:00
diff --git a/doc/src/vtr/benchmarks.rst b/doc/src/vtr/benchmarks.rst
@@ -38,9 +38,7 @@ They are suitable for FPGA architecture research and medium-scale CAD research.
     stereovision0       Computer Vision
     stereovision1       Computer Vision
     stereovision2       Computer Vision
-    stereovision3       Computer Vision
-    tpu.32x32.int8      Deep Learning
-    tpu.16x16.int8      Deep Learning
+    stereovision3       Computer Vision    
     ================    =================
 
 The VTR benchmarks are provided as Verilog under: ::
@@ -66,6 +64,51 @@ The Titan benchmarks are suitable for large-scale FPGA CAD research, and FPGA ar
 
 .. seealso:: :ref:`titan_benchmarks_tutorial`
 
+Koios Benchmarks
+-----------------
+The Koios benchmarks :cite:`koios_benchmarks` are a set of Deep Learning (DL) benchmarks. 
+They are suitable for DL-related architecture and CAD research.
+There are 19 designs that include several medium-sized benchmarks and some large benchmarks.
+The designs target different network types (CNNs, RNNs, MLPs, RL) and layer types (fully-connected, convolution, activation, softmax, reduction, eltwise).
+Some of the designs are generated from HLS tools as well.
+These designs use many precisions, including binary, different fixed-point types (int8/16/32), brain floating point (bfloat16), and IEEE half-precision floating point (fp16).
+
+.. _table_koios_benchmarks:
+
+.. table:: The Koios Benchmarks.  
+  
+    =================   ======================================
+    Benchmark           Description
+    =================   ======================================
+    clstm_like          CLSTM-like accelerator
+    dla_like            Intel-DLA-like accelerator
+    lstm                LSTM engine
+    tpu_like            Google-TPU-v1-like accelerator
+    bnn                 4-layer binary neural network
+    tiny_darknet_like   Accelerator for Tiny Darknet    
+    gemm_layer          20x20 matrix multiplication engine
+    attention_layer     Transformer self-attention layer
+    conv_layer          GEMM based convolution
+    spmv                Sparse matrix vector multiplication
+    robot_rl            Robot+maze application     
+    reduction_layer     Add/max/min reduction tree
+    softmax             Softmax classification layer
+    conv_layer_hls      Sliding window convolution
+    eltwise_layer       Matrix elementwise add/sub/mult  
+    =================   ======================================
+
+Koios benchmarks are fully compatible with the full VTR flow. Some Koios benchmarks use advanced DSP features that are available in only a few FPGA architectures provided with VTR. This is because they instantiate DSP macros to implement native FP16 multiplications or use the hard dedicated chains, and these are architecture-specific. If users want to use a different FPGA architecture file, they can replace the macro instantiations in the benchmarks with their equivalents from the FPGA architectures they wish to use.
+
+Alternatively, users can disable these advanced features. The macro ``complex_dsp`` can be used for this purpose. If ``complex_dsp`` is defined in a benchmark file (using ```define complex_dsp`` at the beginning of the benchmark file), then the advanced DSP features mentioned above will be used. If a user wants to run a Koios benchmark with FPGA architectures that don't have these advanced DSP features (for example, the flagship architectures: ``$VTR_ROOT/vtr_flow/arch/timing/k6_frac_N10_*_mem32K_40nm*``), then they can remove the line defining the ``complex_dsp`` macro. This enables the same functionality with behavioral Verilog that is mapped to the FPGA soft logic when an architecture without the required macro definitions is used.
+
+The Koios benchmarks are provided as Verilog (enabling full flexibility to modify and change how the designs are implemented) under: ::
+
+    $VTR_ROOT/vtr_flow/benchmarks/verilog/koios
+
+The FPGA architectures with advanced DSP that work out-of-the-box with Koios benchmarks are available here: ::
+
+    $VTR_ROOT/vtr_flow/arch/COFFE_22nm/k6FracN10LB_mem20K_complexDSP_customSB_22nm.*
+
 MCNC20 Benchmarks
 -----------------
 The MCNC benchmarks :cite:`mcnc_benchmarks` are a set of small and old (circa 1991) benchmarks.
diff --git a/vtr_flow/benchmarks/verilog/koios/tpu_like.medium.v b/vtr_flow/benchmarks/verilog/koios/tpu_like.medium.v
@@ -1,3 +1,7 @@
+//////////////////////////////////////////////////////////////////////////////
+// Author: Aman Arora
+//////////////////////////////////////////////////////////////////////////////
+
 `timescale 1ns / 1ps
 
 ///////////////////////////////////
@@ -58,6 +62,7 @@
 //      Logic area (used): 1.72408e+08 MWTAs
 //      Resource usage: 5033 LBs, 26 RAMs, 1072 Multipliers
 //      Runtime (on Intel Xeon E5-2430 2.5GHz with single thread): 12500 sec
+// 3. 22nm architectures generated from COFFE. Example: arch/COFFE_22nm/k6n10LB_mem20K_complexDSP_customSB_22nm*
 
 //////////////////////////////////////
 // Parameters
@@ -288,26 +293,6 @@
 // Matrix multiplication unit
 ////////////////////////////////////
 
-//////////////////////////////////////////////////////////////////////////////////
-// Company: 
-// Engineer: 
-// 
-// Create Date: 2020-09-27 21:12:45.762386
-// Design Name: 
-// Module Name: matmul_32x32_systolic
-// Project Name: 
-// Target Devices: 
-// Tool Versions: 
-// Description: 
-// 
-// Dependencies: 
-// 
-// Revision:
-// Revision 0.01 - File Created
-// Additional Comments:
-// 
-//////////////////////////////////////////////////////////////////////////////////
-
 module matmul_32x32_systolic(
  clk,
  reset,
@@ -14174,36 +14159,23 @@ wire [2*`DWIDTH-1:0] mul_out_temp;
 reg [2*`DWIDTH-1:0] mul_out_temp_reg;
 
 always @(posedge clk) begin
-  if (reset) begin
-    a_flopped <= 0;
-    b_flopped <= 0;
-  end else begin
     a_flopped <= a;
     b_flopped <= b;
-  end
 end
 
 //assign mul_out = a * b;
 qmult mult_u1(.i_multiplicand(a_flopped), .i_multiplier(b_flopped), .o_result(mul_out_temp));
 
 always @(posedge clk) begin
-  if (reset) begin
-    mul_out_temp_reg <= 0;
-  end else begin
     mul_out_temp_reg <= mul_out_temp;
-  end
 end
 
 //we just truncate the higher bits of the product
 //assign add_out = mul_out + out;
 qadd add_u1(.a(out_temp), .b(mul_out_temp_reg), .c(add_out));
 
 always @(posedge clk) begin
-  if (reset) begin
-    out_temp <= 0;
-  end else begin
     out_temp <= add_out;
-  end
 end
 
 //down cast the result
@@ -14638,7 +14610,7 @@ input clk;
 
 `ifdef SIMULATION
 
-reg [7:0] ram[((1<<`AWIDTH)-1):0];
+reg [`DWIDTH-1:0] ram[((1<<`AWIDTH)-1):0];
 reg [31:0] i;
 
 always @(posedge clk)  
diff --git a/vtr_flow/benchmarks/verilog/koios/tpu_like.small.v b/vtr_flow/benchmarks/verilog/koios/tpu_like.small.v
@@ -1,3 +1,7 @@
+//////////////////////////////////////////////////////////////////////////////
+// Author: Aman Arora
+//////////////////////////////////////////////////////////////////////////////
+
 `timescale 1ns / 1ps
 
 ///////////////////////////////////
@@ -58,6 +62,7 @@
 //      Logic area (used): 4.95598e+07 MWTAs
 //      Resource usage: 1477 LBs, 14 RAMs, 280 Multipliers
 //      Runtime (on Intel Xeon E5-2430 2.5GHz with single thread): 3400 sec
+// 3. 22nm architectures generated from COFFE. Example: arch/COFFE_22nm/k6n10LB_mem20K_complexDSP_customSB_22nm*
 
 //////////////////////////////////////
 // Parameters
@@ -288,26 +293,6 @@
 // Matrix multiplication unit
 ////////////////////////////////////
 
-//////////////////////////////////////////////////////////////////////////////////
-// Company: 
-// Engineer: 
-// 
-// Create Date: 2020-09-27 21:12:45.762386
-// Design Name: 
-// Module Name: matmul_16x16_systolic
-// Project Name: 
-// Target Devices: 
-// Tool Versions: 
-// Description: 
-// 
-// Dependencies: 
-// 
-// Revision:
-// Revision 0.01 - File Created
-// Additional Comments:
-// 
-//////////////////////////////////////////////////////////////////////////////////
-
 module matmul_16x16_systolic(
  clk,
  reset,
@@ -4448,36 +4433,23 @@ wire [2*`DWIDTH-1:0] mul_out_temp;
 reg [2*`DWIDTH-1:0] mul_out_temp_reg;
 
 always @(posedge clk) begin
-  if (reset) begin
-    a_flopped <= 0;
-    b_flopped <= 0;
-  end else begin
     a_flopped <= a;
     b_flopped <= b;
-  end
 end
 
 //assign mul_out = a * b;
 qmult mult_u1(.i_multiplicand(a_flopped), .i_multiplier(b_flopped), .o_result(mul_out_temp));
 
 always @(posedge clk) begin
-  if (reset) begin
-    mul_out_temp_reg <= 0;
-  end else begin
     mul_out_temp_reg <= mul_out_temp;
-  end
 end
 
 //we just truncate the higher bits of the product
 //assign add_out = mul_out + out;
 qadd add_u1(.a(out_temp), .b(mul_out_temp_reg), .c(add_out));
 
 always @(posedge clk) begin
-  if (reset) begin
-    out_temp <= 0;
-  end else begin
     out_temp <= add_out;
-  end
 end
 
 //down cast the result
@@ -4912,7 +4884,7 @@ input clk;
 
 `ifdef SIMULATION
 
-reg [7:0] ram[((1<<`AWIDTH)-1):0];
+reg [`DWIDTH-1:0] ram[((1<<`AWIDTH)-1):0];
 reg [31:0] i;
 
 always @(posedge clk)