|
| 1 | +<!-- |
| 2 | + Flagship Heterogeneous Architecture (No Carry Chains) for VTR 7.0. |
| 3 | +
|
| 4 | + - 40 nm technology |
| 5 | + - General purpose logic block: |
| 6 | + K = 6, N = 10, fracturable 6 LUTs (can operate as one 6-LUT or two 5-LUTs with all 5 inputs shared) |
| 7 | + with optionally registered outputs |
| 8 | + - Routing architecture: L = 4, Fc_in = 0.15, Fc_out = 0.1 |
| 9 | +
|
| 10 | + Details on Modelling: |
| 11 | +
|
| 12 | + Based on flagship k6_frac_N10_mem32K_40nm.xml architecture. |
| 13 | +
|
| 14 | + Authors: Jason Luu, Jeff Goeders, Vaughn Betz |
| 15 | +--> |
| 16 | +<architecture> |
| 17 | + <!-- |
| 18 | + ODIN II specific config begins |
| 19 | + Describes the types of user-specified netlist blocks (in blif, this corresponds to |
| 20 | + ".model [type_of_block]") that this architecture supports. |
| 21 | +
|
| 22 | + Note: Basic LUTs, I/Os, and flip-flops are not included here as there are |
| 23 | + already special structures in blif (.names, .input, .output, and .latch) |
| 24 | + that describe them. |
| 25 | + --> |
| 26 | + <models> <!-- Empty: no custom models are declared. Every primitive in this file's complexblocklist uses a built-in blif model (.input, .output, .names or .latch). --> |
| 27 | + </models> |
| 28 | + <tiles> <!-- Physical tiles. Each sub_tile repeats the port list of the pb_type named in its equivalent_sites entry. --> |
| 29 | + <tile name="io" area="0"> |
| 30 | + <sub_tile name="io" capacity="8"> <!-- Up to 8 io instances share one grid location. --> |
| 31 | + <equivalent_sites> |
| 32 | + <site pb_type="io" pin_mapping="direct"/> |
| 33 | + </equivalent_sites> |
| 34 | + <input name="outpad" num_pins="1"/> <!-- Tile input: feeds the .output pad primitive in the io pb_type's "outpad" mode. --> |
| 35 | + <output name="inpad" num_pins="1"/> <!-- Tile output: driven by the .input pad primitive in the io pb_type's "inpad" mode. --> |
| 36 | + <clock name="clock" num_pins="1"/> |
| 37 | + <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/> <!-- Fc_in = 0.15 and Fc_out = 0.10, both given as fractions. --> |
| 38 | + <pinlocations pattern="custom"> <!-- All three pins are listed on all four sides, so one io definition serves every edge. --> |
| 39 | + <loc side="left">io.outpad io.inpad io.clock</loc> |
| 40 | + <loc side="top">io.outpad io.inpad io.clock</loc> |
| 41 | + <loc side="right">io.outpad io.inpad io.clock</loc> |
| 42 | + <loc side="bottom">io.outpad io.inpad io.clock</loc> |
| 43 | + </pinlocations> |
| 44 | + </sub_tile> |
| 45 | + </tile> |
| 46 | + <tile name="clb" area="53894"> <!-- 53894 = 84375 (whole tile) minus 30481 (routing at W=300), in MWTAs; see the area calculation comment above the clb pb_type. --> |
| 47 | + <sub_tile name="clb" capacity="6"> <!-- NOTE(review): six clb instances share each grid location. Confirm this is intended and that the tile area, which the area comment derives for one logic block, still applies. --> |
| 48 | + <equivalent_sites> |
| 49 | + <site pb_type="clb" pin_mapping="direct"/> |
| 50 | + </equivalent_sites> |
| 51 | + <input name="I" num_pins="40" equivalent="full"/> |
| 52 | + <output name="O" num_pins="20" equivalent="none"/> |
| 53 | + <clock name="clk" num_pins="1"/> |
| 54 | + <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/> <!-- Same Fc values as the io tile. --> |
| 55 | + <pinlocations pattern="spread"/> |
| 56 | + </sub_tile> |
| 57 | + </tile> |
| 58 | + </tiles> |
| 59 | + <!-- ODIN II specific config ends --> |
| 60 | + <!-- Physical descriptions begin --> |
| 61 | + <layout> |
| 62 | + <auto_layout aspect_ratio="1.0"> |
| 63 | + <!-- Ring of 'io' tiles along the device perimeter (priority 100); the corners rule carries priority 101 so the four corners stay 'EMPTY'. --> |
| 64 | + <perimeter priority="100" type="io"/> |
| 65 | + <corners priority="101" type="EMPTY"/> |
| 66 | + <!-- All remaining locations are filled with 'clb' tiles (priority 10). --> |
| 67 | + <fill priority="10" type="clb"/> |
| 68 | + </auto_layout> |
| 69 | + </layout> |
| 70 | + <device> |
| 71 | + <!-- VB & JL: Using Ian Kuon's transistor sizing and drive strength data for routing, at 40 nm. Ian used BPTM |
| 72 | + models. We are modifying the delay values however, to include metal C and R, which allows more architecture |
| 73 | + experimentation. We are also modifying the relative resistance of PMOS to be 1.8x that of NMOS |
| 74 | + (vs. Ian's 3x) as 1.8x lines up with Jeff G's data from a 45 nm process (and is more typical of |
| 75 | + 45 nm in general). I'm upping R_minW_nmos from Ian's just over 6k to nearly 9k, and dropping |
| 76 | + R_minW_pmos from 18k to 16k to hit this 1.8x ratio, while keeping the delays of buffers approximately |
| 77 | + lined up with Stratix IV. |
| 78 | + We are using Jeff G.'s capacitance data for 45 nm (in tech/ptm_45nm). |
| 79 | + Jeff's tables list C in for transistors with widths in multiples of the minimum feature size (45 nm). |
| 80 | + The minimum contactable transistor is 2.5 * 45 nm, so I need to multiply drive strength sizes in this file |
| 81 | + by 2.5x when looking up in Jeff's tables. |
| 82 | + The delay values are lined up with Stratix IV, which has an architecture similar to this |
| 83 | + proposed FPGA, and which is also 40 nm. |
| 84 | + C_ipin_cblock: input capacitance of a track buffer, which VPR assumes is a single-stage |
| 85 | + 4x minimum drive strength buffer. NOTE(review): no C_ipin_cblock attribute appears in this file; the connection block input is described by the "ipin_cblock" switch named below, so this remark presumably applies to that switch's Cin. Confirm. --> |
| 86 | + <sizing R_minW_nmos="8926" R_minW_pmos="16067"/> <!-- 16067 / 8926 = 1.8, the PMOS to NMOS resistance ratio discussed above. --> |
| 87 | + <!-- The grid_logic_tile_area below will be used for all blocks that do not explicitly set their own (non-routing) |
| 88 | + area; set to 0 since we explicitly set the area of all blocks currently in this architecture file. |
| 89 | + --> |
| 90 | + <area grid_logic_tile_area="0"/> |
| 91 | + <chan_width_distr> |
| 92 | + <x distr="uniform" peak="1.000000"/> |
| 93 | + <y distr="uniform" peak="1.000000"/> |
| 94 | + </chan_width_distr> |
| 95 | + <switch_block type="wilton" fs="3"/> |
| 96 | + <connection_block input_switch_name="ipin_cblock"/> <!-- Refers to the switch named "ipin_cblock" in the switchlist. --> |
| 97 | + </device> |
| 98 | + <switchlist> |
| 99 | + <!-- VB: the mux_trans_size and buf_size data below is in minimum width transistor *areas*, assuming the purple |
| 100 | + book area formula. This means the mux transistors are about 5x minimum drive strength. |
| 101 | + We assume the first stage of the buffer is 3x min drive strength to be reasonable given the large |
| 102 | + mux transistors, and this gives a reasonable stage ratio of a bit over 5x to the second stage. We assume |
| 103 | + the n and p transistors in the first stage are equal-sized to lower the buffer trip point, since it's fed |
| 104 | + by a pass transistor mux. We can then reverse engineer the buffer second stage to hit the specified |
| 105 | + buf_size (really buffer area) - 16.2x minimum drive nmos and 1.8*16.2 = 29.2x minimum drive pmos. |
| 106 | + I then took the data from Jeff G.'s PTM modeling of 45 nm to get the Cin (gate of first stage) and Cout |
| 107 | + (diff of second stage) listed below. Jeff's models are in tech/ptm_45nm, and are in min feature multiples. |
| 108 | + The minimum contactable transistor is 2.5 * 45 nm, so I need to multiply the drive strength sizes above by |
| 109 | + 2.5x when looking up in Jeff's tables. |
| 110 | + Finally, we choose a switch delay (58 ps) that leads to length 4 wires having a delay equal to that of Stratix IV (SIV) of 126 ps. |
| 111 | + This also leads to the switch being 46% of the total wire delay (58 / 126), which is reasonable. --> |
| 112 | + <switch type="mux" name="0" R="551" Cin=".77e-15" Cout="4e-15" Tdel="58e-12" mux_trans_size="2.630740" buf_size="27.645901"/> <!-- Routing switch; it is the mux named by the length 4 segment in the segmentlist. --> |
| 113 | + <!-- Switch ipin_cblock: resistance set to that of a 4x minimum drive strength buffer (R_minW_nmos / 4 = 8926 / 4 = 2231.5). Its Tdel of 72.47 ps is the 72 ps connection block input mux delay quoted in the clb crossbar comment. --> |
| 114 | + <switch type="mux" name="ipin_cblock" R="2231.5" Cout="0." Cin="1.47e-15" Tdel="7.247000e-11" mux_trans_size="1.222260" buf_size="auto"/> |
| 115 | + </switchlist> |
| 116 | + <segmentlist> |
| 117 | + <!-- VB & JL: using ITRS metal stack data, 96 nm half pitch wires, which are intermediate metal width/space. |
| 118 | + With the 96 nm half pitch, such wires would take 60 um of height, vs. a 90 um high (approximated as square) Stratix IV tile so this seems |
| 119 | + reasonable. Using a tile length of 90 um, corresponding to the length of a Stratix IV tile if it were square (the tile area is about 8100 um^2). --> |
| 120 | + <segment freq="1.000000" length="4" type="unidir" Rmetal="101" Cmetal="22.5e-15"> <!-- The only segment type (freq 1.0): unidirectional wires spanning 4 tiles. --> |
| 121 | + <mux name="0"/> <!-- Driven by the switch named "0" in the switchlist. --> |
| 122 | + <sb type="pattern">1 1 1 1 1</sb> <!-- Five entries for a length 4 wire (length + 1). --> |
| 123 | + <cb type="pattern">1 1 1 1</cb> <!-- Four entries for a length 4 wire (one per unit of length). --> |
| 124 | + </segment> |
| 125 | + </segmentlist> |
| 126 | + <complexblocklist> |
| 127 | + <!-- Define I/O pads begin --> |
| 128 | + <!-- Capacity is the maximum number of block instances that can be placed at the same (X,Y) location on the FPGA. It is typically used for I/Os (8 per location here); note that the clb sub_tile in this file also sets a capacity. --> |
| 129 | + <!-- Not sure of the area of an I/O (varies widely), and it's not relevant to the design of the FPGA core, so we're setting it to 0. --> |
| 130 | + <pb_type name="io"> <!-- I/O pad block with two mutually exclusive modes: "inpad" and "outpad". --> |
| 131 | + <input name="outpad" num_pins="1"/> |
| 132 | + <output name="inpad" num_pins="1"/> |
| 133 | + <clock name="clock" num_pins="1"/> |
| 134 | + <!-- IOs can operate as either inputs or outputs. |
| 135 | + Delays below come from Ian Kuon. They are small, so they should be interpreted as |
| 136 | + the delays to and from registers in the I/O (and generally I/Os are registered |
| 137 | + today and that is when you timing analyze them). |
| 138 | + --> |
| 139 | + <mode name="inpad"> <!-- Input pad: the .input primitive drives the block output io.inpad. --> |
| 140 | + <pb_type name="inpad" blif_model=".input" num_pb="1"> |
| 141 | + <output name="inpad" num_pins="1"/> |
| 142 | + </pb_type> |
| 143 | + <interconnect> |
| 144 | + <direct name="inpad" input="inpad.inpad" output="io.inpad"> |
| 145 | + <delay_constant max="4.243e-11" in_port="inpad.inpad" out_port="io.inpad"/> |
| 146 | + </direct> |
| 147 | + </interconnect> |
| 148 | + </mode> |
| 149 | + <mode name="outpad"> <!-- Output pad: the block input io.outpad drives the .output primitive. --> |
| 150 | + <pb_type name="outpad" blif_model=".output" num_pb="1"> |
| 151 | + <input name="outpad" num_pins="1"/> |
| 152 | + </pb_type> |
| 153 | + <interconnect> |
| 154 | + <direct name="outpad" input="io.outpad" output="outpad.outpad"> |
| 155 | + <delay_constant max="1.394e-11" in_port="io.outpad" out_port="outpad.outpad"/> |
| 156 | + </direct> |
| 157 | + </interconnect> |
| 158 | + </mode> |
| 159 | + <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin drives 10% of the tracks in a channel (set by the fc element of the io tile in the tiles section). --> |
| 160 | + <!-- IOs go on the periphery of the FPGA, for consistency, |
| 161 | + make it physically equivalent on all sides so that only one definition of I/Os is needed. |
| 162 | + If I do not make a physically equivalent definition, then I need to define 4 different I/Os, one for each side of the FPGA. |
| 163 | + --> |
| 164 | + <!-- Place I/Os on the sides of the FPGA (done by the perimeter rule in the layout section). --> |
| 165 | + <power method="ignore"/> |
| 166 | + </pb_type> |
| 167 | + <!-- Define I/O pads ends --> |
| 168 | + <!-- Define general purpose logic block (CLB) begin --> |
| 169 | + <!--- Area calculation: Total Stratix IV tile area is about 8100 um^2, and a minimum width transistor |
| 170 | + area is 60 L^2 yields a tile area of 84375 MWTAs. |
| 171 | + Routing at W=300 is 30481 MWTAs, leaving us with a total of 53000 MWTAs for logic block area |
| 172 | + This means that only 37% of our area is in the general routing, and 63% is inside the logic |
| 173 | + block. Note that the crossbar / local interconnect is considered part of the logic block |
| 174 | + area in this analysis. That is a lower proportion of routing area than most academics |
| 175 | + assume, but note that the total routing area really includes the crossbar, which would push |
| 176 | + routing area up significantly, we estimate into the ~70% range. |
| 177 | + --> |
| 178 | + <pb_type name="clb"> |
| 179 | + <input name="I" num_pins="40" equivalent="full"/> |
| 180 | + <output name="O" num_pins="20" equivalent="none"/> |
| 181 | + <clock name="clk" num_pins="1"/> |
| 182 | + <!-- Describe fracturable logic element. |
| 183 | + Each fracturable logic element has a 6-LUT that can alternatively operate as two 5-LUTs with shared inputs. |
| 184 | + The outputs of the fracturable logic element can be optionally registered |
| 185 | + --> |
| 186 | + <pb_type name="fle" num_pb="10"> |
| 187 | + <input name="in" num_pins="6"/> |
| 188 | + <output name="out" num_pins="2"/> |
| 189 | + <clock name="clk" num_pins="1"/> |
| 190 | + <!-- Dual 5-LUT mode definition begin --> |
| 191 | + <mode name="n2_lut5"> <!-- Fracturable mode: two ble5 (5-LUT plus optional flip-flop) that share the same five inputs. --> |
| 192 | + <pb_type name="lut5inter" num_pb="1"> |
| 193 | + <input name="in" num_pins="5"/> |
| 194 | + <output name="out" num_pins="2"/> |
| 195 | + <clock name="clk" num_pins="1"/> |
| 196 | + <pb_type name="ble5" num_pb="2"> |
| 197 | + <input name="in" num_pins="5"/> |
| 198 | + <output name="out" num_pins="1"/> |
| 199 | + <clock name="clk" num_pins="1"/> |
| 200 | + <!-- Define the LUT --> |
| 201 | + <pb_type name="lut5" blif_model=".names" num_pb="1" class="lut"> |
| 202 | + <input name="in" num_pins="5" port_class="lut_in"/> |
| 203 | + <output name="out" num_pins="1" port_class="lut_out"/> |
| 204 | + <!-- LUT timing using delay matrix --> |
| 205 | + <!-- These are the physical delay inputs on a Stratix IV LUT but because VPR cannot do LUT rebalancing, |
| 206 | + we instead take the average of these numbers (1177e-12 / 5 = 235.4e-12, entered as 235e-12) to get more stable results |
| 207 | + 82e-12 |
| 208 | + 173e-12 |
| 209 | + 261e-12 |
| 210 | + 263e-12 |
| 211 | + 398e-12 |
| 212 | + --> |
| 213 | + <delay_matrix type="max" in_port="lut5.in" out_port="lut5.out"> |
| 214 | + 235e-12 |
| 215 | + 235e-12 |
| 216 | + 235e-12 |
| 217 | + 235e-12 |
| 218 | + 235e-12 |
| 219 | + </delay_matrix> |
| 220 | + </pb_type> |
| 221 | + <!-- Define the flip-flop --> |
| 222 | + <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop"> |
| 223 | + <input name="D" num_pins="1" port_class="D"/> |
| 224 | + <output name="Q" num_pins="1" port_class="Q"/> |
| 225 | + <clock name="clk" num_pins="1" port_class="clock"/> |
| 226 | + <T_setup value="66e-12" port="ff.D" clock="clk"/> |
| 227 | + <T_clock_to_Q max="124e-12" port="ff.Q" clock="clk"/> |
| 228 | + </pb_type> |
| 229 | + <interconnect> |
| 230 | + <direct name="direct1" input="ble5.in[4:0]" output="lut5[0:0].in[4:0]"/> |
| 231 | + <direct name="direct2" input="lut5[0:0].out" output="ff[0:0].D"> |
| 232 | + <!-- Advanced user option that tells CAD tool to find LUT+FF pairs in netlist --> |
| 233 | + <pack_pattern name="ble5" in_port="lut5[0:0].out" out_port="ff[0:0].D"/> |
| 234 | + </direct> |
| 235 | + <direct name="direct3" input="ble5.clk" output="ff[0:0].clk"/> |
| 236 | + <mux name="mux1" input="ff[0:0].Q lut5.out[0:0]" output="ble5.out[0:0]"> <!-- Selects the registered (ff.Q) or combinational (lut5.out) value as the ble5 output. --> |
| 237 | + <!-- LUT to output is faster than FF to output on a Stratix IV --> |
| 238 | + <delay_constant max="25e-12" in_port="lut5.out[0:0]" out_port="ble5.out[0:0]"/> |
| 239 | + <delay_constant max="45e-12" in_port="ff[0:0].Q" out_port="ble5.out[0:0]"/> |
| 240 | + </mux> |
| 241 | + </interconnect> |
| 242 | + </pb_type> |
| 243 | + <interconnect> |
| 244 | + <direct name="direct1" input="lut5inter.in" output="ble5[0:0].in"/> <!-- direct1 and direct2 feed the same five inputs to both ble5 instances. --> |
| 245 | + <direct name="direct2" input="lut5inter.in" output="ble5[1:1].in"/> |
| 246 | + <direct name="direct3" input="ble5[1:0].out" output="lut5inter.out"/> |
| 247 | + <complete name="complete1" input="lut5inter.clk" output="ble5[1:0].clk"/> |
| 248 | + </interconnect> |
| 249 | + </pb_type> |
| 250 | + <interconnect> |
| 251 | + <direct name="direct1" input="fle.in[4:0]" output="lut5inter.in"/> <!-- Only fle.in[4:0] is connected; fle.in[5] is unused in this mode. --> |
| 252 | + <direct name="direct2" input="lut5inter.out" output="fle.out"/> |
| 253 | + <direct name="direct3" input="fle.clk" output="lut5inter.clk"/> |
| 254 | + </interconnect> |
| 255 | + </mode> |
| 256 | + <!-- Dual 5-LUT mode definition end --> |
| 257 | + <!-- 6-LUT mode definition begin --> |
| 258 | + <mode name="n1_lut6"> <!-- Single 6-LUT mode: one ble6 (6-LUT plus optional flip-flop) using all six fle inputs. --> |
| 259 | + <!-- Define 6-LUT mode --> |
| 260 | + <pb_type name="ble6" num_pb="1"> |
| 261 | + <input name="in" num_pins="6"/> |
| 262 | + <output name="out" num_pins="1"/> |
| 263 | + <clock name="clk" num_pins="1"/> |
| 264 | + <!-- Define LUT --> |
| 265 | + <pb_type name="lut6" blif_model=".names" num_pb="1" class="lut"> |
| 266 | + <input name="in" num_pins="6" port_class="lut_in"/> |
| 267 | + <output name="out" num_pins="1" port_class="lut_out"/> |
| 268 | + <!-- LUT timing using delay matrix --> |
| 269 | + <!-- These are the physical delay inputs on a Stratix IV LUT but because VPR cannot do LUT rebalancing, |
| 270 | + we instead take the average of these numbers to get more stable results. |
| 271 | + NOTE(review): the mean of the six values listed here is about 262e-12 (1574e-12 / 6), while the matrix below uses 261e-12; confirm which is intended. |
| 272 | + 82e-12 |
| 273 | + 173e-12 |
| 274 | + 261e-12 |
| 275 | + 263e-12 |
| 276 | + 398e-12 |
| 277 | + 397e-12 --> |
| 278 | + <delay_matrix type="max" in_port="lut6.in" out_port="lut6.out"> |
| 279 | + 261e-12 |
| 280 | + 261e-12 |
| 281 | + 261e-12 |
| 282 | + 261e-12 |
| 283 | + 261e-12 |
| 284 | + 261e-12 |
| 285 | + </delay_matrix> |
| 286 | + </pb_type> |
| 287 | + <!-- Define flip-flop --> |
| 288 | + <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop"> |
| 289 | + <input name="D" num_pins="1" port_class="D"/> |
| 290 | + <output name="Q" num_pins="1" port_class="Q"/> |
| 291 | + <clock name="clk" num_pins="1" port_class="clock"/> |
| 292 | + <T_setup value="66e-12" port="ff.D" clock="clk"/> |
| 293 | + <T_clock_to_Q max="124e-12" port="ff.Q" clock="clk"/> |
| 294 | + </pb_type> |
| 295 | + <interconnect> |
| 296 | + <direct name="direct1" input="ble6.in" output="lut6[0:0].in"/> |
| 297 | + <direct name="direct2" input="lut6.out" output="ff.D"> |
| 298 | + <!-- Advanced user option that tells CAD tool to find LUT+FF pairs in netlist --> |
| 299 | + <pack_pattern name="ble6" in_port="lut6.out" out_port="ff.D"/> |
| 300 | + </direct> |
| 301 | + <direct name="direct3" input="ble6.clk" output="ff.clk"/> |
| 302 | + <mux name="mux1" input="ff.Q lut6.out" output="ble6.out"> <!-- Selects the registered (ff.Q) or combinational (lut6.out) value as the ble6 output. --> |
| 303 | + <!-- LUT to output is faster than FF to output on a Stratix IV --> |
| 304 | + <delay_constant max="25e-12" in_port="lut6.out" out_port="ble6.out"/> |
| 305 | + <delay_constant max="45e-12" in_port="ff.Q" out_port="ble6.out"/> |
| 306 | + </mux> |
| 307 | + </interconnect> |
| 308 | + </pb_type> |
| 309 | + <interconnect> |
| 310 | + <direct name="direct1" input="fle.in" output="ble6.in"/> |
| 311 | + <direct name="direct2" input="ble6.out" output="fle.out[0:0]"/> <!-- Only fle.out[0] is driven in this mode; fle.out[1] is left unconnected. --> |
| 312 | + <direct name="direct3" input="fle.clk" output="ble6.clk"/> |
| 313 | + </interconnect> |
| 314 | + </mode> |
| 315 | + <!-- 6-LUT mode definition end --> |
| 316 | + </pb_type> |
| 317 | + <interconnect> |
| 318 | + <!-- We use a full crossbar to get logical equivalence at inputs of CLB. |
| 319 | + The delays below come from Stratix IV. The delay through a connection block |
| 320 | + input mux + the crossbar in Stratix IV is 167 ps. We already have a 72 ps |
| 321 | + delay on the connection block input mux (modeled by Ian Kuon), so the remaining |
| 322 | + delay within the crossbar is 95 ps. |
| 323 | + The delay of cluster feedbacks in Stratix IV is 100 ps, when driven by a LUT. |
| 324 | + Since all our LUT outputs go to a BLE output, and have a delay of |
| 325 | + 25 ps to do so, we subtract 25 ps from the 100 ps delay of a feedback |
| 326 | + to get the part (75 ps) that should be marked on the crossbar. --> |
| 327 | + <complete name="crossbar" input="clb.I fle[9:0].out" output="fle[9:0].in"> |
| 328 | + <delay_constant max="95e-12" in_port="clb.I" out_port="fle[9:0].in"/> |
| 329 | + <delay_constant max="75e-12" in_port="fle[9:0].out" out_port="fle[9:0].in"/> |
| 330 | + </complete> |
| 331 | + <complete name="clks" input="clb.clk" output="fle[9:0].clk"> <!-- The single clb clock reaches all ten fle; no delay is annotated. --> |
| 332 | + </complete> |
| 333 | + <!-- This way of specifying direct connection to clb outputs is important because this architecture uses automatic spreading of opins. |
| 334 | + By grouping the output pins in this fashion, if a logic block is completely filled by 6-LUTs, |
| 335 | + then the outputs those 6-LUTs take get evenly distributed across all four sides of the CLB instead of clumped on two sides (which is what happens with a more |
| 336 | + naive specification). |
| 337 | + --> |
| 338 | + <direct name="clbouts1" input="fle[9:0].out[0:0]" output="clb.O[9:0]"/> <!-- First output of each fle goes to clb.O[9:0]. --> |
| 339 | + <direct name="clbouts2" input="fle[9:0].out[1:1]" output="clb.O[19:10]"/> <!-- Second output of each fle goes to clb.O[19:10]. --> |
| 340 | + </interconnect> |
| 341 | + <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin drives 10% of the tracks in a channel --> |
| 342 | + <!-- Place this general purpose logic block in any unspecified column --> |
| 343 | + </pb_type> |
| 344 | + <!-- Define general purpose logic block (CLB) ends --> |
| 345 | + </complexblocklist> |
| 346 | + <power> <!-- Presumably consumed by VPR power estimation. NOTE(review): units are not stated in this file; confirm against the VTR power documentation. --> |
| 347 | + <local_interconnect C_wire="2.5e-10"/> <!-- Same C_wire value as the clock entry in the clocks section. --> |
| 348 | + <mux_transistor_size mux_transistor_size="3"/> |
| 349 | + <FF_size FF_size="4"/> |
| 350 | + <LUT_transistor_size LUT_transistor_size="4"/> |
| 351 | + </power> |
| 352 | + <clocks> <!-- One clock network description: wire capacitance 2.5e-10 with automatically sized buffers. --> |
| 353 | + <clock C_wire="2.5e-10" buffer_size="auto"/> |
| 354 | + </clocks> |
| 355 | +</architecture> |
0 commit comments