Skip to content

Commit fdf6d3c

Browse files
authored
Merge pull request #2799 from verilog-to-routing/temp_chan_w_factors_prefix_sum
Chan x/y placement cost factors using prefix sum
2 parents 91f2941 + db3f7ae commit fdf6d3c

File tree

18 files changed

+125
-140
lines changed

18 files changed

+125
-140
lines changed

libs/libvtrutil/src/vtr_ndoffsetmatrix.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define VTR_ND_OFFSET_MATRIX_H
33
#include <array>
44
#include <memory>
5+
#include <algorithm>
56

67
#include "vtr_assert.h"
78

@@ -309,9 +310,8 @@ class NdOffsetMatrixBase {
309310

310311
///@brief Swap two NdOffsetMatrixBase objects
311312
friend void swap(NdOffsetMatrixBase<T, N>& m1, NdOffsetMatrixBase<T, N>& m2) {
312-
using std::swap;
313-
swap(m1.dim_ranges_, m2.dim_ranges_);
314-
swap(m1.data_, m2.data_);
313+
std::swap(m1.dim_ranges_, m2.dim_ranges_);
314+
std::swap(m1.data_, m2.data_);
315315
}
316316

317317
private:
@@ -441,7 +441,9 @@ class NdOffsetMatrix<T, 1> : public NdOffsetMatrixBase<T, 1> {
441441
VTR_ASSERT_SAFE_MSG(index >= this->dim_ranges_[0].begin_index(), "Index out of range (below dimension minimum)");
442442
VTR_ASSERT_SAFE_MSG(index < this->dim_ranges_[0].end_index(), "Index out of range (above dimension maximum)");
443443

444-
return this->data_[index];
444+
int effective_index = index - this->dim_ranges_[0].begin_index();
445+
446+
return this->data_[effective_index];
445447
}
446448

447449
///@brief Access an element (mutable)

vpr/src/base/SetupVPR.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -633,8 +633,6 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts)
633633
PlacerOpts->inner_loop_recompute_divider = Options.inner_loop_recompute_divider;
634634
PlacerOpts->quench_recompute_divider = Options.quench_recompute_divider;
635635

636-
PlacerOpts->place_cost_exp = 1;
637-
638636
PlacerOpts->td_place_exp_first = Options.place_exp_first;
639637

640638
PlacerOpts->td_place_exp_last = Options.place_exp_last;

vpr/src/base/ShowSetup.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -547,8 +547,6 @@ static void ShowPlacerOpts(const t_placer_opts& PlacerOpts,
547547
VTR_LOG("Using constraints file '%s'\n", PlacerOpts.constraints_file.c_str());
548548
}
549549

550-
VTR_LOG("PlacerOpts.place_cost_exp: %f\n", PlacerOpts.place_cost_exp);
551-
552550
VTR_LOG("PlacerOpts.place_chan_width: %d\n", PlacerOpts.place_chan_width);
553551

554552
if (PlacerOpts.place_algorithm.is_timing_driven()) {

vpr/src/base/vpr_types.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1065,7 +1065,6 @@ struct t_placer_opts {
10651065
t_place_algorithm place_algorithm;
10661066
t_place_algorithm place_quench_algorithm;
10671067
float timing_tradeoff;
1068-
float place_cost_exp;
10691068
int place_chan_width;
10701069
enum e_pad_loc_type pad_loc_type;
10711070
std::string constraints_file;

vpr/src/place/net_cost_handler.cpp

Lines changed: 69 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -151,79 +151,48 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
151151
}
152152

153153
void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_() {
154-
const double place_cost_exp = static_cast<double>(placer_opts_.place_cost_exp);
155-
auto& device_ctx = g_vpr_ctx.device();
156-
157-
const int grid_height = device_ctx.grid.height();
158-
const int grid_width = device_ctx.grid.width();
159-
160-
/* Access arrays below as chan?_place_cost_fac_(subhigh, sublow). Since subhigh must be greater than or
161-
* equal to sublow, we will only access the lower half of a matrix, but we allocate the whole matrix anyway
162-
* for simplicity, so we can use the vtr utility matrix functions. */
163-
chanx_place_cost_fac_ = vtr::NdOffsetMatrix<float, 2>({{{-1, grid_height}, {-1, grid_height}}});
164-
chany_place_cost_fac_ = vtr::NdOffsetMatrix<float, 2>({{{-1, grid_width}, {-1, grid_width}}});
165-
166-
// First compute the number of tracks between channel high and channel low, inclusive.
167-
chanx_place_cost_fac_[-1][-1] = 0;
168-
169-
for (int high = 0; high < grid_height; high++) {
170-
chanx_place_cost_fac_[high][high] = (float)device_ctx.chan_width.x_list[high];
171-
for (int low = -1; low < high; low++) {
172-
chanx_place_cost_fac_[high][low] = chanx_place_cost_fac_[high - 1][low] + (float)device_ctx.chan_width.x_list[high];
173-
}
174-
}
175-
176-
/* Now compute the inverse of the average number of tracks per channel *
177-
* between high and low. The cost function divides by the average *
178-
* number of tracks per channel, so by storing the inverse I convert *
179-
* this to a faster multiplication. Take this final number to the *
180-
* place_cost_exp power -- numbers other than one mean this is no *
181-
* longer a simple "average number of tracks"; it is some power of *
182-
* that, allowing greater penalization of narrow channels. */
183-
for (int high = -1; high < grid_height; high++) {
184-
for (int low = -1; low <= high; low++) {
185-
/* Since we will divide the wiring cost by the average channel *
186-
* capacity between high and low, having only 0 width channels *
187-
* will result in infinite wiring capacity normalization *
188-
* factor, and extremely bad placer behaviour. Hence we change *
189-
* this to a small (1 track) channel capacity instead. */
190-
if (chanx_place_cost_fac_[high][low] == 0.0f) {
191-
VTR_LOG_WARN("CHANX place cost fac is 0 at %d %d\n", high, low);
192-
chanx_place_cost_fac_[high][low] = 1.0f;
193-
}
154+
const auto& device_ctx = g_vpr_ctx.device();
194155

195-
chanx_place_cost_fac_[high][low] = (high - low + 1.) / chanx_place_cost_fac_[high][low];
196-
chanx_place_cost_fac_[high][low] = pow((double)chanx_place_cost_fac_[high][low], place_cost_exp);
197-
}
198-
}
156+
const int grid_height = (int)device_ctx.grid.height();
157+
const int grid_width = (int)device_ctx.grid.width();
158+
159+
/* These arrays contain accumulative channel width between channel zero and
160+
* the channel specified by the given index. The accumulated channel width
161+
* is inclusive, meaning that it includes both channel zero and channel `idx`.
162+
* To compute the total channel width between channels 'low' and 'high', use the
163+
* following formula:
164+
* acc_chan?_width_[high] - acc_chan?_width_[low - 1]
165+
* This returns the total number of tracks between channels 'low' and 'high',
166+
* including tracks in these channels.
167+
*
168+
* Channel -1 doesn't exist, so we can say it has zero tracks. We need to be able
169+
* to access these arrays with index -1 to handle cases where the lower channel is 0.
170+
*/
171+
acc_chanx_width_ = vtr::NdOffsetMatrix<int, 1>({{{-1, grid_height}}});
172+
acc_chany_width_ = vtr::NdOffsetMatrix<int, 1>({{{-1, grid_width}}});
199173

200-
/* Now do the same thing for the y-directed channels. First get the
201-
* number of tracks between channel high and channel low, inclusive. */
202-
chany_place_cost_fac_[-1][-1] = 0;
174+
// initialize the first element (index -1) with zero
175+
acc_chanx_width_[-1] = 0;
176+
for (int y = 0; y < grid_height; y++) {
177+
acc_chanx_width_[y] = acc_chanx_width_[y - 1] + device_ctx.chan_width.x_list[y];
203178

204-
for (int high = 0; high < grid_width; high++) {
205-
chany_place_cost_fac_[high][high] = device_ctx.chan_width.y_list[high];
206-
for (int low = -1; low < high; low++) {
207-
chany_place_cost_fac_[high][low] = chany_place_cost_fac_[high - 1][low] + device_ctx.chan_width.y_list[high];
179+
/* If the number of tracks in a channel is zero, two consecutive elements take the same
180+
* value. This can lead to a division by zero in get_chanxy_cost_fac_(). To avoid this
181+
* potential issue, we assume that the channel width is at least 1.
182+
*/
183+
if (acc_chanx_width_[y] == acc_chanx_width_[y - 1]) {
184+
acc_chanx_width_[y]++;
208185
}
209186
}
210187

211-
/* Now compute the inverse of the average number of tracks per channel
212-
* between high and low. Take to specified power. */
213-
for (int high = -1; high < grid_width; high++) {
214-
for (int low = -1; low <= high; low++) {
215-
/* Since we will divide the wiring cost by the average channel *
216-
* capacity between high and low, having only 0 width channels *
217-
* will result in infinite wiring capacity normalization *
218-
* factor, and extremely bad placer behaviour. Hence we change *
219-
* this to a small (1 track) channel capacity instead. */
220-
if (chany_place_cost_fac_[high][low] == 0.0f) {
221-
VTR_LOG_WARN("CHANY place cost fac is 0 at %d %d\n", high, low);
222-
chany_place_cost_fac_[high][low] = 1.0f;
223-
}
188+
// initialize the first element (index -1) with zero
189+
acc_chany_width_[-1] = 0;
190+
for (int x = 0; x < grid_width; x++) {
191+
acc_chany_width_[x] = acc_chany_width_[x - 1] + device_ctx.chan_width.y_list[x];
224192

225-
chany_place_cost_fac_[high][low] = (high - low + 1.) / chany_place_cost_fac_[high][low];
226-
chany_place_cost_fac_[high][low] = pow((double)chany_place_cost_fac_[high][low], place_cost_exp);
193+
// to avoid a division by zero
194+
if (acc_chany_width_[x] == acc_chany_width_[x - 1]) {
195+
acc_chany_width_[x]++;
227196
}
228197
}
229198

@@ -239,33 +208,32 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() {
239208
const size_t grid_height = device_ctx.grid.height();
240209
const size_t grid_width = device_ctx.grid.width();
241210

242-
243-
acc_tile_num_inter_die_conn_ = vtr::NdMatrix<int, 2>({grid_width, grid_height}, 0.);
211+
acc_tile_num_inter_die_conn_ = vtr::NdMatrix<int, 2>({grid_width, grid_height}, 0);
244212

245213
vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid_width, grid_height}, 0.);
246214

247215
/*
248-
* Step 1: iterate over the rr-graph, recording how many edges go between layers at each (x,y) location
249-
* in the device. We count all these edges, regardless of which layers they connect. Then we divide by
250-
* the number of layers - 1 to get the average cross-layer edge count per (x,y) location -- this mirrors
251-
* what we do for the horizontal and vertical channels where we assume the channel width doesn't change
252-
* along the length of the channel. It lets us be more memory-efficient for 3D devices, and could be revisited
216+
* Step 1: iterate over the rr-graph, recording how many edges go between layers at each (x,y) location
217+
* in the device. We count all these edges, regardless of which layers they connect. Then we divide by
218+
* the number of layers - 1 to get the average cross-layer edge count per (x,y) location -- this mirrors
219+
* what we do for the horizontal and vertical channels where we assume the channel width doesn't change
220+
* along the length of the channel. It lets us be more memory-efficient for 3D devices, and could be revisited
253221
* if someday we have architectures with widely varying connectivity between different layers in a stack.
254-
*/
222+
*/
255223

256224
/*
257-
* To calculate the accumulative number of inter-die connections we first need to get the number of
258-
* inter-die connection per location. To be able to work for the cases that RR Graph is read instead
259-
* of being made from the architecture file, we calculate this number by iterating over the RR graph. Once
260-
* tile_num_inter_die_conn is populated, we can start populating acc_tile_num_inter_die_conn_. First,
261-
* we populate the first row and column. Then, we iterate over the rest of blocks and get the number of
262-
* inter-die connections by adding up the number of inter-die block at that location + the accumulation
263-
* for the block below and left to it. Then, since the accumulated number of inter-die connection to
264-
* the block on the lower left connection of the block is added twice, that part needs to be removed.
265-
*/
266-
for (const auto& src_rr_node : rr_graph.nodes()) {
267-
for (const auto& rr_edge_idx : rr_graph.edges(src_rr_node)) {
268-
const auto& sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
225+
* To calculate the accumulative number of inter-die connections we first need to get the number of
226+
* inter-die connections per location. To also work in cases where the RR graph is read instead
227+
* of being made from the architecture file, we calculate this number by iterating over the RR graph. Once
228+
* tile_num_inter_die_conn is populated, we can start populating acc_tile_num_inter_die_conn_. First,
229+
* we populate the first row and column. Then, we iterate over the rest of blocks and get the number of
230+
* inter-die connections by adding up the number of inter-die connections at that location + the accumulation
231+
* for the block below and left to it. Then, since the accumulated number of inter-die connection to
232+
* the block on the lower left connection of the block is added twice, that part needs to be removed.
233+
*/
234+
for (const RRNodeId src_rr_node : rr_graph.nodes()) {
235+
for (const t_edge_size rr_edge_idx : rr_graph.edges(src_rr_node)) {
236+
const RRNodeId sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
269237
if (rr_graph.node_layer(src_rr_node) != rr_graph.node_layer(sink_rr_node)) {
270238
// We assume that the nodes driving the inter-layer connection or being driven by it
271239
// are not stretched across multiple tiles
@@ -290,20 +258,20 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() {
290258
// Initialize the first row and column
291259
for (size_t x = 1; x < device_ctx.grid.width(); x++) {
292260
acc_tile_num_inter_die_conn_[x][0] = acc_tile_num_inter_die_conn_[x-1][0] +
293-
tile_num_inter_die_conn[x][0];
261+
tile_num_inter_die_conn[x][0];
294262
}
295263

296264
for (size_t y = 1; y < device_ctx.grid.height(); y++) {
297265
acc_tile_num_inter_die_conn_[0][y] = acc_tile_num_inter_die_conn_[0][y-1] +
298-
tile_num_inter_die_conn[0][y];
266+
tile_num_inter_die_conn[0][y];
299267
}
300268

301269
for (size_t x_high = 1; x_high < device_ctx.grid.width(); x_high++) {
302270
for (size_t y_high = 1; y_high < device_ctx.grid.height(); y_high++) {
303271
acc_tile_num_inter_die_conn_[x_high][y_high] = acc_tile_num_inter_die_conn_[x_high-1][y_high] +
304-
acc_tile_num_inter_die_conn_[x_high][y_high-1] +
305-
tile_num_inter_die_conn[x_high][y_high] -
306-
acc_tile_num_inter_die_conn_[x_high-1][y_high-1];
272+
acc_tile_num_inter_die_conn_[x_high][y_high-1] +
273+
tile_num_inter_die_conn[x_high][y_high] -
274+
acc_tile_num_inter_die_conn_[x_high-1][y_high-1];
307275
}
308276
}
309277
}
@@ -1421,7 +1389,7 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
14211389

14221390
const t_bb& bb = use_ts ? ts_bb_coord_new_[net_id] : placer_state_.move().bb_coords[net_id];
14231391

1424-
double crossing = wirelength_crossing_count(cluster_ctx.clb_nlist.net_pins(net_id).size());
1392+
const double crossing = wirelength_crossing_count(cluster_ctx.clb_nlist.net_pins(net_id).size());
14251393

14261394
/* Could insert a check for xmin == xmax. In that case, assume *
14271395
* connection will be made with no bends and hence no x-cost. *
@@ -1437,8 +1405,9 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
14371405
*/
14381406

14391407
double ncost;
1440-
ncost = (bb.xmax - bb.xmin + 1) * chanx_place_cost_fac_[bb.ymax][bb.ymin - 1];
1441-
ncost += (bb.ymax - bb.ymin + 1) * chany_place_cost_fac_[bb.xmax][bb.xmin - 1];
1408+
const auto [chanx_cost_fac, chany_cost_fac] = get_chanxy_cost_fac_(bb);
1409+
ncost = (bb.xmax - bb.xmin + 1) * chanx_cost_fac;
1410+
ncost += (bb.ymax - bb.ymin + 1) * chany_cost_fac;
14421411
if (is_multi_layer_) {
14431412
ncost += (bb.layer_max - bb.layer_min) * get_chanz_cost_factor_(bb);
14441413
}
@@ -1448,6 +1417,7 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
14481417
return ncost;
14491418
}
14501419

1420+
14511421
double NetCostHandler::get_net_per_layer_bb_cost_(ClusterNetId net_id , bool use_ts) {
14521422
const auto& move_ctx = placer_state_.move();
14531423

@@ -1469,7 +1439,7 @@ double NetCostHandler::get_net_per_layer_bb_cost_(ClusterNetId net_id , bool use
14691439
/* Adjust the bounding box half perimeter by the wirelength correction
14701440
* factor based on terminal count, which is 1 for the source + the number
14711441
* of sinks on this layer. */
1472-
double crossing = wirelength_crossing_count(layer_pin_sink_count[layer_num] + 1);
1442+
const double crossing = wirelength_crossing_count(layer_pin_sink_count[layer_num] + 1);
14731443

14741444
/* Could insert a check for xmin == xmax. In that case, assume *
14751445
* connection will be made with no bends and hence no x-cost. *
@@ -1484,11 +1454,10 @@ double NetCostHandler::get_net_per_layer_bb_cost_(ClusterNetId net_id , bool use
14841454
* acc_chan?_width_ objects can handle -1 indices internally.
14851455
*/
14861456

1487-
ncost += (bb[layer_num].xmax - bb[layer_num].xmin + 1) * crossing
1488-
* chanx_place_cost_fac_[bb[layer_num].ymax][bb[layer_num].ymin - 1];
1489-
1490-
ncost += (bb[layer_num].ymax - bb[layer_num].ymin + 1) * crossing
1491-
* chany_place_cost_fac_[bb[layer_num].xmax][bb[layer_num].xmin - 1];
1457+
const auto[chanx_cost_fac, chany_cost_fac] = get_chanxy_cost_fac_(bb[layer_num]);
1458+
ncost += (bb[layer_num].xmax - bb[layer_num].xmin + 1) * chanx_cost_fac;
1459+
ncost += (bb[layer_num].ymax - bb[layer_num].ymin + 1) * chany_cost_fac;
1460+
ncost *= crossing;
14921461
}
14931462

14941463
return ncost;
@@ -1546,8 +1515,6 @@ double NetCostHandler::get_net_wirelength_from_layer_bb_(ClusterNetId net_id) {
15461515
}
15471516

15481517
float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) {
1549-
float place_cost_exp = placer_opts_.place_cost_exp;
1550-
15511518
int num_inter_dir_conn;
15521519

15531520
if (bb.xmin == 0 && bb.ymin == 0) {
@@ -1571,7 +1538,6 @@ float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) {
15711538
} else {
15721539
int bb_num_tiles = (bb.xmax - bb.xmin + 1) * (bb.ymax - bb.ymin + 1);
15731540
z_cost_factor = bb_num_tiles / static_cast<float>(num_inter_dir_conn);
1574-
z_cost_factor = pow((double)z_cost_factor, (double)place_cost_exp);
15751541
}
15761542

15771543
return z_cost_factor;

0 commit comments

Comments
 (0)