Skip to content

Commit 0b1c8ae

Browse files
Merge branch 'master' into temp_clean_rrgraph_gen
2 parents bb23e6c + 5090124 commit 0b1c8ae

File tree

34 files changed

+623
-140
lines changed

34 files changed

+623
-140
lines changed

doc/src/api/vtrutil/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ VTRUTIL API
1111
container_utils
1212
logging
1313
geometry
14+
parallel
1415
other

doc/src/api/vtrutil/parallel.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
========
2+
Parallel
3+
========
4+
5+
vtr_thread_pool
6+
---------------
7+
.. doxygenfile:: vtr_thread_pool.h
8+
:project: vtr
9+
:sections: briefdescription detaileddescription func innernamespace enum
10+
11+
.. doxygenclass:: vtr::thread_pool
12+
:project: vtr
13+
:members:

doc/src/vpr/command_line_usage.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1253,6 +1253,15 @@ Analytical Placement is generally split into three stages:
12531253

12541254
**Default:** ``annealer``
12551255

1256+
.. option:: --ap_timing_tradeoff <float>
1257+
1258+
Controls the trade-off between wirelength (HPWL) and delay minimization in the AP flow.
1259+
1260+
A value of 0.0 makes the AP flow focus completely on wirelength minimization,
1261+
while a value of 1.0 makes the AP flow focus completely on timing optimization.
1262+
1263+
**Default:** ``0.5``
1264+
12561265
.. option:: --ap_verbosity <int>
12571266

12581267
Controls the verbosity of the AP flow output.

libs/libvtrutil/src/vtr_thread_pool.h

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,39 +26,42 @@ namespace vtr {
2626
*
2727
* Example usage:
2828
*
29-
* vtr::thread_pool pool(4);
29+
* ```
30+
* vtr::thread_pool pool(4); // 4 threads
3031
* pool.schedule_work([]{
3132
* // Task body
3233
* });
33-
* pool.wait_for_all(); // There's no API to wait for a single task
34+
* pool.wait_for_all(); // There's no API to wait for a single task
35+
* ```
3436
*/
3537
class thread_pool {
3638
private:
37-
/* Thread-local data */
39+
/** Thread-local data */
3840
struct ThreadData {
3941
std::thread thread;
40-
/* Per-thread task queue */
42+
/** Per-thread task queue */
4143
std::queue<std::function<void()>> task_queue;
4244

43-
/* Threads wait on cv for a stop signal or a new task
45+
/** Threads wait on cv for a stop signal or a new task
4446
* queue_mutex is required for condition variable */
4547
std::mutex queue_mutex;
4648
std::condition_variable cv;
4749
bool stop = false;
4850
};
4951

50-
/* Container for thread-local data */
52+
/** Container for thread-local data */
5153
std::vector<std::unique_ptr<ThreadData>> threads;
52-
/* Used for round-robin scheduling */
54+
/** Used for round-robin scheduling */
5355
std::atomic<size_t> next_thread{0};
54-
/* Used for wait_for_all */
56+
/** Used for wait_for_all */
5557
std::atomic<size_t> active_tasks{0};
5658

57-
/* Condition variable for wait_for_all */
59+
/** Condition variable for wait_for_all */
5860
std::mutex completion_mutex;
5961
std::condition_variable completion_cv;
6062

6163
public:
64+
/** Create a thread pool with \p thread_count threads. */
6265
thread_pool(size_t thread_count) {
6366
threads.reserve(thread_count);
6467

@@ -96,6 +99,7 @@ class thread_pool {
9699
}
97100
}
98101

102+
/** Schedule a function to be executed on one of the threads. */
99103
template<typename F>
100104
void schedule_work(F&& f) {
101105
active_tasks++;
@@ -133,6 +137,8 @@ class thread_pool {
133137
thread_data->cv.notify_one();
134138
}
135139

140+
/** Wait until the work queue is empty.
141+
* Note that functions are allowed to schedule new functions. */
136142
void wait_for_all() {
137143
std::unique_lock<std::mutex> lock(completion_mutex);
138144
completion_cv.wait(lock, [this]() { return active_tasks == 0; });

vpr/src/analytical_place/analytical_placement_flow.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@
77

88
#include "analytical_placement_flow.h"
99
#include <memory>
10+
#include "PreClusterTimingManager.h"
1011
#include "analytical_solver.h"
1112
#include "ap_netlist.h"
1213
#include "atom_netlist.h"
14+
#include "cluster_util.h"
1315
#include "detailed_placer.h"
1416
#include "full_legalizer.h"
1517
#include "gen_ap_netlist_from_atoms.h"
@@ -120,6 +122,7 @@ static PartialPlacement run_global_placer(const t_ap_opts& ap_opts,
120122
const AtomNetlist& atom_nlist,
121123
const APNetlist& ap_netlist,
122124
const Prepacker& prepacker,
125+
const PreClusterTimingManager& pre_cluster_timing_manager,
123126
const DeviceContext& device_ctx) {
124127
if (g_vpr_ctx.atom().flat_placement_info().valid) {
125128
VTR_LOG("Flat Placement is provided in the AP flow, skipping the Global Placement.\n");
@@ -139,6 +142,8 @@ static PartialPlacement run_global_placer(const t_ap_opts& ap_opts,
139142
device_ctx.grid,
140143
device_ctx.logical_block_types,
141144
device_ctx.physical_tile_types,
145+
pre_cluster_timing_manager,
146+
ap_opts.ap_timing_tradeoff,
142147
ap_opts.log_verbosity);
143148
return global_placer->place();
144149
}
@@ -163,12 +168,25 @@ void run_analytical_placement_flow(t_vpr_setup& vpr_setup) {
163168
constraints);
164169
print_ap_netlist_stats(ap_netlist);
165170

171+
// Pre-compute the pre-clustering timing delays. This object will be passed
172+
// into the global placer and the full legalizer to make them timing driven.
173+
PreClusterTimingManager pre_cluster_timing_manager(vpr_setup.PackerOpts.timing_driven,
174+
atom_nlist,
175+
g_vpr_ctx.atom().lookup(),
176+
prepacker,
177+
vpr_setup.PackerOpts.timing_update_type,
178+
*device_ctx.arch,
179+
vpr_setup.RoutingArch,
180+
vpr_setup.PackerOpts.device_layout,
181+
vpr_setup.AnalysisOpts);
182+
166183
// Run the Global Placer.
167184
const t_ap_opts& ap_opts = vpr_setup.APOpts;
168185
PartialPlacement p_placement = run_global_placer(ap_opts,
169186
atom_nlist,
170187
ap_netlist,
171188
prepacker,
189+
pre_cluster_timing_manager,
172190
device_ctx);
173191

174192
// Verify that the partial placement is valid before running the full
@@ -185,6 +203,7 @@ void run_analytical_placement_flow(t_vpr_setup& vpr_setup) {
185203
ap_netlist,
186204
atom_nlist,
187205
prepacker,
206+
pre_cluster_timing_manager,
188207
vpr_setup,
189208
*device_ctx.arch,
190209
device_ctx.grid);

vpr/src/analytical_place/analytical_solver.cpp

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
#include <memory>
1414
#include <utility>
1515
#include <vector>
16+
#include "PreClusterTimingManager.h"
17+
#include "atom_netlist.h"
18+
#include "atom_netlist_fwd.h"
1619
#include "device_grid.h"
1720
#include "flat_placement_types.h"
1821
#include "partial_placement.h"
@@ -42,23 +45,39 @@
4245
std::unique_ptr<AnalyticalSolver> make_analytical_solver(e_ap_analytical_solver solver_type,
4346
const APNetlist& netlist,
4447
const DeviceGrid& device_grid,
48+
const AtomNetlist& atom_netlist,
49+
const PreClusterTimingManager& pre_cluster_timing_manager,
50+
float ap_timing_tradeoff,
4551
int log_verbosity) {
4652
// Based on the solver type passed in, build the solver.
4753
switch (solver_type) {
4854
case e_ap_analytical_solver::QP_Hybrid:
4955
#ifdef EIGEN_INSTALLED
50-
return std::make_unique<QPHybridSolver>(netlist, device_grid, log_verbosity);
56+
return std::make_unique<QPHybridSolver>(netlist,
57+
device_grid,
58+
atom_netlist,
59+
pre_cluster_timing_manager,
60+
ap_timing_tradeoff,
61+
log_verbosity);
5162
#else
5263
(void)netlist;
5364
(void)device_grid;
65+
(void)atom_netlist;
66+
(void)pre_cluster_timing_manager;
67+
(void)ap_timing_tradeoff;
5468
(void)log_verbosity;
5569
VPR_FATAL_ERROR(VPR_ERROR_AP,
5670
"QP Hybrid Solver requires the Eigen library");
5771
break;
5872
#endif // EIGEN_INSTALLED
5973
case e_ap_analytical_solver::LP_B2B:
6074
#ifdef EIGEN_INSTALLED
61-
return std::make_unique<B2BSolver>(netlist, device_grid, log_verbosity);
75+
return std::make_unique<B2BSolver>(netlist,
76+
device_grid,
77+
atom_netlist,
78+
pre_cluster_timing_manager,
79+
ap_timing_tradeoff,
80+
log_verbosity);
6281
#else
6382
VPR_FATAL_ERROR(VPR_ERROR_AP,
6483
"LP B2B Solver requires the Eigen library");
@@ -72,10 +91,15 @@ std::unique_ptr<AnalyticalSolver> make_analytical_solver(e_ap_analytical_solver
7291
return nullptr;
7392
}
7493

75-
AnalyticalSolver::AnalyticalSolver(const APNetlist& netlist, int log_verbosity)
94+
AnalyticalSolver::AnalyticalSolver(const APNetlist& netlist,
95+
const AtomNetlist& atom_netlist,
96+
const PreClusterTimingManager& pre_cluster_timing_manager,
97+
float ap_timing_tradeoff,
98+
int log_verbosity)
7699
: netlist_(netlist)
77100
, blk_id_to_row_id_(netlist.blocks().size(), APRowId::INVALID())
78101
, row_id_to_blk_id_(netlist.blocks().size(), APBlockId::INVALID())
102+
, net_weights_(netlist.nets().size(), 1.0f)
79103
, log_verbosity_(log_verbosity) {
80104
// Get the number of moveable blocks in the netlist and create a unique
81105
// row ID from [0, num_moveable_blocks) for each moveable block in the
@@ -94,6 +118,21 @@ AnalyticalSolver::AnalyticalSolver(const APNetlist& netlist, int log_verbosity)
94118
current_row_id++;
95119
num_moveable_blocks_++;
96120
}
121+
122+
if (pre_cluster_timing_manager.is_valid()) {
123+
for (APNetId net_id : netlist.nets()) {
124+
// Get the atom net associated with the given AP net. When
125+
// constructing the AP netlist, we happen to set the name of each
126+
// AP net to the same name as the atom net that generated them!
127+
// TODO: Create a proper lookup structure to go from the AP Netlist
128+
// back to the Atom Netlist.
129+
AtomNetId atom_net_id = atom_netlist.find_net(netlist.net_name(net_id));
130+
VTR_ASSERT(atom_net_id.is_valid());
131+
float crit = pre_cluster_timing_manager.calc_net_setup_criticality(atom_net_id, atom_netlist);
132+
133+
net_weights_[net_id] = ap_timing_tradeoff * crit + (1.0f - ap_timing_tradeoff);
134+
}
135+
}
97136
}
98137

99138
#ifdef EIGEN_INSTALLED
@@ -201,12 +240,15 @@ void QPHybridSolver::init_linear_system() {
201240
for (APNetId net_id : netlist_.nets()) {
202241
size_t num_pins = netlist_.net_pins(net_id).size();
203242
VTR_ASSERT_DEBUG(num_pins > 1);
243+
244+
double net_weight = net_weights_[net_id];
245+
204246
if (num_pins > star_num_pins_threshold) {
205247
// Create a star node and connect each block in the net to the star
206248
// node.
207249
// Using the weight from FastPlace
208250
// TODO: Investigate other weight terms.
209-
double w = static_cast<double>(num_pins) / static_cast<double>(num_pins - 1);
251+
double w = net_weight * static_cast<double>(num_pins) / static_cast<double>(num_pins - 1);
210252
size_t star_node_id = num_moveable_blocks_ + star_node_offset;
211253
for (APPinId pin_id : netlist_.net_pins(net_id)) {
212254
APBlockId blk_id = netlist_.pin_block(pin_id);
@@ -220,7 +262,7 @@ void QPHybridSolver::init_linear_system() {
220262
// exactly once to every other block in the net.
221263
// Using the weight from FastPlace
222264
// TODO: Investigate other weight terms.
223-
double w = 1.0 / static_cast<double>(num_pins - 1);
265+
double w = net_weight * 1.0 / static_cast<double>(num_pins - 1);
224266
for (size_t ipin_idx = 0; ipin_idx < num_pins; ipin_idx++) {
225267
APPinId first_pin_id = netlist_.net_pin(net_id, ipin_idx);
226268
APBlockId first_blk_id = netlist_.pin_block(first_pin_id);
@@ -638,6 +680,7 @@ static inline APNetBounds get_unique_net_bounds(APNetId net_id,
638680
void B2BSolver::add_connection_to_system(APBlockId first_blk_id,
639681
APBlockId second_blk_id,
640682
size_t num_pins,
683+
double net_w,
641684
const vtr::vector<APBlockId, double>& blk_locs,
642685
std::vector<Eigen::Triplet<double>>& triplet_list,
643686
Eigen::VectorXd& b) {
@@ -660,7 +703,7 @@ void B2BSolver::add_connection_to_system(APBlockId first_blk_id,
660703
// The denominator of weight is zero, which causes infinity term in the matrix. Another way of
661704
// interpreting epsilon is the minimum distance two nodes are considered to be in placement.
662705
double dist = std::max(std::abs(blk_locs[first_blk_id] - blk_locs[second_blk_id]), distance_epsilon_);
663-
double w = (2.0 / static_cast<double>(num_pins - 1)) * (1.0 / dist);
706+
double w = net_w * (2.0 / static_cast<double>(num_pins - 1)) * (1.0 / dist);
664707

665708
// Update the connectivity matrix and the constant vector.
666709
// This is similar to how connections are added for the quadratic formulation.
@@ -696,6 +739,8 @@ void B2BSolver::init_linear_system(PartialPlacement& p_placement) {
696739
size_t num_pins = netlist_.net_pins(net_id).size();
697740
VTR_ASSERT_SAFE_MSG(num_pins > 1, "net must have at least 2 pins");
698741

742+
double net_w = net_weights_[net_id];
743+
699744
// Find the bounding blocks
700745
APNetBounds net_bounds = get_unique_net_bounds(net_id, p_placement, netlist_);
701746

@@ -706,19 +751,19 @@ void B2BSolver::init_linear_system(PartialPlacement& p_placement) {
706751
for (APPinId pin_id : netlist_.net_pins(net_id)) {
707752
APBlockId blk_id = netlist_.pin_block(pin_id);
708753
if (blk_id != net_bounds.max_x_blk && blk_id != net_bounds.min_x_blk) {
709-
add_connection_to_system(blk_id, net_bounds.max_x_blk, num_pins, p_placement.block_x_locs, triplet_list_x, b_x);
710-
add_connection_to_system(blk_id, net_bounds.min_x_blk, num_pins, p_placement.block_x_locs, triplet_list_x, b_x);
754+
add_connection_to_system(blk_id, net_bounds.max_x_blk, num_pins, net_w, p_placement.block_x_locs, triplet_list_x, b_x);
755+
add_connection_to_system(blk_id, net_bounds.min_x_blk, num_pins, net_w, p_placement.block_x_locs, triplet_list_x, b_x);
711756
}
712757
if (blk_id != net_bounds.max_y_blk && blk_id != net_bounds.min_y_blk) {
713-
add_connection_to_system(blk_id, net_bounds.max_y_blk, num_pins, p_placement.block_y_locs, triplet_list_y, b_y);
714-
add_connection_to_system(blk_id, net_bounds.min_y_blk, num_pins, p_placement.block_y_locs, triplet_list_y, b_y);
758+
add_connection_to_system(blk_id, net_bounds.max_y_blk, num_pins, net_w, p_placement.block_y_locs, triplet_list_y, b_y);
759+
add_connection_to_system(blk_id, net_bounds.min_y_blk, num_pins, net_w, p_placement.block_y_locs, triplet_list_y, b_y);
715760
}
716761
}
717762

718763
// Connect the bounds to each other. It's just easier to put these here
719764
// instead of in the for loop above.
720-
add_connection_to_system(net_bounds.max_x_blk, net_bounds.min_x_blk, num_pins, p_placement.block_x_locs, triplet_list_x, b_x);
721-
add_connection_to_system(net_bounds.max_y_blk, net_bounds.min_y_blk, num_pins, p_placement.block_y_locs, triplet_list_y, b_y);
765+
add_connection_to_system(net_bounds.max_x_blk, net_bounds.min_x_blk, num_pins, net_w, p_placement.block_x_locs, triplet_list_x, b_x);
766+
add_connection_to_system(net_bounds.max_y_blk, net_bounds.min_y_blk, num_pins, net_w, p_placement.block_y_locs, triplet_list_y, b_y);
722767
}
723768

724769
// Build the sparse connectivity matrices from the triplets.

0 commit comments

Comments
 (0)