Skip to content

Commit 6634f57

Browse files
authored
Merge pull request #2961 from AlexandreSinger/feature-ap-tuning
[AP] Tuned the AP Flow
2 parents 64ab163 + ddd81e6 commit 6634f57

File tree

24 files changed

+253
-111
lines changed

24 files changed

+253
-111
lines changed

doc/src/vpr/command_line_usage.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1214,7 +1214,7 @@ Analytical Placement is generally split into three stages:
12141214
Uses the legalized solution as anchor-points to pull the solution to a
12151215
more legal solution (similar to the approach from SimPL :cite:`Kim2013_SimPL`).
12161216

1217-
**Default:** ``qp-hybrid``
1217+
**Default:** ``lp-b2b``
12181218

12191219
.. option:: --ap_partial_legalizer {bipartitioning | flow-based}
12201220

vpr/src/analytical_place/analytical_solver.cpp

Lines changed: 50 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,8 @@ void QPHybridSolver::solve(unsigned iteration, PartialPlacement& p_placement) {
335335
p_placement, iteration);
336336
}
337337
// Verify that the constant vectors are valid.
338-
VTR_ASSERT_DEBUG(!b_x_diff.hasNaN() && "b_x has NaN!");
339-
VTR_ASSERT_DEBUG(!b_y_diff.hasNaN() && "b_y has NaN!");
338+
VTR_ASSERT_SAFE_MSG(!b_x_diff.hasNaN(), "b_x has NaN!");
339+
VTR_ASSERT_SAFE_MSG(!b_y_diff.hasNaN(), "b_y has NaN!");
340340

341341
// Set up the ConjugateGradient Solver using the coefficient matrix.
342342
// TODO: can change cg.tolerance to increase performance when needed
@@ -479,8 +479,11 @@ void B2BSolver::b2b_solve_loop(unsigned iteration, PartialPlacement& p_placement
479479
// p_placement.
480480
// 3) Repeat. Note: We need to repeat step 1 and 2 iteratively since
481481
// the bounds are likely to have changed after step 2.
482-
// TODO: As well as having a maximum number of bound updates, should also
483-
// investigate stopping when the HPWL converges.
482+
// We stop when it looks like the placement is converging (the change in
483+
// HPWL is sufficiently small for a few iterations).
484+
double prev_hpwl = std::numeric_limits<double>::max();
485+
double curr_hpwl = prev_hpwl;
486+
unsigned num_convergence = 0;
484487
for (unsigned counter = 0; counter < max_num_bound_updates_; counter++) {
485488
VTR_LOGV(log_verbosity_ >= 10,
486489
"\tPlacement HPWL in b2b loop: %f\n",
@@ -490,7 +493,7 @@ void B2BSolver::b2b_solve_loop(unsigned iteration, PartialPlacement& p_placement
490493
float build_linear_system_start_time = runtime_timer.elapsed_sec();
491494
init_linear_system(p_placement);
492495
if (iteration != 0)
493-
update_linear_system_with_anchors(p_placement, iteration);
496+
update_linear_system_with_anchors(iteration);
494497
total_time_spent_building_linear_system_ += runtime_timer.elapsed_sec() - build_linear_system_start_time;
495498
VTR_ASSERT_SAFE_MSG(!b_x.hasNaN(), "b_x has NaN!");
496499
VTR_ASSERT_SAFE_MSG(!b_y.hasNaN(), "b_y has NaN!");
@@ -524,22 +527,24 @@ void B2BSolver::b2b_solve_loop(unsigned iteration, PartialPlacement& p_placement
524527
total_time_spent_solving_linear_system_ += runtime_timer.elapsed_sec() - solve_linear_system_start_time;
525528

526529
// Save the result into the partial placement object.
527-
for (size_t row_id_idx = 0; row_id_idx < num_moveable_blocks_; row_id_idx++) {
528-
// Since we are capping the number of iterations, the solver may not
529-
// have enough time to converge on a solution that is on the device.
530-
// We just clamp the solution to zero for now.
531-
// TODO: Should handle this better. If the solution is very negative
532-
// it may indicate a bug.
533-
if (x[row_id_idx] < 0.0)
534-
x[row_id_idx] = 0.0;
535-
if (y[row_id_idx] < 0.0)
536-
y[row_id_idx] = 0.0;
537-
538-
APRowId row_id = APRowId(row_id_idx);
539-
APBlockId blk_id = row_id_to_blk_id_[row_id];
540-
p_placement.block_x_locs[blk_id] = x[row_id_idx];
541-
p_placement.block_y_locs[blk_id] = y[row_id_idx];
542-
}
530+
store_solution_into_placement(x, y, p_placement);
531+
532+
// If the current HPWL is larger than the previous HPWL (i.e. the HPWL
533+
// got worse since the last B2B iter) or the gap between the two solutions
534+
// is small, increment a counter.
535+
// TODO: Since, in theory, the HPWL could get worse due to numerical
536+
// reasons, should we save the best result? May not be worth it...
537+
curr_hpwl = p_placement.get_hpwl(netlist_);
538+
double target_gap = b2b_convergence_gap_fac_ * curr_hpwl;
539+
if (curr_hpwl > prev_hpwl || std::abs(curr_hpwl - prev_hpwl) < target_gap)
540+
num_convergence++;
541+
542+
// If the HPWL got close enough times, stop. This is to allow the HPWL
543+
// to "bounce", which can happen as it converges.
544+
// This trades off quality for run time.
545+
if (num_convergence >= target_num_b2b_convergences_)
546+
break;
547+
prev_hpwl = curr_hpwl;
543548

544549
// Update the guesses with the most recent answer
545550
x_guess = x;
@@ -723,8 +728,7 @@ void B2BSolver::init_linear_system(PartialPlacement& p_placement) {
723728

724729
// This function adds anchors for the legalized solution. Anchors are treated as fixed nodes,
725730
// each connecting to a movable node. The number of nodes in an anchor net is always 2.
726-
void B2BSolver::update_linear_system_with_anchors(PartialPlacement& p_placement,
727-
unsigned iteration) {
731+
void B2BSolver::update_linear_system_with_anchors(unsigned iteration) {
728732
VTR_ASSERT_SAFE_MSG(iteration != 0,
729733
"no fixed solution to anchor to in the first iteration");
730734
// Get the anchor weight based on the iteration number. We want the anchor
@@ -733,23 +737,39 @@ void B2BSolver::update_linear_system_with_anchors(PartialPlacement& p_placement,
733737
double coeff_pseudo_anchor = anchor_weight_mult_ * std::exp((double)iteration / anchor_weight_exp_fac_);
734738

735739
// Add an anchor for each moveable block to its solved position.
736-
// Note: We treat anchors as being a 2-pin net between a moveable block
737-
// and a fixed block where both are the bounds of the net.
738740
for (size_t row_id_idx = 0; row_id_idx < num_moveable_blocks_; row_id_idx++) {
739741
APRowId row_id = APRowId(row_id_idx);
740742
APBlockId blk_id = row_id_to_blk_id_[row_id];
741-
double dx = std::abs(p_placement.block_x_locs[blk_id] - block_x_locs_legalized[blk_id]);
742-
double dy = std::abs(p_placement.block_y_locs[blk_id] - block_y_locs_legalized[blk_id]);
743-
// Anchor node are always 2 pins.
744-
double pseudo_w_x = coeff_pseudo_anchor * 2.0 / std::max(dx, distance_epsilon_);
745-
double pseudo_w_y = coeff_pseudo_anchor * 2.0 / std::max(dy, distance_epsilon_);
743+
double pseudo_w_x = coeff_pseudo_anchor * 2.0;
744+
double pseudo_w_y = coeff_pseudo_anchor * 2.0;
746745
A_sparse_x.coeffRef(row_id_idx, row_id_idx) += pseudo_w_x;
747746
A_sparse_y.coeffRef(row_id_idx, row_id_idx) += pseudo_w_y;
748747
b_x(row_id_idx) += pseudo_w_x * block_x_locs_legalized[blk_id];
749748
b_y(row_id_idx) += pseudo_w_y * block_y_locs_legalized[blk_id];
750749
}
751750
}
752751

752+
/**
 * @brief Writes the solved x/y coordinates of every moveable block into the
 *        given partial placement.
 *
 * Negative coordinates are clamped to 0.0 in place before being stored, so
 * x_soln and y_soln may be modified by this call. Only the lower bound is
 * clamped here; no upper bound is checked in this function.
 *
 *  @param x_soln       Solved x coordinates, indexed by row index.
 *  @param y_soln       Solved y coordinates, indexed by row index.
 *  @param p_placement  Placement whose block_x_locs / block_y_locs entries
 *                      are overwritten for each moveable block.
 */
void B2BSolver::store_solution_into_placement(Eigen::VectorXd& x_soln,
753+
Eigen::VectorXd& y_soln,
754+
PartialPlacement& p_placement) {
755+
for (size_t row_id_idx = 0; row_id_idx < num_moveable_blocks_; row_id_idx++) {
756+
// Since we are capping the number of iterations, the solver may not
757+
// have enough time to converge on a solution that is on the device.
758+
// For now, negative coordinates are simply clamped to zero.
759+
// TODO: Should handle this better. If the solution is very negative
760+
// it may indicate a bug.
761+
if (x_soln[row_id_idx] < 0.0)
762+
x_soln[row_id_idx] = 0.0;
763+
if (y_soln[row_id_idx] < 0.0)
764+
y_soln[row_id_idx] = 0.0;
765+
766+
// Map the row index to its block ID and store the (clamped) position.
APRowId row_id = APRowId(row_id_idx);
767+
APBlockId blk_id = row_id_to_blk_id_[row_id];
768+
p_placement.block_x_locs[blk_id] = x_soln[row_id_idx];
769+
p_placement.block_y_locs[blk_id] = y_soln[row_id_idx];
770+
}
771+
}
772+
753773
void B2BSolver::print_statistics() {
754774
VTR_LOG("B2B Solver Statistics:\n");
755775
VTR_LOG("\tTotal number of CG iterations: %u\n", total_num_cg_iters_);

vpr/src/analytical_place/analytical_solver.h

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -359,11 +359,26 @@ class B2BSolver : public AnalyticalSolver {
359359
/// than some epsilon.
360360
/// Decreasing this number may lead to more instability, but can yield
361361
/// a higher quality solution.
362-
static constexpr double distance_epsilon_ = 0.5;
362+
static constexpr double distance_epsilon_ = 0.01;
363+
364+
/// @brief The gap between the HPWL of the current solved solution in the
365+
/// B2B loop and the previous solved solution that is considered to
366+
/// be close-enough to be converged (as a fraction of the current
367+
/// solved solution HPWL).
368+
/// Decreasing this number toward zero would cause the B2B solver to run
369+
/// more iterations to try and reduce the HPWL further.
370+
static constexpr double b2b_convergence_gap_fac_ = 0.001;
371+
372+
/// @brief The number of times the B2B loop should "converge" before stopping
373+
/// the loop. Due to numerical inaccuracies, it is possible for the
374+
/// HPWL to bounce up and down as it converges. Increasing this number
375+
/// will allow more bounces which may get better quality; however
376+
/// more iterations will need to be run.
377+
static constexpr unsigned target_num_b2b_convergences_ = 2;
363378

364379
/// @brief Max number of bound update / solve iterations. Increasing this
365380
/// number will yield better quality at the expense of runtime.
366-
static constexpr unsigned max_num_bound_updates_ = 6;
381+
static constexpr unsigned max_num_bound_updates_ = 24;
367382

368383
/// @brief Max number of iterations the Conjugate Gradient solver can perform.
369384
/// Due to the weights getting very large in the early iterations of
@@ -376,7 +391,7 @@ class B2BSolver : public AnalyticalSolver {
376391
/// to prevent this behaviour and get good runtime.
377392
// TODO: Need to investigate this more to find a good number for this.
378393
// TODO: Should this be a proportion of the design size?
379-
static constexpr unsigned max_cg_iterations_ = 200;
394+
static constexpr unsigned max_cg_iterations_ = 150;
380395

381396
// The following constants are used to configure the anchor weighting.
382397
// The weights of anchors grow exponentially each iteration by the following
@@ -509,8 +524,18 @@ class B2BSolver : public AnalyticalSolver {
509524
* @brief Updates the linear system with anchor-blocks from the legalized
510525
* solution.
511526
*/
512-
void update_linear_system_with_anchors(PartialPlacement& p_placement,
513-
unsigned iteration);
527+
void update_linear_system_with_anchors(unsigned iteration);
528+
529+
/**
530+
* @brief Store the x and y solutions in Eigen's vectors into the partial
531+
* placement object.
532+
*
533+
* Note: The x_soln and y_soln may be modified if it is found that the
534+
* solution is impossible (i.e. has negative positions).
535+
*/
536+
void store_solution_into_placement(Eigen::VectorXd& x_soln,
537+
Eigen::VectorXd& y_soln,
538+
PartialPlacement& p_placement);
514539

515540
// The following are variables used to store the system of equations to be
516541
// solved in the x and y dimensions. The equations are of the form:

vpr/src/analytical_place/global_placer.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "global_placer.h"
1010
#include <cstdio>
11+
#include <limits>
1112
#include <memory>
1213
#include <vector>
1314
#include "analytical_solver.h"
@@ -207,6 +208,12 @@ PartialPlacement SimPLGlobalPlacer::place() {
207208
float total_time_spent_in_solver = 0.0f;
208209
float total_time_spent_in_legalizer = 0.0f;
209210

211+
// Create a partial placement object to store the best placement found during
212+
// global placement. It is possible for the global placement to hit a minimum
213+
// in the middle of its iterations, this lets us keep that solution.
214+
PartialPlacement best_p_placement(ap_netlist_);
215+
double best_ub_hpwl = std::numeric_limits<double>::max();
216+
210217
// Run the global placer.
211218
for (size_t i = 0; i < max_num_iterations_; i++) {
212219
float iter_start_time = runtime_timer.elapsed_sec();
@@ -235,6 +242,12 @@ PartialPlacement SimPLGlobalPlacer::place() {
235242
iter_end_time - iter_start_time);
236243
}
237244

245+
// If this placement is better than the best we have seen, save it.
246+
if (ub_hpwl < best_ub_hpwl) {
247+
best_ub_hpwl = ub_hpwl;
248+
best_p_placement = p_placement;
249+
}
250+
238251
// Exit condition: If the upper-bound and lower-bound HPWLs are
239252
// sufficiently close together then stop.
240253
double hpwl_relative_gap = (ub_hpwl - lb_hpwl) / ub_hpwl;
@@ -254,12 +267,12 @@ PartialPlacement SimPLGlobalPlacer::place() {
254267

255268
// Print some statistics on the final placement.
256269
VTR_LOG("Placement after Global Placement:\n");
257-
print_placement_stats(p_placement,
270+
print_placement_stats(best_p_placement,
258271
ap_netlist_,
259272
*density_manager_);
260273

261274
// Return the best placement found across all iterations (lowest upper-bound HPWL).
262275
// TODO: investigate saving the best solution found so far. It should be
263276
// cheap to save a copy of the PartialPlacement object.
264-
return p_placement;
277+
return best_p_placement;
265278
}

vpr/src/analytical_place/partial_legalizer.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1490,12 +1490,17 @@ void BiPartitioningPartialLegalizer::partition_blocks_in_window(
14901490
// windows. To do this we sort the unplaced blocks by largest mass to
14911491
// smallest mass. Then we place each block in the bin with the highest
14921492
// underfill.
1493+
// FIXME: Above was the intuition; however, after experimentation, found that
1494+
// sorting by smallest mass to largest mass worked better...
1495+
// FIXME: I think large blocks (like carry chains) need to be handled specially
1496+
// early on. If they are put into a partition too late, they may have
1497+
// to create overfill! Perhaps the partitions can hold two lists.
14931498
std::sort(unplaced_blocks.begin(),
14941499
unplaced_blocks.end(),
14951500
[&](APBlockId a, APBlockId b) {
14961501
const auto& blk_a_mass = density_manager_->mass_calculator().get_block_mass(a);
14971502
const auto& blk_b_mass = density_manager_->mass_calculator().get_block_mass(b);
1498-
return blk_a_mass.manhattan_norm() > blk_b_mass.manhattan_norm();
1503+
return blk_a_mass.manhattan_norm() < blk_b_mass.manhattan_norm();
14991504
});
15001505
for (APBlockId blk_id : unplaced_blocks) {
15011506
// Project the underfill from each window onto the mass. This gives us

vpr/src/analytical_place/partial_legalizer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ class BiPartitioningPartialLegalizer : public PartialLegalizer {
367367
/// create large windows; decreasing this number will put more pressure on
368368
/// the window generation code, which can increase window size and runtime.
369369
/// TODO: Should this be distance instead of number of bins?
370-
static constexpr int max_bin_cluster_gap_ = 1;
370+
static constexpr int max_bin_cluster_gap_ = 2;
371371

372372
public:
373373
/**

vpr/src/base/read_options.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1903,7 +1903,7 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
19031903
"Controls which Analytical Solver the Global Placer will use in the AP Flow.\n"
19041904
" * qp-hybrid: olves for a placement that minimizes the quadratic HPWL of the flat placement using a hybrid clique/star net model.\n"
19051905
" * lp-b2b: Solves for a placement that minimizes the linear HPWL of theflat placement using the Bound2Bound net model.")
1906-
.default_value("qp-hybrid")
1906+
.default_value("lp-b2b")
19071907
.show_in(argparse::ShowIn::HELP_ONLY);
19081908

19091909
ap_grp.add_argument<e_ap_partial_legalizer, ParseAPPartialLegalizer>(args.ap_partial_legalizer, "--ap_partial_legalizer")

vpr/src/pack/appack_context.h

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
#include <limits>
1313
#include "device_grid.h"
1414
#include "flat_placement_types.h"
15+
#include "physical_types.h"
1516
#include "vpr_context.h"
17+
#include "vpr_utils.h"
1618

1719
/**
1820
* @brief Configuration options for APPack.
@@ -33,12 +35,19 @@ struct t_appack_options {
3335
// distance on the device (from the bottom corner to the top corner).
3436
// We also use an offset for the minimum this distance can be to prevent
3537
// small devices from being unable to find candidates.
36-
float max_candidate_distance_scale = 0.5f;
37-
float max_candidate_distance_offset = 20.f;
38+
float max_candidate_distance_scale = 0.1f;
39+
float max_candidate_distance_offset = 15.0f;
3840
// Longest L1 distance on the device.
3941
float longest_distance = device_grid.width() + device_grid.height();
4042
max_candidate_distance = std::max(max_candidate_distance_scale * longest_distance,
4143
max_candidate_distance_offset);
44+
45+
// Infer the logical block type in the architecture. This will be used
46+
// for the max candidate distance optimization to use a more aggressive
47+
// distance.
48+
t_logical_block_type_ptr logic_block_type = infer_logic_block_type(device_grid);
49+
if (logic_block_type != nullptr)
50+
logic_block_type_index = logic_block_type->index;
4251
}
4352

4453
// Whether to use APPack or not.
@@ -67,11 +76,17 @@ struct t_appack_options {
6776
// Distance threshold which decides when to use quadratic decay or inverted
6877
// sqrt decay. If the distance is less than this threshold, quadratic decay
6978
// is used. Inverted sqrt is used otherwise.
70-
static constexpr float dist_th = 2.0f;
79+
static constexpr float dist_th = 1.75f;
80+
// Attenuation value at the threshold.
81+
static constexpr float attenuation_th = 0.35f;
82+
83+
// Using the distance threshold and the attenuation value at that point, we
84+
// can compute the other two terms. This is to keep the attenuation function
85+
// smooth.
7186
// Horizontal offset to the inverted sqrt decay.
72-
static constexpr float sqrt_offset = -6.1f;
73-
// Scaling factor for the quadratic decay term.
74-
static constexpr float quad_fac = 0.4f;
87+
static constexpr float sqrt_offset = dist_th - ((1.0f / attenuation_th) * (1.0f / attenuation_th));
88+
// Squared scaling factor for the quadratic decay term.
89+
static constexpr float quad_fac_sqr = (1.0f - attenuation_th) / (dist_th * dist_th);
7590

7691
// =========== Candidate selection distance ============================ //
7792
// When selecting candidates, what distance from the cluster will we
@@ -81,6 +96,14 @@ struct t_appack_options {
8196
// types of molecules / clusters. For example, CLBs vs DSPs
8297
float max_candidate_distance = std::numeric_limits<float>::max();
8398

99+
// A scaling applied to the max candidate distance of all clusters that are
100+
// not logic blocks.
101+
static constexpr float max_candidate_distance_non_lb_scale = 3.5f;
102+
103+
// TODO: This should be an option similar to the target pin utilization
104+
// so we can specify the max distance per block type!
105+
int logic_block_type_index = -1;
106+
84107
// =========== Unrelated clustering ==================================== //
85108
// After searching for candidates by connectivity and timing, the user may
86109
// turn on unrelated clustering, which will allow molecules which are
@@ -95,7 +118,7 @@ struct t_appack_options {
95118
// search within the cluster's tile. Setting this to a higher number would
96119
// allow APPack to search farther away; but may bring in molecules which
97120
// do not "want" to be in the cluster.
98-
static constexpr float max_unrelated_tile_distance = 1.0f;
121+
static constexpr float max_unrelated_tile_distance = 5.0f;
99122

100123
// Unrelated clustering occurs after all other candidate selection methods
101124
// have failed. This parameter sets how many times we will attempt unrelated
@@ -106,7 +129,7 @@ struct t_appack_options {
106129
// NOTE: A similar option exists in the candidate selector class. This was
107130
// duplicated since it is very likely that APPack would need a
108131
// different value for this option than the non-APPack flow.
109-
static constexpr int max_unrelated_clustering_attempts = 2;
132+
static constexpr int max_unrelated_clustering_attempts = 10;
110133

111134
// TODO: Investigate adding flat placement info to seed selection.
112135
};

0 commit comments

Comments
 (0)