verilog-to-routing · amin1377 · Apr 3, 2025 · Mar 22, 2025
diff --git a/doc/src/vpr/command_line_usage.rst b/doc/src/vpr/command_line_usage.rst
@@ -1214,7 +1214,7 @@ Analytical Placement is generally split into three stages:
       Uses the legalized solution as anchor-points to pull the solution to a
       more legal solution (similar to the approach from SimPL :cite:`Kim2013_SimPL`).
 
-    **Default:** ``qp-hybrid``
+    **Default:** ``lp-b2b``
 
 .. option:: --ap_partial_legalizer {bipartitioning | flow-based}
 

diff --git a/vpr/src/analytical_place/analytical_solver.cpp b/vpr/src/analytical_place/analytical_solver.cpp
@@ -335,8 +335,8 @@ void QPHybridSolver::solve(unsigned iteration, PartialPlacement& p_placement) {
                                           p_placement, iteration);
     }
     // Verify that the constant vectors are valid.
-    VTR_ASSERT_DEBUG(!b_x_diff.hasNaN() && "b_x has NaN!");
-    VTR_ASSERT_DEBUG(!b_y_diff.hasNaN() && "b_y has NaN!");
+    VTR_ASSERT_SAFE_MSG(!b_x_diff.hasNaN(), "b_x has NaN!");
+    VTR_ASSERT_SAFE_MSG(!b_y_diff.hasNaN(), "b_y has NaN!");
 
     // Set up the ConjugateGradient Solver using the coefficient matrix.
     // TODO: can change cg.tolerance to increase performance when needed
@@ -479,8 +479,11 @@ void B2BSolver::b2b_solve_loop(unsigned iteration, PartialPlacement& p_placement
     //         p_placement.
     //      3) Repeat. Note: We need to repeat step 1 and 2 iteratively since
     //         the bounds are likely to have changed after step 2.
-    // TODO: As well as having a maximum number of bound updates, should also
-    //       investigate stopping when the HPWL converges.
+    // We stop when it looks like the placement is converging (the change in
+    // HPWL is sufficiently small for a few iterations).
+    double prev_hpwl = std::numeric_limits<double>::max();
+    double curr_hpwl = prev_hpwl;
+    unsigned num_convergence = 0;
     for (unsigned counter = 0; counter < max_num_bound_updates_; counter++) {
         VTR_LOGV(log_verbosity_ >= 10,
                  "\tPlacement HPWL in b2b loop: %f\n",
@@ -490,7 +493,7 @@ void B2BSolver::b2b_solve_loop(unsigned iteration, PartialPlacement& p_placement
         float build_linear_system_start_time = runtime_timer.elapsed_sec();
         init_linear_system(p_placement);
         if (iteration != 0)
-            update_linear_system_with_anchors(p_placement, iteration);
+            update_linear_system_with_anchors(iteration);
         total_time_spent_building_linear_system_ += runtime_timer.elapsed_sec() - build_linear_system_start_time;
         VTR_ASSERT_SAFE_MSG(!b_x.hasNaN(), "b_x has NaN!");
         VTR_ASSERT_SAFE_MSG(!b_y.hasNaN(), "b_y has NaN!");
@@ -524,22 +527,24 @@ void B2BSolver::b2b_solve_loop(unsigned iteration, PartialPlacement& p_placement
         total_time_spent_solving_linear_system_ += runtime_timer.elapsed_sec() - solve_linear_system_start_time;
 
         // Save the result into the partial placement object.
-        for (size_t row_id_idx = 0; row_id_idx < num_moveable_blocks_; row_id_idx++) {
-            // Since we are capping the number of iterations, the solver may not
-            // have enough time to converge on a solution that is on the device.
-            // We just clamp the solution to zero for now.
-            // TODO: Should handle this better. If the solution is very negative
-            //       it may indicate a bug.
-            if (x[row_id_idx] < 0.0)
-                x[row_id_idx] = 0.0;
-            if (y[row_id_idx] < 0.0)
-                y[row_id_idx] = 0.0;
-
-            APRowId row_id = APRowId(row_id_idx);
-            APBlockId blk_id = row_id_to_blk_id_[row_id];
-            p_placement.block_x_locs[blk_id] = x[row_id_idx];
-            p_placement.block_y_locs[blk_id] = y[row_id_idx];
-        }
+        store_solution_into_placement(x, y, p_placement);
+
+        // If the current HPWL is larger than the previous HPWL (i.e. the HPWL
+        // got worse since the last B2B iteration) or the gap between the two
+        // solutions is small, increment a counter.
+        // TODO: Since, in theory, the HPWL could get worse due to numerical
+        //       reasons, should we save the best result? May not be worth it...
+        curr_hpwl = p_placement.get_hpwl(netlist_);
+        double target_gap = b2b_convergence_gap_fac_ * curr_hpwl;
+        if (curr_hpwl > prev_hpwl || std::abs(curr_hpwl - prev_hpwl) < target_gap)
+            num_convergence++;
+
+        // If the HPWL got close enough times, stop. This is to allow the HPWL
+        // to "bounce", which can happen as it converges.
+        // This trades off quality for run time.
+        if (num_convergence >= target_num_b2b_convergences_)
+            break;
+        prev_hpwl = curr_hpwl;
 
         // Update the guesses with the most recent answer
         x_guess = x;
@@ -723,8 +728,7 @@ void B2BSolver::init_linear_system(PartialPlacement& p_placement) {
 
 // This function adds anchors for legalized solution. Anchors are treated as fixed node,
 // each connecting to a movable node. Number of nodes in a anchor net is always 2.
-void B2BSolver::update_linear_system_with_anchors(PartialPlacement& p_placement,
-                                                  unsigned iteration) {
+void B2BSolver::update_linear_system_with_anchors(unsigned iteration) {
     VTR_ASSERT_SAFE_MSG(iteration != 0,
                         "no fixed solution to anchor to in the first iteration");
     // Get the anchor weight based on the iteration number. We want the anchor
@@ -733,23 +737,39 @@ void B2BSolver::update_linear_system_with_anchors(PartialPlacement& p_placement,
     double coeff_pseudo_anchor = anchor_weight_mult_ * std::exp((double)iteration / anchor_weight_exp_fac_);
 
     // Add an anchor for each moveable block to its solved position.
-    // Note: We treat anchors as being a 2-pin net between a moveable block
-    //       and a fixed block where both are the bounds of the net.
     for (size_t row_id_idx = 0; row_id_idx < num_moveable_blocks_; row_id_idx++) {
         APRowId row_id = APRowId(row_id_idx);
         APBlockId blk_id = row_id_to_blk_id_[row_id];
-        double dx = std::abs(p_placement.block_x_locs[blk_id] - block_x_locs_legalized[blk_id]);
-        double dy = std::abs(p_placement.block_y_locs[blk_id] - block_y_locs_legalized[blk_id]);
-        // Anchor node are always 2 pins.
-        double pseudo_w_x = coeff_pseudo_anchor * 2.0 / std::max(dx, distance_epsilon_);
-        double pseudo_w_y = coeff_pseudo_anchor * 2.0 / std::max(dy, distance_epsilon_);
+        double pseudo_w_x = coeff_pseudo_anchor * 2.0;
+        double pseudo_w_y = coeff_pseudo_anchor * 2.0;
         A_sparse_x.coeffRef(row_id_idx, row_id_idx) += pseudo_w_x;
         A_sparse_y.coeffRef(row_id_idx, row_id_idx) += pseudo_w_y;
         b_x(row_id_idx) += pseudo_w_x * block_x_locs_legalized[blk_id];
         b_y(row_id_idx) += pseudo_w_y * block_y_locs_legalized[blk_id];
     }
 }
 
+void B2BSolver::store_solution_into_placement(Eigen::VectorXd& x_soln,
+                                              Eigen::VectorXd& y_soln,
+                                              PartialPlacement& p_placement) {
+    for (size_t row_id_idx = 0; row_id_idx < num_moveable_blocks_; row_id_idx++) {
+        // The number of solver iterations is capped, so the solve may stop
+        // before converging on a solution that is on the device. Negative
+        // coordinates are clamped to zero in place (x_soln / y_soln change).
+        // TODO: Should handle this better. If the solution is very negative
+        //       it may indicate a bug.
+        if (x_soln[row_id_idx] < 0.0)
+            x_soln[row_id_idx] = 0.0;
+        if (y_soln[row_id_idx] < 0.0)
+            y_soln[row_id_idx] = 0.0;
+
+        APRowId row_id = APRowId(row_id_idx);
+        APBlockId blk_id = row_id_to_blk_id_[row_id];
+        p_placement.block_x_locs[blk_id] = x_soln[row_id_idx];
+        p_placement.block_y_locs[blk_id] = y_soln[row_id_idx];
+    }
+}
+
 void B2BSolver::print_statistics() {
     VTR_LOG("B2B Solver Statistics:\n");
     VTR_LOG("\tTotal number of CG iterations: %u\n", total_num_cg_iters_);

diff --git a/vpr/src/analytical_place/analytical_solver.h b/vpr/src/analytical_place/analytical_solver.h
@@ -359,11 +359,26 @@ class B2BSolver : public AnalyticalSolver {
     ///        than some epsilon.
     ///        Decreasing this number may lead to more instability, but can yield
     ///        a higher quality solution.
-    static constexpr double distance_epsilon_ = 0.5;
+    static constexpr double distance_epsilon_ = 0.01;
+
+    /// @brief The gap between the HPWL of the current solved solution in the
+    ///        B2B loop and the previous solved solution that is considered to
+    ///        be close-enough to be converged (as a fraction of the current
+    ///        solved solution HPWL).
+    /// Decreasing this number toward zero would cause the B2B solver to run
+    /// more iterations to try and reduce the HPWL further.
+    static constexpr double b2b_convergence_gap_fac_ = 0.001;
+
+    /// @brief The number of times the B2B loop should "converge" before stopping
+    ///        the loop. Due to numerical inaccuracies, it is possible for the
+    ///        HPWL to bounce up and down as it converges. Increasing this number
+    ///        will allow more bounces which may get better quality; however
+    ///        more iterations will need to be run.
+    static constexpr unsigned target_num_b2b_convergences_ = 2;
 
     /// @brief Max number of bound update / solve iterations. Increasing this
     ///        number will yield better quality at the expense of runtime.
-    static constexpr unsigned max_num_bound_updates_ = 6;
+    static constexpr unsigned max_num_bound_updates_ = 24;
 
     /// @brief Max number of iterations the Conjugate Gradient solver can perform.
     ///        Due to the weights getting very large in the early iterations of
@@ -376,7 +391,7 @@ class B2BSolver : public AnalyticalSolver {
     ///        to prevent this behaviour and get good runtime.
     // TODO: Need to investigate this more to find a good number for this.
     // TODO: Should this be a proportion of the design size?
-    static constexpr unsigned max_cg_iterations_ = 200;
+    static constexpr unsigned max_cg_iterations_ = 150;
 
     // The following constants are used to configure the anchor weighting.
     // The weights of anchors grow exponentially each iteration by the following
@@ -509,8 +524,18 @@ class B2BSolver : public AnalyticalSolver {
      * @brief Updates the linear system with anchor-blocks from the legalized
      *        solution.
      */
-    void update_linear_system_with_anchors(PartialPlacement& p_placement,
-                                           unsigned iteration);
+    void update_linear_system_with_anchors(unsigned iteration);
+
+    /**
+     * @brief Store the x and y solutions in Eigen's vectors into the partial
+     *        placement object.
+     *
+     * Note: The x_soln and y_soln may be modified if it is found that the
+     *       solution is impossible (i.e. has negative positions).
+     */
+    void store_solution_into_placement(Eigen::VectorXd& x_soln,
+                                       Eigen::VectorXd& y_soln,
+                                       PartialPlacement& p_placement);
 
     // The following are variables used to store the system of equations to be
     // solved in the x and y dimensions. The equations are of the form:

diff --git a/vpr/src/analytical_place/global_placer.cpp b/vpr/src/analytical_place/global_placer.cpp
@@ -8,6 +8,7 @@
 
 #include "global_placer.h"
 #include <cstdio>
+#include <limits>
 #include <memory>
 #include <vector>
 #include "analytical_solver.h"
@@ -207,6 +208,12 @@ PartialPlacement SimPLGlobalPlacer::place() {
     float total_time_spent_in_solver = 0.0f;
     float total_time_spent_in_legalizer = 0.0f;
 
+    // Create a partial placement object to store the best placement found during
+    // global placement. It is possible for the global placement to hit a minimum
+    // in the middle of its iterations, this lets us keep that solution.
+    PartialPlacement best_p_placement(ap_netlist_);
+    double best_ub_hpwl = std::numeric_limits<double>::max();
+
     // Run the global placer.
     for (size_t i = 0; i < max_num_iterations_; i++) {
         float iter_start_time = runtime_timer.elapsed_sec();
@@ -235,6 +242,12 @@ PartialPlacement SimPLGlobalPlacer::place() {
                                iter_end_time - iter_start_time);
         }
 
+        // If this placement is better than the best we have seen, save it.
+        if (ub_hpwl < best_ub_hpwl) {
+            best_ub_hpwl = ub_hpwl;
+            best_p_placement = p_placement;
+        }
+
         // Exit condition: If the upper-bound and lower-bound HPWLs are
         // sufficiently close together then stop.
         double hpwl_relative_gap = (ub_hpwl - lb_hpwl) / ub_hpwl;
@@ -254,12 +267,12 @@ PartialPlacement SimPLGlobalPlacer::place() {
 
     // Print some statistics on the final placement.
     VTR_LOG("Placement after Global Placement:\n");
-    print_placement_stats(p_placement,
+    print_placement_stats(best_p_placement,
                           ap_netlist_,
                           *density_manager_);
 
     // Return the placement from the final iteration.
     // TODO: investigate saving the best solution found so far. It should be
     //       cheap to save a copy of the PartialPlacement object.
-    return p_placement;
+    return best_p_placement;
 }
diff --git a/vpr/src/analytical_place/partial_legalizer.cpp b/vpr/src/analytical_place/partial_legalizer.cpp
@@ -1490,12 +1490,17 @@ void BiPartitioningPartialLegalizer::partition_blocks_in_window(
     // windows. To do this we sort the unplaced blocks by largest mass to
     // smallest mass. Then we place each block in the bin with the highest
     // underfill.
+    // FIXME: Above was the intuition; however, after experimentation, found that
+    //        sorting by smallest mass to largest mass worked better...
+    // FIXME: I think large blocks (like carry chains) need to be handled specially
+    //        early on. If they are put into a partition too late, they may have
+    //        to create overfill! Perhaps the partitions can hold two lists.
     std::sort(unplaced_blocks.begin(),
               unplaced_blocks.end(),
               [&](APBlockId a, APBlockId b) {
                   const auto& blk_a_mass = density_manager_->mass_calculator().get_block_mass(a);
                   const auto& blk_b_mass = density_manager_->mass_calculator().get_block_mass(b);
-                  return blk_a_mass.manhattan_norm() > blk_b_mass.manhattan_norm();
+                  return blk_a_mass.manhattan_norm() < blk_b_mass.manhattan_norm();
               });
     for (APBlockId blk_id : unplaced_blocks) {
         // Project the underfill from each window onto the mass. This gives us

diff --git a/vpr/src/analytical_place/partial_legalizer.h b/vpr/src/analytical_place/partial_legalizer.h
@@ -367,7 +367,7 @@ class BiPartitioningPartialLegalizer : public PartialLegalizer {
     /// create large windows; decreasing this number will put more pressure on
     /// the window generation code, which can increase window size and runtime.
     /// TODO: Should this be distance instead of number of bins?
-    static constexpr int max_bin_cluster_gap_ = 1;
+    static constexpr int max_bin_cluster_gap_ = 2;
 
   public:
     /**

diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
@@ -1903,7 +1903,7 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
             "Controls which Analytical Solver the Global Placer will use in the AP Flow.\n"
             " * qp-hybrid: olves for a placement that minimizes the quadratic HPWL of the flat placement using a hybrid clique/star net model.\n"
             " * lp-b2b: Solves for a placement that minimizes the linear HPWL of theflat placement using the Bound2Bound net model.")
-        .default_value("qp-hybrid")
+        .default_value("lp-b2b")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
     ap_grp.add_argument<e_ap_partial_legalizer, ParseAPPartialLegalizer>(args.ap_partial_legalizer, "--ap_partial_legalizer")

diff --git a/vpr/src/pack/appack_context.h b/vpr/src/pack/appack_context.h
@@ -12,7 +12,9 @@
 #include <limits>
 #include "device_grid.h"
 #include "flat_placement_types.h"
+#include "physical_types.h"
 #include "vpr_context.h"
+#include "vpr_utils.h"
 
 /**
  * @brief Configuration options for APPack.
@@ -33,12 +35,19 @@ struct t_appack_options {
         // distance on the device (from the bottom corner to the top corner).
         // We also use an offset for the minimum this distance can be to prevent
         // small devices from finding candidates.
-        float max_candidate_distance_scale = 0.5f;
-        float max_candidate_distance_offset = 20.f;
+        float max_candidate_distance_scale = 0.1f;
+        float max_candidate_distance_offset = 15.0f;
         // Longest L1 distance on the device.
         float longest_distance = device_grid.width() + device_grid.height();
         max_candidate_distance = std::max(max_candidate_distance_scale * longest_distance,
                                           max_candidate_distance_offset);
+
+        // Infer the logical block type in the architecture. This will be used
+        // for the max candidate distance optimization to use a more aggressive
+        // distance.
+        t_logical_block_type_ptr logic_block_type = infer_logic_block_type(device_grid);
+        if (logic_block_type != nullptr)
+            logic_block_type_index = logic_block_type->index;
     }
 
     // Whether to use APPack or not.
@@ -67,11 +76,17 @@ struct t_appack_options {
     // Distance threshold which decides when to use quadratic decay or inverted
     // sqrt decay. If the distance is less than this threshold, quadratic decay
     // is used. Inverted sqrt is used otherwise.
-    static constexpr float dist_th = 2.0f;
+    static constexpr float dist_th = 1.75f;
+    // Attenuation value at the threshold.
+    static constexpr float attenuation_th = 0.35f;
+
+    // Using the distance threshold and the attenuation value at that point, we
+    // can compute the other two terms. This is to keep the attenuation function
+    // smooth.
     // Horizontal offset to the inverted sqrt decay.
-    static constexpr float sqrt_offset = -6.1f;
-    // Scaling factor for the quadratic decay term.
-    static constexpr float quad_fac = 0.4f;
+    static constexpr float sqrt_offset = dist_th - ((1.0f / attenuation_th) * (1.0f / attenuation_th));
+    // Squared scaling factor for the quadratic decay term.
+    static constexpr float quad_fac_sqr = (1.0f - attenuation_th) / (dist_th * dist_th);
 
     // =========== Candidate selection distance ============================ //
     // When selecting candidates, what distance from the cluster will we
@@ -81,6 +96,14 @@ struct t_appack_options {
     //       types of molecules / clusters. For example, CLBs vs DSPs
     float max_candidate_distance = std::numeric_limits<float>::max();
 
+    // A scaling applied to the max candidate distance of all clusters that are
+    // not logic blocks.
+    static constexpr float max_candidate_distance_non_lb_scale = 3.5f;
+
+    // TODO: This should be an option similar to the target pin utilization
+    //       so we can specify the max distance per block type!
+    int logic_block_type_index = -1;
+
     // =========== Unrelated clustering ==================================== //
     // After searching for candidates by connectivity and timing, the user may
     // turn on unrelated clustering, which will allow molecules which are
@@ -95,7 +118,7 @@ struct t_appack_options {
     // search within the cluster's tile. Setting this to a higher number would
     // allow APPack to search farther away; but may bring in molecules which
     // do not "want" to be in the cluster.
-    static constexpr float max_unrelated_tile_distance = 1.0f;
+    static constexpr float max_unrelated_tile_distance = 5.0f;
 
     // Unrelated clustering occurs after all other candidate selection methods
     // have failed. This parameter sets how many time we will attempt unrelated
@@ -106,7 +129,7 @@ struct t_appack_options {
     // NOTE: A similar option exists in the candidate selector class. This was
     //       duplicated since it is very likely that APPack would need a
     //       different value for this option than the non-APPack flow.
-    static constexpr int max_unrelated_clustering_attempts = 2;
+    static constexpr int max_unrelated_clustering_attempts = 10;
 
     // TODO: Investigate adding flat placement info to seed selection.
 };