verilog-to-routing
diff --git a/‎doc/src/vpr/command_line_usage.rst
Lines changed: 1 addition & 1 deletion b/‎doc/src/vpr/command_line_usage.rst
Lines changed: 1 addition & 1 deletion
diff --git a/‎libs/EXTERNAL/libcatch2 b/‎libs/EXTERNAL/libcatch2
diff --git a/‎vpr/src/analytical_place/analytical_solver.cpp
Lines changed: 50 additions & 30 deletions b/‎vpr/src/analytical_place/analytical_solver.cpp
Lines changed: 50 additions & 30 deletions
diff --git a/‎vpr/src/analytical_place/analytical_solver.h
Lines changed: 30 additions & 5 deletions b/‎vpr/src/analytical_place/analytical_solver.h
Lines changed: 30 additions & 5 deletions
diff --git a/‎vpr/src/analytical_place/full_legalizer.cpp
Lines changed: 1 addition & 0 deletions b/‎vpr/src/analytical_place/full_legalizer.cpp
Lines changed: 1 addition & 0 deletions
diff --git a/‎vpr/src/analytical_place/global_placer.cpp
Lines changed: 15 additions & 2 deletions b/‎vpr/src/analytical_place/global_placer.cpp
Lines changed: 15 additions & 2 deletions
diff --git a/‎vpr/src/analytical_place/partial_legalizer.cpp
Lines changed: 6 additions & 1 deletion b/‎vpr/src/analytical_place/partial_legalizer.cpp
Lines changed: 6 additions & 1 deletion
diff --git a/‎vpr/src/analytical_place/partial_legalizer.h
Lines changed: 1 addition & 1 deletion b/‎vpr/src/analytical_place/partial_legalizer.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎vpr/src/base/read_options.cpp
Lines changed: 1 addition & 1 deletion b/‎vpr/src/base/read_options.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎vpr/src/base/vpr_api.cpp
Lines changed: 9 additions & 1 deletion b/‎vpr/src/base/vpr_api.cpp
Lines changed: 9 additions & 1 deletion
@@ -1214,7 +1214,7 @@ Analytical Placement is generally split into three stages:
       Uses the legalized solution as anchor-points to pull the solution to a
       more legal solution (similar to the approach from SimPL :cite:`Kim2013_SimPL`).
 
-    **Default:** ``qp-hybrid``
+    **Default:** ``lp-b2b``
 
 .. option:: --ap_partial_legalizer {bipartitioning | flow-based}
 
 
@@ -335,8 +335,8 @@ void QPHybridSolver::solve(unsigned iteration, PartialPlacement& p_placement) {
                                           p_placement, iteration);
     }
     // Verify that the constant vectors are valid.
-    VTR_ASSERT_DEBUG(!b_x_diff.hasNaN() && "b_x has NaN!");
-    VTR_ASSERT_DEBUG(!b_y_diff.hasNaN() && "b_y has NaN!");
+    VTR_ASSERT_SAFE_MSG(!b_x_diff.hasNaN(), "b_x has NaN!");
+    VTR_ASSERT_SAFE_MSG(!b_y_diff.hasNaN(), "b_y has NaN!");
 
     // Set up the ConjugateGradient Solver using the coefficient matrix.
     // TODO: can change cg.tolerance to increase performance when needed
@@ -479,8 +479,11 @@ void B2BSolver::b2b_solve_loop(unsigned iteration, PartialPlacement& p_placement
     //         p_placement.
     //      3) Repeat. Note: We need to repeat step 1 and 2 iteratively since
     //         the bounds are likely to have changed after step 2.
-    // TODO: As well as having a maximum number of bound updates, should also
-    //       investigate stopping when the HPWL converges.
+    // We stop when it looks like the placement is converging (the change in
+    // HPWL is sufficiently small for a few iterations).
+    double prev_hpwl = std::numeric_limits<double>::max();
+    double curr_hpwl = prev_hpwl;
+    unsigned num_convergence = 0;
     for (unsigned counter = 0; counter < max_num_bound_updates_; counter++) {
         VTR_LOGV(log_verbosity_ >= 10,
                  "\tPlacement HPWL in b2b loop: %f\n",
@@ -490,7 +493,7 @@ void B2BSolver::b2b_solve_loop(unsigned iteration, PartialPlacement& p_placement
         float build_linear_system_start_time = runtime_timer.elapsed_sec();
         init_linear_system(p_placement);
         if (iteration != 0)
-            update_linear_system_with_anchors(p_placement, iteration);
+            update_linear_system_with_anchors(iteration);
         total_time_spent_building_linear_system_ += runtime_timer.elapsed_sec() - build_linear_system_start_time;
         VTR_ASSERT_SAFE_MSG(!b_x.hasNaN(), "b_x has NaN!");
         VTR_ASSERT_SAFE_MSG(!b_y.hasNaN(), "b_y has NaN!");
@@ -524,22 +527,24 @@ void B2BSolver::b2b_solve_loop(unsigned iteration, PartialPlacement& p_placement
         total_time_spent_solving_linear_system_ += runtime_timer.elapsed_sec() - solve_linear_system_start_time;
 
         // Save the result into the partial placement object.
-        for (size_t row_id_idx = 0; row_id_idx < num_moveable_blocks_; row_id_idx++) {
-            // Since we are capping the number of iterations, the solver may not
-            // have enough time to converge on a solution that is on the device.
-            // We just clamp the solution to zero for now.
-            // TODO: Should handle this better. If the solution is very negative
-            //       it may indicate a bug.
-            if (x[row_id_idx] < 0.0)
-                x[row_id_idx] = 0.0;
-            if (y[row_id_idx] < 0.0)
-                y[row_id_idx] = 0.0;
-
-            APRowId row_id = APRowId(row_id_idx);
-            APBlockId blk_id = row_id_to_blk_id_[row_id];
-            p_placement.block_x_locs[blk_id] = x[row_id_idx];
-            p_placement.block_y_locs[blk_id] = y[row_id_idx];
-        }
+        store_solution_into_placement(x, y, p_placement);
+
+        // If the current HPWL is larger than the previous HPWL (i.e. the HPWL
+        // got worse since the last B2B iteration) or the gap between the two
+        // solutions is small, increment a counter.
+        // TODO: Since, in theory, the HPWL could get worse due to numerical
+        //       reasons, should we save the best result? May not be worth it...
+        curr_hpwl = p_placement.get_hpwl(netlist_);
+        double target_gap = b2b_convergence_gap_fac_ * curr_hpwl;
+        if (curr_hpwl > prev_hpwl || std::abs(curr_hpwl - prev_hpwl) < target_gap)
+            num_convergence++;
+
+        // If the HPWL got close enough times, stop. This is to allow the HPWL
+        // to "bounce", which can happen as it converges.
+        // This trades off quality for run time.
+        if (num_convergence >= target_num_b2b_convergences_)
+            break;
+        prev_hpwl = curr_hpwl;
 
         // Update the guesses with the most recent answer
         x_guess = x;
@@ -723,8 +728,7 @@ void B2BSolver::init_linear_system(PartialPlacement& p_placement) {
 
 // This function adds anchors for legalized solution. Anchors are treated as fixed node,
 // each connecting to a movable node. Number of nodes in a anchor net is always 2.
-void B2BSolver::update_linear_system_with_anchors(PartialPlacement& p_placement,
-                                                  unsigned iteration) {
+void B2BSolver::update_linear_system_with_anchors(unsigned iteration) {
     VTR_ASSERT_SAFE_MSG(iteration != 0,
                         "no fixed solution to anchor to in the first iteration");
     // Get the anchor weight based on the iteration number. We want the anchor
@@ -733,23 +737,39 @@ void B2BSolver::update_linear_system_with_anchors(PartialPlacement& p_placement,
     double coeff_pseudo_anchor = anchor_weight_mult_ * std::exp((double)iteration / anchor_weight_exp_fac_);
 
     // Add an anchor for each moveable block to its solved position.
-    // Note: We treat anchors as being a 2-pin net between a moveable block
-    //       and a fixed block where both are the bounds of the net.
     for (size_t row_id_idx = 0; row_id_idx < num_moveable_blocks_; row_id_idx++) {
         APRowId row_id = APRowId(row_id_idx);
         APBlockId blk_id = row_id_to_blk_id_[row_id];
-        double dx = std::abs(p_placement.block_x_locs[blk_id] - block_x_locs_legalized[blk_id]);
-        double dy = std::abs(p_placement.block_y_locs[blk_id] - block_y_locs_legalized[blk_id]);
-        // Anchor node are always 2 pins.
-        double pseudo_w_x = coeff_pseudo_anchor * 2.0 / std::max(dx, distance_epsilon_);
-        double pseudo_w_y = coeff_pseudo_anchor * 2.0 / std::max(dy, distance_epsilon_);
+        double pseudo_w_x = coeff_pseudo_anchor * 2.0;
+        double pseudo_w_y = coeff_pseudo_anchor * 2.0;
         A_sparse_x.coeffRef(row_id_idx, row_id_idx) += pseudo_w_x;
         A_sparse_y.coeffRef(row_id_idx, row_id_idx) += pseudo_w_y;
         b_x(row_id_idx) += pseudo_w_x * block_x_locs_legalized[blk_id];
         b_y(row_id_idx) += pseudo_w_y * block_y_locs_legalized[blk_id];
     }
 }
 
+void B2BSolver::store_solution_into_placement(Eigen::VectorXd& x_soln,
+                                              Eigen::VectorXd& y_soln,
+                                              PartialPlacement& p_placement) {
+    for (size_t row_id_idx = 0; row_id_idx < num_moveable_blocks_; row_id_idx++) {
+        // Since we are capping the number of iterations, the solver may not
+        // have enough time to converge on a solution that is on the device.
+        // For now, clamp negative coordinates to zero in place; this modifies
+        // x_soln / y_soln themselves before they are copied into p_placement.
+        // TODO: Handle this better. A very negative value may indicate a bug.
+        if (x_soln[row_id_idx] < 0.0)
+            x_soln[row_id_idx] = 0.0;
+        if (y_soln[row_id_idx] < 0.0)
+            y_soln[row_id_idx] = 0.0;
+
+        APRowId row_id = APRowId(row_id_idx);
+        APBlockId blk_id = row_id_to_blk_id_[row_id];
+        p_placement.block_x_locs[blk_id] = x_soln[row_id_idx];
+        p_placement.block_y_locs[blk_id] = y_soln[row_id_idx];
+    }
+}
+
 void B2BSolver::print_statistics() {
     VTR_LOG("B2B Solver Statistics:\n");
     VTR_LOG("\tTotal number of CG iterations: %u\n", total_num_cg_iters_);
 
@@ -359,11 +359,26 @@ class B2BSolver : public AnalyticalSolver {
     ///        than some epsilon.
     ///        Decreasing this number may lead to more instability, but can yield
     ///        a higher quality solution.
-    static constexpr double distance_epsilon_ = 0.5;
+    static constexpr double distance_epsilon_ = 0.01;
+
+    /// @brief The gap between the HPWL of the current solved solution in the
+    ///        B2B loop and the previous solved solution that is considered to
+    ///        be close-enough to be converged (as a fraction of the current
+    ///        solved solution HPWL).
+    /// Decreasing this number toward zero would cause the B2B solver to run
+    /// more iterations to try and reduce the HPWL further.
+    static constexpr double b2b_convergence_gap_fac_ = 0.001;
+
+    /// @brief The number of times the B2B loop should "converge" before stopping
+    ///        the loop. Due to numerical inaccuracies, it is possible for the
+    ///        HPWL to bounce up and down as it converges. Increasing this number
+    ///        will allow more bounces which may get better quality; however
+    ///        more iterations will need to be run.
+    static constexpr unsigned target_num_b2b_convergences_ = 2;
 
     /// @brief Max number of bound update / solve iterations. Increasing this
     ///        number will yield better quality at the expense of runtime.
-    static constexpr unsigned max_num_bound_updates_ = 6;
+    static constexpr unsigned max_num_bound_updates_ = 24;
 
     /// @brief Max number of iterations the Conjugate Gradient solver can perform.
     ///        Due to the weights getting very large in the early iterations of
@@ -376,7 +391,7 @@ class B2BSolver : public AnalyticalSolver {
     ///        to prevent this behaviour and get good runtime.
     // TODO: Need to investigate this more to find a good number for this.
     // TODO: Should this be a proportion of the design size?
-    static constexpr unsigned max_cg_iterations_ = 200;
+    static constexpr unsigned max_cg_iterations_ = 150;
 
     // The following constants are used to configure the anchor weighting.
     // The weights of anchors grow exponentially each iteration by the following
@@ -509,8 +524,18 @@ class B2BSolver : public AnalyticalSolver {
      * @brief Updates the linear system with anchor-blocks from the legalized
      *        solution.
      */
-    void update_linear_system_with_anchors(PartialPlacement& p_placement,
-                                           unsigned iteration);
+    void update_linear_system_with_anchors(unsigned iteration);
+
+    /**
+     * @brief Store the x and y solutions in Eigen's vectors into the partial
+     *        placement object.
+     *
+     * Note: The x_soln and y_soln may be modified if it is found that the
+     *       solution is impossible (i.e. has negative positions).
+     */
+    void store_solution_into_placement(Eigen::VectorXd& x_soln,
+                                       Eigen::VectorXd& y_soln,
+                                       PartialPlacement& p_placement);
 
     // The following are variables used to store the system of equations to be
     // solved in the x and y dimensions. The equations are of the form:
 
@@ -518,6 +518,7 @@ void APPack::legalize(const PartialPlacement& p_placement) {
              arch_,
              vpr_setup_.RoutingArch,
              vpr_setup_.PackerRRGraph,
+             prepacker_,
              flat_placement_info);
 
     // The Packer stores the clusters into a .net file. Load the packing file.
 
@@ -8,6 +8,7 @@
 
 #include "global_placer.h"
 #include <cstdio>
+#include <limits>
 #include <memory>
 #include <vector>
 #include "analytical_solver.h"
@@ -207,6 +208,12 @@ PartialPlacement SimPLGlobalPlacer::place() {
     float total_time_spent_in_solver = 0.0f;
     float total_time_spent_in_legalizer = 0.0f;
 
+    // Create a partial placement object to store the best placement found during
+    // global placement. It is possible for the global placement to hit a minimum
+    // in the middle of its iterations; this lets us keep that solution.
+    PartialPlacement best_p_placement(ap_netlist_);
+    double best_ub_hpwl = std::numeric_limits<double>::max();
+
     // Run the global placer.
     for (size_t i = 0; i < max_num_iterations_; i++) {
         float iter_start_time = runtime_timer.elapsed_sec();
@@ -235,6 +242,12 @@ PartialPlacement SimPLGlobalPlacer::place() {
                                iter_end_time - iter_start_time);
         }
 
+        // If this placement is better than the best we have seen, save it.
+        if (ub_hpwl < best_ub_hpwl) {
+            best_ub_hpwl = ub_hpwl;
+            best_p_placement = p_placement;
+        }
+
         // Exit condition: If the upper-bound and lower-bound HPWLs are
         // sufficiently close together then stop.
         double hpwl_relative_gap = (ub_hpwl - lb_hpwl) / ub_hpwl;
@@ -254,12 +267,12 @@ PartialPlacement SimPLGlobalPlacer::place() {
 
     // Print some statistics on the final placement.
     VTR_LOG("Placement after Global Placement:\n");
-    print_placement_stats(p_placement,
+    print_placement_stats(best_p_placement,
                           ap_netlist_,
                           *density_manager_);
 
     // Return the placement from the final iteration.
     // TODO: investigate saving the best solution found so far. It should be
     //       cheap to save a copy of the PartialPlacement object.
-    return p_placement;
+    return best_p_placement;
 }
@@ -1490,12 +1490,17 @@ void BiPartitioningPartialLegalizer::partition_blocks_in_window(
     // windows. To do this we sort the unplaced blocks by largest mass to
     // smallest mass. Then we place each block in the bin with the highest
     // underfill.
+    // FIXME: Above was the intuition; however, after experimentation, found that
+    //        sorting by smallest mass to largest mass worked better...
+    // FIXME: I think large blocks (like carry chains) need special handling
+    //        early on. If they are put into a partition too late, they may have
+    //        to create overfill! Perhaps the partitions can hold two lists.
     std::sort(unplaced_blocks.begin(),
               unplaced_blocks.end(),
               [&](APBlockId a, APBlockId b) {
                   const auto& blk_a_mass = density_manager_->mass_calculator().get_block_mass(a);
                   const auto& blk_b_mass = density_manager_->mass_calculator().get_block_mass(b);
-                  return blk_a_mass.manhattan_norm() > blk_b_mass.manhattan_norm();
+                  return blk_a_mass.manhattan_norm() < blk_b_mass.manhattan_norm();
               });
     for (APBlockId blk_id : unplaced_blocks) {
         // Project the underfill from each window onto the mass. This gives us
 
@@ -367,7 +367,7 @@ class BiPartitioningPartialLegalizer : public PartialLegalizer {
     /// create large windows; decreasing this number will put more pressure on
     /// the window generation code, which can increase window size and runtime.
     /// TODO: Should this be distance instead of number of bins?
-    static constexpr int max_bin_cluster_gap_ = 1;
+    static constexpr int max_bin_cluster_gap_ = 2;
 
   public:
     /**
 
@@ -1907,7 +1907,7 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
             "Controls which Analytical Solver the Global Placer will use in the AP Flow.\n"
             " * qp-hybrid: olves for a placement that minimizes the quadratic HPWL of the flat placement using a hybrid clique/star net model.\n"
             " * lp-b2b: Solves for a placement that minimizes the linear HPWL of theflat placement using the Bound2Bound net model.")
-        .default_value("qp-hybrid")
+        .default_value("lp-b2b")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
     ap_grp.add_argument<e_ap_partial_legalizer, ParseAPPartialLegalizer>(args.ap_partial_legalizer, "--ap_partial_legalizer")
 
@@ -614,9 +614,17 @@ bool vpr_pack(t_vpr_setup& vpr_setup, const t_arch& arch) {
                                                                                      g_vpr_ctx.atom().netlist());
     }
 
+    // Run the prepacker, packing the atoms into molecules.
+    // The Prepacker object performs prepacking and stores the pack molecules.
+    // As long as the molecules are used, this object must persist.
+    const Prepacker prepacker(g_vpr_ctx.atom().netlist(),
+                              g_vpr_ctx.device().logical_block_types);
+
     return try_pack(&vpr_setup.PackerOpts, &vpr_setup.AnalysisOpts,
                     arch, vpr_setup.RoutingArch,
-                    vpr_setup.PackerRRGraph, g_vpr_ctx.atom().flat_placement_info());
+                    vpr_setup.PackerRRGraph,
+                    prepacker,
+                    g_vpr_ctx.atom().flat_placement_info());
 }
 
 void vpr_load_packing(const t_vpr_setup& vpr_setup, const t_arch& arch) {