verilog-to-routing
diff --git a/‎doc/src/api/vtrutil/index.rst
Lines changed: 1 addition & 0 deletions b/‎doc/src/api/vtrutil/index.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎doc/src/api/vtrutil/parallel.rst
Lines changed: 13 additions & 0 deletions b/‎doc/src/api/vtrutil/parallel.rst
Lines changed: 13 additions & 0 deletions
diff --git a/‎doc/src/vpr/command_line_usage.rst
Lines changed: 9 additions & 0 deletions b/‎doc/src/vpr/command_line_usage.rst
Lines changed: 9 additions & 0 deletions
diff --git a/‎libs/libvtrutil/src/vtr_thread_pool.h
Lines changed: 15 additions & 9 deletions b/‎libs/libvtrutil/src/vtr_thread_pool.h
Lines changed: 15 additions & 9 deletions
diff --git a/‎vpr/src/analytical_place/analytical_placement_flow.cpp
Lines changed: 19 additions & 0 deletions b/‎vpr/src/analytical_place/analytical_placement_flow.cpp
Lines changed: 19 additions & 0 deletions
diff --git a/‎vpr/src/analytical_place/analytical_solver.cpp
Lines changed: 57 additions & 12 deletions b/‎vpr/src/analytical_place/analytical_solver.cpp
Lines changed: 57 additions & 12 deletions
@@ -11,4 +11,5 @@ VTRUTIL API
    container_utils
    logging
    geometry
+   parallel
    other
@@ -0,0 +1,13 @@
+========
+Parallel
+========
+
+vtr_thread_pool
+---------------
+.. doxygenfile:: vtr_thread_pool.h
+   :project: vtr
+   :sections: briefdescription detaileddescription func innernamespace enum
+
+.. doxygenclass:: vtr::thread_pool
+   :project: vtr
+   :members:
@@ -1253,6 +1253,15 @@ Analytical Placement is generally split into three stages:
 
     **Default:** ``annealer``
 
+.. option:: --ap_timing_tradeoff <float>
+
+    Controls the trade-off between wirelength (HPWL) and delay minimization in the AP flow.
+
+    A value of 0.0 makes the AP flow focus completely on wirelength minimization,
+    while a value of 1.0 makes the AP flow focus completely on timing optimization.
+
+    **Default:** ``0.5``
+
 .. option:: --ap_verbosity <int>
 
     Controls the verbosity of the AP flow output.
 
@@ -26,39 +26,42 @@ namespace vtr {
  *
  * Example usage:
  *
- * vtr::thread_pool pool(4);
+ * ```
+ * vtr::thread_pool pool(4);  // 4 threads
  * pool.schedule_work([]{
  *     // Task body
  * });
- * pool.wait_for_all(); // There's no API to wait for a single task
+ * pool.wait_for_all();  // There's no API to wait for a single task
+ * ```
  */
 class thread_pool {
   private:
-    /* Thread-local data */
+    /** Thread-local data */
     struct ThreadData {
         std::thread thread;
-        /* Per-thread task queue */
+        /** Per-thread task queue */
         std::queue<std::function<void()>> task_queue;
 
-        /* Threads wait on cv for a stop signal or a new task
+        /** Threads wait on cv for a stop signal or a new task
          * queue_mutex is required for condition variable */
         std::mutex queue_mutex;
         std::condition_variable cv;
         bool stop = false;
     };
 
-    /* Container for thread-local data */
+    /** Container for thread-local data */
     std::vector<std::unique_ptr<ThreadData>> threads;
-    /* Used for round-robin scheduling */
+    /** Used for round-robin scheduling */
     std::atomic<size_t> next_thread{0};
-    /* Used for wait_for_all */
+    /** Used for wait_for_all */
     std::atomic<size_t> active_tasks{0};
 
-    /* Condition variable for wait_for_all */
+    /** Condition variable for wait_for_all */
     std::mutex completion_mutex;
     std::condition_variable completion_cv;
 
   public:
+    /** Create a thread pool with \p thread_count threads. */
     thread_pool(size_t thread_count) {
         threads.reserve(thread_count);
 
@@ -96,6 +99,7 @@ class thread_pool {
         }
     }
 
+    /** Schedule a function to be executed on one of the threads. */
     template<typename F>
     void schedule_work(F&& f) {
         active_tasks++;
@@ -133,6 +137,8 @@ class thread_pool {
         thread_data->cv.notify_one();
     }
 
+    /** Block until the number of active tasks drops to zero.
+     * Note that scheduled functions are allowed to schedule new functions. */
     void wait_for_all() {
         std::unique_lock<std::mutex> lock(completion_mutex);
         completion_cv.wait(lock, [this]() { return active_tasks == 0; });
 
@@ -7,9 +7,11 @@
 
 #include "analytical_placement_flow.h"
 #include <memory>
+#include "PreClusterTimingManager.h"
 #include "analytical_solver.h"
 #include "ap_netlist.h"
 #include "atom_netlist.h"
+#include "cluster_util.h"
 #include "detailed_placer.h"
 #include "full_legalizer.h"
 #include "gen_ap_netlist_from_atoms.h"
@@ -120,6 +122,7 @@ static PartialPlacement run_global_placer(const t_ap_opts& ap_opts,
                                           const AtomNetlist& atom_nlist,
                                           const APNetlist& ap_netlist,
                                           const Prepacker& prepacker,
+                                          const PreClusterTimingManager& pre_cluster_timing_manager,
                                           const DeviceContext& device_ctx) {
     if (g_vpr_ctx.atom().flat_placement_info().valid) {
         VTR_LOG("Flat Placement is provided in the AP flow, skipping the Global Placement.\n");
@@ -139,6 +142,8 @@ static PartialPlacement run_global_placer(const t_ap_opts& ap_opts,
                                                                          device_ctx.grid,
                                                                          device_ctx.logical_block_types,
                                                                          device_ctx.physical_tile_types,
+                                                                         pre_cluster_timing_manager,
+                                                                         ap_opts.ap_timing_tradeoff,
                                                                          ap_opts.log_verbosity);
         return global_placer->place();
     }
@@ -163,12 +168,25 @@ void run_analytical_placement_flow(t_vpr_setup& vpr_setup) {
                                                      constraints);
     print_ap_netlist_stats(ap_netlist);
 
+    // Pre-compute the pre-clustering timing delays. This object will be passed
+    // into the global placer and the full legalizer to make them timing driven.
+    PreClusterTimingManager pre_cluster_timing_manager(vpr_setup.PackerOpts.timing_driven,
+                                                       atom_nlist,
+                                                       g_vpr_ctx.atom().lookup(),
+                                                       prepacker,
+                                                       vpr_setup.PackerOpts.timing_update_type,
+                                                       *device_ctx.arch,
+                                                       vpr_setup.RoutingArch,
+                                                       vpr_setup.PackerOpts.device_layout,
+                                                       vpr_setup.AnalysisOpts);
+
     // Run the Global Placer.
     const t_ap_opts& ap_opts = vpr_setup.APOpts;
     PartialPlacement p_placement = run_global_placer(ap_opts,
                                                      atom_nlist,
                                                      ap_netlist,
                                                      prepacker,
+                                                     pre_cluster_timing_manager,
                                                      device_ctx);
 
     // Verify that the partial placement is valid before running the full
@@ -185,6 +203,7 @@ void run_analytical_placement_flow(t_vpr_setup& vpr_setup) {
                                                                         ap_netlist,
                                                                         atom_nlist,
                                                                         prepacker,
+                                                                        pre_cluster_timing_manager,
                                                                         vpr_setup,
                                                                         *device_ctx.arch,
                                                                         device_ctx.grid);
 
@@ -13,6 +13,9 @@
 #include <memory>
 #include <utility>
 #include <vector>
+#include "PreClusterTimingManager.h"
+#include "atom_netlist.h"
+#include "atom_netlist_fwd.h"
 #include "device_grid.h"
 #include "flat_placement_types.h"
 #include "partial_placement.h"
@@ -42,23 +45,39 @@
 std::unique_ptr<AnalyticalSolver> make_analytical_solver(e_ap_analytical_solver solver_type,
                                                          const APNetlist& netlist,
                                                          const DeviceGrid& device_grid,
+                                                         const AtomNetlist& atom_netlist,
+                                                         const PreClusterTimingManager& pre_cluster_timing_manager,
+                                                         float ap_timing_tradeoff,
                                                          int log_verbosity) {
     // Based on the solver type passed in, build the solver.
     switch (solver_type) {
         case e_ap_analytical_solver::QP_Hybrid:
 #ifdef EIGEN_INSTALLED
-            return std::make_unique<QPHybridSolver>(netlist, device_grid, log_verbosity);
+            return std::make_unique<QPHybridSolver>(netlist,
+                                                    device_grid,
+                                                    atom_netlist,
+                                                    pre_cluster_timing_manager,
+                                                    ap_timing_tradeoff,
+                                                    log_verbosity);
 #else
             (void)netlist;
             (void)device_grid;
+            (void)atom_netlist;
+            (void)pre_cluster_timing_manager;
+            (void)ap_timing_tradeoff;
             (void)log_verbosity;
             VPR_FATAL_ERROR(VPR_ERROR_AP,
                             "QP Hybrid Solver requires the Eigen library");
             break;
 #endif // EIGEN_INSTALLED
         case e_ap_analytical_solver::LP_B2B:
 #ifdef EIGEN_INSTALLED
-            return std::make_unique<B2BSolver>(netlist, device_grid, log_verbosity);
+            return std::make_unique<B2BSolver>(netlist,
+                                               device_grid,
+                                               atom_netlist,
+                                               pre_cluster_timing_manager,
+                                               ap_timing_tradeoff,
+                                               log_verbosity);
 #else
             VPR_FATAL_ERROR(VPR_ERROR_AP,
                             "LP B2B Solver requires the Eigen library");
@@ -72,10 +91,15 @@ std::unique_ptr<AnalyticalSolver> make_analytical_solver(e_ap_analytical_solver
     return nullptr;
 }
 
-AnalyticalSolver::AnalyticalSolver(const APNetlist& netlist, int log_verbosity)
+AnalyticalSolver::AnalyticalSolver(const APNetlist& netlist,
+                                   const AtomNetlist& atom_netlist,
+                                   const PreClusterTimingManager& pre_cluster_timing_manager,
+                                   float ap_timing_tradeoff,
+                                   int log_verbosity)
     : netlist_(netlist)
     , blk_id_to_row_id_(netlist.blocks().size(), APRowId::INVALID())
     , row_id_to_blk_id_(netlist.blocks().size(), APBlockId::INVALID())
+    , net_weights_(netlist.nets().size(), 1.0f)
     , log_verbosity_(log_verbosity) {
     // Get the number of moveable blocks in the netlist and create a unique
     // row ID from [0, num_moveable_blocks) for each moveable block in the
@@ -94,6 +118,21 @@ AnalyticalSolver::AnalyticalSolver(const APNetlist& netlist, int log_verbosity)
         current_row_id++;
         num_moveable_blocks_++;
     }
+
+    if (pre_cluster_timing_manager.is_valid()) {
+        for (APNetId net_id : netlist.nets()) {
+            // Get the atom net associated with the given AP net. When
+            // constructing the AP netlist, we happen to set the name of each
+            // AP net to the same name as the atom net that generated them!
+            // TODO: Create a proper lookup structure to go from the AP Netlist
+            //       back to the Atom Netlist.
+            AtomNetId atom_net_id = atom_netlist.find_net(netlist.net_name(net_id));
+            VTR_ASSERT(atom_net_id.is_valid());
+            float crit = pre_cluster_timing_manager.calc_net_setup_criticality(atom_net_id, atom_netlist);
+
+            net_weights_[net_id] = ap_timing_tradeoff * crit + (1.0f - ap_timing_tradeoff);
+        }
+    }
 }
 
 #ifdef EIGEN_INSTALLED
@@ -201,12 +240,15 @@ void QPHybridSolver::init_linear_system() {
     for (APNetId net_id : netlist_.nets()) {
         size_t num_pins = netlist_.net_pins(net_id).size();
         VTR_ASSERT_DEBUG(num_pins > 1);
+
+        double net_weight = net_weights_[net_id];
+
         if (num_pins > star_num_pins_threshold) {
             // Create a star node and connect each block in the net to the star
             // node.
             // Using the weight from FastPlace
             // TODO: Investigate other weight terms.
-            double w = static_cast<double>(num_pins) / static_cast<double>(num_pins - 1);
+            double w = net_weight * static_cast<double>(num_pins) / static_cast<double>(num_pins - 1);
             size_t star_node_id = num_moveable_blocks_ + star_node_offset;
             for (APPinId pin_id : netlist_.net_pins(net_id)) {
                 APBlockId blk_id = netlist_.pin_block(pin_id);
@@ -220,7 +262,7 @@ void QPHybridSolver::init_linear_system() {
             // exactly once to every other block in the net.
             // Using the weight from FastPlace
             // TODO: Investigate other weight terms.
-            double w = 1.0 / static_cast<double>(num_pins - 1);
+            double w = net_weight * 1.0 / static_cast<double>(num_pins - 1);
             for (size_t ipin_idx = 0; ipin_idx < num_pins; ipin_idx++) {
                 APPinId first_pin_id = netlist_.net_pin(net_id, ipin_idx);
                 APBlockId first_blk_id = netlist_.pin_block(first_pin_id);
@@ -638,6 +680,7 @@ static inline APNetBounds get_unique_net_bounds(APNetId net_id,
 void B2BSolver::add_connection_to_system(APBlockId first_blk_id,
                                          APBlockId second_blk_id,
                                          size_t num_pins,
+                                         double net_w,
                                          const vtr::vector<APBlockId, double>& blk_locs,
                                          std::vector<Eigen::Triplet<double>>& triplet_list,
                                          Eigen::VectorXd& b) {
@@ -660,7 +703,7 @@ void B2BSolver::add_connection_to_system(APBlockId first_blk_id,
     // The denominator of weight is zero, which causes infinity term in the matrix. Another way of
     // interpreting epsilon is the minimum distance two nodes are considered to be in placement.
     double dist = std::max(std::abs(blk_locs[first_blk_id] - blk_locs[second_blk_id]), distance_epsilon_);
-    double w = (2.0 / static_cast<double>(num_pins - 1)) * (1.0 / dist);
+    double w = net_w * (2.0 / static_cast<double>(num_pins - 1)) * (1.0 / dist);
 
     // Update the connectivity matrix and the constant vector.
     // This is similar to how connections are added for the quadratic formulation.
@@ -696,6 +739,8 @@ void B2BSolver::init_linear_system(PartialPlacement& p_placement) {
         size_t num_pins = netlist_.net_pins(net_id).size();
         VTR_ASSERT_SAFE_MSG(num_pins > 1, "net must have at least 2 pins");
 
+        double net_w = net_weights_[net_id];
+
         // Find the bounding blocks
         APNetBounds net_bounds = get_unique_net_bounds(net_id, p_placement, netlist_);
 
@@ -706,19 +751,19 @@ void B2BSolver::init_linear_system(PartialPlacement& p_placement) {
         for (APPinId pin_id : netlist_.net_pins(net_id)) {
             APBlockId blk_id = netlist_.pin_block(pin_id);
             if (blk_id != net_bounds.max_x_blk && blk_id != net_bounds.min_x_blk) {
-                add_connection_to_system(blk_id, net_bounds.max_x_blk, num_pins, p_placement.block_x_locs, triplet_list_x, b_x);
-                add_connection_to_system(blk_id, net_bounds.min_x_blk, num_pins, p_placement.block_x_locs, triplet_list_x, b_x);
+                add_connection_to_system(blk_id, net_bounds.max_x_blk, num_pins, net_w, p_placement.block_x_locs, triplet_list_x, b_x);
+                add_connection_to_system(blk_id, net_bounds.min_x_blk, num_pins, net_w, p_placement.block_x_locs, triplet_list_x, b_x);
             }
             if (blk_id != net_bounds.max_y_blk && blk_id != net_bounds.min_y_blk) {
-                add_connection_to_system(blk_id, net_bounds.max_y_blk, num_pins, p_placement.block_y_locs, triplet_list_y, b_y);
-                add_connection_to_system(blk_id, net_bounds.min_y_blk, num_pins, p_placement.block_y_locs, triplet_list_y, b_y);
+                add_connection_to_system(blk_id, net_bounds.max_y_blk, num_pins, net_w, p_placement.block_y_locs, triplet_list_y, b_y);
+                add_connection_to_system(blk_id, net_bounds.min_y_blk, num_pins, net_w, p_placement.block_y_locs, triplet_list_y, b_y);
             }
         }
 
         // Connect the bounds to each other. It's just easier to put these
         // here instead of in the for loop above.
-        add_connection_to_system(net_bounds.max_x_blk, net_bounds.min_x_blk, num_pins, p_placement.block_x_locs, triplet_list_x, b_x);
-        add_connection_to_system(net_bounds.max_y_blk, net_bounds.min_y_blk, num_pins, p_placement.block_y_locs, triplet_list_y, b_y);
+        add_connection_to_system(net_bounds.max_x_blk, net_bounds.min_x_blk, num_pins, net_w, p_placement.block_x_locs, triplet_list_x, b_x);
+        add_connection_to_system(net_bounds.max_y_blk, net_bounds.min_y_blk, num_pins, net_w, p_placement.block_y_locs, triplet_list_y, b_y);
     }
 
     // Build the sparse connectivity matrices from the triplets.