AlexandreSinger · AlexandreSinger · Jul 13, 2024 · Jul 2, 2024 · Jul 2, 2024 · Jul 4, 2024
diff --git a/libs/EXTERNAL/CPS b/libs/EXTERNAL/CPS
diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp
@@ -410,9 +410,14 @@ static void SetupRoutingArch(const t_arch& Arch,
 static void SetupRouterOpts(const t_options& Options, t_router_opts* RouterOpts) {
     RouterOpts->do_check_rr_graph = Options.check_rr_graph;
     RouterOpts->astar_fac = Options.astar_fac;
+    RouterOpts->astar_offset = Options.astar_offset;
     RouterOpts->router_profiler_astar_fac = Options.router_profiler_astar_fac;
     RouterOpts->post_target_prune_fac = Options.post_target_prune_fac;
     RouterOpts->post_target_prune_offset = Options.post_target_prune_offset;
+    RouterOpts->multi_queue_num_threads = Options.multi_queue_num_threads;
+    RouterOpts->multi_queue_num_queues = Options.multi_queue_num_queues;
+    RouterOpts->multi_queue_direct_draining = Options.multi_queue_direct_draining;
+    RouterOpts->thread_affinity = Options.thread_affinity;
     RouterOpts->bb_factor = Options.bb_factor;
     RouterOpts->criticality_exp = Options.criticality_exp;
     RouterOpts->max_criticality = Options.max_criticality;

diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp
@@ -257,6 +257,12 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
         VTR_LOG("false\n");
     }
 
+    auto transform_thread_affinity_list_to_str = [](const std::vector<int>& aff) {
+        std::string str = aff.size() ? std::to_string(aff.front()) : "off";
+        for (size_t i = 1; i < aff.size(); str += ',' + std::to_string(aff[i++])) ;
+        return str;
+    };
+
     if (DETAILED == RouterOpts.route_type) {
         VTR_LOG("RouterOpts.router_algorithm: ");
         switch (RouterOpts.router_algorithm) {
@@ -338,7 +344,14 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
 
         if (TIMING_DRIVEN == RouterOpts.router_algorithm) {
             VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac);
+            VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset);
             VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac);
+            VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac);
+            VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset);
+            VTR_LOG("RouterOpts.multi_queue_num_threads: %d\n", RouterOpts.multi_queue_num_threads);
+            VTR_LOG("RouterOpts.multi_queue_num_queues: %d\n", RouterOpts.multi_queue_num_queues);
+            VTR_LOG("RouterOpts.multi_queue_direct_draining: %s\n", RouterOpts.multi_queue_direct_draining ? "true" : "false");
+            VTR_LOG("RouterOpts.thread_affinity: %s\n", transform_thread_affinity_list_to_str(RouterOpts.thread_affinity).c_str());
             VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp);
             VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality);
             VTR_LOG("RouterOpts.init_wirelength_abort_threshold: %f\n", RouterOpts.init_wirelength_abort_threshold);
@@ -482,9 +495,14 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
         VTR_LOG("RouterOpts.exit_after_first_routing_iteration: %s\n", RouterOpts.exit_after_first_routing_iteration ? "true" : "false");
         if (TIMING_DRIVEN == RouterOpts.router_algorithm) {
             VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac);
+            VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset);
             VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac);
             VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac);
             VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset);
+            VTR_LOG("RouterOpts.multi_queue_num_threads: %d\n", RouterOpts.multi_queue_num_threads);
+            VTR_LOG("RouterOpts.multi_queue_num_queues: %d\n", RouterOpts.multi_queue_num_queues);
+            VTR_LOG("RouterOpts.multi_queue_direct_draining: %s\n", RouterOpts.multi_queue_direct_draining ? "true" : "false");
+            VTR_LOG("RouterOpts.thread_affinity: %s\n", transform_thread_affinity_list_to_str(RouterOpts.thread_affinity).c_str());
             VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp);
             VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality);
             VTR_LOG("RouterOpts.init_wirelength_abort_threshold: %f\n", RouterOpts.init_wirelength_abort_threshold);

diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
@@ -1259,6 +1259,67 @@ struct ParsePostSynthNetlistUnconnOutputHandling {
     }
 };
 
+struct ParseTheadAffinityList {
+    inline std::vector<std::string> get_tokens_split_by_delimiter(const std::string& str, char delimiter) {
+        std::vector<std::string> tokens;
+        std::string acc = "";
+        for(const auto &x : str) {
+            if (x == delimiter) {
+                tokens.push_back(acc);
+                acc = "";
+            } else {
+                acc += x;
+            }
+        }
+        tokens.push_back(acc);
+        return tokens;
+    }
+
+    // Parse thread/core affinity list (i.e., pin threads to specific cores).
+    // Formats such as `0,1,2,3,4,5,6,7` and `0-7` and `0-3,4-7` and `0,1-2,3-6,7`
+    // are all supported.
+    inline std::vector<int> parse_thread_affinity_list(const std::string& str) {
+        std::vector<int> thread_affinity_list;
+        std::vector<std::string> lv1_tokens_split_by_comma = get_tokens_split_by_delimiter(str, ',');
+        for (const auto &l1_token : lv1_tokens_split_by_comma) {
+            std::vector<std::string> lv2_tokens_split_by_dash = get_tokens_split_by_delimiter(l1_token, '-');
+            size_t num_lv2_tokens = lv2_tokens_split_by_dash.size();
+            VTR_ASSERT(num_lv2_tokens == 1 || num_lv2_tokens == 2);
+            if (num_lv2_tokens == 2) {
+                int start_core_id = std::stoi(lv2_tokens_split_by_dash[0]);
+                int end_core_id = std::stoi(lv2_tokens_split_by_dash[1]);
+                for (int i = start_core_id; i <= end_core_id; ++i) {
+                    thread_affinity_list.push_back(i);
+                }
+            } else {
+                thread_affinity_list.push_back(std::stoi(lv2_tokens_split_by_dash[0]));
+            }
+        }
+        return thread_affinity_list;
+    }
+
+    ConvertedValue<std::vector<int>> from_str(const std::string& str) {
+        ConvertedValue<std::vector<int>> conv_value;
+        VTR_ASSERT(str.size() > 0);
+        if (str == "off") {
+            conv_value.set_value({});
+        } else {
+            conv_value.set_value(parse_thread_affinity_list(str));
+        }
+        return conv_value;
+    }
+
+    ConvertedValue<std::string> to_str(std::vector<int> val) {
+        ConvertedValue<std::string> conv_value;
+        std::string str = val.size() ? std::to_string(val.front()) : "off";
+        for (size_t i = 1; i < val.size(); str += ',' + std::to_string(val[i++])) ;
+        conv_value.set_value(str);
+        return conv_value;
+    }
+
+    std::vector<std::string> default_choices() { return {}; }
+};
+
 argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& args) {
     std::string description =
         "Implements the specified circuit onto the target FPGA architecture"
@@ -2477,6 +2538,14 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
         .default_value("1.2")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
+    route_timing_grp.add_argument(args.astar_offset, "--astar_offset")
+        .help(
+            "Controls the directedness of the timing-driven router's exploration."
+            " It is a subtractive adjustment to the lookahead heuristic."
+            " Values between 0 and 1e-9 are resonable; higher values may increase quality at the expense of run-time.")
+        .default_value("0.0")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
     route_timing_grp.add_argument(args.router_profiler_astar_fac, "--router_profiler_astar_fac")
         .help(
             "Controls the directedness of the timing-driven router's exploration"
@@ -2496,6 +2565,26 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
         .default_value("0.0")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
+    route_timing_grp.add_argument<int>(args.multi_queue_num_threads, "--multi_queue_num_threads")
+        .help("TODO")
+        .default_value("1")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    route_timing_grp.add_argument<int>(args.multi_queue_num_queues, "--multi_queue_num_queues")
+        .help("TODO")
+        .default_value("2")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    route_timing_grp.add_argument<bool, ParseOnOff>(args.multi_queue_direct_draining, "--multi_queue_direct_draining")
+        .help("TODO")
+        .default_value("off")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    route_timing_grp.add_argument<std::vector<int>, ParseTheadAffinityList>(args.thread_affinity, "--thread_affinity")
+        .help("TODO")
+        .default_value("off")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
     route_timing_grp.add_argument(args.max_criticality, "--max_criticality")
         .help(
             "Sets the maximum fraction of routing cost derived from delay (vs routability) for any net."

diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h
@@ -205,9 +205,14 @@ struct t_options {
 
     /* Timing-driven router options only */
     argparse::ArgValue<float> astar_fac;
+    argparse::ArgValue<float> astar_offset;
     argparse::ArgValue<float> router_profiler_astar_fac;
     argparse::ArgValue<float> post_target_prune_fac;
     argparse::ArgValue<float> post_target_prune_offset;
+    argparse::ArgValue<int> multi_queue_num_threads;
+    argparse::ArgValue<int> multi_queue_num_queues;
+    argparse::ArgValue<bool> multi_queue_direct_draining;
+    argparse::ArgValue<std::vector<int>> thread_affinity;
     argparse::ArgValue<float> max_criticality;
     argparse::ArgValue<float> criticality_exp;
     argparse::ArgValue<float> router_init_wirelength_abort_threshold;

diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h
@@ -1332,6 +1332,8 @@ struct t_placer_opts {
  *             an essentially breadth-first search, astar_fac = 1 is near   *
  *             the usual astar algorithm and astar_fac > 1 are more         *
  *             aggressive.                                                  *
+ * astar_offset: Offset that is subtracted from the lookahead (expected     *
+ *               future costs) in the timing-driven router.                 *
  * max_criticality: The maximum criticality factor (from 0 to 1) any sink   *
  *                  will ever have (i.e. clip criticality to this number).  *
  * criticality_exp: Set criticality to (path_length(sink) / longest_path) ^ *
@@ -1419,9 +1421,14 @@ struct t_router_opts {
     enum e_router_algorithm router_algorithm;
     enum e_base_cost_type base_cost_type;
     float astar_fac;
+    float astar_offset;
     float router_profiler_astar_fac;
     float post_target_prune_fac;
     float post_target_prune_offset;
+    int multi_queue_num_threads;
+    int multi_queue_num_queues;
+    bool multi_queue_direct_draining;
+    std::vector<int> thread_affinity;
     float max_criticality;
     float criticality_exp;
     float init_wirelength_abort_threshold;

diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp
@@ -1187,7 +1187,8 @@ void OverrideDelayModel::compute_override_delay_model(
     RouterDelayProfiler& route_profiler,
     const t_router_opts& router_opts) {
     t_router_opts router_opts2 = router_opts;
-    router_opts2.astar_fac = 0.;
+    router_opts2.astar_fac = 0.f;
+    router_opts2.astar_offset = 0.f;
 
     //Look at all the direct connections that exist, and add overrides to delay model
     auto& device_ctx = g_vpr_ctx.device();

diff --git a/vpr/src/route/SerialNetlistRouter.h b/vpr/src/route/SerialNetlistRouter.h
@@ -21,8 +21,8 @@ class SerialNetlistRouter : public NetlistRouter {
         const RoutingPredictor& routing_predictor,
         const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots,
         bool is_flat)
-        : _serial_router(_make_router(router_lookahead, is_flat, false))
-        , _parallel_router(_make_router(router_lookahead, is_flat, true))
+        : _serial_router(_make_router(router_lookahead, router_opts, is_flat, false))
+        , _parallel_router(_make_router(router_lookahead, router_opts, is_flat, true))
         , _net_list(net_list)
         , _router_opts(router_opts)
         , _connections_inf(connections_inf)
@@ -45,8 +45,10 @@ class SerialNetlistRouter : public NetlistRouter {
 
   private:
     bool should_use_parallel_connection_router(const ParentNetId &net_id, int itry, float pres_fac, float worst_neg_slack);
-
-    ConnectionRouterInterface *_make_router(const RouterLookahead* router_lookahead, bool is_flat, bool is_parallel) {
+
+    ConnectionRouterInterface *_make_router(const RouterLookahead* router_lookahead,
+                                            const t_router_opts& router_opts,
+                                            bool is_flat, bool is_parallel) {
         auto& device_ctx = g_vpr_ctx.device();
         auto& route_ctx = g_vpr_ctx.mutable_routing();
 
@@ -71,7 +73,11 @@ class SerialNetlistRouter : public NetlistRouter {
                 device_ctx.rr_rc_data,
                 device_ctx.rr_graph.rr_switch(),
                 route_ctx.rr_node_route_inf,
-                is_flat);
+                is_flat,
+                router_opts.multi_queue_num_threads,
+                router_opts.multi_queue_num_queues,
+                router_opts.multi_queue_direct_draining,
+                router_opts.thread_affinity);
             }
     }
     /* Context fields */

diff --git a/vpr/src/route/connection_router.cpp b/vpr/src/route/connection_router.cpp
@@ -1,6 +1,7 @@
 #include "connection_router.h"
-#include "rr_graph.h"
 
+#include <algorithm>
+#include "rr_graph.h"
 #include "binary_heap.h"
 #include "bucket.h"
 #include "rr_graph_fwd.h"
@@ -695,8 +696,8 @@ float ConnectionRouter<Heap>::compute_node_cost_using_rcv(const t_conn_cost_para
     float expected_total_delay_cost;
     float expected_total_cong_cost;
 
-    float expected_total_cong = cost_params.astar_fac * expected_cong + backwards_cong;
-    float expected_total_delay = cost_params.astar_fac * expected_delay + backwards_delay;
+    float expected_total_cong = expected_cong + backwards_cong;
+    float expected_total_delay = expected_delay + backwards_delay;
 
     //If budgets specified calculate cost as described by RCV paper:
     //    R. Fung, V. Betz and W. Chow, "Slack Allocation and Routing to Improve FPGA Timing While
@@ -835,7 +836,7 @@ void ConnectionRouter<Heap>::evaluate_timing_driven_node_costs(t_heap* to,
                                                               target_node,
                                                               cost_params,
                                                               to->R_upstream);
-    total_cost += to->backward_path_cost + cost_params.astar_fac * expected_cost;
+    total_cost += to->backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
 
     // if (rcv_path_manager.is_enabled() && to->path_data != nullptr) {
     //     to->path_data->backward_delay += cost_params.criticality * Tdel;
@@ -952,12 +953,8 @@ void ConnectionRouter<Heap>::add_route_tree_node_to_heap(
 
     if (!rcv_path_manager.is_enabled()) {
         // tot_cost = backward_path_cost + cost_params.astar_fac * expected_cost;
-        float tot_cost = backward_path_cost
-                         + cost_params.astar_fac
-                               * router_lookahead_.get_expected_cost(inode,
-                                                                     target_node,
-                                                                     cost_params,
-                                                                     R_upstream);
+        float expected_cost = router_lookahead_.get_expected_cost(inode, target_node, cost_params, R_upstream);
+        float tot_cost = backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
         VTR_LOGV_DEBUG(router_debug_, "  Adding node %8d to heap from init route tree with cost %g (%s)\n",
                        inode,
                        tot_cost,

diff --git a/vpr/src/route/connection_router_interface.h b/vpr/src/route/connection_router_interface.h
@@ -23,6 +23,7 @@ struct t_conn_delay_budget {
 struct t_conn_cost_params {
     float criticality = 1.;
     float astar_fac = 1.2;
+    float astar_offset = 0.f;
     float post_target_prune_fac = 1.2f;
     float post_target_prune_offset = 0.f;
     float bend_cost = 1.;

diff --git a/vpr/src/route/multi_queue_priority_queue.h b/vpr/src/route/multi_queue_priority_queue.h
@@ -1,6 +1,13 @@
 #ifndef _MULTI_QUEUE_PRIORITY_QUEUE_H
 #define _MULTI_QUEUE_PRIORITY_QUEUE_H
 
+// This is only used to enable the clearing code in the MQIO codebase. Whether
+// using queue draining optimization only depends on the VPR command-line option
+// `--multi_queue_direct_draining` setting during runtime. If the option is set
+// to `off`, the queue draining won't work since the `setMinPrioForPop` won't be
+// called leaving the `minPrioForPop` in MQIO object always as float maximum.
+#define MQ_IO_ENABLE_CLEAR_FOR_POP
+
 #include "heap_type.h"
 
 #include "MultiQueueIO.h"
@@ -34,7 +41,11 @@ class MultiQueuePriorityQueue {
     void build_heap();
     inline uint64_t getNumPushes() const { return pq_->getNumPushes(); }
     inline uint64_t getNumPops() const { return pq_->getNumPops(); }
+    inline uint64_t getHeapOccupancy() const { return pq_->getQueueOccupancy(); }
     inline void reset() { pq_->reset(); }
+#ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
+    inline void setMinPrioForPop(const pq_prio_t& minPrio) { pq_->setMinPrioForPop(minPrio); }
+#endif
 
   private:
     MQ_IO* pq_;