From 91cb6c1c76e57545619b2559fa41b38acff3c2e1 Mon Sep 17 00:00:00 2001
From: Hang Yan <ueqri@outlook.com>
Date: Tue, 2 Jul 2024 08:56:02 -0400
Subject: [PATCH 1/9] [ParallelRouter] Added Customized Heap and Occupancy
 Profiling for MQ

Added a customized heap with the indexing-from-one optimization and the
ability to drain/clear the heap directly.

Added a heap occupancy profiling method to gain insight into the MQ heap
occupancy and workload.
---
 vpr/src/route/multi_queue_priority_queue.cpp |  2 +-
 vpr/src/route/multi_queue_priority_queue.h   |  2 +
 vpr/src/route/parallel_connection_router.cpp | 15 ++++
 vpr/src/route/parallel_connection_router.h   | 86 ++++++++++++++++++++
 4 files changed, 104 insertions(+), 1 deletion(-)
diff --git a/vpr/src/route/multi_queue_priority_queue.cpp b/vpr/src/route/multi_queue_priority_queue.cpp
index c928f18c113..af15927fa91 100644
--- a/vpr/src/route/multi_queue_priority_queue.cpp
+++ b/vpr/src/route/multi_queue_priority_queue.cpp
@@ -20,7 +20,7 @@ void MultiQueuePriorityQueue::init_heap(const DeviceGrid& grid) {
 }
 
 bool MultiQueuePriorityQueue::try_pop(pq_prio_t &prio, RRNodeId &node) {
-    auto tmp = pq_->tryPop();
+    auto tmp = pq_->tryPopWithMinPrio();
     if (!tmp) {
         return false;
     } else {
diff --git a/vpr/src/route/multi_queue_priority_queue.h b/vpr/src/route/multi_queue_priority_queue.h
index 0428bda539e..ce037e90ec9 100644
--- a/vpr/src/route/multi_queue_priority_queue.h
+++ b/vpr/src/route/multi_queue_priority_queue.h
@@ -34,7 +34,9 @@ class MultiQueuePriorityQueue {
     void build_heap();
     inline uint64_t getNumPushes() const { return pq_->getNumPushes(); }
     inline uint64_t getNumPops() const { return pq_->getNumPops(); }
+    inline uint64_t getHeapOccupancy() const { return pq_->getQueueOccupancy(); }
     inline void reset() { pq_->reset(); }
+    inline void setMinPrio(const pq_prio_t min_prio) { pq_->setMinPrioForPop(min_prio); }
 
   private:
     MQ_IO* pq_;
diff --git a/vpr/src/route/parallel_connection_router.cpp b/vpr/src/route/parallel_connection_router.cpp
index 868a251f2ba..b702a85d9cd 100644
--- a/vpr/src/route/parallel_connection_router.cpp
+++ b/vpr/src/route/parallel_connection_router.cpp
@@ -382,8 +382,22 @@ void ParallelConnectionRouter::timing_driven_route_connection_from_heap_thread_f
     // cheapest t_heap in current route tree to be expanded on
     float new_total_cost;
     RRNodeId inode;
+#ifdef PROFILE_HEAP_OCCUPANCY
+    unsigned count = 0;
+    if (thread_idx == 0) {
+        heap_occ_profile_ << size_t(sink_node) << "\n";
+    }
+#endif
     // While the heap is not empty do
     while (heap_.try_pop(new_total_cost, inode)) {
+#ifdef PROFILE_HEAP_OCCUPANCY
+        if (thread_idx == 0) {
+            if (count % (1000 / mq_num_threads) == 0) {
+                heap_occ_profile_ << count << " " << heap_.getHeapOccupancy() << "\n";
+            }
+            count ++;
+        }
+#endif
         // update_router_stats(router_stats_,
         //                     false,
         //                     cheapest->index,
@@ -392,6 +406,7 @@ void ParallelConnectionRouter::timing_driven_route_connection_from_heap_thread_f
         // Should we explore the neighbors of this node?
 
         if (inode == sink_node) {
+            heap_.setMinPrio(new_total_cost);
             continue;
         }
 
diff --git a/vpr/src/route/parallel_connection_router.h b/vpr/src/route/parallel_connection_router.h
index 172f7010a07..85ae0e61507 100644
--- a/vpr/src/route/parallel_connection_router.h
+++ b/vpr/src/route/parallel_connection_router.h
@@ -11,6 +11,10 @@
 #include "router_stats.h"
 #include "spatial_route_tree_lookup.h"
 
+#include <fstream>
+
+#define ENABLE_CORE_AFFINITY
+
 #define VPR_PARALLEL_CONNECTION_ROUTER_USE_MULTI_QUEUE
 // #define VPR_PARALLEL_CONNECTION_ROUTER_USE_ONE_TBB
 
@@ -110,6 +114,41 @@ class barrier_spin_t {
 
 using barrier_t = barrier_spin_t;
 
+inline std::vector<std::string> get_tokens_split_by_delimiter(std::string str, char delimiter) {
+    std::vector<std::string> tokens;
+    std::string acc = "";
+    for(const auto &x : str) {
+        if (x == delimiter) {
+            tokens.push_back(acc);
+            acc = "";
+        } else {
+            acc += x;
+        }
+    }
+    tokens.push_back(acc);
+    return tokens;
+}
+
+inline std::vector<size_t> parse_core_affinity_list(std::string str) {
+    std::vector<size_t> core_affinity_list;
+    std::vector<std::string> lv1_tokens_split_by_comma = get_tokens_split_by_delimiter(str, ',');
+    for (const auto &l1_token : lv1_tokens_split_by_comma) {
+        std::vector<std::string> lv2_tokens_split_by_dash = get_tokens_split_by_delimiter(l1_token, '-');
+        size_t num_lv2_tokens = lv2_tokens_split_by_dash.size();
+        assert(num_lv2_tokens == 1 || num_lv2_tokens == 2);
+        if (num_lv2_tokens == 2) {
+            int start_core_id = std::stoi(lv2_tokens_split_by_dash[0]);
+            int end_core_id = std::stoi(lv2_tokens_split_by_dash[1]);
+            for (int i = start_core_id; i <= end_core_id; ++i) {
+                core_affinity_list.push_back(i);
+            }
+        } else {
+            core_affinity_list.push_back(std::stoi(lv2_tokens_split_by_dash[0]));
+        }
+    }
+    return core_affinity_list;
+}
+
 // Prune the heap when it contains 4x the number of nodes in the RR graph.
 // constexpr size_t kHeapPruneFactor = 4;
 
@@ -154,10 +193,48 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
         std::cout << "#T=" << mq_num_threads << " #Q=" << mq_num_queues << std::endl << std::flush;
         sub_threads_.resize(mq_num_threads-1);
         thread_barrier_.init();
+
+#ifdef PROFILE_HEAP_OCCUPANCY
+        heap_occ_profile_.open("occupancy.txt", std::ios::trunc);
+#endif
+
+#ifdef ENABLE_CORE_AFFINITY
+        std::vector<size_t> thread_core_affinity_mapping;
+        if (std::getenv("VPR_CORE_AFFINITY")) {
+            thread_core_affinity_mapping = parse_core_affinity_list(std::getenv("VPR_CORE_AFFINITY"));
+            assert(thread_core_affinity_mapping.size() == mq_num_threads);
+        } else {
+            for (size_t i = 0; i < mq_num_threads; ++i) {
+                thread_core_affinity_mapping.push_back(i);
+            }
+        }
+#endif
+
         for (size_t i = 0 ; i < mq_num_threads - 1; ++i) {
             sub_threads_[i] = std::thread(&ParallelConnectionRouter::timing_driven_route_connection_from_heap_sub_thread_wrapper, this, i + 1 /*0: main thread*/);
+            // Pin this sub-thread to its assigned core: build a cpu_set_t containing only thread_core_affinity_mapping[i + 1].
+#ifdef ENABLE_CORE_AFFINITY
+            cpu_set_t cpuset;
+            CPU_ZERO(&cpuset);
+            CPU_SET(thread_core_affinity_mapping[i + 1], &cpuset);
+            int rc = pthread_setaffinity_np(sub_threads_[i].native_handle(),
+                                            sizeof(cpu_set_t), &cpuset);
+            if (rc != 0) {
+                VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc);
+            }
+#endif
             sub_threads_[i].detach();
         }
+#ifdef ENABLE_CORE_AFFINITY
+        cpu_set_t cpuset;
+        CPU_ZERO(&cpuset);
+        CPU_SET(thread_core_affinity_mapping[0], &cpuset);
+        int rc = pthread_setaffinity_np(pthread_self(),
+                                        sizeof(cpu_set_t), &cpuset);
+        if (rc != 0) {
+            VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc);
+        }
+#endif
     }
 
     ~ParallelConnectionRouter() {
@@ -165,6 +242,10 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
         thread_barrier_.wait();
 
         VTR_LOG("Parallel Connection Router is being destroyed. Time spent computing SSSP: %g seconds\n.", this->sssp_total_time.count() / 1000000.0);
+
+#ifdef PROFILE_HEAP_OCCUPANCY
+        heap_occ_profile_.close();
+#endif
     }
 
     // Clear's the modified list.  Should be called after reset_path_costs
@@ -424,6 +505,11 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
 
     // Timing
     std::chrono::microseconds sssp_total_time{0};
+
+    // Profiling
+#ifdef PROFILE_HEAP_OCCUPANCY
+    std::ofstream heap_occ_profile_;
+#endif
 };
 
 #endif /* _PARALLEL_CONNECTION_ROUTER_H */

From b6ffc69b78957f003677893d69b979b55fa4685d Mon Sep 17 00:00:00 2001
From: Hang Yan <ueqri@outlook.com>
Date: Tue, 2 Jul 2024 09:14:10 -0400
Subject: [PATCH 2/9] [ParallelRouter] Added Comments for Setting Core Affinity
 List

Added detailed comments for configuring the core affinity list in the
parallel router.
---
 vpr/src/route/parallel_connection_router.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vpr/src/route/parallel_connection_router.h b/vpr/src/route/parallel_connection_router.h
index 85ae0e61507..47a32affcb9 100644
--- a/vpr/src/route/parallel_connection_router.h
+++ b/vpr/src/route/parallel_connection_router.h
@@ -13,6 +13,7 @@
 
 #include <fstream>
 
+// For details on setting core affinity, please see `parse_core_affinity_list`.
 #define ENABLE_CORE_AFFINITY
 
 #define VPR_PARALLEL_CONNECTION_ROUTER_USE_MULTI_QUEUE
@@ -129,6 +130,10 @@ inline std::vector<std::string> get_tokens_split_by_delimiter(std::string str, c
     return tokens;
 }
 
+// To assign core affinity (i.e., pin threads to specific cores), please set the
+// environment variable `export VPR_CORE_AFFINITY=0-8` before running VPR.
+// Formats such as `0,1,2,3,4,5,6,7` and `0-7` and `0-3,4-7` and `0,1-2,3-6,7`
+// are all supported.
 inline std::vector<size_t> parse_core_affinity_list(std::string str) {
     std::vector<size_t> core_affinity_list;
     std::vector<std::string> lv1_tokens_split_by_comma = get_tokens_split_by_delimiter(str, ',');

From c713e0bc9e4401140264f8a18d92dda7d5883de2 Mon Sep 17 00:00:00 2001
From: Hang Yan <ueqri@outlook.com>
Date: Thu, 4 Jul 2024 11:34:01 -0400
Subject: [PATCH 3/9] [ParallelRouter] Fixed Issues and Upgraded CPS Submodule

Fixed issues in PR#6 and upgraded the CPS submodule.
---
 libs/EXTERNAL/CPS                            |  2 +-
 vpr/src/route/multi_queue_priority_queue.cpp | 10 ++++++++--
 vpr/src/route/multi_queue_priority_queue.h   |  5 +++--
 vpr/src/route/parallel_connection_router.cpp |  7 +------
 vpr/src/route/parallel_connection_router.h   |  3 +++
 5 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/libs/EXTERNAL/CPS b/libs/EXTERNAL/CPS
index d8b81cb9bb4..e6bab594840 160000
--- a/libs/EXTERNAL/CPS
+++ b/libs/EXTERNAL/CPS
@@ -1 +1 @@
-Subproject commit d8b81cb9bb4540acd782aea513e93b8e47b0b7a3
+Subproject commit e6bab594840e5db2dd3c9bacb07e8f5402f8de63
diff --git a/vpr/src/route/multi_queue_priority_queue.cpp b/vpr/src/route/multi_queue_priority_queue.cpp
index af15927fa91..d5fa3ecc7ff 100644
--- a/vpr/src/route/multi_queue_priority_queue.cpp
+++ b/vpr/src/route/multi_queue_priority_queue.cpp
@@ -20,7 +20,7 @@ void MultiQueuePriorityQueue::init_heap(const DeviceGrid& grid) {
 }
 
 bool MultiQueuePriorityQueue::try_pop(pq_prio_t &prio, RRNodeId &node) {
-    auto tmp = pq_->tryPopWithMinPrio();
+    auto tmp = pq_->tryPop();
     if (!tmp) {
         return false;
     } else {
@@ -37,7 +37,13 @@ static inline pq_index_t cast_RRNodeId_to_pq_index_t(RRNodeId node) {
     return static_cast<pq_index_t>(std::size_t(node));
 }
 
-void MultiQueuePriorityQueue::add_to_heap(const pq_prio_t& prio, const RRNodeId& node) {
+void MultiQueuePriorityQueue::add_to_heap(const pq_prio_t& prio, const RRNodeId& node, const RRNodeId& target_node) {
+    if (node == target_node) {
+#ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
+        pq_.setMinPrio(new_total_cost);
+#endif
+        return;
+    }
     pq_->push({prio, cast_RRNodeId_to_pq_index_t(node)});
 }
 
diff --git a/vpr/src/route/multi_queue_priority_queue.h b/vpr/src/route/multi_queue_priority_queue.h
index ce037e90ec9..318b846b970 100644
--- a/vpr/src/route/multi_queue_priority_queue.h
+++ b/vpr/src/route/multi_queue_priority_queue.h
@@ -1,6 +1,8 @@
 #ifndef _MULTI_QUEUE_PRIORITY_QUEUE_H
 #define _MULTI_QUEUE_PRIORITY_QUEUE_H
 
+// #define MQ_IO_ENABLE_CLEAR_FOR_POP
+
 #include "heap_type.h"
 
 #include "MultiQueueIO.h"
@@ -26,7 +28,7 @@ class MultiQueuePriorityQueue {
 
     void init_heap(const DeviceGrid& grid);
     bool try_pop(pq_prio_t &prio, RRNodeId &node);
-    void add_to_heap(const pq_prio_t& prio, const RRNodeId& node);
+    void add_to_heap(const pq_prio_t& prio, const RRNodeId& node, const RRNodeId& target_node);
     void push_back(const pq_prio_t& prio, const RRNodeId& node);
     bool is_empty_heap() const;
     bool is_valid() const;
@@ -36,7 +38,6 @@ class MultiQueuePriorityQueue {
     inline uint64_t getNumPops() const { return pq_->getNumPops(); }
     inline uint64_t getHeapOccupancy() const { return pq_->getQueueOccupancy(); }
     inline void reset() { pq_->reset(); }
-    inline void setMinPrio(const pq_prio_t min_prio) { pq_->setMinPrioForPop(min_prio); }
 
   private:
     MQ_IO* pq_;
diff --git a/vpr/src/route/parallel_connection_router.cpp b/vpr/src/route/parallel_connection_router.cpp
index b702a85d9cd..487eced6df5 100644
--- a/vpr/src/route/parallel_connection_router.cpp
+++ b/vpr/src/route/parallel_connection_router.cpp
@@ -405,11 +405,6 @@ void ParallelConnectionRouter::timing_driven_route_connection_from_heap_thread_f
 
         // Should we explore the neighbors of this node?
 
-        if (inode == sink_node) {
-            heap_.setMinPrio(new_total_cost);
-            continue;
-        }
-
         if (should_not_explore_neighbors(inode, new_total_cost, rr_node_route_inf_[inode].backward_path_cost, sink_node, rr_node_route_inf_, cost_params)) {
             continue;
         }
@@ -634,7 +629,7 @@ void ParallelConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_param
 
     releaseLock(to_node);
 
-    heap_.add_to_heap(next.total_cost, to_node);
+    heap_.add_to_heap(new_total_cost, to_node, target_node);
 
     // update_router_stats(router_stats_,
     //                     true,
diff --git a/vpr/src/route/parallel_connection_router.h b/vpr/src/route/parallel_connection_router.h
index 47a32affcb9..72bd0635eed 100644
--- a/vpr/src/route/parallel_connection_router.h
+++ b/vpr/src/route/parallel_connection_router.h
@@ -11,7 +11,10 @@
 #include "router_stats.h"
 #include "spatial_route_tree_lookup.h"
 
+// #define PROFILE_HEAP_OCCUPANCY
+#ifdef PROFILE_HEAP_OCCUPANCY
 #include <fstream>
+#endif
 
 // For details on setting core affinity, please see `parse_core_affinity_list`.
 #define ENABLE_CORE_AFFINITY

From a3856d1d88d8346a42953196897acc60a80e06d3 Mon Sep 17 00:00:00 2001
From: Hang Yan <ueqri@outlook.com>
Date: Thu, 4 Jul 2024 16:25:46 -0400
Subject: [PATCH 4/9] [ParallelRouter] Fixed the Bug in the Multi-Queue Wrapper
 Class

Fixed a bug (basically, a typo) in the Multi-Queue wrapper class.
---
 vpr/src/route/multi_queue_priority_queue.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vpr/src/route/multi_queue_priority_queue.cpp b/vpr/src/route/multi_queue_priority_queue.cpp
index d5fa3ecc7ff..fc58947bfff 100644
--- a/vpr/src/route/multi_queue_priority_queue.cpp
+++ b/vpr/src/route/multi_queue_priority_queue.cpp
@@ -40,7 +40,7 @@ static inline pq_index_t cast_RRNodeId_to_pq_index_t(RRNodeId node) {
 void MultiQueuePriorityQueue::add_to_heap(const pq_prio_t& prio, const RRNodeId& node, const RRNodeId& target_node) {
     if (node == target_node) {
 #ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
-        pq_.setMinPrio(new_total_cost);
+        pq_->setMinPrio(prio);
 #endif
         return;
     }

From fa8ba4c6e730bf9755e06fd565823d285d3c433c Mon Sep 17 00:00:00 2001
From: Hang Yan <ueqri@outlook.com>
Date: Thu, 4 Jul 2024 21:00:34 -0400
Subject: [PATCH 5/9] [ParallelRouter] Fixed a Typo in the Multi-Queue Wrapper
 Class

Fixed a misspelled function name in the Multi-Queue wrapper class.
---
 libs/EXTERNAL/CPS                            | 2 +-
 vpr/src/route/multi_queue_priority_queue.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/EXTERNAL/CPS b/libs/EXTERNAL/CPS
index e6bab594840..c3991cfe1a2 160000
--- a/libs/EXTERNAL/CPS
+++ b/libs/EXTERNAL/CPS
@@ -1 +1 @@
-Subproject commit e6bab594840e5db2dd3c9bacb07e8f5402f8de63
+Subproject commit c3991cfe1a2bc7097cec719fb60dc29ab39c95a4
diff --git a/vpr/src/route/multi_queue_priority_queue.cpp b/vpr/src/route/multi_queue_priority_queue.cpp
index fc58947bfff..e8a8f23bc60 100644
--- a/vpr/src/route/multi_queue_priority_queue.cpp
+++ b/vpr/src/route/multi_queue_priority_queue.cpp
@@ -40,7 +40,7 @@ static inline pq_index_t cast_RRNodeId_to_pq_index_t(RRNodeId node) {
 void MultiQueuePriorityQueue::add_to_heap(const pq_prio_t& prio, const RRNodeId& node, const RRNodeId& target_node) {
     if (node == target_node) {
 #ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
-        pq_->setMinPrio(prio);
+        pq_->setMinPrioForPop(prio);
 #endif
         return;
     }

From e7108ca0482a2f9708343feda54d22ed1c47d5d6 Mon Sep 17 00:00:00 2001
From: Hang Yan <ueqri@outlook.com>
Date: Fri, 5 Jul 2024 23:39:00 -0400
Subject: [PATCH 6/9] [ParallelRouter] Made Deterministic Dijkstra Work with
 New Optimizations

The deterministic Dijkstra of the parallel router now works with all
the newly introduced optimizations, including (1) core affinity, (2) VPR
binary heap (indexing from one), and (3) queue draining (with min-prio)
for MQ pop.
---
 libs/EXTERNAL/CPS                            | 2 +-
 vpr/src/route/multi_queue_priority_queue.cpp | 8 +-------
 vpr/src/route/multi_queue_priority_queue.h   | 5 ++++-
 vpr/src/route/parallel_connection_router.cpp | 8 +++++++-
 4 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/libs/EXTERNAL/CPS b/libs/EXTERNAL/CPS
index c3991cfe1a2..2bdb4b5db20 160000
--- a/libs/EXTERNAL/CPS
+++ b/libs/EXTERNAL/CPS
@@ -1 +1 @@
-Subproject commit c3991cfe1a2bc7097cec719fb60dc29ab39c95a4
+Subproject commit 2bdb4b5db200d63520e1fe0b20db0ac1a9c7ea99
diff --git a/vpr/src/route/multi_queue_priority_queue.cpp b/vpr/src/route/multi_queue_priority_queue.cpp
index e8a8f23bc60..c928f18c113 100644
--- a/vpr/src/route/multi_queue_priority_queue.cpp
+++ b/vpr/src/route/multi_queue_priority_queue.cpp
@@ -37,13 +37,7 @@ static inline pq_index_t cast_RRNodeId_to_pq_index_t(RRNodeId node) {
     return static_cast<pq_index_t>(std::size_t(node));
 }
 
-void MultiQueuePriorityQueue::add_to_heap(const pq_prio_t& prio, const RRNodeId& node, const RRNodeId& target_node) {
-    if (node == target_node) {
-#ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
-        pq_->setMinPrioForPop(prio);
-#endif
-        return;
-    }
+void MultiQueuePriorityQueue::add_to_heap(const pq_prio_t& prio, const RRNodeId& node) {
     pq_->push({prio, cast_RRNodeId_to_pq_index_t(node)});
 }
 
diff --git a/vpr/src/route/multi_queue_priority_queue.h b/vpr/src/route/multi_queue_priority_queue.h
index 318b846b970..3ff2093e016 100644
--- a/vpr/src/route/multi_queue_priority_queue.h
+++ b/vpr/src/route/multi_queue_priority_queue.h
@@ -28,7 +28,7 @@ class MultiQueuePriorityQueue {
 
     void init_heap(const DeviceGrid& grid);
     bool try_pop(pq_prio_t &prio, RRNodeId &node);
-    void add_to_heap(const pq_prio_t& prio, const RRNodeId& node, const RRNodeId& target_node);
+    void add_to_heap(const pq_prio_t& prio, const RRNodeId& node);
     void push_back(const pq_prio_t& prio, const RRNodeId& node);
     bool is_empty_heap() const;
     bool is_valid() const;
@@ -38,6 +38,9 @@ class MultiQueuePriorityQueue {
     inline uint64_t getNumPops() const { return pq_->getNumPops(); }
     inline uint64_t getHeapOccupancy() const { return pq_->getQueueOccupancy(); }
     inline void reset() { pq_->reset(); }
+#ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
+    inline void setMinPrioForPop(const pq_prio_t& minPrio) { pq_->setMinPrioForPop(minPrio); }
+#endif
 
   private:
     MQ_IO* pq_;
diff --git a/vpr/src/route/parallel_connection_router.cpp b/vpr/src/route/parallel_connection_router.cpp
index 487eced6df5..bc7a4a36530 100644
--- a/vpr/src/route/parallel_connection_router.cpp
+++ b/vpr/src/route/parallel_connection_router.cpp
@@ -629,7 +629,13 @@ void ParallelConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_param
 
     releaseLock(to_node);
 
-    heap_.add_to_heap(new_total_cost, to_node, target_node);
+    if (to_node == target_node) {
+#ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
+        heap_.setMinPrioForPop(new_total_cost);
+#endif
+        return ;
+    }
+    heap_.add_to_heap(new_total_cost, to_node);
 
     // update_router_stats(router_stats_,
     //                     true,

From 6dc8f14febcb63933c220f497494537d99f9b85a Mon Sep 17 00:00:00 2001
From: AlexandreSinger <alex.singer@mail.utoronto.ca>
Date: Sat, 8 Jun 2024 18:17:16 -0400
Subject: [PATCH 7/9] [Route] Added astar_offset Parameter

Using an astar_offset can help better tune the ordering heuristic for
the search used to find the shortest path in the routing graph. It is
also necessary to ensure that the heuristic is an underestimate (without
setting the astar_fac to 0.0).
---
 vpr/src/base/SetupVPR.cpp                   |  1 +
 vpr/src/base/ShowSetup.cpp                  |  2 ++
 vpr/src/base/read_options.cpp               |  8 ++++++++
 vpr/src/base/read_options.h                 |  1 +
 vpr/src/base/vpr_types.h                    |  3 +++
 vpr/src/place/timing_place_lookup.cpp       |  3 ++-
 vpr/src/route/connection_router.cpp         | 17 +++++++----------
 vpr/src/route/connection_router_interface.h |  1 +
 vpr/src/route/route_net.tpp                 |  1 +
 vpr/src/route/router_delay_profiling.cpp    |  2 ++
 vpr/test/test_connection_router.cpp         |  1 +
 11 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp
index cddb196ab1b..7ff91fc26e9 100644
--- a/vpr/src/base/SetupVPR.cpp
+++ b/vpr/src/base/SetupVPR.cpp
@@ -410,6 +410,7 @@ static void SetupRoutingArch(const t_arch& Arch,
 static void SetupRouterOpts(const t_options& Options, t_router_opts* RouterOpts) {
     RouterOpts->do_check_rr_graph = Options.check_rr_graph;
     RouterOpts->astar_fac = Options.astar_fac;
+    RouterOpts->astar_offset = Options.astar_offset;
     RouterOpts->router_profiler_astar_fac = Options.router_profiler_astar_fac;
     RouterOpts->post_target_prune_fac = Options.post_target_prune_fac;
     RouterOpts->post_target_prune_offset = Options.post_target_prune_offset;
diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp
index 042277647cf..7ef660787d1 100644
--- a/vpr/src/base/ShowSetup.cpp
+++ b/vpr/src/base/ShowSetup.cpp
@@ -338,6 +338,7 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
 
         if (TIMING_DRIVEN == RouterOpts.router_algorithm) {
             VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac);
+            VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset);
             VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac);
             VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp);
             VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality);
@@ -482,6 +483,7 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
         VTR_LOG("RouterOpts.exit_after_first_routing_iteration: %s\n", RouterOpts.exit_after_first_routing_iteration ? "true" : "false");
         if (TIMING_DRIVEN == RouterOpts.router_algorithm) {
             VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac);
+            VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset);
             VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac);
             VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac);
             VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset);
diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
index 8940f5f350c..347fb0c8110 100644
--- a/vpr/src/base/read_options.cpp
+++ b/vpr/src/base/read_options.cpp
@@ -2477,6 +2477,14 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
         .default_value("1.2")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
+    route_timing_grp.add_argument(args.astar_offset, "--astar_offset")
+        .help(
+            "Controls the directedness of the timing-driven router's exploration."
+            " It is a subtractive adjustment to the lookahead heuristic."
+            " Values between 0 and 1e-9 are reasonable; higher values may increase quality at the expense of run-time.")
+        .default_value("0.0")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
     route_timing_grp.add_argument(args.router_profiler_astar_fac, "--router_profiler_astar_fac")
         .help(
             "Controls the directedness of the timing-driven router's exploration"
diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h
index ce1538eeb93..d73e06daf6a 100644
--- a/vpr/src/base/read_options.h
+++ b/vpr/src/base/read_options.h
@@ -205,6 +205,7 @@ struct t_options {
 
     /* Timing-driven router options only */
     argparse::ArgValue<float> astar_fac;
+    argparse::ArgValue<float> astar_offset;
     argparse::ArgValue<float> router_profiler_astar_fac;
     argparse::ArgValue<float> post_target_prune_fac;
     argparse::ArgValue<float> post_target_prune_offset;
diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h
index 0e217e4d0e2..aa1d0570452 100644
--- a/vpr/src/base/vpr_types.h
+++ b/vpr/src/base/vpr_types.h
@@ -1332,6 +1332,8 @@ struct t_placer_opts {
  *             an essentially breadth-first search, astar_fac = 1 is near   *
  *             the usual astar algorithm and astar_fac > 1 are more         *
  *             aggressive.                                                  *
+ * astar_offset: Offset that is subtracted from the lookahead (expected     *
+ *               future costs) in the timing-driven router.                 *
  * max_criticality: The maximum criticality factor (from 0 to 1) any sink   *
  *                  will ever have (i.e. clip criticality to this number).  *
  * criticality_exp: Set criticality to (path_length(sink) / longest_path) ^ *
@@ -1419,6 +1421,7 @@ struct t_router_opts {
     enum e_router_algorithm router_algorithm;
     enum e_base_cost_type base_cost_type;
     float astar_fac;
+    float astar_offset;
     float router_profiler_astar_fac;
     float post_target_prune_fac;
     float post_target_prune_offset;
diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp
index d0aacd8f78a..3cc771ffa2a 100644
--- a/vpr/src/place/timing_place_lookup.cpp
+++ b/vpr/src/place/timing_place_lookup.cpp
@@ -1187,7 +1187,8 @@ void OverrideDelayModel::compute_override_delay_model(
     RouterDelayProfiler& route_profiler,
     const t_router_opts& router_opts) {
     t_router_opts router_opts2 = router_opts;
-    router_opts2.astar_fac = 0.;
+    router_opts2.astar_fac = 0.f;
+    router_opts2.astar_offset = 0.f;
 
     //Look at all the direct connections that exist, and add overrides to delay model
     auto& device_ctx = g_vpr_ctx.device();
diff --git a/vpr/src/route/connection_router.cpp b/vpr/src/route/connection_router.cpp
index a600f46f295..b0096dc9d85 100644
--- a/vpr/src/route/connection_router.cpp
+++ b/vpr/src/route/connection_router.cpp
@@ -1,6 +1,7 @@
 #include "connection_router.h"
-#include "rr_graph.h"
 
+#include <algorithm>
+#include "rr_graph.h"
 #include "binary_heap.h"
 #include "bucket.h"
 #include "rr_graph_fwd.h"
@@ -695,8 +696,8 @@ float ConnectionRouter<Heap>::compute_node_cost_using_rcv(const t_conn_cost_para
     float expected_total_delay_cost;
     float expected_total_cong_cost;
 
-    float expected_total_cong = cost_params.astar_fac * expected_cong + backwards_cong;
-    float expected_total_delay = cost_params.astar_fac * expected_delay + backwards_delay;
+    float expected_total_cong = expected_cong + backwards_cong;
+    float expected_total_delay = expected_delay + backwards_delay;
 
     //If budgets specified calculate cost as described by RCV paper:
     //    R. Fung, V. Betz and W. Chow, "Slack Allocation and Routing to Improve FPGA Timing While
@@ -835,7 +836,7 @@ void ConnectionRouter<Heap>::evaluate_timing_driven_node_costs(t_heap* to,
                                                               target_node,
                                                               cost_params,
                                                               to->R_upstream);
-    total_cost += to->backward_path_cost + cost_params.astar_fac * expected_cost;
+    total_cost += to->backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
 
     // if (rcv_path_manager.is_enabled() && to->path_data != nullptr) {
     //     to->path_data->backward_delay += cost_params.criticality * Tdel;
@@ -952,12 +953,8 @@ void ConnectionRouter<Heap>::add_route_tree_node_to_heap(
 
     if (!rcv_path_manager.is_enabled()) {
         // tot_cost = backward_path_cost + cost_params.astar_fac * expected_cost;
-        float tot_cost = backward_path_cost
-                         + cost_params.astar_fac
-                               * router_lookahead_.get_expected_cost(inode,
-                                                                     target_node,
-                                                                     cost_params,
-                                                                     R_upstream);
+        float expected_cost = router_lookahead_.get_expected_cost(inode, target_node, cost_params, R_upstream);
+        float tot_cost = backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
         VTR_LOGV_DEBUG(router_debug_, "  Adding node %8d to heap from init route tree with cost %g (%s)\n",
                        inode,
                        tot_cost,
diff --git a/vpr/src/route/connection_router_interface.h b/vpr/src/route/connection_router_interface.h
index 9d52966cbb9..7da28115b43 100644
--- a/vpr/src/route/connection_router_interface.h
+++ b/vpr/src/route/connection_router_interface.h
@@ -23,6 +23,7 @@ struct t_conn_delay_budget {
 struct t_conn_cost_params {
     float criticality = 1.;
     float astar_fac = 1.2;
+    float astar_offset = 0.f;
     float post_target_prune_fac = 1.2f;
     float post_target_prune_offset = 0.f;
     float bend_cost = 1.;
diff --git a/vpr/src/route/route_net.tpp b/vpr/src/route/route_net.tpp
index 8046d855b55..98e8cacadfa 100644
--- a/vpr/src/route/route_net.tpp
+++ b/vpr/src/route/route_net.tpp
@@ -139,6 +139,7 @@ inline NetResultFlags route_net(ConnectionRouter *router,
     t_conn_delay_budget conn_delay_budget;
     t_conn_cost_params cost_params;
     cost_params.astar_fac = router_opts.astar_fac;
+    cost_params.astar_offset = router_opts.astar_offset;
     cost_params.post_target_prune_fac = router_opts.post_target_prune_fac;
     cost_params.post_target_prune_offset = router_opts.post_target_prune_offset;
     cost_params.bend_cost = router_opts.bend_cost;
diff --git a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp
index 3f4dddcf8f0..e8dea9e08be 100644
--- a/vpr/src/route/router_delay_profiling.cpp
+++ b/vpr/src/route/router_delay_profiling.cpp
@@ -95,6 +95,7 @@ bool RouterDelayProfiler::calculate_delay(RRNodeId source_node,
     t_conn_cost_params cost_params;
     cost_params.criticality = 1.;
     cost_params.astar_fac = router_opts.router_profiler_astar_fac;
+    cost_params.astar_offset = router_opts.astar_offset;
     cost_params.bend_cost = router_opts.bend_cost;
 
     route_budgets budgeting_inf(net_list_, is_flat_);
@@ -164,6 +165,7 @@ vtr::vector<RRNodeId, float> calculate_all_path_delays_from_rr_node(RRNodeId src
     t_conn_cost_params cost_params;
     cost_params.criticality = 1.;
     cost_params.astar_fac = router_opts.astar_fac;
+    cost_params.astar_offset = router_opts.astar_offset;
     cost_params.bend_cost = router_opts.bend_cost;
     /* This function is called during placement. Thus, the flat routing option should be disabled. */
     //TODO: Placement is run with is_flat=false. However, since is_flat is passed, det_routing_arch should
diff --git a/vpr/test/test_connection_router.cpp b/vpr/test/test_connection_router.cpp
index 1b0c236a29a..4ede318a120 100644
--- a/vpr/test/test_connection_router.cpp
+++ b/vpr/test/test_connection_router.cpp
@@ -41,6 +41,7 @@ static float do_one_route(RRNodeId source_node,
     t_conn_cost_params cost_params;
     cost_params.criticality = router_opts.max_criticality;
     cost_params.astar_fac = router_opts.astar_fac;
+    cost_params.astar_offset = router_opts.astar_offset;
     cost_params.bend_cost = router_opts.bend_cost;
 
     const Netlist<>& net_list = is_flat ? (const Netlist<>&)g_vpr_ctx.atom().nlist : (const Netlist<>&)g_vpr_ctx.clustering().clb_nlist;

From df520ae02c9c16325424448969164d8c75f4c7ff Mon Sep 17 00:00:00 2001
From: Hang Yan <ueqri@outlook.com>
Date: Fri, 5 Jul 2024 23:47:53 -0400
Subject: [PATCH 8/9] [ParallelRouter] Added `astar_offset` and Made
 Deterministic A-Star Work

Added `astar_offset` to the parallel router, based on the cherry-picked
commit (ada43a5) from VPR master. With this change and a suitable value
for `astar_offset` (one that ensures the ordering heuristic is an
underestimate), the deterministic A-Star of the parallel router works.
---
 vpr/src/route/parallel_connection_router.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/vpr/src/route/parallel_connection_router.cpp b/vpr/src/route/parallel_connection_router.cpp
index bc7a4a36530..8bc75fee235 100644
--- a/vpr/src/route/parallel_connection_router.cpp
+++ b/vpr/src/route/parallel_connection_router.cpp
@@ -780,7 +780,7 @@ void ParallelConnectionRouter::evaluate_timing_driven_node_costs(node_t* to,
                                                               target_node,
                                                               cost_params,
                                                               to->R_upstream);
-    total_cost += to->backward_path_cost + cost_params.astar_fac * expected_cost;
+    total_cost += to->backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
 
     // if (rcv_path_manager.is_enabled() && to->path_data != nullptr) {
     //     to->path_data->backward_delay += cost_params.criticality * Tdel;
@@ -893,12 +893,8 @@ void ParallelConnectionRouter::add_route_tree_node_to_heap(
 
     // if (!rcv_path_manager.is_enabled()) {
         // tot_cost = backward_path_cost + cost_params.astar_fac * expected_cost;
-        float tot_cost = backward_path_cost
-                         + cost_params.astar_fac
-                               * router_lookahead_.get_expected_cost(inode,
-                                                                     target_node,
-                                                                     cost_params,
-                                                                     R_upstream);
+        float expected_cost = router_lookahead_.get_expected_cost(inode, target_node, cost_params, R_upstream);
+        float tot_cost = backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
         VTR_LOGV_DEBUG(router_debug_, "  Adding node %8d to heap from init route tree with cost %g (%s)\n",
                        inode,
                        tot_cost,

From 825b8be3600c9037bb50c53e382dd1b7bf6350e3 Mon Sep 17 00:00:00 2001
From: Hang Yan <ueqri@outlook.com>
Date: Thu, 11 Jul 2024 22:26:58 -0400
Subject: [PATCH 9/9] [ParallelRouter] Added Command-Line Options for MQ-Based
 Parallel Router

Added four command-line options for the MQ-based parallel router:
(1) `--multi_queue_num_threads <# threads>`
(2) `--multi_queue_num_queues <# queues>`
(3) `--multi_queue_direct_draining <on/off>`
(4) `--thread_affinity <off (no affinity; scheduling is left to the OS)
    or a list of CPU core IDs (the first one is for the main thread),
    e.g., 0,1,2,3 or 0-3 or 0-1,2-3 or 0,1-2,3>`
---
 vpr/src/base/SetupVPR.cpp                    |   4 +
 vpr/src/base/ShowSetup.cpp                   |  16 +++
 vpr/src/base/read_options.cpp                |  81 +++++++++++++
 vpr/src/base/read_options.h                  |   4 +
 vpr/src/base/vpr_types.h                     |   4 +
 vpr/src/route/SerialNetlistRouter.h          |  16 ++-
 vpr/src/route/multi_queue_priority_queue.h   |   7 +-
 vpr/src/route/parallel_connection_router.cpp |   6 +-
 vpr/src/route/parallel_connection_router.h   | 117 +++++--------------
 vpr/src/route/router_delay_profiling.cpp     |   4 +
 vpr/test/test_connection_router.cpp          |   2 +
 11 files changed, 168 insertions(+), 93 deletions(-)

diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp
index 7ff91fc26e9..0c15a9c6653 100644
--- a/vpr/src/base/SetupVPR.cpp
+++ b/vpr/src/base/SetupVPR.cpp
@@ -414,6 +414,10 @@ static void SetupRouterOpts(const t_options& Options, t_router_opts* RouterOpts)
     RouterOpts->router_profiler_astar_fac = Options.router_profiler_astar_fac;
     RouterOpts->post_target_prune_fac = Options.post_target_prune_fac;
     RouterOpts->post_target_prune_offset = Options.post_target_prune_offset;
+    RouterOpts->multi_queue_num_threads = Options.multi_queue_num_threads;
+    RouterOpts->multi_queue_num_queues = Options.multi_queue_num_queues;
+    RouterOpts->multi_queue_direct_draining = Options.multi_queue_direct_draining;
+    RouterOpts->thread_affinity = Options.thread_affinity;
     RouterOpts->bb_factor = Options.bb_factor;
     RouterOpts->criticality_exp = Options.criticality_exp;
     RouterOpts->max_criticality = Options.max_criticality;
diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp
index 7ef660787d1..ab5c73e9b3b 100644
--- a/vpr/src/base/ShowSetup.cpp
+++ b/vpr/src/base/ShowSetup.cpp
@@ -257,6 +257,12 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
         VTR_LOG("false\n");
     }
 
+    auto transform_thread_affinity_list_to_str = [](const std::vector<int>& aff) {
+        std::string joined = aff.empty() ? "off" : std::to_string(aff.front());  // empty list prints as "off"
+        for (size_t idx = 1; idx < aff.size(); ++idx) joined += ',' + std::to_string(aff[idx]);
+        return joined;
+    };
+
     if (DETAILED == RouterOpts.route_type) {
         VTR_LOG("RouterOpts.router_algorithm: ");
         switch (RouterOpts.router_algorithm) {
@@ -340,6 +346,12 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
             VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac);
             VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset);
             VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac);
+            VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac);
+            VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset);
+            VTR_LOG("RouterOpts.multi_queue_num_threads: %d\n", RouterOpts.multi_queue_num_threads);
+            VTR_LOG("RouterOpts.multi_queue_num_queues: %d\n", RouterOpts.multi_queue_num_queues);
+            VTR_LOG("RouterOpts.multi_queue_direct_draining: %s\n", RouterOpts.multi_queue_direct_draining ? "true" : "false");
+            VTR_LOG("RouterOpts.thread_affinity: %s\n", transform_thread_affinity_list_to_str(RouterOpts.thread_affinity).c_str());
             VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp);
             VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality);
             VTR_LOG("RouterOpts.init_wirelength_abort_threshold: %f\n", RouterOpts.init_wirelength_abort_threshold);
@@ -487,6 +499,10 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
             VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac);
             VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac);
             VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset);
+            VTR_LOG("RouterOpts.multi_queue_num_threads: %d\n", RouterOpts.multi_queue_num_threads);
+            VTR_LOG("RouterOpts.multi_queue_num_queues: %d\n", RouterOpts.multi_queue_num_queues);
+            VTR_LOG("RouterOpts.multi_queue_direct_draining: %s\n", RouterOpts.multi_queue_direct_draining ? "true" : "false");
+            VTR_LOG("RouterOpts.thread_affinity: %s\n", transform_thread_affinity_list_to_str(RouterOpts.thread_affinity).c_str());
             VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp);
             VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality);
             VTR_LOG("RouterOpts.init_wirelength_abort_threshold: %f\n", RouterOpts.init_wirelength_abort_threshold);
diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
index 347fb0c8110..bc11f18a067 100644
--- a/vpr/src/base/read_options.cpp
+++ b/vpr/src/base/read_options.cpp
@@ -1259,6 +1259,67 @@ struct ParsePostSynthNetlistUnconnOutputHandling {
     }
 };
 
+struct ParseTheadAffinityList {
+    // Splits `str` at every `delimiter`; always yields at least one (possibly empty) token.
+    inline std::vector<std::string> get_tokens_split_by_delimiter(const std::string& str, char delimiter) {
+        std::vector<std::string> tokens(1);
+        for (const char c : str) {
+            if (c != delimiter) tokens.back() += c;
+            else tokens.emplace_back();
+        }
+        return tokens;
+    }
+
+    // Reads a core ID of 1-9 decimal digits (so it fits in an int); false for any other token.
+    inline bool parse_core_id(const std::string& token, int& core_id) {
+        if (token.empty() || token.size() > 9) return false;
+        core_id = 0;
+        for (const char c : token) {
+            if (c < '0' || c > '9') return false;
+            core_id = core_id * 10 + (c - '0');
+        }
+        return true;
+    }
+
+    // Parse thread/core affinity list (i.e., pin threads to specific cores).
+    // Formats such as `0,1,2,3,4,5,6,7` and `0-7` and `0-3,4-7` and `0,1-2,3-6,7`
+    // are all supported. A malformed list yields {} (a valid one names >= 1 core).
+    inline std::vector<int> parse_thread_affinity_list(const std::string& str) {
+        std::vector<int> thread_affinity_list;
+        for (const auto& l1_token : get_tokens_split_by_delimiter(str, ',')) {
+            const std::vector<std::string> bounds = get_tokens_split_by_delimiter(l1_token, '-');
+            int start_core_id = 0, end_core_id = 0;  // a lone ID is the range [id, id]
+            const bool ok = bounds.size() <= 2 && parse_core_id(bounds.front(), start_core_id) && parse_core_id(bounds.back(), end_core_id);
+            if (!ok || end_core_id < start_core_id) return {};
+            for (int i = start_core_id; i <= end_core_id; ++i) thread_affinity_list.push_back(i);
+        }
+        return thread_affinity_list;
+    }
+
+    // "off" means no pinning (empty list); bad input is reported via set_error, not an abort.
+    ConvertedValue<std::vector<int>> from_str(const std::string& str) {
+        ConvertedValue<std::vector<int>> conv_value;
+        const std::vector<int> cores = (str == "off") ? std::vector<int>{} : parse_thread_affinity_list(str);
+        if (str != "off" && cores.empty()) {
+            conv_value.set_error("Invalid thread affinity list '" + str + "' (expected 'off' or core IDs such as 0,1-2,3)");
+        } else {
+            conv_value.set_value(cores);
+        }
+        return conv_value;
+    }
+
+    // Renders the list as comma-separated core IDs, or "off" when it is empty.
+    ConvertedValue<std::string> to_str(std::vector<int> val) {
+        ConvertedValue<std::string> conv_value;
+        std::string str = val.size() ? std::to_string(val.front()) : "off";
+        for (size_t i = 1; i < val.size(); str += ',' + std::to_string(val[i++])) ;
+        conv_value.set_value(str);
+        return conv_value;
+    }
+
+    std::vector<std::string> default_choices() { return {}; }
+};
+
 argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& args) {
     std::string description =
         "Implements the specified circuit onto the target FPGA architecture"
@@ -2504,6 +2565,26 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
         .default_value("0.0")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
+    route_timing_grp.add_argument<int>(args.multi_queue_num_threads, "--multi_queue_num_threads")
+        .help("Number of threads used by the MultiQueue-based parallel connection router (including the main thread).")
+        .default_value("1")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    route_timing_grp.add_argument<int>(args.multi_queue_num_queues, "--multi_queue_num_queues")
+        .help("Total number of queues in the MultiQueue used by the parallel connection router.")
+        .default_value("2")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    route_timing_grp.add_argument<bool, ParseOnOff>(args.multi_queue_direct_draining, "--multi_queue_direct_draining")
+        .help("Enables the MultiQueue queue draining optimization: when the target node is reached, its cost is set as the minimum priority for later pops.")
+        .default_value("off")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    route_timing_grp.add_argument<std::vector<int>, ParseTheadAffinityList>(args.thread_affinity, "--thread_affinity")
+        .help("Pins the parallel router threads to CPU cores. 'off' leaves scheduling to the OS; otherwise give one core ID per thread (first one is for the main thread), e.g. 0,1,2,3 or 0-3 or 0-1,2-3 or 0,1-2,3.")
+        .default_value("off")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
     route_timing_grp.add_argument(args.max_criticality, "--max_criticality")
         .help(
             "Sets the maximum fraction of routing cost derived from delay (vs routability) for any net."
diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h
index d73e06daf6a..01b06c69353 100644
--- a/vpr/src/base/read_options.h
+++ b/vpr/src/base/read_options.h
@@ -209,6 +209,10 @@ struct t_options {
     argparse::ArgValue<float> router_profiler_astar_fac;
     argparse::ArgValue<float> post_target_prune_fac;
     argparse::ArgValue<float> post_target_prune_offset;
+    argparse::ArgValue<int> multi_queue_num_threads;
+    argparse::ArgValue<int> multi_queue_num_queues;
+    argparse::ArgValue<bool> multi_queue_direct_draining;
+    argparse::ArgValue<std::vector<int>> thread_affinity;
     argparse::ArgValue<float> max_criticality;
     argparse::ArgValue<float> criticality_exp;
     argparse::ArgValue<float> router_init_wirelength_abort_threshold;
diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h
index aa1d0570452..bcbc4626a02 100644
--- a/vpr/src/base/vpr_types.h
+++ b/vpr/src/base/vpr_types.h
@@ -1425,6 +1425,10 @@ struct t_router_opts {
     float router_profiler_astar_fac;
     float post_target_prune_fac;
     float post_target_prune_offset;
+    int multi_queue_num_threads;
+    int multi_queue_num_queues;
+    bool multi_queue_direct_draining;
+    std::vector<int> thread_affinity;
     float max_criticality;
     float criticality_exp;
     float init_wirelength_abort_threshold;
diff --git a/vpr/src/route/SerialNetlistRouter.h b/vpr/src/route/SerialNetlistRouter.h
index 89d439eddd7..3cb8a1373b3 100644
--- a/vpr/src/route/SerialNetlistRouter.h
+++ b/vpr/src/route/SerialNetlistRouter.h
@@ -21,8 +21,8 @@ class SerialNetlistRouter : public NetlistRouter {
         const RoutingPredictor& routing_predictor,
         const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots,
         bool is_flat)
-        : _serial_router(_make_router(router_lookahead, is_flat, false))
-        , _parallel_router(_make_router(router_lookahead, is_flat, true))
+        : _serial_router(_make_router(router_lookahead, router_opts, is_flat, false))
+        , _parallel_router(_make_router(router_lookahead, router_opts, is_flat, true))
         , _net_list(net_list)
         , _router_opts(router_opts)
         , _connections_inf(connections_inf)
@@ -45,8 +45,10 @@ class SerialNetlistRouter : public NetlistRouter {
 
   private:
     bool should_use_parallel_connection_router(const ParentNetId &net_id, int itry, float pres_fac, float worst_neg_slack);
-    
-    ConnectionRouterInterface *_make_router(const RouterLookahead* router_lookahead, bool is_flat, bool is_parallel) {
+
+    ConnectionRouterInterface *_make_router(const RouterLookahead* router_lookahead,
+                                            const t_router_opts& router_opts,
+                                            bool is_flat, bool is_parallel) {
         auto& device_ctx = g_vpr_ctx.device();
         auto& route_ctx = g_vpr_ctx.mutable_routing();
 
@@ -71,7 +73,11 @@ class SerialNetlistRouter : public NetlistRouter {
                 device_ctx.rr_rc_data,
                 device_ctx.rr_graph.rr_switch(),
                 route_ctx.rr_node_route_inf,
-                is_flat);
+                is_flat,
+                router_opts.multi_queue_num_threads,
+                router_opts.multi_queue_num_queues,
+                router_opts.multi_queue_direct_draining,
+                router_opts.thread_affinity);
             }
     }
     /* Context fields */
diff --git a/vpr/src/route/multi_queue_priority_queue.h b/vpr/src/route/multi_queue_priority_queue.h
index 3ff2093e016..005c64eeeb0 100644
--- a/vpr/src/route/multi_queue_priority_queue.h
+++ b/vpr/src/route/multi_queue_priority_queue.h
@@ -1,7 +1,12 @@
 #ifndef _MULTI_QUEUE_PRIORITY_QUEUE_H
 #define _MULTI_QUEUE_PRIORITY_QUEUE_H
 
-// #define MQ_IO_ENABLE_CLEAR_FOR_POP
+// This is only used to enable the clearing code in the MQIO codebase. Whether
+// the queue draining optimization is used depends only on the VPR command-line
+// option `--multi_queue_direct_draining` at runtime. If the option is set to
+// `off`, queue draining is disabled because `setMinPrioForPop` is never called,
+// leaving `minPrioForPop` in the MQIO object at the float maximum.
+#define MQ_IO_ENABLE_CLEAR_FOR_POP
 
 #include "heap_type.h"
 
diff --git a/vpr/src/route/parallel_connection_router.cpp b/vpr/src/route/parallel_connection_router.cpp
index 8bc75fee235..26bfb429925 100644
--- a/vpr/src/route/parallel_connection_router.cpp
+++ b/vpr/src/route/parallel_connection_router.cpp
@@ -392,7 +392,7 @@ void ParallelConnectionRouter::timing_driven_route_connection_from_heap_thread_f
     while (heap_.try_pop(new_total_cost, inode)) {
 #ifdef PROFILE_HEAP_OCCUPANCY
         if (thread_idx == 0) {
-            if (count % (1000 / mq_num_threads) == 0) {
+            if (count % 1000 == 0) {
                 heap_occ_profile_ << count << " " << heap_.getHeapOccupancy() << "\n";
             }
             count ++;
@@ -631,7 +631,9 @@ void ParallelConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_param
 
     if (to_node == target_node) {
 #ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
-        heap_.setMinPrioForPop(new_total_cost);
+        if (multi_queue_direct_draining_) {
+            heap_.setMinPrioForPop(new_total_cost);
+        }
 #endif
         return ;
     }
diff --git a/vpr/src/route/parallel_connection_router.h b/vpr/src/route/parallel_connection_router.h
index 72bd0635eed..a480289dae3 100644
--- a/vpr/src/route/parallel_connection_router.h
+++ b/vpr/src/route/parallel_connection_router.h
@@ -16,9 +16,6 @@
 #include <fstream>
 #endif
 
-// For details on setting core affinity, please see `parse_core_affinity_list`.
-#define ENABLE_CORE_AFFINITY
-
 #define VPR_PARALLEL_CONNECTION_ROUTER_USE_MULTI_QUEUE
 // #define VPR_PARALLEL_CONNECTION_ROUTER_USE_ONE_TBB
 
@@ -38,15 +35,6 @@ using ParallelPriorityQueue = OneTBBConcurrentPriorityQueue;
 #include <mutex>
 #include <condition_variable>
 
-const size_t mq_num_threads = std::atoi(
-    std::getenv("MQ_NUM_THREADS") ? std::getenv("MQ_NUM_THREADS") : "1");
-const size_t mq_num_queues_per_thread = std::atoi(
-    std::getenv("MQ_NUM_QUEUES_PER_THREAD") ? std::getenv("MQ_NUM_QUEUES_PER_THREAD") : "2");
-const size_t mq_num_queues_from_env = std::atoi(
-    std::getenv("MQ_NUM_QUEUES") ? std::getenv("MQ_NUM_QUEUES") : "0");
-const size_t mq_num_queues = mq_num_queues_from_env ?
-    mq_num_queues_from_env : (mq_num_threads * mq_num_queues_per_thread);
-
 class spin_lock_t {
     std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
 public:
@@ -118,45 +106,6 @@ class barrier_spin_t {
 
 using barrier_t = barrier_spin_t;
 
-inline std::vector<std::string> get_tokens_split_by_delimiter(std::string str, char delimiter) {
-    std::vector<std::string> tokens;
-    std::string acc = "";
-    for(const auto &x : str) {
-        if (x == delimiter) {
-            tokens.push_back(acc);
-            acc = "";
-        } else {
-            acc += x;
-        }
-    }
-    tokens.push_back(acc);
-    return tokens;
-}
-
-// To assign core affinity (i.e., pin threads to specific cores), please set the
-// environment variable `export VPR_CORE_AFFINITY=0-8` before running VPR.
-// Formats such as `0,1,2,3,4,5,6,7` and `0-7` and `0-3,4-7` and `0,1-2,3-6,7`
-// are all supported.
-inline std::vector<size_t> parse_core_affinity_list(std::string str) {
-    std::vector<size_t> core_affinity_list;
-    std::vector<std::string> lv1_tokens_split_by_comma = get_tokens_split_by_delimiter(str, ',');
-    for (const auto &l1_token : lv1_tokens_split_by_comma) {
-        std::vector<std::string> lv2_tokens_split_by_dash = get_tokens_split_by_delimiter(l1_token, '-');
-        size_t num_lv2_tokens = lv2_tokens_split_by_dash.size();
-        assert(num_lv2_tokens == 1 || num_lv2_tokens == 2);
-        if (num_lv2_tokens == 2) {
-            int start_core_id = std::stoi(lv2_tokens_split_by_dash[0]);
-            int end_core_id = std::stoi(lv2_tokens_split_by_dash[1]);
-            for (int i = start_core_id; i <= end_core_id; ++i) {
-                core_affinity_list.push_back(i);
-            }
-        } else {
-            core_affinity_list.push_back(std::stoi(lv2_tokens_split_by_dash[0]));
-        }
-    }
-    return core_affinity_list;
-}
-
 // Prune the heap when it contains 4x the number of nodes in the RR graph.
 // constexpr size_t kHeapPruneFactor = 4;
 
@@ -178,7 +127,11 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
         const std::vector<t_rr_rc_data>& rr_rc_data,
         const vtr::vector<RRSwitchId, t_rr_switch_inf>& rr_switch_inf,
         vtr::vector<RRNodeId, t_rr_node_route_inf>& rr_node_route_inf,
-        bool is_flat)
+        bool is_flat,
+        int multi_queue_num_threads,
+        int multi_queue_num_queues,
+        bool multi_queue_direct_draining,
+        const std::vector<int>& thread_affinity)
         : grid_(grid)
         , router_lookahead_(router_lookahead)
         , rr_nodes_(rr_nodes.view())
@@ -189,60 +142,52 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
         , net_terminal_group_num(g_vpr_ctx.routing().net_terminal_group_num)
         , rr_node_route_inf_(rr_node_route_inf)
         , is_flat_(is_flat)
-        , modified_rr_node_inf_(mq_num_threads)
+        , modified_rr_node_inf_(multi_queue_num_threads)
         , router_stats_(nullptr)
-        , heap_(mq_num_threads, mq_num_queues)
-        , thread_barrier_(mq_num_threads)
+        , heap_(multi_queue_num_threads, multi_queue_num_queues)
+        , thread_barrier_(multi_queue_num_threads)
         , is_router_destroying_(false)
         , locks_(rr_node_route_inf.size())
-        , router_debug_(false) {
+        , router_debug_(false)
+        , multi_queue_direct_draining_(multi_queue_direct_draining) {
         heap_.init_heap(grid);
         only_opin_inter_layer = (grid.get_num_layers() > 1) && inter_layer_connections_limited_to_opin(*rr_graph);
-        std::cout << "#T=" << mq_num_threads << " #Q=" << mq_num_queues << std::endl << std::flush;
-        sub_threads_.resize(mq_num_threads-1);
+        sub_threads_.resize(multi_queue_num_threads - 1);
         thread_barrier_.init();
 
 #ifdef PROFILE_HEAP_OCCUPANCY
         heap_occ_profile_.open("occupancy.txt", std::ios::trunc);
 #endif
 
-#ifdef ENABLE_CORE_AFFINITY
-        std::vector<size_t> thread_core_affinity_mapping;
-        if (std::getenv("VPR_CORE_AFFINITY")) {
-            thread_core_affinity_mapping = parse_core_affinity_list(std::getenv("VPR_CORE_AFFINITY"));
-            assert(thread_core_affinity_mapping.size() == mq_num_threads);
-        } else {
-            for (size_t i = 0; i < mq_num_threads; ++i) {
-                thread_core_affinity_mapping.push_back(i);
-            }
-        }
-#endif
+        bool enable_thread_affinity = thread_affinity.size() > 0;
+        VTR_ASSERT((!enable_thread_affinity) || (static_cast<int>(thread_affinity.size()) == multi_queue_num_threads));
 
-        for (size_t i = 0 ; i < mq_num_threads - 1; ++i) {
+        for (int i = 0 ; i < multi_queue_num_threads - 1; ++i) {
             sub_threads_[i] = std::thread(&ParallelConnectionRouter::timing_driven_route_connection_from_heap_sub_thread_wrapper, this, i + 1 /*0: main thread*/);
             // Create a cpu_set_t object representing a set of CPUs. Clear it and mark only CPU i as set.
-#ifdef ENABLE_CORE_AFFINITY
+            if (enable_thread_affinity) {
+                cpu_set_t cpuset;
+                CPU_ZERO(&cpuset);
+                CPU_SET(thread_affinity[i + 1], &cpuset);
+                int rc = pthread_setaffinity_np(sub_threads_[i].native_handle(),
+                                                sizeof(cpu_set_t), &cpuset);
+                if (rc != 0) {
+                    VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc);
+                }
+            }
+            sub_threads_[i].detach();
+        }
+
+        if (enable_thread_affinity) {
             cpu_set_t cpuset;
             CPU_ZERO(&cpuset);
-            CPU_SET(thread_core_affinity_mapping[i + 1], &cpuset);
-            int rc = pthread_setaffinity_np(sub_threads_[i].native_handle(),
+            CPU_SET(thread_affinity[0], &cpuset);
+            int rc = pthread_setaffinity_np(pthread_self(),
                                             sizeof(cpu_set_t), &cpuset);
             if (rc != 0) {
                 VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc);
             }
-#endif
-            sub_threads_[i].detach();
-        }
-#ifdef ENABLE_CORE_AFFINITY
-        cpu_set_t cpuset;
-        CPU_ZERO(&cpuset);
-        CPU_SET(thread_core_affinity_mapping[0], &cpuset);
-        int rc = pthread_setaffinity_np(pthread_self(),
-                                        sizeof(cpu_set_t), &cpuset);
-        if (rc != 0) {
-            VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc);
         }
-#endif
     }
 
     ~ParallelConnectionRouter() {
@@ -514,6 +459,8 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
     // Timing
     std::chrono::microseconds sssp_total_time{0};
 
+    bool multi_queue_direct_draining_;
+
     // Profiling
 #ifdef PROFILE_HEAP_OCCUPANCY
     std::ofstream heap_occ_profile_;
diff --git a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp
index e8dea9e08be..5b7a241c759 100644
--- a/vpr/src/route/router_delay_profiling.cpp
+++ b/vpr/src/route/router_delay_profiling.cpp
@@ -96,6 +96,8 @@ bool RouterDelayProfiler::calculate_delay(RRNodeId source_node,
     cost_params.criticality = 1.;
     cost_params.astar_fac = router_opts.router_profiler_astar_fac;
     cost_params.astar_offset = router_opts.astar_offset;
+    cost_params.post_target_prune_fac = router_opts.post_target_prune_fac;
+    cost_params.post_target_prune_offset = router_opts.post_target_prune_offset;
     cost_params.bend_cost = router_opts.bend_cost;
 
     route_budgets budgeting_inf(net_list_, is_flat_);
@@ -166,6 +168,8 @@ vtr::vector<RRNodeId, float> calculate_all_path_delays_from_rr_node(RRNodeId src
     cost_params.criticality = 1.;
     cost_params.astar_fac = router_opts.astar_fac;
     cost_params.astar_offset = router_opts.astar_offset;
+    cost_params.post_target_prune_fac = router_opts.post_target_prune_fac;
+    cost_params.post_target_prune_offset = router_opts.post_target_prune_offset;
     cost_params.bend_cost = router_opts.bend_cost;
     /* This function is called during placement. Thus, the flat routing option should be disabled. */
     //TODO: Placement is run with is_flat=false. However, since is_flat is passed, det_routing_arch should
diff --git a/vpr/test/test_connection_router.cpp b/vpr/test/test_connection_router.cpp
index 4ede318a120..f55dfe39dd9 100644
--- a/vpr/test/test_connection_router.cpp
+++ b/vpr/test/test_connection_router.cpp
@@ -42,6 +42,8 @@ static float do_one_route(RRNodeId source_node,
     cost_params.criticality = router_opts.max_criticality;
     cost_params.astar_fac = router_opts.astar_fac;
     cost_params.astar_offset = router_opts.astar_offset;
+    cost_params.post_target_prune_fac = router_opts.post_target_prune_fac;
+    cost_params.post_target_prune_offset = router_opts.post_target_prune_offset;
     cost_params.bend_cost = router_opts.bend_cost;
 
     const Netlist<>& net_list = is_flat ? (const Netlist<>&)g_vpr_ctx.atom().nlist : (const Netlist<>&)g_vpr_ctx.clustering().clb_nlist;