diff --git a/libs/EXTERNAL/CPS b/libs/EXTERNAL/CPS
index d8b81cb9bb4..2bdb4b5db20 160000
--- a/libs/EXTERNAL/CPS
+++ b/libs/EXTERNAL/CPS
@@ -1 +1 @@
-Subproject commit d8b81cb9bb4540acd782aea513e93b8e47b0b7a3
+Subproject commit 2bdb4b5db200d63520e1fe0b20db0ac1a9c7ea99
diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp
index cddb196ab1b..0c15a9c6653 100644
--- a/vpr/src/base/SetupVPR.cpp
+++ b/vpr/src/base/SetupVPR.cpp
@@ -410,9 +410,14 @@ static void SetupRoutingArch(const t_arch& Arch,
 static void SetupRouterOpts(const t_options& Options, t_router_opts* RouterOpts) {
     RouterOpts->do_check_rr_graph = Options.check_rr_graph;
     RouterOpts->astar_fac = Options.astar_fac;
+    RouterOpts->astar_offset = Options.astar_offset;
     RouterOpts->router_profiler_astar_fac = Options.router_profiler_astar_fac;
     RouterOpts->post_target_prune_fac = Options.post_target_prune_fac;
     RouterOpts->post_target_prune_offset = Options.post_target_prune_offset;
+    RouterOpts->multi_queue_num_threads = Options.multi_queue_num_threads;
+    RouterOpts->multi_queue_num_queues = Options.multi_queue_num_queues;
+    RouterOpts->multi_queue_direct_draining = Options.multi_queue_direct_draining;
+    RouterOpts->thread_affinity = Options.thread_affinity;
     RouterOpts->bb_factor = Options.bb_factor;
     RouterOpts->criticality_exp = Options.criticality_exp;
     RouterOpts->max_criticality = Options.max_criticality;
diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp
index 042277647cf..ab5c73e9b3b 100644
--- a/vpr/src/base/ShowSetup.cpp
+++ b/vpr/src/base/ShowSetup.cpp
@@ -257,6 +257,12 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
         VTR_LOG("false\n");
     }
 
+    auto transform_thread_affinity_list_to_str = [](const std::vector<int>& aff) { // "off" when empty, else "a,b,c"
+        std::string joined = aff.empty() ? "off" : std::to_string(aff.front());
+        for (size_t idx = 1; idx < aff.size(); ++idx) joined += ',' + std::to_string(aff[idx]);
+        return joined;
+    };
+
     if (DETAILED == RouterOpts.route_type) {
         VTR_LOG("RouterOpts.router_algorithm: ");
         switch (RouterOpts.router_algorithm) {
@@ -338,7 +344,14 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
 
         if (TIMING_DRIVEN == RouterOpts.router_algorithm) {
             VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac);
+            VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset);
             VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac);
+            VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac);
+            VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset);
+            VTR_LOG("RouterOpts.multi_queue_num_threads: %d\n", RouterOpts.multi_queue_num_threads);
+            VTR_LOG("RouterOpts.multi_queue_num_queues: %d\n", RouterOpts.multi_queue_num_queues);
+            VTR_LOG("RouterOpts.multi_queue_direct_draining: %s\n", RouterOpts.multi_queue_direct_draining ? "true" : "false");
+            VTR_LOG("RouterOpts.thread_affinity: %s\n", transform_thread_affinity_list_to_str(RouterOpts.thread_affinity).c_str());
             VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp);
             VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality);
             VTR_LOG("RouterOpts.init_wirelength_abort_threshold: %f\n", RouterOpts.init_wirelength_abort_threshold);
@@ -482,9 +495,14 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
         VTR_LOG("RouterOpts.exit_after_first_routing_iteration: %s\n", RouterOpts.exit_after_first_routing_iteration ? "true" : "false");
         if (TIMING_DRIVEN == RouterOpts.router_algorithm) {
             VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac);
+            VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset);
             VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac);
             VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac);
             VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset);
+            VTR_LOG("RouterOpts.multi_queue_num_threads: %d\n", RouterOpts.multi_queue_num_threads);
+            VTR_LOG("RouterOpts.multi_queue_num_queues: %d\n", RouterOpts.multi_queue_num_queues);
+            VTR_LOG("RouterOpts.multi_queue_direct_draining: %s\n", RouterOpts.multi_queue_direct_draining ? "true" : "false");
+            VTR_LOG("RouterOpts.thread_affinity: %s\n", transform_thread_affinity_list_to_str(RouterOpts.thread_affinity).c_str());
             VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp);
             VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality);
             VTR_LOG("RouterOpts.init_wirelength_abort_threshold: %f\n", RouterOpts.init_wirelength_abort_threshold);
diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
index 8940f5f350c..bc11f18a067 100644
--- a/vpr/src/base/read_options.cpp
+++ b/vpr/src/base/read_options.cpp
@@ -1259,6 +1259,67 @@ struct ParsePostSynthNetlistUnconnOutputHandling {
     }
 };
 
+// argparse converter for `--thread_affinity`; malformed input becomes a conversion error, not an abort.
+struct ParseTheadAffinityList {
+    // Splits `str` at every `delimiter`; empty fields are kept, so "" yields {""}.
+    inline std::vector<std::string> get_tokens_split_by_delimiter(const std::string& str, char delimiter) {
+        std::vector<std::string> tokens(1);
+        for (const char c : str) {
+            if (c == delimiter) tokens.emplace_back();
+            else tokens.back() += c;
+        }
+        return tokens;
+    }
+
+    // Reads a non-negative decimal core ID into `core_id`. Returns false for empty, non-digit or
+    // overlong tokens; at most 9 digits are accepted so std::stoi cannot overflow a 32-bit int.
+    inline bool parse_core_id(const std::string& token, int& core_id) {
+        if (token.empty() || token.size() > 9) return false;
+        for (const char c : token) {
+            if (c < '0' || c > '9') return false;
+        }
+        core_id = std::stoi(token);
+        return true;
+    }
+
+    // Parse thread/core affinity list (i.e., pin threads to specific cores). Formats such as
+    // `0,1,2,3,4,5,6,7`, `0-7`, `0-3,4-7` and `0,1-2,3-6,7` are all supported. Returns an empty list
+    // if `str` is malformed (empty/non-numeric ID, several dashes in a field, descending range).
+    inline std::vector<int> parse_thread_affinity_list(const std::string& str) {
+        std::vector<int> thread_affinity_list;
+        for (const auto& field : get_tokens_split_by_delimiter(str, ',')) {
+            const std::vector<std::string> bounds = get_tokens_split_by_delimiter(field, '-');
+            int first_core_id = 0, last_core_id = 0;
+            if (bounds.size() > 2 || !parse_core_id(bounds.front(), first_core_id)) return {};
+            if (!parse_core_id(bounds.back(), last_core_id) || first_core_id > last_core_id) return {};
+            for (int i = first_core_id; i <= last_core_id; ++i) thread_affinity_list.push_back(i);
+        }
+        return thread_affinity_list;
+    }
+
+    // Accepts "off" (empty list, i.e. no pinning) or a core list in the formats described above.
+    ConvertedValue<std::vector<int>> from_str(const std::string& str) {
+        ConvertedValue<std::vector<int>> conv_value;
+        const std::vector<int> core_ids = parse_thread_affinity_list(str);
+        if (str == "off" || !core_ids.empty()) {
+            conv_value.set_value(core_ids);
+        } else {
+            conv_value.set_error("Invalid thread affinity list '" + str + "' (expected 'off' or core IDs/ranges like 0,1-2,3-6,7)");
+        }
+        return conv_value;
+    }
+
+    // Inverse of from_str(): "off" for an empty list, otherwise the comma-separated core IDs.
+    ConvertedValue<std::string> to_str(std::vector<int> val) {
+        ConvertedValue<std::string> conv_value;
+        std::string str = val.empty() ? "off" : std::to_string(val.front());
+        for (size_t i = 1; i < val.size(); ++i) str += ',' + std::to_string(val[i]);
+        conv_value.set_value(str);
+        return conv_value;
+    }
+    std::vector<std::string> default_choices() { return {}; }
+};
+
 argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& args) {
     std::string description =
         "Implements the specified circuit onto the target FPGA architecture"
@@ -2477,6 +2538,14 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
         .default_value("1.2")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
+    route_timing_grp.add_argument(args.astar_offset, "--astar_offset")
+        .help(
+            "Controls the directedness of the timing-driven router's exploration."
+            " It is a subtractive adjustment to the lookahead heuristic."
+            " Values between 0 and 1e-9 are reasonable; higher values may increase quality at the expense of run-time.")
+        .default_value("0.0")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
     route_timing_grp.add_argument(args.router_profiler_astar_fac, "--router_profiler_astar_fac")
         .help(
             "Controls the directedness of the timing-driven router's exploration"
@@ -2496,6 +2565,26 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
         .default_value("0.0")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
+    route_timing_grp.add_argument<int>(args.multi_queue_num_threads, "--multi_queue_num_threads")
+        .help("Number of threads (including the main thread) used by the parallel connection router. Must be at least 1.")
+        .default_value("1")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    route_timing_grp.add_argument<int>(args.multi_queue_num_queues, "--multi_queue_num_queues")
+        .help("Number of queues in the multi-queue priority queue used by the parallel connection router.")
+        .default_value("2")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    route_timing_grp.add_argument<bool, ParseOnOff>(args.multi_queue_direct_draining, "--multi_queue_direct_draining")
+        .help("Controls whether the parallel connection router drains the multi-queue directly once the target node has been reached.")
+        .default_value("off")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    route_timing_grp.add_argument<std::vector<int>, ParseTheadAffinityList>(args.thread_affinity, "--thread_affinity")
+        .help("Pins the parallel connection router's threads to CPU cores. Valid values: 'off', or a comma-separated list of core IDs and ranges (e.g. 0,1-2,3-6,7) naming exactly one core per thread (see --multi_queue_num_threads).")
+        .default_value("off")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
     route_timing_grp.add_argument(args.max_criticality, "--max_criticality")
         .help(
             "Sets the maximum fraction of routing cost derived from delay (vs routability) for any net."
diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h
index ce1538eeb93..01b06c69353 100644
--- a/vpr/src/base/read_options.h
+++ b/vpr/src/base/read_options.h
@@ -205,9 +205,14 @@ struct t_options {
 
     /* Timing-driven router options only */
     argparse::ArgValue<float> astar_fac;
+    argparse::ArgValue<float> astar_offset;
     argparse::ArgValue<float> router_profiler_astar_fac;
     argparse::ArgValue<float> post_target_prune_fac;
     argparse::ArgValue<float> post_target_prune_offset;
+    argparse::ArgValue<int> multi_queue_num_threads;
+    argparse::ArgValue<int> multi_queue_num_queues;
+    argparse::ArgValue<bool> multi_queue_direct_draining;
+    argparse::ArgValue<std::vector<int>> thread_affinity;
     argparse::ArgValue<float> max_criticality;
     argparse::ArgValue<float> criticality_exp;
     argparse::ArgValue<float> router_init_wirelength_abort_threshold;
diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h
index 0e217e4d0e2..bcbc4626a02 100644
--- a/vpr/src/base/vpr_types.h
+++ b/vpr/src/base/vpr_types.h
@@ -1332,6 +1332,8 @@ struct t_placer_opts {
  *             an essentially breadth-first search, astar_fac = 1 is near   *
  *             the usual astar algorithm and astar_fac > 1 are more         *
  *             aggressive.                                                  *
+ * astar_offset: Offset that is subtracted from the lookahead (expected     *
+ *               future costs) in the timing-driven router.                 *
  * max_criticality: The maximum criticality factor (from 0 to 1) any sink   *
  *                  will ever have (i.e. clip criticality to this number).  *
  * criticality_exp: Set criticality to (path_length(sink) / longest_path) ^ *
@@ -1419,9 +1421,14 @@ struct t_router_opts {
     enum e_router_algorithm router_algorithm;
     enum e_base_cost_type base_cost_type;
     float astar_fac;
+    float astar_offset;
     float router_profiler_astar_fac;
     float post_target_prune_fac;
     float post_target_prune_offset;
+    int multi_queue_num_threads;
+    int multi_queue_num_queues;
+    bool multi_queue_direct_draining;
+    std::vector<int> thread_affinity;
     float max_criticality;
     float criticality_exp;
     float init_wirelength_abort_threshold;
diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp
index d0aacd8f78a..3cc771ffa2a 100644
--- a/vpr/src/place/timing_place_lookup.cpp
+++ b/vpr/src/place/timing_place_lookup.cpp
@@ -1187,7 +1187,8 @@ void OverrideDelayModel::compute_override_delay_model(
     RouterDelayProfiler& route_profiler,
     const t_router_opts& router_opts) {
     t_router_opts router_opts2 = router_opts;
-    router_opts2.astar_fac = 0.;
+    router_opts2.astar_fac = 0.f;
+    router_opts2.astar_offset = 0.f;
 
     //Look at all the direct connections that exist, and add overrides to delay model
     auto& device_ctx = g_vpr_ctx.device();
diff --git a/vpr/src/route/SerialNetlistRouter.h b/vpr/src/route/SerialNetlistRouter.h
index 89d439eddd7..3cb8a1373b3 100644
--- a/vpr/src/route/SerialNetlistRouter.h
+++ b/vpr/src/route/SerialNetlistRouter.h
@@ -21,8 +21,8 @@ class SerialNetlistRouter : public NetlistRouter {
         const RoutingPredictor& routing_predictor,
         const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots,
         bool is_flat)
-        : _serial_router(_make_router(router_lookahead, is_flat, false))
-        , _parallel_router(_make_router(router_lookahead, is_flat, true))
+        : _serial_router(_make_router(router_lookahead, router_opts, is_flat, false))
+        , _parallel_router(_make_router(router_lookahead, router_opts, is_flat, true))
         , _net_list(net_list)
         , _router_opts(router_opts)
         , _connections_inf(connections_inf)
@@ -45,8 +45,10 @@ class SerialNetlistRouter : public NetlistRouter {
 
   private:
     bool should_use_parallel_connection_router(const ParentNetId &net_id, int itry, float pres_fac, float worst_neg_slack);
-    
-    ConnectionRouterInterface *_make_router(const RouterLookahead* router_lookahead, bool is_flat, bool is_parallel) {
+
+    ConnectionRouterInterface *_make_router(const RouterLookahead* router_lookahead,
+                                            const t_router_opts& router_opts,
+                                            bool is_flat, bool is_parallel) {
         auto& device_ctx = g_vpr_ctx.device();
         auto& route_ctx = g_vpr_ctx.mutable_routing();
 
@@ -71,7 +73,11 @@ class SerialNetlistRouter : public NetlistRouter {
                 device_ctx.rr_rc_data,
                 device_ctx.rr_graph.rr_switch(),
                 route_ctx.rr_node_route_inf,
-                is_flat);
+                is_flat,
+                router_opts.multi_queue_num_threads,
+                router_opts.multi_queue_num_queues,
+                router_opts.multi_queue_direct_draining,
+                router_opts.thread_affinity);
             }
     }
     /* Context fields */
diff --git a/vpr/src/route/connection_router.cpp b/vpr/src/route/connection_router.cpp
index a600f46f295..b0096dc9d85 100644
--- a/vpr/src/route/connection_router.cpp
+++ b/vpr/src/route/connection_router.cpp
@@ -1,6 +1,7 @@
 #include "connection_router.h"
-#include "rr_graph.h"
 
+#include <algorithm>
+#include "rr_graph.h"
 #include "binary_heap.h"
 #include "bucket.h"
 #include "rr_graph_fwd.h"
@@ -695,8 +696,8 @@ float ConnectionRouter<Heap>::compute_node_cost_using_rcv(const t_conn_cost_para
     float expected_total_delay_cost;
     float expected_total_cong_cost;
 
-    float expected_total_cong = cost_params.astar_fac * expected_cong + backwards_cong;
-    float expected_total_delay = cost_params.astar_fac * expected_delay + backwards_delay;
+    float expected_total_cong = expected_cong + backwards_cong;
+    float expected_total_delay = expected_delay + backwards_delay;
 
     //If budgets specified calculate cost as described by RCV paper:
     //    R. Fung, V. Betz and W. Chow, "Slack Allocation and Routing to Improve FPGA Timing While
@@ -835,7 +836,7 @@ void ConnectionRouter<Heap>::evaluate_timing_driven_node_costs(t_heap* to,
                                                               target_node,
                                                               cost_params,
                                                               to->R_upstream);
-    total_cost += to->backward_path_cost + cost_params.astar_fac * expected_cost;
+    total_cost += to->backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
 
     // if (rcv_path_manager.is_enabled() && to->path_data != nullptr) {
     //     to->path_data->backward_delay += cost_params.criticality * Tdel;
@@ -952,12 +953,8 @@ void ConnectionRouter<Heap>::add_route_tree_node_to_heap(
 
     if (!rcv_path_manager.is_enabled()) {
         // tot_cost = backward_path_cost + cost_params.astar_fac * expected_cost;
-        float tot_cost = backward_path_cost
-                         + cost_params.astar_fac
-                               * router_lookahead_.get_expected_cost(inode,
-                                                                     target_node,
-                                                                     cost_params,
-                                                                     R_upstream);
+        float expected_cost = router_lookahead_.get_expected_cost(inode, target_node, cost_params, R_upstream);
+        float tot_cost = backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
         VTR_LOGV_DEBUG(router_debug_, "  Adding node %8d to heap from init route tree with cost %g (%s)\n",
                        inode,
                        tot_cost,
diff --git a/vpr/src/route/connection_router_interface.h b/vpr/src/route/connection_router_interface.h
index 9d52966cbb9..7da28115b43 100644
--- a/vpr/src/route/connection_router_interface.h
+++ b/vpr/src/route/connection_router_interface.h
@@ -23,6 +23,7 @@ struct t_conn_delay_budget {
 struct t_conn_cost_params {
     float criticality = 1.;
     float astar_fac = 1.2;
+    float astar_offset = 0.f;
     float post_target_prune_fac = 1.2f;
     float post_target_prune_offset = 0.f;
     float bend_cost = 1.;
diff --git a/vpr/src/route/multi_queue_priority_queue.h b/vpr/src/route/multi_queue_priority_queue.h
index 0428bda539e..005c64eeeb0 100644
--- a/vpr/src/route/multi_queue_priority_queue.h
+++ b/vpr/src/route/multi_queue_priority_queue.h
@@ -1,6 +1,13 @@
 #ifndef _MULTI_QUEUE_PRIORITY_QUEUE_H
 #define _MULTI_QUEUE_PRIORITY_QUEUE_H
 
+// This macro only compiles in the clear-for-pop code path of the MQIO codebase.
+// Whether the queue-draining optimization is actually used depends solely on the
+// VPR command-line option `--multi_queue_direct_draining` at runtime. If the option
+// is `off`, no draining takes place because `setMinPrioForPop` is never called,
+// which leaves `minPrioForPop` in the MQIO object at its float maximum.
+#define MQ_IO_ENABLE_CLEAR_FOR_POP
+
 #include "heap_type.h"
 
 #include "MultiQueueIO.h"
@@ -34,7 +41,11 @@ class MultiQueuePriorityQueue {
     void build_heap();
     inline uint64_t getNumPushes() const { return pq_->getNumPushes(); }
     inline uint64_t getNumPops() const { return pq_->getNumPops(); }
+    inline uint64_t getHeapOccupancy() const { return pq_->getQueueOccupancy(); }
     inline void reset() { pq_->reset(); }
+#ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
+    inline void setMinPrioForPop(const pq_prio_t& minPrio) { pq_->setMinPrioForPop(minPrio); }
+#endif
 
   private:
     MQ_IO* pq_;
diff --git a/vpr/src/route/parallel_connection_router.cpp b/vpr/src/route/parallel_connection_router.cpp
index 868a251f2ba..26bfb429925 100644
--- a/vpr/src/route/parallel_connection_router.cpp
+++ b/vpr/src/route/parallel_connection_router.cpp
@@ -382,8 +382,22 @@ void ParallelConnectionRouter::timing_driven_route_connection_from_heap_thread_f
     // cheapest t_heap in current route tree to be expanded on
     float new_total_cost;
     RRNodeId inode;
+#ifdef PROFILE_HEAP_OCCUPANCY
+    unsigned count = 0;
+    if (thread_idx == 0) {
+        heap_occ_profile_ << size_t(sink_node) << "\n";
+    }
+#endif
     // While the heap is not empty do
     while (heap_.try_pop(new_total_cost, inode)) {
+#ifdef PROFILE_HEAP_OCCUPANCY
+        if (thread_idx == 0) {
+            if (count % 1000 == 0) {
+                heap_occ_profile_ << count << " " << heap_.getHeapOccupancy() << "\n";
+            }
+            count ++;
+        }
+#endif
         // update_router_stats(router_stats_,
         //                     false,
         //                     cheapest->index,
@@ -391,10 +405,6 @@ void ParallelConnectionRouter::timing_driven_route_connection_from_heap_thread_f
 
         // Should we explore the neighbors of this node?
 
-        if (inode == sink_node) {
-            continue;
-        }
-
         if (should_not_explore_neighbors(inode, new_total_cost, rr_node_route_inf_[inode].backward_path_cost, sink_node, rr_node_route_inf_, cost_params)) {
             continue;
         }
@@ -619,7 +629,15 @@ void ParallelConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_param
 
     releaseLock(to_node);
 
-    heap_.add_to_heap(next.total_cost, to_node);
+    if (to_node == target_node) {
+#ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
+        if (multi_queue_direct_draining_) {
+            heap_.setMinPrioForPop(new_total_cost);
+        }
+#endif
+        return ;
+    }
+    heap_.add_to_heap(new_total_cost, to_node);
 
     // update_router_stats(router_stats_,
     //                     true,
@@ -764,7 +782,7 @@ void ParallelConnectionRouter::evaluate_timing_driven_node_costs(node_t* to,
                                                               target_node,
                                                               cost_params,
                                                               to->R_upstream);
-    total_cost += to->backward_path_cost + cost_params.astar_fac * expected_cost;
+    total_cost += to->backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
 
     // if (rcv_path_manager.is_enabled() && to->path_data != nullptr) {
     //     to->path_data->backward_delay += cost_params.criticality * Tdel;
@@ -877,12 +895,8 @@ void ParallelConnectionRouter::add_route_tree_node_to_heap(
 
     // if (!rcv_path_manager.is_enabled()) {
         // tot_cost = backward_path_cost + cost_params.astar_fac * expected_cost;
-        float tot_cost = backward_path_cost
-                         + cost_params.astar_fac
-                               * router_lookahead_.get_expected_cost(inode,
-                                                                     target_node,
-                                                                     cost_params,
-                                                                     R_upstream);
+        float expected_cost = router_lookahead_.get_expected_cost(inode, target_node, cost_params, R_upstream);
+        float tot_cost = backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
         VTR_LOGV_DEBUG(router_debug_, "  Adding node %8d to heap from init route tree with cost %g (%s)\n",
                        inode,
                        tot_cost,
diff --git a/vpr/src/route/parallel_connection_router.h b/vpr/src/route/parallel_connection_router.h
index 172f7010a07..a480289dae3 100644
--- a/vpr/src/route/parallel_connection_router.h
+++ b/vpr/src/route/parallel_connection_router.h
@@ -11,6 +11,11 @@
 #include "router_stats.h"
 #include "spatial_route_tree_lookup.h"
 
+// #define PROFILE_HEAP_OCCUPANCY
+#ifdef PROFILE_HEAP_OCCUPANCY
+#include <fstream>
+#endif
+
 #define VPR_PARALLEL_CONNECTION_ROUTER_USE_MULTI_QUEUE
 // #define VPR_PARALLEL_CONNECTION_ROUTER_USE_ONE_TBB
 
@@ -30,15 +35,6 @@ using ParallelPriorityQueue = OneTBBConcurrentPriorityQueue;
 #include <mutex>
 #include <condition_variable>
 
-const size_t mq_num_threads = std::atoi(
-    std::getenv("MQ_NUM_THREADS") ? std::getenv("MQ_NUM_THREADS") : "1");
-const size_t mq_num_queues_per_thread = std::atoi(
-    std::getenv("MQ_NUM_QUEUES_PER_THREAD") ? std::getenv("MQ_NUM_QUEUES_PER_THREAD") : "2");
-const size_t mq_num_queues_from_env = std::atoi(
-    std::getenv("MQ_NUM_QUEUES") ? std::getenv("MQ_NUM_QUEUES") : "0");
-const size_t mq_num_queues = mq_num_queues_from_env ?
-    mq_num_queues_from_env : (mq_num_threads * mq_num_queues_per_thread);
-
 class spin_lock_t {
     std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
 public:
@@ -131,7 +127,11 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
         const std::vector<t_rr_rc_data>& rr_rc_data,
         const vtr::vector<RRSwitchId, t_rr_switch_inf>& rr_switch_inf,
         vtr::vector<RRNodeId, t_rr_node_route_inf>& rr_node_route_inf,
-        bool is_flat)
+        bool is_flat,
+        int multi_queue_num_threads,
+        int multi_queue_num_queues,
+        bool multi_queue_direct_draining,
+        const std::vector<int>& thread_affinity)
         : grid_(grid)
         , router_lookahead_(router_lookahead)
         , rr_nodes_(rr_nodes.view())
@@ -142,22 +142,52 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
         , net_terminal_group_num(g_vpr_ctx.routing().net_terminal_group_num)
         , rr_node_route_inf_(rr_node_route_inf)
         , is_flat_(is_flat)
-        , modified_rr_node_inf_(mq_num_threads)
+        , modified_rr_node_inf_(multi_queue_num_threads)
         , router_stats_(nullptr)
-        , heap_(mq_num_threads, mq_num_queues)
-        , thread_barrier_(mq_num_threads)
+        , heap_(multi_queue_num_threads, multi_queue_num_queues)
+        , thread_barrier_(multi_queue_num_threads)
         , is_router_destroying_(false)
         , locks_(rr_node_route_inf.size())
-        , router_debug_(false) {
+        , router_debug_(false)
+        , multi_queue_direct_draining_(multi_queue_direct_draining) {
         heap_.init_heap(grid);
         only_opin_inter_layer = (grid.get_num_layers() > 1) && inter_layer_connections_limited_to_opin(*rr_graph);
-        std::cout << "#T=" << mq_num_threads << " #Q=" << mq_num_queues << std::endl << std::flush;
-        sub_threads_.resize(mq_num_threads-1);
+        sub_threads_.resize(multi_queue_num_threads - 1);
         thread_barrier_.init();
-        for (size_t i = 0 ; i < mq_num_threads - 1; ++i) {
+
+#ifdef PROFILE_HEAP_OCCUPANCY
+        heap_occ_profile_.open("occupancy.txt", std::ios::trunc);
+#endif
+
+        bool enable_thread_affinity = thread_affinity.size() > 0;
+        VTR_ASSERT((!enable_thread_affinity) || (static_cast<int>(thread_affinity.size()) == multi_queue_num_threads));
+
+        for (int i = 0 ; i < multi_queue_num_threads - 1; ++i) {
             sub_threads_[i] = std::thread(&ParallelConnectionRouter::timing_driven_route_connection_from_heap_sub_thread_wrapper, this, i + 1 /*0: main thread*/);
+            // When affinity is enabled, pin this sub-thread: build a cpu_set_t with only CPU thread_affinity[i + 1] set.
+            if (enable_thread_affinity) {
+                cpu_set_t cpuset;
+                CPU_ZERO(&cpuset);
+                CPU_SET(thread_affinity[i + 1], &cpuset);
+                int rc = pthread_setaffinity_np(sub_threads_[i].native_handle(),
+                                                sizeof(cpu_set_t), &cpuset);
+                if (rc != 0) {
+                    VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc);
+                }
+            }
             sub_threads_[i].detach();
         }
+
+        if (enable_thread_affinity) {
+            cpu_set_t cpuset;
+            CPU_ZERO(&cpuset);
+            CPU_SET(thread_affinity[0], &cpuset);
+            int rc = pthread_setaffinity_np(pthread_self(),
+                                            sizeof(cpu_set_t), &cpuset);
+            if (rc != 0) {
+                VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc);
+            }
+        }
     }
 
     ~ParallelConnectionRouter() {
@@ -165,6 +195,10 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
         thread_barrier_.wait();
 
         VTR_LOG("Parallel Connection Router is being destroyed. Time spent computing SSSP: %g seconds\n.", this->sssp_total_time.count() / 1000000.0);
+
+#ifdef PROFILE_HEAP_OCCUPANCY
+        heap_occ_profile_.close();
+#endif
     }
 
     // Clear's the modified list.  Should be called after reset_path_costs
@@ -424,6 +458,13 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
 
     // Timing
     std::chrono::microseconds sssp_total_time{0};
+
+    bool multi_queue_direct_draining_;
+
+    // Profiling
+#ifdef PROFILE_HEAP_OCCUPANCY
+    std::ofstream heap_occ_profile_;
+#endif
 };
 
 #endif /* _PARALLEL_CONNECTION_ROUTER_H */
diff --git a/vpr/src/route/route_net.tpp b/vpr/src/route/route_net.tpp
index 8046d855b55..98e8cacadfa 100644
--- a/vpr/src/route/route_net.tpp
+++ b/vpr/src/route/route_net.tpp
@@ -139,6 +139,7 @@ inline NetResultFlags route_net(ConnectionRouter *router,
     t_conn_delay_budget conn_delay_budget;
     t_conn_cost_params cost_params;
     cost_params.astar_fac = router_opts.astar_fac;
+    cost_params.astar_offset = router_opts.astar_offset;
     cost_params.post_target_prune_fac = router_opts.post_target_prune_fac;
     cost_params.post_target_prune_offset = router_opts.post_target_prune_offset;
     cost_params.bend_cost = router_opts.bend_cost;
diff --git a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp
index 3f4dddcf8f0..5b7a241c759 100644
--- a/vpr/src/route/router_delay_profiling.cpp
+++ b/vpr/src/route/router_delay_profiling.cpp
@@ -95,6 +95,9 @@ bool RouterDelayProfiler::calculate_delay(RRNodeId source_node,
     t_conn_cost_params cost_params;
     cost_params.criticality = 1.;
     cost_params.astar_fac = router_opts.router_profiler_astar_fac;
+    cost_params.astar_offset = router_opts.astar_offset;
+    cost_params.post_target_prune_fac = router_opts.post_target_prune_fac;
+    cost_params.post_target_prune_offset = router_opts.post_target_prune_offset;
     cost_params.bend_cost = router_opts.bend_cost;
 
     route_budgets budgeting_inf(net_list_, is_flat_);
@@ -164,6 +167,9 @@ vtr::vector<RRNodeId, float> calculate_all_path_delays_from_rr_node(RRNodeId src
     t_conn_cost_params cost_params;
     cost_params.criticality = 1.;
     cost_params.astar_fac = router_opts.astar_fac;
+    cost_params.astar_offset = router_opts.astar_offset;
+    cost_params.post_target_prune_fac = router_opts.post_target_prune_fac;
+    cost_params.post_target_prune_offset = router_opts.post_target_prune_offset;
     cost_params.bend_cost = router_opts.bend_cost;
     /* This function is called during placement. Thus, the flat routing option should be disabled. */
     //TODO: Placement is run with is_flat=false. However, since is_flat is passed, det_routing_arch should
diff --git a/vpr/test/test_connection_router.cpp b/vpr/test/test_connection_router.cpp
index 1b0c236a29a..f55dfe39dd9 100644
--- a/vpr/test/test_connection_router.cpp
+++ b/vpr/test/test_connection_router.cpp
@@ -41,6 +41,9 @@ static float do_one_route(RRNodeId source_node,
     t_conn_cost_params cost_params;
     cost_params.criticality = router_opts.max_criticality;
     cost_params.astar_fac = router_opts.astar_fac;
+    cost_params.astar_offset = router_opts.astar_offset;
+    cost_params.post_target_prune_fac = router_opts.post_target_prune_fac;
+    cost_params.post_target_prune_offset = router_opts.post_target_prune_offset;
     cost_params.bend_cost = router_opts.bend_cost;
 
     const Netlist<>& net_list = is_flat ? (const Netlist<>&)g_vpr_ctx.atom().nlist : (const Netlist<>&)g_vpr_ctx.clustering().clb_nlist;