From 0c7800b5e3b3de38512f1f3cb2c9ba33813b6338 Mon Sep 17 00:00:00 2001
From: Duck Deux <duck2@protonmail.com>
Date: Thu, 30 Jan 2025 12:37:30 -0500
Subject: [PATCH] add NestedNetlistRouter and custom thread pool

---
 libs/libvtrutil/src/vtr_thread_pool.h | 159 ++++++++++++++++++++++++++
 vpr/src/base/ShowSetup.cpp            |   3 +
 vpr/src/base/read_options.cpp         |  15 ++-
 vpr/src/base/vpr_types.h              |   1 +
 vpr/src/route/NestedNetlistRouter.h   | 134 ++++++++++++++++++++++
 vpr/src/route/NestedNetlistRouter.tpp | 141 +++++++++++++++++++++++
 vpr/src/route/netlist_routers.h       |  15 +++
 7 files changed, 464 insertions(+), 4 deletions(-)
 create mode 100644 libs/libvtrutil/src/vtr_thread_pool.h
 create mode 100644 vpr/src/route/NestedNetlistRouter.h
 create mode 100644 vpr/src/route/NestedNetlistRouter.tpp
diff --git a/libs/libvtrutil/src/vtr_thread_pool.h b/libs/libvtrutil/src/vtr_thread_pool.h
new file mode 100644
index 00000000000..2185ac196e6
--- /dev/null
+++ b/libs/libvtrutil/src/vtr_thread_pool.h
@@ -0,0 +1,159 @@
+#pragma once
+
+/** 
+ * @file vtr_thread_pool.h
+ * @brief A generic thread pool for parallel task execution
+ */
+
+#include <thread>
+#include <queue>
+#include <mutex>
+#include <condition_variable>
+#include <memory>
+#include <atomic>
+#include <functional>
+#include <cstddef>
+#include <vector>
+#include "vtr_log.h"
+#include "vtr_time.h"
+
+namespace vtr {
+
+/**
+ * A thread pool for parallel task execution. It is a naive
+ * implementation which uses a queue for each thread and assigns
+ * tasks in a round robin fashion. A task that throws is logged and the
+ * exception is rethrown on the worker thread (it is not handed to the caller).
+ * Example usage:
+ *
+ * vtr::thread_pool pool(4);
+ * pool.schedule_work([]{
+ *     // Task body
+ * });
+ * pool.wait_for_all(); // There's no API to wait for a single task
+ */
+class thread_pool {
+  private:
+    /* Per-worker state; heap-allocated so its address stays stable for the worker */
+    struct ThreadData {
+        std::thread thread;
+        /* Per-thread task queue */
+        std::queue<std::function<void()>> task_queue;
+
+        /* Threads wait on cv for a stop signal or a new task
+         * queue_mutex guards task_queue and stop, and is required for the condition variable */
+        std::mutex queue_mutex;
+        std::condition_variable cv;
+        bool stop = false;
+    };
+
+    /* One entry per worker thread */
+    std::vector<std::unique_ptr<ThreadData>> threads;
+    /* Used for round-robin scheduling */
+    std::atomic<size_t> next_thread{0};
+    /* Number of scheduled tasks that have not finished yet; used for wait_for_all */
+    std::atomic<size_t> active_tasks{0};
+
+    /* Mutex and condition variable for wait_for_all */
+    std::mutex completion_mutex;
+    std::condition_variable completion_cv;
+
+  public:
+    /** Start thread_count workers; 0 is raised to 1 so schedule_work always has a queue to pick. */
+    thread_pool(size_t thread_count) {
+        if (thread_count == 0) thread_count = 1;
+        threads.reserve(thread_count);
+
+        for (size_t i = 0; i < thread_count; i++) {
+            auto thread_data = std::make_unique<ThreadData>();
+            /* Capture the raw pointer by value: the local unique_ptr is moved from below */
+            ThreadData* td = thread_data.get();
+            thread_data->thread = std::thread([td]() {
+                while (true) {
+                    std::function<void()> task;
+
+                    { /* Wait until a task is available or stop signal is received */
+                        std::unique_lock<std::mutex> lock(td->queue_mutex);
+
+                        td->cv.wait(lock, [td]() {
+                            return td->stop || !td->task_queue.empty();
+                        });
+
+                        if (td->stop && td->task_queue.empty()) {
+                            return;
+                        }
+
+                        /* Fetch a task from the queue */
+                        task = std::move(td->task_queue.front());
+                        td->task_queue.pop();
+                    }
+
+                    task(); /* runs without holding queue_mutex */
+                }
+            });
+
+            threads.push_back(std::move(thread_data));
+        }
+    }
+
+    /** Queue f on the next worker in round-robin order. f is invoked with no arguments. */
+    template<typename F>
+    void schedule_work(F&& f) {
+        active_tasks++;
+
+        /* Round-robin thread assignment */
+        size_t thread_idx = (next_thread++) % threads.size();
+        auto thread_data = threads[thread_idx].get();
+
+        auto task = [this, f = std::forward<F>(f)]() {
+            try {
+                f();
+            } catch (const std::exception& e) {
+                VTR_LOG_ERROR("Thread %zu failed task with error: %s\n",
+                              std::hash<std::thread::id>{}(std::this_thread::get_id()), e.what());
+                throw;
+            } catch (...) {
+                VTR_LOG_ERROR("Thread %zu failed task with unknown error\n",
+                              std::hash<std::thread::id>{}(std::this_thread::get_id()));
+                throw;
+            }
+
+            if (--active_tasks == 0) {
+                /* Lock so the notify cannot fall between the waiter's predicate check and its wait */
+                std::lock_guard<std::mutex> lock(completion_mutex);
+                completion_cv.notify_all();
+            }
+        };
+
+        /* Queue new task */
+        {
+            std::lock_guard<std::mutex> lock(thread_data->queue_mutex);
+            thread_data->task_queue.push(std::move(task));
+        }
+        thread_data->cv.notify_one();
+    }
+
+    /** Block until active_tasks drops to zero, i.e. every scheduled task has returned. */
+    void wait_for_all() {
+        std::unique_lock<std::mutex> lock(completion_mutex);
+        completion_cv.wait(lock, [this]() { return active_tasks == 0; });
+    }
+
+    ~thread_pool() {
+        /* Signal every worker to stop; a worker only exits once its queue is empty */
+        for (auto& thread_data : threads) {
+            {
+                std::lock_guard<std::mutex> lock(thread_data->queue_mutex);
+                thread_data->stop = true;
+            }
+            thread_data->cv.notify_one();
+        }
+        for (auto& thread_data : threads) {
+            if (thread_data->thread.joinable()) {
+                thread_data->thread.join();
+            }
+        }
+    }
+};
+
+} // namespace vtr
diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp
index fc16e7c8ff2..1524af7ed83 100644
--- a/vpr/src/base/ShowSetup.cpp
+++ b/vpr/src/base/ShowSetup.cpp
@@ -252,6 +252,9 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
 
     VTR_LOG("RouterOpts.router_algorithm: ");
     switch (RouterOpts.router_algorithm) {
+        case NESTED:
+            VTR_LOG("NESTED\n");
+            break;
         case PARALLEL:
             VTR_LOG("PARALLEL\n");
             break;
diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
index e14b32e0295..553788ce528 100644
--- a/vpr/src/base/read_options.cpp
+++ b/vpr/src/base/read_options.cpp
@@ -282,7 +282,9 @@ struct ParseRoutePredictor {
 struct ParseRouterAlgorithm {
     ConvertedValue<e_router_algorithm> from_str(const std::string& str) {
         ConvertedValue<e_router_algorithm> conv_value;
-        if (str == "parallel")
+        if (str == "nested")
+            conv_value.set_value(NESTED);
+        else if (str == "parallel")
             conv_value.set_value(PARALLEL);
         else if (str == "parallel_decomp")
             conv_value.set_value(PARALLEL_DECOMP);
@@ -298,8 +300,12 @@ struct ParseRouterAlgorithm {
 
     ConvertedValue<std::string> to_str(e_router_algorithm val) {
         ConvertedValue<std::string> conv_value;
-        if (val == PARALLEL)
+        if (val == NESTED)
+            conv_value.set_value("nested");
+        else if (val == PARALLEL)
             conv_value.set_value("parallel");
+        else if (val == PARALLEL_DECOMP)
+            conv_value.set_value("parallel_decomp");
         else {
             VTR_ASSERT(val == TIMING_DRIVEN);
             conv_value.set_value("timing_driven");
@@ -2548,9 +2554,10 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
             "Specifies the router algorithm to use.\n"
             " * timing driven: focuses on routability and circuit speed [default]\n"
             " * parallel: timing_driven with nets in different regions of the chip routed in parallel\n"
-            " * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n")
+            " * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n"
+            " * nested: parallel with parallelized path search\n")
         .default_value("timing_driven")
-        .choices({"parallel", "parallel_decomp", "timing_driven"})
+        .choices({"nested", "parallel", "parallel_decomp", "timing_driven"})
         .show_in(argparse::ShowIn::HELP_ONLY);
 
     route_grp.add_argument(args.min_incremental_reroute_fanout, "--min_incremental_reroute_fanout")
diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h
index eb0b7305df0..1fbfd52ebcd 100644
--- a/vpr/src/base/vpr_types.h
+++ b/vpr/src/base/vpr_types.h
@@ -1124,6 +1124,7 @@ struct t_ap_opts {
  * read_rr_graph_name:  stores the file name of the rr graph to be read by vpr */
 
 enum e_router_algorithm {
+    NESTED,
     PARALLEL,
     PARALLEL_DECOMP,
     TIMING_DRIVEN,
diff --git a/vpr/src/route/NestedNetlistRouter.h b/vpr/src/route/NestedNetlistRouter.h
new file mode 100644
index 00000000000..6870842af8f
--- /dev/null
+++ b/vpr/src/route/NestedNetlistRouter.h
@@ -0,0 +1,134 @@
+#pragma once
+
+/** @file Nested parallel case for NetlistRouter */
+#include "netlist_routers.h"
+#include "vtr_optional.h"
+#include "vtr_thread_pool.h"
+#include <unordered_map>
+
+/* Worker count passed to the router's vtr::thread_pool. TODO: add a cmd line option for this */
+constexpr int MAX_THREADS = 4;
+
+/** NetlistRouter implementation with nested parallelism.
+ *
+ * Calls a parallel ConnectionRouter for route_net to extract even more parallelism.
+ * The main reason why this is a different router instead of templating NetlistRouter
+ * on ConnectionRouter is this router does not use TBB. The scheduling performance is
+ * worse, but it can wait in individual tasks now (which is not possible with TBB).
+ *
+ * Holds enough context members to glue together ConnectionRouter and net routing functions,
+ * such as \ref route_net. ConnectionRouters and RouteIterResults are kept per worker thread,
+ * in maps keyed by std::thread::id whose lookups are guarded by _storage_mutex.
+ * See \ref route_net. */
+template<typename HeapType>
+class NestedNetlistRouter : public NetlistRouter {
+  public:
+    NestedNetlistRouter(
+        const Netlist<>& net_list,
+        const RouterLookahead* router_lookahead,
+        const t_router_opts& router_opts,
+        CBRR& connections_inf,
+        NetPinsMatrix<float>& net_delay,
+        const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
+        std::shared_ptr<SetupHoldTimingInfo> timing_info,
+        NetPinTimingInvalidator* pin_timing_invalidator,
+        route_budgets& budgeting_inf,
+        const RoutingPredictor& routing_predictor,
+        const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots,
+        bool is_flat)
+        : _net_list(net_list)
+        , _router_lookahead(router_lookahead)
+        , _router_opts(router_opts)
+        , _connections_inf(connections_inf)
+        , _net_delay(net_delay)
+        , _netlist_pin_lookup(netlist_pin_lookup)
+        , _timing_info(timing_info)
+        , _pin_timing_invalidator(pin_timing_invalidator)
+        , _budgeting_inf(budgeting_inf)
+        , _routing_predictor(routing_predictor)
+        , _choking_spots(choking_spots)
+        , _is_flat(is_flat)
+        , _thread_pool(MAX_THREADS) {}
+    ~NestedNetlistRouter() = default;
+
+    /** Run a single iteration of netlist routing for this->_net_list: the partition tree is
+     * handed to the thread pool and \ref route_net is called for the nets of each node.
+     * \return RouteIterResults merged from all worker threads for this iteration. */
+    RouteIterResults route_netlist(int itry, float pres_fac, float worst_neg_slack);
+    /** Pass the nets whose bounding boxes were updated on to the PartitionTree */
+    void handle_bb_updated_nets(const std::vector<ParentNetId>& nets);
+
+    /** Set rcv_enabled on every ConnectionRouter created so far */
+    void set_rcv_enabled(bool x);
+    /** Replace the timing_info that is handed to subsequent route_net calls */
+    void set_timing_info(std::shared_ptr<SetupHoldTimingInfo> timing_info);
+
+  private:
+    /** Route the nets of one PartitionTree node, then queue both of its children as new tasks. */
+    void route_partition_tree_node(PartitionTreeNode& node);
+
+    ConnectionRouter<HeapType> _make_router(const RouterLookahead* router_lookahead, bool is_flat) {
+        auto& device = g_vpr_ctx.device();
+        auto& routing = g_vpr_ctx.mutable_routing();
+
+        return ConnectionRouter<HeapType>(
+            device.grid,
+            *router_lookahead,
+            device.rr_graph.rr_nodes(),
+            &device.rr_graph,
+            device.rr_rc_data,
+            device.rr_graph.rr_switch(),
+            routing.rr_node_route_inf,
+            is_flat);
+    }
+
+    /* Routing context captured at construction. Most of it is forwarded to route_net (see route_net.tpp) */
+    const Netlist<>& _net_list;
+    const RouterLookahead* _router_lookahead;
+    const t_router_opts& _router_opts;
+    CBRR& _connections_inf;
+    NetPinsMatrix<float>& _net_delay;
+    const ClusteredPinAtomPinsLookup& _netlist_pin_lookup;
+    std::shared_ptr<SetupHoldTimingInfo> _timing_info;
+    NetPinTimingInvalidator* _pin_timing_invalidator;
+    route_budgets& _budgeting_inf;
+    const RoutingPredictor& _routing_predictor;
+    const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& _choking_spots;
+    bool _is_flat;
+
+    /** Parameters of the iteration in progress; written at the start of \see route_netlist() */
+    int _itry;
+    float _pres_fac;
+    float _worst_neg_slack;
+
+    /** The partition tree. Empty until the first route_netlist() call builds it */
+    vtr::optional<PartitionTree> _tree;
+
+    /** Pool that runs the per-node routing tasks. See vtr_thread_pool.h for implementation */
+    vtr::thread_pool _thread_pool;
+
+    /* Per-thread storage, keyed by std::thread::id.
+     * These are maps because thread::id is an opaque value instead of 1, 2, ... */
+    std::unordered_map<std::thread::id, ConnectionRouter<HeapType>> _routers_th;
+    std::unordered_map<std::thread::id, RouteIterResults> _results_th;
+    std::mutex _storage_mutex;
+
+    /** Get the calling thread's ConnectionRouter, building it on first use.
+     * The id->router lookup is done while holding _storage_mutex. */
+    ConnectionRouter<HeapType>& get_thread_router() {
+        const std::thread::id tid = std::this_thread::get_id();
+        std::lock_guard<std::mutex> guard(_storage_mutex);
+        auto slot = _routers_th.find(tid);
+        if (slot == _routers_th.end())
+            slot = _routers_th.emplace(tid, _make_router(_router_lookahead, _is_flat)).first;
+        return slot->second;
+    }
+
+    RouteIterResults& get_thread_results() {
+        const std::thread::id tid = std::this_thread::get_id();
+        std::lock_guard<std::mutex> guard(_storage_mutex);
+        return _results_th[tid]; // operator[] default-constructs the entry on first use
+    }
+};
+
+#include "NestedNetlistRouter.tpp"
diff --git a/vpr/src/route/NestedNetlistRouter.tpp b/vpr/src/route/NestedNetlistRouter.tpp
new file mode 100644
index 00000000000..333be28ea3b
--- /dev/null
+++ b/vpr/src/route/NestedNetlistRouter.tpp
@@ -0,0 +1,141 @@
+#pragma once
+
+/** @file Impls for NestedNetlistRouter */
+
+#include <string>
+#include "netlist_routers.h"
+#include "route_net.h"
+#include "vtr_time.h"
+
+template<typename HeapType>
+inline RouteIterResults NestedNetlistRouter<HeapType>::route_netlist(int itry, float pres_fac, float worst_neg_slack) {
+    /* Start every existing per-thread result slot from a default-constructed RouteIterResults */
+    for (auto& [thread_id, partial] : _results_th) {
+        partial = RouteIterResults();
+    }
+
+    /* Cache the iteration parameters as members so the recursive tasks can read them without passing them around */
+    _worst_neg_slack = worst_neg_slack;
+    _pres_fac = pres_fac;
+    _itry = itry;
+
+    /* Organize netlist into a PartitionTree (only done while _tree is still empty).
+     * Nets in a given level of nodes are guaranteed to not have any overlapping bounding boxes, so they can be routed in parallel. */
+    vtr::Timer timer;
+    if (!_tree) {
+        _tree = PartitionTree(_net_list);
+        PartitionTreeDebug::log("Iteration " + std::to_string(itry) + ": built partition tree in " + std::to_string(timer.elapsed_sec()) + " s");
+    }
+
+    /* Seed the pool with the root node only: route_partition_tree_node
+     * schedules the child nodes itself after routing a node */
+    _thread_pool.schedule_work([this]() {
+        route_partition_tree_node(_tree->root());
+    });
+
+    /* Block until the pool reports no outstanding tasks */
+    _thread_pool.wait_for_all();
+
+    PartitionTreeDebug::log("Routing all nets took " + std::to_string(timer.elapsed_sec()) + " s");
+
+    /* Merge the per-thread results into one RouteIterResults */
+    RouteIterResults out;
+    for (auto& [thread_id, partial] : _results_th) {
+        out.stats.combine(partial.stats);
+        out.rerouted_nets.insert(out.rerouted_nets.end(), partial.rerouted_nets.begin(), partial.rerouted_nets.end());
+        out.bb_updated_nets.insert(out.bb_updated_nets.end(), partial.bb_updated_nets.begin(), partial.bb_updated_nets.end());
+        out.is_routable &= partial.is_routable;
+    }
+    return out;
+}
+
+/** Route the nets of one PartitionTree node on the calling thread, then schedule both children. */
+template<typename HeapType>
+void NestedNetlistRouter<HeapType>::route_partition_tree_node(PartitionTreeNode& node) {
+    auto& route_ctx = g_vpr_ctx.mutable_routing();
+
+    /* node.nets is an unordered set, copy into vector to sort */
+    std::vector<ParentNetId> nets(node.nets.begin(), node.nets.end());
+
+    /* Sort so net with most sinks is routed first. */
+    std::stable_sort(nets.begin(), nets.end(), [&](ParentNetId id1, ParentNetId id2) -> bool {
+        return _net_list.net_sinks(id1).size() > _net_list.net_sinks(id2).size();
+    });
+
+    vtr::Timer timer;
+
+    /* One task runs on one thread: take the mutex-guarded per-thread lookups once, not per net */
+    auto& results = get_thread_results();
+    auto& router = get_thread_router();
+    /* Route all nets in this node serially */
+    for (auto net_id : nets) {
+        auto flags = route_net(
+            router,
+            _net_list,
+            net_id,
+            _itry,
+            _pres_fac,
+            _router_opts,
+            _connections_inf,
+            results.stats,
+            _net_delay,
+            _netlist_pin_lookup,
+            _timing_info.get(),
+            _pin_timing_invalidator,
+            _budgeting_inf,
+            _worst_neg_slack,
+            _routing_predictor,
+            _choking_spots[net_id],
+            _is_flat,
+            route_ctx.route_bb[net_id]);
+
+        if (!flags.success && !flags.retry_with_full_bb) {
+            /* Disconnected RRG and ConnectionRouter doesn't think growing the BB will work.
+             * Returning here also means the children of this node are never scheduled */
+            results.is_routable = false;
+            return;
+        }
+        if (flags.retry_with_full_bb) {
+            /* ConnectionRouter thinks we should grow the BB. Do that and leave this net unrouted for now */
+            route_ctx.route_bb[net_id] = full_device_bb();
+            results.bb_updated_nets.push_back(net_id);
+            continue;
+        }
+        if (flags.was_rerouted) {
+            results.rerouted_nets.push_back(net_id);
+        }
+    }
+
+    PartitionTreeDebug::log("Node with " + std::to_string(node.nets.size()) + " nets and "
+                            + std::to_string(node.vnets.size()) + " virtual nets routed in "
+                            + std::to_string(timer.elapsed_sec()) + " s");
+
+    /* Schedule child nodes as new tasks */
+    if (node.left && node.right) {
+        _thread_pool.schedule_work([this, left = node.left.get()]() {
+            route_partition_tree_node(*left);
+        });
+        _thread_pool.schedule_work([this, right = node.right.get()]() {
+            route_partition_tree_node(*right);
+        });
+    } else {
+        VTR_ASSERT(!node.left && !node.right); // there shouldn't be a node with a single branch
+    }
+}
+
+template<typename HeapType>
+void NestedNetlistRouter<HeapType>::handle_bb_updated_nets(const std::vector<ParentNetId>& nets) {
+    VTR_ASSERT(_tree); // _tree is only populated inside route_netlist()
+    _tree->update_nets(nets); // forwards the net ids unchanged to PartitionTree::update_nets
+}
+
+template<typename HeapType>
+void NestedNetlistRouter<HeapType>::set_rcv_enabled(bool x) {
+    /* NOTE(review): only routers already stored in _routers_th are updated; routers that get_thread_router() builds later never see x - confirm this is intended */
+    for (auto& entry : _routers_th)
+        entry.second.set_rcv_enabled(x);
+}
+
+template<typename HeapType>
+void NestedNetlistRouter<HeapType>::set_timing_info(std::shared_ptr<SetupHoldTimingInfo> timing_info) {
+    _timing_info = timing_info; // only the stored pointer changes; route_partition_tree_node passes _timing_info.get() to route_net
+}
diff --git a/vpr/src/route/netlist_routers.h b/vpr/src/route/netlist_routers.h
index 436343179b2..d64477f03ad 100644
--- a/vpr/src/route/netlist_routers.h
+++ b/vpr/src/route/netlist_routers.h
@@ -71,6 +71,7 @@ class NetlistRouter {
 
 /* Include the derived classes here to get the HeapType-templated impls */
 #include "SerialNetlistRouter.h"
+#include "NestedNetlistRouter.h"
 #ifdef VPR_USE_TBB
 #include "ParallelNetlistRouter.h"
 #include "DecompNetlistRouter.h"
@@ -104,6 +105,20 @@ inline std::unique_ptr<NetlistRouter> make_netlist_router_with_heap(
             routing_predictor,
             choking_spots,
             is_flat);
+    } else if (router_opts.router_algorithm == e_router_algorithm::NESTED) {
+        return std::make_unique<NestedNetlistRouter<HeapType>>(
+            net_list,
+            router_lookahead,
+            router_opts,
+            connections_inf,
+            net_delay,
+            netlist_pin_lookup,
+            timing_info,
+            pin_timing_invalidator,
+            budgeting_inf,
+            routing_predictor,
+            choking_spots,
+            is_flat);
     } else if (router_opts.router_algorithm == e_router_algorithm::PARALLEL) {
 #ifdef VPR_USE_TBB
         return std::make_unique<ParallelNetlistRouter<HeapType>>(