Skip to content

Add a NestedNetlistRouter to enable integration with fine grained parallel router #2924

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 159 additions & 0 deletions libs/libvtrutil/src/vtr_thread_pool.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#pragma once

/**
* @file vtr_thread_pool.h
* @brief A generic thread pool for parallel task execution
*/

#include <thread>
#include <queue>
#include <mutex>
#include <condition_variable>
#include <memory>
#include <atomic>
#include <functional>
#include <cstddef>
#include <vector>
#include "vtr_log.h"
#include "vtr_time.h"

namespace vtr {

/**
* A thread pool for parallel task execution. It is a naive
* implementation which uses a queue for each thread and assigns
* tasks in a round robin fashion.
*
* Example usage:
*
* vtr::thread_pool pool(4);
* pool.schedule_work([]{
* // Task body
* });
* pool.wait_for_all(); // There's no API to wait for a single task
*/
class thread_pool {
  private:
    /* Per-worker state. Held behind a unique_ptr so the mutex/cv addresses
     * stay stable while the `threads` vector grows. */
    struct ThreadData {
        std::thread thread;
        /* Per-thread task queue */
        std::queue<std::function<void()>> task_queue;

        /* Protects task_queue and stop; required for the condition variable */
        std::mutex queue_mutex;
        /* Signaled when a task is pushed to this worker's queue or when
         * a stop is requested */
        std::condition_variable task_available_cv;
        /* Set by ~thread_pool() to tell this worker to exit once its
         * queue is drained */
        bool stop = false;
    };

    /* Container for thread-local data */
    std::vector<std::unique_ptr<ThreadData>> threads;
    /* Round-robin cursor for schedule_work() */
    std::atomic<size_t> next_thread{0};
    /* Number of scheduled-but-unfinished tasks; drives wait_for_all() */
    std::atomic<size_t> active_tasks{0};

    /* completion_mutex + completion_cv let wait_for_all() sleep until
     * all queued tasks have finished */
    std::mutex completion_mutex;
    std::condition_variable completion_cv;

  public:
    /** Spawn thread_count workers. Each worker owns one task queue and
     * loops: wait for a task (or the stop flag), pop it, run it. */
    thread_pool(size_t thread_count) {
        threads.reserve(thread_count);

        for (size_t i = 0; i < thread_count; i++) {
            auto thread_data = std::make_unique<ThreadData>();

            /* Capture the raw ThreadData* by value. Capturing the local
             * unique_ptr by reference would dangle: it is moved into
             * `threads` right after the thread starts running. */
            ThreadData* td = thread_data.get();
            thread_data->thread = std::thread([td]() {
                while (true) {
                    std::function<void()> task;

                    { /* Wait until a task is available or stop signal is received */
                        std::unique_lock<std::mutex> lock(td->queue_mutex);

                        td->task_available_cv.wait(lock, [td]() {
                            return td->stop || !td->task_queue.empty();
                        });

                        /* Drain any remaining tasks before honoring stop */
                        if (td->stop && td->task_queue.empty()) {
                            return;
                        }

                        /* Fetch a task from the queue */
                        task = std::move(td->task_queue.front());
                        td->task_queue.pop();
                    }

                    /* Run the task outside the lock (it times itself) */
                    task();
                }
            });

            threads.push_back(std::move(thread_data));
        }
    }

    /** Queue a task on one of the workers (round-robin assignment).
     *
     * Exceptions escaping the task are logged and swallowed rather than
     * rethrown: rethrowing on a worker thread would call std::terminate,
     * and would also skip the active_tasks decrement, deadlocking
     * wait_for_all(). */
    template<typename F>
    void schedule_work(F&& f) {
        active_tasks++;

        /* Round-robin thread assignment */
        size_t thread_idx = (next_thread++) % threads.size();
        ThreadData* thread_data = threads[thread_idx].get();

        auto task = [this, f = std::forward<F>(f)]() {
            vtr::Timer task_timer;

            try {
                f();
            } catch (const std::exception& e) {
                /* std::thread::id is not a %zu-compatible type: hash it first */
                VTR_LOG_ERROR("Thread %zu failed task with error: %s\n",
                              std::hash<std::thread::id>()(std::this_thread::get_id()), e.what());
            } catch (...) {
                VTR_LOG_ERROR("Thread %zu failed task with unknown error\n",
                              std::hash<std::thread::id>()(std::this_thread::get_id()));
            }

            size_t remaining = --active_tasks;
            if (remaining == 0) {
                /* Notify while holding completion_mutex so the wakeup cannot
                 * be lost between wait_for_all()'s predicate check and its
                 * call to wait() */
                std::lock_guard<std::mutex> lock(completion_mutex);
                completion_cv.notify_all();
            }
        };

        /* Queue new task */
        {
            std::lock_guard<std::mutex> lock(thread_data->queue_mutex);
            thread_data->task_queue.push(std::move(task));
        }
        thread_data->task_available_cv.notify_one();
    }

    /** Block until every task scheduled so far has finished. */
    void wait_for_all() {
        std::unique_lock<std::mutex> lock(completion_mutex);
        completion_cv.wait(lock, [this]() { return active_tasks == 0; });
    }

    ~thread_pool() {
        /* Stop all threads */
        for (auto& thread_data : threads) {
            {
                std::lock_guard<std::mutex> lock(thread_data->queue_mutex);
                thread_data->stop = true;
            }
            thread_data->task_available_cv.notify_one();
        }

        for (auto& thread_data : threads) {
            if (thread_data->thread.joinable()) {
                thread_data->thread.join();
            }
        }
    }
};

} // namespace vtr
3 changes: 3 additions & 0 deletions vpr/src/base/ShowSetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,9 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {

VTR_LOG("RouterOpts.router_algorithm: ");
switch (RouterOpts.router_algorithm) {
case NESTED:
VTR_LOG("NESTED\n");
break;
case PARALLEL:
VTR_LOG("PARALLEL\n");
break;
Expand Down
15 changes: 11 additions & 4 deletions vpr/src/base/read_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,9 @@ struct ParseRoutePredictor {
struct ParseRouterAlgorithm {
ConvertedValue<e_router_algorithm> from_str(const std::string& str) {
ConvertedValue<e_router_algorithm> conv_value;
if (str == "parallel")
if (str == "nested")
conv_value.set_value(NESTED);
else if (str == "parallel")
conv_value.set_value(PARALLEL);
else if (str == "parallel_decomp")
conv_value.set_value(PARALLEL_DECOMP);
Expand All @@ -298,8 +300,12 @@ struct ParseRouterAlgorithm {

ConvertedValue<std::string> to_str(e_router_algorithm val) {
ConvertedValue<std::string> conv_value;
if (val == PARALLEL)
if (val == NESTED)
conv_value.set_value("nested");
else if (val == PARALLEL)
conv_value.set_value("parallel");
else if (val == PARALLEL_DECOMP)
conv_value.set_value("parallel_decomp");
else {
VTR_ASSERT(val == TIMING_DRIVEN);
conv_value.set_value("timing_driven");
Expand Down Expand Up @@ -2548,9 +2554,10 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
"Specifies the router algorithm to use.\n"
" * timing driven: focuses on routability and circuit speed [default]\n"
" * parallel: timing_driven with nets in different regions of the chip routed in parallel\n"
" * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n")
" * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n"
" * nested: parallel with parallelized path search\n")
.default_value("timing_driven")
.choices({"parallel", "parallel_decomp", "timing_driven"})
.choices({"nested", "parallel", "parallel_decomp", "timing_driven"})
.show_in(argparse::ShowIn::HELP_ONLY);

route_grp.add_argument(args.min_incremental_reroute_fanout, "--min_incremental_reroute_fanout")
Expand Down
1 change: 1 addition & 0 deletions vpr/src/base/vpr_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -1124,6 +1124,7 @@ struct t_ap_opts {
* read_rr_graph_name: stores the file name of the rr graph to be read by vpr */

enum e_router_algorithm {
NESTED,
PARALLEL,
PARALLEL_DECOMP,
TIMING_DRIVEN,
Expand Down
134 changes: 134 additions & 0 deletions vpr/src/route/NestedNetlistRouter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#pragma once

/** @file Nested parallel case for NetlistRouter */
#include "netlist_routers.h"
#include "vtr_optional.h"
#include "vtr_thread_pool.h"
#include <unordered_map>

/* Add cmd line option for this later */
constexpr int MAX_THREADS = 4;

/** Nested parallel impl for NetlistRouter.
*
* Calls a parallel ConnectionRouter for route_net to extract even more parallelism.
* The main reason why this is a different router instead of templating NetlistRouter
* on ConnectionRouter is this router does not use TBB. The scheduling performance is
* worse, but it can wait in individual tasks now (which is not possible with TBB).
*
* Holds enough context members to glue together ConnectionRouter and net routing functions,
* such as \ref route_net. Keeps the members in thread-local storage where needed,
* i.e. ConnectionRouters and RouteIterResults-es.
* See \ref route_net. */
template<typename HeapType>
class NestedNetlistRouter : public NetlistRouter {
public:
NestedNetlistRouter(
const Netlist<>& net_list,
const RouterLookahead* router_lookahead,
const t_router_opts& router_opts,
CBRR& connections_inf,
NetPinsMatrix<float>& net_delay,
const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
std::shared_ptr<SetupHoldTimingInfo> timing_info,
NetPinTimingInvalidator* pin_timing_invalidator,
route_budgets& budgeting_inf,
const RoutingPredictor& routing_predictor,
const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots,
bool is_flat)
: _net_list(net_list)
, _router_lookahead(router_lookahead)
, _router_opts(router_opts)
, _connections_inf(connections_inf)
, _net_delay(net_delay)
, _netlist_pin_lookup(netlist_pin_lookup)
, _timing_info(timing_info)
, _pin_timing_invalidator(pin_timing_invalidator)
, _budgeting_inf(budgeting_inf)
, _routing_predictor(routing_predictor)
, _choking_spots(choking_spots)
, _is_flat(is_flat)
, _thread_pool(MAX_THREADS) {}
~NestedNetlistRouter() {}

/** Run a single iteration of netlist routing for this->_net_list. This usually means calling
* \ref route_net for each net, which will handle other global updates.
* \return RouteIterResults for this iteration. */
RouteIterResults route_netlist(int itry, float pres_fac, float worst_neg_slack);
/** Inform the PartitionTree of the nets with updated bounding boxes */
void handle_bb_updated_nets(const std::vector<ParentNetId>& nets);

/** Set rcv_enabled for each ConnectionRouter this is managing */
void set_rcv_enabled(bool x);
/** Set timing_info for each ConnectionRouter this is managing */
void set_timing_info(std::shared_ptr<SetupHoldTimingInfo> timing_info);

private:
/** Route all nets in a PartitionTree node and add its children to the task queue. */
void route_partition_tree_node(PartitionTreeNode& node);

ConnectionRouter<HeapType> _make_router(const RouterLookahead* router_lookahead, bool is_flat) {
auto& device_ctx = g_vpr_ctx.device();
auto& route_ctx = g_vpr_ctx.mutable_routing();

return ConnectionRouter<HeapType>(
device_ctx.grid,
*router_lookahead,
device_ctx.rr_graph.rr_nodes(),
&device_ctx.rr_graph,
device_ctx.rr_rc_data,
device_ctx.rr_graph.rr_switch(),
route_ctx.rr_node_route_inf,
is_flat);
}

/* Context fields. Most of them will be forwarded to route_net (see route_net.tpp) */
const Netlist<>& _net_list;
const RouterLookahead* _router_lookahead;
const t_router_opts& _router_opts;
CBRR& _connections_inf;
NetPinsMatrix<float>& _net_delay;
const ClusteredPinAtomPinsLookup& _netlist_pin_lookup;
std::shared_ptr<SetupHoldTimingInfo> _timing_info;
NetPinTimingInvalidator* _pin_timing_invalidator;
route_budgets& _budgeting_inf;
const RoutingPredictor& _routing_predictor;
const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& _choking_spots;
bool _is_flat;

/** Cached routing parameters for current iteration (inputs to \see route_netlist()) */
int _itry;
float _pres_fac;
float _worst_neg_slack;

/** The partition tree. Holds the groups of nets for each partition */
vtr::optional<PartitionTree> _tree;

/** Thread pool for parallel routing. See vtr_thread_pool.h for implementation */
vtr::thread_pool _thread_pool;

/* Thread-local storage.
* These are maps because thread::id is a random integer instead of 1, 2, ... */
std::unordered_map<std::thread::id, ConnectionRouter<HeapType>> _routers_th;
std::unordered_map<std::thread::id, RouteIterResults> _results_th;
std::mutex _storage_mutex;

/** Get a thread-local ConnectionRouter. We lock the id->router lookup, but this is
* accessed once per partition so the overhead should be small */
ConnectionRouter<HeapType>& get_thread_router() {
auto id = std::this_thread::get_id();
std::lock_guard<std::mutex> lock(_storage_mutex);
if (!_routers_th.count(id)) {
_routers_th.emplace(id, _make_router(_router_lookahead, _is_flat));
}
return _routers_th.at(id);
}

RouteIterResults& get_thread_results() {
auto id = std::this_thread::get_id();
std::lock_guard<std::mutex> lock(_storage_mutex);
return _results_th[id];
}
};

#include "NestedNetlistRouter.tpp"
Loading