From 0c7800b5e3b3de38512f1f3cb2c9ba33813b6338 Mon Sep 17 00:00:00 2001
From: Duck Deux
Date: Thu, 30 Jan 2025 12:37:30 -0500
Subject: [PATCH] add NestedNetlistRouter and custom thread pool

---
 libs/libvtrutil/src/vtr_thread_pool.h | 183 ++++++++++++++++++++++++++
 vpr/src/base/ShowSetup.cpp            |   3 +
 vpr/src/base/read_options.cpp         |  15 ++-
 vpr/src/base/vpr_types.h              |   1 +
 vpr/src/route/NestedNetlistRouter.h   | 147 ++++++++++++++++++++++
 vpr/src/route/NestedNetlistRouter.tpp | 147 +++++++++++++++++++++++
 vpr/src/route/netlist_routers.h       |  15 +++
 7 files changed, 507 insertions(+), 4 deletions(-)
 create mode 100644 libs/libvtrutil/src/vtr_thread_pool.h
 create mode 100644 vpr/src/route/NestedNetlistRouter.h
 create mode 100644 vpr/src/route/NestedNetlistRouter.tpp

diff --git a/libs/libvtrutil/src/vtr_thread_pool.h b/libs/libvtrutil/src/vtr_thread_pool.h
new file mode 100644
index 00000000000..2185ac196e6
--- /dev/null
+++ b/libs/libvtrutil/src/vtr_thread_pool.h
@@ -0,0 +1,183 @@
+#pragma once
+
+/**
+ * @file vtr_thread_pool.h
+ * @brief A generic thread pool for parallel task execution
+ */
+
+#include <thread>
+#include <queue>
+#include <mutex>
+#include <condition_variable>
+#include <memory>
+#include <atomic>
+#include <functional>
+#include <cstddef>
+#include <vector>
+#include "vtr_log.h"
+#include "vtr_time.h"
+
+namespace vtr {
+
+/**
+ * A thread pool for parallel task execution. It is a naive
+ * implementation which uses a queue for each thread and assigns
+ * tasks in a round robin fashion.
+ *
+ * Tasks may schedule more tasks on the same pool. An exception that
+ * escapes a task is logged and rethrown on the worker thread.
+ *
+ * Example usage:
+ *
+ * vtr::thread_pool pool(4);
+ * pool.schedule_work([]{
+ *     // Task body
+ * });
+ * pool.wait_for_all(); // There's no API to wait for a single task
+ */
+class thread_pool {
+  private:
+    /* Thread-local data */
+    struct ThreadData {
+        std::thread thread;
+        /* Per-thread task queue */
+        std::queue<std::function<void()>> task_queue;
+
+        /* Threads wait on cv for a stop signal or a new task
+         * queue_mutex is required for condition variable */
+        std::mutex queue_mutex;
+        std::condition_variable cv;
+        bool stop = false;
+    };
+
+    /* Container for thread-local data */
+    std::vector<std::unique_ptr<ThreadData>> threads;
+    /* Used for round-robin scheduling */
+    std::atomic<size_t> next_thread{0};
+    /* Used for wait_for_all */
+    std::atomic<size_t> active_tasks{0};
+
+    /* Condition variable for wait_for_all */
+    std::mutex completion_mutex;
+    std::condition_variable completion_cv;
+
+    /** Printable value for the calling thread: std::thread::id is a class
+     * type, so it cannot be handed to a "%zu" conversion directly */
+    static size_t current_thread_hash() {
+        return std::hash<std::thread::id>{}(std::this_thread::get_id());
+    }
+
+  public:
+    /** Start the worker threads.
+     * @param thread_count Number of workers. 0 is treated as 1, since
+     *                     schedule_work() needs at least one queue. */
+    thread_pool(size_t thread_count) {
+        if (thread_count == 0) {
+            thread_count = 1;
+        }
+        threads.reserve(thread_count);
+
+        for (size_t i = 0; i < thread_count; i++) {
+            auto thread_data = std::make_unique<ThreadData>();
+
+            /* Capture the raw pointer by value: thread_data is a loop local which
+             * is moved from below, so the worker must not hold a reference to it.
+             * The pointee is owned by `threads` and joined before it is freed. */
+            thread_data->thread = std::thread([td = thread_data.get()]() {
+                while (true) {
+                    std::function<void()> task;
+
+                    { /* Wait until a task is available or stop signal is received */
+                        std::unique_lock<std::mutex> lock(td->queue_mutex);
+
+                        td->cv.wait(lock, [td]() {
+                            return td->stop || !td->task_queue.empty();
+                        });
+
+                        if (td->stop && td->task_queue.empty()) {
+                            return;
+                        }
+
+                        /* Fetch a task from the queue */
+                        task = std::move(td->task_queue.front());
+                        td->task_queue.pop();
+                    }
+
+                    /* Run outside the lock so the task can schedule more work */
+                    task();
+                }
+            });
+
+            threads.push_back(std::move(thread_data));
+        }
+    }
+
+    /** Queue a task on the next thread in round-robin order.
+     * @param f Callable taking no arguments. Safe to call from inside a task. */
+    template<typename F>
+    void schedule_work(F&& f) {
+        active_tasks++;
+
+        /* Round-robin thread assignment */
+        size_t thread_idx = (next_thread++) % threads.size();
+        auto thread_data = threads[thread_idx].get();
+
+        auto task = [this, f = std::forward<F>(f)]() {
+            try {
+                f();
+            } catch (const std::exception& e) {
+                VTR_LOG_ERROR("Thread %zu failed task with error: %s\n",
+                              current_thread_hash(), e.what());
+                throw;
+            } catch (...) {
+                VTR_LOG_ERROR("Thread %zu failed task with unknown error\n",
+                              current_thread_hash());
+                throw;
+            }
+
+            size_t remaining = --active_tasks;
+            if (remaining == 0) {
+                /* Take the mutex before notifying: otherwise the notification can
+                 * fire between the predicate check and the sleep in wait_for_all(),
+                 * and the waiter would never wake up. */
+                std::lock_guard<std::mutex> lock(completion_mutex);
+                completion_cv.notify_all();
+            }
+        };
+
+        /* Queue new task */
+        {
+            std::lock_guard<std::mutex> lock(thread_data->queue_mutex);
+            thread_data->task_queue.push(std::move(task));
+        }
+        thread_data->cv.notify_one();
+    }
+
+    /** Block until every scheduled task, including tasks scheduled by other
+     * tasks, has finished. */
+    void wait_for_all() {
+        std::unique_lock<std::mutex> lock(completion_mutex);
+        completion_cv.wait(lock, [this]() { return active_tasks == 0; });
+    }
+
+    /** Ask the workers to stop and join them. Workers drain their queues
+     * before exiting. */
+    ~thread_pool() {
+        /* Stop all threads */
+        for (auto& thread_data : threads) {
+            {
+                std::lock_guard<std::mutex> lock(thread_data->queue_mutex);
+                thread_data->stop = true;
+            }
+            thread_data->cv.notify_one();
+        }
+
+        for (auto& thread_data : threads) {
+            if (thread_data->thread.joinable()) {
+                thread_data->thread.join();
+            }
+        }
+    }
+};
+
+} // namespace vtr
diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp
index fc16e7c8ff2..1524af7ed83 100644
--- a/vpr/src/base/ShowSetup.cpp
+++ b/vpr/src/base/ShowSetup.cpp
@@ -252,6 +252,9 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
 
     VTR_LOG("RouterOpts.router_algorithm: ");
     switch (RouterOpts.router_algorithm) {
+        case NESTED:
+            VTR_LOG("NESTED\n");
+            break;
         case PARALLEL:
             VTR_LOG("PARALLEL\n");
             break;
diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
index e14b32e0295..553788ce528 100644
--- a/vpr/src/base/read_options.cpp
+++ b/vpr/src/base/read_options.cpp
@@ -282,7 +282,9 @@ struct ParseRoutePredictor {
 struct ParseRouterAlgorithm {
     ConvertedValue<e_router_algorithm> from_str(const std::string& str) {
         ConvertedValue<e_router_algorithm> conv_value;
-        if (str == "parallel")
+        if (str == "nested")
+            conv_value.set_value(NESTED);
+        else if (str == "parallel")
             conv_value.set_value(PARALLEL);
         else if (str == "parallel_decomp")
             conv_value.set_value(PARALLEL_DECOMP);
@@ -298,8 +300,12 @@ struct ParseRouterAlgorithm {
 
     ConvertedValue<std::string> to_str(e_router_algorithm val) {
         ConvertedValue<std::string> conv_value;
-        if (val == PARALLEL)
+        if (val == NESTED)
+            conv_value.set_value("nested");
+        else if (val == PARALLEL)
             conv_value.set_value("parallel");
+        else if (val == PARALLEL_DECOMP)
+            conv_value.set_value("parallel_decomp");
         else {
             VTR_ASSERT(val == TIMING_DRIVEN);
             conv_value.set_value("timing_driven");
@@ -2548,9 +2554,10 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
             "Specifies the router algorithm to use.\n"
             " * timing driven: focuses on routability and circuit speed [default]\n"
             " * parallel: timing_driven with nets in different regions of the chip routed in parallel\n"
-            " * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n")
+            " * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n"
+            " * nested: parallel with parallelized path search\n")
         .default_value("timing_driven")
-        .choices({"parallel", "parallel_decomp", "timing_driven"})
+        .choices({"nested", "parallel", "parallel_decomp", "timing_driven"})
         .show_in(argparse::ShowIn::HELP_ONLY);
 
     route_grp.add_argument(args.min_incremental_reroute_fanout, "--min_incremental_reroute_fanout")
diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h
index eb0b7305df0..1fbfd52ebcd 100644
--- a/vpr/src/base/vpr_types.h
+++ b/vpr/src/base/vpr_types.h
@@ -1124,6 +1124,7 @@ struct t_ap_opts {
  * read_rr_graph_name: stores the file name of the rr graph to be read by vpr */
 
 enum e_router_algorithm {
+    NESTED,
     PARALLEL,
     PARALLEL_DECOMP,
     TIMING_DRIVEN,
diff --git a/vpr/src/route/NestedNetlistRouter.h b/vpr/src/route/NestedNetlistRouter.h
new file mode 100644
index 00000000000..6870842af8f
--- /dev/null
+++ b/vpr/src/route/NestedNetlistRouter.h
@@ -0,0 +1,147 @@
+#pragma once
+
+/** @file Nested parallel case for NetlistRouter */
+#include "netlist_routers.h"
+#include "vtr_optional.h"
+#include "vtr_thread_pool.h"
+#include <unordered_map>
+
+/* Add cmd line option for this later */
+constexpr int MAX_THREADS = 4;
+
+/** Nested parallel impl for NetlistRouter.
+ *
+ * Calls a parallel ConnectionRouter for route_net to extract even more parallelism.
+ * The main reason why this is a different router instead of templating NetlistRouter
+ * on ConnectionRouter is this router does not use TBB. The scheduling performance is
+ * worse, but it can wait in individual tasks now (which is not possible with TBB).
+ *
+ * Holds enough context members to glue together ConnectionRouter and net routing functions,
+ * such as \ref route_net. Keeps the members in thread-local storage where needed,
+ * i.e. ConnectionRouters and RouteIterResults-es.
+ * See \ref route_net. */
+template<typename HeapType>
+class NestedNetlistRouter : public NetlistRouter {
+  public:
+    NestedNetlistRouter(
+        const Netlist<>& net_list,
+        const RouterLookahead* router_lookahead,
+        const t_router_opts& router_opts,
+        CBRR& connections_inf,
+        NetPinsMatrix<float>& net_delay,
+        const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
+        std::shared_ptr<SetupHoldTimingInfo> timing_info,
+        NetPinTimingInvalidator* pin_timing_invalidator,
+        route_budgets& budgeting_inf,
+        const RoutingPredictor& routing_predictor,
+        const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots,
+        bool is_flat)
+        : _net_list(net_list)
+        , _router_lookahead(router_lookahead)
+        , _router_opts(router_opts)
+        , _connections_inf(connections_inf)
+        , _net_delay(net_delay)
+        , _netlist_pin_lookup(netlist_pin_lookup)
+        , _timing_info(timing_info)
+        , _pin_timing_invalidator(pin_timing_invalidator)
+        , _budgeting_inf(budgeting_inf)
+        , _routing_predictor(routing_predictor)
+        , _choking_spots(choking_spots)
+        , _is_flat(is_flat)
+        , _thread_pool(MAX_THREADS) {}
+    ~NestedNetlistRouter() {}
+
+    /** Run a single iteration of netlist routing for this->_net_list. This usually means calling
+     * \ref route_net for each net, which will handle other global updates.
+     * \return RouteIterResults for this iteration. */
+    RouteIterResults route_netlist(int itry, float pres_fac, float worst_neg_slack);
+    /** Inform the PartitionTree of the nets with updated bounding boxes */
+    void handle_bb_updated_nets(const std::vector<ParentNetId>& nets);
+
+    /** Set rcv_enabled for each ConnectionRouter this is managing, including the
+     * ones which are created later for threads that have not routed yet */
+    void set_rcv_enabled(bool x);
+    /** Set the timing_info which is forwarded to \ref route_net */
+    void set_timing_info(std::shared_ptr<SetupHoldTimingInfo> timing_info);
+
+  private:
+    /** Route all nets in a PartitionTree node and add its children to the task queue. */
+    void route_partition_tree_node(PartitionTreeNode& node);
+
+    /** Build a ConnectionRouter from the global device and routing contexts */
+    ConnectionRouter<HeapType> _make_router(const RouterLookahead* router_lookahead, bool is_flat) {
+        auto& device_ctx = g_vpr_ctx.device();
+        auto& route_ctx = g_vpr_ctx.mutable_routing();
+
+        return ConnectionRouter<HeapType>(
+            device_ctx.grid,
+            *router_lookahead,
+            device_ctx.rr_graph.rr_nodes(),
+            &device_ctx.rr_graph,
+            device_ctx.rr_rc_data,
+            device_ctx.rr_graph.rr_switch(),
+            route_ctx.rr_node_route_inf,
+            is_flat);
+    }
+
+    /* Context fields. Most of them will be forwarded to route_net (see route_net.tpp) */
+    const Netlist<>& _net_list;
+    const RouterLookahead* _router_lookahead;
+    const t_router_opts& _router_opts;
+    CBRR& _connections_inf;
+    NetPinsMatrix<float>& _net_delay;
+    const ClusteredPinAtomPinsLookup& _netlist_pin_lookup;
+    std::shared_ptr<SetupHoldTimingInfo> _timing_info;
+    NetPinTimingInvalidator* _pin_timing_invalidator;
+    route_budgets& _budgeting_inf;
+    const RoutingPredictor& _routing_predictor;
+    const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& _choking_spots;
+    bool _is_flat;
+
+    /** Cached routing parameters for current iteration (inputs to \see route_netlist()) */
+    int _itry = 0;
+    float _pres_fac = 0.f;
+    float _worst_neg_slack = 0.f;
+
+    /** The partition tree. Holds the groups of nets for each partition */
+    vtr::optional<PartitionTree> _tree;
+
+    /** Last value passed to set_rcv_enabled(), applied to routers created afterwards */
+    bool _rcv_enabled = false;
+
+    /* Thread-local storage.
+     * These are maps because thread::id is a random integer instead of 1, 2, ... */
+    std::unordered_map<std::thread::id, ConnectionRouter<HeapType>> _routers_th;
+    std::unordered_map<std::thread::id, RouteIterResults> _results_th;
+    std::mutex _storage_mutex;
+
+    /** Thread pool for parallel routing. See vtr_thread_pool.h for implementation.
+     * Declared after the storage above so that the workers are joined before
+     * the storage they use is destroyed */
+    vtr::thread_pool _thread_pool;
+
+    /** Get a thread-local ConnectionRouter. We lock the id->router lookup, but this is
+     * accessed once per partition so the overhead should be small */
+    ConnectionRouter<HeapType>& get_thread_router() {
+        auto id = std::this_thread::get_id();
+        std::lock_guard<std::mutex> lock(_storage_mutex);
+        auto it = _routers_th.find(id);
+        if (it == _routers_th.end()) {
+            it = _routers_th.emplace(id, _make_router(_router_lookahead, _is_flat)).first;
+            /* Routers are created lazily: replay the last set_rcv_enabled() call */
+            if (_rcv_enabled) {
+                it->second.set_rcv_enabled(true);
+            }
+        }
+        return it->second;
+    }
+
+    /** Get the thread-local RouteIterResults, creating it on first use */
+    RouteIterResults& get_thread_results() {
+        auto id = std::this_thread::get_id();
+        std::lock_guard<std::mutex> lock(_storage_mutex);
+        return _results_th[id];
+    }
+};
+
+#include "NestedNetlistRouter.tpp"
diff --git a/vpr/src/route/NestedNetlistRouter.tpp b/vpr/src/route/NestedNetlistRouter.tpp
new file mode 100644
index 00000000000..333be28ea3b
--- /dev/null
+++ b/vpr/src/route/NestedNetlistRouter.tpp
@@ -0,0 +1,147 @@
+#pragma once
+
+/** @file Impls for NestedNetlistRouter */
+
+#include <algorithm>
+#include <string>
+#include "netlist_routers.h"
+#include "route_net.h"
+#include "vtr_time.h"
+
+template<typename HeapType>
+inline RouteIterResults NestedNetlistRouter<HeapType>::route_netlist(int itry, float pres_fac, float worst_neg_slack) {
+    /* Reset results for each thread */
+    for (auto& [_, results] : _results_th) {
+        results = RouteIterResults();
+    }
+
+    /* Set the routing parameters: they won't change until the next call and that saves us the trouble of passing them around */
+    _itry = itry;
+    _pres_fac = pres_fac;
+    _worst_neg_slack = worst_neg_slack;
+
+    /* Organize netlist into a PartitionTree.
+     * Nets in a given level of nodes are guaranteed to not have any overlapping bounding boxes, so they can be routed in parallel. */
+    vtr::Timer timer;
+    if (!_tree) {
+        _tree = PartitionTree(_net_list);
+        PartitionTreeDebug::log("Iteration " + std::to_string(itry) + ": built partition tree in " + std::to_string(timer.elapsed_sec()) + " s");
+    }
+
+    /* Push a single route_partition_tree_node task to the thread pool,
+     * which will recursively schedule the rest of the tree */
+    _thread_pool.schedule_work([this]() {
+        route_partition_tree_node(_tree->root());
+    });
+
+    /* Wait for all tasks in the thread pool to complete */
+    _thread_pool.wait_for_all();
+
+    PartitionTreeDebug::log("Routing all nets took " + std::to_string(timer.elapsed_sec()) + " s");
+
+    /* Combine results from threads */
+    RouteIterResults out;
+    for (auto& [_, results] : _results_th) {
+        out.stats.combine(results.stats);
+        out.rerouted_nets.insert(out.rerouted_nets.end(), results.rerouted_nets.begin(), results.rerouted_nets.end());
+        out.bb_updated_nets.insert(out.bb_updated_nets.end(), results.bb_updated_nets.begin(), results.bb_updated_nets.end());
+        out.is_routable &= results.is_routable;
+    }
+    return out;
+}
+
+template<typename HeapType>
+void NestedNetlistRouter<HeapType>::route_partition_tree_node(PartitionTreeNode& node) {
+    auto& route_ctx = g_vpr_ctx.mutable_routing();
+
+    /* node.nets is an unordered set, copy into vector to sort */
+    std::vector<ParentNetId> nets(node.nets.begin(), node.nets.end());
+
+    /* Sort so net with most sinks is routed first. */
+    std::stable_sort(nets.begin(), nets.end(), [&](ParentNetId id1, ParentNetId id2) -> bool {
+        return _net_list.net_sinks(id1).size() > _net_list.net_sinks(id2).size();
+    });
+
+    vtr::Timer timer;
+
+    /* This task runs on a single thread: look up its storage once per node
+     * instead of taking _storage_mutex for every net */
+    auto& results = get_thread_results();
+    auto& router = get_thread_router();
+
+    /* Route all nets in this node serially */
+    for (auto net_id : nets) {
+        auto flags = route_net(
+            router,
+            _net_list,
+            net_id,
+            _itry,
+            _pres_fac,
+            _router_opts,
+            _connections_inf,
+            results.stats,
+            _net_delay,
+            _netlist_pin_lookup,
+            _timing_info.get(),
+            _pin_timing_invalidator,
+            _budgeting_inf,
+            _worst_neg_slack,
+            _routing_predictor,
+            _choking_spots[net_id],
+            _is_flat,
+            route_ctx.route_bb[net_id]);
+
+        if (!flags.success && !flags.retry_with_full_bb) {
+            /* Disconnected RRG and ConnectionRouter doesn't think growing the BB will work */
+            results.is_routable = false;
+            return;
+        }
+        if (flags.retry_with_full_bb) {
+            /* ConnectionRouter thinks we should grow the BB. Do that and leave this net unrouted for now */
+            route_ctx.route_bb[net_id] = full_device_bb();
+            results.bb_updated_nets.push_back(net_id);
+            continue;
+        }
+        if (flags.was_rerouted) {
+            results.rerouted_nets.push_back(net_id);
+        }
+    }
+
+    PartitionTreeDebug::log("Node with " + std::to_string(node.nets.size())
+                            + " nets and " + std::to_string(node.vnets.size())
+                            + " virtual nets routed in " + std::to_string(timer.elapsed_sec())
+                            + " s");
+
+    /* Schedule child nodes as new tasks */
+    if (node.left && node.right) {
+        _thread_pool.schedule_work([this, left = node.left.get()]() {
+            route_partition_tree_node(*left);
+        });
+        _thread_pool.schedule_work([this, right = node.right.get()]() {
+            route_partition_tree_node(*right);
+        });
+    } else {
+        VTR_ASSERT(!node.left && !node.right); // there shouldn't be a node with a single branch
+    }
+}
+
+template<typename HeapType>
+void NestedNetlistRouter<HeapType>::handle_bb_updated_nets(const std::vector<ParentNetId>& nets) {
+    VTR_ASSERT(_tree);
+    _tree->update_nets(nets);
+}
+
+template<typename HeapType>
+void NestedNetlistRouter<HeapType>::set_rcv_enabled(bool x) {
+    std::lock_guard<std::mutex> lock(_storage_mutex);
+    /* Remember the setting for routers which are created later (see get_thread_router) */
+    _rcv_enabled = x;
+    for (auto& [_, router] : _routers_th) {
+        router.set_rcv_enabled(x);
+    }
+}
+
+template<typename HeapType>
+void NestedNetlistRouter<HeapType>::set_timing_info(std::shared_ptr<SetupHoldTimingInfo> timing_info) {
+    _timing_info = timing_info;
+}
diff --git a/vpr/src/route/netlist_routers.h b/vpr/src/route/netlist_routers.h
index 436343179b2..d64477f03ad 100644
--- a/vpr/src/route/netlist_routers.h
+++ b/vpr/src/route/netlist_routers.h
@@ -71,6 +71,7 @@ class NetlistRouter {
 
 /* Include the derived classes here to get the HeapType-templated impls */
 #include "SerialNetlistRouter.h"
+#include "NestedNetlistRouter.h"
 #ifdef VPR_USE_TBB
 #include "ParallelNetlistRouter.h"
 #include "DecompNetlistRouter.h"
@@ -104,6 +105,20 @@ inline std::unique_ptr<NetlistRouter> make_netlist_router_with_heap(
             routing_predictor,
             choking_spots,
             is_flat);
+    } else if (router_opts.router_algorithm == e_router_algorithm::NESTED) {
+        return std::make_unique<NestedNetlistRouter<HeapType>>(
+            net_list,
+            router_lookahead,
+            router_opts,
+            connections_inf,
+            net_delay,
+            netlist_pin_lookup,
+            timing_info,
+            pin_timing_invalidator,
+            budgeting_inf,
+            routing_predictor,
+            choking_spots,
+            is_flat);
     } else if (router_opts.router_algorithm == e_router_algorithm::PARALLEL) {
 #ifdef VPR_USE_TBB
         return std::make_unique<ParallelNetlistRouter<HeapType>>(