Skip to content

Add a NestedNetlistRouter to enable integration with fine grained parallel router #2924

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 159 additions & 0 deletions libs/libvtrutil/src/vtr_thread_pool.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#pragma once

/**
* @file vtr_thread_pool.h
* @brief A generic thread pool for parallel task execution
*/

#include <thread>
#include <queue>
#include <mutex>
#include <condition_variable>
#include <memory>
#include <atomic>
#include <functional>
#include <cstddef>
#include <vector>
#include "vtr_log.h"
#include "vtr_time.h"

namespace vtr {

/**
* A thread pool for parallel task execution. It is a naive
* implementation which uses a queue for each thread and assigns
* tasks in a round robin fashion.
*
* Example usage:
*
* vtr::thread_pool pool(4);
* pool.schedule_work([]{
* // Task body
* });
* pool.wait_for_all(); // There's no API to wait for a single task
*/
class thread_pool {
  private:
    /* Per-worker state. Held behind a unique_ptr so the mutex/cv addresses
     * stay stable while the `threads` vector grows. */
    struct ThreadData {
        std::thread thread;
        /* Per-thread task queue */
        std::queue<std::function<void()>> task_queue;

        /* Protects task_queue and stop; required for the condition variable */
        std::mutex queue_mutex;
        /* Signaled when a task is pushed to this worker's queue or when
         * a stop is requested */
        std::condition_variable task_available_cv;
        /* Set by ~thread_pool() to tell this worker to exit once its
         * queue is drained */
        bool stop = false;
    };

    /* Container for thread-local data */
    std::vector<std::unique_ptr<ThreadData>> threads;
    /* Round-robin cursor for schedule_work() */
    std::atomic<size_t> next_thread{0};
    /* Number of scheduled-but-unfinished tasks; drives wait_for_all() */
    std::atomic<size_t> active_tasks{0};

    /* completion_mutex + completion_cv let wait_for_all() sleep until
     * all queued tasks have finished */
    std::mutex completion_mutex;
    std::condition_variable completion_cv;

  public:
    /** Spawn thread_count workers. Each worker owns one task queue and
     * loops: wait for a task (or the stop flag), pop it, run it. */
    thread_pool(size_t thread_count) {
        threads.reserve(thread_count);

        for (size_t i = 0; i < thread_count; i++) {
            auto thread_data = std::make_unique<ThreadData>();

            /* Capture the raw ThreadData* by value. Capturing the local
             * unique_ptr by reference would dangle: it is moved into
             * `threads` right after the thread starts running. */
            ThreadData* td = thread_data.get();
            thread_data->thread = std::thread([td]() {
                while (true) {
                    std::function<void()> task;

                    { /* Wait until a task is available or stop signal is received */
                        std::unique_lock<std::mutex> lock(td->queue_mutex);

                        td->task_available_cv.wait(lock, [td]() {
                            return td->stop || !td->task_queue.empty();
                        });

                        /* Drain any remaining tasks before honoring stop */
                        if (td->stop && td->task_queue.empty()) {
                            return;
                        }

                        /* Fetch a task from the queue */
                        task = std::move(td->task_queue.front());
                        td->task_queue.pop();
                    }

                    /* Run the task outside the lock (it times itself) */
                    task();
                }
            });

            threads.push_back(std::move(thread_data));
        }
    }

    /** Queue a task on one of the workers (round-robin assignment).
     *
     * Exceptions escaping the task are logged and swallowed rather than
     * rethrown: rethrowing on a worker thread would call std::terminate,
     * and would also skip the active_tasks decrement, deadlocking
     * wait_for_all(). */
    template<typename F>
    void schedule_work(F&& f) {
        active_tasks++;

        /* Round-robin thread assignment */
        size_t thread_idx = (next_thread++) % threads.size();
        ThreadData* thread_data = threads[thread_idx].get();

        auto task = [this, f = std::forward<F>(f)]() {
            vtr::Timer task_timer;

            try {
                f();
            } catch (const std::exception& e) {
                /* std::thread::id is not a %zu-compatible type: hash it first */
                VTR_LOG_ERROR("Thread %zu failed task with error: %s\n",
                              std::hash<std::thread::id>()(std::this_thread::get_id()), e.what());
            } catch (...) {
                VTR_LOG_ERROR("Thread %zu failed task with unknown error\n",
                              std::hash<std::thread::id>()(std::this_thread::get_id()));
            }

            size_t remaining = --active_tasks;
            if (remaining == 0) {
                /* Notify while holding completion_mutex so the wakeup cannot
                 * be lost between wait_for_all()'s predicate check and its
                 * call to wait() */
                std::lock_guard<std::mutex> lock(completion_mutex);
                completion_cv.notify_all();
            }
        };

        /* Queue new task */
        {
            std::lock_guard<std::mutex> lock(thread_data->queue_mutex);
            thread_data->task_queue.push(std::move(task));
        }
        thread_data->task_available_cv.notify_one();
    }

    /** Block until every task scheduled so far has finished. */
    void wait_for_all() {
        std::unique_lock<std::mutex> lock(completion_mutex);
        completion_cv.wait(lock, [this]() { return active_tasks == 0; });
    }

    ~thread_pool() {
        /* Stop all threads */
        for (auto& thread_data : threads) {
            {
                std::lock_guard<std::mutex> lock(thread_data->queue_mutex);
                thread_data->stop = true;
            }
            thread_data->task_available_cv.notify_one();
        }

        for (auto& thread_data : threads) {
            if (thread_data->thread.joinable()) {
                thread_data->thread.join();
            }
        }
    }
};

} // namespace vtr
3 changes: 3 additions & 0 deletions vpr/src/base/ShowSetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,9 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {

VTR_LOG("RouterOpts.router_algorithm: ");
switch (RouterOpts.router_algorithm) {
case NESTED:
VTR_LOG("NESTED\n");
break;
case PARALLEL:
VTR_LOG("PARALLEL\n");
break;
Expand Down
15 changes: 11 additions & 4 deletions vpr/src/base/read_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,9 @@ struct ParseRoutePredictor {
struct ParseRouterAlgorithm {
ConvertedValue<e_router_algorithm> from_str(const std::string& str) {
ConvertedValue<e_router_algorithm> conv_value;
if (str == "parallel")
if (str == "nested")
conv_value.set_value(NESTED);
else if (str == "parallel")
conv_value.set_value(PARALLEL);
else if (str == "parallel_decomp")
conv_value.set_value(PARALLEL_DECOMP);
Expand All @@ -298,8 +300,12 @@ struct ParseRouterAlgorithm {

ConvertedValue<std::string> to_str(e_router_algorithm val) {
ConvertedValue<std::string> conv_value;
if (val == PARALLEL)
if (val == NESTED)
conv_value.set_value("nested");
else if (val == PARALLEL)
conv_value.set_value("parallel");
else if (val == PARALLEL_DECOMP)
conv_value.set_value("parallel_decomp");
else {
VTR_ASSERT(val == TIMING_DRIVEN);
conv_value.set_value("timing_driven");
Expand Down Expand Up @@ -2548,9 +2554,10 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
"Specifies the router algorithm to use.\n"
" * timing driven: focuses on routability and circuit speed [default]\n"
" * parallel: timing_driven with nets in different regions of the chip routed in parallel\n"
" * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n")
" * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n"
" * nested: parallel with parallelized path search\n")
.default_value("timing_driven")
.choices({"parallel", "parallel_decomp", "timing_driven"})
.choices({"nested", "parallel", "parallel_decomp", "timing_driven"})
.show_in(argparse::ShowIn::HELP_ONLY);

route_grp.add_argument(args.min_incremental_reroute_fanout, "--min_incremental_reroute_fanout")
Expand Down
1 change: 1 addition & 0 deletions vpr/src/base/vpr_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -1124,6 +1124,7 @@ struct t_ap_opts {
* read_rr_graph_name: stores the file name of the rr graph to be read by vpr */

enum e_router_algorithm {
NESTED,
PARALLEL,
PARALLEL_DECOMP,
TIMING_DRIVEN,
Expand Down
134 changes: 134 additions & 0 deletions vpr/src/route/NestedNetlistRouter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#pragma once

/** @file Nested parallel case for NetlistRouter */
#include "netlist_routers.h"
#include "vtr_optional.h"
#include "vtr_thread_pool.h"
#include <unordered_map>

/* Add cmd line option for this later */
constexpr int MAX_THREADS = 4;

/** Nested parallel impl for NetlistRouter.
*
* Calls a parallel ConnectionRouter for route_net to extract even more parallelism.
* The main reason why this is a different router instead of templating NetlistRouter
* on ConnectionRouter is this router does not use TBB. The scheduling performance is
* worse, but it can wait in individual tasks now (which is not possible with TBB).
*
* Holds enough context members to glue together ConnectionRouter and net routing functions,
* such as \ref route_net. Keeps the members in thread-local storage where needed,
* i.e. ConnectionRouters and RouteIterResults-es.
* See \ref route_net. */
template<typename HeapType>
class NestedNetlistRouter : public NetlistRouter {
public:
NestedNetlistRouter(
const Netlist<>& net_list,
const RouterLookahead* router_lookahead,
const t_router_opts& router_opts,
CBRR& connections_inf,
NetPinsMatrix<float>& net_delay,
const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
std::shared_ptr<SetupHoldTimingInfo> timing_info,
NetPinTimingInvalidator* pin_timing_invalidator,
route_budgets& budgeting_inf,
const RoutingPredictor& routing_predictor,
const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots,
bool is_flat)
: _net_list(net_list)
, _router_lookahead(router_lookahead)
, _router_opts(router_opts)
, _connections_inf(connections_inf)
, _net_delay(net_delay)
, _netlist_pin_lookup(netlist_pin_lookup)
, _timing_info(timing_info)
, _pin_timing_invalidator(pin_timing_invalidator)
, _budgeting_inf(budgeting_inf)
, _routing_predictor(routing_predictor)
, _choking_spots(choking_spots)
, _is_flat(is_flat)
, _thread_pool(MAX_THREADS) {}
~NestedNetlistRouter() {}

/** Run a single iteration of netlist routing for this->_net_list. This usually means calling
* \ref route_net for each net, which will handle other global updates.
* \return RouteIterResults for this iteration. */
RouteIterResults route_netlist(int itry, float pres_fac, float worst_neg_slack);
/** Inform the PartitionTree of the nets with updated bounding boxes */
void handle_bb_updated_nets(const std::vector<ParentNetId>& nets);

/** Set rcv_enabled for each ConnectionRouter this is managing */
void set_rcv_enabled(bool x);
/** Set timing_info for each ConnectionRouter this is managing */
void set_timing_info(std::shared_ptr<SetupHoldTimingInfo> timing_info);

private:
/** Route all nets in a PartitionTree node and add its children to the task queue. */
void route_partition_tree_node(PartitionTreeNode& node);

ConnectionRouter<HeapType> _make_router(const RouterLookahead* router_lookahead, bool is_flat) {
auto& device_ctx = g_vpr_ctx.device();
auto& route_ctx = g_vpr_ctx.mutable_routing();

return ConnectionRouter<HeapType>(
device_ctx.grid,
*router_lookahead,
device_ctx.rr_graph.rr_nodes(),
&device_ctx.rr_graph,
device_ctx.rr_rc_data,
device_ctx.rr_graph.rr_switch(),
route_ctx.rr_node_route_inf,
is_flat);
}

/* Context fields. Most of them will be forwarded to route_net (see route_net.tpp) */
const Netlist<>& _net_list;
const RouterLookahead* _router_lookahead;
const t_router_opts& _router_opts;
CBRR& _connections_inf;
NetPinsMatrix<float>& _net_delay;
const ClusteredPinAtomPinsLookup& _netlist_pin_lookup;
std::shared_ptr<SetupHoldTimingInfo> _timing_info;
NetPinTimingInvalidator* _pin_timing_invalidator;
route_budgets& _budgeting_inf;
const RoutingPredictor& _routing_predictor;
const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& _choking_spots;
bool _is_flat;

/** Cached routing parameters for current iteration (inputs to \see route_netlist()) */
int _itry;
float _pres_fac;
float _worst_neg_slack;

/** The partition tree. Holds the groups of nets for each partition */
vtr::optional<PartitionTree> _tree;

/** Thread pool for parallel routing. See vtr_thread_pool.h for implementation */
vtr::thread_pool _thread_pool;

/* Thread-local storage.
* These are maps because thread::id is a random integer instead of 1, 2, ... */
std::unordered_map<std::thread::id, ConnectionRouter<HeapType>> _routers_th;
std::unordered_map<std::thread::id, RouteIterResults> _results_th;
std::mutex _storage_mutex;

/** Get a thread-local ConnectionRouter. We lock the id->router lookup, but this is
* accessed once per partition so the overhead should be small */
ConnectionRouter<HeapType>& get_thread_router() {
auto id = std::this_thread::get_id();
std::lock_guard<std::mutex> lock(_storage_mutex);
if (!_routers_th.count(id)) {
_routers_th.emplace(id, _make_router(_router_lookahead, _is_flat));
}
return _routers_th.at(id);
}

RouteIterResults& get_thread_results() {
auto id = std::this_thread::get_id();
std::lock_guard<std::mutex> lock(_storage_mutex);
return _results_th[id];
}
};

#include "NestedNetlistRouter.tpp"
Loading