
Commit 2311863

Merge pull request #2924 from verilog-to-routing/custom-thread-pool
Add a NestedNetlistRouter to enable integration with fine grained parallel router
2 parents 332a9b6 + 0c7800b commit 2311863

7 files changed: +464 -4 lines changed


libs/libvtrutil/src/vtr_thread_pool.h

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
#pragma once

/**
 * @file vtr_thread_pool.h
 * @brief A generic thread pool for parallel task execution
 */

#include <thread>
#include <queue>
#include <mutex>
#include <condition_variable>
#include <memory>
#include <atomic>
#include <functional>
#include <cstddef>
#include <vector>
#include "vtr_log.h"
#include "vtr_time.h"

namespace vtr {

/**
 * A thread pool for parallel task execution. It is a naive
 * implementation which uses a queue for each thread and assigns
 * tasks in a round robin fashion.
 *
 * Example usage:
 *
 *     vtr::thread_pool pool(4);
 *     pool.schedule_work([]{
 *         // Task body
 *     });
 *     pool.wait_for_all(); // There's no API to wait for a single task
 */
class thread_pool {
  private:
    /* Thread-local data */
    struct ThreadData {
        std::thread thread;
        /* Per-thread task queue */
        std::queue<std::function<void()>> task_queue;

        /* Threads wait on cv for a stop signal or a new task
         * queue_mutex is required for condition variable */
        std::mutex queue_mutex;
        std::condition_variable cv;
        bool stop = false;
    };

    /* Container for thread-local data */
    std::vector<std::unique_ptr<ThreadData>> threads;
    /* Used for round-robin scheduling */
    std::atomic<size_t> next_thread{0};
    /* Used for wait_for_all */
    std::atomic<size_t> active_tasks{0};

    /* Condition variable for wait_for_all */
    std::mutex completion_mutex;
    std::condition_variable completion_cv;

  public:
    thread_pool(size_t thread_count) {
        threads.reserve(thread_count);

        for (size_t i = 0; i < thread_count; i++) {
            auto thread_data = std::make_unique<ThreadData>();

            thread_data->thread = std::thread([&]() {
                ThreadData* td = thread_data.get();

                while (true) {
                    std::function<void()> task;

                    { /* Wait until a task is available or stop signal is received */
                        std::unique_lock<std::mutex> lock(td->queue_mutex);

                        td->cv.wait(lock, [td]() {
                            return td->stop || !td->task_queue.empty();
                        });

                        if (td->stop && td->task_queue.empty()) {
                            return;
                        }

                        /* Fetch a task from the queue */
                        task = std::move(td->task_queue.front());
                        td->task_queue.pop();
                    }

                    vtr::Timer task_timer;
                    task();
                }
            });

            threads.push_back(std::move(thread_data));
        }
    }

    template<typename F>
    void schedule_work(F&& f) {
        active_tasks++;

        /* Round-robin thread assignment */
        size_t thread_idx = (next_thread++) % threads.size();
        auto thread_data = threads[thread_idx].get();

        auto task = [this, f = std::forward<F>(f)]() {
            vtr::Timer task_timer;

            try {
                f();
            } catch (const std::exception& e) {
                VTR_LOG_ERROR("Thread %zu failed task with error: %s\n",
                              std::this_thread::get_id(), e.what());
                throw;
            } catch (...) {
                VTR_LOG_ERROR("Thread %zu failed task with unknown error\n",
                              std::this_thread::get_id());
                throw;
            }

            size_t remaining = --active_tasks;
            if (remaining == 0) {
                completion_cv.notify_all();
            }
        };

        /* Queue new task */
        {
            std::lock_guard<std::mutex> lock(thread_data->queue_mutex);
            thread_data->task_queue.push(std::move(task));
        }
        thread_data->cv.notify_one();
    }

    void wait_for_all() {
        std::unique_lock<std::mutex> lock(completion_mutex);
        completion_cv.wait(lock, [this]() { return active_tasks == 0; });
    }

    ~thread_pool() {
        /* Stop all threads */
        for (auto& thread_data : threads) {
            {
                std::lock_guard<std::mutex> lock(thread_data->queue_mutex);
                thread_data->stop = true;
            }
            thread_data->cv.notify_one();
        }

        for (auto& thread_data : threads) {
            if (thread_data->thread.joinable()) {
                thread_data->thread.join();
            }
        }
    }
};

} // namespace vtr
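Usage note: a minimal sketch of how client code might drive this pool, building only on the schedule_work()/wait_for_all() API shown above. The helper square_all and its chunking scheme are illustrative, not part of this commit.

#include <algorithm>
#include <cstddef>
#include <vector>
#include "vtr_thread_pool.h"

/* Square every element of `values` in parallel, one contiguous chunk per task.
 * Each task is handed to a worker round-robin by the pool. */
void square_all(std::vector<int>& values, size_t n_threads = 4) {
    vtr::thread_pool pool(n_threads);
    const size_t chunk = (values.size() + n_threads - 1) / n_threads;

    for (size_t begin = 0; begin < values.size(); begin += chunk) {
        const size_t end = std::min(begin + chunk, values.size());
        /* Capturing `values` by reference is safe: wait_for_all() below keeps
         * it alive until every task has finished. */
        pool.schedule_work([&values, begin, end]() {
            for (size_t i = begin; i < end; i++) {
                values[i] *= values[i];
            }
        });
    }

    pool.wait_for_all(); /* blocks until active_tasks drops to zero */
}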

vpr/src/base/ShowSetup.cpp

Lines changed: 3 additions & 0 deletions
@@ -252,6 +252,9 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
 
     VTR_LOG("RouterOpts.router_algorithm: ");
     switch (RouterOpts.router_algorithm) {
+        case NESTED:
+            VTR_LOG("NESTED\n");
+            break;
         case PARALLEL:
             VTR_LOG("PARALLEL\n");
             break;

vpr/src/base/read_options.cpp

Lines changed: 11 additions & 4 deletions
@@ -282,7 +282,9 @@ struct ParseRoutePredictor {
 struct ParseRouterAlgorithm {
     ConvertedValue<e_router_algorithm> from_str(const std::string& str) {
         ConvertedValue<e_router_algorithm> conv_value;
-        if (str == "parallel")
+        if (str == "nested")
+            conv_value.set_value(NESTED);
+        else if (str == "parallel")
             conv_value.set_value(PARALLEL);
         else if (str == "parallel_decomp")
             conv_value.set_value(PARALLEL_DECOMP);
@@ -298,8 +300,12 @@ struct ParseRouterAlgorithm {
 
     ConvertedValue<std::string> to_str(e_router_algorithm val) {
         ConvertedValue<std::string> conv_value;
-        if (val == PARALLEL)
+        if (val == NESTED)
+            conv_value.set_value("nested");
+        else if (val == PARALLEL)
             conv_value.set_value("parallel");
+        else if (val == PARALLEL_DECOMP)
+            conv_value.set_value("parallel_decomp");
         else {
             VTR_ASSERT(val == TIMING_DRIVEN);
             conv_value.set_value("timing_driven");
@@ -2548,9 +2554,10 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
             "Specifies the router algorithm to use.\n"
             " * timing driven: focuses on routability and circuit speed [default]\n"
             " * parallel: timing_driven with nets in different regions of the chip routed in parallel\n"
-            " * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n")
+            " * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n"
+            " * nested: parallel with parallelized path search\n")
         .default_value("timing_driven")
-        .choices({"parallel", "parallel_decomp", "timing_driven"})
+        .choices({"nested", "parallel", "parallel_decomp", "timing_driven"})
         .show_in(argparse::ShowIn::HELP_ONLY);
 
     route_grp.add_argument(args.min_incremental_reroute_fanout, "--min_incremental_reroute_fanout")
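Usage note: with the parser and choices above, the new mode is selected through VPR's router algorithm option. A representative invocation (architecture and circuit file names are placeholders, and the flag name is taken from VPR's existing router options rather than shown in this hunk):

vpr my_arch.xml my_circuit.blif --router_algorithm nested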

vpr/src/base/vpr_types.h

Lines changed: 1 addition & 0 deletions
@@ -1124,6 +1124,7 @@ struct t_ap_opts {
  * read_rr_graph_name: stores the file name of the rr graph to be read by vpr */
 
 enum e_router_algorithm {
+    NESTED,
     PARALLEL,
     PARALLEL_DECOMP,
     TIMING_DRIVEN,

vpr/src/route/NestedNetlistRouter.h

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
#pragma once

/** @file Nested parallel case for NetlistRouter */
#include "netlist_routers.h"
#include "vtr_optional.h"
#include "vtr_thread_pool.h"
#include <unordered_map>

/* Add cmd line option for this later */
constexpr int MAX_THREADS = 4;

/** Nested parallel impl for NetlistRouter.
 *
 * Calls a parallel ConnectionRouter for route_net to extract even more parallelism.
 * The main reason why this is a different router instead of templating NetlistRouter
 * on ConnectionRouter is this router does not use TBB. The scheduling performance is
 * worse, but it can wait in individual tasks now (which is not possible with TBB).
 *
 * Holds enough context members to glue together ConnectionRouter and net routing functions,
 * such as \ref route_net. Keeps the members in thread-local storage where needed,
 * i.e. ConnectionRouters and RouteIterResults-es.
 * See \ref route_net. */
template<typename HeapType>
class NestedNetlistRouter : public NetlistRouter {
  public:
    NestedNetlistRouter(
        const Netlist<>& net_list,
        const RouterLookahead* router_lookahead,
        const t_router_opts& router_opts,
        CBRR& connections_inf,
        NetPinsMatrix<float>& net_delay,
        const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
        std::shared_ptr<SetupHoldTimingInfo> timing_info,
        NetPinTimingInvalidator* pin_timing_invalidator,
        route_budgets& budgeting_inf,
        const RoutingPredictor& routing_predictor,
        const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots,
        bool is_flat)
        : _net_list(net_list)
        , _router_lookahead(router_lookahead)
        , _router_opts(router_opts)
        , _connections_inf(connections_inf)
        , _net_delay(net_delay)
        , _netlist_pin_lookup(netlist_pin_lookup)
        , _timing_info(timing_info)
        , _pin_timing_invalidator(pin_timing_invalidator)
        , _budgeting_inf(budgeting_inf)
        , _routing_predictor(routing_predictor)
        , _choking_spots(choking_spots)
        , _is_flat(is_flat)
        , _thread_pool(MAX_THREADS) {}
    ~NestedNetlistRouter() {}

    /** Run a single iteration of netlist routing for this->_net_list. This usually means calling
     * \ref route_net for each net, which will handle other global updates.
     * \return RouteIterResults for this iteration. */
    RouteIterResults route_netlist(int itry, float pres_fac, float worst_neg_slack);
    /** Inform the PartitionTree of the nets with updated bounding boxes */
    void handle_bb_updated_nets(const std::vector<ParentNetId>& nets);

    /** Set rcv_enabled for each ConnectionRouter this is managing */
    void set_rcv_enabled(bool x);
    /** Set timing_info for each ConnectionRouter this is managing */
    void set_timing_info(std::shared_ptr<SetupHoldTimingInfo> timing_info);

  private:
    /** Route all nets in a PartitionTree node and add its children to the task queue. */
    void route_partition_tree_node(PartitionTreeNode& node);

    ConnectionRouter<HeapType> _make_router(const RouterLookahead* router_lookahead, bool is_flat) {
        auto& device_ctx = g_vpr_ctx.device();
        auto& route_ctx = g_vpr_ctx.mutable_routing();

        return ConnectionRouter<HeapType>(
            device_ctx.grid,
            *router_lookahead,
            device_ctx.rr_graph.rr_nodes(),
            &device_ctx.rr_graph,
            device_ctx.rr_rc_data,
            device_ctx.rr_graph.rr_switch(),
            route_ctx.rr_node_route_inf,
            is_flat);
    }

    /* Context fields. Most of them will be forwarded to route_net (see route_net.tpp) */
    const Netlist<>& _net_list;
    const RouterLookahead* _router_lookahead;
    const t_router_opts& _router_opts;
    CBRR& _connections_inf;
    NetPinsMatrix<float>& _net_delay;
    const ClusteredPinAtomPinsLookup& _netlist_pin_lookup;
    std::shared_ptr<SetupHoldTimingInfo> _timing_info;
    NetPinTimingInvalidator* _pin_timing_invalidator;
    route_budgets& _budgeting_inf;
    const RoutingPredictor& _routing_predictor;
    const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& _choking_spots;
    bool _is_flat;

    /** Cached routing parameters for current iteration (inputs to \see route_netlist()) */
    int _itry;
    float _pres_fac;
    float _worst_neg_slack;

    /** The partition tree. Holds the groups of nets for each partition */
    vtr::optional<PartitionTree> _tree;

    /** Thread pool for parallel routing. See vtr_thread_pool.h for implementation */
    vtr::thread_pool _thread_pool;

    /* Thread-local storage.
     * These are maps because thread::id is a random integer instead of 1, 2, ... */
    std::unordered_map<std::thread::id, ConnectionRouter<HeapType>> _routers_th;
    std::unordered_map<std::thread::id, RouteIterResults> _results_th;
    std::mutex _storage_mutex;

    /** Get a thread-local ConnectionRouter. We lock the id->router lookup, but this is
     * accessed once per partition so the overhead should be small */
    ConnectionRouter<HeapType>& get_thread_router() {
        auto id = std::this_thread::get_id();
        std::lock_guard<std::mutex> lock(_storage_mutex);
        if (!_routers_th.count(id)) {
            _routers_th.emplace(id, _make_router(_router_lookahead, _is_flat));
        }
        return _routers_th.at(id);
    }

    RouteIterResults& get_thread_results() {
        auto id = std::this_thread::get_id();
        std::lock_guard<std::mutex> lock(_storage_mutex);
        return _results_th[id];
    }
};

#include "NestedNetlistRouter.tpp"
