-
Notifications
You must be signed in to change notification settings - Fork 415
Add a NestedNetlistRouter to enable integration with fine grained parallel router #2924
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
#pragma once | ||
|
||
/** | ||
* @file vtr_thread_pool.h | ||
* @brief A generic thread pool for parallel task execution | ||
*/ | ||
|
||
#include <thread> | ||
#include <queue> | ||
#include <mutex> | ||
#include <condition_variable> | ||
#include <memory> | ||
#include <atomic> | ||
#include <functional> | ||
#include <cstddef> | ||
#include <vector> | ||
#include "vtr_log.h" | ||
#include "vtr_time.h" | ||
|
||
namespace vtr { | ||
|
||
/** | ||
* A thread pool for parallel task execution. It is a naive | ||
* implementation which uses a queue for each thread and assigns | ||
* tasks in a round robin fashion. | ||
* | ||
* Example usage: | ||
* | ||
* vtr::thread_pool pool(4); | ||
* pool.schedule_work([]{ | ||
* // Task body | ||
* }); | ||
* pool.wait_for_all(); // There's no API to wait for a single task | ||
*/ | ||
class thread_pool { | ||
private: | ||
/* Thread-local data */ | ||
struct ThreadData { | ||
std::thread thread; | ||
/* Per-thread task queue */ | ||
std::queue<std::function<void()>> task_queue; | ||
|
||
/* Threads wait on cv for a stop signal or a new task | ||
* queue_mutex is required for condition variable */ | ||
std::mutex queue_mutex; | ||
std::condition_variable cv; | ||
bool stop = false; | ||
}; | ||
|
||
/* Container for thread-local data */ | ||
std::vector<std::unique_ptr<ThreadData>> threads; | ||
/* Used for round-robin scheduling */ | ||
std::atomic<size_t> next_thread{0}; | ||
/* Used for wait_for_all */ | ||
std::atomic<size_t> active_tasks{0}; | ||
|
||
/* Condition variable for wait_for_all */ | ||
std::mutex completion_mutex; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment these. (all the variables). |
||
std::condition_variable completion_cv; | ||
|
||
public: | ||
thread_pool(size_t thread_count) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Between the class overview and here, you need to comment how this thread pool works. |
||
threads.reserve(thread_count); | ||
|
||
for (size_t i = 0; i < thread_count; i++) { | ||
auto thread_data = std::make_unique<ThreadData>(); | ||
|
||
thread_data->thread = std::thread([&]() { | ||
ThreadData* td = thread_data.get(); | ||
|
||
while (true) { | ||
std::function<void()> task; | ||
|
||
{ /* Wait until a task is available or stop signal is received */ | ||
std::unique_lock<std::mutex> lock(td->queue_mutex); | ||
|
||
td->cv.wait(lock, [td]() { | ||
return td->stop || !td->task_queue.empty(); | ||
}); | ||
|
||
if (td->stop && td->task_queue.empty()) { | ||
return; | ||
} | ||
|
||
/* Fetch a task from the queue */ | ||
task = std::move(td->task_queue.front()); | ||
td->task_queue.pop(); | ||
} | ||
|
||
vtr::Timer task_timer; | ||
task(); | ||
} | ||
}); | ||
|
||
threads.push_back(std::move(thread_data)); | ||
} | ||
} | ||
|
||
template<typename F> | ||
void schedule_work(F&& f) { | ||
active_tasks++; | ||
|
||
/* Round-robin thread assignment */ | ||
size_t thread_idx = (next_thread++) % threads.size(); | ||
auto thread_data = threads[thread_idx].get(); | ||
|
||
auto task = [this, f = std::forward<F>(f)]() { | ||
vtr::Timer task_timer; | ||
|
||
try { | ||
f(); | ||
} catch (const std::exception& e) { | ||
VTR_LOG_ERROR("Thread %zu failed task with error: %s\n", | ||
std::this_thread::get_id(), e.what()); | ||
throw; | ||
} catch (...) { | ||
VTR_LOG_ERROR("Thread %zu failed task with unknown error\n", | ||
std::this_thread::get_id()); | ||
throw; | ||
} | ||
|
||
size_t remaining = --active_tasks; | ||
if (remaining == 0) { | ||
completion_cv.notify_all(); | ||
} | ||
}; | ||
|
||
/* Queue new task */ | ||
{ | ||
std::lock_guard<std::mutex> lock(thread_data->queue_mutex); | ||
thread_data->task_queue.push(std::move(task)); | ||
} | ||
thread_data->cv.notify_one(); | ||
} | ||
|
||
void wait_for_all() { | ||
std::unique_lock<std::mutex> lock(completion_mutex); | ||
completion_cv.wait(lock, [this]() { return active_tasks == 0; }); | ||
duck2 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
~thread_pool() { | ||
/* Stop all threads */ | ||
for (auto& thread_data : threads) { | ||
{ | ||
std::lock_guard<std::mutex> lock(thread_data->queue_mutex); | ||
thread_data->stop = true; | ||
} | ||
thread_data->cv.notify_one(); | ||
} | ||
|
||
for (auto& thread_data : threads) { | ||
if (thread_data->thread.joinable()) { | ||
thread_data->thread.join(); | ||
} | ||
} | ||
} | ||
}; | ||
|
||
} // namespace vtr |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
#pragma once | ||
|
||
/** @file Nested parallel case for NetlistRouter */ | ||
#include "netlist_routers.h" | ||
#include "vtr_optional.h" | ||
#include "vtr_thread_pool.h" | ||
#include <unordered_map> | ||
|
||
/* Number of worker threads passed to the thread pool of NestedNetlistRouter. Hard-coded for now. TODO: add a command line option for this. */ | ||
constexpr int MAX_THREADS = 4; | ||
|
||
/** Nested parallel impl for NetlistRouter. | ||
* | ||
* Calls a parallel ConnectionRouter for route_net to extract even more parallelism. | ||
* The main reason why this is a different router instead of templating NetlistRouter | ||
* on ConnectionRouter is this router does not use TBB. The scheduling performance is | ||
* worse, but it can wait in individual tasks now (which is not possible with TBB). | ||
* | ||
* Holds enough context members to glue together ConnectionRouter and net routing functions, | ||
* such as \ref route_net. Keeps the members in thread-local storage where needed, | ||
* i.e. ConnectionRouters and RouteIterResults-es. | ||
* See \ref route_net. */ | ||
template<typename HeapType> | ||
class NestedNetlistRouter : public NetlistRouter { | ||
public: | ||
NestedNetlistRouter( | ||
const Netlist<>& net_list, | ||
const RouterLookahead* router_lookahead, | ||
const t_router_opts& router_opts, | ||
CBRR& connections_inf, | ||
NetPinsMatrix<float>& net_delay, | ||
const ClusteredPinAtomPinsLookup& netlist_pin_lookup, | ||
std::shared_ptr<SetupHoldTimingInfo> timing_info, | ||
NetPinTimingInvalidator* pin_timing_invalidator, | ||
route_budgets& budgeting_inf, | ||
const RoutingPredictor& routing_predictor, | ||
const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots, | ||
bool is_flat) | ||
: _net_list(net_list) | ||
, _router_lookahead(router_lookahead) | ||
, _router_opts(router_opts) | ||
, _connections_inf(connections_inf) | ||
, _net_delay(net_delay) | ||
, _netlist_pin_lookup(netlist_pin_lookup) | ||
, _timing_info(timing_info) | ||
, _pin_timing_invalidator(pin_timing_invalidator) | ||
, _budgeting_inf(budgeting_inf) | ||
, _routing_predictor(routing_predictor) | ||
, _choking_spots(choking_spots) | ||
, _is_flat(is_flat) | ||
, _thread_pool(MAX_THREADS) {} | ||
~NestedNetlistRouter() {} | ||
|
||
/** Run a single iteration of netlist routing for this->_net_list. This usually means calling | ||
* \ref route_net for each net, which will handle other global updates. | ||
* \return RouteIterResults for this iteration. */ | ||
RouteIterResults route_netlist(int itry, float pres_fac, float worst_neg_slack); | ||
/** Inform the PartitionTree of the nets with updated bounding boxes */ | ||
void handle_bb_updated_nets(const std::vector<ParentNetId>& nets); | ||
|
||
/** Set rcv_enabled for each ConnectionRouter this is managing */ | ||
void set_rcv_enabled(bool x); | ||
/** Set timing_info for each ConnectionRouter this is managing */ | ||
void set_timing_info(std::shared_ptr<SetupHoldTimingInfo> timing_info); | ||
|
||
private: | ||
/** Route all nets in a PartitionTree node and add its children to the task queue. */ | ||
void route_partition_tree_node(PartitionTreeNode& node); | ||
|
||
ConnectionRouter<HeapType> _make_router(const RouterLookahead* router_lookahead, bool is_flat) { | ||
auto& device_ctx = g_vpr_ctx.device(); | ||
auto& route_ctx = g_vpr_ctx.mutable_routing(); | ||
|
||
return ConnectionRouter<HeapType>( | ||
device_ctx.grid, | ||
*router_lookahead, | ||
device_ctx.rr_graph.rr_nodes(), | ||
&device_ctx.rr_graph, | ||
device_ctx.rr_rc_data, | ||
device_ctx.rr_graph.rr_switch(), | ||
route_ctx.rr_node_route_inf, | ||
is_flat); | ||
} | ||
|
||
/* Context fields. Most of them will be forwarded to route_net (see route_net.tpp) */ | ||
const Netlist<>& _net_list; | ||
const RouterLookahead* _router_lookahead; | ||
const t_router_opts& _router_opts; | ||
CBRR& _connections_inf; | ||
NetPinsMatrix<float>& _net_delay; | ||
const ClusteredPinAtomPinsLookup& _netlist_pin_lookup; | ||
std::shared_ptr<SetupHoldTimingInfo> _timing_info; | ||
NetPinTimingInvalidator* _pin_timing_invalidator; | ||
route_budgets& _budgeting_inf; | ||
const RoutingPredictor& _routing_predictor; | ||
const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& _choking_spots; | ||
bool _is_flat; | ||
|
||
/** Cached routing parameters for current iteration (inputs to \see route_netlist()) */ | ||
int _itry; | ||
float _pres_fac; | ||
float _worst_neg_slack; | ||
|
||
/** The partition tree. Holds the groups of nets for each partition */ | ||
vtr::optional<PartitionTree> _tree; | ||
|
||
/** Thread pool for parallel routing. See vtr_thread_pool.h for implementation */ | ||
vtr::thread_pool _thread_pool; | ||
|
||
/* Thread-local storage. | ||
* These are maps because thread::id is a random integer instead of 1, 2, ... */ | ||
std::unordered_map<std::thread::id, ConnectionRouter<HeapType>> _routers_th; | ||
std::unordered_map<std::thread::id, RouteIterResults> _results_th; | ||
std::mutex _storage_mutex; | ||
|
||
/** Get a thread-local ConnectionRouter. We lock the id->router lookup, but this is | ||
* accessed once per partition so the overhead should be small */ | ||
ConnectionRouter<HeapType>& get_thread_router() { | ||
auto id = std::this_thread::get_id(); | ||
std::lock_guard<std::mutex> lock(_storage_mutex); | ||
if (!_routers_th.count(id)) { | ||
_routers_th.emplace(id, _make_router(_router_lookahead, _is_flat)); | ||
} | ||
return _routers_th.at(id); | ||
} | ||
|
||
RouteIterResults& get_thread_results() { | ||
auto id = std::this_thread::get_id(); | ||
std::lock_guard<std::mutex> lock(_storage_mutex); | ||
return _results_th[id]; | ||
} | ||
}; | ||
|
||
#include "NestedNetlistRouter.tpp" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't the name give an idea of what the condition variable controls/is used for, rather than just cv?