diff --git a/libs/EXTERNAL/CPS b/libs/EXTERNAL/CPS index d8b81cb9bb4..2bdb4b5db20 160000 --- a/libs/EXTERNAL/CPS +++ b/libs/EXTERNAL/CPS @@ -1 +1 @@ -Subproject commit d8b81cb9bb4540acd782aea513e93b8e47b0b7a3 +Subproject commit 2bdb4b5db200d63520e1fe0b20db0ac1a9c7ea99 diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index cddb196ab1b..0c15a9c6653 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -410,9 +410,14 @@ static void SetupRoutingArch(const t_arch& Arch, static void SetupRouterOpts(const t_options& Options, t_router_opts* RouterOpts) { RouterOpts->do_check_rr_graph = Options.check_rr_graph; RouterOpts->astar_fac = Options.astar_fac; + RouterOpts->astar_offset = Options.astar_offset; RouterOpts->router_profiler_astar_fac = Options.router_profiler_astar_fac; RouterOpts->post_target_prune_fac = Options.post_target_prune_fac; RouterOpts->post_target_prune_offset = Options.post_target_prune_offset; + RouterOpts->multi_queue_num_threads = Options.multi_queue_num_threads; + RouterOpts->multi_queue_num_queues = Options.multi_queue_num_queues; + RouterOpts->multi_queue_direct_draining = Options.multi_queue_direct_draining; + RouterOpts->thread_affinity = Options.thread_affinity; RouterOpts->bb_factor = Options.bb_factor; RouterOpts->criticality_exp = Options.criticality_exp; RouterOpts->max_criticality = Options.max_criticality; diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp index 042277647cf..ab5c73e9b3b 100644 --- a/vpr/src/base/ShowSetup.cpp +++ b/vpr/src/base/ShowSetup.cpp @@ -257,6 +257,12 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) { VTR_LOG("false\n"); } + auto transform_thread_affinity_list_to_str = [](const std::vector& aff) { + std::string str = aff.size() ? 
std::to_string(aff.front()) : "off"; + for (size_t i = 1; i < aff.size(); str += ',' + std::to_string(aff[i++])) ; + return str; + }; + if (DETAILED == RouterOpts.route_type) { VTR_LOG("RouterOpts.router_algorithm: "); switch (RouterOpts.router_algorithm) { @@ -338,7 +344,14 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) { if (TIMING_DRIVEN == RouterOpts.router_algorithm) { VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac); + VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset); VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac); + VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac); + VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset); + VTR_LOG("RouterOpts.multi_queue_num_threads: %d\n", RouterOpts.multi_queue_num_threads); + VTR_LOG("RouterOpts.multi_queue_num_queues: %d\n", RouterOpts.multi_queue_num_queues); + VTR_LOG("RouterOpts.multi_queue_direct_draining: %s\n", RouterOpts.multi_queue_direct_draining ? "true" : "false"); + VTR_LOG("RouterOpts.thread_affinity: %s\n", transform_thread_affinity_list_to_str(RouterOpts.thread_affinity).c_str()); VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp); VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality); VTR_LOG("RouterOpts.init_wirelength_abort_threshold: %f\n", RouterOpts.init_wirelength_abort_threshold); @@ -482,9 +495,14 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) { VTR_LOG("RouterOpts.exit_after_first_routing_iteration: %s\n", RouterOpts.exit_after_first_routing_iteration ? 
"true" : "false"); if (TIMING_DRIVEN == RouterOpts.router_algorithm) { VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac); + VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset); VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac); VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac); VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset); + VTR_LOG("RouterOpts.multi_queue_num_threads: %d\n", RouterOpts.multi_queue_num_threads); + VTR_LOG("RouterOpts.multi_queue_num_queues: %d\n", RouterOpts.multi_queue_num_queues); + VTR_LOG("RouterOpts.multi_queue_direct_draining: %s\n", RouterOpts.multi_queue_direct_draining ? "true" : "false"); + VTR_LOG("RouterOpts.thread_affinity: %s\n", transform_thread_affinity_list_to_str(RouterOpts.thread_affinity).c_str()); VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp); VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality); VTR_LOG("RouterOpts.init_wirelength_abort_threshold: %f\n", RouterOpts.init_wirelength_abort_threshold); diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index 8940f5f350c..bc11f18a067 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -1259,6 +1259,67 @@ struct ParsePostSynthNetlistUnconnOutputHandling { } }; +struct ParseTheadAffinityList { + inline std::vector get_tokens_split_by_delimiter(const std::string& str, char delimiter) { + std::vector tokens; + std::string acc = ""; + for(const auto &x : str) { + if (x == delimiter) { + tokens.push_back(acc); + acc = ""; + } else { + acc += x; + } + } + tokens.push_back(acc); + return tokens; + } + + // Parse thread/core affinity list (i.e., pin threads to specific cores). + // Formats such as `0,1,2,3,4,5,6,7` and `0-7` and `0-3,4-7` and `0,1-2,3-6,7` + // are all supported. 
+ inline std::vector parse_thread_affinity_list(const std::string& str) { + std::vector thread_affinity_list; + std::vector lv1_tokens_split_by_comma = get_tokens_split_by_delimiter(str, ','); + for (const auto &l1_token : lv1_tokens_split_by_comma) { + std::vector lv2_tokens_split_by_dash = get_tokens_split_by_delimiter(l1_token, '-'); + size_t num_lv2_tokens = lv2_tokens_split_by_dash.size(); + VTR_ASSERT(num_lv2_tokens == 1 || num_lv2_tokens == 2); + if (num_lv2_tokens == 2) { + int start_core_id = std::stoi(lv2_tokens_split_by_dash[0]); + int end_core_id = std::stoi(lv2_tokens_split_by_dash[1]); + for (int i = start_core_id; i <= end_core_id; ++i) { + thread_affinity_list.push_back(i); + } + } else { + thread_affinity_list.push_back(std::stoi(lv2_tokens_split_by_dash[0])); + } + } + return thread_affinity_list; + } + + ConvertedValue> from_str(const std::string& str) { + ConvertedValue> conv_value; + VTR_ASSERT(str.size() > 0); + if (str == "off") { + conv_value.set_value({}); + } else { + conv_value.set_value(parse_thread_affinity_list(str)); + } + return conv_value; + } + + ConvertedValue to_str(std::vector val) { + ConvertedValue conv_value; + std::string str = val.size() ? std::to_string(val.front()) : "off"; + for (size_t i = 1; i < val.size(); str += ',' + std::to_string(val[i++])) ; + conv_value.set_value(str); + return conv_value; + } + + std::vector default_choices() { return {}; } +}; + argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& args) { std::string description = "Implements the specified circuit onto the target FPGA architecture" @@ -2477,6 +2538,14 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .default_value("1.2") .show_in(argparse::ShowIn::HELP_ONLY); + route_timing_grp.add_argument(args.astar_offset, "--astar_offset") + .help( + "Controls the directedness of the timing-driven router's exploration." + " It is a subtractive adjustment to the lookahead heuristic." 
+ " Values between 0 and 1e-9 are reasonable; higher values may increase quality at the expense of run-time.") + .default_value("0.0") + .show_in(argparse::ShowIn::HELP_ONLY); + route_timing_grp.add_argument(args.router_profiler_astar_fac, "--router_profiler_astar_fac") .help( "Controls the directedness of the timing-driven router's exploration" @@ -2496,6 +2565,26 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .default_value("0.0") .show_in(argparse::ShowIn::HELP_ONLY); + route_timing_grp.add_argument(args.multi_queue_num_threads, "--multi_queue_num_threads") + .help("TODO") + .default_value("1") + .show_in(argparse::ShowIn::HELP_ONLY); + + route_timing_grp.add_argument(args.multi_queue_num_queues, "--multi_queue_num_queues") + .help("TODO") + .default_value("2") + .show_in(argparse::ShowIn::HELP_ONLY); + + route_timing_grp.add_argument(args.multi_queue_direct_draining, "--multi_queue_direct_draining") + .help("TODO") + .default_value("off") + .show_in(argparse::ShowIn::HELP_ONLY); + + route_timing_grp.add_argument, ParseTheadAffinityList>(args.thread_affinity, "--thread_affinity") + .help("TODO") + .default_value("off") + .show_in(argparse::ShowIn::HELP_ONLY); + route_timing_grp.add_argument(args.max_criticality, "--max_criticality") .help( "Sets the maximum fraction of routing cost derived from delay (vs routability) for any net." 
diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h index ce1538eeb93..01b06c69353 100644 --- a/vpr/src/base/read_options.h +++ b/vpr/src/base/read_options.h @@ -205,9 +205,14 @@ struct t_options { /* Timing-driven router options only */ argparse::ArgValue astar_fac; + argparse::ArgValue astar_offset; argparse::ArgValue router_profiler_astar_fac; argparse::ArgValue post_target_prune_fac; argparse::ArgValue post_target_prune_offset; + argparse::ArgValue multi_queue_num_threads; + argparse::ArgValue multi_queue_num_queues; + argparse::ArgValue multi_queue_direct_draining; + argparse::ArgValue> thread_affinity; argparse::ArgValue max_criticality; argparse::ArgValue criticality_exp; argparse::ArgValue router_init_wirelength_abort_threshold; diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 0e217e4d0e2..bcbc4626a02 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -1332,6 +1332,8 @@ struct t_placer_opts { * an essentially breadth-first search, astar_fac = 1 is near * * the usual astar algorithm and astar_fac > 1 are more * * aggressive. * + * astar_offset: Offset that is subtracted from the lookahead (expected * + * future costs) in the timing-driven router. * * max_criticality: The maximum criticality factor (from 0 to 1) any sink * * will ever have (i.e. clip criticality to this number). 
* * criticality_exp: Set criticality to (path_length(sink) / longest_path) ^ * @@ -1419,9 +1421,14 @@ struct t_router_opts { enum e_router_algorithm router_algorithm; enum e_base_cost_type base_cost_type; float astar_fac; + float astar_offset; float router_profiler_astar_fac; float post_target_prune_fac; float post_target_prune_offset; + int multi_queue_num_threads; + int multi_queue_num_queues; + bool multi_queue_direct_draining; + std::vector thread_affinity; float max_criticality; float criticality_exp; float init_wirelength_abort_threshold; diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp index d0aacd8f78a..3cc771ffa2a 100644 --- a/vpr/src/place/timing_place_lookup.cpp +++ b/vpr/src/place/timing_place_lookup.cpp @@ -1187,7 +1187,8 @@ void OverrideDelayModel::compute_override_delay_model( RouterDelayProfiler& route_profiler, const t_router_opts& router_opts) { t_router_opts router_opts2 = router_opts; - router_opts2.astar_fac = 0.; + router_opts2.astar_fac = 0.f; + router_opts2.astar_offset = 0.f; //Look at all the direct connections that exist, and add overrides to delay model auto& device_ctx = g_vpr_ctx.device(); diff --git a/vpr/src/route/SerialNetlistRouter.h b/vpr/src/route/SerialNetlistRouter.h index 89d439eddd7..3cb8a1373b3 100644 --- a/vpr/src/route/SerialNetlistRouter.h +++ b/vpr/src/route/SerialNetlistRouter.h @@ -21,8 +21,8 @@ class SerialNetlistRouter : public NetlistRouter { const RoutingPredictor& routing_predictor, const vtr::vector>>& choking_spots, bool is_flat) - : _serial_router(_make_router(router_lookahead, is_flat, false)) - , _parallel_router(_make_router(router_lookahead, is_flat, true)) + : _serial_router(_make_router(router_lookahead, router_opts, is_flat, false)) + , _parallel_router(_make_router(router_lookahead, router_opts, is_flat, true)) , _net_list(net_list) , _router_opts(router_opts) , _connections_inf(connections_inf) @@ -45,8 +45,10 @@ class SerialNetlistRouter : public NetlistRouter 
{ private: bool should_use_parallel_connection_router(const ParentNetId &net_id, int itry, float pres_fac, float worst_neg_slack); - - ConnectionRouterInterface *_make_router(const RouterLookahead* router_lookahead, bool is_flat, bool is_parallel) { + + ConnectionRouterInterface *_make_router(const RouterLookahead* router_lookahead, + const t_router_opts& router_opts, + bool is_flat, bool is_parallel) { auto& device_ctx = g_vpr_ctx.device(); auto& route_ctx = g_vpr_ctx.mutable_routing(); @@ -71,7 +73,11 @@ class SerialNetlistRouter : public NetlistRouter { device_ctx.rr_rc_data, device_ctx.rr_graph.rr_switch(), route_ctx.rr_node_route_inf, - is_flat); + is_flat, + router_opts.multi_queue_num_threads, + router_opts.multi_queue_num_queues, + router_opts.multi_queue_direct_draining, + router_opts.thread_affinity); } } /* Context fields */ diff --git a/vpr/src/route/connection_router.cpp b/vpr/src/route/connection_router.cpp index a600f46f295..b0096dc9d85 100644 --- a/vpr/src/route/connection_router.cpp +++ b/vpr/src/route/connection_router.cpp @@ -1,6 +1,7 @@ #include "connection_router.h" -#include "rr_graph.h" +#include +#include "rr_graph.h" #include "binary_heap.h" #include "bucket.h" #include "rr_graph_fwd.h" @@ -695,8 +696,8 @@ float ConnectionRouter::compute_node_cost_using_rcv(const t_conn_cost_para float expected_total_delay_cost; float expected_total_cong_cost; - float expected_total_cong = cost_params.astar_fac * expected_cong + backwards_cong; - float expected_total_delay = cost_params.astar_fac * expected_delay + backwards_delay; + float expected_total_cong = expected_cong + backwards_cong; + float expected_total_delay = expected_delay + backwards_delay; //If budgets specified calculate cost as described by RCV paper: // R. Fung, V. Betz and W. 
Chow, "Slack Allocation and Routing to Improve FPGA Timing While @@ -835,7 +836,7 @@ void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, target_node, cost_params, to->R_upstream); - total_cost += to->backward_path_cost + cost_params.astar_fac * expected_cost; + total_cost += to->backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset); // if (rcv_path_manager.is_enabled() && to->path_data != nullptr) { // to->path_data->backward_delay += cost_params.criticality * Tdel; @@ -952,12 +953,8 @@ void ConnectionRouter::add_route_tree_node_to_heap( if (!rcv_path_manager.is_enabled()) { // tot_cost = backward_path_cost + cost_params.astar_fac * expected_cost; - float tot_cost = backward_path_cost - + cost_params.astar_fac - * router_lookahead_.get_expected_cost(inode, - target_node, - cost_params, - R_upstream); + float expected_cost = router_lookahead_.get_expected_cost(inode, target_node, cost_params, R_upstream); + float tot_cost = backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset); VTR_LOGV_DEBUG(router_debug_, " Adding node %8d to heap from init route tree with cost %g (%s)\n", inode, tot_cost, diff --git a/vpr/src/route/connection_router_interface.h b/vpr/src/route/connection_router_interface.h index 9d52966cbb9..7da28115b43 100644 --- a/vpr/src/route/connection_router_interface.h +++ b/vpr/src/route/connection_router_interface.h @@ -23,6 +23,7 @@ struct t_conn_delay_budget { struct t_conn_cost_params { float criticality = 1.; float astar_fac = 1.2; + float astar_offset = 0.f; float post_target_prune_fac = 1.2f; float post_target_prune_offset = 0.f; float bend_cost = 1.; diff --git a/vpr/src/route/multi_queue_priority_queue.h b/vpr/src/route/multi_queue_priority_queue.h index 0428bda539e..005c64eeeb0 100644 --- a/vpr/src/route/multi_queue_priority_queue.h +++ b/vpr/src/route/multi_queue_priority_queue.h @@ -1,6 +1,13 @@ #ifndef _MULTI_QUEUE_PRIORITY_QUEUE_H 
#define _MULTI_QUEUE_PRIORITY_QUEUE_H +// This macro only enables the clearing code in the MQIO codebase. Whether the +// queue draining optimization is actually used depends solely on the VPR +// command-line option `--multi_queue_direct_draining` at runtime. If the option +// is set to `off`, queue draining has no effect because `setMinPrioForPop` is +// never called, leaving `minPrioForPop` in the MQIO object at the float maximum. +#define MQ_IO_ENABLE_CLEAR_FOR_POP + #include "heap_type.h" #include "MultiQueueIO.h" @@ -34,7 +41,11 @@ class MultiQueuePriorityQueue { void build_heap(); inline uint64_t getNumPushes() const { return pq_->getNumPushes(); } inline uint64_t getNumPops() const { return pq_->getNumPops(); } + inline uint64_t getHeapOccupancy() const { return pq_->getQueueOccupancy(); } inline void reset() { pq_->reset(); } +#ifdef MQ_IO_ENABLE_CLEAR_FOR_POP + inline void setMinPrioForPop(const pq_prio_t& minPrio) { pq_->setMinPrioForPop(minPrio); } +#endif private: MQ_IO* pq_; diff --git a/vpr/src/route/parallel_connection_router.cpp b/vpr/src/route/parallel_connection_router.cpp index 868a251f2ba..26bfb429925 100644 --- a/vpr/src/route/parallel_connection_router.cpp +++ b/vpr/src/route/parallel_connection_router.cpp @@ -382,8 +382,22 @@ void ParallelConnectionRouter::timing_driven_route_connection_from_heap_thread_f // cheapest t_heap in current route tree to be expanded on float new_total_cost; RRNodeId inode; +#ifdef PROFILE_HEAP_OCCUPANCY + unsigned count = 0; + if (thread_idx == 0) { + heap_occ_profile_ << size_t(sink_node) << "\n"; + } +#endif // While the heap is not empty do while (heap_.try_pop(new_total_cost, inode)) { +#ifdef PROFILE_HEAP_OCCUPANCY + if (thread_idx == 0) { + if (count % 1000 == 0) { + heap_occ_profile_ << count << " " << heap_.getHeapOccupancy() << "\n"; + } + count ++; + } +#endif // update_router_stats(router_stats_, // false, // cheapest->index, @@ -391,10 +405,6 @@ void 
ParallelConnectionRouter::timing_driven_route_connection_from_heap_thread_f // Should we explore the neighbors of this node? - if (inode == sink_node) { - continue; - } - if (should_not_explore_neighbors(inode, new_total_cost, rr_node_route_inf_[inode].backward_path_cost, sink_node, rr_node_route_inf_, cost_params)) { continue; } @@ -619,7 +629,15 @@ void ParallelConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_param releaseLock(to_node); - heap_.add_to_heap(next.total_cost, to_node); + if (to_node == target_node) { +#ifdef MQ_IO_ENABLE_CLEAR_FOR_POP + if (multi_queue_direct_draining_) { + heap_.setMinPrioForPop(new_total_cost); + } +#endif + return ; + } + heap_.add_to_heap(new_total_cost, to_node); // update_router_stats(router_stats_, // true, @@ -764,7 +782,7 @@ void ParallelConnectionRouter::evaluate_timing_driven_node_costs(node_t* to, target_node, cost_params, to->R_upstream); - total_cost += to->backward_path_cost + cost_params.astar_fac * expected_cost; + total_cost += to->backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset); // if (rcv_path_manager.is_enabled() && to->path_data != nullptr) { // to->path_data->backward_delay += cost_params.criticality * Tdel; @@ -877,12 +895,8 @@ void ParallelConnectionRouter::add_route_tree_node_to_heap( // if (!rcv_path_manager.is_enabled()) { // tot_cost = backward_path_cost + cost_params.astar_fac * expected_cost; - float tot_cost = backward_path_cost - + cost_params.astar_fac - * router_lookahead_.get_expected_cost(inode, - target_node, - cost_params, - R_upstream); + float expected_cost = router_lookahead_.get_expected_cost(inode, target_node, cost_params, R_upstream); + float tot_cost = backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset); VTR_LOGV_DEBUG(router_debug_, " Adding node %8d to heap from init route tree with cost %g (%s)\n", inode, tot_cost, diff --git a/vpr/src/route/parallel_connection_router.h 
b/vpr/src/route/parallel_connection_router.h index 172f7010a07..a480289dae3 100644 --- a/vpr/src/route/parallel_connection_router.h +++ b/vpr/src/route/parallel_connection_router.h @@ -11,6 +11,11 @@ #include "router_stats.h" #include "spatial_route_tree_lookup.h" +// #define PROFILE_HEAP_OCCUPANCY +#ifdef PROFILE_HEAP_OCCUPANCY +#include +#endif + #define VPR_PARALLEL_CONNECTION_ROUTER_USE_MULTI_QUEUE // #define VPR_PARALLEL_CONNECTION_ROUTER_USE_ONE_TBB @@ -30,15 +35,6 @@ using ParallelPriorityQueue = OneTBBConcurrentPriorityQueue; #include #include -const size_t mq_num_threads = std::atoi( - std::getenv("MQ_NUM_THREADS") ? std::getenv("MQ_NUM_THREADS") : "1"); -const size_t mq_num_queues_per_thread = std::atoi( - std::getenv("MQ_NUM_QUEUES_PER_THREAD") ? std::getenv("MQ_NUM_QUEUES_PER_THREAD") : "2"); -const size_t mq_num_queues_from_env = std::atoi( - std::getenv("MQ_NUM_QUEUES") ? std::getenv("MQ_NUM_QUEUES") : "0"); -const size_t mq_num_queues = mq_num_queues_from_env ? - mq_num_queues_from_env : (mq_num_threads * mq_num_queues_per_thread); - class spin_lock_t { std::atomic_flag lock_ = ATOMIC_FLAG_INIT; public: @@ -131,7 +127,11 @@ class ParallelConnectionRouter : public ConnectionRouterInterface { const std::vector& rr_rc_data, const vtr::vector& rr_switch_inf, vtr::vector& rr_node_route_inf, - bool is_flat) + bool is_flat, + int multi_queue_num_threads, + int multi_queue_num_queues, + bool multi_queue_direct_draining, + const std::vector& thread_affinity) : grid_(grid) , router_lookahead_(router_lookahead) , rr_nodes_(rr_nodes.view()) @@ -142,22 +142,52 @@ class ParallelConnectionRouter : public ConnectionRouterInterface { , net_terminal_group_num(g_vpr_ctx.routing().net_terminal_group_num) , rr_node_route_inf_(rr_node_route_inf) , is_flat_(is_flat) - , modified_rr_node_inf_(mq_num_threads) + , modified_rr_node_inf_(multi_queue_num_threads) , router_stats_(nullptr) - , heap_(mq_num_threads, mq_num_queues) - , thread_barrier_(mq_num_threads) + , 
heap_(multi_queue_num_threads, multi_queue_num_queues) + , thread_barrier_(multi_queue_num_threads) , is_router_destroying_(false) , locks_(rr_node_route_inf.size()) - , router_debug_(false) { + , router_debug_(false) + , multi_queue_direct_draining_(multi_queue_direct_draining) { heap_.init_heap(grid); only_opin_inter_layer = (grid.get_num_layers() > 1) && inter_layer_connections_limited_to_opin(*rr_graph); - std::cout << "#T=" << mq_num_threads << " #Q=" << mq_num_queues << std::endl << std::flush; - sub_threads_.resize(mq_num_threads-1); + sub_threads_.resize(multi_queue_num_threads - 1); thread_barrier_.init(); - for (size_t i = 0 ; i < mq_num_threads - 1; ++i) { + +#ifdef PROFILE_HEAP_OCCUPANCY + heap_occ_profile_.open("occupancy.txt", std::ios::trunc); +#endif + + bool enable_thread_affinity = thread_affinity.size() > 0; + VTR_ASSERT((!enable_thread_affinity) || (static_cast(thread_affinity.size()) == multi_queue_num_threads)); + + for (int i = 0 ; i < multi_queue_num_threads - 1; ++i) { sub_threads_[i] = std::thread(&ParallelConnectionRouter::timing_driven_route_connection_from_heap_sub_thread_wrapper, this, i + 1 /*0: main thread*/); + // When thread affinity is enabled, build a cleared cpu_set_t and mark only CPU thread_affinity[i + 1] (the entry for sub-thread i + 1) as set. 
+ if (enable_thread_affinity) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(thread_affinity[i + 1], &cpuset); + int rc = pthread_setaffinity_np(sub_threads_[i].native_handle(), + sizeof(cpu_set_t), &cpuset); + if (rc != 0) { + VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc); + } + } sub_threads_[i].detach(); } + + if (enable_thread_affinity) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(thread_affinity[0], &cpuset); + int rc = pthread_setaffinity_np(pthread_self(), + sizeof(cpu_set_t), &cpuset); + if (rc != 0) { + VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc); + } + } } ~ParallelConnectionRouter() { @@ -165,6 +195,10 @@ class ParallelConnectionRouter : public ConnectionRouterInterface { thread_barrier_.wait(); VTR_LOG("Parallel Connection Router is being destroyed. Time spent computing SSSP: %g seconds\n.", this->sssp_total_time.count() / 1000000.0); + +#ifdef PROFILE_HEAP_OCCUPANCY + heap_occ_profile_.close(); +#endif } // Clear's the modified list. Should be called after reset_path_costs @@ -424,6 +458,13 @@ class ParallelConnectionRouter : public ConnectionRouterInterface { // Timing std::chrono::microseconds sssp_total_time{0}; + + bool multi_queue_direct_draining_; + + // Profiling +#ifdef PROFILE_HEAP_OCCUPANCY + std::ofstream heap_occ_profile_; +#endif }; #endif /* _PARALLEL_CONNECTION_ROUTER_H */ diff --git a/vpr/src/route/route_net.tpp b/vpr/src/route/route_net.tpp index 8046d855b55..98e8cacadfa 100644 --- a/vpr/src/route/route_net.tpp +++ b/vpr/src/route/route_net.tpp @@ -139,6 +139,7 @@ inline NetResultFlags route_net(ConnectionRouter *router, t_conn_delay_budget conn_delay_budget; t_conn_cost_params cost_params; cost_params.astar_fac = router_opts.astar_fac; + cost_params.astar_offset = router_opts.astar_offset; cost_params.post_target_prune_fac = router_opts.post_target_prune_fac; cost_params.post_target_prune_offset = router_opts.post_target_prune_offset; cost_params.bend_cost = router_opts.bend_cost; diff --git 
a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp index 3f4dddcf8f0..5b7a241c759 100644 --- a/vpr/src/route/router_delay_profiling.cpp +++ b/vpr/src/route/router_delay_profiling.cpp @@ -95,6 +95,9 @@ bool RouterDelayProfiler::calculate_delay(RRNodeId source_node, t_conn_cost_params cost_params; cost_params.criticality = 1.; cost_params.astar_fac = router_opts.router_profiler_astar_fac; + cost_params.astar_offset = router_opts.astar_offset; + cost_params.post_target_prune_fac = router_opts.post_target_prune_fac; + cost_params.post_target_prune_offset = router_opts.post_target_prune_offset; cost_params.bend_cost = router_opts.bend_cost; route_budgets budgeting_inf(net_list_, is_flat_); @@ -164,6 +167,9 @@ vtr::vector calculate_all_path_delays_from_rr_node(RRNodeId src t_conn_cost_params cost_params; cost_params.criticality = 1.; cost_params.astar_fac = router_opts.astar_fac; + cost_params.astar_offset = router_opts.astar_offset; + cost_params.post_target_prune_fac = router_opts.post_target_prune_fac; + cost_params.post_target_prune_offset = router_opts.post_target_prune_offset; cost_params.bend_cost = router_opts.bend_cost; /* This function is called during placement. Thus, the flat routing option should be disabled. */ //TODO: Placement is run with is_flat=false. 
However, since is_flat is passed, det_routing_arch should diff --git a/vpr/test/test_connection_router.cpp b/vpr/test/test_connection_router.cpp index 1b0c236a29a..f55dfe39dd9 100644 --- a/vpr/test/test_connection_router.cpp +++ b/vpr/test/test_connection_router.cpp @@ -41,6 +41,9 @@ static float do_one_route(RRNodeId source_node, t_conn_cost_params cost_params; cost_params.criticality = router_opts.max_criticality; cost_params.astar_fac = router_opts.astar_fac; + cost_params.astar_offset = router_opts.astar_offset; + cost_params.post_target_prune_fac = router_opts.post_target_prune_fac; + cost_params.post_target_prune_offset = router_opts.post_target_prune_offset; cost_params.bend_cost = router_opts.bend_cost; const Netlist<>& net_list = is_flat ? (const Netlist<>&)g_vpr_ctx.atom().nlist : (const Netlist<>&)g_vpr_ctx.clustering().clb_nlist;