Skip to content

Added Customized Heap and Occupancy Profiling for MQ #6

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
2 changes: 1 addition & 1 deletion libs/EXTERNAL/CPS
Submodule CPS updated from d8b81c to 2bdb4b
5 changes: 5 additions & 0 deletions vpr/src/base/SetupVPR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,9 +410,14 @@ static void SetupRoutingArch(const t_arch& Arch,
static void SetupRouterOpts(const t_options& Options, t_router_opts* RouterOpts) {
RouterOpts->do_check_rr_graph = Options.check_rr_graph;
RouterOpts->astar_fac = Options.astar_fac;
RouterOpts->astar_offset = Options.astar_offset;
RouterOpts->router_profiler_astar_fac = Options.router_profiler_astar_fac;
RouterOpts->post_target_prune_fac = Options.post_target_prune_fac;
RouterOpts->post_target_prune_offset = Options.post_target_prune_offset;
RouterOpts->multi_queue_num_threads = Options.multi_queue_num_threads;
RouterOpts->multi_queue_num_queues = Options.multi_queue_num_queues;
RouterOpts->multi_queue_direct_draining = Options.multi_queue_direct_draining;
RouterOpts->thread_affinity = Options.thread_affinity;
RouterOpts->bb_factor = Options.bb_factor;
RouterOpts->criticality_exp = Options.criticality_exp;
RouterOpts->max_criticality = Options.max_criticality;
Expand Down
18 changes: 18 additions & 0 deletions vpr/src/base/ShowSetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,12 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
VTR_LOG("false\n");
}

// Render a thread-affinity core list for the setup log: "off" when the list
// is empty, otherwise the core IDs joined by commas (e.g. "0,1,2").
auto transform_thread_affinity_list_to_str = [](const std::vector<int>& aff) {
    if (aff.empty()) {
        return std::string("off");
    }

    std::string joined;
    for (const int core_id : aff) {
        // Separator goes before every element except the first.
        if (!joined.empty()) {
            joined += ',';
        }
        joined += std::to_string(core_id);
    }
    return joined;
};

if (DETAILED == RouterOpts.route_type) {
VTR_LOG("RouterOpts.router_algorithm: ");
switch (RouterOpts.router_algorithm) {
Expand Down Expand Up @@ -338,7 +344,14 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {

if (TIMING_DRIVEN == RouterOpts.router_algorithm) {
VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac);
VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset);
VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac);
VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac);
VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset);
VTR_LOG("RouterOpts.multi_queue_num_threads: %d\n", RouterOpts.multi_queue_num_threads);
VTR_LOG("RouterOpts.multi_queue_num_queues: %d\n", RouterOpts.multi_queue_num_queues);
VTR_LOG("RouterOpts.multi_queue_direct_draining: %s\n", RouterOpts.multi_queue_direct_draining ? "true" : "false");
VTR_LOG("RouterOpts.thread_affinity: %s\n", transform_thread_affinity_list_to_str(RouterOpts.thread_affinity).c_str());
VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp);
VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality);
VTR_LOG("RouterOpts.init_wirelength_abort_threshold: %f\n", RouterOpts.init_wirelength_abort_threshold);
Expand Down Expand Up @@ -482,9 +495,14 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
VTR_LOG("RouterOpts.exit_after_first_routing_iteration: %s\n", RouterOpts.exit_after_first_routing_iteration ? "true" : "false");
if (TIMING_DRIVEN == RouterOpts.router_algorithm) {
VTR_LOG("RouterOpts.astar_fac: %f\n", RouterOpts.astar_fac);
VTR_LOG("RouterOpts.astar_offset: %f\n", RouterOpts.astar_offset);
VTR_LOG("RouterOpts.router_profiler_astar_fac: %f\n", RouterOpts.router_profiler_astar_fac);
VTR_LOG("RouterOpts.post_target_prune_fac: %f\n", RouterOpts.post_target_prune_fac);
VTR_LOG("RouterOpts.post_target_prune_offset: %f\n", RouterOpts.post_target_prune_offset);
VTR_LOG("RouterOpts.multi_queue_num_threads: %d\n", RouterOpts.multi_queue_num_threads);
VTR_LOG("RouterOpts.multi_queue_num_queues: %d\n", RouterOpts.multi_queue_num_queues);
VTR_LOG("RouterOpts.multi_queue_direct_draining: %s\n", RouterOpts.multi_queue_direct_draining ? "true" : "false");
VTR_LOG("RouterOpts.thread_affinity: %s\n", transform_thread_affinity_list_to_str(RouterOpts.thread_affinity).c_str());
VTR_LOG("RouterOpts.criticality_exp: %f\n", RouterOpts.criticality_exp);
VTR_LOG("RouterOpts.max_criticality: %f\n", RouterOpts.max_criticality);
VTR_LOG("RouterOpts.init_wirelength_abort_threshold: %f\n", RouterOpts.init_wirelength_abort_threshold);
Expand Down
89 changes: 89 additions & 0 deletions vpr/src/base/read_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1259,6 +1259,67 @@ struct ParsePostSynthNetlistUnconnOutputHandling {
}
};

// Converter between the `--thread_affinity` command-line string and a list of
// core IDs (i.e., the cores that threads are pinned to).
//
// Accepted input is either the literal `off` (yields an empty list) or a
// comma-separated list whose items are a single core ID (`3`) or an inclusive
// range (`0-7`). Formats such as `0,1,2,3,4,5,6,7` and `0-7` and `0-3,4-7` and
// `0,1-2,3-6,7` are all supported.
//
// Malformed input is reported as a conversion error instead of tripping an
// assertion or leaking a std::stoi exception out of the option parser.
struct ParseTheadAffinityList {
    // Splits `str` at every occurrence of `delimiter`. Empty fields are kept,
    // so the result always holds (number of delimiters + 1) tokens.
    inline std::vector<std::string> get_tokens_split_by_delimiter(const std::string& str, char delimiter) {
        std::vector<std::string> tokens;
        std::string acc;
        for (const char x : str) {
            if (x == delimiter) {
                tokens.push_back(acc);
                acc.clear();
            } else {
                acc += x;
            }
        }
        tokens.push_back(acc);
        return tokens;
    }

    // Parses a single core ID consisting solely of decimal digits.
    // Returns false (leaving `core_id` untouched) when `token` is empty,
    // contains any non-digit character (sign, whitespace, letters, ...), or has
    // more than 9 digits. The length cap keeps the value within a 32-bit int so
    // that std::stoi can never throw std::out_of_range here.
    inline bool try_parse_core_id(const std::string& token, int& core_id) {
        if (token.empty() || token.size() > 9
            || token.find_first_not_of("0123456789") != std::string::npos) {
            return false;
        }
        core_id = std::stoi(token);
        return true;
    }

    // Non-throwing parser for the comma/range list described on the struct.
    // On success returns true and stores the expanded core IDs in `core_ids`
    // (in the order written, duplicates preserved). On failure returns false,
    // stores a human-readable reason in `error_msg`, and leaves `core_ids` in an
    // unspecified state.
    inline bool try_parse_thread_affinity_list(const std::string& str,
                                               std::vector<int>& core_ids,
                                               std::string& error_msg) {
        core_ids.clear();
        for (const auto& item : get_tokens_split_by_delimiter(str, ',')) {
            const std::vector<std::string> bounds = get_tokens_split_by_delimiter(item, '-');
            if (bounds.size() > 2) {
                error_msg = "'" + item + "' is neither a core ID nor a 'first-last' range";
                return false;
            }

            int first_core_id = 0;
            if (!try_parse_core_id(bounds.front(), first_core_id)) {
                error_msg = "'" + bounds.front() + "' is not a valid non-negative core ID";
                return false;
            }

            // A lone ID is treated as the one-element range [id, id].
            int last_core_id = first_core_id;
            if (bounds.size() == 2 && !try_parse_core_id(bounds.back(), last_core_id)) {
                error_msg = "'" + bounds.back() + "' is not a valid non-negative core ID";
                return false;
            }

            // A reversed range would silently expand to nothing; reject it.
            if (first_core_id > last_core_id) {
                error_msg = "range '" + item + "' has its start greater than its end";
                return false;
            }

            for (int core_id = first_core_id; core_id <= last_core_id; ++core_id) {
                core_ids.push_back(core_id);
            }
        }
        return true;
    }

    // Parse thread/core affinity list (i.e., pin threads to specific cores).
    // Kept for existing callers: asserts that `str` is well-formed. Code that
    // handles untrusted input should use try_parse_thread_affinity_list().
    inline std::vector<int> parse_thread_affinity_list(const std::string& str) {
        std::vector<int> thread_affinity_list;
        std::string error_msg;
        const bool parsed_ok = try_parse_thread_affinity_list(str, thread_affinity_list, error_msg);
        VTR_ASSERT(parsed_ok);
        return thread_affinity_list;
    }

    // Converts the command-line string into a core-ID list. `off` maps to an
    // empty list; anything unparsable (including the empty string) produces a
    // conversion error carrying the offending input.
    ConvertedValue<std::vector<int>> from_str(const std::string& str) {
        ConvertedValue<std::vector<int>> conv_value;
        if (str == "off") {
            conv_value.set_value({});
            return conv_value;
        }

        std::vector<int> core_ids;
        std::string error_msg;
        if (try_parse_thread_affinity_list(str, core_ids, error_msg)) {
            conv_value.set_value(core_ids);
        } else {
            conv_value.set_error("Invalid conversion from '" + str
                                 + "' to thread affinity list (expected 'off' or a comma-separated list of"
                                   " core IDs / 'first-last' ranges): "
                                 + error_msg);
        }
        return conv_value;
    }

    // Inverse of from_str(): an empty list becomes `off`, otherwise the IDs are
    // joined with commas (ranges are not re-compressed).
    ConvertedValue<std::string> to_str(const std::vector<int>& val) {
        ConvertedValue<std::string> conv_value;
        if (val.empty()) {
            conv_value.set_value("off");
            return conv_value;
        }

        std::string str = std::to_string(val.front());
        for (size_t i = 1; i < val.size(); ++i) {
            str += ',';
            str += std::to_string(val[i]);
        }
        conv_value.set_value(str);
        return conv_value;
    }

    // Free-form option: there is no fixed set of choices to list in the help.
    std::vector<std::string> default_choices() { return {}; }
};

argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& args) {
std::string description =
"Implements the specified circuit onto the target FPGA architecture"
Expand Down Expand Up @@ -2477,6 +2538,14 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
.default_value("1.2")
.show_in(argparse::ShowIn::HELP_ONLY);

// --astar_offset: value subtracted from the lookahead's expected cost before it
// is scaled by astar_fac in the timing-driven router.
route_timing_grp.add_argument(args.astar_offset, "--astar_offset")
    .help(
        "Controls the directedness of the timing-driven router's exploration."
        " It is a subtractive adjustment to the lookahead heuristic."
        " Values between 0 and 1e-9 are reasonable; higher values may increase quality at the expense of run-time.")
    .default_value("0.0")
    .show_in(argparse::ShowIn::HELP_ONLY);

route_timing_grp.add_argument(args.router_profiler_astar_fac, "--router_profiler_astar_fac")
.help(
"Controls the directedness of the timing-driven router's exploration"
Expand All @@ -2496,6 +2565,26 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
.default_value("0.0")
.show_in(argparse::ShowIn::HELP_ONLY);

// Options forwarded to the MultiQueue-based parallel connection router.
route_timing_grp.add_argument<int>(args.multi_queue_num_threads, "--multi_queue_num_threads")
    .help(
        "Number of threads used by the MultiQueue-based parallel connection router.")
    .default_value("1")
    .show_in(argparse::ShowIn::HELP_ONLY);

route_timing_grp.add_argument<int>(args.multi_queue_num_queues, "--multi_queue_num_queues")
    .help(
        "Number of queues used by the MultiQueue priority queue of the parallel connection router.")
    .default_value("2")
    .show_in(argparse::ShowIn::HELP_ONLY);

// When `off`, setMinPrioForPop is never called on the MultiQueue, so the
// queue-draining code path has no effect.
route_timing_grp.add_argument<bool, ParseOnOff>(args.multi_queue_direct_draining, "--multi_queue_direct_draining")
    .help(
        "Enables the queue draining optimization of the MultiQueue-based parallel connection router.")
    .default_value("off")
    .show_in(argparse::ShowIn::HELP_ONLY);

// Parsed by ParseTheadAffinityList; `off` yields an empty core list.
route_timing_grp.add_argument<std::vector<int>, ParseTheadAffinityList>(args.thread_affinity, "--thread_affinity")
    .help(
        "Thread/core affinity list used to pin threads to specific cores."
        " Accepts 'off' or a comma-separated list of core IDs and inclusive ranges,"
        " e.g. '0,1,2,3,4,5,6,7', '0-7', '0-3,4-7' or '0,1-2,3-6,7'.")
    .default_value("off")
    .show_in(argparse::ShowIn::HELP_ONLY);

route_timing_grp.add_argument(args.max_criticality, "--max_criticality")
.help(
"Sets the maximum fraction of routing cost derived from delay (vs routability) for any net."
Expand Down
5 changes: 5 additions & 0 deletions vpr/src/base/read_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,9 +205,14 @@ struct t_options {

/* Timing-driven router options only */
argparse::ArgValue<float> astar_fac;
argparse::ArgValue<float> astar_offset;
argparse::ArgValue<float> router_profiler_astar_fac;
argparse::ArgValue<float> post_target_prune_fac;
argparse::ArgValue<float> post_target_prune_offset;
argparse::ArgValue<int> multi_queue_num_threads;
argparse::ArgValue<int> multi_queue_num_queues;
argparse::ArgValue<bool> multi_queue_direct_draining;
argparse::ArgValue<std::vector<int>> thread_affinity;
argparse::ArgValue<float> max_criticality;
argparse::ArgValue<float> criticality_exp;
argparse::ArgValue<float> router_init_wirelength_abort_threshold;
Expand Down
7 changes: 7 additions & 0 deletions vpr/src/base/vpr_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -1332,6 +1332,8 @@ struct t_placer_opts {
* an essentially breadth-first search, astar_fac = 1 is near *
* the usual astar algorithm and astar_fac > 1 are more *
* aggressive. *
* astar_offset: Offset that is subtracted from the lookahead (expected *
* future costs) in the timing-driven router. *
* max_criticality: The maximum criticality factor (from 0 to 1) any sink *
* will ever have (i.e. clip criticality to this number). *
* criticality_exp: Set criticality to (path_length(sink) / longest_path) ^ *
Expand Down Expand Up @@ -1419,9 +1421,14 @@ struct t_router_opts {
enum e_router_algorithm router_algorithm;
enum e_base_cost_type base_cost_type;
float astar_fac;
float astar_offset;
float router_profiler_astar_fac;
float post_target_prune_fac;
float post_target_prune_offset;
int multi_queue_num_threads;
int multi_queue_num_queues;
bool multi_queue_direct_draining;
std::vector<int> thread_affinity;
float max_criticality;
float criticality_exp;
float init_wirelength_abort_threshold;
Expand Down
3 changes: 2 additions & 1 deletion vpr/src/place/timing_place_lookup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1187,7 +1187,8 @@ void OverrideDelayModel::compute_override_delay_model(
RouterDelayProfiler& route_profiler,
const t_router_opts& router_opts) {
t_router_opts router_opts2 = router_opts;
router_opts2.astar_fac = 0.;
router_opts2.astar_fac = 0.f;
router_opts2.astar_offset = 0.f;

//Look at all the direct connections that exist, and add overrides to delay model
auto& device_ctx = g_vpr_ctx.device();
Expand Down
16 changes: 11 additions & 5 deletions vpr/src/route/SerialNetlistRouter.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ class SerialNetlistRouter : public NetlistRouter {
const RoutingPredictor& routing_predictor,
const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots,
bool is_flat)
: _serial_router(_make_router(router_lookahead, is_flat, false))
, _parallel_router(_make_router(router_lookahead, is_flat, true))
: _serial_router(_make_router(router_lookahead, router_opts, is_flat, false))
, _parallel_router(_make_router(router_lookahead, router_opts, is_flat, true))
, _net_list(net_list)
, _router_opts(router_opts)
, _connections_inf(connections_inf)
Expand All @@ -45,8 +45,10 @@ class SerialNetlistRouter : public NetlistRouter {

private:
bool should_use_parallel_connection_router(const ParentNetId &net_id, int itry, float pres_fac, float worst_neg_slack);

ConnectionRouterInterface *_make_router(const RouterLookahead* router_lookahead, bool is_flat, bool is_parallel) {

ConnectionRouterInterface *_make_router(const RouterLookahead* router_lookahead,
const t_router_opts& router_opts,
bool is_flat, bool is_parallel) {
auto& device_ctx = g_vpr_ctx.device();
auto& route_ctx = g_vpr_ctx.mutable_routing();

Expand All @@ -71,7 +73,11 @@ class SerialNetlistRouter : public NetlistRouter {
device_ctx.rr_rc_data,
device_ctx.rr_graph.rr_switch(),
route_ctx.rr_node_route_inf,
is_flat);
is_flat,
router_opts.multi_queue_num_threads,
router_opts.multi_queue_num_queues,
router_opts.multi_queue_direct_draining,
router_opts.thread_affinity);
}
}
/* Context fields */
Expand Down
17 changes: 7 additions & 10 deletions vpr/src/route/connection_router.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "connection_router.h"
#include "rr_graph.h"

#include <algorithm>
#include "rr_graph.h"
#include "binary_heap.h"
#include "bucket.h"
#include "rr_graph_fwd.h"
Expand Down Expand Up @@ -695,8 +696,8 @@ float ConnectionRouter<Heap>::compute_node_cost_using_rcv(const t_conn_cost_para
float expected_total_delay_cost;
float expected_total_cong_cost;

float expected_total_cong = cost_params.astar_fac * expected_cong + backwards_cong;
float expected_total_delay = cost_params.astar_fac * expected_delay + backwards_delay;
float expected_total_cong = expected_cong + backwards_cong;
float expected_total_delay = expected_delay + backwards_delay;

//If budgets specified calculate cost as described by RCV paper:
// R. Fung, V. Betz and W. Chow, "Slack Allocation and Routing to Improve FPGA Timing While
Expand Down Expand Up @@ -835,7 +836,7 @@ void ConnectionRouter<Heap>::evaluate_timing_driven_node_costs(t_heap* to,
target_node,
cost_params,
to->R_upstream);
total_cost += to->backward_path_cost + cost_params.astar_fac * expected_cost;
total_cost += to->backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);

// if (rcv_path_manager.is_enabled() && to->path_data != nullptr) {
// to->path_data->backward_delay += cost_params.criticality * Tdel;
Expand Down Expand Up @@ -952,12 +953,8 @@ void ConnectionRouter<Heap>::add_route_tree_node_to_heap(

if (!rcv_path_manager.is_enabled()) {
// tot_cost = backward_path_cost + cost_params.astar_fac * expected_cost;
float tot_cost = backward_path_cost
+ cost_params.astar_fac
* router_lookahead_.get_expected_cost(inode,
target_node,
cost_params,
R_upstream);
float expected_cost = router_lookahead_.get_expected_cost(inode, target_node, cost_params, R_upstream);
float tot_cost = backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset);
VTR_LOGV_DEBUG(router_debug_, " Adding node %8d to heap from init route tree with cost %g (%s)\n",
inode,
tot_cost,
Expand Down
1 change: 1 addition & 0 deletions vpr/src/route/connection_router_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ struct t_conn_delay_budget {
struct t_conn_cost_params {
float criticality = 1.;
float astar_fac = 1.2;
float astar_offset = 0.f;
float post_target_prune_fac = 1.2f;
float post_target_prune_offset = 0.f;
float bend_cost = 1.;
Expand Down
11 changes: 11 additions & 0 deletions vpr/src/route/multi_queue_priority_queue.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
#ifndef _MULTI_QUEUE_PRIORITY_QUEUE_H
#define _MULTI_QUEUE_PRIORITY_QUEUE_H

// This macro is only used to enable the clearing code in the MQIO codebase.
// Whether the queue draining optimization is actually used depends solely on
// the VPR command-line option `--multi_queue_direct_draining` at runtime. If
// the option is set to `off`, queue draining has no effect, because
// `setMinPrioForPop` is never called and `minPrioForPop` in the MQIO object
// therefore always stays at the float maximum.
#define MQ_IO_ENABLE_CLEAR_FOR_POP

#include "heap_type.h"

#include "MultiQueueIO.h"
Expand Down Expand Up @@ -34,7 +41,11 @@ class MultiQueuePriorityQueue {
void build_heap();
// Returns the push counter reported by the underlying MQ_IO object.
inline uint64_t getNumPushes() const { return pq_->getNumPushes(); }
// Returns the pop counter reported by the underlying MQ_IO object.
inline uint64_t getNumPops() const { return pq_->getNumPops(); }
// Returns the underlying MQ_IO object's queue occupancy (forwards to
// getQueueOccupancy()); presumably the number of queued elements, used for
// occupancy profiling -- TODO confirm against MultiQueueIO.h.
inline uint64_t getHeapOccupancy() const { return pq_->getQueueOccupancy(); }
// Forwards to MQ_IO::reset() on the underlying queue object.
inline void reset() { pq_->reset(); }
// Only available when MQ_IO_ENABLE_CLEAR_FOR_POP is defined. Forwards
// `minPrio` to MQ_IO::setMinPrioForPop(), the threshold used by the MQIO
// clearing (queue draining) code.
#ifdef MQ_IO_ENABLE_CLEAR_FOR_POP
inline void setMinPrioForPop(const pq_prio_t& minPrio) { pq_->setMinPrioForPop(minPrio); }
#endif

private:
MQ_IO* pq_;
Expand Down
Loading
Loading