diff --git a/libs/libvtrcapnproto/CMakeLists.txt b/libs/libvtrcapnproto/CMakeLists.txt index 91a77b08014..d0d50dd50e9 100644 --- a/libs/libvtrcapnproto/CMakeLists.txt +++ b/libs/libvtrcapnproto/CMakeLists.txt @@ -19,6 +19,7 @@ endif() # Each schema used should appear here. set(CAPNP_DEFS place_delay_model.capnp + connection_map.capnp matrix.capnp gen/rr_graph_uxsdcxx.capnp map_lookahead.capnp diff --git a/libs/libvtrcapnproto/connection_map.capnp b/libs/libvtrcapnproto/connection_map.capnp new file mode 100644 index 00000000000..874b39822dd --- /dev/null +++ b/libs/libvtrcapnproto/connection_map.capnp @@ -0,0 +1,25 @@ +@0x876ec83c2fea5a18; + +using Matrix = import "matrix.capnp"; + +struct VprCostEntry { + delay @0 :Float32; + congestion @1 :Float32; + fill @2 :Bool; +} + +struct VprVector2D { + x @0 :Int64; + y @1 :Int64; +} + +struct VprFloatEntry { + value @0 :Float32; +} + +struct VprCostMap { + costMap @0 :Matrix.Matrix((Matrix.Matrix(VprCostEntry))); + offset @1 :Matrix.Matrix(VprVector2D); + depField @2 :List(Int64); + penalty @3 :Matrix.Matrix(VprFloatEntry); +} diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index 170bb9f85bd..d2efc837f80 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -156,6 +156,7 @@ void SetupVPR(const t_options* Options, } Segments = Arch->Segments; + device_ctx.segment_inf = Arch->Segments; SetupSwitches(*Arch, RoutingArch, Arch->Switches, Arch->num_switches); SetupRoutingArch(*Arch, RoutingArch); diff --git a/vpr/src/base/echo_files.cpp b/vpr/src/base/echo_files.cpp index 6f8de45933b..cd4e2566f9b 100644 --- a/vpr/src/base/echo_files.cpp +++ b/vpr/src/base/echo_files.cpp @@ -114,6 +114,8 @@ void alloc_and_load_echo_file_info() { setEchoFileName(E_ECHO_CHAN_DETAILS, "chan_details.txt"); setEchoFileName(E_ECHO_SBLOCK_PATTERN, "sblock_pattern.txt"); setEchoFileName(E_ECHO_ENDPOINT_TIMING, "endpoint_timing.echo.json"); + + setEchoFileName(E_ECHO_LOOKAHEAD_MAP, "lookahead_map.echo"); } void free_echo_file_info() { diff --git a/vpr/src/base/echo_files.h b/vpr/src/base/echo_files.h index 70df3a3d962..d273c575d50 100644 --- a/vpr/src/base/echo_files.h +++ b/vpr/src/base/echo_files.h @@ -46,6 +46,7 @@ enum e_echo_files { E_ECHO_CHAN_DETAILS, E_ECHO_SBLOCK_PATTERN, E_ECHO_ENDPOINT_TIMING, + E_ECHO_LOOKAHEAD_MAP, //Timing Graphs E_ECHO_PRE_PACKING_TIMING_GRAPH, diff --git a/vpr/src/base/vpr_context.h b/vpr/src/base/vpr_context.h index 302a93b6b8e..ff43a23345f 100644 --- a/vpr/src/base/vpr_context.h +++ b/vpr/src/base/vpr_context.h @@ -137,6 +137,7 @@ struct DeviceContext : public Context { t_rr_node_indices rr_node_indices; //[0..NUM_RR_TYPES-1][0..grid.width()-1][0..grid.width()-1][0..size-1] std::vector rr_switch_inf; /* autogenerated in build_rr_graph based on switch fan-in. [0..(num_rr_switches-1)] */ + std::vector segment_inf; //Wire segment types in RR graph std::vector rr_segments; diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing_place.h index b2d0ff16bdb..68716fcce64 100644 --- a/vpr/src/place/timing_place.h +++ b/vpr/src/place/timing_place.h @@ -190,10 +190,11 @@ class PlacerTimingCosts { num_levels_ = ilevel + 1; size_t num_leaves = num_nodes_in_level(ilevel); - size_t num_level_before_leaves = num_nodes_in_level(ilevel - 1); + //size_t num_level_before_leaves = num_nodes_in_level(ilevel - 1); VTR_ASSERT_MSG(num_leaves >= num_connections, "Need at least as many leaves as connections"); - VTR_ASSERT_MSG(num_connections == 0 || num_level_before_leaves < num_connections, "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); + // XXX: Temporarily disabling this assertion as num_level_befor_leaves + //VTR_ASSERT_MSG(num_connections == 0 || num_level_before_leaves < num_connections, "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); //We don't need to store all possible leaves if we have fewer connections //(i.e. bottom-right of tree is empty) diff --git a/vpr/src/route/route_profiling.cpp b/vpr/src/route/route_profiling.cpp index c5e5737caca..b393711eba9 100644 --- a/vpr/src/route/route_profiling.cpp +++ b/vpr/src/route/route_profiling.cpp @@ -2,6 +2,7 @@ #include "globals.h" #include "vpr_types.h" #include "route_profiling.h" +#include "rr_graph.h" namespace profiling { @@ -29,6 +30,10 @@ void time_on_fanout_analysis() {} void profiling_initialization(unsigned /*max_net_fanout*/) {} +void conn_start() {} +void conn_finish(int /*src_rr*/, int /*sink_rr*/, float /*criticality*/) {} +void net_finish() {} + #else constexpr unsigned int fanout_per_bin = 1; @@ -181,6 +186,12 @@ void congestion_analysis() { # endif } +static clock_t conn_start_time; +static float worst_conn_time = 0.f; +static int worst_src_rr; +static int worst_sink_rr; +static float worst_crit; + void profiling_initialization(unsigned max_fanout) { // add 1 so that indexing on the max fanout would still be valid time_on_fanout.resize((max_fanout / fanout_per_bin) + 1, 0); @@ -195,8 +206,37 @@ void profiling_initialization(unsigned max_fanout) { part_tree_preserved = 0; connections_forced_to_reroute = 0; connections_rerouted_due_to_forcing = 0; + worst_conn_time = 0.f; return; } + +void conn_start() { + conn_start_time = clock(); +} +void conn_finish(int src_rr, int sink_rr, float criticality) { + float route_time = static_cast(clock() - conn_start_time) / CLOCKS_PER_SEC; + if (route_time > worst_conn_time) { + worst_src_rr = src_rr; + worst_sink_rr = sink_rr; + worst_conn_time = route_time; + worst_crit = criticality; + } + + VTR_LOG("%s to %s (crit: %f) took %f\n", + describe_rr_node(src_rr).c_str(), + describe_rr_node(sink_rr).c_str(), + criticality, + route_time); +} +void net_finish() { + if (worst_conn_time > 0.f) { + VTR_LOG("Worst conn was %s to %s (crit: %f) took %f\n", + describe_rr_node(worst_src_rr).c_str(), + describe_rr_node(worst_sink_rr).c_str(), + worst_crit, + worst_conn_time); + } +} #endif } // end namespace profiling diff --git a/vpr/src/route/route_profiling.h b/vpr/src/route/route_profiling.h index 52a73655214..3cc1ec377a2 100644 --- a/vpr/src/route/route_profiling.h +++ b/vpr/src/route/route_profiling.h @@ -30,6 +30,10 @@ void congestion_analysis(); void time_on_criticality_analysis(); void time_on_fanout_analysis(); +void conn_start(); +void conn_finish(int src_rr, int sink_rr, float criticality); +void net_finish(); + void profiling_initialization(unsigned max_net_fanout); } // end namespace profiling diff --git a/vpr/src/route/route_timing.cpp b/vpr/src/route/route_timing.cpp index e7eef6461c6..f58672606bf 100644 --- a/vpr/src/route/route_timing.cpp +++ b/vpr/src/route/route_timing.cpp @@ -1096,6 +1096,8 @@ bool timing_driven_route_net(ConnectionRouter& router, conn_delay_budget.short_path_criticality = budgeting_inf.get_crit_short_path(net_id, target_pin); } + profiling::conn_start(); + // build a branch in the route tree to the target if (!timing_driven_route_sink(router, net_id, @@ -1109,10 +1111,15 @@ bool timing_driven_route_net(ConnectionRouter& router, router_stats)) return false; + profiling::conn_finish(route_ctx.net_rr_terminals[net_id][0], + sink_rr, + pin_criticality[target_pin]); + ++router_stats.connections_routed; } // finished all sinks ++router_stats.nets_routed; + profiling::net_finish(); /* For later timing analysis. */ diff --git a/vpr/src/route/route_timing.h b/vpr/src/route/route_timing.h index 5bc4653035a..3974f3c6405 100644 --- a/vpr/src/route/route_timing.h +++ b/vpr/src/route/route_timing.h @@ -13,6 +13,8 @@ #include "connection_router_interface.h" #include "heap_type.h" +extern bool f_router_debug; + int get_max_pins_per_net(); bool try_timing_driven_route(const t_router_opts& router_opts, diff --git a/vpr/src/route/router_lookahead_map.cpp b/vpr/src/route/router_lookahead_map.cpp index 0b9772e2e50..6aad0be6eb5 100644 --- a/vpr/src/route/router_lookahead_map.cpp +++ b/vpr/src/route/router_lookahead_map.cpp @@ -1,187 +1,95 @@ -/* - * The router lookahead provides an estimate of the cost from an intermediate node to the target node - * during directed (A*-like) routing. - * - * The VPR 7.0 lookahead (route/route_timing.c ==> get_timing_driven_expected_cost) lower-bounds the remaining delay and - * congestion by assuming that a minimum number of wires, of the same type as the current node being expanded, can be used - * to complete the route. While this method is efficient, it can run into trouble with architectures that use - * multiple interconnected wire types. - * - * The lookahead in this file performs undirected Dijkstra searches to evaluate many paths through the routing network, - * starting from all the different wire types in the routing architecture. This ensures the lookahead captures the - * effect of inter-wire connectivity. This information is then reduced into a delta_x delta_y based lookup table for - * reach source wire type (f_cost_map). This is used for estimates from CHANX/CHANY -> SINK nodes. See Section 3.2.4 - * in Oleg Petelin's MASc thesis (2016) for more discussion. - * - * To handle estimates starting from SOURCE/OPIN's the lookahead also creates a small side look-up table of the wire types - * which are reachable from each physical tile type's SOURCEs/OPINs (f_src_opin_reachable_wires). This is used for - * SRC/OPIN -> CHANX/CHANY estimates. - * - * In the case of SRC/OPIN -> SINK estimates the resuls from the two look-ups are added together (and the minimum taken - * if there are multiple possiblities). - */ +#include "router_lookahead_map.h" -#include #include #include -#include -#include "vpr_types.h" -#include "vpr_error.h" -#include "vpr_utils.h" + +#include "rr_node.h" +#include "router_lookahead_map_utils.h" +#include "router_lookahead_sampling.h" #include "globals.h" #include "vtr_math.h" -#include "vtr_log.h" -#include "vtr_assert.h" #include "vtr_time.h" #include "vtr_geometry.h" -#include "router_lookahead_map.h" -#include "rr_graph2.h" +#include "echo_files.h" #include "rr_graph.h" -#include "route_common.h" + #include "route_timing.h" +#include "route_common.h" #ifdef VTR_ENABLE_CAPNPROTO # include "capnp/serialize.h" -# include "map_lookahead.capnp.h" +# include "connection_map.capnp.h" # include "ndmatrix_serdes.h" # include "mmap_file.h" # include "serdes_utils.h" -#endif /* VTR_ENABLE_CAPNPROTO */ +#endif -/* we will profile delay/congestion using this many tracks for each wire type */ -#define MAX_TRACK_OFFSET 16 +#if defined(VPR_USE_TBB) +# include +# include +#endif -/* we're profiling routing cost over many tracks for each wire type, so we'll have many cost entries at each |dx|,|dy| offset. - * there are many ways to "boil down" the many costs at each offset to a single entry for a given (wire type, chan_type) combination -- - * we can take the smallest cost, the average, median, etc. This define selects the method we use. +/* we're profiling routing cost over many tracks for each wire type, so we'll + * have many cost entries at each |dx|,|dy| offset. There are many ways to + * "boil down" the many costs at each offset to a single entry for a given + * (wire type, chan_type) combination we can take the smallest cost, the + * average, median, etc. This define selects the method we use. + * + * NOTE: Currently, only SMALLEST is supported. + * * See e_representative_entry_method */ -#define REPRESENTATIVE_ENTRY_METHOD SMALLEST - -/* when a list of delay/congestion entries at a coordinate in Cost_Entry is boiled down to a single - * representative entry, this enum is passed-in to specify how that representative entry should be - * calculated */ -enum e_representative_entry_method { - FIRST = 0, //the first cost that was recorded - SMALLEST, //the smallest-delay cost recorded - AVERAGE, - GEOMEAN, - MEDIAN -}; +#define REPRESENTATIVE_ENTRY_METHOD util::SMALLEST -/* a class that stores delay/congestion information for a given relative coordinate during the Dijkstra expansion. - * since it stores multiple cost entries, it is later boiled down to a single representative cost entry to be stored - * in the final lookahead cost map */ -class Expansion_Cost_Entry { - public: - std::vector cost_vector; - - private: - Cost_Entry get_smallest_entry(); - Cost_Entry get_average_entry(); - Cost_Entry get_geomean_entry(); - Cost_Entry get_median_entry(); - - public: - void add_cost_entry(float add_delay, float add_congestion) { - Cost_Entry cost_entry(add_delay, add_congestion); - if (REPRESENTATIVE_ENTRY_METHOD == SMALLEST) { - /* taking the smallest-delay entry anyway, so no need to push back multple entries */ - if (this->cost_vector.empty()) { - this->cost_vector.push_back(cost_entry); - } else { - if (add_delay < this->cost_vector[0].delay) { - this->cost_vector[0] = cost_entry; - } - } - } else { - this->cost_vector.push_back(cost_entry); - } - } - void clear_cost_entries() { - this->cost_vector.clear(); - } +#define CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_MAPS - Cost_Entry get_representative_cost_entry(e_representative_entry_method method) { - float nan = std::numeric_limits::quiet_NaN(); - Cost_Entry entry(nan, nan); +#define X_OFFSET 2 +#define Y_OFFSET 2 - if (!cost_vector.empty()) { - switch (method) { - case FIRST: - entry = cost_vector[0]; - break; - case SMALLEST: - entry = this->get_smallest_entry(); - break; - case AVERAGE: - entry = this->get_average_entry(); - break; - case GEOMEAN: - entry = this->get_geomean_entry(); - break; - case MEDIAN: - entry = this->get_median_entry(); - break; - default: - break; - } - } - return entry; - } -}; +#define MAX_EXPANSION_LEVEL 1 -/* a class that represents an entry in the Dijkstra expansion priority queue */ -class PQ_Entry { - public: - RRNodeId rr_node; //index in device_ctx.rr_nodes that this entry represents - float cost; //the cost of the path to get to this node +// Don't continue storing a path after hitting a lower-or-same cost entry. +static constexpr bool BREAK_ON_MISS = false; - /* store backward delay, R and congestion info */ - float delay; - float R_upstream; - float congestion_upstream; - - PQ_Entry(RRNodeId set_rr_node, int /*switch_ind*/, float parent_delay, float parent_R_upstream, float parent_congestion_upstream, bool starting_node) { - this->rr_node = set_rr_node; - - auto& device_ctx = g_vpr_ctx.device(); - this->delay = parent_delay; - this->congestion_upstream = parent_congestion_upstream; - this->R_upstream = parent_R_upstream; - if (!starting_node) { - int cost_index = device_ctx.rr_nodes.node_cost_index(RRNodeId(set_rr_node)); - //this->delay += g_rr_nodes[set_rr_node].C() * (g_rr_switch_inf[switch_ind].R + 0.5*g_rr_nodes[set_rr_node].R()) + - // g_rr_switch_inf[switch_ind].Tdel; - - //FIXME going to use the delay data that the VPR7 lookahead uses. For some reason the delay calculation above calculates - // a value that's just a little smaller compared to what VPR7 lookahead gets. While the above calculation should be more accurate, - // I have found that it produces the same CPD results but with worse runtime. - // - // TODO: figure out whether anything's wrong with the calculation above and use that instead. If not, add the other - // terms like T_quadratic and R_upstream to the calculation below (they are == 0 or UNDEFINED for buffered archs I think) +// Distance penalties filling are calculated based on available samples, but can be adjusted with this factor. +static constexpr float PENALTY_FACTOR = 1.f; +static constexpr float PENALTY_MIN = 1e-12f; - //NOTE: We neglect the T_quadratic and C_load terms and Switch R, so this lookahead is likely - // less accurate on unbuffered (e.g. pass-gate) architectures +static constexpr int MIN_PATH_COUNT = 1000; - this->delay += device_ctx.rr_indexed_data[cost_index].T_linear; +template +static std::pair run_dijkstra(int start_node_ind, + std::vector* node_expanded, + std::vector* paths, + RoutingCosts* routing_costs); - this->congestion_upstream += device_ctx.rr_indexed_data[cost_index].base_cost; - } +// also known as the L1 norm +static int manhattan_distance(const vtr::Point& a, const vtr::Point& b) { + return abs(b.x() - a.x()) + abs(b.y() - a.y()); +} - if (this->delay < 0) { - VTR_LOG("NEGATIVE DELAY!\n"); - } +template +constexpr const T& clamp(const T& v, const T& lo, const T& hi) { + return std::min(std::max(v, lo), hi); +} - /* set the cost of this node */ - this->cost = this->delay; +template +static vtr::Point closest_point_in_rect(const vtr::Rect& r, const vtr::Point& p) { + if (r.empty()) { + return vtr::Point(0, 0); + } else { + return vtr::Point(clamp(p.x(), r.xmin(), r.xmax() - 1), + clamp(p.y(), r.ymin(), r.ymax() - 1)); } +} - bool operator<(const PQ_Entry& obj) const { - /* inserted into max priority queue so want queue entries with a lower cost to be greater */ - return (this->cost > obj.cost); - } -}; +// deltas calculation +static void get_xy_deltas(const RRNodeId from_node, const RRNodeId to_node, int* delta_x, int* delta_y); +static void adjust_rr_position(const RRNodeId rr, int& x, int& y); +static void adjust_rr_pin_position(const RRNodeId rr, int& x, int& y); +static void adjust_rr_wire_position(const RRNodeId rr, int& x, int& y); +static void adjust_rr_src_sink_position(const RRNodeId rr, int& x, int& y); +// additional lookups for IPIN/OPIN missing delays struct t_reachable_wire_inf { e_rr_type wire_rr_type; int wire_seg_index; @@ -191,120 +99,335 @@ struct t_reachable_wire_inf { float delay; }; -/* used during Dijkstra expansion to store delay/congestion info lists for each relative coordinate for a given segment and channel type. - * the list at each coordinate is later boiled down to a single representative cost entry to be stored in the final cost map */ -typedef vtr::Matrix t_routing_cost_map; //[0..device_ctx.grid.width()-1][0..device_ctx.grid.height()-1] - -typedef std::vector>> t_src_opin_reachable_wires; //[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1][wire_seg_index] - // ^ ^ ^ - // | | | - // physical block type index | Reachable wire info - // | - // SOURCE/OPIN ptc - -struct t_dijkstra_data { - /* a list of boolean flags (one for each rr node) to figure out if a certain node has already been expanded */ - vtr::vector node_expanded; - /* for each node keep a list of the cost with which that node has been visited (used to determine whether to push - * a candidate node onto the expansion queue */ - vtr::vector node_visited_costs; - /* a priority queue for expansion */ - std::priority_queue pq; -}; +typedef std::vector>> t_src_opin_delays; //[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1][wire_seg_index] + // ^ ^ ^ + // | | | + // physical block type index | Reachable wire info + // | + // SOURCE/OPIN ptc -/******** File-Scope Variables ********/ +typedef std::vector>> t_chan_ipins_delays; //[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1][wire_seg_index] + // ^ ^ ^ + // | | | + // physical block type index | Wire to IPIN segment info + // | + // SINK/IPIN ptc -constexpr int DIRECT_CONNECT_SPECIAL_SEG_TYPE = -1; +//Look-up table from SOURCE/OPIN to CHANX/CHANY of various types +t_src_opin_delays f_src_opin_delays; -//Look-up table from CHANX/CHANY (to SINKs) for various distances -t_wire_cost_map f_wire_cost_map; +//Look-up table from CHANX/CHANY to SINK/IPIN of various types +t_chan_ipins_delays f_chan_ipins_delays; -//Look-up table from SOURCE/OPIN to CHANX/CHANY of various types -t_src_opin_reachable_wires f_src_opin_reachable_wires; +constexpr int DIRECT_CONNECT_SPECIAL_SEG_TYPE = -1; -/******** File-Scope Functions ********/ -Cost_Entry get_wire_cost_entry(e_rr_type rr_type, int seg_index, int delta_x, int delta_y); -static void compute_router_wire_lookahead(const std::vector& segment_inf); static void compute_router_src_opin_lookahead(); -static vtr::Point pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point start); -void dijkstra_flood_to_wires(int itile, RRNodeId inode, t_src_opin_reachable_wires& src_opin_reachable_wires); - -/* returns index of a node from which to start routing */ -static RRNodeId get_start_node(int start_x, int start_y, int target_x, int target_y, t_rr_type rr_type, int seg_index, int track_offset); -/* runs Dijkstra's algorithm from specified node until all nodes have been visited. Each time a pin is visited, the delay/congestion information - * to that pin is stored is added to an entry in the routing_cost_map */ -static void run_dijkstra(RRNodeId start_node, int start_x, int start_y, t_routing_cost_map& routing_cost_map, t_dijkstra_data* data); -/* iterates over the children of the specified node and selectively pushes them onto the priority queue */ -static void expand_dijkstra_neighbours(PQ_Entry parent_entry, vtr::vector& node_visited_costs, vtr::vector& node_expanded, std::priority_queue& pq); -/* sets the lookahead cost map entries based on representative cost entries from routing_cost_map */ -static void set_lookahead_map_costs(int segment_index, e_rr_type chan_type, t_routing_cost_map& routing_cost_map); -/* fills in missing lookahead map entries by copying the cost of the closest valid entry */ -static void fill_in_missing_lookahead_entries(int segment_index, e_rr_type chan_type); -/* returns a cost entry in the f_wire_cost_map that is near the specified coordinates (and preferably towards (0,0)) */ -static Cost_Entry get_nearby_cost_entry(int x, int y, int segment_index, int chan_index); -/* returns the absolute delta_x and delta_y offset required to reach to_node from from_node */ -static void get_xy_deltas(const RRNodeId from_node, const RRNodeId to_node, int* delta_x, int* delta_y); -static void adjust_rr_position(const RRNodeId rr, int& x, int& y); -static void adjust_rr_pin_position(const RRNodeId rr, int& x, int& y); -static void adjust_rr_wire_position(const RRNodeId rr, int& x, int& y); -static void adjust_rr_src_sink_position(const RRNodeId rr, int& x, int& y); +static void compute_router_chan_ipin_lookahead(); +static vtr::Point pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point prev); +static void dijkstra_flood_to_wires(int itile, RRNodeId node, t_src_opin_delays& src_opin_delays); +static void dijkstra_flood_to_ipins(RRNodeId node, t_chan_ipins_delays& chan_ipins_delays); + +// build the segment map +void CostMap::build_segment_map() { + const auto& device_ctx = g_vpr_ctx.device(); + segment_map_.resize(device_ctx.rr_nodes.size()); + for (size_t i = 0; i < segment_map_.size(); ++i) { + auto& from_node = device_ctx.rr_nodes[i]; + + int from_cost_index = from_node.cost_index(); + int from_seg_index = device_ctx.rr_indexed_data[from_cost_index].seg_index; -static void print_wire_cost_map(const std::vector& segment_inf); -static void print_router_cost_map(const t_routing_cost_map& router_cost_map); + segment_map_[i] = from_seg_index; + } +} -/******** Interface class member function definitions ********/ -float MapLookahead::get_expected_cost(int current_node, int target_node, const t_conn_cost_params& params, float /*R_upstream*/) const { - auto& device_ctx = g_vpr_ctx.device(); - auto& rr_graph = device_ctx.rr_nodes; +// resize internal data structures +void CostMap::set_counts(size_t seg_count) { + cost_map_.clear(); + offset_.clear(); + penalty_.clear(); + cost_map_.resize({2, seg_count}); + offset_.resize({2, seg_count}); + penalty_.resize({2, seg_count}); + seg_count_ = seg_count; +} - RRNodeId current(current_node); - RRNodeId target(target_node); +// cached node -> segment map +int CostMap::node_to_segment(int from_node_ind) const { + return segment_map_[from_node_ind]; +} - t_rr_type rr_type = rr_graph.node_type(current); +static util::Cost_Entry penalize(const util::Cost_Entry& entry, int distance, float penalty) { + penalty = std::max(penalty, PENALTY_MIN); + return util::Cost_Entry(entry.delay + distance * penalty * PENALTY_FACTOR, + entry.congestion, entry.fill); +} - if (rr_type == CHANX || rr_type == CHANY || rr_type == SOURCE || rr_type == OPIN) { - return get_lookahead_map_cost(current, target, params.criticality); - } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */ - return (device_ctx.rr_indexed_data[SINK_COST_INDEX].base_cost); - } else { /* Change this if you want to investigate route-throughs */ - return (0.); +// get a cost entry for a segment type, connection box type, and offset +util::Cost_Entry CostMap::find_cost(int from_seg_index, e_rr_type rr_type, int delta_x, int delta_y) const { + int chan_index = 0; + if (rr_type == CHANY) { + chan_index = 1; + } + + VTR_ASSERT(from_seg_index >= 0 && from_seg_index < (ssize_t)offset_.size()); + const auto& cost_map = cost_map_[chan_index][from_seg_index]; + if (cost_map.dim_size(0) == 0 || cost_map.dim_size(1) == 0) { + return util::Cost_Entry(); } + + vtr::Point coord(delta_x - offset_[chan_index][from_seg_index].first, + delta_y - offset_[chan_index][from_seg_index].second); + vtr::Rect bounds(0, 0, cost_map.dim_size(0), cost_map.dim_size(1)); + auto closest = closest_point_in_rect(bounds, coord); + auto cost = cost_map_[chan_index][from_seg_index][closest.x()][closest.y()]; + float penalty = penalty_[chan_index][from_seg_index]; + auto distance = manhattan_distance(closest, coord); + return penalize(cost, distance, penalty); } -void MapLookahead::compute(const std::vector& segment_inf) { - compute_router_lookahead(segment_inf); +// set the cost map for a segment type and connection box type, filling holes +void CostMap::set_cost_map(const RoutingCosts& delay_costs, const RoutingCosts& base_costs) { + // calculate the bounding boxes + vtr::Matrix> bounds({2, seg_count_}); + for (const auto& entry : delay_costs) { + bounds[entry.first.chan_index][entry.first.seg_index].expand_bounding_box(vtr::Rect(entry.first.delta)); + } + for (const auto& entry : base_costs) { + bounds[entry.first.chan_index][entry.first.seg_index].expand_bounding_box(vtr::Rect(entry.first.delta)); + } + + // store bounds + for (size_t chan = 0; chan < 2; chan++) { + for (size_t seg = 0; seg < seg_count_; seg++) { + const auto& chan_seg_bounds = bounds[chan][seg]; + if (chan_seg_bounds.empty()) { + // Didn't find any sample routes, so routing isn't possible between these segment/connection box types. + offset_[chan][seg] = std::make_pair(0, 0); + cost_map_[chan][seg] = vtr::NdMatrix( + {size_t(0), size_t(0)}); + continue; + } else { + offset_[chan][seg] = std::make_pair(chan_seg_bounds.xmin(), chan_seg_bounds.ymin()); + cost_map_[chan][seg] = vtr::NdMatrix( + {size_t(chan_seg_bounds.width()), size_t(chan_seg_bounds.height())}); + } + } + } + + // store entries into the matrices + for (const auto& entry : delay_costs) { + int seg = entry.first.seg_index; + int chan = entry.first.chan_index; + const auto& chan_seg_bounds = bounds[chan][seg]; + int x = entry.first.delta.x() - chan_seg_bounds.xmin(); + int y = entry.first.delta.y() - chan_seg_bounds.ymin(); + cost_map_[chan][seg][x][y].delay = entry.second; + } + for (const auto& entry : base_costs) { + int seg = entry.first.seg_index; + int chan = entry.first.chan_index; + const auto& chan_seg_bounds = bounds[chan][seg]; + int x = entry.first.delta.x() - chan_seg_bounds.xmin(); + int y = entry.first.delta.y() - chan_seg_bounds.ymin(); + cost_map_[chan][seg][x][y].congestion = entry.second; + } + + // fill the holes + for (size_t chan = 0; chan < 2; chan++) { + for (size_t seg = 0; seg < seg_count_; seg++) { + penalty_[chan][seg] = std::numeric_limits::infinity(); + const auto& chan_seg_bounds = bounds[chan][seg]; + if (chan_seg_bounds.empty()) { + continue; + } + auto& matrix = cost_map_[chan][seg]; + + // calculate delay penalty + float min_delay = std::numeric_limits::infinity(), max_delay = 0.f; + vtr::Point min_location(0, 0), max_location(0, 0); + for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) { + for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) { + util::Cost_Entry& cost_entry = matrix[ix][iy]; + if (cost_entry.valid()) { + if (cost_entry.delay < min_delay) { + min_delay = cost_entry.delay; + min_location = vtr::Point(ix, iy); + } + if (cost_entry.delay > max_delay) { + max_delay = cost_entry.delay; + max_location = vtr::Point(ix, iy); + } + } + } + } + float delay_penalty = (max_delay - min_delay) / static_cast(std::max(1, manhattan_distance(max_location, min_location))); + penalty_[chan][seg] = delay_penalty; + + // find missing cost entries and fill them in by copying a nearby cost entry + std::vector> missing; + bool couldnt_fill = false; + auto shifted_bounds = vtr::Rect(0, 0, chan_seg_bounds.width(), chan_seg_bounds.height()); + int max_fill = 0; + for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) { + for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) { + util::Cost_Entry& cost_entry = matrix[ix][iy]; + if (!cost_entry.valid()) { + // maximum search radius + util::Cost_Entry filler; + int distance; + std::tie(filler, distance) = get_nearby_cost_entry(matrix, ix, iy, shifted_bounds); + if (filler.valid()) { + missing.push_back(std::make_tuple(ix, iy, penalize(filler, distance, delay_penalty))); + max_fill = std::max(max_fill, distance); + } else { + couldnt_fill = true; + } + } + } + if (couldnt_fill) { + // give up trying to fill an empty matrix + break; + } + } + + if (!couldnt_fill) { + VTR_LOG("At %d: max_fill = %d, delay_penalty = %e\n", seg, max_fill, delay_penalty); + } + + // write back the missing entries + for (auto& xy_entry : missing) { + matrix[std::get<0>(xy_entry)][std::get<1>(xy_entry)] = std::get<2>(xy_entry); + } + + if (couldnt_fill) { + VTR_LOG_WARN("Couldn't fill holes in the cost matrix for %d -> %ld, %d x %d bounding box\n", + chan, seg, chan_seg_bounds.width(), chan_seg_bounds.height()); + for (unsigned y = 0; y < matrix.dim_size(1); y++) { + for (unsigned x = 0; x < matrix.dim_size(0); x++) { + VTR_ASSERT(!matrix[x][y].valid()); + } + } + } + } + } } -void MapLookahead::read(const std::string& file) { - read_router_lookahead(file); +// prints an ASCII diagram of each cost map for a segment type (debug) +// o => above average +// . => at or below average +// * => invalid (missing) +void CostMap::print(int iseg) const { + for (size_t chan_index = 0; + chan_index < 2; + chan_index++) { + auto& matrix = cost_map_[chan_index][iseg]; + if (matrix.dim_size(0) == 0 || matrix.dim_size(1) == 0) { + VTR_LOG("cost EMPTY for chan_index = %lu\n", chan_index); + continue; + } + VTR_LOG("cost for chan_index = %lu (%zu, %zu)\n", chan_index, matrix.dim_size(0), matrix.dim_size(1)); + double sum = 0.0; + for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) { + for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) { + const auto& entry = matrix[ix][iy]; + if (entry.valid()) { + sum += entry.delay; + } + } + } + double avg = sum / ((double)matrix.dim_size(0) * (double)matrix.dim_size(1)); + for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) { + for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) { + const auto& entry = matrix[ix][iy]; + if (!entry.valid()) { + VTR_LOG("*"); + } else if (entry.delay * 4 > avg * 5) { + VTR_LOG("O"); + } else if (entry.delay > avg) { + VTR_LOG("o"); + } else if (entry.delay * 4 > avg * 3) { + VTR_LOG("."); + } else { + VTR_LOG(" "); + } + } + VTR_LOG("\n"); + } + } +} - //Next, compute which wire types are accessible (and the cost to reach them) - //from the different physical tile type's SOURCEs & OPINs - compute_router_src_opin_lookahead(); +// list segment type and connection box type pairs that have empty cost maps (debug) +std::vector> CostMap::list_empty() const { + std::vector> results; + for (int chan_index = 0; chan_index < (int)cost_map_.dim_size(1); chan_index++) { + for (int iseg = 0; iseg < (int)cost_map_.dim_size(0); iseg++) { + auto& matrix = cost_map_[chan_index][iseg]; + if (matrix.dim_size(0) == 0 || matrix.dim_size(1) == 0) results.push_back(std::make_pair(chan_index, iseg)); + } + } + return results; } -void MapLookahead::write(const std::string& file) const { - write_router_lookahead(file); +static void assign_min_entry(util::Cost_Entry* dst, const util::Cost_Entry& src) { + if (src.delay < dst->delay) { + dst->delay = src.delay; + } + if (src.congestion < dst->congestion) { + dst->congestion = src.congestion; + } +} + +// find the minimum cost entry from the nearest manhattan distance neighbor +std::pair CostMap::get_nearby_cost_entry(const vtr::NdMatrix& matrix, + int cx, + int cy, + const vtr::Rect& bounds) { + // spiral around (cx, cy) looking for a nearby entry + bool in_bounds = bounds.contains(vtr::Point(cx, cy)); + if (!in_bounds) { + return std::make_pair(util::Cost_Entry(), 0); + } + int n = 0; + util::Cost_Entry fill(matrix[cx][cy]); + fill.fill = true; + + while (in_bounds && !fill.valid()) { + n++; + in_bounds = false; + util::Cost_Entry min_entry; + for (int ox = -n; ox <= n; ox++) { + int x = cx + ox; + int oy = n - abs(ox); + int yp = cy + oy; + int yn = cy - oy; + if (bounds.contains(vtr::Point(x, yp))) { + assign_min_entry(&min_entry, matrix[x][yp]); + in_bounds = true; + } + if (bounds.contains(vtr::Point(x, yn))) { + assign_min_entry(&min_entry, matrix[x][yn]); + in_bounds = true; + } + } + if (!std::isfinite(fill.delay)) { + fill.delay = min_entry.delay; + } + if (!std::isfinite(fill.congestion)) { + fill.congestion = min_entry.congestion; + } + } + return std::make_pair(fill, n); } -/******** Function Definitions ********/ -/* queries the lookahead_map (should have been computed prior to routing) to get the expected cost - * from the specified source to the specified target */ -float get_lookahead_map_cost(RRNodeId from_node, RRNodeId to_node, float criticality_fac) { +std::pair MapLookahead::get_src_opin_delays(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const { auto& device_ctx = g_vpr_ctx.device(); auto& rr_graph = device_ctx.rr_nodes; - int delta_x, delta_y; - get_xy_deltas(from_node, to_node, &delta_x, &delta_y); - delta_x = abs(delta_x); - delta_y = abs(delta_y); - - float expected_cost = std::numeric_limits::infinity(); - e_rr_type from_type = rr_graph.node_type(from_node); if (from_type == SOURCE || from_type == OPIN) { //When estimating costs from a SOURCE/OPIN we look-up to find which wire types (and the - //cost to reach them) in f_src_opin_reachable_wires. Once we know what wire types are + //cost to reach them) in f_src_opin_delays. Once we know what wire types are //reachable, we query the f_wire_cost_map (i.e. the wire lookahead) to get the final //delay to reach the sink. @@ -313,7 +436,7 @@ float get_lookahead_map_cost(RRNodeId from_node, RRNodeId to_node, float critica auto from_ptc = rr_graph.node_ptc_num(from_node); - if (f_src_opin_reachable_wires[tile_index][from_ptc].empty()) { + if (f_src_opin_delays[tile_index][from_ptc].empty()) { //During lookahead profiling we were unable to find any wires which connected //to this PTC. // @@ -324,1062 +447,1129 @@ float get_lookahead_map_cost(RRNodeId from_node, RRNodeId to_node, float critica //Such RR graphs are of course unroutable, but that should be determined by the //router. So just return an arbitrary value here rather than error. - //We choose to return the largest (non-infinite) value possible, but scaled - //down by a large factor to maintain some dynaimc range in case this value ends - //up being processed (e.g. by the timing analyzer). - // - //The cost estimate should still be *extremely* large compared to a typical delay, and - //so should ensure that the router de-prioritizes exploring this path, but does not - //forbid the router from trying. - expected_cost = std::numeric_limits::max() / 1e12; + return std::pair(0.f, 0.f); } else { //From the current SOURCE/OPIN we look-up the wiretypes which are reachable //and then add the estimates from those wire types for the distance of interest. //If there are multiple options we use the minimum value. - for (const auto& kv : f_src_opin_reachable_wires[tile_index][from_ptc]) { + + float delay = 0; + float congestion = 0; + float expected_cost = std::numeric_limits::infinity(); + + for (const auto& kv : f_src_opin_delays[tile_index][from_ptc]) { const t_reachable_wire_inf& reachable_wire_inf = kv.second; - Cost_Entry wire_cost_entry; + util::Cost_Entry cost_entry; if (reachable_wire_inf.wire_rr_type == SINK) { //Some pins maybe reachable via a direct (OPIN -> IPIN) connection. //In the lookahead, we treat such connections as 'special' wire types //with no delay/congestion cost - wire_cost_entry.delay = 0; - wire_cost_entry.congestion = 0; + cost_entry.delay = 0; + cost_entry.congestion = 0; } else { //For an actual accessible wire, we query the wire look-up to get it's //delay and congestion cost estimates - wire_cost_entry = get_wire_cost_entry(reachable_wire_inf.wire_rr_type, reachable_wire_inf.wire_seg_index, delta_x, delta_y); + cost_entry = cost_map_.find_cost(reachable_wire_inf.wire_seg_index, reachable_wire_inf.wire_rr_type, delta_x, delta_y); } - float this_cost = (criticality_fac) * (reachable_wire_inf.congestion + wire_cost_entry.congestion) - + (1. - criticality_fac) * (reachable_wire_inf.delay + wire_cost_entry.delay); + float this_cost = (criticality_fac) * (reachable_wire_inf.delay + cost_entry.delay) + + (1. - criticality_fac) * (reachable_wire_inf.congestion + cost_entry.congestion); - expected_cost = std::min(expected_cost, this_cost); + if (this_cost < expected_cost) { + delay = reachable_wire_inf.delay; + congestion = reachable_wire_inf.congestion; + } } - } - - VTR_ASSERT_SAFE_MSG(std::isfinite(expected_cost), - vtr::string_fmt("Lookahead failed to estimate cost from %s: %s", - rr_node_arch_name(size_t(from_node)).c_str(), - describe_rr_node(size_t(from_node)).c_str()) - .c_str()); - - } else { - VTR_ASSERT_SAFE(from_type == CHANX || from_type == CHANY); - //When estimating costs from a wire, we directly look-up the result in the wire lookahead (f_wire_cost_map) - - int from_cost_index = rr_graph.node_cost_index(from_node); - int from_seg_index = device_ctx.rr_indexed_data[from_cost_index].seg_index; - - VTR_ASSERT(from_seg_index >= 0); - /* now get the expected cost from our lookahead map */ - Cost_Entry cost_entry = get_wire_cost_entry(from_type, from_seg_index, delta_x, delta_y); - - float expected_delay = cost_entry.delay; - float expected_congestion = cost_entry.congestion; - - expected_cost = criticality_fac * expected_delay + (1.0 - criticality_fac) * expected_congestion; + return std::pair(delay, congestion); + } - VTR_ASSERT_SAFE_MSG(std::isfinite(expected_cost), + VTR_ASSERT_SAFE_MSG(false, vtr::string_fmt("Lookahead failed to estimate cost from %s: %s", rr_node_arch_name(size_t(from_node)).c_str(), describe_rr_node(size_t(from_node)).c_str()) .c_str()); } - return expected_cost; + return std::pair(0.f, 0.f); } -Cost_Entry get_wire_cost_entry(e_rr_type rr_type, int seg_index, int delta_x, int delta_y) { - VTR_ASSERT_SAFE(rr_type == CHANX || rr_type == CHANY); - - int chan_index = 0; - if (rr_type == CHANY) { - chan_index = 1; +// derive a cost from the map between two nodes +float MapLookahead::get_map_cost(int from_node_ind, + int to_node_ind, + float criticality_fac) const { + if (from_node_ind == to_node_ind) { + return 0.f; } - VTR_ASSERT_SAFE(delta_x < (int)f_wire_cost_map.dim_size(2)); - VTR_ASSERT_SAFE(delta_y < (int)f_wire_cost_map.dim_size(3)); - - return f_wire_cost_map[chan_index][seg_index][delta_x][delta_y]; -} - -/* Computes the lookahead map to be used by the router. If a map was computed prior to this, a new one will not be computed again. - * The rr graph must have been built before calling this function. */ -void compute_router_lookahead(const std::vector& segment_inf) { - vtr::ScopedStartFinishTimer timer("Computing router lookahead map"); - - //First compute the delay map when starting from the various wire types - //(CHANX/CHANY)in the routing architecture - compute_router_wire_lookahead(segment_inf); - - //Next, compute which wire types are accessible (and the cost to reach them) - //from the different physical tile type's SOURCEs & OPINs - compute_router_src_opin_lookahead(); -} - -static void compute_router_wire_lookahead(const std::vector& segment_inf) { - vtr::ScopedStartFinishTimer timer("Computing wire lookahead"); - auto& device_ctx = g_vpr_ctx.device(); - auto& grid = device_ctx.grid; auto& rr_graph = device_ctx.rr_nodes; - //Re-allocate - f_wire_cost_map = t_wire_cost_map({2, segment_inf.size(), device_ctx.grid.width(), device_ctx.grid.height()}); + auto from_node_type = device_ctx.rr_nodes[to_node_ind].type(); - int longest_length = 0; - for (const auto& seg_inf : segment_inf) { - longest_length = std::max(longest_length, seg_inf.length); - } + RRNodeId to_node(to_node_ind); + RRNodeId from_node(from_node_ind); - //Start sampling at the lower left non-corner - int ref_x = 1; - int ref_y = 1; - - //Sample from locations near the reference location (to capture maximum distance paths) - //Also sample from locations at least the longest wire length away from the edge (to avoid - //edge effects for shorter distances) - std::vector ref_increments = {0, 1, - longest_length, longest_length + 1}; - - //Uniquify the increments (avoid sampling the same locations repeatedly if they happen to - //overlap) - std::sort(ref_increments.begin(), ref_increments.end()); - ref_increments.erase(std::unique(ref_increments.begin(), ref_increments.end()), ref_increments.end()); - - //Upper right non-corner - int target_x = device_ctx.grid.width() - 2; - int target_y = device_ctx.grid.height() - 2; - - //Profile each wire segment type - for (int iseg = 0; iseg < int(segment_inf.size()); iseg++) { - //First try to pick good representative sample locations for each type - std::map> sample_nodes; - for (e_rr_type chan_type : {CHANX, CHANY}) { - for (int ref_inc : ref_increments) { - int sample_x = ref_x + ref_inc; - int sample_y = ref_y + ref_inc; - - if (sample_x >= int(grid.width())) continue; - if (sample_y >= int(grid.height())) continue; - - for (int track_offset = 0; track_offset < MAX_TRACK_OFFSET; track_offset += 2) { - /* get the rr node index from which to start routing */ - RRNodeId start_node = get_start_node(sample_x, sample_y, - target_x, target_y, //non-corner upper right - chan_type, iseg, track_offset); - - if (!start_node) { - continue; - } + int dx, dy; + get_xy_deltas(from_node, to_node, &dx, &dy); - sample_nodes[chan_type].push_back(RRNodeId(start_node)); - } - } - } + float extra_delay, extra_congestion; + std::tie(extra_delay, extra_congestion) = this->get_src_opin_delays(from_node, dx, dy, criticality_fac); - //If we failed to find any representative sample locations, search exhaustively - // - //This is to ensure we sample 'unusual' wire types which may not exist in all channels - //(e.g. clock routing) - for (e_rr_type chan_type : {CHANX, CHANY}) { - if (!sample_nodes[chan_type].empty()) continue; + int from_seg_index = cost_map_.node_to_segment(from_node_ind); + util::Cost_Entry cost_entry = cost_map_.find_cost(from_seg_index, from_node_type, dx, dy); - //Try an exhaustive search to find a suitable sample point - for (int inode = 0; inode < int(device_ctx.rr_nodes.size()); ++inode) { - auto rr_node = RRNodeId(inode); - auto rr_type = rr_graph.node_type(rr_node); - if (rr_type != chan_type) continue; + if (!cost_entry.valid()) { + // there is no route + VTR_LOGV_DEBUG(f_router_debug, + "Not connected %d (%s, %d) -> %d (%s)\n", + from_node_ind, device_ctx.rr_nodes[from_node_ind].type_string(), from_seg_index, + to_node_ind, device_ctx.rr_nodes[to_node_ind].type_string()); + return std::numeric_limits::infinity(); + } - int cost_index = rr_graph.node_cost_index(rr_node); - VTR_ASSERT(cost_index != OPEN); + auto to_tile_type = device_ctx.grid[rr_graph.node_xlow(to_node)][rr_graph.node_ylow(to_node)].type; + auto to_tile_index = to_tile_type->index; - int seg_index = device_ctx.rr_indexed_data[cost_index].seg_index; + auto to_ptc = rr_graph.node_ptc_num(to_node); - if (seg_index == iseg) { - sample_nodes[chan_type].push_back(RRNodeId(inode)); - } + float site_pin_delay = std::numeric_limits::infinity(); + if (f_chan_ipins_delays[to_tile_index].size() != 0) { + for (const auto& kv : f_chan_ipins_delays[to_tile_index][to_ptc]) { + const t_reachable_wire_inf& reachable_wire_inf = kv.second; - if (sample_nodes[chan_type].size() >= ref_increments.size()) { - break; - } - } + float this_delay = reachable_wire_inf.delay; + site_pin_delay = std::min(this_delay, site_pin_delay); } + } - //Finally, now that we have a list of sample locations, run a Djikstra flood from - //each sample location to profile the routing network from this type - - t_dijkstra_data dijkstra_data; - t_routing_cost_map routing_cost_map({device_ctx.grid.width(), device_ctx.grid.height()}); - - for (e_rr_type chan_type : {CHANX, CHANY}) { - if (sample_nodes[chan_type].empty()) { - VTR_LOG_WARN("Unable to find any sample location for segment %s type '%s' (length %d)\n", - rr_node_typename[chan_type], - segment_inf[iseg].name.c_str(), - segment_inf[iseg].length); - } else { - //reset cost for this segment - routing_cost_map.fill(Expansion_Cost_Entry()); - - for (RRNodeId sample_node : sample_nodes[chan_type]) { - int sample_x = rr_graph.node_xlow(sample_node); - int sample_y = rr_graph.node_ylow(sample_node); - - if (rr_graph.node_direction(sample_node) == DEC_DIRECTION) { - sample_x = rr_graph.node_xhigh(sample_node); - sample_y = rr_graph.node_yhigh(sample_node); - } - - run_dijkstra(sample_node, - sample_x, - sample_y, - routing_cost_map, - &dijkstra_data); - } - - if (false) print_router_cost_map(routing_cost_map); - - /* boil down the cost list in routing_cost_map at each coordinate to a representative cost entry and store it in the lookahead - * cost map */ - set_lookahead_map_costs(iseg, chan_type, routing_cost_map); - - /* fill in missing entries in the lookahead cost map by copying the closest cost entries (cost map was computed based on - * a reference coordinate > (0,0) so some entries that represent a cross-chip distance have not been computed) */ - fill_in_missing_lookahead_entries(iseg, chan_type); - } - } + float expected_delay = cost_entry.delay + extra_delay; + float expected_congestion = cost_entry.congestion + extra_congestion; + + expected_delay += site_pin_delay; + + float expected_cost = criticality_fac * expected_delay + (1.0 - criticality_fac) * expected_congestion; + + VTR_LOGV_DEBUG(f_router_debug, "Requested lookahead from node \n\t%s to \n\t%s\n", + describe_rr_node(from_node_ind).c_str(), + describe_rr_node(to_node_ind).c_str()); + const std::string& segment_name = device_ctx.segment_inf[from_seg_index].name; + VTR_LOGV_DEBUG(f_router_debug, "Lookahead returned %s (%d) with distance (%zd, %zd)\n", + segment_name.c_str(), from_seg_index, + dx, dy); + VTR_LOGV_DEBUG(f_router_debug, "Lookahead delay: %g\n", expected_delay); + VTR_LOGV_DEBUG(f_router_debug, "Lookahead congestion: %g\n", expected_congestion); + VTR_LOGV_DEBUG(f_router_debug, "Criticality: %g\n", criticality_fac); + VTR_LOGV_DEBUG(f_router_debug, "Lookahead cost: %g\n", expected_cost); + VTR_LOGV_DEBUG(f_router_debug, "Site pin delay: %g\n", site_pin_delay); + + /* XXX: temporarily disable this for further debugging as it appears that some costs are set to infinity + if (!std::isfinite(expected_cost) { + VTR_LOG_ERROR("infinite cost for segment %d at (%d, %d)\n", from_seg_index, (int)dx, (int)dy); + VTR_ASSERT(0); + } */ + + if (expected_cost < 0.f) { + VTR_LOG_ERROR("negative cost for segment %d at (%d, %d)\n", from_seg_index, (int)dx, (int)dy); + VTR_ASSERT(0); } - if (false) print_wire_cost_map(segment_inf); + return expected_cost; } -static void compute_router_src_opin_lookahead() { - vtr::ScopedStartFinishTimer timer("Computing src/opin lookahead"); +// add a best cost routing path from start_node_ind to node_ind to routing costs +template +static bool add_paths(int start_node_ind, + Entry current, + const std::vector& paths, + RoutingCosts* routing_costs) { auto& device_ctx = g_vpr_ctx.device(); auto& rr_graph = device_ctx.rr_nodes; - f_src_opin_reachable_wires.clear(); + int node_ind = current.rr_node_ind; + RRNodeId node(node_ind); - f_src_opin_reachable_wires.resize(device_ctx.physical_tile_types.size()); + bool new_sample_found = false; - std::vector rr_nodes_at_loc; + auto to_tile_type = device_ctx.grid[rr_graph.node_xlow(node)][rr_graph.node_ylow(node)].type; + auto to_tile_index = to_tile_type->index; - //We assume that the routing connectivity of each instance of a physical tile is the same, - //and so only measure one instance of each type - for (size_t itile = 0; itile < device_ctx.physical_tile_types.size(); ++itile) { - for (e_rr_type rr_type : {SOURCE, OPIN}) { - vtr::Point sample_loc(-1, -1); + auto to_ptc = rr_graph.node_ptc_num(node); - size_t num_sampled_locs = 0; - bool ptcs_with_no_reachable_wires = true; - while (ptcs_with_no_reachable_wires) { //Haven't found wire connected to ptc - ptcs_with_no_reachable_wires = false; + float site_pin_delay = std::numeric_limits::infinity(); + if (f_chan_ipins_delays[to_tile_index].size() != 0) { + for (const auto& kv : f_chan_ipins_delays[to_tile_index][to_ptc]) { + const t_reachable_wire_inf& reachable_wire_inf = kv.second; - sample_loc = pick_sample_tile(&device_ctx.physical_tile_types[itile], sample_loc); + float this_delay = reachable_wire_inf.delay; + site_pin_delay = std::min(this_delay, site_pin_delay); + } + } - if (sample_loc.x() == -1 && sample_loc.y() == -1) { - //No untried instances of the current tile type left - VTR_LOG_WARN("Found no %ssample locations for %s in %s\n", - (num_sampled_locs == 0) ? "" : "more ", - rr_node_typename[rr_type], - device_ctx.physical_tile_types[itile].name); - break; - } + // reconstruct the path + std::vector path; + for (int i = paths[node_ind].parent; i != start_node_ind; i = paths[i].parent) { + VTR_ASSERT(i != -1); + path.push_back(i); + } + path.push_back(start_node_ind); - //VTR_LOG("Sampling %s at (%d,%d)\n", device_ctx.physical_tile_types[itile].name, sample_loc.x(), sample_loc.y()); + current.adjust_Tsw(-site_pin_delay); - rr_nodes_at_loc.clear(); + // add each node along the path subtracting the incremental costs from the current costs + Entry start_to_here(start_node_ind, UNDEFINED, nullptr); + int parent = start_node_ind; + for (auto it = path.rbegin(); it != path.rend(); it++) { + RRNodeId this_node(*it); + auto& here = device_ctx.rr_nodes[*it]; + int seg_index = device_ctx.rr_indexed_data[here.cost_index()].seg_index; - get_rr_node_indices(device_ctx.rr_node_indices, sample_loc.x(), sample_loc.y(), rr_type, &rr_nodes_at_loc); - for (int inode : rr_nodes_at_loc) { - if (inode < 0) continue; + auto chan_type = rr_graph.node_type(node); - RRNodeId node_id(inode); + int ichan = 0; + if (chan_type == CHANY) { + ichan = 1; + } - int ptc = rr_graph.node_ptc_num(node_id); + int delta_x, delta_y; + get_xy_deltas(this_node, node, &delta_x, &delta_y); - if (ptc >= int(f_src_opin_reachable_wires[itile].size())) { - f_src_opin_reachable_wires[itile].resize(ptc + 1); //Inefficient but functional... - } + vtr::Point delta(delta_x, delta_y); - //Find the wire types which are reachable from inode and record them and - //the cost to reach them - dijkstra_flood_to_wires(itile, node_id, f_src_opin_reachable_wires); + RoutingCostKey key = { + ichan, + seg_index, + delta}; - if (f_src_opin_reachable_wires[itile][ptc].empty()) { - VTR_LOG_WARN("Found no reachable wires from %s (%s) at (%d,%d)\n", - rr_node_typename[rr_type], - rr_node_arch_name(inode).c_str(), - sample_loc.x(), - sample_loc.y()); + if (*it != start_node_ind) { + auto& parent_node = device_ctx.rr_nodes[parent]; + start_to_here = Entry(*it, parent_node.edge_switch(paths[*it].edge), &start_to_here); + parent = *it; + } - ptcs_with_no_reachable_wires = true; - } - } + float cost = current.cost() - start_to_here.cost(); + if (cost < 0.f && cost > -10e-15 /* 10 femtosecond */) { + cost = 0.f; + } - ++num_sampled_locs; - } - if (ptcs_with_no_reachable_wires) { - VPR_ERROR(VPR_ERROR_ROUTE, "Some SOURCE/OPINs have no reachable wires\n"); + VTR_ASSERT(cost >= 0.f); + + // NOTE: implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST + auto result = routing_costs->insert(std::make_pair(key, cost)); + if (!result.second) { + if (cost < result.first->second) { + result.first->second = cost; + new_sample_found = true; + } else if (BREAK_ON_MISS) { + break; } + } else { + new_sample_found = true; } } + return new_sample_found; } -static vtr::Point pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point prev) { - //Very simple for now, just pick the fist matching tile found - vtr::Point loc(OPEN, OPEN); - - //VTR_LOG("Prev: %d,%d\n", prev.x(), prev.y()); - +/* returns the absolute delta_x and delta_y offset required to reach to_node from from_node */ +static void get_xy_deltas(const RRNodeId from_node, const RRNodeId to_node, int* delta_x, int* delta_y) { auto& device_ctx = g_vpr_ctx.device(); - auto& grid = device_ctx.grid; - - int y_init = prev.y() + 1; //Start searching next element above prev - - for (int x = prev.x(); x < int(grid.width()); ++x) { - if (x < 0) continue; + auto& rr_graph = device_ctx.rr_nodes; - //VTR_LOG(" x: %d\n", x); + e_rr_type from_type = rr_graph.node_type(from_node); + e_rr_type to_type = rr_graph.node_type(to_node); - for (int y = y_init; y < int(grid.height()); ++y) { - if (y < 0) continue; + if (!is_chan(from_type) && !is_chan(to_type)) { + //Alternate formulation for non-channel types + int from_x = 0; + int from_y = 0; + adjust_rr_position(from_node, from_x, from_y); - //VTR_LOG(" y: %d\n", y); - if (grid[x][y].type == tile_type) { - loc.set_x(x); - loc.set_y(y); - break; - } + int to_x = 0; + int to_y = 0; + adjust_rr_position(to_node, to_x, to_y); + + *delta_x = to_x - from_x; + *delta_y = to_y - from_y; + } else { + //Traditional formulation + + /* get chan/seg coordinates of the from/to nodes. seg coordinate is along the wire, + * chan coordinate is orthogonal to the wire */ + int from_seg_low; + int from_seg_high; + int from_chan; + int to_seg; + int to_chan; + if (from_type == CHANY) { + from_seg_low = rr_graph.node_ylow(from_node); + from_seg_high = rr_graph.node_yhigh(from_node); + from_chan = rr_graph.node_xlow(from_node); + to_seg = rr_graph.node_ylow(to_node); + to_chan = rr_graph.node_xlow(to_node); + } else { + from_seg_low = rr_graph.node_xlow(from_node); + from_seg_high = rr_graph.node_xhigh(from_node); + from_chan = rr_graph.node_ylow(from_node); + to_seg = rr_graph.node_xlow(to_node); + to_chan = rr_graph.node_ylow(to_node); } - if (loc.x() != OPEN && loc.y() != OPEN) { - break; + /* now we want to count the minimum number of *channel segments* between the from and to nodes */ + int delta_seg, delta_chan; + + /* orthogonal to wire */ + int no_need_to_pass_by_clb = 0; //if we need orthogonal wires then we don't need to pass by the target CLB along the current wire direction + if (to_chan > from_chan + 1) { + /* above */ + delta_chan = to_chan - from_chan; + no_need_to_pass_by_clb = 1; + } else if (to_chan < from_chan) { + /* below */ + delta_chan = from_chan - to_chan + 1; + no_need_to_pass_by_clb = 1; } else { - y_init = 0; //Prepare to search next column + /* adjacent to current channel */ + delta_chan = 0; + no_need_to_pass_by_clb = 0; + } + + /* along same direction as wire. */ + if (to_seg > from_seg_high) { + /* ahead */ + delta_seg = to_seg - from_seg_high - no_need_to_pass_by_clb; + } else if (to_seg < from_seg_low) { + /* behind */ + delta_seg = from_seg_low - to_seg - no_need_to_pass_by_clb; + } else { + /* along the span of the wire */ + delta_seg = 0; + } + + /* account for wire direction. lookahead map was computed by looking up and to the right starting at INC wires. for targets + * that are opposite of the wire direction, let's add 1 to delta_seg */ + e_direction from_dir = rr_graph.node_direction(from_node); + if (is_chan(from_type) + && ((to_seg < from_seg_low && from_dir == INC_DIRECTION) || (to_seg > from_seg_high && from_dir == DEC_DIRECTION))) { + delta_seg++; + } + + if (from_type == CHANY) { + *delta_x = delta_chan; + *delta_y = delta_seg; + } else { + *delta_x = delta_seg; + *delta_y = delta_chan; } } - //VTR_LOG("Next: %d,%d\n", loc.x(), loc.y()); - return loc; + VTR_ASSERT_SAFE(std::abs(*delta_x) < (int)device_ctx.grid.width()); + VTR_ASSERT_SAFE(std::abs(*delta_y) < (int)device_ctx.grid.height()); } -void dijkstra_flood_to_wires(int itile, RRNodeId node, t_src_opin_reachable_wires& src_opin_reachable_wires) { +static void adjust_rr_position(const RRNodeId rr, int& x, int& y) { auto& device_ctx = g_vpr_ctx.device(); auto& rr_graph = device_ctx.rr_nodes; - struct t_pq_entry { - float delay; - float congestion; - RRNodeId node; - - bool operator<(const t_pq_entry& rhs) const { - return this->delay < rhs.delay; - } - }; - - std::priority_queue pq; - - t_pq_entry root; - root.congestion = 0.; - root.delay = 0.; - root.node = node; + e_rr_type rr_type = rr_graph.node_type(rr); - int ptc = rr_graph.node_ptc_num(node); + if (is_chan(rr_type)) { + adjust_rr_wire_position(rr, x, y); + } else if (is_pin(rr_type)) { + adjust_rr_pin_position(rr, x, y); + } else { + VTR_ASSERT_SAFE(is_src_sink(rr_type)); + adjust_rr_src_sink_position(rr, x, y); + } +} +static void adjust_rr_pin_position(const RRNodeId rr, int& x, int& y) { /* - * Perform Djikstra from the SOURCE/OPIN of interest, stopping at the the first - * reachable wires (i.e until we hit the inter-block routing network), or a SINK - * (via a direct-connect). + * VPR uses a co-ordinate system where wires above and to the right of a block + * are at the same location as the block: * - * Note that typical RR graphs are structured : * - * SOURCE ---> OPIN --> CHANX/CHANY - * | - * --> OPIN --> CHANX/CHANY - * | - * ... + * <-----------C + * D + * | +---------+ ^ + * | | | | + * | | (1,1) | | + * | | | | + * V +---------+ | + * A + * B-----------> * - * possibly with direct-connects of the form: + * So wires are located as follows: * - * SOURCE --> OPIN --> IPIN --> SINK + * A: (1, 1) CHANY + * B: (1, 0) CHANX + * C: (1, 1) CHANX + * D: (0, 1) CHANY * - * and there is a small number of fixed hops from SOURCE/OPIN to CHANX/CHANY or - * to a SINK (via a direct-connect), so this runs very fast (i.e. O(1)) + * But all pins incident on the surrounding channels + * would be at (1,1) along with a relevant side. + * + * In the following, we adjust the positions of pins to + * account for the channel they are incident too. + * + * Note that blocks at (0,*) such as IOs, may have (unconnected) + * pins on the left side, so we also clip the minimum x to zero. + * Similarly for blocks at (*,0) we clip the minimum y to zero. */ - pq.push(root); - while (!pq.empty()) { - t_pq_entry curr = pq.top(); - pq.pop(); - - e_rr_type curr_rr_type = rr_graph.node_type(curr.node); - if (curr_rr_type == CHANX || curr_rr_type == CHANY || curr_rr_type == SINK) { - //We stop expansion at any CHANX/CHANY/SINK - int seg_index; - if (curr_rr_type != SINK) { - //It's a wire, figure out its type - int cost_index = rr_graph.node_cost_index(curr.node); - seg_index = device_ctx.rr_indexed_data[cost_index].seg_index; - } else { - //This is a direct-connect path between an IPIN and OPIN, - //which terminated at a SINK. - // - //We treat this as a 'special' wire type - seg_index = DIRECT_CONNECT_SPECIAL_SEG_TYPE; - } - - //Keep costs of the best path to reach each wire type - if (!src_opin_reachable_wires[itile][ptc].count(seg_index) - || curr.delay < src_opin_reachable_wires[itile][ptc][seg_index].delay) { - src_opin_reachable_wires[itile][ptc][seg_index].wire_rr_type = curr_rr_type; - src_opin_reachable_wires[itile][ptc][seg_index].wire_seg_index = seg_index; - src_opin_reachable_wires[itile][ptc][seg_index].delay = curr.delay; - src_opin_reachable_wires[itile][ptc][seg_index].congestion = curr.congestion; - } - - } else if (curr_rr_type == SOURCE || curr_rr_type == OPIN || curr_rr_type == IPIN) { - //We allow expansion through SOURCE/OPIN/IPIN types - int cost_index = rr_graph.node_cost_index(curr.node); - float incr_cong = device_ctx.rr_indexed_data[cost_index].base_cost; //Current nodes congestion cost + auto& device_ctx = g_vpr_ctx.device(); + auto& rr_graph = device_ctx.rr_nodes; - for (RREdgeId edge : rr_graph.edge_range(curr.node)) { - int iswitch = rr_graph.edge_switch(edge); - float incr_delay = device_ctx.rr_switch_inf[iswitch].Tdel; + VTR_ASSERT_SAFE(is_pin(rr_graph.node_type(rr))); + VTR_ASSERT_SAFE(rr_graph.node_xlow(rr) == rr_graph.node_xhigh(rr)); + VTR_ASSERT_SAFE(rr_graph.node_ylow(rr) == rr_graph.node_yhigh(rr)); - RRNodeId next_node = rr_graph.edge_sink_node(edge); + x = rr_graph.node_xlow(rr); + y = rr_graph.node_ylow(rr); - t_pq_entry next; - next.congestion = curr.congestion + incr_cong; //Of current node - next.delay = curr.delay + incr_delay; //To reach next node - next.node = next_node; + e_side rr_side = rr_graph.node_side(rr); - pq.push(next); - } - } else { - VPR_ERROR(VPR_ERROR_ROUTE, "Unrecognized RR type"); - } + if (rr_side == LEFT) { + x -= 1; + x = std::max(x, 0); + } else if (rr_side == BOTTOM && y > 0) { + y -= 1; + y = std::max(y, 0); } } -/* returns index of a node from which to start routing */ -static RRNodeId get_start_node(int start_x, int start_y, int target_x, int target_y, t_rr_type rr_type, int seg_index, int track_offset) { +static void adjust_rr_wire_position(const RRNodeId rr, int& x, int& y) { auto& device_ctx = g_vpr_ctx.device(); auto& rr_graph = device_ctx.rr_nodes; - int result = UNDEFINED; - - if (rr_type != CHANX && rr_type != CHANY) { - VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Must start lookahead routing from CHANX or CHANY node\n"); - } - - /* determine which direction the wire should go in based on the start & target coordinates */ - e_direction direction = INC_DIRECTION; - if ((rr_type == CHANX && target_x < start_x) || (rr_type == CHANY && target_y < start_y)) { - direction = DEC_DIRECTION; - } - - int start_lookup_x = start_x; - int start_lookup_y = start_y; - if (rr_type == CHANX) { - //Bizarely, rr_node_indices stores CHANX with swapped x/y... - std::swap(start_lookup_x, start_lookup_y); - } - - const std::vector& channel_node_list = device_ctx.rr_node_indices[rr_type][start_lookup_x][start_lookup_y][0]; - - /* find first node in channel that has specified segment index and goes in the desired direction */ - for (unsigned itrack = 0; itrack < channel_node_list.size(); itrack++) { - int node_ind = channel_node_list[itrack]; - if (node_ind < 0) continue; - - RRNodeId node_id(node_ind); - - VTR_ASSERT(rr_graph.node_type(node_id) == rr_type); + VTR_ASSERT_SAFE(is_chan(rr_graph.node_type(rr))); - e_direction node_direction = rr_graph.node_direction(node_id); - int node_cost_ind = rr_graph.node_cost_index(node_id); - int node_seg_ind = device_ctx.rr_indexed_data[node_cost_ind].seg_index; + e_direction rr_dir = rr_graph.node_direction(rr); - if ((node_direction == direction || node_direction == BI_DIRECTION) && node_seg_ind == seg_index) { - /* found first track that has the specified segment index and goes in the desired direction */ - result = node_ind; - if (track_offset == 0) { - break; - } - track_offset -= 2; - } + if (rr_dir == DEC_DIRECTION) { + x = rr_graph.node_xhigh(rr); + y = rr_graph.node_yhigh(rr); + } else if (rr_dir == INC_DIRECTION) { + x = rr_graph.node_xlow(rr); + y = rr_graph.node_ylow(rr); + } else { + VTR_ASSERT_SAFE(rr_dir == BI_DIRECTION); + //Not sure what to do here... + //Try average for now. + x = vtr::nint((rr_graph.node_xlow(rr) + rr_graph.node_xhigh(rr)) / 2.); + y = vtr::nint((rr_graph.node_ylow(rr) + rr_graph.node_yhigh(rr)) / 2.); } - - return RRNodeId(result); } -/* runs Dijkstra's algorithm from specified node until all nodes have been visited. Each time a pin is visited, the delay/congestion information - * to that pin is stored is added to an entry in the routing_cost_map */ -static void run_dijkstra(RRNodeId start_node, int start_x, int start_y, t_routing_cost_map& routing_cost_map, t_dijkstra_data* data) { +static void adjust_rr_src_sink_position(const RRNodeId rr, int& x, int& y) { + //SOURCE/SINK nodes assume the full dimensions of their + //associated block + // + //Use the average position. auto& device_ctx = g_vpr_ctx.device(); auto& rr_graph = device_ctx.rr_nodes; - auto& node_expanded = data->node_expanded; - node_expanded.resize(device_ctx.rr_nodes.size()); - std::fill(node_expanded.begin(), node_expanded.end(), false); + VTR_ASSERT_SAFE(is_src_sink(rr_graph.node_type(rr))); - auto& node_visited_costs = data->node_visited_costs; - node_visited_costs.resize(device_ctx.rr_nodes.size()); - std::fill(node_visited_costs.begin(), node_visited_costs.end(), -1.0); + x = vtr::nint((rr_graph.node_xlow(rr) + rr_graph.node_xhigh(rr)) / 2.); + y = vtr::nint((rr_graph.node_ylow(rr) + rr_graph.node_yhigh(rr)) / 2.); +} +/* Runs Dijkstra's algorithm from specified node until all nodes have been + * visited. Each time a pin is visited, the delay/congestion information + * to that pin is stored to an entry in the routing_cost_map. + * + * Returns the maximum (last) minimum cost path stored, and + * the number of paths from start_node_ind stored. */ +template +static std::pair run_dijkstra(int start_node_ind, + std::vector* node_expanded, + std::vector* paths, + RoutingCosts* routing_costs) { + auto& device_ctx = g_vpr_ctx.device(); + int path_count = 0; + + /* a list of boolean flags (one for each rr node) to figure out if a + * certain node has already been expanded */ + std::fill(node_expanded->begin(), node_expanded->end(), false); + /* For each node keep a list of the cost with which that node has been + * visited (used to determine whether to push a candidate node onto the + * expansion queue. + * Also store the parent node so we can reconstruct a specific path. */ + std::fill(paths->begin(), paths->end(), util::Search_Path{std::numeric_limits::infinity(), -1, -1}); /* a priority queue for expansion */ - std::priority_queue& pq = data->pq; - - //Clear priority queue if non-empty - while (!pq.empty()) { - pq.pop(); - } + std::priority_queue, std::greater> pq; /* first entry has no upstream delay or congestion */ - PQ_Entry first_entry(start_node, UNDEFINED, 0, 0, 0, true); + Entry first_entry(start_node_ind, UNDEFINED, nullptr); + float max_cost = first_entry.cost(); pq.push(first_entry); /* now do routing */ while (!pq.empty()) { - PQ_Entry current = pq.top(); + auto current = pq.top(); pq.pop(); - RRNodeId curr_node = current.rr_node; + int node_ind = current.rr_node_ind; /* check that we haven't already expanded from this node */ - if (node_expanded[curr_node]) { + if ((*node_expanded)[node_ind]) { continue; } - //VTR_LOG("Expanding with delay=%10.3g cong=%10.3g (%s)\n", current.delay, current.congestion_upstream, describe_rr_node(curr_node).c_str()); - /* if this node is an ipin record its congestion/delay in the routing_cost_map */ - if (rr_graph.node_type(curr_node) == IPIN) { - int ipin_x = rr_graph.node_xlow(curr_node); - int ipin_y = rr_graph.node_ylow(curr_node); - - if (ipin_x >= start_x && ipin_y >= start_y) { - int delta_x, delta_y; - get_xy_deltas(start_node, curr_node, &delta_x, &delta_y); - delta_x = std::abs(delta_x); - delta_y = std::abs(delta_y); + if (device_ctx.rr_nodes[node_ind].type() == IPIN) { + // the last cost should be the highest + max_cost = current.cost(); - routing_cost_map[delta_x][delta_y].add_cost_entry(current.delay, current.congestion_upstream); - } + path_count++; + add_paths(start_node_ind, current, *paths, routing_costs); + } else { + util::expand_dijkstra_neighbours(device_ctx.rr_nodes, + current, paths, node_expanded, &pq); + (*node_expanded)[node_ind] = true; } - - expand_dijkstra_neighbours(current, node_visited_costs, node_expanded, pq); - node_expanded[curr_node] = true; } + return std::make_pair(max_cost, path_count); } -/* iterates over the children of the specified node and selectively pushes them onto the priority queue */ -static void expand_dijkstra_neighbours(PQ_Entry parent_entry, vtr::vector& node_visited_costs, vtr::vector& node_expanded, std::priority_queue& pq) { - auto& device_ctx = g_vpr_ctx.device(); - auto& rr_graph = device_ctx.rr_nodes; +// compute the cost maps for lookahead +void MapLookahead::compute(const std::vector& segment_inf) { + compute_router_src_opin_lookahead(); + compute_router_chan_ipin_lookahead(); - RRNodeId parent = parent_entry.rr_node; + vtr::ScopedStartFinishTimer timer("Computing connection box lookahead map"); - for (RREdgeId edge : rr_graph.edge_range(parent)) { - RRNodeId child_node = rr_graph.edge_sink_node(edge); - int switch_ind = rr_graph.edge_switch(edge); + // Initialize rr_node_route_inf if not already + alloc_and_load_rr_node_route_structs(); - if (rr_graph.node_type(child_node) == SINK) return; + size_t num_segments = segment_inf.size(); + std::vector sample_regions = find_sample_regions(num_segments); - /* skip this child if it has already been expanded from */ - if (node_expanded[child_node]) { - continue; - } + /* free previous delay map and allocate new one */ + auto& device_ctx = g_vpr_ctx.device(); + cost_map_.set_counts(segment_inf.size()); + cost_map_.build_segment_map(); + + VTR_ASSERT(REPRESENTATIVE_ENTRY_METHOD == util::SMALLEST); + RoutingCosts all_delay_costs; + RoutingCosts all_base_costs; + + /* run Dijkstra's algorithm for each segment type & channel type combination */ +#if defined(VPR_USE_TBB) + tbb::mutex all_costs_mutex; + tbb::parallel_for_each(sample_regions, [&](const SampleRegion& region) { +#else + for (const auto& region : sample_regions) { +#endif + // holds the cost entries for a run + RoutingCosts delay_costs; + RoutingCosts base_costs; + int total_path_count = 0; + std::vector node_expanded(device_ctx.rr_nodes.size()); + std::vector paths(device_ctx.rr_nodes.size()); + + for (auto& point : region.points) { + // statistics + vtr::Timer run_timer; + float max_delay_cost = 0.f; + float max_base_cost = 0.f; + int path_count = 0; + for (auto node_ind : point.nodes) { + { + auto result = run_dijkstra(node_ind, &node_expanded, &paths, &delay_costs); + max_delay_cost = std::max(max_delay_cost, result.first); + path_count += result.second; + } + { + auto result = run_dijkstra(node_ind, &node_expanded, &paths, &base_costs); + max_base_cost = std::max(max_base_cost, result.first); + path_count += result.second; + } + } - PQ_Entry child_entry(child_node, switch_ind, parent_entry.delay, - parent_entry.R_upstream, parent_entry.congestion_upstream, false); + if (path_count > 0) { + VTR_LOG("Expanded %d paths of segment type %s(%d) starting at (%d, %d) from %d segments, max_cost %e %e (%g paths/sec)\n", + path_count, segment_inf[region.segment_type].name.c_str(), region.segment_type, + point.location.x(), point.location.y(), + (int)point.nodes.size(), + max_delay_cost, max_base_cost, + path_count / run_timer.elapsed_sec()); + } - //VTR_ASSERT(child_entry.cost >= 0); //Asertion fails in practise. TODO: debug + /* + * if (path_count == 0) { + * for (auto node_ind : point.nodes) { + * VTR_LOG("Expanded node %s\n", describe_rr_node(node_ind).c_str()); + * } + * } + */ - /* skip this child if it has been visited with smaller cost */ - if (node_visited_costs[child_node] >= 0 && node_visited_costs[child_node] < child_entry.cost) { - continue; + total_path_count += path_count; + if (total_path_count > MIN_PATH_COUNT) { + break; + } } - /* finally, record the cost with which the child was visited and put the child entry on the queue */ - node_visited_costs[child_node] = child_entry.cost; - pq.push(child_entry); - } -} - -/* sets the lookahead cost map entries based on representative cost entries from routing_cost_map */ -static void set_lookahead_map_costs(int segment_index, e_rr_type chan_type, t_routing_cost_map& routing_cost_map) { - int chan_index = 0; - if (chan_type == CHANY) { - chan_index = 1; - } +#if defined(VPR_USE_TBB) + all_costs_mutex.lock(); +#endif - /* set the lookahead cost map entries with a representative cost entry from routing_cost_map */ - for (unsigned ix = 0; ix < routing_cost_map.dim_size(0); ix++) { - for (unsigned iy = 0; iy < routing_cost_map.dim_size(1); iy++) { - Expansion_Cost_Entry& expansion_cost_entry = routing_cost_map[ix][iy]; + if (total_path_count == 0) { + VTR_LOG_WARN("No paths found for sample region %s(%d, %d)\n", + segment_inf[region.segment_type].name.c_str(), region.grid_location.x(), region.grid_location.y()); + } - f_wire_cost_map[chan_index][segment_index][ix][iy] = expansion_cost_entry.get_representative_cost_entry(REPRESENTATIVE_ENTRY_METHOD); + // combine the cost map from this run with the final cost maps for each segment + for (const auto& cost : delay_costs) { + const auto& val = cost.second; + auto result = all_delay_costs.insert(std::make_pair(cost.first, val)); + if (!result.second) { + // implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST + result.first->second = std::min(result.first->second, val); + } + } + for (const auto& cost : base_costs) { + const auto& val = cost.second; + auto result = all_base_costs.insert(std::make_pair(cost.first, val)); + if (!result.second) { + // implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST + result.first->second = std::min(result.first->second, val); + } } - } -} -/* fills in missing lookahead map entries by copying the cost of the closest valid entry */ -static void fill_in_missing_lookahead_entries(int segment_index, e_rr_type chan_type) { - int chan_index = 0; - if (chan_type == CHANY) { - chan_index = 1; +#if defined(VPR_USE_TBB) + all_costs_mutex.unlock(); + }); +#else } +#endif - auto& device_ctx = g_vpr_ctx.device(); + VTR_LOG("Combining results\n"); + /* boil down the cost list in routing_cost_map at each coordinate to a + * representative cost entry and store it in the lookahead cost map */ + cost_map_.set_cost_map(all_delay_costs, all_base_costs); + +// diagnostics +#if defined(CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_ENTRIES) + for (auto& cost : all_costs) { + const auto& key = cost.first; + const auto& val = cost.second; + VTR_LOG("%d -> %d (%d, %d): %g, %g\n", + val.from_node, val.to_node, + key.delta.x(), key.delta.y(), + val.cost_entry.delay, val.cost_entry.congestion); + } +#endif - /* find missing cost entries and fill them in by copying a nearby cost entry */ - for (unsigned ix = 0; ix < device_ctx.grid.width(); ix++) { - for (unsigned iy = 0; iy < device_ctx.grid.height(); iy++) { - Cost_Entry cost_entry = f_wire_cost_map[chan_index][segment_index][ix][iy]; +#if defined(CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_MAPS) + for (int iseg = 0; iseg < (ssize_t)num_segments; iseg++) { + VTR_LOG("cost map for %s(%d)\n", + segment_inf[iseg].name.c_str(), iseg); + cost_map_.print(iseg); + } +#endif - if (std::isnan(cost_entry.delay) && std::isnan(cost_entry.congestion)) { - Cost_Entry copied_entry = get_nearby_cost_entry(ix, iy, segment_index, chan_index); - f_wire_cost_map[chan_index][segment_index][ix][iy] = copied_entry; - } - } +#if defined(CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_EMPTY_MAPS) + for (std::pair p : cost_map_.list_empty()) { + int ichan, iseg; + std::tie(ichan, iseg) = p; + VTR_LOG("cost map for %s(%d), chan %d EMPTY\n", + segment_inf[iseg].name.c_str(), iseg, box_id); } +#endif } -/* returns a cost entry in the f_wire_cost_map that is near the specified coordinates (and preferably towards (0,0)) */ -static Cost_Entry get_nearby_cost_entry(int x, int y, int segment_index, int chan_index) { - /* compute the slope from x,y to 0,0 and then move towards 0,0 by one unit to get the coordinates - * of the cost entry to be copied */ +static void compute_router_src_opin_lookahead() { + vtr::ScopedStartFinishTimer timer("Computing src/opin lookahead"); + auto& device_ctx = g_vpr_ctx.device(); + auto& rr_graph = device_ctx.rr_nodes; - //VTR_ASSERT(x > 0 || y > 0); //Asertion fails in practise. TODO: debug + f_src_opin_delays.clear(); - float slope; - if (x == 0) { - slope = 1e12; //just a really large number - } else if (y == 0) { - slope = 1e-12; //just a really small number - } else { - slope = (float)y / (float)x; - } + f_src_opin_delays.resize(device_ctx.physical_tile_types.size()); - int copy_x, copy_y; - if (slope >= 1.0) { - copy_y = y - 1; - copy_x = vtr::nint((float)y / slope); - } else { - copy_x = x - 1; - copy_y = vtr::nint((float)x * slope); - } + std::vector rr_nodes_at_loc; - copy_y = std::max(copy_y, 0); //Clip to zero - copy_x = std::max(copy_x, 0); //Clip to zero + //We assume that the routing connectivity of each instance of a physical tile is the same, + //and so only measure one instance of each type + for (size_t itile = 0; itile < device_ctx.physical_tile_types.size(); ++itile) { + for (e_rr_type rr_type : {SOURCE, OPIN}) { + vtr::Point sample_loc(-1, -1); - Cost_Entry copy_entry = f_wire_cost_map[chan_index][segment_index][copy_x][copy_y]; + size_t num_sampled_locs = 0; + bool ptcs_with_no_delays = true; + while (ptcs_with_no_delays) { //Haven't found wire connected to ptc + ptcs_with_no_delays = false; - /* if the entry to be copied is also empty, recurse */ - if (std::isnan(copy_entry.delay) && std::isnan(copy_entry.congestion)) { - if (copy_x == 0 && copy_y == 0) { - copy_entry = Cost_Entry(0., 0.); //(0, 0) entry is invalid so set zero to terminate recursion - } else { - copy_entry = get_nearby_cost_entry(copy_x, copy_y, segment_index, chan_index); - } - } + sample_loc = pick_sample_tile(&device_ctx.physical_tile_types[itile], sample_loc); - return copy_entry; -} + if (sample_loc.x() == -1 && sample_loc.y() == -1) { + //No untried instances of the current tile type left + VTR_LOG_WARN("Found no %ssample locations for %s in %s\n", + (num_sampled_locs == 0) ? "" : "more ", + rr_node_typename[rr_type], + device_ctx.physical_tile_types[itile].name); + break; + } -/* returns cost entry with the smallest delay */ -Cost_Entry Expansion_Cost_Entry::get_smallest_entry() { - Cost_Entry smallest_entry; + //VTR_LOG("Sampling %s at (%d,%d)\n", device_ctx.physical_tile_types[itile].name, sample_loc.x(), sample_loc.y()); - for (auto entry : this->cost_vector) { - if (std::isnan(smallest_entry.delay) || entry.delay < smallest_entry.delay) { - smallest_entry = entry; - } - } + rr_nodes_at_loc.clear(); - return smallest_entry; -} + get_rr_node_indices(device_ctx.rr_node_indices, sample_loc.x(), sample_loc.y(), rr_type, &rr_nodes_at_loc); + for (int inode : rr_nodes_at_loc) { + if (inode < 0) continue; -/* returns a cost entry that represents the average of all the recorded entries */ -Cost_Entry Expansion_Cost_Entry::get_average_entry() { - float avg_delay = 0; - float avg_congestion = 0; + RRNodeId node_id(inode); - for (auto cost_entry : this->cost_vector) { - avg_delay += cost_entry.delay; - avg_congestion += cost_entry.congestion; - } + int ptc = rr_graph.node_ptc_num(node_id); - avg_delay /= (float)this->cost_vector.size(); - avg_congestion /= (float)this->cost_vector.size(); + if (ptc >= int(f_src_opin_delays[itile].size())) { + f_src_opin_delays[itile].resize(ptc + 1); //Inefficient but functional... + } - return Cost_Entry(avg_delay, avg_congestion); -} + //Find the wire types which are reachable from inode and record them and + //the cost to reach them + dijkstra_flood_to_wires(itile, node_id, f_src_opin_delays); -/* returns a cost entry that represents the geomean of all the recorded entries */ -Cost_Entry Expansion_Cost_Entry::get_geomean_entry() { - float geomean_delay = 0; - float geomean_cong = 0; - for (auto cost_entry : this->cost_vector) { - geomean_delay += log(cost_entry.delay); - geomean_cong += log(cost_entry.congestion); - } + if (f_src_opin_delays[itile][ptc].empty()) { + VTR_LOGV_DEBUG(f_router_debug, "Found no reachable wires from %s (%s) at (%d,%d)\n", + rr_node_typename[rr_type], + rr_node_arch_name(inode).c_str(), + sample_loc.x(), + sample_loc.y()); - geomean_delay = exp(geomean_delay / (float)this->cost_vector.size()); - geomean_cong = exp(geomean_cong / (float)this->cost_vector.size()); + ptcs_with_no_delays = true; + } + } - return Cost_Entry(geomean_delay, geomean_cong); + ++num_sampled_locs; + } + if (ptcs_with_no_delays) { + VPR_ERROR(VPR_ERROR_ROUTE, "Some SOURCE/OPINs have no reachable wires\n"); + } + } + } } -/* returns a cost entry that represents the medial of all recorded entries */ -Cost_Entry Expansion_Cost_Entry::get_median_entry() { - /* find median by binning the delays of all entries and then chosing the bin - * with the largest number of entries */ +static void compute_router_chan_ipin_lookahead() { + vtr::ScopedStartFinishTimer timer("Computing chan/ipin lookahead"); + auto& device_ctx = g_vpr_ctx.device(); - int num_bins = 10; + f_chan_ipins_delays.clear(); - /* find entries with smallest and largest delays */ - Cost_Entry min_del_entry; - Cost_Entry max_del_entry; - for (auto entry : this->cost_vector) { - if (std::isnan(min_del_entry.delay) || entry.delay < min_del_entry.delay) { - min_del_entry = entry; - } - if (std::isnan(max_del_entry.delay) || entry.delay > max_del_entry.delay) { - max_del_entry = entry; - } - } + f_chan_ipins_delays.resize(device_ctx.physical_tile_types.size()); - /* get the bin size */ - float delay_diff = max_del_entry.delay - min_del_entry.delay; - float bin_size = delay_diff / (float)num_bins; + std::vector rr_nodes_at_loc; - /* sort the cost entries into bins */ - std::vector> entry_bins(num_bins, std::vector()); - for (auto entry : this->cost_vector) { - float bin_num = floor((entry.delay - min_del_entry.delay) / bin_size); + //We assume that the routing connectivity of each instance of a physical tile is the same, + //and so only measure one instance of each type + for (auto tile_type : device_ctx.physical_tile_types) { + vtr::Point sample_loc(-1, -1); - VTR_ASSERT(vtr::nint(bin_num) >= 0 && vtr::nint(bin_num) <= num_bins); - if (vtr::nint(bin_num) == num_bins) { - /* largest entry will otherwise have an out-of-bounds bin number */ - bin_num -= 1; - } - entry_bins[vtr::nint(bin_num)].push_back(entry); - } + sample_loc = pick_sample_tile(&tile_type, sample_loc); - /* find the bin with the largest number of elements */ - int largest_bin = 0; - int largest_size = 0; - for (int ibin = 0; ibin < num_bins; ibin++) { - if (entry_bins[ibin].size() > (unsigned)largest_size) { - largest_bin = ibin; - largest_size = (unsigned)entry_bins[ibin].size(); + if (sample_loc.x() == -1 && sample_loc.y() == -1) { + //No untried instances of the current tile type left + VTR_LOG_WARN("Found no sample locations for %s\n", + tile_type.name); + continue; } - } - /* get the representative delay of the largest bin */ - Cost_Entry representative_entry = entry_bins[largest_bin][0]; + int min_x = std::max(0, sample_loc.x() - X_OFFSET); + int min_y = std::max(0, sample_loc.y() - Y_OFFSET); + int max_x = std::min(int(device_ctx.grid.width()), sample_loc.x() + X_OFFSET); + int max_y = std::min(int(device_ctx.grid.height()), sample_loc.y() + Y_OFFSET); - return representative_entry; -} + for (int ix = min_x; ix < max_x; ix++) { + for (int iy = min_y; iy < max_y; iy++) { + for (auto rr_type : {CHANX, CHANY}) { + rr_nodes_at_loc.clear(); -/* returns the absolute delta_x and delta_y offset required to reach to_node from from_node */ -static void get_xy_deltas(const RRNodeId from_node, const RRNodeId to_node, int* delta_x, int* delta_y) { - auto& device_ctx = g_vpr_ctx.device(); - auto& rr_graph = device_ctx.rr_nodes; + get_rr_node_indices(device_ctx.rr_node_indices, ix, iy, rr_type, &rr_nodes_at_loc); + for (int inode : rr_nodes_at_loc) { + if (inode < 0) continue; - e_rr_type from_type = rr_graph.node_type(from_node); - e_rr_type to_type = rr_graph.node_type(to_node); + RRNodeId node_id(inode); - if (!is_chan(from_type) && !is_chan(to_type)) { - //Alternate formulation for non-channel types - int from_x = 0; - int from_y = 0; - adjust_rr_position(from_node, from_x, from_y); + //Find the IPINs which are reachable from the wires within the bounding box + //around the selected tile location + dijkstra_flood_to_ipins(node_id, f_chan_ipins_delays); + } + } + } + } + } +} - int to_x = 0; - int to_y = 0; - adjust_rr_position(to_node, to_x, to_y); +static vtr::Point pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point prev) { + //Very simple for now, just pick the fist matching tile found + vtr::Point loc(OPEN, OPEN); - *delta_x = to_x - from_x; - *delta_y = to_y - from_y; - } else { - //Traditional formulation + //VTR_LOG("Prev: %d,%d\n", prev.x(), prev.y()); - /* get chan/seg coordinates of the from/to nodes. seg coordinate is along the wire, - * chan coordinate is orthogonal to the wire */ - int from_seg_low; - int from_seg_high; - int from_chan; - int to_seg; - int to_chan; - if (from_type == CHANY) { - from_seg_low = rr_graph.node_ylow(from_node); - from_seg_high = rr_graph.node_yhigh(from_node); - from_chan = rr_graph.node_xlow(from_node); - to_seg = rr_graph.node_ylow(to_node); - to_chan = rr_graph.node_xlow(to_node); - } else { - from_seg_low = rr_graph.node_xlow(from_node); - from_seg_high = rr_graph.node_xhigh(from_node); - from_chan = rr_graph.node_ylow(from_node); - to_seg = rr_graph.node_xlow(to_node); - to_chan = rr_graph.node_ylow(to_node); - } + auto& device_ctx = g_vpr_ctx.device(); + auto& grid = device_ctx.grid; - /* now we want to count the minimum number of *channel segments* between the from and to nodes */ - int delta_seg, delta_chan; + int y_init = prev.y() + 1; //Start searching next element above prev - /* orthogonal to wire */ - int no_need_to_pass_by_clb = 0; //if we need orthogonal wires then we don't need to pass by the target CLB along the current wire direction - if (to_chan > from_chan + 1) { - /* above */ - delta_chan = to_chan - from_chan; - no_need_to_pass_by_clb = 1; - } else if (to_chan < from_chan) { - /* below */ - delta_chan = from_chan - to_chan + 1; - no_need_to_pass_by_clb = 1; - } else { - /* adjacent to current channel */ - delta_chan = 0; - no_need_to_pass_by_clb = 0; - } + for (int x = prev.x(); x < int(grid.width()); ++x) { + if (x < 0) continue; - /* along same direction as wire. */ - if (to_seg > from_seg_high) { - /* ahead */ - delta_seg = to_seg - from_seg_high - no_need_to_pass_by_clb; - } else if (to_seg < from_seg_low) { - /* behind */ - delta_seg = from_seg_low - to_seg - no_need_to_pass_by_clb; - } else { - /* along the span of the wire */ - delta_seg = 0; - } + //VTR_LOG(" x: %d\n", x); - /* account for wire direction. lookahead map was computed by looking up and to the right starting at INC wires. for targets - * that are opposite of the wire direction, let's add 1 to delta_seg */ - e_direction from_dir = rr_graph.node_direction(from_node); - if (is_chan(from_type) - && ((to_seg < from_seg_low && from_dir == INC_DIRECTION) || (to_seg > from_seg_high && from_dir == DEC_DIRECTION))) { - delta_seg++; + for (int y = y_init; y < int(grid.height()); ++y) { + if (y < 0) continue; + + //VTR_LOG(" y: %d\n", y); + if (grid[x][y].type->index == tile_type->index) { + loc.set_x(x); + loc.set_y(y); + return loc; + } } - if (from_type == CHANY) { - *delta_x = delta_chan; - *delta_y = delta_seg; + if (loc.x() != OPEN && loc.y() != OPEN) { + break; } else { - *delta_x = delta_seg; - *delta_y = delta_chan; + y_init = 0; //Prepare to search next column } } + //VTR_LOG("Next: %d,%d\n", loc.x(), loc.y()); - VTR_ASSERT_SAFE(std::abs(*delta_x) < (int)device_ctx.grid.width()); - VTR_ASSERT_SAFE(std::abs(*delta_y) < (int)device_ctx.grid.height()); + return loc; } -static void adjust_rr_position(const RRNodeId rr, int& x, int& y) { +static void dijkstra_flood_to_wires(int itile, RRNodeId node, t_src_opin_delays& src_opin_delays) { auto& device_ctx = g_vpr_ctx.device(); auto& rr_graph = device_ctx.rr_nodes; - e_rr_type rr_type = rr_graph.node_type(rr); + struct t_pq_entry { + float delay; + float congestion; + RRNodeId node; - if (is_chan(rr_type)) { - adjust_rr_wire_position(rr, x, y); - } else if (is_pin(rr_type)) { - adjust_rr_pin_position(rr, x, y); - } else { - VTR_ASSERT_SAFE(is_src_sink(rr_type)); - adjust_rr_src_sink_position(rr, x, y); - } -} + bool operator<(const t_pq_entry& rhs) const { + return this->delay < rhs.delay; + } + }; + + std::priority_queue pq; + + t_pq_entry root; + root.congestion = 0.; + root.delay = 0.; + root.node = node; + + int ptc = rr_graph.node_ptc_num(node); -static void adjust_rr_pin_position(const RRNodeId rr, int& x, int& y) { /* - * VPR uses a co-ordinate system where wires above and to the right of a block - * are at the same location as the block: - * - * - * <-----------C - * D - * | +---------+ ^ - * | | | | - * | | (1,1) | | - * | | | | - * V +---------+ | - * A - * B-----------> + * Perform Djikstra from the SOURCE/OPIN of interest, stopping at the the first + * reachable wires (i.e until we hit the inter-block routing network), or a SINK + * (via a direct-connect). * - * So wires are located as follows: + * Note that typical RR graphs are structured : * - * A: (1, 1) CHANY - * B: (1, 0) CHANX - * C: (1, 1) CHANX - * D: (0, 1) CHANY + * SOURCE ---> OPIN --> CHANX/CHANY + * | + * --> OPIN --> CHANX/CHANY + * | + * ... * - * But all pins incident on the surrounding channels - * would be at (1,1) along with a relevant side. + * possibly with direct-connects of the form: * - * In the following, we adjust the positions of pins to - * account for the channel they are incident too. + * SOURCE --> OPIN --> IPIN --> SINK * - * Note that blocks at (0,*) such as IOs, may have (unconnected) - * pins on the left side, so we also clip the minimum x to zero. - * Similarly for blocks at (*,0) we clip the minimum y to zero. + * and there is a small number of fixed hops from SOURCE/OPIN to CHANX/CHANY or + * to a SINK (via a direct-connect), so this runs very fast (i.e. O(1)) */ - auto& device_ctx = g_vpr_ctx.device(); - auto& rr_graph = device_ctx.rr_nodes; + pq.push(root); + while (!pq.empty()) { + t_pq_entry curr = pq.top(); + pq.pop(); - VTR_ASSERT_SAFE(is_pin(rr_graph.node_type(rr))); - VTR_ASSERT_SAFE(rr_graph.node_xlow(rr) == rr_graph.node_xhigh(rr)); - VTR_ASSERT_SAFE(rr_graph.node_ylow(rr) == rr_graph.node_yhigh(rr)); + e_rr_type curr_rr_type = rr_graph.node_type(curr.node); + if (curr_rr_type == CHANX || curr_rr_type == CHANY || curr_rr_type == SINK) { + //We stop expansion at any CHANX/CHANY/SINK + int seg_index; + if (curr_rr_type != SINK) { + //It's a wire, figure out its type + int cost_index = rr_graph.node_cost_index(curr.node); + seg_index = device_ctx.rr_indexed_data[cost_index].seg_index; + } else { + //This is a direct-connect path between an IPIN and OPIN, + //which terminated at a SINK. + // + //We treat this as a 'special' wire type + seg_index = DIRECT_CONNECT_SPECIAL_SEG_TYPE; + } - x = rr_graph.node_xlow(rr); - y = rr_graph.node_ylow(rr); + //Keep costs of the best path to reach each wire type + if (!src_opin_delays[itile][ptc].count(seg_index) + || curr.delay < src_opin_delays[itile][ptc][seg_index].delay) { + src_opin_delays[itile][ptc][seg_index].wire_rr_type = curr_rr_type; + src_opin_delays[itile][ptc][seg_index].wire_seg_index = seg_index; + src_opin_delays[itile][ptc][seg_index].delay = curr.delay; + src_opin_delays[itile][ptc][seg_index].congestion = curr.congestion; + } - e_side rr_side = rr_graph.node_side(rr); + } else if (curr_rr_type == SOURCE || curr_rr_type == OPIN || curr_rr_type == IPIN) { + //We allow expansion through SOURCE/OPIN/IPIN types + int cost_index = rr_graph.node_cost_index(curr.node); + float incr_cong = device_ctx.rr_indexed_data[cost_index].base_cost; //Current nodes congestion cost - if (rr_side == LEFT) { - x -= 1; - x = std::max(x, 0); - } else if (rr_side == BOTTOM && y > 0) { - y -= 1; - y = std::max(y, 0); + for (RREdgeId edge : rr_graph.edge_range(curr.node)) { + int iswitch = rr_graph.edge_switch(edge); + float incr_delay = device_ctx.rr_switch_inf[iswitch].Tdel; + + RRNodeId next_node = rr_graph.edge_sink_node(edge); + + t_pq_entry next; + next.congestion = curr.congestion + incr_cong; //Of current node + next.delay = curr.delay + incr_delay; //To reach next node + next.node = next_node; + + pq.push(next); + } + } else { + VPR_ERROR(VPR_ERROR_ROUTE, "Unrecognized RR type"); + } } } -static void adjust_rr_wire_position(const RRNodeId rr, int& x, int& y) { +static void dijkstra_flood_to_ipins(RRNodeId node, t_chan_ipins_delays& chan_ipins_delays) { auto& device_ctx = g_vpr_ctx.device(); auto& rr_graph = device_ctx.rr_nodes; - VTR_ASSERT_SAFE(is_chan(rr_graph.node_type(rr))); + struct t_pq_entry { + float delay; + float congestion; + RRNodeId node; + int level; + int prev_seg_index; - e_direction rr_dir = rr_graph.node_direction(rr); + bool operator<(const t_pq_entry& rhs) const { + return this->delay < rhs.delay; + } + }; - if (rr_dir == DEC_DIRECTION) { - x = rr_graph.node_xhigh(rr); - y = rr_graph.node_yhigh(rr); - } else if (rr_dir == INC_DIRECTION) { - x = rr_graph.node_xlow(rr); - y = rr_graph.node_ylow(rr); - } else { - VTR_ASSERT_SAFE(rr_dir == BI_DIRECTION); - //Not sure what to do here... - //Try average for now. - x = vtr::nint((rr_graph.node_xlow(rr) + rr_graph.node_xhigh(rr)) / 2.); - y = vtr::nint((rr_graph.node_ylow(rr) + rr_graph.node_yhigh(rr)) / 2.); - } -} + std::priority_queue pq; -static void adjust_rr_src_sink_position(const RRNodeId rr, int& x, int& y) { - //SOURCE/SINK nodes assume the full dimensions of their - //associated block - // - //Use the average position. - auto& device_ctx = g_vpr_ctx.device(); - auto& rr_graph = device_ctx.rr_nodes; + t_pq_entry root; + root.congestion = 0.; + root.delay = 0.; + root.node = node; + root.level = 0; + root.prev_seg_index = OPEN; - VTR_ASSERT_SAFE(is_src_sink(rr_graph.node_type(rr))); + /* + * Perform Djikstra from the CHAN of interest, stopping at the the first + * reachable IPIN + * + * Note that typical RR graphs are structured : + * + * CHANX/CHANY --> CHANX/CHANY --> ... --> CHANX/CHANY --> IPIN --> SINK + * | + * --> CHANX/CHANY --> ... --> CHANX/CHANY --> IPIN --> SINK + * | + * ... + * + * and there is a variable number of hops from a given CHANX/CHANY to IPIN. + * To avoid impacting on run-time, a fixed number of hops is performed. This + * should be enough to find the delay from the last CAHN to IPIN connection. + */ + pq.push(root); + while (!pq.empty()) { + t_pq_entry curr = pq.top(); + pq.pop(); - x = vtr::nint((rr_graph.node_xlow(rr) + rr_graph.node_xhigh(rr)) / 2.); - y = vtr::nint((rr_graph.node_ylow(rr) + rr_graph.node_yhigh(rr)) / 2.); -} + e_rr_type curr_rr_type = rr_graph.node_type(curr.node); + if (curr_rr_type == IPIN) { + int seg_index = curr.prev_seg_index; -static void print_wire_cost_map(const std::vector& segment_inf) { - auto& device_ctx = g_vpr_ctx.device(); + int node_x = rr_graph.node_xlow(curr.node); + int node_y = rr_graph.node_ylow(curr.node); - for (size_t chan_index = 0; chan_index < f_wire_cost_map.dim_size(0); chan_index++) { - for (size_t iseg = 0; iseg < f_wire_cost_map.dim_size(1); iseg++) { - vtr::printf("Seg %d (%s, length %d) %d\n", - iseg, - segment_inf[iseg].name.c_str(), - segment_inf[iseg].length, - chan_index); - for (size_t iy = 0; iy < device_ctx.grid.height(); iy++) { - for (size_t ix = 0; ix < device_ctx.grid.width(); ix++) { - vtr::printf("%2d,%2d: %10.3g\t", ix, iy, f_wire_cost_map[chan_index][iseg][ix][iy].delay); - } - vtr::printf("\n"); + auto tile_type = device_ctx.grid[node_x][node_y].type; + int itile = tile_type->index; + + int ptc = rr_graph.node_ptc_num(curr.node); + + if (ptc >= int(chan_ipins_delays[itile].size())) { + chan_ipins_delays[itile].resize(ptc + 1); //Inefficient but functional... } - vtr::printf("\n\n"); - } - } -} -static void print_router_cost_map(const t_routing_cost_map& router_cost_map) { - VTR_LOG("Djikstra Flood Costs:\n"); - for (size_t x = 0; x < router_cost_map.dim_size(0); x++) { - for (size_t y = 0; y < router_cost_map.dim_size(1); y++) { - VTR_LOG("(%zu,%zu):\n", x, y); + //Keep costs of the best path to reach each wire type + chan_ipins_delays[itile][ptc][seg_index].wire_rr_type = curr_rr_type; + chan_ipins_delays[itile][ptc][seg_index].wire_seg_index = seg_index; + chan_ipins_delays[itile][ptc][seg_index].delay = curr.delay; + chan_ipins_delays[itile][ptc][seg_index].congestion = curr.congestion; + } else if (curr_rr_type == CHANX || curr_rr_type == CHANY) { + if (curr.level >= MAX_EXPANSION_LEVEL) { + continue; + } + + //We allow expansion through SOURCE/OPIN/IPIN types + int cost_index = rr_graph.node_cost_index(curr.node); + float new_cong = device_ctx.rr_indexed_data[cost_index].base_cost; //Current nodes congestion cost + int seg_index = device_ctx.rr_indexed_data[cost_index].seg_index; + + for (RREdgeId edge : rr_graph.edge_range(curr.node)) { + int iswitch = rr_graph.edge_switch(edge); + float new_delay = device_ctx.rr_switch_inf[iswitch].Tdel; + + RRNodeId next_node = rr_graph.edge_sink_node(edge); + + t_pq_entry next; + next.congestion = new_cong; //Of current node + next.delay = new_delay; //To reach next node + next.node = next_node; + next.level = curr.level + 1; + next.prev_seg_index = seg_index; - for (size_t i = 0; i < router_cost_map[x][y].cost_vector.size(); ++i) { - Cost_Entry entry = router_cost_map[x][y].cost_vector[i]; - VTR_LOG(" %d: delay=%10.3g cong=%10.3g\n", i, entry.delay, entry.congestion); + pq.push(next); } + } else { + VPR_ERROR(VPR_ERROR_ROUTE, "Unrecognized RR type"); } } } -// -// When writing capnp targetted serialization, always allow compilation when -// VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception -// instead. -// -#ifndef VTR_ENABLE_CAPNPROTO +// get an expected minimum cost for routing from the current node to the target node +float MapLookahead::get_expected_cost( + int current_node, + int target_node, + const t_conn_cost_params& params, + float /*R_upstream*/) const { + auto& device_ctx = g_vpr_ctx.device(); -# define DISABLE_ERROR \ - "is disabled because VTR_ENABLE_CAPNPROTO=OFF." \ - "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable." + t_rr_type rr_type = device_ctx.rr_nodes[current_node].type(); -void read_router_lookahead(const std::string& /*file*/) { - VPR_THROW(VPR_ERROR_PLACE, "MapLookahead::read " DISABLE_ERROR); + if (rr_type == CHANX || rr_type == CHANY) { + return get_map_cost( + current_node, target_node, params.criticality); + } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */ + return (device_ctx.rr_indexed_data[SINK_COST_INDEX].base_cost); + } else { /* Change this if you want to investigate route-throughs */ + return (0.); + } } -void DeltaDelayModel::write(const std::string& /*file*/) const { - VPR_THROW(VPR_ERROR_PLACE, "MapLookahead::write " DISABLE_ERROR); +#ifndef VTR_ENABLE_CAPNPROTO + +void MapLookahead::read(const std::string& file) { + VPR_THROW(VPR_ERROR_ROUTE, "MapLookahead::read not implemented"); +} +void MapLookahead::write(const std::string& file) const { + VPR_THROW(VPR_ERROR_ROUTE, "MapLookahead::write not implemented"); } -#else /* VTR_ENABLE_CAPNPROTO */ +#else + +void MapLookahead::read(const std::string& file) { + cost_map_.read(file); + + compute_router_src_opin_lookahead(); + compute_router_chan_ipin_lookahead(); +} +void MapLookahead::write(const std::string& file) const { + cost_map_.write(file); +} -static void ToCostEntry(Cost_Entry* out, const VprMapCostEntry::Reader& in) { +static void ToCostEntry(util::Cost_Entry* out, const VprCostEntry::Reader& in) { out->delay = in.getDelay(); out->congestion = in.getCongestion(); + out->fill = in.getFill(); } -static void FromCostEntry(VprMapCostEntry::Builder* out, const Cost_Entry& in) { +static void FromCostEntry(VprCostEntry::Builder* out, const util::Cost_Entry& in) { out->setDelay(in.delay); out->setCongestion(in.congestion); + out->setFill(in.fill); +} + +static void ToVprVector2D(std::pair* out, const VprVector2D::Reader& in) { + *out = std::make_pair(in.getX(), in.getY()); +} + +static void FromVprVector2D(VprVector2D::Builder* out, const std::pair& in) { + out->setX(in.first); + out->setY(in.second); +} + +static void ToMatrixCostEntry(vtr::NdMatrix* out, + const Matrix::Reader& in) { + ToNdMatrix<2, VprCostEntry, util::Cost_Entry>(out, in, ToCostEntry); } -void read_router_lookahead(const std::string& file) { - vtr::ScopedStartFinishTimer timer("Loading router wire lookahead map"); +static void FromMatrixCostEntry( + Matrix::Builder* out, + const vtr::NdMatrix& in) { + FromNdMatrix<2, VprCostEntry, util::Cost_Entry>( + out, in, FromCostEntry); +} + +static void ToFloat(float* out, const VprFloatEntry::Reader& in) { + // Getting a scalar field is always "get()". + *out = in.getValue(); +} + +static void FromFloat(VprFloatEntry::Builder* out, const float& in) { + // Setting a scalar field is always "set(value)". + out->setValue(in); +} + +void CostMap::read(const std::string& file) { + build_segment_map(); MmapFile f(file); - /* Increase reader limit to 1G words to allow for large files. */ ::capnp::ReaderOptions opts = default_large_capnp_opts(); ::capnp::FlatArrayMessageReader reader(f.getData(), opts); - auto map = reader.getRoot(); + auto cost_map = reader.getRoot(); + { + const auto& offset = cost_map.getOffset(); + ToNdMatrix<2, VprVector2D, std::pair>( + &offset_, offset, ToVprVector2D); + } + + { + const auto& cost_maps = cost_map.getCostMap(); + ToNdMatrix<2, Matrix, vtr::NdMatrix>( + &cost_map_, cost_maps, ToMatrixCostEntry); + } - ToNdMatrix<4, VprMapCostEntry, Cost_Entry>(&f_wire_cost_map, map.getCostMap(), ToCostEntry); + { + const auto& penalty = cost_map.getPenalty(); + ToNdMatrix<2, VprFloatEntry, float>( + &penalty_, penalty, ToFloat); + } } -void write_router_lookahead(const std::string& file) { +void CostMap::write(const std::string& file) const { ::capnp::MallocMessageBuilder builder; - auto map = builder.initRoot(); + auto cost_map = builder.initRoot(); + + { + auto offset = cost_map.initOffset(); + FromNdMatrix<2, VprVector2D, std::pair>( + &offset, offset_, FromVprVector2D); + } + + { + auto cost_maps = cost_map.initCostMap(); + FromNdMatrix<2, Matrix, vtr::NdMatrix>( + &cost_maps, cost_map_, FromMatrixCostEntry); + } - auto cost_map = map.initCostMap(); - FromNdMatrix<4, VprMapCostEntry, Cost_Entry>(&cost_map, f_wire_cost_map, FromCostEntry); + { + auto penalty = cost_map.initPenalty(); + FromNdMatrix<2, VprFloatEntry, float>( + &penalty, penalty_, FromFloat); + } writeMessageToFile(file, &builder); } - #endif diff --git a/vpr/src/route/router_lookahead_map.h b/vpr/src/route/router_lookahead_map.h index e37c9967a00..4d8b043f0e6 100644 --- a/vpr/src/route/router_lookahead_map.h +++ b/vpr/src/route/router_lookahead_map.h @@ -1,48 +1,84 @@ -#pragma once +#ifndef CONNECTION_BOX_LOOKAHEAD_H_ +#define CONNECTION_BOX_LOOKAHEAD_H_ -#include -#include -#include "vtr_ndmatrix.h" +#include +#include "physical_types.h" #include "router_lookahead.h" +#include "router_lookahead_map_utils.h" +#include "vtr_geometry.h" -class MapLookahead : public RouterLookahead { - protected: - float get_expected_cost(int node, int target_node, const t_conn_cost_params& params, float R_upstream) const override; - void compute(const std::vector& segment_inf) override; - void read(const std::string& file) override; - void write(const std::string& file) const override; -}; +// Keys in the RoutingCosts map +struct RoutingCostKey { + // index of the channel (CHANX or CHANY) + int chan_index; -/* f_cost_map is an array of these cost entries that specifies delay/congestion estimates - * to travel relative x/y distances */ -class Cost_Entry { - public: - float delay; - float congestion; + // segment type index + int seg_index; + + // offset of the destination connection box from the starting segment + vtr::Point delta; + + RoutingCostKey() + : seg_index(-1) + , delta(0, 0) {} + RoutingCostKey(int chan_index_arg, int seg_index_arg, vtr::Point delta_arg) + : chan_index(chan_index_arg) + , seg_index(seg_index_arg) + , delta(delta_arg) {} - Cost_Entry() - : Cost_Entry(std::numeric_limits::quiet_NaN(), - std::numeric_limits::quiet_NaN()) { + bool operator==(const RoutingCostKey& other) const { + return seg_index == other.seg_index && chan_index == other.chan_index && delta == other.delta; } +}; - Cost_Entry(float set_delay, float set_congestion) { - delay = set_delay; - congestion = set_congestion; +// hash implementation for RoutingCostKey +struct HashRoutingCostKey { + std::size_t operator()(RoutingCostKey const& key) const noexcept { + std::size_t hash = std::hash{}(key.chan_index); + vtr::hash_combine(hash, key.seg_index); + vtr::hash_combine(hash, key.delta.x()); + vtr::hash_combine(hash, key.delta.y()); + return hash; } }; -/* provides delay/congestion estimates to travel specified distances - * in the x/y direction */ -typedef vtr::NdMatrix t_wire_cost_map; //[0..1][[0..num_seg_types-1]0..device_ctx.grid.width()-1][0..device_ctx.grid.height()-1] - //[0..1] entry distinguish between CHANX/CHANY start nodes respectively +// Map used to store intermediate routing costs +typedef std::unordered_map RoutingCosts; + +// Dense cost maps per source segment and destination connection box types +class CostMap { + public: + void set_counts(size_t seg_count); + void build_segment_map(); + int node_to_segment(int from_node_ind) const; + util::Cost_Entry find_cost(int from_seg_index, e_rr_type rr_type, int delta_x, int delta_y) const; + void set_cost_map(const RoutingCosts& delay_costs, const RoutingCosts& base_costs); + std::pair get_nearby_cost_entry(const vtr::NdMatrix& matrix, int cx, int cy, const vtr::Rect& bounds); + void read(const std::string& file); + void write(const std::string& file) const; + void print(int iseg) const; + std::vector> list_empty() const; + + private: + vtr::Matrix> cost_map_; + vtr::Matrix> offset_; + vtr::Matrix penalty_; + std::vector segment_map_; + size_t seg_count_; +}; + +// Implementation of RouterLookahead based on source segment and destination connection box types +class MapLookahead : public RouterLookahead { + public: + std::pair get_src_opin_delays(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const; + float get_expected_cost(int node, int target_node, const t_conn_cost_params& params, float R_upstream) const override; + float get_map_cost(int from_node_ind, int to_node_ind, float criticality_fac) const; + void compute(const std::vector& segment_inf) override; -void read_router_lookahead(const std::string& file); -void write_router_lookahead(const std::string& file); + void read(const std::string& file) override; + void write(const std::string& file) const override; -/* Computes the lookahead map to be used by the router. If a map was computed prior to this, a new one will not be computed again. - * The rr graph must have been built before calling this function. */ -void compute_router_lookahead(const std::vector& segment_inf); + CostMap cost_map_; +}; -/* queries the lookahead_map (should have been computed prior to routing) to get the expected cost - * from the specified source to the specified target */ -float get_lookahead_map_cost(RRNodeId from_node, RRNodeId to_node, float criticality_fac); +#endif diff --git a/vpr/src/route/router_lookahead_map_utils.cpp b/vpr/src/route/router_lookahead_map_utils.cpp new file mode 100644 index 00000000000..57c12ab84fd --- /dev/null +++ b/vpr/src/route/router_lookahead_map_utils.cpp @@ -0,0 +1,246 @@ +#include "router_lookahead_map_utils.h" + +#include "globals.h" +#include "vpr_context.h" +#include "vtr_math.h" +#include "route_common.h" + +namespace util { + +PQ_Entry::PQ_Entry( + int set_rr_node_ind, + int switch_ind, + float parent_delay, + float parent_R_upstream, + float parent_congestion_upstream, + bool starting_node, + float Tsw_adjust) { + this->rr_node_ind = set_rr_node_ind; + + auto& device_ctx = g_vpr_ctx.device(); + this->delay = parent_delay; + this->congestion_upstream = parent_congestion_upstream; + this->R_upstream = parent_R_upstream; + if (!starting_node) { + float Tsw = device_ctx.rr_switch_inf[switch_ind].Tdel; + Tsw += Tsw_adjust; + VTR_ASSERT(Tsw >= 0.f); + float Rsw = device_ctx.rr_switch_inf[switch_ind].R; + float Cnode = device_ctx.rr_nodes[set_rr_node_ind].C(); + float Rnode = device_ctx.rr_nodes[set_rr_node_ind].R(); + + float T_linear = 0.f; + if (device_ctx.rr_switch_inf[switch_ind].buffered()) { + T_linear = Tsw + Rsw * Cnode + 0.5 * Rnode * Cnode; + } else { /* Pass transistor */ + T_linear = Tsw + 0.5 * Rsw * Cnode; + } + + float base_cost = 0.f; + if (device_ctx.rr_switch_inf[switch_ind].configurable()) { + base_cost = get_rr_cong_cost(set_rr_node_ind); + } + + VTR_ASSERT(T_linear >= 0.); + VTR_ASSERT(base_cost >= 0.); + this->delay += T_linear; + + this->congestion_upstream += base_cost; + } + + /* set the cost of this node */ + this->cost = this->delay; +} + +util::PQ_Entry_Delay::PQ_Entry_Delay( + int set_rr_node_ind, + int switch_ind, + const util::PQ_Entry_Delay* parent) { + this->rr_node_ind = set_rr_node_ind; + + if (parent != nullptr) { + auto& device_ctx = g_vpr_ctx.device(); + float Tsw = device_ctx.rr_switch_inf[switch_ind].Tdel; + float Rsw = device_ctx.rr_switch_inf[switch_ind].R; + float Cnode = device_ctx.rr_nodes[set_rr_node_ind].C(); + float Rnode = device_ctx.rr_nodes[set_rr_node_ind].R(); + + float T_linear = 0.f; + if (device_ctx.rr_switch_inf[switch_ind].buffered()) { + T_linear = Tsw + Rsw * Cnode + 0.5 * Rnode * Cnode; + } else { /* Pass transistor */ + T_linear = Tsw + 0.5 * Rsw * Cnode; + } + + VTR_ASSERT(T_linear >= 0.); + this->delay_cost = parent->delay_cost + T_linear; + } else { + this->delay_cost = 0.f; + } +} + +util::PQ_Entry_Base_Cost::PQ_Entry_Base_Cost( + int set_rr_node_ind, + int switch_ind, + const util::PQ_Entry_Base_Cost* parent) { + this->rr_node_ind = set_rr_node_ind; + + if (parent != nullptr) { + auto& device_ctx = g_vpr_ctx.device(); + if (device_ctx.rr_switch_inf[switch_ind].configurable()) { + this->base_cost = parent->base_cost + get_rr_cong_cost(set_rr_node_ind); + } else { + this->base_cost = parent->base_cost; + } + } else { + this->base_cost = 0.f; + } +} + +/* returns cost entry with the smallest delay */ +util::Cost_Entry util::Expansion_Cost_Entry::get_smallest_entry() const { + util::Cost_Entry smallest_entry; + + for (auto entry : this->cost_vector) { + if (!smallest_entry.valid() || entry.delay < smallest_entry.delay) { + smallest_entry = entry; + } + } + + return smallest_entry; +} + +/* returns a cost entry that represents the average of all the recorded entries */ +util::Cost_Entry util::Expansion_Cost_Entry::get_average_entry() const { + float avg_delay = 0; + float avg_congestion = 0; + + for (auto cost_entry : this->cost_vector) { + avg_delay += cost_entry.delay; + avg_congestion += cost_entry.congestion; + } + + avg_delay /= (float)this->cost_vector.size(); + avg_congestion /= (float)this->cost_vector.size(); + + return util::Cost_Entry(avg_delay, avg_congestion); +} + +/* returns a cost entry that represents the geomean of all the recorded entries */ +util::Cost_Entry util::Expansion_Cost_Entry::get_geomean_entry() const { + float geomean_delay = 0; + float geomean_cong = 0; + for (auto cost_entry : this->cost_vector) { + geomean_delay += log(cost_entry.delay); + geomean_cong += log(cost_entry.congestion); + } + + geomean_delay = exp(geomean_delay / (float)this->cost_vector.size()); + geomean_cong = exp(geomean_cong / (float)this->cost_vector.size()); + + return util::Cost_Entry(geomean_delay, geomean_cong); +} + +/* returns a cost entry that represents the medial of all recorded entries */ +util::Cost_Entry util::Expansion_Cost_Entry::get_median_entry() const { + /* find median by binning the delays of all entries and then chosing the bin + * with the largest number of entries */ + + int num_bins = 10; + + /* find entries with smallest and largest delays */ + util::Cost_Entry min_del_entry; + util::Cost_Entry max_del_entry; + for (auto entry : this->cost_vector) { + if (!min_del_entry.valid() || entry.delay < min_del_entry.delay) { + min_del_entry = entry; + } + if (!max_del_entry.valid() || entry.delay > max_del_entry.delay) { + max_del_entry = entry; + } + } + + /* get the bin size */ + float delay_diff = max_del_entry.delay - min_del_entry.delay; + float bin_size = delay_diff / (float)num_bins; + + /* sort the cost entries into bins */ + std::vector> entry_bins(num_bins, std::vector()); + for (auto entry : this->cost_vector) { + float bin_num = floor((entry.delay - min_del_entry.delay) / bin_size); + + VTR_ASSERT(vtr::nint(bin_num) >= 0 && vtr::nint(bin_num) <= num_bins); + if (vtr::nint(bin_num) == num_bins) { + /* largest entry will otherwise have an out-of-bounds bin number */ + bin_num -= 1; + } + entry_bins[vtr::nint(bin_num)].push_back(entry); + } + + /* find the bin with the largest number of elements */ + int largest_bin = 0; + int largest_size = 0; + for (int ibin = 0; ibin < num_bins; ibin++) { + if (entry_bins[ibin].size() > (unsigned)largest_size) { + largest_bin = ibin; + largest_size = (unsigned)entry_bins[ibin].size(); + } + } + + /* get the representative delay of the largest bin */ + util::Cost_Entry representative_entry = entry_bins[largest_bin][0]; + + return representative_entry; +} + +template +void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes, + const Entry& parent_entry, + std::vector* paths, + std::vector* node_expanded, + std::priority_queue, + std::greater>* pq) { + int parent_ind = size_t(parent_entry.rr_node_ind); + + auto& parent_node = rr_nodes[parent_ind]; + + for (int iedge = 0; iedge < parent_node.num_edges(); iedge++) { + int child_node_ind = parent_node.edge_sink_node(iedge); + int switch_ind = parent_node.edge_switch(iedge); + + /* skip this child if it has already been expanded from */ + if ((*node_expanded)[child_node_ind]) { + continue; + } + + Entry child_entry(child_node_ind, switch_ind, &parent_entry); + VTR_ASSERT(child_entry.cost() >= 0); + + /* Create (if it doesn't exist) or update (if the new cost is lower) + * to specified node */ + Search_Path path_entry = {child_entry.cost(), parent_ind, iedge}; + auto& path = (*paths)[child_node_ind]; + if (path_entry.cost < path.cost) { + pq->push(child_entry); + path = path_entry; + } + } +} + +template void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes, + const PQ_Entry_Delay& parent_entry, + std::vector* paths, + std::vector* node_expanded, + std::priority_queue, + std::greater>* pq); +template void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes, + const PQ_Entry_Base_Cost& parent_entry, + std::vector* paths, + std::vector* node_expanded, + std::priority_queue, + std::greater>* pq); + +} // namespace util diff --git a/vpr/src/route/router_lookahead_map_utils.h b/vpr/src/route/router_lookahead_map_utils.h new file mode 100644 index 00000000000..28b60e3072b --- /dev/null +++ b/vpr/src/route/router_lookahead_map_utils.h @@ -0,0 +1,206 @@ +#ifndef ROUTER_LOOKAHEAD_MAP_UTILS_H_ +#define ROUTER_LOOKAHEAD_MAP_UTILS_H_ +/* + * The router lookahead provides an estimate of the cost from an intermediate node to the target node + * during directed (A*-like) routing. + * + * The VPR 7.0 lookahead (route/route_timing.c ==> get_timing_driven_expected_cost) lower-bounds the remaining delay and + * congestion by assuming that a minimum number of wires, of the same type as the current node being expanded, can be used + * to complete the route. While this method is efficient, it can run into trouble with architectures that use + * multiple interconnected wire types. + * + * The lookahead in this file pre-computes delay/congestion costs up and to the right of a starting tile. This generates + * delay/congestion tables for {CHANX, CHANY} channel types, over all wire types defined in the architecture file. + * See Section 3.2.4 in Oleg Petelin's MASc thesis (2016) for more discussion. + * + */ + +#include +#include +#include +#include +#include +#include "vpr_types.h" +#include "rr_node.h" + +namespace util { + +/* when a list of delay/congestion entries at a coordinate in Cost_Entry is boiled down to a single + * representative entry, this enum is passed-in to specify how that representative entry should be + * calculated */ +enum e_representative_entry_method { + FIRST = 0, //the first cost that was recorded + SMALLEST, //the smallest-delay cost recorded + AVERAGE, + GEOMEAN, + MEDIAN +}; + +/* f_cost_map is an array of these cost entries that specifies delay/congestion estimates + * to travel relative x/y distances */ +class Cost_Entry { + public: + float delay; + float congestion; + bool fill; + + Cost_Entry() { + delay = std::numeric_limits::infinity(); + congestion = std::numeric_limits::infinity(); + fill = false; + } + Cost_Entry(float set_delay, float set_congestion) + : delay(set_delay) + , congestion(set_congestion) + , fill(false) {} + Cost_Entry(float set_delay, float set_congestion, bool set_fill) + : delay(set_delay) + , congestion(set_congestion) + , fill(set_fill) {} + bool valid() const { + return std::isfinite(delay) && std::isfinite(congestion); + } +}; + +/* a class that stores delay/congestion information for a given relative coordinate during the Dijkstra expansion. + * since it stores multiple cost entries, it is later boiled down to a single representative cost entry to be stored + * in the final lookahead cost map */ +class Expansion_Cost_Entry { + private: + std::vector cost_vector; + + Cost_Entry get_smallest_entry() const; + Cost_Entry get_average_entry() const; + Cost_Entry get_geomean_entry() const; + Cost_Entry get_median_entry() const; + + public: + void add_cost_entry(e_representative_entry_method method, + float add_delay, + float add_congestion) { + Cost_Entry cost_entry(add_delay, add_congestion); + if (method == SMALLEST) { + /* taking the smallest-delay entry anyway, so no need to push back multple entries */ + if (this->cost_vector.empty()) { + this->cost_vector.push_back(cost_entry); + } else { + if (add_delay < this->cost_vector[0].delay) { + this->cost_vector[0] = cost_entry; + } + } + } else { + this->cost_vector.push_back(cost_entry); + } + } + void clear_cost_entries() { + this->cost_vector.clear(); + } + + Cost_Entry get_representative_cost_entry(e_representative_entry_method method) const { + Cost_Entry entry; + + if (!cost_vector.empty()) { + switch (method) { + case FIRST: + entry = cost_vector[0]; + break; + case SMALLEST: + entry = this->get_smallest_entry(); + break; + case AVERAGE: + entry = this->get_average_entry(); + break; + case GEOMEAN: + entry = this->get_geomean_entry(); + break; + case MEDIAN: + entry = this->get_median_entry(); + break; + default: + break; + } + } + return entry; + } +}; + +/* a class that represents an entry in the Dijkstra expansion priority queue */ +class PQ_Entry { + public: + int rr_node_ind; //index in device_ctx.rr_nodes that this entry represents + float cost; //the cost of the path to get to this node + + /* store backward delay, R and congestion info */ + float delay; + float R_upstream; + float congestion_upstream; + + PQ_Entry(int set_rr_node_ind, int /*switch_ind*/, float parent_delay, float parent_R_upstream, float parent_congestion_upstream, bool starting_node, float Tsw_adjust); + + bool operator<(const PQ_Entry& obj) const { + /* inserted into max priority queue so want queue entries with a lower cost to be greater */ + return (this->cost > obj.cost); + } +}; + +// A version of PQ_Entry that only calculates and stores the delay. +class PQ_Entry_Delay { + public: + int rr_node_ind; //index in device_ctx.rr_nodes that this entry represents + float delay_cost; //the cost of the path to get to this node + + PQ_Entry_Delay(int set_rr_node_ind, int /*switch_ind*/, const PQ_Entry_Delay* parent); + + float cost() const { + return delay_cost; + } + + void adjust_Tsw(float amount) { + delay_cost += amount; + } + + bool operator>(const PQ_Entry_Delay& obj) const { + return (this->delay_cost > obj.delay_cost); + } +}; + +// A version of PQ_Entry that only calculates and stores the base cost. +class PQ_Entry_Base_Cost { + public: + int rr_node_ind; //index in device_ctx.rr_nodes that this entry represents + float base_cost; + + PQ_Entry_Base_Cost(int set_rr_node_ind, int /*switch_ind*/, const PQ_Entry_Base_Cost* parent); + + float cost() const { + return base_cost; + } + + void adjust_Tsw(float /* amount */) { + // do nothing + } + + bool operator>(const PQ_Entry_Base_Cost& obj) const { + return (this->base_cost > obj.base_cost); + } +}; + +struct Search_Path { + float cost; + int parent; + int edge; +}; + +/* iterates over the children of the specified node and selectively pushes them onto the priority queue */ +template +void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes, + const Entry& parent_entry, + std::vector* paths, + std::vector* node_expanded, + std::priority_queue, + std::greater>* pq); + +} // namespace util + +#endif diff --git a/vpr/src/route/router_lookahead_sampling.cpp b/vpr/src/route/router_lookahead_sampling.cpp new file mode 100644 index 00000000000..568d5134602 --- /dev/null +++ b/vpr/src/route/router_lookahead_sampling.cpp @@ -0,0 +1,229 @@ +#include "router_lookahead_sampling.h" + +#include + +#include "globals.h" +#include "vtr_math.h" +#include "vtr_geometry.h" +#include "vtr_time.h" + +// Sample based an NxN grid of starting segments, where N = SAMPLE_GRID_SIZE +static constexpr int SAMPLE_GRID_SIZE = 2; + +// quantiles (like percentiles but 0-1) of segment count to use as a selection criteria +// choose locations with higher, but not extreme, counts of each segment type +static constexpr double kSamplingCountLowerQuantile = 0.5; +static constexpr double kSamplingCountUpperQuantile = 0.7; + +// also known as the L1 norm +static int manhattan_distance(const vtr::Point& a, const vtr::Point& b) { + return abs(b.x() - a.x()) + abs(b.y() - a.y()); +} + +// the smallest bounding box containing a node +static vtr::Rect bounding_box_for_node(int node_ind) { + auto& device_ctx = g_vpr_ctx.device(); + auto& rr_graph = device_ctx.rr_nodes; + int x = rr_graph.node_xlow(RRNodeId(node_ind)); + int y = rr_graph.node_ylow(RRNodeId(node_ind)); + + return vtr::Rect(vtr::Point(x, y)); +} + +static vtr::Rect sample_window(const vtr::Rect& bounding_box, int sx, int sy, int n) { + return vtr::Rect(sample(bounding_box, sx, sy, n), + sample(bounding_box, sx + 1, sy + 1, n)); +} + +static std::vector choose_points(const vtr::Matrix& counts, + const vtr::Rect& window, + int min_count, + int max_count) { + VTR_ASSERT(min_count <= max_count); + std::vector points; + for (int y = window.ymin(); y < window.ymax(); y++) { + for (int x = window.xmin(); x < window.xmax(); x++) { + if (counts[x][y] >= min_count && counts[x][y] <= max_count) { + points.push_back(SamplePoint{/* .location = */ vtr::Point(x, y), + /* .nodes = */ {}}); + } + } + } + + vtr::Point center = sample(window, 1, 1, 2); + + // sort by distance from center + std::sort(points.begin(), points.end(), + [&](const SamplePoint& a, const SamplePoint& b) { + return manhattan_distance(a.location, center) < manhattan_distance(b.location, center); + }); + + return points; +} + +// histogram is a map from segment count to number of locations having that count +static int quantile(const std::map& histogram, float ratio) { + if (histogram.empty()) { + return 0; + } + int sum = 0; + for (const auto& entry : histogram) { + sum += entry.second; + } + int limit = std::ceil(sum * ratio); + for (const auto& entry : histogram) { + limit -= entry.second; + if (limit <= 0) { + return entry.first; + } + } + return 0; +} + +// select a good number of segments to find +static std::map count_histogram(const vtr::Rect& box, const vtr::Matrix& counts) { + std::map histogram; + for (int y = box.ymin(); y < box.ymax(); y++) { + for (int x = box.xmin(); x < box.xmax(); x++) { + int count = counts[x][y]; + if (count > 0) { + ++histogram[count]; + } + } + } + return histogram; +} + +// Used to calculate each region's `order.' +// A space-filling curve will order the regions so that +// nearby points stay close in order. A Hilbert curve might +// be better, but a Morton (Z)-order curve is easy to compute, +// because it's just interleaving binary bits, so this +// function interleaves with 0's so that the X and Y +// dimensions can then be OR'ed together. +static uint64_t interleave(uint32_t x) { + uint64_t i = x; + i = (i ^ (i << 16)) & 0x0000ffff0000ffff; + i = (i ^ (i << 8)) & 0x00ff00ff00ff00ff; + i = (i ^ (i << 4)) & 0x0f0f0f0f0f0f0f0f; + i = (i ^ (i << 2)) & 0x3333333333333333; + i = (i ^ (i << 1)) & 0x5555555555555555; + return i; +} + +// for each segment type, find the nearest nodes to an equally spaced grid of points +// within the bounding box for that segment type +std::vector find_sample_regions(int num_segments) { + vtr::ScopedStartFinishTimer timer("finding sample regions"); + std::vector sample_regions; + auto& device_ctx = g_vpr_ctx.device(); + auto& rr_nodes = device_ctx.rr_nodes; + std::vector> segment_counts(num_segments); + + // compute bounding boxes for each segment type + std::vector> bounding_box_for_segment(num_segments, vtr::Rect()); + for (size_t i = 0; i < rr_nodes.size(); i++) { + auto& node = rr_nodes[i]; + if (node.type() != CHANX && node.type() != CHANY) continue; + if (node.capacity() == 0 || node.num_edges() == 0) continue; + int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index; + + VTR_ASSERT(seg_index != OPEN); + VTR_ASSERT(seg_index < num_segments); + + bounding_box_for_segment[seg_index].expand_bounding_box(bounding_box_for_node(i)); + } + + // initialize counts + for (int seg = 0; seg < num_segments; seg++) { + const auto& box = bounding_box_for_segment[seg]; + segment_counts[seg] = vtr::Matrix({size_t(box.width()), size_t(box.height())}, 0); + } + + // count sample points + for (size_t i = 0; i < rr_nodes.size(); i++) { + auto& node = rr_nodes[i]; + if (node.type() != CHANX && node.type() != CHANY) continue; + if (node.capacity() == 0 || node.num_edges() == 0) continue; + int x = rr_nodes.node_xlow(RRNodeId(i)); + int y = rr_nodes.node_ylow(RRNodeId(i)); + + int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index; + segment_counts[seg_index][x][y] += 1; + + VTR_ASSERT(seg_index != OPEN); + VTR_ASSERT(seg_index < num_segments); + } + + // select sample points + for (int i = 0; i < num_segments; i++) { + const auto& counts = segment_counts[i]; + const auto& bounding_box = bounding_box_for_segment[i]; + if (bounding_box.empty()) continue; + for (int y = 0; y < SAMPLE_GRID_SIZE; y++) { + for (int x = 0; x < SAMPLE_GRID_SIZE; x++) { + vtr::Rect window = sample_window(bounding_box, x, y, SAMPLE_GRID_SIZE); + if (window.empty()) continue; + + auto histogram = count_histogram(window, segment_counts[i]); + SampleRegion region = { + /* .segment_type = */ i, + /* .grid_location = */ vtr::Point(x, y), + /* .points = */ choose_points(counts, window, quantile(histogram, kSamplingCountLowerQuantile), quantile(histogram, kSamplingCountUpperQuantile)), + /* .order = */ 0}; + if (!region.points.empty()) { + /* In order to improve caching, the list of sample points are + * sorted to keep points that are nearby on the Euclidean plane also + * nearby in the vector of sample points. + * + * This means subsequent expansions on the same thread are likely + * to cover a similar set of nodes, so they are more likely to be + * cached. This improves performance by about 7%, which isn't a lot, + * but not a bad improvement for a few lines of code. */ + vtr::Point location = region.points[0].location; + + // interleave bits of X and Y for a Z-curve ordering. + region.order = interleave(location.x()) | (interleave(location.y()) << 1); + + sample_regions.push_back(region); + } + } + } + } + + // sort regions + std::sort(sample_regions.begin(), sample_regions.end(), + [](const SampleRegion& a, const SampleRegion& b) { + return a.order < b.order; + }); + + // build an index of sample points on segment type and location + std::map, SamplePoint*> sample_point_index; + for (auto& region : sample_regions) { + for (auto& point : region.points) { + sample_point_index[std::make_tuple(region.segment_type, point.location.x(), point.location.y())] = &point; + } + } + + // collect the node indices for each segment type at the selected sample points + for (size_t i = 0; i < rr_nodes.size(); i++) { + auto& node = rr_nodes[i]; + if (node.type() != CHANX && node.type() != CHANY) continue; + if (node.capacity() == 0 || node.num_edges() == 0) continue; + + int x = rr_nodes.node_xlow(RRNodeId(i)); + int y = rr_nodes.node_ylow(RRNodeId(i)); + + int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index; + + VTR_ASSERT(seg_index != OPEN); + VTR_ASSERT(seg_index < num_segments); + + auto point = sample_point_index.find(std::make_tuple(seg_index, x, y)); + if (point != sample_point_index.end()) { + point->second->nodes.push_back(i); + } + } + + return sample_regions; +} diff --git a/vpr/src/route/router_lookahead_sampling.h b/vpr/src/route/router_lookahead_sampling.h new file mode 100644 index 00000000000..6668792e9c6 --- /dev/null +++ b/vpr/src/route/router_lookahead_sampling.h @@ -0,0 +1,35 @@ +#ifndef ROUTER_LOOKAHEAD_SAMPLING_H +#define ROUTER_LOOKAHEAD_SAMPLING_H + +#include +#include "vtr_geometry.h" +#include "globals.h" + +// a sample point for a segment type, contains all segments at the VPR location +struct SamplePoint { + // canonical location + vtr::Point location; + + // nodes to expand + std::vector nodes; +}; + +struct SampleRegion { + // all nodes in `points' have this segment type + int segment_type; + + // location on the sample grid + vtr::Point grid_location; + + // locations to try + // The computation will keep expanding each of the points + // until a number of paths (segment -> connection box) are found. + std::vector points; + + // used to sort the regions to improve caching + uint64_t order; +}; + +std::vector find_sample_regions(int num_segments); + +#endif diff --git a/vpr/src/route/rr_graph_storage.h b/vpr/src/route/rr_graph_storage.h index cc0207ee123..66837ef5f78 100644 --- a/vpr/src/route/rr_graph_storage.h +++ b/vpr/src/route/rr_graph_storage.h @@ -48,7 +48,7 @@ * This field is valid only for IPINs and OPINs and should be ignored * * otherwise. */ struct alignas(16) t_rr_node_data { - int8_t cost_index_ = -1; + int16_t cost_index_ = -1; int16_t rc_index_ = -1; int16_t xlow_ = -1; diff --git a/vpr/test/test_map_lookahead_serdes.cpp b/vpr/test/test_map_lookahead_serdes.cpp deleted file mode 100644 index 270bc689a20..00000000000 --- a/vpr/test/test_map_lookahead_serdes.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include "catch.hpp" - -#include "router_lookahead_map.h" - -extern t_wire_cost_map f_wire_cost_map; - -namespace { - -#ifdef VTR_ENABLE_CAPNPROTO -static constexpr const char kMapLookaheadBin[] = "test_map_lookahead.bin"; - -TEST_CASE("round_trip_map_lookahead", "[vpr]") { - constexpr std::array kDim({10, 12, 15, 16}); - - f_wire_cost_map.resize(kDim); - for (size_t x = 0; x < kDim[0]; ++x) { - for (size_t y = 0; y < kDim[1]; ++y) { - for (size_t z = 0; z < kDim[2]; ++z) { - for (size_t w = 0; w < kDim[3]; ++w) { - f_wire_cost_map[x][y][z][w].delay = (x + 1) * (y + 1) * (z + 1) * (w + 1); - f_wire_cost_map[x][y][z][w].congestion = 2 * (x + 1) * (y + 1) * (z + 1) * (w + 1); - } - } - } - } - - write_router_lookahead(kMapLookaheadBin); - - for (size_t x = 0; x < kDim[0]; ++x) { - for (size_t y = 0; y < kDim[1]; ++y) { - for (size_t z = 0; z < kDim[2]; ++z) { - for (size_t w = 0; w < kDim[3]; ++w) { - f_wire_cost_map[x][y][z][w].delay = 0.f; - f_wire_cost_map[x][y][z][w].congestion = 0.f; - } - } - } - } - - f_wire_cost_map.resize({0, 0, 0, 0}); - - read_router_lookahead(kMapLookaheadBin); - - for (size_t i = 0; i < kDim.size(); ++i) { - REQUIRE(f_wire_cost_map.dim_size(i) == kDim[i]); - } - - for (size_t x = 0; x < kDim[0]; ++x) { - for (size_t y = 0; y < kDim[1]; ++y) { - for (size_t z = 0; z < kDim[2]; ++z) { - for (size_t w = 0; w < kDim[3]; ++w) { - REQUIRE(f_wire_cost_map[x][y][z][w].delay == (x + 1) * (y + 1) * (z + 1) * (w + 1)); - REQUIRE(f_wire_cost_map[x][y][z][w].congestion == 2 * (x + 1) * (y + 1) * (z + 1) * (w + 1)); - } - } - } - } -} - -#endif - -} // namespace