diff --git a/vpr/src/route/router_lookahead_map.cpp b/vpr/src/route/router_lookahead_map.cpp index 0b9772e2e50..6ece229cfc2 100644 --- a/vpr/src/route/router_lookahead_map.cpp +++ b/vpr/src/route/router_lookahead_map.cpp @@ -35,6 +35,7 @@ #include "vtr_time.h" #include "vtr_geometry.h" #include "router_lookahead_map.h" +#include "router_lookahead_sampling.h" #include "rr_graph2.h" #include "rr_graph.h" #include "route_common.h" @@ -48,6 +49,11 @@ # include "serdes_utils.h" #endif /* VTR_ENABLE_CAPNPROTO */ +#if defined(VPR_USE_TBB) +# include +# include +#endif + /* we will profile delay/congestion using this many tracks for each wire type */ #define MAX_TRACK_OFFSET 16 @@ -193,7 +199,7 @@ struct t_reachable_wire_inf { /* used during Dijkstra expansion to store delay/congestion info lists for each relative coordinate for a given segment and channel type. * the list at each coordinate is later boiled down to a single representative cost entry to be stored in the final cost map */ -typedef vtr::Matrix t_routing_cost_map; //[0..device_ctx.grid.width()-1][0..device_ctx.grid.height()-1] +typedef vtr::NdMatrix t_routing_cost_map; //[0..1][0..num_segments-1][0..device_ctx.grid.width()-1][0..device_ctx.grid.height()-1] typedef std::vector>> t_src_opin_reachable_wires; //[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1][wire_seg_index] // ^ ^ ^ @@ -229,17 +235,15 @@ static void compute_router_src_opin_lookahead(); static vtr::Point pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point start); void dijkstra_flood_to_wires(int itile, RRNodeId inode, t_src_opin_reachable_wires& src_opin_reachable_wires); -/* returns index of a node from which to start routing */ -static RRNodeId get_start_node(int start_x, int start_y, int target_x, int target_y, t_rr_type rr_type, int seg_index, int track_offset); /* runs Dijkstra's algorithm from specified node until all nodes have been visited. Each time a pin is visited, the delay/congestion information * to that pin is stored is added to an entry in the routing_cost_map */ static void run_dijkstra(RRNodeId start_node, int start_x, int start_y, t_routing_cost_map& routing_cost_map, t_dijkstra_data* data); /* iterates over the children of the specified node and selectively pushes them onto the priority queue */ static void expand_dijkstra_neighbours(PQ_Entry parent_entry, vtr::vector& node_visited_costs, vtr::vector& node_expanded, std::priority_queue& pq); /* sets the lookahead cost map entries based on representative cost entries from routing_cost_map */ -static void set_lookahead_map_costs(int segment_index, e_rr_type chan_type, t_routing_cost_map& routing_cost_map); +static void set_lookahead_map_costs(t_routing_cost_map& routing_cost_map); /* fills in missing lookahead map entries by copying the cost of the closest valid entry */ -static void fill_in_missing_lookahead_entries(int segment_index, e_rr_type chan_type); +static void fill_in_missing_lookahead_entries(int num_segments); /* returns a cost entry in the f_wire_cost_map that is near the specified coordinates (and preferably towards (0,0)) */ static Cost_Entry get_nearby_cost_entry(int x, int y, int segment_index, int chan_index); /* returns the absolute delta_x and delta_y offset required to reach to_node from from_node */ @@ -424,135 +428,66 @@ static void compute_router_wire_lookahead(const std::vector& segm vtr::ScopedStartFinishTimer timer("Computing wire lookahead"); auto& device_ctx = g_vpr_ctx.device(); - auto& grid = device_ctx.grid; auto& rr_graph = device_ctx.rr_nodes; //Re-allocate f_wire_cost_map = t_wire_cost_map({2, segment_inf.size(), device_ctx.grid.width(), device_ctx.grid.height()}); - int longest_length = 0; - for (const auto& seg_inf : segment_inf) { - longest_length = std::max(longest_length, seg_inf.length); - } - - //Start sampling at the lower left non-corner - int ref_x = 1; - int ref_y = 1; - - //Sample from locations near the reference location (to capture maximum distance paths) - //Also sample from locations at least the longest wire length away from the edge (to avoid - //edge effects for shorter distances) - std::vector ref_increments = {0, 1, - longest_length, longest_length + 1}; - - //Uniquify the increments (avoid sampling the same locations repeatedly if they happen to - //overlap) - std::sort(ref_increments.begin(), ref_increments.end()); - ref_increments.erase(std::unique(ref_increments.begin(), ref_increments.end()), ref_increments.end()); - - //Upper right non-corner - int target_x = device_ctx.grid.width() - 2; - int target_y = device_ctx.grid.height() - 2; - - //Profile each wire segment type - for (int iseg = 0; iseg < int(segment_inf.size()); iseg++) { - //First try to pick good representative sample locations for each type - std::map> sample_nodes; - for (e_rr_type chan_type : {CHANX, CHANY}) { - for (int ref_inc : ref_increments) { - int sample_x = ref_x + ref_inc; - int sample_y = ref_y + ref_inc; - - if (sample_x >= int(grid.width())) continue; - if (sample_y >= int(grid.height())) continue; - - for (int track_offset = 0; track_offset < MAX_TRACK_OFFSET; track_offset += 2) { - /* get the rr node index from which to start routing */ - RRNodeId start_node = get_start_node(sample_x, sample_y, - target_x, target_y, //non-corner upper right - chan_type, iseg, track_offset); - - if (!start_node) { - continue; - } - - sample_nodes[chan_type].push_back(RRNodeId(start_node)); - } - } - } - - //If we failed to find any representative sample locations, search exhaustively - // - //This is to ensure we sample 'unusual' wire types which may not exist in all channels - //(e.g. clock routing) - for (e_rr_type chan_type : {CHANX, CHANY}) { - if (!sample_nodes[chan_type].empty()) continue; + size_t num_segments = segment_inf.size(); + std::vector sample_regions = find_sample_regions(num_segments); - //Try an exhaustive search to find a suitable sample point - for (int inode = 0; inode < int(device_ctx.rr_nodes.size()); ++inode) { - auto rr_node = RRNodeId(inode); - auto rr_type = rr_graph.node_type(rr_node); - if (rr_type != chan_type) continue; + /* run Dijkstra's algorithm for each segment type & channel type combination */ +#if defined(VPR_USE_TBB) + tbb::mutex all_costs_mutex; + tbb::parallel_for_each(sample_regions, [&](const SampleRegion& region) { +#else + for (const auto& region : sample_regions) { +#endif + t_dijkstra_data dijkstra_data; + t_routing_cost_map routing_cost_map({2, num_segments, device_ctx.grid.width(), device_ctx.grid.height()}); + routing_cost_map.fill(Expansion_Cost_Entry()); - int cost_index = rr_graph.node_cost_index(rr_node); - VTR_ASSERT(cost_index != OPEN); + for (auto& point : region.points) { + for (auto node_ind : point.nodes) { + //reset cost for this segment + RRNodeId sample_node(node_ind); - int seg_index = device_ctx.rr_indexed_data[cost_index].seg_index; + int sample_x = rr_graph.node_xlow(sample_node); + int sample_y = rr_graph.node_ylow(sample_node); - if (seg_index == iseg) { - sample_nodes[chan_type].push_back(RRNodeId(inode)); + if (rr_graph.node_direction(sample_node) == DEC_DIRECTION) { + sample_x = rr_graph.node_xhigh(sample_node); + sample_y = rr_graph.node_yhigh(sample_node); } - if (sample_nodes[chan_type].size() >= ref_increments.size()) { - break; - } + run_dijkstra(sample_node, + sample_x, + sample_y, + routing_cost_map, + &dijkstra_data); } } - //Finally, now that we have a list of sample locations, run a Djikstra flood from - //each sample location to profile the routing network from this type - - t_dijkstra_data dijkstra_data; - t_routing_cost_map routing_cost_map({device_ctx.grid.width(), device_ctx.grid.height()}); - - for (e_rr_type chan_type : {CHANX, CHANY}) { - if (sample_nodes[chan_type].empty()) { - VTR_LOG_WARN("Unable to find any sample location for segment %s type '%s' (length %d)\n", - rr_node_typename[chan_type], - segment_inf[iseg].name.c_str(), - segment_inf[iseg].length); - } else { - //reset cost for this segment - routing_cost_map.fill(Expansion_Cost_Entry()); - - for (RRNodeId sample_node : sample_nodes[chan_type]) { - int sample_x = rr_graph.node_xlow(sample_node); - int sample_y = rr_graph.node_ylow(sample_node); - - if (rr_graph.node_direction(sample_node) == DEC_DIRECTION) { - sample_x = rr_graph.node_xhigh(sample_node); - sample_y = rr_graph.node_yhigh(sample_node); - } - - run_dijkstra(sample_node, - sample_x, - sample_y, - routing_cost_map, - &dijkstra_data); - } +#if defined(VPR_USE_TBB) + all_costs_mutex.lock(); +#endif - if (false) print_router_cost_map(routing_cost_map); + if (false) print_router_cost_map(routing_cost_map); - /* boil down the cost list in routing_cost_map at each coordinate to a representative cost entry and store it in the lookahead - * cost map */ - set_lookahead_map_costs(iseg, chan_type, routing_cost_map); + /* boil down the cost list in routing_cost_map at each coordinate to a representative cost entry and store it in the lookahead + * cost map */ + set_lookahead_map_costs(routing_cost_map); - /* fill in missing entries in the lookahead cost map by copying the closest cost entries (cost map was computed based on - * a reference coordinate > (0,0) so some entries that represent a cross-chip distance have not been computed) */ - fill_in_missing_lookahead_entries(iseg, chan_type); - } - } +#if defined(VPR_USE_TBB) + all_costs_mutex.unlock(); + }); +#else } +#endif + + /* fill in missing entries in the lookahead cost map by copying the closest cost entries (cost map was computed based on + * a reference coordinate > (0,0) so some entries that represent a cross-chip distance have not been computed) */ + fill_in_missing_lookahead_entries(num_segments); if (false) print_wire_cost_map(segment_inf); } @@ -765,64 +700,22 @@ void dijkstra_flood_to_wires(int itile, RRNodeId node, t_src_opin_reachable_wire } } -/* returns index of a node from which to start routing */ -static RRNodeId get_start_node(int start_x, int start_y, int target_x, int target_y, t_rr_type rr_type, int seg_index, int track_offset) { - auto& device_ctx = g_vpr_ctx.device(); - auto& rr_graph = device_ctx.rr_nodes; - - int result = UNDEFINED; - - if (rr_type != CHANX && rr_type != CHANY) { - VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Must start lookahead routing from CHANX or CHANY node\n"); - } - - /* determine which direction the wire should go in based on the start & target coordinates */ - e_direction direction = INC_DIRECTION; - if ((rr_type == CHANX && target_x < start_x) || (rr_type == CHANY && target_y < start_y)) { - direction = DEC_DIRECTION; - } - - int start_lookup_x = start_x; - int start_lookup_y = start_y; - if (rr_type == CHANX) { - //Bizarely, rr_node_indices stores CHANX with swapped x/y... - std::swap(start_lookup_x, start_lookup_y); - } - - const std::vector& channel_node_list = device_ctx.rr_node_indices[rr_type][start_lookup_x][start_lookup_y][0]; - - /* find first node in channel that has specified segment index and goes in the desired direction */ - for (unsigned itrack = 0; itrack < channel_node_list.size(); itrack++) { - int node_ind = channel_node_list[itrack]; - if (node_ind < 0) continue; - - RRNodeId node_id(node_ind); - - VTR_ASSERT(rr_graph.node_type(node_id) == rr_type); - - e_direction node_direction = rr_graph.node_direction(node_id); - int node_cost_ind = rr_graph.node_cost_index(node_id); - int node_seg_ind = device_ctx.rr_indexed_data[node_cost_ind].seg_index; - - if ((node_direction == direction || node_direction == BI_DIRECTION) && node_seg_ind == seg_index) { - /* found first track that has the specified segment index and goes in the desired direction */ - result = node_ind; - if (track_offset == 0) { - break; - } - track_offset -= 2; - } - } - - return RRNodeId(result); -} - /* runs Dijkstra's algorithm from specified node until all nodes have been visited. Each time a pin is visited, the delay/congestion information * to that pin is stored is added to an entry in the routing_cost_map */ static void run_dijkstra(RRNodeId start_node, int start_x, int start_y, t_routing_cost_map& routing_cost_map, t_dijkstra_data* data) { auto& device_ctx = g_vpr_ctx.device(); auto& rr_graph = device_ctx.rr_nodes; + // Get start node channel + int chan = 0; + if (rr_graph.node_type(start_node) == CHANY) { + chan = 1; + } + + // Get start node segment + int cost_index = rr_graph.node_cost_index(start_node); + int seg_index = device_ctx.rr_indexed_data[cost_index].seg_index; + auto& node_expanded = data->node_expanded; node_expanded.resize(device_ctx.rr_nodes.size()); std::fill(node_expanded.begin(), node_expanded.end(), false); @@ -869,7 +762,7 @@ static void run_dijkstra(RRNodeId start_node, int start_x, int start_y, t_routin delta_x = std::abs(delta_x); delta_y = std::abs(delta_y); - routing_cost_map[delta_x][delta_y].add_cost_entry(current.delay, current.congestion_upstream); + routing_cost_map[chan][seg_index][delta_x][delta_y].add_cost_entry(current.delay, current.congestion_upstream); } } @@ -913,39 +806,37 @@ static void expand_dijkstra_neighbours(PQ_Entry parent_entry, vtr::vector& segment_inf) { static void print_router_cost_map(const t_routing_cost_map& router_cost_map) { VTR_LOG("Djikstra Flood Costs:\n"); - for (size_t x = 0; x < router_cost_map.dim_size(0); x++) { - for (size_t y = 0; y < router_cost_map.dim_size(1); y++) { - VTR_LOG("(%zu,%zu):\n", x, y); - - for (size_t i = 0; i < router_cost_map[x][y].cost_vector.size(); ++i) { - Cost_Entry entry = router_cost_map[x][y].cost_vector[i]; - VTR_LOG(" %d: delay=%10.3g cong=%10.3g\n", i, entry.delay, entry.congestion); + for (size_t ichan; ichan < router_cost_map.dim_size(0); ichan++) { + for (size_t iseg; iseg < router_cost_map.dim_size(1); iseg++) { + for (size_t x = 0; x < router_cost_map.dim_size(2); x++) { + for (size_t y = 0; y < router_cost_map.dim_size(3); y++) { + VTR_LOG("CHAN %zu, Seg index %zu (%zu,%zu):\n", ichan, iseg, x, y); + + for (size_t i = 0; i < router_cost_map[ichan][iseg][x][y].cost_vector.size(); ++i) { + Cost_Entry entry = router_cost_map[ichan][iseg][x][y].cost_vector[i]; + VTR_LOG(" %d: delay=%10.3g cong=%10.3g\n", i, entry.delay, entry.congestion); + } + } } } } diff --git a/vpr/src/route/router_lookahead_sampling.cpp b/vpr/src/route/router_lookahead_sampling.cpp new file mode 100644 index 00000000000..568d5134602 --- /dev/null +++ b/vpr/src/route/router_lookahead_sampling.cpp @@ -0,0 +1,229 @@ +#include "router_lookahead_sampling.h" + +#include + +#include "globals.h" +#include "vtr_math.h" +#include "vtr_geometry.h" +#include "vtr_time.h" + +// Sample based an NxN grid of starting segments, where N = SAMPLE_GRID_SIZE +static constexpr int SAMPLE_GRID_SIZE = 2; + +// quantiles (like percentiles but 0-1) of segment count to use as a selection criteria +// choose locations with higher, but not extreme, counts of each segment type +static constexpr double kSamplingCountLowerQuantile = 0.5; +static constexpr double kSamplingCountUpperQuantile = 0.7; + +// also known as the L1 norm +static int manhattan_distance(const vtr::Point& a, const vtr::Point& b) { + return abs(b.x() - a.x()) + abs(b.y() - a.y()); +} + +// the smallest bounding box containing a node +static vtr::Rect bounding_box_for_node(int node_ind) { + auto& device_ctx = g_vpr_ctx.device(); + auto& rr_graph = device_ctx.rr_nodes; + int x = rr_graph.node_xlow(RRNodeId(node_ind)); + int y = rr_graph.node_ylow(RRNodeId(node_ind)); + + return vtr::Rect(vtr::Point(x, y)); +} + +static vtr::Rect sample_window(const vtr::Rect& bounding_box, int sx, int sy, int n) { + return vtr::Rect(sample(bounding_box, sx, sy, n), + sample(bounding_box, sx + 1, sy + 1, n)); +} + +static std::vector choose_points(const vtr::Matrix& counts, + const vtr::Rect& window, + int min_count, + int max_count) { + VTR_ASSERT(min_count <= max_count); + std::vector points; + for (int y = window.ymin(); y < window.ymax(); y++) { + for (int x = window.xmin(); x < window.xmax(); x++) { + if (counts[x][y] >= min_count && counts[x][y] <= max_count) { + points.push_back(SamplePoint{/* .location = */ vtr::Point(x, y), + /* .nodes = */ {}}); + } + } + } + + vtr::Point center = sample(window, 1, 1, 2); + + // sort by distance from center + std::sort(points.begin(), points.end(), + [&](const SamplePoint& a, const SamplePoint& b) { + return manhattan_distance(a.location, center) < manhattan_distance(b.location, center); + }); + + return points; +} + +// histogram is a map from segment count to number of locations having that count +static int quantile(const std::map& histogram, float ratio) { + if (histogram.empty()) { + return 0; + } + int sum = 0; + for (const auto& entry : histogram) { + sum += entry.second; + } + int limit = std::ceil(sum * ratio); + for (const auto& entry : histogram) { + limit -= entry.second; + if (limit <= 0) { + return entry.first; + } + } + return 0; +} + +// select a good number of segments to find +static std::map count_histogram(const vtr::Rect& box, const vtr::Matrix& counts) { + std::map histogram; + for (int y = box.ymin(); y < box.ymax(); y++) { + for (int x = box.xmin(); x < box.xmax(); x++) { + int count = counts[x][y]; + if (count > 0) { + ++histogram[count]; + } + } + } + return histogram; +} + +// Used to calculate each region's `order.' +// A space-filling curve will order the regions so that +// nearby points stay close in order. A Hilbert curve might +// be better, but a Morton (Z)-order curve is easy to compute, +// because it's just interleaving binary bits, so this +// function interleaves with 0's so that the X and Y +// dimensions can then be OR'ed together. +static uint64_t interleave(uint32_t x) { + uint64_t i = x; + i = (i ^ (i << 16)) & 0x0000ffff0000ffff; + i = (i ^ (i << 8)) & 0x00ff00ff00ff00ff; + i = (i ^ (i << 4)) & 0x0f0f0f0f0f0f0f0f; + i = (i ^ (i << 2)) & 0x3333333333333333; + i = (i ^ (i << 1)) & 0x5555555555555555; + return i; +} + +// for each segment type, find the nearest nodes to an equally spaced grid of points +// within the bounding box for that segment type +std::vector find_sample_regions(int num_segments) { + vtr::ScopedStartFinishTimer timer("finding sample regions"); + std::vector sample_regions; + auto& device_ctx = g_vpr_ctx.device(); + auto& rr_nodes = device_ctx.rr_nodes; + std::vector> segment_counts(num_segments); + + // compute bounding boxes for each segment type + std::vector> bounding_box_for_segment(num_segments, vtr::Rect()); + for (size_t i = 0; i < rr_nodes.size(); i++) { + auto& node = rr_nodes[i]; + if (node.type() != CHANX && node.type() != CHANY) continue; + if (node.capacity() == 0 || node.num_edges() == 0) continue; + int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index; + + VTR_ASSERT(seg_index != OPEN); + VTR_ASSERT(seg_index < num_segments); + + bounding_box_for_segment[seg_index].expand_bounding_box(bounding_box_for_node(i)); + } + + // initialize counts + for (int seg = 0; seg < num_segments; seg++) { + const auto& box = bounding_box_for_segment[seg]; + segment_counts[seg] = vtr::Matrix({size_t(box.width()), size_t(box.height())}, 0); + } + + // count sample points + for (size_t i = 0; i < rr_nodes.size(); i++) { + auto& node = rr_nodes[i]; + if (node.type() != CHANX && node.type() != CHANY) continue; + if (node.capacity() == 0 || node.num_edges() == 0) continue; + int x = rr_nodes.node_xlow(RRNodeId(i)); + int y = rr_nodes.node_ylow(RRNodeId(i)); + + int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index; + segment_counts[seg_index][x][y] += 1; + + VTR_ASSERT(seg_index != OPEN); + VTR_ASSERT(seg_index < num_segments); + } + + // select sample points + for (int i = 0; i < num_segments; i++) { + const auto& counts = segment_counts[i]; + const auto& bounding_box = bounding_box_for_segment[i]; + if (bounding_box.empty()) continue; + for (int y = 0; y < SAMPLE_GRID_SIZE; y++) { + for (int x = 0; x < SAMPLE_GRID_SIZE; x++) { + vtr::Rect window = sample_window(bounding_box, x, y, SAMPLE_GRID_SIZE); + if (window.empty()) continue; + + auto histogram = count_histogram(window, segment_counts[i]); + SampleRegion region = { + /* .segment_type = */ i, + /* .grid_location = */ vtr::Point(x, y), + /* .points = */ choose_points(counts, window, quantile(histogram, kSamplingCountLowerQuantile), quantile(histogram, kSamplingCountUpperQuantile)), + /* .order = */ 0}; + if (!region.points.empty()) { + /* In order to improve caching, the list of sample points are + * sorted to keep points that are nearby on the Euclidean plane also + * nearby in the vector of sample points. + * + * This means subsequent expansions on the same thread are likely + * to cover a similar set of nodes, so they are more likely to be + * cached. This improves performance by about 7%, which isn't a lot, + * but not a bad improvement for a few lines of code. */ + vtr::Point location = region.points[0].location; + + // interleave bits of X and Y for a Z-curve ordering. + region.order = interleave(location.x()) | (interleave(location.y()) << 1); + + sample_regions.push_back(region); + } + } + } + } + + // sort regions + std::sort(sample_regions.begin(), sample_regions.end(), + [](const SampleRegion& a, const SampleRegion& b) { + return a.order < b.order; + }); + + // build an index of sample points on segment type and location + std::map, SamplePoint*> sample_point_index; + for (auto& region : sample_regions) { + for (auto& point : region.points) { + sample_point_index[std::make_tuple(region.segment_type, point.location.x(), point.location.y())] = &point; + } + } + + // collect the node indices for each segment type at the selected sample points + for (size_t i = 0; i < rr_nodes.size(); i++) { + auto& node = rr_nodes[i]; + if (node.type() != CHANX && node.type() != CHANY) continue; + if (node.capacity() == 0 || node.num_edges() == 0) continue; + + int x = rr_nodes.node_xlow(RRNodeId(i)); + int y = rr_nodes.node_ylow(RRNodeId(i)); + + int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index; + + VTR_ASSERT(seg_index != OPEN); + VTR_ASSERT(seg_index < num_segments); + + auto point = sample_point_index.find(std::make_tuple(seg_index, x, y)); + if (point != sample_point_index.end()) { + point->second->nodes.push_back(i); + } + } + + return sample_regions; +} diff --git a/vpr/src/route/router_lookahead_sampling.h b/vpr/src/route/router_lookahead_sampling.h new file mode 100644 index 00000000000..6668792e9c6 --- /dev/null +++ b/vpr/src/route/router_lookahead_sampling.h @@ -0,0 +1,35 @@ +#ifndef ROUTER_LOOKAHEAD_SAMPLING_H +#define ROUTER_LOOKAHEAD_SAMPLING_H + +#include +#include "vtr_geometry.h" +#include "globals.h" + +// a sample point for a segment type, contains all segments at the VPR location +struct SamplePoint { + // canonical location + vtr::Point location; + + // nodes to expand + std::vector nodes; +}; + +struct SampleRegion { + // all nodes in `points' have this segment type + int segment_type; + + // location on the sample grid + vtr::Point grid_location; + + // locations to try + // The computation will keep expanding each of the points + // until a number of paths (segment -> connection box) are found. + std::vector points; + + // used to sort the regions to improve caching + uint64_t order; +}; + +std::vector find_sample_regions(int num_segments); + +#endif