|
#include "router_lookahead_sampling.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <map>
#include <tuple>
#include <utility>
#include <vector>

#include "globals.h"
#include "vtr_math.h"
#include "vtr_geometry.h"
#include "vtr_time.h"
| 9 | + |
// Sampling is based on an NxN grid of starting segments, where N = SAMPLE_GRID_SIZE.
static constexpr int SAMPLE_GRID_SIZE{2};

// Quantiles (like percentiles, but expressed in the range 0-1) of the segment
// count, used as the selection criteria for sample locations: choose locations
// with higher, but not extreme, counts of each segment type.
static constexpr double kSamplingCountLowerQuantile{0.5};
static constexpr double kSamplingCountUpperQuantile{0.7};
| 17 | + |
| 18 | +// also known as the L1 norm |
| 19 | +static int manhattan_distance(const vtr::Point<int>& a, const vtr::Point<int>& b) { |
| 20 | + return abs(b.x() - a.x()) + abs(b.y() - a.y()); |
| 21 | +} |
| 22 | + |
| 23 | +// the smallest bounding box containing a node |
| 24 | +static vtr::Rect<int> bounding_box_for_node(int node_ind) { |
| 25 | + auto& device_ctx = g_vpr_ctx.device(); |
| 26 | + auto& rr_graph = device_ctx.rr_nodes; |
| 27 | + int x = rr_graph.node_xlow(RRNodeId(node_ind)); |
| 28 | + int y = rr_graph.node_ylow(RRNodeId(node_ind)); |
| 29 | + |
| 30 | + return vtr::Rect<int>(vtr::Point<int>(x, y)); |
| 31 | +} |
| 32 | + |
| 33 | +static vtr::Rect<int> sample_window(const vtr::Rect<int>& bounding_box, int sx, int sy, int n) { |
| 34 | + return vtr::Rect<int>(sample(bounding_box, sx, sy, n), |
| 35 | + sample(bounding_box, sx + 1, sy + 1, n)); |
| 36 | +} |
| 37 | + |
| 38 | +static std::vector<SamplePoint> choose_points(const vtr::Matrix<int>& counts, |
| 39 | + const vtr::Rect<int>& window, |
| 40 | + int min_count, |
| 41 | + int max_count) { |
| 42 | + VTR_ASSERT(min_count <= max_count); |
| 43 | + std::vector<SamplePoint> points; |
| 44 | + for (int y = window.ymin(); y < window.ymax(); y++) { |
| 45 | + for (int x = window.xmin(); x < window.xmax(); x++) { |
| 46 | + if (counts[x][y] >= min_count && counts[x][y] <= max_count) { |
| 47 | + points.push_back(SamplePoint{/* .location = */ vtr::Point<int>(x, y), |
| 48 | + /* .nodes = */ {}}); |
| 49 | + } |
| 50 | + } |
| 51 | + } |
| 52 | + |
| 53 | + vtr::Point<int> center = sample(window, 1, 1, 2); |
| 54 | + |
| 55 | + // sort by distance from center |
| 56 | + std::sort(points.begin(), points.end(), |
| 57 | + [&](const SamplePoint& a, const SamplePoint& b) { |
| 58 | + return manhattan_distance(a.location, center) < manhattan_distance(b.location, center); |
| 59 | + }); |
| 60 | + |
| 61 | + return points; |
| 62 | +} |
| 63 | + |
// Returns the value at the given quantile of a histogram.
//
// histogram: map from segment count to the number of locations having that count.
// ratio:     quantile to look up (like a percentile, but in the range 0-1).
//
// Walks the histogram in increasing key order and returns the first key at which
// the cumulative number of locations reaches ceil(total * ratio). Returns 0 for
// an empty histogram, or if the cumulative total never reaches that threshold.
static int quantile(const std::map<int, int>& histogram, float ratio) {
    if (histogram.empty()) {
        return 0;
    }

    int total = 0;
    for (auto it = histogram.begin(); it != histogram.end(); ++it) {
        total += it->second;
    }

    // Number of locations that still have to be passed before the quantile is reached.
    int remaining = static_cast<int>(std::ceil(total * ratio));
    for (auto it = histogram.begin(); it != histogram.end(); ++it) {
        remaining -= it->second;
        if (remaining <= 0) {
            return it->first;
        }
    }

    return 0;
}
| 82 | + |
| 83 | +// select a good number of segments to find |
| 84 | +static std::map<int, int> count_histogram(const vtr::Rect<int>& box, const vtr::Matrix<int>& counts) { |
| 85 | + std::map<int, int> histogram; |
| 86 | + for (int y = box.ymin(); y < box.ymax(); y++) { |
| 87 | + for (int x = box.xmin(); x < box.xmax(); x++) { |
| 88 | + int count = counts[x][y]; |
| 89 | + if (count > 0) { |
| 90 | + ++histogram[count]; |
| 91 | + } |
| 92 | + } |
| 93 | + } |
| 94 | + return histogram; |
| 95 | +} |
| 96 | + |
// Spreads the 32 bits of `x` over the even bit positions of a 64-bit value
// (bit k of x moves to bit 2k; all odd bits are zero).
//
// This is used to calculate each region's `order'. A space-filling curve orders
// the regions so that nearby points stay close in order. A Hilbert curve might
// be better, but a Morton (Z)-order curve is easy to compute because it is just
// interleaving binary bits: spreading X and Y with zeros like this lets the
// caller OR the two together (one of them shifted left by one bit).
static uint64_t interleave(uint32_t x) {
    struct SpreadStep {
        int shift;
        uint64_t mask;
    };
    // Each step doubles the gap between groups of bits: 16-bit halves first,
    // then bytes, nibbles, bit pairs and finally single bits.
    static constexpr SpreadStep kSpreadSteps[] = {
        {16, 0x0000ffff0000ffff},
        {8, 0x00ff00ff00ff00ff},
        {4, 0x0f0f0f0f0f0f0f0f},
        {2, 0x3333333333333333},
        {1, 0x5555555555555555},
    };

    uint64_t spread = x;
    for (const SpreadStep& step : kSpreadSteps) {
        spread = (spread ^ (spread << step.shift)) & step.mask;
    }
    return spread;
}
| 113 | + |
| 114 | +// for each segment type, find the nearest nodes to an equally spaced grid of points |
| 115 | +// within the bounding box for that segment type |
| 116 | +std::vector<SampleRegion> find_sample_regions(int num_segments) { |
| 117 | + vtr::ScopedStartFinishTimer timer("finding sample regions"); |
| 118 | + std::vector<SampleRegion> sample_regions; |
| 119 | + auto& device_ctx = g_vpr_ctx.device(); |
| 120 | + auto& rr_nodes = device_ctx.rr_nodes; |
| 121 | + std::vector<vtr::Matrix<int>> segment_counts(num_segments); |
| 122 | + |
| 123 | + // compute bounding boxes for each segment type |
| 124 | + std::vector<vtr::Rect<int>> bounding_box_for_segment(num_segments, vtr::Rect<int>()); |
| 125 | + for (size_t i = 0; i < rr_nodes.size(); i++) { |
| 126 | + auto& node = rr_nodes[i]; |
| 127 | + if (node.type() != CHANX && node.type() != CHANY) continue; |
| 128 | + if (node.capacity() == 0 || node.num_edges() == 0) continue; |
| 129 | + int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index; |
| 130 | + |
| 131 | + VTR_ASSERT(seg_index != OPEN); |
| 132 | + VTR_ASSERT(seg_index < num_segments); |
| 133 | + |
| 134 | + bounding_box_for_segment[seg_index].expand_bounding_box(bounding_box_for_node(i)); |
| 135 | + } |
| 136 | + |
| 137 | + // initialize counts |
| 138 | + for (int seg = 0; seg < num_segments; seg++) { |
| 139 | + const auto& box = bounding_box_for_segment[seg]; |
| 140 | + segment_counts[seg] = vtr::Matrix<int>({size_t(box.width()), size_t(box.height())}, 0); |
| 141 | + } |
| 142 | + |
| 143 | + // count sample points |
| 144 | + for (size_t i = 0; i < rr_nodes.size(); i++) { |
| 145 | + auto& node = rr_nodes[i]; |
| 146 | + if (node.type() != CHANX && node.type() != CHANY) continue; |
| 147 | + if (node.capacity() == 0 || node.num_edges() == 0) continue; |
| 148 | + int x = rr_nodes.node_xlow(RRNodeId(i)); |
| 149 | + int y = rr_nodes.node_ylow(RRNodeId(i)); |
| 150 | + |
| 151 | + int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index; |
| 152 | + segment_counts[seg_index][x][y] += 1; |
| 153 | + |
| 154 | + VTR_ASSERT(seg_index != OPEN); |
| 155 | + VTR_ASSERT(seg_index < num_segments); |
| 156 | + } |
| 157 | + |
| 158 | + // select sample points |
| 159 | + for (int i = 0; i < num_segments; i++) { |
| 160 | + const auto& counts = segment_counts[i]; |
| 161 | + const auto& bounding_box = bounding_box_for_segment[i]; |
| 162 | + if (bounding_box.empty()) continue; |
| 163 | + for (int y = 0; y < SAMPLE_GRID_SIZE; y++) { |
| 164 | + for (int x = 0; x < SAMPLE_GRID_SIZE; x++) { |
| 165 | + vtr::Rect<int> window = sample_window(bounding_box, x, y, SAMPLE_GRID_SIZE); |
| 166 | + if (window.empty()) continue; |
| 167 | + |
| 168 | + auto histogram = count_histogram(window, segment_counts[i]); |
| 169 | + SampleRegion region = { |
| 170 | + /* .segment_type = */ i, |
| 171 | + /* .grid_location = */ vtr::Point<int>(x, y), |
| 172 | + /* .points = */ choose_points(counts, window, quantile(histogram, kSamplingCountLowerQuantile), quantile(histogram, kSamplingCountUpperQuantile)), |
| 173 | + /* .order = */ 0}; |
| 174 | + if (!region.points.empty()) { |
| 175 | + /* In order to improve caching, the list of sample points are |
| 176 | + * sorted to keep points that are nearby on the Euclidean plane also |
| 177 | + * nearby in the vector of sample points. |
| 178 | + * |
| 179 | + * This means subsequent expansions on the same thread are likely |
| 180 | + * to cover a similar set of nodes, so they are more likely to be |
| 181 | + * cached. This improves performance by about 7%, which isn't a lot, |
| 182 | + * but not a bad improvement for a few lines of code. */ |
| 183 | + vtr::Point<int> location = region.points[0].location; |
| 184 | + |
| 185 | + // interleave bits of X and Y for a Z-curve ordering. |
| 186 | + region.order = interleave(location.x()) | (interleave(location.y()) << 1); |
| 187 | + |
| 188 | + sample_regions.push_back(region); |
| 189 | + } |
| 190 | + } |
| 191 | + } |
| 192 | + } |
| 193 | + |
| 194 | + // sort regions |
| 195 | + std::sort(sample_regions.begin(), sample_regions.end(), |
| 196 | + [](const SampleRegion& a, const SampleRegion& b) { |
| 197 | + return a.order < b.order; |
| 198 | + }); |
| 199 | + |
| 200 | + // build an index of sample points on segment type and location |
| 201 | + std::map<std::tuple<int, int, int>, SamplePoint*> sample_point_index; |
| 202 | + for (auto& region : sample_regions) { |
| 203 | + for (auto& point : region.points) { |
| 204 | + sample_point_index[std::make_tuple(region.segment_type, point.location.x(), point.location.y())] = &point; |
| 205 | + } |
| 206 | + } |
| 207 | + |
| 208 | + // collect the node indices for each segment type at the selected sample points |
| 209 | + for (size_t i = 0; i < rr_nodes.size(); i++) { |
| 210 | + auto& node = rr_nodes[i]; |
| 211 | + if (node.type() != CHANX && node.type() != CHANY) continue; |
| 212 | + if (node.capacity() == 0 || node.num_edges() == 0) continue; |
| 213 | + |
| 214 | + int x = rr_nodes.node_xlow(RRNodeId(i)); |
| 215 | + int y = rr_nodes.node_ylow(RRNodeId(i)); |
| 216 | + |
| 217 | + int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index; |
| 218 | + |
| 219 | + VTR_ASSERT(seg_index != OPEN); |
| 220 | + VTR_ASSERT(seg_index < num_segments); |
| 221 | + |
| 222 | + auto point = sample_point_index.find(std::make_tuple(seg_index, x, y)); |
| 223 | + if (point != sample_point_index.end()) { |
| 224 | + point->second->nodes.push_back(i); |
| 225 | + } |
| 226 | + } |
| 227 | + |
| 228 | + return sample_regions; |
| 229 | +} |
0 commit comments