diff --git a/.gitignore b/.gitignore index 3582f2ee54d..ff6abf7f9a8 100644 --- a/.gitignore +++ b/.gitignore @@ -155,3 +155,8 @@ tags cmake-build-debug cmake-build-release /.metadata/ + +# +# Clangd +# +compile_commands.json diff --git a/doc/src/api/vpr/route_tree.rst b/doc/src/api/vpr/route_tree.rst index 7be12dda86a..8515381bac7 100644 --- a/doc/src/api/vpr/route_tree.rst +++ b/doc/src/api/vpr/route_tree.rst @@ -20,3 +20,10 @@ RouteTreeNode .. doxygenclass:: RouteTreeNode :project: vpr :members: + +RTExploredNode +------------- + +.. doxygenclass:: RTExploredNode + :project: vpr + :members: diff --git a/doc/src/api/vprinternals/router_heap.rst b/doc/src/api/vprinternals/router_heap.rst index cb652811e6c..1d213379a89 100644 --- a/doc/src/api/vprinternals/router_heap.rst +++ b/doc/src/api/vprinternals/router_heap.rst @@ -2,30 +2,13 @@ Router Heap ============== -t_heap ----------- -.. doxygenstruct:: t_heap - :project: vpr - :members: - HeapInterface ---------- .. doxygenclass:: HeapInterface :project: vpr :members: -HeapStorage ----------- -.. doxygenclass:: HeapStorage - :project: vpr - :members: - -KAryHeap +DAryHeap ---------- -.. doxygenclass:: KAryHeap +.. doxygenclass:: DAryHeap :project: vpr - -FourAryHeap ----------- -.. doxygenclass:: FourAryHeap - :project: vpr \ No newline at end of file diff --git a/doc/src/vpr/command_line_usage.rst b/doc/src/vpr/command_line_usage.rst index be5724431f4..16d02691001 100644 --- a/doc/src/vpr/command_line_usage.rst +++ b/doc/src/vpr/command_line_usage.rst @@ -1396,7 +1396,7 @@ The following options are only valid when the router is in timing-driven mode (t **Default:** ``safe`` -.. option:: --routing_budgets_algorithm { disable | minimax | scale_delay } +.. option:: --routing_budgets_algorithm { disable | minimax | yoyo | scale_delay } .. warning:: Experimental @@ -1404,7 +1404,9 @@ The following options are only valid when the router is in timing-driven mode (t ``disable`` is used to disable the budget feature. 
This uses the default VPR and ignores hold time constraints. - ``minimax`` sets the minimum and maximum budgets by distributing the long path and short path slacks depending on the the current delay values. This uses the routing cost valleys and Minimax-PERT algorithm :cite:`minimax_pert,RCV_algorithm`. + ``minimax`` sets the minimum and maximum budgets by distributing the long path and short path slacks depending on the current delay values. This uses the Minimax-PERT algorithm :cite:`minimax_pert`. + + ``yoyo`` allocates budgets using the minimax algorithm (as above), and enables hold slack resolution in the router using the Routing Cost Valleys (RCV) algorithm :cite:`RCV_algorithm`. ``scale_delay`` has the minimum budgets set to 0 and the maximum budgets is set to the delay of a net scaled by the pin criticality (net delay/pin criticality). diff --git a/utils/route_diag/src/main.cpp b/utils/route_diag/src/main.cpp index 7b9c170fbe7..d0a6ff6034e 100644 --- a/utils/route_diag/src/main.cpp +++ b/utils/route_diag/src/main.cpp @@ -114,7 +114,7 @@ static void do_one_route(const Netlist<>& net_list, is_flat); enable_router_debug(router_opts, ParentNetId(), sink_node, 1, &router); bool found_path; - t_heap cheapest; + RTExploredNode cheapest; ConnectionParameters conn_params(ParentNetId::INVALID(), -1, false, diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp index 3af8faa8713..f7af0074b55 100644 --- a/vpr/src/base/ShowSetup.cpp +++ b/vpr/src/base/ShowSetup.cpp @@ -468,9 +468,6 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) { case e_heap_type::FOUR_ARY_HEAP: VTR_LOG("FOUR_ARY_HEAP\n"); break; - case e_heap_type::BUCKET_HEAP_APPROXIMATION: - VTR_LOG("BUCKET_HEAP_APPROXIMATION\n"); - break; default: VPR_FATAL_ERROR(VPR_ERROR_UNKNOWN, "Unknown router_heap\n"); } diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index 97b3c9babaa..7ddaa08feff 100644 --- a/vpr/src/base/read_options.cpp +++ 
b/vpr/src/base/read_options.cpp @@ -272,7 +272,7 @@ struct RouteBudgetsAlgorithm { } std::vector default_choices() { - return {"minimax", "scale_delay", "disable"}; + return {"minimax", "yoyo", "scale_delay", "disable"}; } }; @@ -1063,8 +1063,6 @@ struct ParseRouterHeap { conv_value.set_value(e_heap_type::BINARY_HEAP); else if (str == "four_ary") conv_value.set_value(e_heap_type::FOUR_ARY_HEAP); - else if (str == "bucket") - conv_value.set_value(e_heap_type::BUCKET_HEAP_APPROXIMATION); else { std::stringstream msg; msg << "Invalid conversion from '" << str << "' to e_heap_type (expected one of: " << argparse::join(default_choices(), ", ") << ")"; @@ -1077,11 +1075,9 @@ struct ParseRouterHeap { ConvertedValue conv_value; if (val == e_heap_type::BINARY_HEAP) conv_value.set_value("binary"); - else if (val == e_heap_type::FOUR_ARY_HEAP) - conv_value.set_value("four_ary"); else { - VTR_ASSERT(val == e_heap_type::BUCKET_HEAP_APPROXIMATION); - conv_value.set_value("bucket"); + VTR_ASSERT(val == e_heap_type::FOUR_ARY_HEAP); + conv_value.set_value("four_ary"); } return conv_value; } diff --git a/vpr/src/base/read_route.cpp b/vpr/src/base/read_route.cpp index b137c007327..d2d3bc14d54 100644 --- a/vpr/src/base/read_route.cpp +++ b/vpr/src/base/read_route.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include "atom_netlist.h" #include "atom_netlist_utils.h" @@ -46,7 +45,6 @@ #include "route_common.h" #include "route_tree.h" #include "read_route.h" -#include "four_ary_heap.h" #include "old_traceback.h" diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index d2bc5f03da7..73e52554e90 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -1607,7 +1607,10 @@ constexpr bool is_src_sink(e_rr_type type) { return (type == SOURCE || type == S * is being used. * @param backward_path_cost Total cost of the path up to and including this * node. 
- * @param occ The current occupancy of the associated rr node + * @param R_upstream Upstream resistance to ground from this node in the current + * path search (connection routing), including the resistance + * of the node itself (device_ctx.rr_nodes[index].R). + * @param occ The current occupancy of the associated rr node. */ struct t_rr_node_route_inf { RREdgeId prev_edge; @@ -1615,6 +1618,7 @@ struct t_rr_node_route_inf { float acc_cost; float path_cost; float backward_path_cost; + float R_upstream; public: //Accessors short occ() const { return occ_; } diff --git a/vpr/src/route/binary_heap.cpp b/vpr/src/route/binary_heap.cpp deleted file mode 100644 index 8053960d955..00000000000 --- a/vpr/src/route/binary_heap.cpp +++ /dev/null @@ -1,77 +0,0 @@ -#include "binary_heap.h" -#include "vtr_log.h" - -// child indices of a heap -static inline size_t left(size_t i) { return i << 1; } -static inline size_t right(size_t i) { return (i << 1) + 1; } - -inline size_t BinaryHeap::parent(size_t i) const { return i >> 1; } - -bool BinaryHeap::is_valid() const { - if (heap_.empty()) { - return false; - } - - for (size_t i = 1; i <= heap_tail_ >> 1; ++i) { - if (left(i) < heap_tail_ && heap_[left(i)].cost < heap_[i].cost) return false; - if (right(i) < heap_tail_ && heap_[right(i)].cost < heap_[i].cost) return false; - } - - return true; -} - -t_heap* BinaryHeap::get_heap_head() { - /* Returns a pointer to the smallest element on the heap, or NULL if the * - * heap is empty. Invalid (index == OPEN) entries on the heap are never * - * returned -- they are just skipped over. */ - - t_heap* cheapest; - size_t hole, child; - - do { - if (heap_tail_ == 1) { /* Empty heap. 
*/ - VTR_LOG_WARN("Empty heap occurred in get_heap_head.\n"); - return (nullptr); - } - - cheapest = heap_[1].elem_ptr; - - hole = 1; - child = 2; - - --heap_tail_; - - while (child < heap_tail_) { - if (heap_[child + 1].cost < heap_[child].cost) - ++child; // become right child - - heap_[hole] = heap_[child]; - hole = child; - child = left(child); - } - - sift_up(hole, heap_[heap_tail_]); - } while (!cheapest->index.is_valid()); /* Get another one if invalid entry. */ - - return (cheapest); -} - -// make a heap rooted at index hole by **sifting down** in O(lgn) time -void BinaryHeap::sift_down(size_t hole) { - heap_elem head{heap_[hole]}; - size_t child{left(hole)}; - - while (child < heap_tail_) { - if (child + 1 < heap_tail_ && heap_[child + 1].cost < heap_[child].cost) - ++child; - - if (heap_[child].cost < head.cost) { - heap_[hole] = heap_[child]; - hole = child; - child = left(child); - } else - break; - } - - heap_[hole] = head; -} \ No newline at end of file diff --git a/vpr/src/route/binary_heap.h b/vpr/src/route/binary_heap.h deleted file mode 100644 index 2857200c0a3..00000000000 --- a/vpr/src/route/binary_heap.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef VTR_BINARY_HEAP_H -#define VTR_BINARY_HEAP_H - -#include "k_ary_heap.h" -#include - -class BinaryHeap : public KAryHeap { - public: - bool is_valid() const final; - t_heap* get_heap_head() final; - - private: - void sift_down(size_t hole) final; - size_t parent(size_t i) const final; -}; - -#endif //VTR_BINARY_HEAP_H diff --git a/vpr/src/route/bucket.cpp b/vpr/src/route/bucket.cpp deleted file mode 100644 index 1804a74e4ca..00000000000 --- a/vpr/src/route/bucket.cpp +++ /dev/null @@ -1,550 +0,0 @@ -#include "bucket.h" - -#include -#include "rr_graph_fwd.h" -#include "vtr_log.h" -#include "vpr_error.h" - -/* Bucket spacing algorithm: - * - * The size in cost each bucket consumes is a fixed width determined by - * conv_factor_. 
The bucket index equation is simply: - * - * bucket index = cost * conv_factor_ - * - * The default conv_factor_ is 1e12, e.g. each bucket is 1 picosecond wide. - * - * There two reasons to change conv_factor_: - * - The maximum cost item in the bucket would require too many buckets in - * heap_, and would cause memory usage to climb higher than desired. - * - The front bucket contains too many items, making the pop operation too - * cost insenstive. - * - * The other consideration is to avoid rescaling the buckets too often, as - * that operation consumes time without delivering useful work. - * - * To prevent rescaling constantly, the bucket heap determines if a rescaling - * is needed based on two conditions: - * - * - The maximum item cost (max_cost_) would require a bucket index that is - * greater than max_buckets_. When this occurs, a rescaling is done to - * make the width of the buckets larger so that the cost index for the - * max_cost_ item fits within max_buckets_. - * - * - A larger max_buckets_ results in more memory consumption, but - * accomidates a wider range of items without needing to rescale. - * - * - The number of items in the first bucket exceeds kIncreaseFocusLimit. In - * this case, the bucket heap will shrink the width of the buckets so that - * the number of entries in the first bucket drops below - * kIncreaseFocusLimit. - * - * In both of the above cases, rescaling is determined by the following - * (simplified) equation: - * - * conv_factor_ = division_scaling_ / max_cost_ - * - * The default division_scaling_ is kInitialDivisionScaling (50k). This - * can be read as using 50k buckets to evenly divided based on the - * maximum cost. For example, if max cost = 100 ns, then each bucket would - * be 2 picosecond wide. - * - * When the number of elements in the first bucket exceeds - * kIncreaseFocusLimit, division_scaling_ is multiplied by two, effectively - * halving the bucket size for a given max_cost_. 
In addition max_buckets_ - * is also multiplied by two to result in a similiar rescaling rate as - * max_cost_ increases. - * - * This multiply by two logic could result in an unbounded memory consumption - * in the number of buckets. To limit this, the 2x scaling for - * division_scaling_ and max_buckets_ is limited such that max_buckets_ never - * exceeds kMaxMaxBuckets - * - */ - -// Initial bucket scaling. A larger division scaling results in smaller cost -// range per bucket. -static constexpr float kInitialDivisionScaling = 50000.f; -// Initial maximum number of buckets before bucket rescaling. -static constexpr ssize_t kInitialMaxBuckets = 1000000; -// If the division scaling results in more than kIncreaseFocusLimit elements -// in the first bucket, than division scaling is increased by 2x to try to -// lower the size of the first bucket. -// -// This is an attempt to dynamically scale the bucket widths to prevent the -// bucket heap from being too cost insenstive / imprecise. -// -// When the division scaling is increased by 2x, the maximum number of buckets -// also is increased by 2x to prevent excessive rescaling during runtime. -static constexpr size_t kIncreaseFocusLimit = 2048; -// To prevent unbounded division scaling, the 2x when the first bucket is too -// large is limited by kMaxMaxBuckets. If increasing the division scaling -// will result in max_buckets_ exceeding kMaxMaxBuckets, then division scaling -// will not be increased again. 
-static constexpr ssize_t kMaxMaxBuckets = 16000000; - -BucketItems::BucketItems() noexcept - : alloced_items_(0) - , num_heap_allocated_(0) - , heap_free_head_(nullptr) {} - -Bucket::Bucket() noexcept - : outstanding_items_(0) - , seed_(1231) - , heap_(nullptr) - , heap_size_(0) - , heap_head_(std::numeric_limits::max()) - , heap_tail_(0) - , conv_factor_(0.f) - , division_scaling_(kInitialDivisionScaling) - , max_buckets_(kInitialMaxBuckets) - , min_cost_(0.f) - , max_cost_(0.f) - , num_items_(0) - , max_index_(std::numeric_limits::max()) - , prune_limit_(std::numeric_limits::max()) - , prune_count_(0) - , front_head_(std::numeric_limits::max()) {} - -Bucket::~Bucket() { - free_all_memory(); -} - -void Bucket::init_heap(const DeviceGrid& grid) { - delete[] heap_; - heap_ = nullptr; - - heap_size_ = (grid.width() - 1) * (grid.height() - 1); - - heap_ = new BucketItem*[heap_size_]; - for (size_t i = 0; i < (size_t)heap_size_; i++) - heap_[i] = 0; - - heap_head_ = std::numeric_limits::max(); - front_head_ = std::numeric_limits::max(); - heap_tail_ = 0; - num_items_ = 0; - prune_count_ = 0; - - conv_factor_ = kDefaultConvFactor; - division_scaling_ = kInitialDivisionScaling; - max_buckets_ = kInitialMaxBuckets; - - min_cost_ = std::numeric_limits::max(); - max_cost_ = std::numeric_limits::min(); -} - -void Bucket::free_all_memory() { - delete[] heap_; - heap_ = nullptr; - - items_.free(); -} - -void Bucket::expand(size_t required_number_of_buckets) { - auto old_size = heap_size_; - heap_size_ = required_number_of_buckets * 2; - size_t i; - - std::vector temp(heap_, heap_ + old_size); - delete[] heap_; - heap_ = new BucketItem*[heap_size_]; - for (i = 0; i < old_size; i++) - heap_[i] = temp[i]; - for (i = temp.size(); i < heap_size_; i++) - heap_[i] = nullptr; -} - -void Bucket::verify() { - for (size_t bucket = heap_head_; bucket <= heap_tail_; ++bucket) { - for (BucketItem* data = heap_[bucket]; data != nullptr; - data = data->next_bucket) { - 
VTR_ASSERT(data->item.cost >= 0); - int bucket_idx = cost_to_int(data->item.cost); - if (bucket_idx != static_cast(bucket)) { - VPR_FATAL_ERROR(VPR_ERROR_ROUTE, - "Wrong bucket for cost %g bucket_idx %d bucket %zu conv_factor %g", - data->item.cost, bucket_idx, bucket, conv_factor_); - } - } - } -} - -void Bucket::empty_heap() { - VTR_ASSERT(outstanding_items_ == 0); - - if (heap_head_ != std::numeric_limits::max()) { - std::fill(heap_ + heap_head_, heap_ + heap_tail_ + 1, nullptr); - } - heap_head_ = std::numeric_limits::max(); - front_head_ = std::numeric_limits::max(); - heap_tail_ = 0; - num_items_ = 0; - prune_count_ = 0; - min_push_cost_.clear(); - - // Quickly reset all items to being free'd - items_.clear(); - - conv_factor_ = kDefaultConvFactor; - division_scaling_ = kInitialDivisionScaling; - max_buckets_ = kInitialMaxBuckets; - - min_cost_ = std::numeric_limits::max(); - max_cost_ = std::numeric_limits::min(); -} - -float Bucket::rescale_func() const { - // Choose a scaling factor that accomidates division_scaling_ buckets - // between min_cost_ and max_cost_. - // - // If min and max are close to each other, assume 3 orders of - // magnitude between min and max. The goal is to rescale less often - // when the larger costs haven't been seen yet. - // - // If min and max are at least 3 orders of magnitude apart, scale - // soley based on max cost. The goal at this point is to keep the - // number of buckets between division_scaling_ and division_scaling_*2. - return division_scaling_ / max_cost_ / std::max(1.f, 1000.f / (max_cost_ / min_cost_)); -} - -void Bucket::check_conv_factor() const { - VTR_ASSERT(cost_to_int(min_cost_) >= 0); - VTR_ASSERT(cost_to_int(max_cost_) >= 0); - VTR_ASSERT(cost_to_int(max_cost_) < max_buckets_); -} - -// Checks if the scaling factor for cost results in a reasonable -// number of buckets based on the maximum cost value seen. -// -// Target number of buckets is between 50k and 100k buckets. 
-// Default scaling is each bucket is around ~1 ps wide. -// -// Designs with scaled costs less than 100000 (e.g. 100 ns) shouldn't require -// a bucket resize. -void Bucket::check_scaling() { - float min_cost = min_cost_; - float max_cost = max_cost_; - VTR_ASSERT(max_cost != std::numeric_limits::min()); - if (min_cost == std::numeric_limits::max()) { - min_cost = max_cost; - } - auto min_bucket = cost_to_int(min_cost); - auto max_bucket = cost_to_int(max_cost); - - // If scaling is invalid or more than 100k buckets are needed, rescale. - if (min_bucket < 0 || max_bucket < 0 || max_bucket > max_buckets_) { - rescale(); - } -} - -void Bucket::rescale() { - conv_factor_ = rescale_func(); - check_conv_factor(); - front_head_ = std::numeric_limits::max(); - - // Reheap after adjusting scaling. - if (heap_head_ != std::numeric_limits::max()) { - std::vector reheap; - for (size_t bucket = heap_head_; bucket <= heap_tail_; ++bucket) { - for (BucketItem* item = heap_[bucket]; item != nullptr; item = item->next_bucket) { - reheap.push_back(item); - } - } - - std::fill(heap_ + heap_head_, heap_ + heap_tail_ + 1, nullptr); - heap_head_ = std::numeric_limits::max(); - heap_tail_ = 0; - - for (BucketItem* item : reheap) { - outstanding_items_ += 1; - push_back(&item->item); - } - } -} - -void Bucket::push_back(t_heap* hptr) { - VTR_ASSERT(outstanding_items_ > 0); - outstanding_items_ -= 1; - - float cost = hptr->cost; - if (!std::isfinite(cost)) { - BucketItem* item = reinterpret_cast(hptr); - items_.free_item(item); - return; - } - - if (!min_push_cost_.empty()) { - if (hptr->cost > min_push_cost_[size_t(hptr->index)]) { - BucketItem* item = reinterpret_cast(hptr); - items_.free_item(item); - return; - } - - min_push_cost_[size_t(hptr->index)] = hptr->cost; - } - - // Check to see if the range of costs observed by the heap has changed. - bool check_scale = false; - - // Exclude 0 cost from min_cost to provide useful scaling factor. 
- if (cost < min_cost_ && cost > 0) { - min_cost_ = cost; - check_scale = true; - } - if (cost > max_cost_) { - max_cost_ = cost; - check_scale = true; - } - - // Rescale the number and size of buckets if needed based on the new - // cost range. - if (check_scale) { - check_scaling(); - } - - // Which bucket should this go into? - auto int_cost = cost_to_int(cost); - - if (int_cost < 0) { - VTR_LOG_WARN("Cost is negative? cost = %g, bucket = %d\n", cost, int_cost); - int_cost = 0; - } - - size_t uint_cost = int_cost; - - // Is that bucket allocated? - if (uint_cost >= heap_size_) { - // Not enough buckets! - expand(uint_cost); - } - - // Insert into bucket - auto* prev = heap_[uint_cost]; - - // Static assert ensures that BucketItem::item is at offset 0, - // so this cast is safe. - BucketItem* item = reinterpret_cast(hptr); - - if (front_head_ == uint_cost) { - VTR_ASSERT(prev != nullptr); - front_list_.back()->next_bucket = item; - item->next_bucket = nullptr; - front_list_.push_back(item); - } else { - // Otherwise just add to front list. - item->next_bucket = prev; - heap_[uint_cost] = item; - } - - if (uint_cost < heap_head_) { - heap_head_ = uint_cost; - } - if (uint_cost > heap_tail_) { - heap_tail_ = uint_cost; - } - - num_items_ += 1; - if (num_items_ > prune_limit_) { - prune_heap(); - } -} - -t_heap* Bucket::get_heap_head() { - auto heap_head = heap_head_; - auto heap_tail = heap_tail_; - BucketItem** heap = heap_; - - // Check empty - if (heap_head == std::numeric_limits::max()) { - return nullptr; - } - - if (front_head_ != heap_head) { - front_list_.clear(); - for (BucketItem* item = heap[heap_head]; item != nullptr; item = item->next_bucket) { - front_list_.push_back(item); - VTR_ASSERT(front_list_.size() <= num_items_); - } - - // If the front bucket is more than kIncreaseFocusLimit, then change - // the division scaling to attempt to shrink the front bucket size. - // - // kMaxMaxBuckets prevents this scaling from continuing without limit. 
- if (front_list_.size() > kIncreaseFocusLimit && max_buckets_ < kMaxMaxBuckets) { - division_scaling_ *= 2; - max_buckets_ *= 2; - rescale(); - return get_heap_head(); - } - VTR_ASSERT(!front_list_.empty()); - front_head_ = heap_head; - VTR_ASSERT_DEBUG(check_front_list()); - } - - // Find first non-empty bucket - - // Randomly remove element - size_t count = fast_rand() % front_list_.size(); - BucketItem* item = front_list_[count]; - - // If the element is the back of the list, just remove it. - if (count + 1 == front_list_.size()) { - if (front_list_.size() > 1) { - // Stitch into list. - front_list_[count - 1]->next_bucket = nullptr; - } else { - // List is now empty. - heap[heap_head] = nullptr; - } - } else { - // This is not the back element, so swap the element we are popping - // with the back element, then remove it. - BucketItem* swap = front_list_.back(); - if (front_list_.size() > 2) { - front_list_[front_list_.size() - 2]->next_bucket = nullptr; - } - - // Update the front_list_ - front_list_[count] = swap; - - if (count == 0) { - // Swap this element to the front of the list. 
- heap[heap_head] = swap; - } else { - // Stitch this element back into the list - front_list_[count - 1]->next_bucket = swap; - } - - swap->next_bucket = item->next_bucket; - } - - front_list_.pop_back(); - - VTR_ASSERT_DEBUG(check_front_list()); - - // Update first non-empty bucket if bucket is now empty - if (heap[heap_head] == nullptr) { - heap_head += 1; - while (heap_head <= heap_tail && heap[heap_head] == nullptr) { - heap_head += 1; - } - - if (heap_head > heap_tail) { - heap_head = std::numeric_limits::max(); - } - - heap_head_ = heap_head; - front_head_ = std::numeric_limits::max(); - } - - outstanding_items_ += 1; - num_items_ -= 1; - return &item->item; -} - -void Bucket::print() { - for (size_t i = heap_head_; i < heap_tail_; ++i) { - if (heap_[heap_head_] != nullptr) { - VTR_LOG("B:%d ", i); - for (auto* item = heap_[i]; item != nullptr; item = item->next_bucket) { - VTR_LOG(" %e", item->item.cost); - } - } - } - VTR_LOG("\n"); -} - -void Bucket::set_prune_limit(size_t max_index, size_t prune_limit) { - if (prune_limit != std::numeric_limits::max()) { - VTR_ASSERT(max_index < prune_limit); - } - max_index_ = max_index; - prune_limit_ = prune_limit; -} - -void Bucket::prune_heap() { - std::vector best_heap_item(max_index_, nullptr); - - for (size_t bucket = heap_head_; bucket <= heap_tail_; ++bucket) { - for (BucketItem* item = heap_[bucket]; item != nullptr; item = item->next_bucket) { - auto idx = size_t(item->item.index); - VTR_ASSERT(idx < max_index_); - if (best_heap_item[idx] == nullptr - || best_heap_item[idx]->item.cost > item->item.cost) { - best_heap_item[idx] = item; - } - } - } - - min_cost_ = std::numeric_limits::max(); - max_cost_ = std::numeric_limits::min(); - for (size_t bucket = heap_head_; bucket <= heap_tail_; ++bucket) { - BucketItem* item = heap_[bucket]; - while (item != nullptr) { - BucketItem* next_item = item->next_bucket; - auto idx = size_t(item->item.index); - - if (best_heap_item[idx] != item) { - // This item isn't the 
cheapest, return it to the free list. - items_.free_item(item); - } else { - // Update min_cost_ and max_cost_ - if (min_cost_ > item->item.cost) { - min_cost_ = item->item.cost; - } - if (max_cost_ < item->item.cost) { - max_cost_ = item->item.cost; - } - } - - item = next_item; - } - } - - // Rescale heap after pruning. - conv_factor_ = rescale_func(); - check_conv_factor(); - - std::fill(heap_, heap_ + heap_size_, nullptr); - heap_head_ = std::numeric_limits::max(); - front_head_ = std::numeric_limits::max(); - front_list_.clear(); - heap_tail_ = 0; - num_items_ = 0; - prune_count_ += 1; - - // Re-heap the pruned elements. - for (BucketItem* item : best_heap_item) { - if (item == nullptr) { - continue; - } - - outstanding_items_ += 1; - push_back(&item->item); - } - - verify(); - - if (prune_count_ >= 1) { - // If pruning is happening repeatedly, start pruning at entry. - min_push_cost_.resize(max_index_, std::numeric_limits::infinity()); - } -} - -bool Bucket::check_front_list() const { - VTR_ASSERT(heap_head_ == front_head_); - size_t i = 0; - BucketItem* item = heap_[heap_head_]; - while (item != nullptr) { - if (front_list_.at(i) != item) { - VTR_LOG( - "front_list_ (%p size %zu) [%zu] %p != item %p\n", - front_list_.data(), front_list_.size(), i, front_list_[i], item); - VTR_ASSERT(front_list_[i] == item); - } - i += 1; - item = item->next_bucket; - } - return false; -} diff --git a/vpr/src/route/bucket.h b/vpr/src/route/bucket.h deleted file mode 100644 index b712d54eb7b..00000000000 --- a/vpr/src/route/bucket.h +++ /dev/null @@ -1,307 +0,0 @@ -#ifndef _BUCKET_H -#define _BUCKET_H - -#include - -#include "heap_type.h" -#include "vtr_log.h" - -struct BucketItem { - t_heap item; - BucketItem* next_bucket; -}; - -// Allocator for t_heap items. -// -// This allocator supports fast clearing by maintaining an explicit object -// pool and a free list. -// -// The object pool maintained in heap_items_. 
Whenever a new object is -// created from the chunk allocator heap_ch_ it is added to heap_items_. -// -// When a client of BucketItems requests an objet, BucketItems first checks -// if there are any objects in the object pool that have not been allocated -// to the client (alloced_items_ < heap_items_.size()). If there are objects -// in the object pool that have not been alloced, these are use first. -// -// Once all objects from the object pool have been released, future allocations -// come from the free list (maintained in heap_free_head_). When the free list -// is empty, only then is a new item allocated from the chunk allocator. -// -// BucketItems::clear provides a fast way to reset the object pool under the -// assumption that no live references exists. It does this by mark the free -// list as empty and the object pool as being fully returned to BucketItems. -// This operation is extremely fast compared with putting all elements back -// onto the free list, as it only involves setting 3 values. -// -// This faster clear **requires** that all previous references to t_heap objects -// are dropped prior to calling clear, otherwise a silent use-after-free issue -// may occur. However because BucketItems is used in conjunction with Bucket, -// and the typical use case is for the heap to be fully emptied between -// routing, this optimization is safe. -// -class BucketItems { - public: - BucketItems() noexcept; - - // Returns all allocated items to be available for allocation. - // - // This operation is only safe if all outstanding references are discarded. - // This is true when the router is starting on a new net, as all outstanding - // items should in the bucket will be cleared at the start of routing. - void clear() { - heap_free_head_ = nullptr; - num_heap_allocated_ = 0; - alloced_items_ = 0; - } - - // Iterators over all items ever allocated. This is not the list of alive - // items, but can be used for fast invalidation if needed. 
- std::vector::iterator begin() { - return heap_items_.begin(); - } - std::vector::iterator end() { - return heap_items_.end(); - } - - // Deallocate all items. Outstanding references to items will become - // invalid. - void free() { - // Free each individual heap item. - for (auto* item : heap_items_) { - vtr::chunk_delete(item, &heap_ch_); - } - heap_items_.clear(); - - /*free the memory chunks that were used by heap and linked f pointer */ - free_chunk_memory(&heap_ch_); - } - - // Allocate an item. This may cause a dynamic allocation if no previously - // allocated items are available. - BucketItem* alloc_item() { - BucketItem* temp_ptr; - if (alloced_items_ < heap_items_.size()) { - // Return an unused object from the object pool. - temp_ptr = heap_items_[alloced_items_++]; - } else { - if (heap_free_head_ == nullptr) { /* No elements on the free list */ - heap_free_head_ = vtr::chunk_new(&heap_ch_); - heap_free_head_->next_bucket = nullptr; - heap_items_.push_back(heap_free_head_); - alloced_items_ += 1; - } - - temp_ptr = heap_free_head_; - heap_free_head_ = heap_free_head_->next_bucket; - } - - num_heap_allocated_++; - - return temp_ptr; - } - - // Return a free'd item to be reallocated. - void free_item(BucketItem* hptr) { - hptr->next_bucket = heap_free_head_; - heap_free_head_ = hptr; - num_heap_allocated_--; - } - - // Number of outstanding allocations. - int num_heap_allocated() { - return num_heap_allocated_; - } - - private: - /* Vector of all items ever allocated. Used for full item iteration and - * for reuse after a `clear` invocation. */ - std::vector heap_items_; - - /* Tracks how many items from heap_items_ are in use. */ - size_t alloced_items_; - - /* Number of outstanding allocated items. */ - int num_heap_allocated_; - - /* For managing my own list of currently free heap data structures. 
*/ - BucketItem* heap_free_head_; - - /* For keeping track of the sudo malloc memory for the heap*/ - vtr::t_chunk heap_ch_; -}; - -// Prority queue approximation using cost buckets and randomization. -// -// The cost buckets are each a linked lists for costs at kDefaultConvFactor -// intervals. Given that cost is approximately delay, each bucket contains ~1 -// picosecond (1e12) worth of items. -// -// Items are pushed into the linked list that matches their cost [0, 1) -// picosecond. When popping the Bucket, a random item in the cheapest bucket -// with items is returned. This randomization exists to prevent the router -// from following identical paths when operating with identical costs. -// Consider two parallel paths to a node. -// -// To ensure that number of buckets do not get too large, whenever is element -// is added to the heap, the number of buckets required is checked. If more -// than 100k buckets are required, then the width of the buckets (conv_factor_) -// are rescaled such that ~50k buckets are required. -// -// Important node: This approximation makes some assumptions about the -// structure of costs. -// -// Assumptions: -// 1. 0 is the minimum cost -// 2. Costs that are different by 0.1 % of the maximum cost are effectively -// equivilant -// 3. The cost function is roughly linear. -// -class Bucket : public HeapInterface { - public: - Bucket() noexcept; - ~Bucket(); - - t_heap* alloc() final { - outstanding_items_ += 1; - t_heap* hptr = &items_.alloc_item()->item; - return hptr; - } - void free(t_heap* hptr) final { - // Static assert ensures that BucketItem::item is at offset 0, - // so this cast is safe. - outstanding_items_ -= 1; - items_.free_item(reinterpret_cast(hptr)); - } - - // Allocate initial buckets for items. - void init_heap(const DeviceGrid& grid) final; - - // Deallocate memory for buckets. - void free_all_memory() final; - - // Empties all buckets of items. 
- // - // This does NOT call BucketItems::free_item on contained items. The - // assumption is that when Bucket::clear is called, BucketItems::clear - // is also called. - void empty_heap() final; - - // Push an item onto a bucket. - void push_back(t_heap* hptr) final; - - void add_to_heap(t_heap* hptr) final { - push_back(hptr); - } - - void build_heap() final { - } - - void set_prune_limit(size_t max_index, size_t prune_limit) final; - - // Pop an item from the cheapest non-empty bucket. - // - // Returns nullptr if empty. - t_heap* get_heap_head() final; - - // True if all buckets are empty. - bool is_empty_heap() const final { - return heap_head_ == std::numeric_limits::max(); - } - - bool is_valid() const final { - return true; - } - - // Sanity check state of buckets (e.g. all items within each bucket have - // a cost that matches their bucket index. - void verify(); - - // Print items contained in buckets. - void print(); - - private: - // Factor used to convert cost from float to int. Should be scaled to - // enable sufficent precision in bucketting. - static constexpr float kDefaultConvFactor = 1e12; - - // Convert cost from float to integer bucket id. - int cost_to_int(float cost) const { - return (int)(cost * conv_factor_); - } - - // Simple fast random function used for randomizing item selection on pop. - size_t fast_rand() { - seed_ = (0x234ab32a1 * seed_) ^ (0x12acbade); - return seed_; - } - - void check_scaling(); - void rescale(); - float rescale_func() const; - void check_conv_factor() const; - bool check_front_list() const; - - // Expand the number of buckets. - // - // Only call if insufficient buckets exist. - void expand(size_t required_number_of_buckets); - - void prune_heap(); - - BucketItems items_; /* Item storage */ - - /* Number of t_heap objects alloc'd but not returned to Bucket. - * Used to verify that clearing is safe. 
*/ - ssize_t outstanding_items_; - - size_t seed_; /* Seed for fast_rand, should be non-zero */ - - BucketItem** heap_; /* Buckets for linked lists*/ - size_t heap_size_; /* Number of buckets */ - size_t heap_head_; /* First non-empty bucket */ - size_t heap_tail_; /* Last non-empty bucket */ - float conv_factor_; /* Cost bucket scaling factor. - * - * Larger conv_factor_ means each bucket is - * smaller. - * - * bucket index = cost * conv_factor_ - * - */ - float division_scaling_; /* Scaling factor used during rescaling. - * Larger division scaling results in larger - * conversion factor. - */ - ssize_t max_buckets_; /* Maximum number of buckets to control when to - * rescale. - */ - - float min_cost_; /* Smallest cost seen */ - float max_cost_; /* Largest cost seen */ - - size_t num_items_; /* Number of items in the bucket heap. */ - size_t max_index_; /* Maximum value for index. */ - size_t prune_limit_; /* Maximum number of elements this bucket heap should - * have before the heap self compacts. - */ - size_t prune_count_; /* The number of times the bucket heap has self - * compacted. - */ - std::vector min_push_cost_; /* Lowest push cost for each index. - * Only used if the bucket has - * self-pruned. - */ - - /* In order to quickly randomly pop an element from the front bucket, - * a list of items is made. - * - * front_head_ points to the heap_ index this array was constructed from. - * If front_head_ is size_t::max or doesn't equal heap_head_, front_list_ - * needs to be re-computed. 
- * */ - size_t front_head_; - std::vector front_list_; -}; - -#endif /* _BUCKET_H */ diff --git a/vpr/src/route/connection_router.cpp b/vpr/src/route/connection_router.cpp index 210783648ad..23fedf6c262 100644 --- a/vpr/src/route/connection_router.cpp +++ b/vpr/src/route/connection_router.cpp @@ -2,9 +2,6 @@ #include #include "rr_graph.h" -#include "binary_heap.h" -#include "four_ary_heap.h" -#include "bucket.h" #include "rr_graph_fwd.h" static bool relevant_node_to_target(const RRGraphView* rr_graph, @@ -25,7 +22,7 @@ static void update_router_stats(RouterStats* router_stats, /** return tuple */ template -std::tuple ConnectionRouter::timing_driven_route_connection_from_route_tree( +std::tuple ConnectionRouter::timing_driven_route_connection_from_route_tree( const RouteTreeNode& rt_root, RRNodeId sink_node, const t_conn_cost_params& cost_params, @@ -36,28 +33,33 @@ std::tuple ConnectionRouter::timing_driven_route_conne conn_params_ = &conn_params; bool retry = false; - t_heap* cheapest; - std::tie(retry, cheapest) = timing_driven_route_connection_common_setup(rt_root, sink_node, cost_params, bounding_box); - - if (cheapest != nullptr) { - rcv_path_manager.update_route_tree_set(cheapest->path_data); - update_cheapest(cheapest); - t_heap out = *cheapest; - heap_.free(cheapest); + retry = timing_driven_route_connection_common_setup(rt_root, sink_node, cost_params, bounding_box); + + if (!std::isinf(rr_node_route_inf_[sink_node].path_cost)) { + // Only the `index`, `prev_edge`, and `rcv_path_backward_delay` fields of `out` + // are used after this function returns. 
+ RTExploredNode out; + out.index = sink_node; + out.prev_edge = rr_node_route_inf_[sink_node].prev_edge; + if (rcv_path_manager.is_enabled()) { + out.rcv_path_backward_delay = rcv_path_data[sink_node]->backward_delay; + rcv_path_manager.update_route_tree_set(rcv_path_data[sink_node]); + rcv_path_manager.empty_heap(); + } heap_.empty_heap(); - rcv_path_manager.empty_heap(); return std::make_tuple(true, /*retry=*/false, out); } else { reset_path_costs(); - modified_rr_node_inf_.clear(); + clear_modified_rr_node_info(); heap_.empty_heap(); - return std::make_tuple(false, retry, t_heap()); + rcv_path_manager.empty_heap(); + return std::make_tuple(false, retry, RTExploredNode()); } } -/** Return */ +/** Return whether to retry with full bb */ template -std::tuple ConnectionRouter::timing_driven_route_connection_common_setup( +bool ConnectionRouter::timing_driven_route_connection_common_setup( const RouteTreeNode& rt_root, RRNodeId sink_node, const t_conn_cost_params& cost_params, @@ -72,18 +74,18 @@ std::tuple ConnectionRouter::timing_driven_route_connection if (heap_.is_empty_heap()) { VTR_LOG("No source in route tree: %s\n", describe_unrouteable_connection(source_node, sink_node, is_flat_).c_str()); - return std::make_tuple(false, nullptr); + return false; } VTR_LOGV_DEBUG(router_debug_, " Routing to %d as normal net (BB: %d,%d,%d x %d,%d,%d)\n", sink_node, bounding_box.layer_min, bounding_box.xmin, bounding_box.ymin, bounding_box.layer_max, bounding_box.xmax, bounding_box.ymax); - t_heap* cheapest = timing_driven_route_connection_from_heap(sink_node, - cost_params, - bounding_box); + timing_driven_route_connection_from_heap(sink_node, + cost_params, + bounding_box); - if (cheapest == nullptr) { + if (std::isinf(rr_node_route_inf_[sink_node].path_cost)) { // No path found within the current bounding box. 
// // If the bounding box is already max size, just fail @@ -94,15 +96,15 @@ std::tuple ConnectionRouter::timing_driven_route_connection && bounding_box.layer_min == 0 && bounding_box.layer_max == (int)(grid_.get_num_layers() - 1)) { VTR_LOG("%s\n", describe_unrouteable_connection(source_node, sink_node, is_flat_).c_str()); - return std::make_tuple(false, nullptr); + return false; } // Otherwise, leave unrouted and bubble up a signal to retry this net with a full-device bounding box VTR_LOG_WARN("No routing path for connection to sink_rr %d, leaving unrouted to retry later\n", sink_node); - return std::make_tuple(true, nullptr); + return true; } - return std::make_tuple(false, cheapest); + return false; } // Finds a path from the route tree rooted at rt_root to sink_node for a high fanout net. @@ -111,7 +113,7 @@ std::tuple ConnectionRouter::timing_driven_route_connection // which is spatially close to the sink is added to the heap. // Returns a tuple of */ template -std::tuple ConnectionRouter::timing_driven_route_connection_from_route_tree_high_fanout( +std::tuple ConnectionRouter::timing_driven_route_connection_from_route_tree_high_fanout( const RouteTreeNode& rt_root, RRNodeId sink_node, const t_conn_cost_params& cost_params, @@ -131,7 +133,7 @@ std::tuple ConnectionRouter::timing_driven_route_conne if (heap_.is_empty_heap()) { VTR_LOG("No source in route tree: %s\n", describe_unrouteable_connection(source_node, sink_node, is_flat_).c_str()); - return std::make_tuple(false, false, t_heap()); + return std::make_tuple(false, false, RTExploredNode()); } VTR_LOGV_DEBUG(router_debug_, " Routing to %d as high fanout net (BB: %d,%d,%d x %d,%d,%d)\n", sink_node, @@ -139,12 +141,11 @@ std::tuple ConnectionRouter::timing_driven_route_conne high_fanout_bb.layer_max, high_fanout_bb.xmax, high_fanout_bb.ymax); bool retry_with_full_bb = false; - t_heap* cheapest; - cheapest = timing_driven_route_connection_from_heap(sink_node, - cost_params, - high_fanout_bb); + 
timing_driven_route_connection_from_heap(sink_node, + cost_params, + high_fanout_bb); - if (cheapest == nullptr) { + if (std::isinf(rr_node_route_inf_[sink_node].path_cost)) { //Found no path, that may be due to an unlucky choice of existing route tree sub-set, //try again with the full route tree to be sure this is not an artifact of high-fanout routing VTR_LOG_WARN("No routing path found in high-fanout mode for net %zu connection (to sink_rr %d), retrying with full route tree\n", size_t(conn_params.net_id_), sink_node); @@ -152,42 +153,41 @@ std::tuple ConnectionRouter::timing_driven_route_conne //Reset any previously recorded node costs so timing_driven_route_connection() //starts over from scratch. reset_path_costs(); - modified_rr_node_inf_.clear(); + clear_modified_rr_node_info(); - std::tie(retry_with_full_bb, cheapest) = timing_driven_route_connection_common_setup(rt_root, - sink_node, - cost_params, - net_bounding_box); + retry_with_full_bb = timing_driven_route_connection_common_setup(rt_root, + sink_node, + cost_params, + net_bounding_box); } - if (cheapest == nullptr) { + if (std::isinf(rr_node_route_inf_[sink_node].path_cost)) { VTR_LOG("%s\n", describe_unrouteable_connection(source_node, sink_node, is_flat_).c_str()); heap_.empty_heap(); rcv_path_manager.empty_heap(); - return std::make_tuple(false, retry_with_full_bb, t_heap()); + return std::make_tuple(false, retry_with_full_bb, RTExploredNode()); } - rcv_path_manager.update_route_tree_set(cheapest->path_data); - update_cheapest(cheapest); - - t_heap out = *cheapest; - heap_.free(cheapest); + RTExploredNode out; + out.index = sink_node; + out.prev_edge = rr_node_route_inf_[sink_node].prev_edge; + if (rcv_path_manager.is_enabled()) { + out.rcv_path_backward_delay = rcv_path_data[sink_node]->backward_delay; + rcv_path_manager.update_route_tree_set(rcv_path_data[sink_node]); + rcv_path_manager.empty_heap(); + } heap_.empty_heap(); - rcv_path_manager.empty_heap(); return std::make_tuple(true, 
retry_with_full_bb, out); } -//Finds a path to sink_node, starting from the elements currently in the heap. -// +// Finds a path to sink_node, starting from the elements currently in the heap. // This is the core maze routing routine. -// -// Returns either the last element of the path, or nullptr if no path is found template -t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeId sink_node, - const t_conn_cost_params& cost_params, - const t_bb& bounding_box) { +void ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeId sink_node, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box) { VTR_ASSERT_SAFE(heap_.is_valid()); if (heap_.is_empty_heap()) { //No source @@ -220,18 +220,20 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI target_bb.layer_min = rr_graph_->node_layer(RRNodeId(sink_node)); target_bb.layer_max = rr_graph_->node_layer(RRNodeId(sink_node)); - t_heap* cheapest = nullptr; - while (!heap_.is_empty_heap()) { - // cheapest t_heap in current route tree to be expanded on - cheapest = heap_.get_heap_head(); + // Start measuring path search time + std::chrono::steady_clock::time_point begin_time = std::chrono::steady_clock::now(); + + HeapNode cheapest; + while (heap_.try_pop(cheapest)) { + // inode with cheapest total cost in current route tree to be expanded on + const auto& [ new_total_cost, inode ] = cheapest; update_router_stats(router_stats_, false, - cheapest->index, + inode, rr_graph_); - RRNodeId inode = cheapest->index; VTR_LOGV_DEBUG(router_debug_, " Popping node %d (cost: %g)\n", - inode, cheapest->cost); + inode, new_total_cost); // Have we found the target? 
if (inode == sink_node) { @@ -239,40 +241,32 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI // This is then placed into the traceback so that the correct path is returned // TODO: This can be eliminated by modifying the actual traceback function in route_timing if (rcv_path_manager.is_enabled()) { - rcv_path_manager.insert_backwards_path_into_traceback(cheapest->path_data, cheapest->cost, cheapest->backward_path_cost, route_ctx); + rcv_path_manager.insert_backwards_path_into_traceback(rcv_path_data[inode], + rr_node_route_inf_[inode].path_cost, + rr_node_route_inf_[inode].backward_path_cost, + route_ctx); } VTR_LOGV_DEBUG(router_debug_, " Found target %8d (%s)\n", inode, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, inode, is_flat_).c_str()); break; } // If not, keep searching - timing_driven_expand_cheapest(cheapest, + timing_driven_expand_cheapest(inode, + new_total_cost, sink_node, cost_params, bounding_box, target_bb); - - rcv_path_manager.free_path_struct(cheapest->path_data); - heap_.free(cheapest); - cheapest = nullptr; } - if (router_debug_) { - //Update known path costs for nodes pushed but not popped, useful for debugging - empty_heap_annotating_node_route_inf(); - } - - if (cheapest == nullptr) { /* Impossible routing. No path for net. 
*/ - VTR_LOGV_DEBUG(router_debug_, " Empty heap (no path found)\n"); - return nullptr; - } - - return cheapest; + // Stop measuring path search time + std::chrono::steady_clock::time_point end_time = std::chrono::steady_clock::now(); + path_search_cumulative_time += std::chrono::duration_cast(end_time - begin_time); } // Find shortest paths from specified route tree to all nodes in the RR graph template -vtr::vector ConnectionRouter::timing_driven_find_all_shortest_paths_from_route_tree( +vtr::vector ConnectionRouter::timing_driven_find_all_shortest_paths_from_route_tree( const RouteTreeNode& rt_root, const t_conn_cost_params& cost_params, const t_bb& bounding_box, @@ -296,14 +290,11 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho // // Since there is no single *target* node this uses Dijkstra's algorithm // with a modified exit condition (runs until heap is empty). -// -// Note that to re-use code used for the regular A*-based router we use a -// no-operation lookahead which always returns zero. 
template -vtr::vector ConnectionRouter::timing_driven_find_all_shortest_paths_from_heap( +vtr::vector ConnectionRouter::timing_driven_find_all_shortest_paths_from_heap( const t_conn_cost_params& cost_params, const t_bb& bounding_box) { - vtr::vector cheapest_paths(rr_nodes_.size()); + vtr::vector cheapest_paths(rr_nodes_.size()); VTR_ASSERT_SAFE(heap_.is_valid()); @@ -311,17 +302,20 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho VTR_LOGV_DEBUG(router_debug_, " Initial heap empty (no source)\n"); } - while (!heap_.is_empty_heap()) { - // cheapest t_heap in current route tree to be expanded on - t_heap* cheapest = heap_.get_heap_head(); + // Start measuring path search time + std::chrono::steady_clock::time_point begin_time = std::chrono::steady_clock::now(); + + HeapNode cheapest; + while (heap_.try_pop(cheapest)) { + // inode with cheapest total cost in current route tree to be expanded on + const auto& [ new_total_cost, inode ] = cheapest; update_router_stats(router_stats_, false, - cheapest->index, + inode, rr_graph_); - RRNodeId inode = cheapest->index; VTR_LOGV_DEBUG(router_debug_, " Popping node %d (cost: %g)\n", - inode, cheapest->cost); + inode, new_total_cost); // Since we want to find shortest paths to all nodes in the graph // we do not specify a target node. 
@@ -330,78 +324,80 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho // lookahead we can re-use the node exploration code from the regular router RRNodeId target_node = RRNodeId::INVALID(); - timing_driven_expand_cheapest(cheapest, + timing_driven_expand_cheapest(inode, + new_total_cost, target_node, cost_params, bounding_box, t_bb()); - if (cheapest_paths[inode].index == RRNodeId::INVALID() || cheapest_paths[inode].cost >= cheapest->cost) { - VTR_LOGV_DEBUG(router_debug_, " Better cost to node %d: %g (was %g)\n", inode, cheapest->cost, cheapest_paths[inode].cost); - cheapest_paths[inode] = *cheapest; + if (cheapest_paths[inode].index == RRNodeId::INVALID() || cheapest_paths[inode].total_cost >= new_total_cost) { + VTR_LOGV_DEBUG(router_debug_, " Better cost to node %d: %g (was %g)\n", inode, new_total_cost, cheapest_paths[inode].total_cost); + // Only the `index` and `prev_edge` fields of `cheapest_paths[inode]` are used after this function returns + cheapest_paths[inode].index = inode; + cheapest_paths[inode].prev_edge = rr_node_route_inf_[inode].prev_edge; } else { - VTR_LOGV_DEBUG(router_debug_, " Worse cost to node %d: %g (better %g)\n", inode, cheapest->cost, cheapest_paths[inode].cost); + VTR_LOGV_DEBUG(router_debug_, " Worse cost to node %d: %g (better %g)\n", inode, new_total_cost, cheapest_paths[inode].total_cost); } - - rcv_path_manager.free_path_struct(cheapest->path_data); - heap_.free(cheapest); } + // Stop measuring path search time + std::chrono::steady_clock::time_point end_time = std::chrono::steady_clock::now(); + path_search_cumulative_time += std::chrono::duration_cast(end_time - begin_time); + return cheapest_paths; } template -void ConnectionRouter::timing_driven_expand_cheapest(t_heap* cheapest, +void ConnectionRouter::timing_driven_expand_cheapest(RRNodeId from_node, + float new_total_cost, RRNodeId target_node, const t_conn_cost_params& cost_params, const t_bb& bounding_box, const t_bb& target_bb) { - RRNodeId inode = 
cheapest->index; - - t_rr_node_route_inf* route_inf = &rr_node_route_inf_[inode]; - float best_total_cost = route_inf->path_cost; - float best_back_cost = route_inf->backward_path_cost; - - float new_total_cost = cheapest->cost; - float new_back_cost = cheapest->backward_path_cost; - - /* I only re-expand a node if both the "known" backward cost is lower * - * in the new expansion (this is necessary to prevent loops from * - * forming in the routing and causing havoc) *and* the expected total * - * cost to the sink is lower than the old value. Different R_upstream * - * values could make a path with lower back_path_cost less desirable * - * than one with higher cost. Test whether or not I should disallow * - * re-expansion based on a higher total cost. */ - - if (best_total_cost > new_total_cost && ((rcv_path_manager.is_enabled()) || best_back_cost > new_back_cost)) { - // Explore from this node, since the current/new partial path has the best cost - // found so far - VTR_LOGV_DEBUG(router_debug_, " Better cost to %d\n", inode); + float best_total_cost = rr_node_route_inf_[from_node].path_cost; + if (best_total_cost == new_total_cost) { + // Explore from this node, since its total cost is exactly the same as + // the best total cost ever seen for this node. Otherwise, prune this node + // to reduce redundant work (i.e., unnecessary neighbor exploration). + // `new_total_cost` is used here as an identifier to detect if the pair + // (from_node or inode, new_total_cost) was the most recently pushed + // element for the corresponding node. + // + // Note: For RCV, it often isn't searching for a shortest path; it is + // searching for a path in the target delay range. So it might find a + // path to node n that has a higher `backward_path_cost` but the `total_cost` + // (including expected delay to sink, going through a cost function that + // checks that against the target delay) might be lower than the previously + // stored value. 
In that case we want to re-expand the node so long as + // it doesn't create a loop. That `rcv_path_manager` should store enough + // info for us to avoid loops. + RTExploredNode current; + current.index = from_node; + current.backward_path_cost = rr_node_route_inf_[from_node].backward_path_cost; + current.prev_edge = rr_node_route_inf_[from_node].prev_edge; + current.R_upstream = rr_node_route_inf_[from_node].R_upstream; + + VTR_LOGV_DEBUG(router_debug_, " Better cost to %d\n", from_node); VTR_LOGV_DEBUG(router_debug_, " New total cost: %g\n", new_total_cost); - VTR_LOGV_DEBUG(router_debug_, " New back cost: %g\n", new_back_cost); - VTR_LOGV_DEBUG(router_debug_ && (cheapest->prev_edge() != RREdgeId::INVALID()), - " Setting path costs for associated node %d (from %d edge %zu)\n", - cheapest->index, - static_cast(rr_graph_->edge_src_node(cheapest->prev_edge())), - static_cast(cheapest->prev_edge())); + VTR_LOGV_DEBUG(router_debug_ && (current.prev_edge != RREdgeId::INVALID()), + " Setting path costs for associated node %d (from %d edge %zu)\n", + from_node, + static_cast(rr_graph_->edge_src_node(current.prev_edge)), + static_cast(current.prev_edge)); - update_cheapest(cheapest, route_inf); - - timing_driven_expand_neighbours(cheapest, cost_params, bounding_box, - target_node, target_bb); + timing_driven_expand_neighbours(current, cost_params, bounding_box, target_node, target_bb); } else { // Post-heap prune, do not re-explore from the current/new partial path as it // has worse cost than the best partial path to this node found so far - VTR_LOGV_DEBUG(router_debug_, " Worse cost to %d\n", inode); + VTR_LOGV_DEBUG(router_debug_, " Worse cost to %d\n", from_node); VTR_LOGV_DEBUG(router_debug_, " Old total cost: %g\n", best_total_cost); - VTR_LOGV_DEBUG(router_debug_, " Old back cost: %g\n", best_back_cost); VTR_LOGV_DEBUG(router_debug_, " New total cost: %g\n", new_total_cost); - VTR_LOGV_DEBUG(router_debug_, " New back cost: %g\n", new_back_cost); } } template -void 
ConnectionRouter::timing_driven_expand_neighbours(t_heap* current, +void ConnectionRouter::timing_driven_expand_neighbours(const RTExploredNode& current, const t_conn_cost_params& cost_params, const t_bb& bounding_box, RRNodeId target_node, @@ -409,8 +405,7 @@ void ConnectionRouter::timing_driven_expand_neighbours(t_heap* current, /* Puts all the rr_nodes adjacent to current on the heap. */ // For each node associated with the current heap element, expand all of it's neighbors - RRNodeId from_node = current->index; - auto edges = rr_nodes_.edge_range(from_node); + auto edges = rr_nodes_.edge_range(current.index); // This is a simple prefetch that prefetches: // - RR node data reachable from this node @@ -440,7 +435,6 @@ void ConnectionRouter::timing_driven_expand_neighbours(t_heap* current, for (RREdgeId from_edge : edges) { RRNodeId to_node = rr_nodes_.edge_sink_node(from_edge); timing_driven_expand_neighbour(current, - from_node, from_edge, to_node, cost_params, @@ -454,8 +448,7 @@ void ConnectionRouter::timing_driven_expand_neighbours(t_heap* current, // RR nodes outside the expanded bounding box specified in bounding_box are not added // to the heap. 
template -void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, - RRNodeId from_node, +void ConnectionRouter::timing_driven_expand_neighbour(const RTExploredNode& current, RREdgeId from_edge, RRNodeId to_node, const t_conn_cost_params& cost_params, @@ -464,6 +457,8 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, const t_bb& target_bb) { VTR_ASSERT(bounding_box.layer_max < g_vpr_ctx.device().grid.get_num_layers()); + const RRNodeId& from_node = current.index; + // BB-pruning // Disable BB-pruning if RCV is enabled, as this can make it harder for circuits with high negative hold slack to resolve this // TODO: Only disable pruning if the net has negative hold slack, maybe go off budgets @@ -522,14 +517,13 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, // Other pruning methods have been disabled when RCV is on, so this method is required to prevent "loops" from being created bool node_exists = false; if (rcv_path_manager.is_enabled()) { - node_exists = rcv_path_manager.node_exists_in_tree(current->path_data, + node_exists = rcv_path_manager.node_exists_in_tree(rcv_path_data[from_node], to_node); } if (!node_exists || !rcv_path_manager.is_enabled()) { timing_driven_add_to_heap(cost_params, current, - from_node, to_node, from_edge, target_node); @@ -539,43 +533,53 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, // Add to_node to the heap, and also add any nodes which are connected by non-configurable edges template void ConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_params& cost_params, - const t_heap* current, - RRNodeId from_node, + const RTExploredNode& current, RRNodeId to_node, const RREdgeId from_edge, RRNodeId target_node) { const auto& device_ctx = g_vpr_ctx.device(); - t_heap next; + const RRNodeId& from_node = current.index; + + // Initialized to current + RTExploredNode next; + next.R_upstream = current.R_upstream; + next.index = to_node; + 
next.prev_edge = from_edge; + next.total_cost = std::numeric_limits::infinity(); // Not used directly + next.backward_path_cost = current.backward_path_cost; // Initalize RCV data struct if needed, otherwise it's set to nullptr rcv_path_manager.alloc_path_struct(next.path_data); - - // Costs initialized to current - next.cost = std::numeric_limits::infinity(); //Not used directly - next.backward_path_cost = current->backward_path_cost; - // path_data variables are initialized to current values - if (rcv_path_manager.is_enabled() && current->path_data) { - next.path_data->backward_cong = current->path_data->backward_cong; - next.path_data->backward_delay = current->path_data->backward_delay; + if (rcv_path_manager.is_enabled() && rcv_path_data[from_node]) { + next.path_data->backward_cong = rcv_path_data[from_node]->backward_cong; + next.path_data->backward_delay = rcv_path_data[from_node]->backward_delay; } - next.R_upstream = current->R_upstream; - evaluate_timing_driven_node_costs(&next, cost_params, from_node, - to_node, - from_edge, target_node); float best_total_cost = rr_node_route_inf_[to_node].path_cost; float best_back_cost = rr_node_route_inf_[to_node].backward_path_cost; - float new_total_cost = next.cost; + float new_total_cost = next.total_cost; float new_back_cost = next.backward_path_cost; - if (new_total_cost < best_total_cost && ((rcv_path_manager.is_enabled()) || (new_back_cost < best_back_cost))) { + // We need to only expand this node if it is a better path. And we need to + // update its `rr_node_route_inf` data as we put it into the heap; there may + // be other (previously explored) paths to this node in the heap already, + // but they will be pruned when we pop those heap nodes later as we'll see + // they have inferior costs to what is in the `rr_node_route_inf` data for + // this node. 
+ // FIXME: Adding a link to the FPT paper when it is public + // + // When RCV is enabled, prune based on the RCV-specific total path cost (see + // in `compute_node_cost_using_rcv` in `evaluate_timing_driven_node_costs`) + // to allow detours to get better QoR. + if ((!rcv_path_manager.is_enabled() && best_back_cost > new_back_cost) || + (rcv_path_manager.is_enabled() && best_total_cost > new_total_cost)) { VTR_LOGV_DEBUG(router_debug_, " Expanding to node %d (%s)\n", to_node, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, @@ -589,26 +593,10 @@ void ConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_params& // //Pre-heap prune to keep the heap small, by not putting paths which are known to be //sub-optimal (at this point in time) into the heap. - t_heap* next_ptr = heap_.alloc(); - - // Use the already created next path structure pointer when RCV is enabled - if (rcv_path_manager.is_enabled()) rcv_path_manager.move(next_ptr->path_data, next.path_data); - - //Record how we reached this node - next_ptr->cost = next.cost; - next_ptr->R_upstream = next.R_upstream; - next_ptr->backward_path_cost = next.backward_path_cost; - next_ptr->index = to_node; - next_ptr->set_prev_edge(from_edge); - - if (rcv_path_manager.is_enabled() && current->path_data) { - next_ptr->path_data->path_rr = current->path_data->path_rr; - next_ptr->path_data->edge = current->path_data->edge; - next_ptr->path_data->path_rr.emplace_back(from_node); - next_ptr->path_data->edge.emplace_back(from_edge); - } - heap_.add_to_heap(next_ptr); + update_cheapest(next, from_node); + + heap_.add_to_heap({new_total_cost, to_node}); update_router_stats(router_stats_, true, to_node, @@ -694,15 +682,16 @@ void ConnectionRouter::empty_rcv_route_tree_set() { template void ConnectionRouter::set_rcv_enabled(bool enable) { rcv_path_manager.set_enabled(enable); + if (enable) { + rcv_path_data.resize(rr_node_route_inf_.size()); + } } -//Calculates the cost of reaching to_node +//Calculates the cost 
of reaching to_node (i.e., to->index) template -void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, +void ConnectionRouter::evaluate_timing_driven_node_costs(RTExploredNode* to, const t_conn_cost_params& cost_params, RRNodeId from_node, - RRNodeId to_node, - RREdgeId from_edge, RRNodeId target_node) { /* new_costs.backward_cost: is the "known" part of the cost to this node -- the * congestion cost of all the routing resources back to the existing route @@ -713,8 +702,8 @@ void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, * new_costs.R_upstream: is the upstream resistance at the end of this node */ - //Info for the switch connecting from_node to_node - int iswitch = rr_nodes_.edge_switch(from_edge); + //Info for the switch connecting from_node to_node (i.e., to->index) + int iswitch = rr_nodes_.edge_switch(to->prev_edge); bool switch_buffered = rr_switch_inf_[iswitch].buffered(); bool reached_configurably = rr_switch_inf_[iswitch].configurable(); float switch_R = rr_switch_inf_[iswitch].R; @@ -722,7 +711,7 @@ void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, float switch_Cinternal = rr_switch_inf_[iswitch].Cinternal; //To node info - auto rc_index = rr_graph_->node_rc_index(to_node); + auto rc_index = rr_graph_->node_rc_index(to->index); float node_C = rr_rc_data_[rc_index].C; float node_R = rr_rc_data_[rc_index].R; @@ -761,12 +750,12 @@ void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, float cong_cost = 0.; if (reached_configurably) { - cong_cost = get_rr_cong_cost(to_node, cost_params.pres_fac); + cong_cost = get_rr_cong_cost(to->index, cost_params.pres_fac); } else { //Reached by a non-configurable edge. //Therefore the from_node and to_node are part of the same non-configurable node set. 
#ifdef VTR_ASSERT_SAFE_ENABLED - VTR_ASSERT_SAFE_MSG(same_non_config_node_set(from_node, to_node), + VTR_ASSERT_SAFE_MSG(same_non_config_node_set(from_node, to->index), "Non-configurably connected edges should be part of the same node set"); #endif @@ -775,8 +764,8 @@ void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, //cost. cong_cost = 0.; } - if (conn_params_->router_opt_choke_points_ && is_flat_ && rr_graph_->node_type(to_node) == IPIN) { - auto find_res = conn_params_->connection_choking_spots_.find(to_node); + if (conn_params_->router_opt_choke_points_ && is_flat_ && rr_graph_->node_type(to->index) == IPIN) { + auto find_res = conn_params_->connection_choking_spots_.find(to->index); if (find_res != conn_params_->connection_choking_spots_.end()) { cong_cost = cong_cost / pow(2, (float)find_res->second); } @@ -788,7 +777,7 @@ void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, if (cost_params.bend_cost != 0.) { t_rr_type from_type = rr_graph_->node_type(from_node); - t_rr_type to_type = rr_graph_->node_type(to_node); + t_rr_type to_type = rr_graph_->node_type(to->index); if ((from_type == CHANX && to_type == CHANY) || (from_type == CHANY && to_type == CHANX)) { to->backward_path_cost += cost_params.bend_cost; //Bend cost } @@ -798,46 +787,23 @@ void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, if (rcv_path_manager.is_enabled() && to->path_data != nullptr) { to->path_data->backward_delay += cost_params.criticality * Tdel; - to->path_data->backward_cong += (1. - cost_params.criticality) * get_rr_cong_cost(to_node, cost_params.pres_fac); + to->path_data->backward_cong += (1. 
- cost_params.criticality) * get_rr_cong_cost(to->index, cost_params.pres_fac); - total_cost = compute_node_cost_using_rcv(cost_params, to_node, target_node, to->path_data->backward_delay, to->path_data->backward_cong, to->R_upstream); + total_cost = compute_node_cost_using_rcv(cost_params, to->index, target_node, to->path_data->backward_delay, to->path_data->backward_cong, to->R_upstream); } else { const auto& device_ctx = g_vpr_ctx.device(); //Update total cost - float expected_cost = router_lookahead_.get_expected_cost(to_node, - target_node, - cost_params, - to->R_upstream); + float expected_cost = router_lookahead_.get_expected_cost(to->index, target_node, cost_params, to->R_upstream); VTR_LOGV_DEBUG(router_debug_ && !std::isfinite(expected_cost), - " Lookahead from %s (%s) to %s (%s) is non-finite, expected_cost = %f, to->R_upstream = %f\n", - rr_node_arch_name(to_node, is_flat_).c_str(), - describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, to_node, is_flat_).c_str(), - rr_node_arch_name(target_node, is_flat_).c_str(), - describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, target_node, is_flat_).c_str(), - expected_cost, to->R_upstream); + " Lookahead from %s (%s) to %s (%s) is non-finite, expected_cost = %f, to->R_upstream = %f\n", + rr_node_arch_name(to->index, is_flat_).c_str(), + describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, to->index, is_flat_).c_str(), + rr_node_arch_name(target_node, is_flat_).c_str(), + describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, target_node, is_flat_).c_str(), + expected_cost, to->R_upstream); total_cost += to->backward_path_cost + cost_params.astar_fac * std::max(0.f, expected_cost - cost_params.astar_offset); } - to->cost = total_cost; -} - -template -void ConnectionRouter::empty_heap_annotating_node_route_inf() { - //Pop any remaining nodes in the heap and annotate their costs - // - //Useful 
for visualizing router expansion in graphics, as it shows - //the cost of all nodes considered by the router (e.g. nodes never - //expanded, such as parts of the initial route tree far from the - //target). - while (!heap_.is_empty_heap()) { - t_heap* tmp = heap_.get_heap_head(); - - rr_node_route_inf_[tmp->index].path_cost = tmp->cost; - rr_node_route_inf_[tmp->index].backward_path_cost = tmp->backward_path_cost; - modified_rr_node_inf_.push_back(tmp->index); - - rcv_path_manager.free_path_struct(tmp->path_data); - heap_.free(tmp); - } + to->total_cost = total_cost; } //Adds the route tree rooted at rt_node to the heap, preparing it to be @@ -914,14 +880,35 @@ void ConnectionRouter::add_route_tree_node_to_heap( tot_cost, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, inode, is_flat_).c_str()); - push_back_node(&heap_, rr_node_route_inf_, - inode, tot_cost, RREdgeId::INVALID(), - backward_path_cost, R_upstream); + if (tot_cost > rr_node_route_inf_[inode].path_cost) { + return ; + } + add_to_mod_list(inode); + rr_node_route_inf_[inode].path_cost = tot_cost; + rr_node_route_inf_[inode].prev_edge = RREdgeId::INVALID(); + rr_node_route_inf_[inode].backward_path_cost = backward_path_cost; + rr_node_route_inf_[inode].R_upstream = R_upstream; + heap_.push_back({tot_cost, inode}); + + // push_back_node(&heap_, rr_node_route_inf_, + // inode, tot_cost, RREdgeId::INVALID(), + // backward_path_cost, R_upstream); } else { float expected_total_cost = compute_node_cost_using_rcv(cost_params, inode, target_node, rt_node.Tdel, 0, R_upstream); - push_back_node_with_info(&heap_, inode, expected_total_cost, - backward_path_cost, R_upstream, rt_node.Tdel, &rcv_path_manager); + add_to_mod_list(inode); + rr_node_route_inf_[inode].path_cost = expected_total_cost; + rr_node_route_inf_[inode].prev_edge = RREdgeId::INVALID(); + rr_node_route_inf_[inode].backward_path_cost = backward_path_cost; + rr_node_route_inf_[inode].R_upstream = R_upstream; + + 
rcv_path_manager.alloc_path_struct(rcv_path_data[inode]); + rcv_path_data[inode]->backward_delay = rt_node.Tdel; + + heap_.push_back({expected_total_cost, inode}); + + // push_back_node_with_info(&heap_, inode, expected_total_cost, + // backward_path_cost, R_upstream, rt_node.Tdel, &rcv_path_manager); } update_router_stats(router_stats_, @@ -1144,16 +1131,6 @@ std::unique_ptr make_connection_router(e_heap_type he rr_switch_inf, rr_node_route_inf, is_flat); - case e_heap_type::BUCKET_HEAP_APPROXIMATION: - return std::make_unique>( - grid, - router_lookahead, - rr_nodes, - rr_graph, - rr_rc_data, - rr_switch_inf, - rr_node_route_inf, - is_flat); default: VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Unknown heap_type %d", heap_type); diff --git a/vpr/src/route/connection_router.h b/vpr/src/route/connection_router.h index 4118c5a7c7f..cee93384974 100644 --- a/vpr/src/route/connection_router.h +++ b/vpr/src/route/connection_router.h @@ -10,10 +10,9 @@ #include "router_stats.h" #include "spatial_route_tree_lookup.h" -// Prune the heap when it contains 4x the number of nodes in the RR graph. -constexpr size_t kHeapPruneFactor = 4; +#include "d_ary_heap.h" -// This class encapsolates the timing driven connection router. This class +// This class encapsulates the timing driven connection router. This class // routes from some initial set of sources (via the input rt tree) to a // particular sink. // @@ -44,12 +43,17 @@ class ConnectionRouter : public ConnectionRouterInterface { , rr_node_route_inf_(rr_node_route_inf) , is_flat_(is_flat) , router_stats_(nullptr) - , router_debug_(false) { + , router_debug_(false) + , path_search_cumulative_time(0) { heap_.init_heap(grid); - heap_.set_prune_limit(rr_nodes_.size(), kHeapPruneFactor * rr_nodes_.size()); only_opin_inter_layer = (grid.get_num_layers() > 1) && inter_layer_connections_limited_to_opin(*rr_graph); } + ~ConnectionRouter() { + VTR_LOG("Serial Connection Router is being destroyed. 
Time spent on path search: %.3f seconds.\n", + std::chrono::duration(path_search_cumulative_time).count()); + } + // Clear's the modified list. Should be called after reset_path_costs // have been called. void clear_modified_rr_node_info() final { @@ -58,7 +62,14 @@ class ConnectionRouter : public ConnectionRouterInterface { // Reset modified data in rr_node_route_inf based on modified_rr_node_inf. void reset_path_costs() final { + // Reset the node info stored in rr_node_route_inf variable ::reset_path_costs(modified_rr_node_inf_); + // Reset the node info stored inside the connection router + if (rcv_path_manager.is_enabled()) { + for (const auto& node : modified_rr_node_inf_) { + rcv_path_data[node] = nullptr; + } + } } /** Finds a path from the route tree rooted at rt_root to sink_node. @@ -68,8 +79,8 @@ class ConnectionRouter : public ConnectionRouterInterface { * Returns a tuple of: * bool: path exists? (hard failure, rr graph disconnected) * bool: should retry with full bounding box? (only used in parallel routing) - * t_heap: heap element of cheapest path */ - std::tuple timing_driven_route_connection_from_route_tree( + * RTExploredNode: the explored sink node, from which the cheapest path can be found via back-tracing */ + std::tuple timing_driven_route_connection_from_route_tree( const RouteTreeNode& rt_root, RRNodeId sink_node, const t_conn_cost_params& cost_params, @@ -86,8 +97,8 @@ class ConnectionRouter : public ConnectionRouterInterface { * Returns a tuple of: * bool: path exists? (hard failure, rr graph disconnected) * bool: should retry with full bounding box? 
(only used in parallel routing) - * t_heap: heap element of cheapest path */ - std::tuple timing_driven_route_connection_from_route_tree_high_fanout( + * RTExploredNode: the explored sink node, from which the cheapest path can be found via back-tracing */ + std::tuple timing_driven_route_connection_from_route_tree_high_fanout( const RouteTreeNode& rt_root, RRNodeId sink_node, const t_conn_cost_params& cost_params, @@ -105,7 +116,10 @@ class ConnectionRouter : public ConnectionRouterInterface { // Dijkstra's algorithm with a modified exit condition (runs until heap is // empty). When using cost_params.astar_fac = 0, for efficiency the // RouterLookahead used should be the NoOpLookahead. - vtr::vector timing_driven_find_all_shortest_paths_from_route_tree( + // + // Note: This routine is currently used only to generate information that + // may be helpful in debugging an architecture. + vtr::vector timing_driven_find_all_shortest_paths_from_route_tree( const RouteTreeNode& rt_root, const t_conn_cost_params& cost_params, const t_bb& bounding_box, @@ -136,18 +150,24 @@ class ConnectionRouter : public ConnectionRouterInterface { } } - // Update the route path to the node pointed to by cheapest. - inline void update_cheapest(t_heap* cheapest) { - update_cheapest(cheapest, &rr_node_route_inf_[cheapest->index]); - } - - inline void update_cheapest(t_heap* cheapest, t_rr_node_route_inf* route_inf) { - //Record final link to target - add_to_mod_list(cheapest->index); - - route_inf->prev_edge = cheapest->prev_edge(); - route_inf->path_cost = cheapest->cost; - route_inf->backward_path_cost = cheapest->backward_path_cost; + // Update the route path to the node `cheapest.index` via the path from + // `from_node` via `cheapest.prev_edge`. 
+ inline void update_cheapest(RTExploredNode& cheapest, const RRNodeId& from_node) { + const RRNodeId& inode = cheapest.index; + add_to_mod_list(inode); + rr_node_route_inf_[inode].prev_edge = cheapest.prev_edge; + rr_node_route_inf_[inode].path_cost = cheapest.total_cost; + rr_node_route_inf_[inode].backward_path_cost = cheapest.backward_path_cost; + + // Use the already created next path structure pointer when RCV is enabled + if (rcv_path_manager.is_enabled()) { + rcv_path_manager.move(rcv_path_data[inode], cheapest.path_data); + + rcv_path_data[inode]->path_rr = rcv_path_data[from_node]->path_rr; + rcv_path_data[inode]->edge = rcv_path_data[from_node]->edge; + rcv_path_data[inode]->path_rr.push_back(from_node); + rcv_path_data[inode]->edge.push_back(cheapest.prev_edge); + } } /** Common logic from timing_driven_route_connection_from_route_tree and @@ -157,9 +177,8 @@ class ConnectionRouter : public ConnectionRouterInterface { * @param[in] sink_node Sink node ID to route to * @param[in] cost_params * @param[in] bounding_box Keep search confined to this bounding box - * @return bool Signal to retry this connection with a full-device bounding box, - * @return t_heap* Heap element describing the path found. */ - std::tuple timing_driven_route_connection_common_setup( + * @return bool Signal to retry this connection with a full-device bounding box */ + bool timing_driven_route_connection_common_setup( const RouteTreeNode& rt_root, RRNodeId sink_node, const t_conn_cost_params& cost_params, @@ -168,20 +187,23 @@ class ConnectionRouter : public ConnectionRouterInterface { // Finds a path to sink_node, starting from the elements currently in the // heap. // + // If the path is not found, which means that the path_cost of sink_node in + // RR node route info has never been updated, `rr_node_route_inf_[sink_node] + // .path_cost` will be the initial value (i.e., float infinity). This case + // can be detected by `std::isinf(rr_node_route_inf_[sink_node].path_cost)`. 
+ // // This is the core maze routing routine. // // Note: For understanding the connection router, start here. - // - // Returns either the last element of the path, or nullptr if no path is - // found - t_heap* timing_driven_route_connection_from_heap( + void timing_driven_route_connection_from_heap( RRNodeId sink_node, const t_conn_cost_params& cost_params, const t_bb& bounding_box); // Expand this current node if it is a cheaper path. void timing_driven_expand_cheapest( - t_heap* cheapest, + RRNodeId from_node, + float new_total_cost, RRNodeId target_node, const t_conn_cost_params& cost_params, const t_bb& bounding_box, @@ -189,20 +211,19 @@ class ConnectionRouter : public ConnectionRouterInterface { // Expand each neighbor of the current node. void timing_driven_expand_neighbours( - t_heap* current, + const RTExploredNode& current, const t_conn_cost_params& cost_params, const t_bb& bounding_box, RRNodeId target_node, const t_bb& target_bb); - // Conditionally adds to_node to the router heap (via path from from_node + // Conditionally adds to_node to the router heap (via path from current.index // via from_edge). // // RR nodes outside bounding box specified in bounding_box are not added // to the heap. 
void timing_driven_expand_neighbour( - t_heap* current, - RRNodeId from_node, + const RTExploredNode& current, RREdgeId from_edge, RRNodeId to_node, const t_conn_cost_params& cost_params, @@ -214,28 +235,23 @@ class ConnectionRouter : public ConnectionRouterInterface { // non-configurable edges void timing_driven_add_to_heap( const t_conn_cost_params& cost_params, - const t_heap* current, - RRNodeId from_node, + const RTExploredNode& current, RRNodeId to_node, RREdgeId from_edge, RRNodeId target_node); // Calculates the cost of reaching to_node void evaluate_timing_driven_node_costs( - t_heap* to, + RTExploredNode* to, const t_conn_cost_params& cost_params, RRNodeId from_node, - RRNodeId to_node, - RREdgeId from_edge, RRNodeId target_node); // Find paths from current heap to all nodes in the RR graph - vtr::vector timing_driven_find_all_shortest_paths_from_heap( + vtr::vector timing_driven_find_all_shortest_paths_from_heap( const t_conn_cost_params& cost_params, const t_bb& bounding_box); - void empty_heap_annotating_node_route_inf(); - //Adds the route tree rooted at rt_node to the heap, preparing it to be //used as branch-points for further routing. void add_route_tree_to_heap(const RouteTreeNode& rt_node, @@ -286,8 +302,13 @@ class ConnectionRouter : public ConnectionRouterInterface { bool only_opin_inter_layer; - // The path manager for RCV, keeps track of the route tree as a set, also manages the allocation of the heap types + // Cumulative time spent in the path search part of the connection router. + std::chrono::microseconds path_search_cumulative_time; + + // The path manager for RCV, keeps track of the route tree as a set, also + // manages the allocation of `rcv_path_data`. PathManager rcv_path_manager; + vtr::vector rcv_path_data; }; /** Construct a connection router that uses the specified heap type. 
diff --git a/vpr/src/route/connection_router_interface.h b/vpr/src/route/connection_router_interface.h index b732e8f839e..62111edc285 100644 --- a/vpr/src/route/connection_router_interface.h +++ b/vpr/src/route/connection_router_interface.h @@ -52,8 +52,8 @@ class ConnectionRouterInterface { * Returns a tuple of: * bool: path exists? (hard failure, rr graph disconnected) * bool: should retry with full bounding box? - * t_heap: heap element of cheapest path */ - virtual std::tuple timing_driven_route_connection_from_route_tree( + * RTExploredNode: the explored sink node, from which the cheapest path can be found via back-tracing */ + virtual std::tuple timing_driven_route_connection_from_route_tree( const RouteTreeNode& rt_root, RRNodeId sink_node, const t_conn_cost_params& cost_params, @@ -71,8 +71,8 @@ class ConnectionRouterInterface { * Returns a tuple of: * bool: path exists? (hard failure, rr graph disconnected) * bool: should retry with full bounding box? - * t_heap: heap element of cheapest path */ - virtual std::tuple timing_driven_route_connection_from_route_tree_high_fanout( + * RTExploredNode: the explored sink node, from which the cheapest path can be found via back-tracing */ + virtual std::tuple timing_driven_route_connection_from_route_tree_high_fanout( const RouteTreeNode& rt_root, RRNodeId sink_node, const t_conn_cost_params& cost_params, @@ -91,7 +91,10 @@ class ConnectionRouterInterface { // Dijkstra's algorithm with a modified exit condition (runs until heap is // empty). When using cost_params.astar_fac = 0, for efficiency the // RouterLookahead used should be the NoOpLookahead. - virtual vtr::vector timing_driven_find_all_shortest_paths_from_route_tree( + // + // Note: This routine is currently used only to generate information that + // may be helpful in debugging an architecture. 
+ virtual vtr::vector timing_driven_find_all_shortest_paths_from_route_tree( const RouteTreeNode& rt_root, const t_conn_cost_params& cost_params, const t_bb& bounding_box, diff --git a/vpr/src/route/d_ary_heap.h b/vpr/src/route/d_ary_heap.h new file mode 100644 index 00000000000..5ac59f1eef2 --- /dev/null +++ b/vpr/src/route/d_ary_heap.h @@ -0,0 +1,74 @@ +#ifndef _VTR_D_ARY_HEAP_H +#define _VTR_D_ARY_HEAP_H + +#include + +#include "device_grid.h" +#include "heap_type.h" +#include "d_ary_heap.tpp" + +/** + * @brief Min-heap with D child nodes per parent. + * + * @note + * Currently, DAryHeap only has two children, BinaryHeap and FourAryHeap. On small circuits, + * these heaps have negligible differences in runtime, but on larger heaps, runtime is lower when + * using FourAryHeap. On Koios large benchmarks, the runtime is ~5% better on FourAryHeap compared + * to BinaryHeap. This is likely because FourAryHeap has lower tree height, and as we can fit 8 + * heap node (each is 8 bytes) on a cache line (commonly 64 bytes on modern architectures), each + * heap operation (the comparison among sibling nodes) tends to benefit from the caches. 
+*/ +template +class DAryHeap : public HeapInterface { + public: + using priority_queue = customized_d_ary_priority_queue, HeapNodeComparator>; + + DAryHeap() {} + + void init_heap(const DeviceGrid& grid) { + size_t target_heap_size = (grid.width() - 1) * (grid.height() - 1); + pq_.reserve(target_heap_size); // reserve the memory for the heap structure + } + + bool try_pop(HeapNode& heap_node) { + if (pq_.empty()) { + return false; + } else { + heap_node = pq_.top(); + pq_.pop(); + return true; + } + } + + void add_to_heap(const HeapNode& heap_node) { + pq_.push(heap_node); + } + + void push_back(const HeapNode& heap_node) { + pq_.push(heap_node); // FIXME: add to heap without maintaining the heap property + } + + void build_heap() { + // FIXME: restore the heap property after pushing back nodes + } + + bool is_valid() const { + return true; // FIXME: checking if the heap property is maintained or not + } + + void empty_heap() { + pq_.clear(); + } + + bool is_empty_heap() const { + return (bool)(pq_.empty()); + } + + private: + priority_queue pq_; +}; + +using BinaryHeap = DAryHeap<2>; +using FourAryHeap = DAryHeap<4>; + +#endif /* _VTR_D_ARY_HEAP_H */ diff --git a/vpr/src/route/d_ary_heap.tpp b/vpr/src/route/d_ary_heap.tpp new file mode 100644 index 00000000000..565b8bac72b --- /dev/null +++ b/vpr/src/route/d_ary_heap.tpp @@ -0,0 +1,162 @@ +#pragma once + +#include +#include + +template, class Compare = std::less> +class customized_d_ary_priority_queue { + static_assert(D == 2 || D == 4, "Only support binary or 4-ary priority queue"); + + public: + typedef Container container_type; + typedef typename Container::value_type value_type; + typedef typename Container::size_type size_type; + typedef typename Container::reference reference; + typedef typename Container::const_reference const_reference; + + Compare comp_; + /** + * @details + * heap_ is indexed from [1..heap_size]; the 0th element is unused. 
This simplifies arithmetic + * in first_child_index() and parent_index() functions. + * + * @todo + * If an 8-ary heap is implemented, experiment with starting at index 0 + */ + Container heap_; + + private: + inline size_t parent_index(const size_t i) { + if constexpr (D == 2) { + return i >> 1; + } else { + return (i + 2) >> 2; + } + } + + inline size_t first_child_index(const size_t i) { + if constexpr (D == 2) { + return i << 1; + } else { + return (i << 2) - 2; + } + } + + inline size_t largest_child_index(const size_t first_child) { + if constexpr (D == 2) { + return first_child + !!comp_(heap_[first_child], heap_[first_child + 1]); + } else { + const size_t child_1 = first_child; + const size_t child_2 = child_1 + 1; + const size_t child_3 = child_1 + 2; + const size_t child_4 = child_1 + 3; + const size_t first_half_largest = child_1 + !!comp_(heap_[child_1], heap_[child_2]); + const size_t second_half_largest = child_3 + !!comp_(heap_[child_3], heap_[child_4]); + return comp_(heap_[first_half_largest], heap_[second_half_largest]) ? second_half_largest : first_half_largest; + } + } + + inline size_t largest_child_index_partial(const size_t first_child, const size_t num_children /*must < `D`*/) { + if constexpr (D == 2) { + (void) num_children; + return first_child; + } else { + switch (num_children) { + case 3: { + const size_t child_1 = first_child; + const size_t child_2 = child_1 + 1; + const size_t child_3 = child_1 + 2; + const size_t first_two_children_largest = child_1 + !!comp_(heap_[child_1], heap_[child_2]); + return comp_(heap_[first_two_children_largest], heap_[child_3]) ? 
child_3 : first_two_children_largest; + } + case 2: { + return first_child + !!comp_(heap_[first_child], heap_[first_child + 1]); + } + default: { + return first_child; + } + } + } + } + + inline void pop_customized_heap() { + size_t length = heap_.size() - 1; + auto end = heap_.end(); + auto value = std::move(end[-1]); + end[-1] = std::move(heap_[1]); + size_t index = 1; + for (;;) { + size_t first_child = first_child_index(index); + size_t last_child = first_child + (D - 1); + if (last_child < length) { + size_t largest_child = largest_child_index(first_child); + if (!comp_(value, heap_[largest_child])) { + break; + } + heap_[index] = std::move(heap_[largest_child]); + index = largest_child; + } else if (first_child < length) { + size_t largest_child = largest_child_index_partial(first_child, length - first_child); + if (comp_(value, heap_[largest_child])) { + heap_[index] = std::move(heap_[largest_child]); + index = largest_child; + } + break; + } else { + break; + } + } + heap_[index] = std::move(value); + } + + inline void push_customized_heap() { + auto value = std::move(heap_.back()); + size_t index = heap_.size() - 1; + while (index > 1) { + size_t parent = parent_index(index); + if (!comp_(heap_[parent], value)) { + break; + } + heap_[index] = std::move(heap_[parent]); + index = parent; + } + heap_[index] = std::move(value); + } + + public: + explicit customized_d_ary_priority_queue(const Compare& compare = Compare(), + const Container& cont = Container()) + : comp_(compare) + , heap_(cont) { + heap_.resize(1); // FIXME: currently do not support `make_heap` from cont (heap_) + } + + inline bool empty() const { + return heap_.size() == 1; // heap_[0] is invalid, heap is indexed from 1 + } + + inline size_type size() const { + return heap_.size() - 1; // heap_[0] is invalid, heap is indexed from 1 + } + + inline const_reference top() const { return heap_[1]; } + + inline void pop() { + pop_customized_heap(); + heap_.pop_back(); + } + + inline void push(const 
value_type& value) { + heap_.push_back(value); + push_customized_heap(); + } + + inline void push(value_type&& value) { + heap_.push_back(std::move(value)); + push_customized_heap(); + } + + inline void clear() { heap_.resize(1); } + + inline void reserve(size_type new_cap) { heap_.reserve(new_cap + 1); } +}; diff --git a/vpr/src/route/four_ary_heap.cpp b/vpr/src/route/four_ary_heap.cpp deleted file mode 100644 index e70ed389e9a..00000000000 --- a/vpr/src/route/four_ary_heap.cpp +++ /dev/null @@ -1,107 +0,0 @@ -#include "four_ary_heap.h" -#include "vtr_log.h" - -// The leftmost/smallest-index child of node i -static inline size_t first_child(size_t i) { return (i << 2) - 2; } - -inline size_t FourAryHeap::parent(size_t i) const { return (i + 2) >> 2; } - -inline size_t FourAryHeap::smallest_child(size_t i) const { - // This function could be a simple loop to find the min cost child. However, - // using switch-case is 3% faster, which is worthwhile as this function is - // called very frequently. - - const size_t child_1 = first_child(i); - const size_t child_2 = child_1 + 1; - const size_t child_3 = child_1 + 2; - const size_t child_4 = child_1 + 3; - - size_t num_children = std::max(std::min(4, (int)heap_tail_ - (int)child_1), 0); - - switch (num_children) { - case 4: { - size_t minA = (heap_[child_1].cost < heap_[child_2].cost) ? child_1 : child_2; - size_t minB = (heap_[child_3].cost < heap_[child_4].cost) ? child_3 : child_4; - return (heap_[minA].cost < heap_[minB].cost) ? minA : minB; - } - case 3: { - size_t minA = (heap_[child_1].cost < heap_[child_2].cost) ? child_1 : child_2; - return (heap_[minA].cost < heap_[child_3].cost) ? minA : child_3; - } - case 2: - return (heap_[child_1].cost < heap_[child_2].cost) ? 
child_1 : child_2; - default: - return child_1; - } -} - -bool FourAryHeap::is_valid() const { - if (heap_.empty()) { - return false; - } - - for (size_t i = 1; i <= parent(heap_tail_); ++i) { - size_t leftmost_child = first_child(i); - - for (size_t j = 0; j < 4; ++j) { - if (leftmost_child + j >= heap_tail_) - break; - else if (heap_[leftmost_child + j].cost < heap_[i].cost) - return false; - } - } - - return true; -} - -t_heap* FourAryHeap::get_heap_head() { - /* Returns a pointer to the smallest element on the heap, or NULL if the * - * heap is empty. Invalid (index == OPEN) entries on the heap are never * - * returned -- they are just skipped over. */ - - t_heap* cheapest; - size_t hole, child; - - do { - if (heap_tail_ == 1) { /* Empty heap. */ - VTR_LOG_WARN("Empty heap occurred in get_heap_head.\n"); - return (nullptr); - } - - cheapest = heap_[1].elem_ptr; - - hole = 1; - child = smallest_child(hole); - - --heap_tail_; - - while (child < heap_tail_) { - child = smallest_child(hole); - - heap_[hole] = heap_[child]; - hole = child; - child = first_child(hole); - } - - sift_up(hole, heap_[heap_tail_]); - } while (!cheapest->index.is_valid()); /* Get another one if invalid entry. */ - - return (cheapest); -} - -// make a heap rooted at index hole by **sifting down** in O(lgn) time -void FourAryHeap::sift_down(size_t hole) { - heap_elem head{heap_[hole]}; - size_t child{smallest_child(hole)}; - - while (child < heap_tail_) { - if (heap_[child].cost < head.cost) { - heap_[hole] = heap_[child]; - hole = child; - child = smallest_child(hole); - } else - break; - } - - heap_[hole] = head; -} \ No newline at end of file diff --git a/vpr/src/route/four_ary_heap.h b/vpr/src/route/four_ary_heap.h deleted file mode 100644 index 8dcb1d01b7d..00000000000 --- a/vpr/src/route/four_ary_heap.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef VTR_FOUR_ARY_HEAP_H -#define VTR_FOUR_ARY_HEAP_H - -#include "k_ary_heap.h" -#include - -/** - * @brief Minheap with 4 child nodes per parent. 
- * - * @note - * Currently, KAryHeap's two children are BinaryHeap and FourAryHeap. On small circuits, these - * heaps have negligible differences in runtime, but on larger heaps, runtime is lower when - * using FourAryHeap. On titan benchmarks, the runtime is ~1.8% better on FourAryHeap compared - * to BinaryHeap. This is likely because FourAryHeap is more cache friendly, as we can fit 5 - * heap_elem on a cache line. -*/ -class FourAryHeap : public KAryHeap { - public: - bool is_valid() const final; - t_heap* get_heap_head() final; - - private: - void sift_down(size_t hole) final; - size_t parent(size_t i) const final; - - /** - * @param i The parent node. - * - * @return The child node of i with the smallest cost. Returns the first (smallest index) child of i - * if i has no children. - */ - size_t smallest_child(size_t i) const; -}; - -#endif //VTR_FOUR_ARY_HEAP_H diff --git a/vpr/src/route/heap_type.cpp b/vpr/src/route/heap_type.cpp index f9ee97dd657..d3cfec9c495 100644 --- a/vpr/src/route/heap_type.cpp +++ b/vpr/src/route/heap_type.cpp @@ -1,63 +1,7 @@ #include "heap_type.h" -#include "binary_heap.h" -#include "four_ary_heap.h" -#include "bucket.h" -#include "rr_graph_fwd.h" #include "vpr_error.h" -#include "vpr_types.h" - -HeapStorage::HeapStorage() - : heap_free_head_(nullptr) - , num_heap_allocated_(0) {} - -t_heap* -HeapStorage::alloc() { - if (heap_free_head_ == nullptr) { /* No elements on the free list */ - heap_free_head_ = vtr::chunk_new(&heap_ch_); - } - - //Extract the head - t_heap* temp_ptr = heap_free_head_; - heap_free_head_ = heap_free_head_->next_heap_item(); - - num_heap_allocated_++; - - //Reset - temp_ptr->set_next_heap_item(nullptr); - temp_ptr->cost = 0.; - temp_ptr->backward_path_cost = 0.; - temp_ptr->R_upstream = 0.; - temp_ptr->index = RRNodeId::INVALID(); - temp_ptr->path_data = nullptr; - temp_ptr->set_prev_edge(RREdgeId::INVALID()); - return (temp_ptr); -} - -void HeapStorage::free(t_heap* hptr) { - 
hptr->set_next_heap_item(heap_free_head_); - heap_free_head_ = hptr; - num_heap_allocated_--; -} - -void HeapStorage::free_all_memory() { - VTR_ASSERT(num_heap_allocated_ == 0); - - if (heap_free_head_ != nullptr) { - t_heap* curr = heap_free_head_; - while (curr) { - t_heap* tmp = curr; - curr = curr->next_heap_item(); - - vtr::chunk_delete(tmp, &heap_ch_); - } - - heap_free_head_ = nullptr; - } - - /*free the memory chunks that were used by heap and linked f pointer */ - free_chunk_memory(&heap_ch_); -} +#include "d_ary_heap.h" std::unique_ptr make_heap(e_heap_type heap_type) { switch (heap_type) { @@ -65,8 +9,6 @@ std::unique_ptr make_heap(e_heap_type heap_type) { return std::make_unique(); case e_heap_type::FOUR_ARY_HEAP: return std::make_unique(); - case e_heap_type::BUCKET_HEAP_APPROXIMATION: - return std::make_unique(); default: VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Unknown heap_type %d", heap_type); } diff --git a/vpr/src/route/heap_type.h b/vpr/src/route/heap_type.h index e3dcb071c7d..dd722928bcc 100644 --- a/vpr/src/route/heap_type.h +++ b/vpr/src/route/heap_type.h @@ -1,6 +1,7 @@ #ifndef _HEAP_TYPE_H #define _HEAP_TYPE_H +#include #include "physical_types.h" #include "device_grid.h" #include "vtr_memory.h" @@ -8,186 +9,81 @@ #include "rr_graph_fwd.h" #include "route_path_manager.h" +using HeapNodePriority = float; +using HeapNodeId = RRNodeId; +// Ensure that the heap node structure occupies only 64 bits to make the heap cache-friendly +// and achieve high performance. +static_assert(sizeof(RRNodeId) == sizeof(uint32_t)); + /** * @brief Used by the heap as its fundamental data structure. Each heap - * element represents a partial route. + * node contains only the heap priority value (i.e., the cost of the RR node) + * and the index of the RR node. The size of each heap node is minimized to + * ensure that the heap is cache-friendly and to make the initialization and + * copying of heap nodes efficient. 
*/ -struct t_heap { - ///@brief The cost used to sort heap. For the timing-driven router this is the backward_path_cost + expected cost to the target. - float cost = 0.; - ///@brief The "known" cost of the path up to and including this node. Used only by the timing-driven router. In this case, the - ///.cost member contains not only the known backward cost but also an expected cost to the target. - float backward_path_cost = 0.; - ///@brief Used only by the timing-driven router. Stores the upstream resistance to ground from this node, including the resistance - /// of the node itself (device_ctx.rr_nodes[index].R). - float R_upstream = 0.; - ///@brief The RR node index associated with the costs/R_upstream values. - RRNodeId index = RRNodeId::INVALID(); - ///@brief Structure to handle extra RCV structures. Managed by PathManager class. - t_heap_path* path_data; - - /** - * @brief Get the next t_heap item in the linked list. - */ - t_heap* next_heap_item() const { - return u.next; - } - - /** - * @brief Set the next t_heap item in the linked list. - */ - void set_next_heap_item(t_heap* next) { - u.next = next; - } - - /** - * @brief Get the edge from the previous node used to reach the current node. - * - * @note - * Be careful: will return 0 (a valid id!) if uninitialized. - */ - constexpr RREdgeId prev_edge() const { - static_assert(sizeof(uint32_t) == sizeof(RREdgeId)); - return RREdgeId(u.prev_edge); - } - - /** - * @brief Set the edge from the previous node used to reach the current node.. - */ - inline void set_prev_edge(RREdgeId edge) { - static_assert(sizeof(uint32_t) == sizeof(RREdgeId)); - u.prev_edge = size_t(edge); - } - - private: - union { - ///@brief Pointer to the next t_heap structure in the free linked list. - t_heap* next = nullptr; - - /** - * @brief The edge from the previous node used to reach the current. Not used when on the heap. 
- * - * @note - * The previous edge is not a StrongId for performance & brevity - * reasons: StrongIds can't be trivially placed into an anonymous - * union. - */ - uint32_t prev_edge; - } u; +struct HeapNode { + ///@brief The priority value or cost used to sort heap. For the timing-driven router + /// this is the total_cost (i.e., backward_path_cost + expected cost to the target). + HeapNodePriority prio; + ///@brief The RR node index associated with the cost. + HeapNodeId node; }; /** - * @brief t_heap object pool, useful for implementing heaps that conform to - * HeapInterface. + * @brief The comparison function object used to sort heap, following the STL style. */ -class HeapStorage { - public: - HeapStorage(); - - /** - * @brief Allocate a heap item. - * - * @return The allocated item. - */ - t_heap* alloc(); - - /** - * @brief Free a heap item. - */ - void free(t_heap* hptr); - - /** - * @brief Free all heap items. - */ - void free_all_memory(); - - private: - /* For keeping track of the sudo malloc memory for the heap*/ - vtr::t_chunk heap_ch_; - - t_heap* heap_free_head_; - size_t num_heap_allocated_; +struct HeapNodeComparator { + bool operator()(const HeapNode& u, const HeapNode& v) { + return u.prio > v.prio; + } }; /** * @brief Interface to heap used for router optimization. - * - * @note - * Objects used in instances of HeapInterface must always be allocated - * and free'd using the HeapInterface::alloc and HeapInterface::free methods - * of that instance. Object pools are likely in use. - * - * @details - * As a general rule, any t_heap objects returned from this interface, - * **must** be HeapInterface::free'd before destroying the HeapInterface - * instance. This ensure that no leaks are present in the users of the heap. - * Violating this assumption may result in an assertion violation. */ class HeapInterface { public: virtual ~HeapInterface() {} - /** - * @brief Allocate a heap item. 
- * - * @details - * This transfers ownership of the t_heap object from HeapInterface to the - * caller. - */ - virtual t_heap* alloc() = 0; - - /** - * @brief Free a heap item. - * - * @details - * HeapInterface::free can be called on objects returned from either - * HeapInterface::alloc or HeapInterface::get_heap_head. - * - * @param hptr The element to free. - */ - virtual void free(t_heap* hptr) = 0; - /** * @brief Initializes heap storage based on the size of the device. * * @note * This method **must** be invoked at least once prior to the * following methods being called:
+ * - try_pop
* - add_to_heap
* - push_back
- * - get_heap_head
- * - is_empty_heap
- * - empty_heap
* - build_heap
+ * - empty_heap
+ * - is_empty_heap
* * @param grid The FPGA device grid */ virtual void init_heap(const DeviceGrid& grid) = 0; /** - * @brief Add t_heap to heap, preserving heap property. - * - * @details - * This transfers ownership of the t_heap object to HeapInterface from the - * called. + * @brief Pop the head (smallest element) of the heap. Return true if the pop + * succeeds; otherwise (if the heap is empty), return false. * - * @param hptr The element to add. + * @param heap_node The reference to a location to store the popped heap node. */ - virtual void add_to_heap(t_heap* hptr) = 0; + virtual bool try_pop(HeapNode& heap_node) = 0; /** - * @brief Add t_heap to heap, however does not preserve heap property. + * @brief Add HeapNode to heap, preserving heap property. * - * @details - * This is useful if multiple t_heap's are being added in bulk. Once - * all t_heap's have been added, HeapInterface::build_heap can be invoked - * to restore the heap property in an efficient way.

- * This transfers ownership of the t_heap object to HeapInterface from the - * called. + * @param heap_node The element to add. + */ + virtual void add_to_heap(const HeapNode& heap_node) = 0; + + /** + * @brief Add HeapNode to heap, however does not preserve heap property. * * @param hptr The element to insert. */ - virtual void push_back(t_heap* const hptr) = 0; + virtual void push_back(const HeapNode& heap_node) = 0; /** * @brief Restore the heap property. @@ -198,20 +94,6 @@ class HeapInterface { */ virtual void build_heap() = 0; - /** - * @brief Pop the head (smallest element) of the heap, and return it. - * - * @details - * This transfers ownership of the t_heap object from HeapInterface to the - * caller. - */ - virtual t_heap* get_heap_head() = 0; - - /** - * @brief Is the heap empty? - */ - virtual bool is_empty_heap() const = 0; - /** * @brief Is the heap valid? */ @@ -223,50 +105,15 @@ class HeapInterface { virtual void empty_heap() = 0; /** - * @brief Free all storage used by the heap. - * - * @details - * This returns all memory allocated by the HeapInterface instance. Only - * call this if the heap is no longer being used. - * - * @note - * Only invoke this method if all objects returned from this - * HeapInterface instance have been free'd. - */ - virtual void free_all_memory() = 0; - - /** - * @brief Set maximum number of elements that the heap should contain - * (the prune_limit). If the prune limit is hit, then the heap should - * kick out duplicate index entries. - * - * @details - * The prune limit exists to provide a maximum bound on memory usage in - * the heap. In some pathological cases, the router may explore - * incrementally better paths, resulting in many duplicate entries for - * RR nodes. To handle this edge case, if the number of heap items - * exceeds the prune_limit, then the heap will compacts itself.

- * The heap compaction process simply means taking the lowest cost entry - * for each index (e.g. RR node). All nodes with higher costs can safely - * be dropped.

- * The pruning process is intended to bound the memory usage the heap can - * consume based on the prune_limit, which is expected to be a function of - * the graph size. - * - * @param max_index The highest index possible in the heap. - * @param prune_limit The maximum number of heap entries before pruning should - * take place. This should always be higher than max_index, likely by a - * significant amount. The pruning process has some overhead, so prune_limit - * should be ~2-4x the max_index to prevent excess pruning when not required. + * @brief Is the heap empty? */ - virtual void set_prune_limit(size_t max_index, size_t prune_limit) = 0; + virtual bool is_empty_heap() const = 0; }; enum class e_heap_type { INVALID_HEAP = 0, BINARY_HEAP, FOUR_ARY_HEAP, - BUCKET_HEAP_APPROXIMATION, }; /** diff --git a/vpr/src/route/k_ary_heap.cpp b/vpr/src/route/k_ary_heap.cpp deleted file mode 100644 index f7dc7b8093c..00000000000 --- a/vpr/src/route/k_ary_heap.cpp +++ /dev/null @@ -1,173 +0,0 @@ -#include "k_ary_heap.h" -#include "rr_graph_fwd.h" -#include "vtr_log.h" - -KAryHeap::KAryHeap() - : heap_() - , heap_size_(0) - , heap_tail_(0) - , max_index_(std::numeric_limits::max()) - , prune_limit_(std::numeric_limits::max()) {} - -KAryHeap::~KAryHeap() { - free_all_memory(); -} - -t_heap* KAryHeap::alloc() { - return storage_.alloc(); -} -void KAryHeap::free(t_heap* hptr) { - storage_.free(hptr); -} - -void KAryHeap::init_heap(const DeviceGrid& grid) { - size_t target_heap_size = (grid.width() - 1) * (grid.height() - 1); - if (heap_.empty() || heap_size_ < target_heap_size) { - if (!heap_.empty()) { - // coverity[offset_free : Intentional] - heap_.clear(); - } - heap_size_ = (grid.width() - 1) * (grid.height() - 1); - heap_.resize(heap_size_ + 1); /* heap_size_ + 1 because heap stores from [1..heap_size] */ - } - heap_tail_ = 1; -} - -void KAryHeap::add_to_heap(t_heap* hptr) { - expand_heap_if_full(); - // start with undefined hole - ++heap_tail_; - heap_elem new_elem = 
{hptr, hptr->cost}; - sift_up(heap_tail_ - 1, new_elem); - - // If we have pruned, rebuild the heap now. - if (check_prune_limit()) { - build_heap(); - } -} - -bool KAryHeap::is_empty_heap() const { - return (bool)(heap_tail_ == 1); -} - -void KAryHeap::empty_heap() { - for (size_t i = 1; i < heap_tail_; i++) - free(heap_[i].elem_ptr); - - heap_tail_ = 1; -} - -size_t KAryHeap::size() const { return heap_tail_ - 1; } // heap[0] is not valid element - -// runs in O(n) time by sifting down; the least work is done on the most elements: 1 swap for bottom layer, 2 swap for 2nd, ... lgn swap for top -// 1*(n/k^1) + 2*(n/k^2) + 3*(n/k^3) + ... + lgn*1 = k*n (sum of i/k^i) -void KAryHeap::build_heap() { - for (size_t i = parent(heap_tail_); i != 0; --i) - sift_down(i); -} - -void KAryHeap::set_prune_limit(size_t max_index, size_t prune_limit) { - if (prune_limit != std::numeric_limits::max()) { - VTR_ASSERT(max_index < prune_limit); - } - max_index_ = max_index; - prune_limit_ = prune_limit; -} - -void KAryHeap::sift_up(size_t leaf, heap_elem const& node) { - while ((leaf > 1) && (node.cost < heap_[parent(leaf)].cost)) { - // sift hole up - heap_[leaf] = heap_[parent(leaf)]; - leaf = parent(leaf); - } - - heap_[leaf] = node; -} - -void KAryHeap::expand_heap_if_full() { - if (heap_tail_ >= heap_size_) { /* Heap is full */ - heap_size_ *= 2; - heap_.resize(heap_size_ + 1); - } -} - -// adds an element to the back of heap and expand if necessary, but does not maintain heap property -void KAryHeap::push_back(t_heap* const hptr) { - expand_heap_if_full(); - - heap_elem new_elem = {hptr, hptr->cost}; - heap_[heap_tail_] = new_elem; - ++heap_tail_; - - check_prune_limit(); -} - -void KAryHeap::free_all_memory() { - if (!heap_.empty()) { - empty_heap(); - // coverity[offset_free : Intentional] - heap_.clear(); - } - - // heap_ = nullptr; /* Defensive coding: crash hard if I use these. 
*/ - storage_.free_all_memory(); -} - -bool KAryHeap::check_prune_limit() { - if (heap_tail_ > prune_limit_) { - prune_heap(); - return true; - } - - return false; -} - -void KAryHeap::prune_heap() { - VTR_ASSERT(max_index_ < prune_limit_); - - heap_elem blank_elem = {nullptr, 0.0}; - std::vector best_heap_item(max_index_, blank_elem); - - // Find the cheapest instance of each index and store it. - for (size_t i = 1; i < heap_tail_; i++) { - if (heap_[i].elem_ptr == nullptr) { - continue; - } - - if (!heap_[i].elem_ptr->index.is_valid()) { - free(heap_[i].elem_ptr); - heap_[i].elem_ptr = nullptr; - continue; - } - - auto idx = size_t(heap_[i].elem_ptr->index); - - VTR_ASSERT(idx < max_index_); - - if (best_heap_item[idx].elem_ptr == nullptr || best_heap_item[idx].cost > heap_[i].cost) { - best_heap_item[idx] = heap_[i]; - } - } - - // Free unused nodes. - for (size_t i = 1; i < heap_tail_; i++) { - if (heap_[i].elem_ptr == nullptr) { - continue; - } - - auto idx = size_t(heap_[i].elem_ptr->index); - - if (best_heap_item[idx].elem_ptr != heap_[i].elem_ptr) { - free(heap_[i].elem_ptr); - heap_[i].elem_ptr = nullptr; - } - } - - heap_tail_ = 1; - - for (size_t i = 0; i < max_index_; ++i) { - if (best_heap_item[i].elem_ptr != nullptr) { - heap_[heap_tail_++] = best_heap_item[i]; - } - } -} diff --git a/vpr/src/route/k_ary_heap.h b/vpr/src/route/k_ary_heap.h deleted file mode 100644 index fb0e8763fdf..00000000000 --- a/vpr/src/route/k_ary_heap.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef VTR_K_ARY_HEAP_H -#define VTR_K_ARY_HEAP_H - -#include "heap_type.h" -#include - -/** - * @brief Abstract class whose children are HeapInterface implementations of a k-ary minheap. 
- */ -class KAryHeap : public HeapInterface { - public: - KAryHeap(); - ~KAryHeap(); - - t_heap* alloc() final; - void free(t_heap* hptr) final; - - void init_heap(const DeviceGrid& grid) final; - void add_to_heap(t_heap* hptr) final; - void push_back(t_heap* const hptr) final; - bool is_empty_heap() const final; - void empty_heap() final; - void build_heap() final; - void set_prune_limit(size_t max_index, size_t prune_limit) final; - void free_all_memory() final; - - virtual bool is_valid() const = 0; - virtual t_heap* get_heap_head() = 0; - - protected: - /** - * @brief The struct which the heap_ vector contains. - * - * @details - * Previously, heap_ was made of only t_heap pointers. This meant that - * all comparisons required dereferencing to attain the element's cost. - * Now, the cost is attained by dereferencing only once in add_to_heap(). - * This resulted in a slightly larger memory footprint but a ~1.4% runtime - * improvement. - * - * @param elem_ptr A pointer to the t_heap struct which contains all - * the node's information. - * @param cost The cost of the node. - * - * @todo - * We are currently storing the node cost in two places (in elem_ptr->cost and cost). This might be fixed in two ways:
- * 1. Don't store the cost in t_heap.
- * 2. Instead of using pointers, use a 32-bit ID. If we do this, we can create a new 8-ary heap, which is likely to be even - * faster as we can fit more heap_elem on one cache line (currently, we can fit 5 as heap_elem is 12 bytes), even with more - * comparisons. - */ - struct heap_elem { - t_heap* elem_ptr; - float cost; - }; - - /** - * @return The number of elements in the heap. - */ - size_t size() const; - - /** - * @brief Sift node up until it satisfies minheap property. - * - * @details - * O(lgn) sifting up to maintain heap property after insertion (should sift - * own when building heap) - * - * @param leaf The heap leaf where node currently resides. - * @param node The node to be sifted up. - */ - void sift_up(size_t leaf, heap_elem const& node); - - /** - * @brief Expands heap by 2 times if it is full. - */ - void expand_heap_if_full(); - - /** - * @brief If the size of the heap is greater than the prune limit, prune the heap. - * - * @return Whether the heap was pruned. - */ - bool check_prune_limit(); - - /** - * @brief Prune the heap. - */ - void prune_heap(); - - /** - * @brief Make a heap rooted at index hole by **sifting down** in O(lgn) time - * - * @param hole - */ - virtual void sift_down(size_t hole) = 0; - - /** - * @param i Heap child node. - * - * @return Heap parent node. - */ - virtual size_t parent(size_t i) const = 0; - - HeapStorage storage_; - - /** - * @details - * heap_ is indexed from [1..heap_size]; the 0th element is unused. For BinaryHeap, this simplifies - * arithmetic in left() and parent() functions. Using a heap beginning at index 0 would simplify - * first_child() and parent() functions in FourAryHeap, but this does not improve runtime. 
- * - * @todo - * If an 8-ary heap is implemented, experiment with starting at index 0 - */ - std::vector heap_; - - size_t heap_size_; /* Number of slots in the heap array */ - size_t heap_tail_; /* Index of first unused slot in the heap array */ - - size_t max_index_; - size_t prune_limit_; -}; - -#endif // VTR_K_ARY_HEAP_H diff --git a/vpr/src/route/netlist_routers.h b/vpr/src/route/netlist_routers.h index 448e4f7f76c..d5f5354a392 100644 --- a/vpr/src/route/netlist_routers.h +++ b/vpr/src/route/netlist_routers.h @@ -16,9 +16,6 @@ * of this interface. */ #include "NetPinTimingInvalidator.h" -#include "binary_heap.h" -#include "four_ary_heap.h" -#include "bucket.h" #include "clustered_netlist_utils.h" #include "connection_based_routing_fwd.h" #include "connection_router.h" @@ -182,20 +179,6 @@ inline std::unique_ptr make_netlist_router( routing_predictor, choking_spots, is_flat); - } else if (router_opts.router_heap == e_heap_type::BUCKET_HEAP_APPROXIMATION) { - return make_netlist_router_with_heap( - net_list, - router_lookahead, - router_opts, - connections_inf, - net_delay, - netlist_pin_lookup, - timing_info, - pin_timing_invalidator, - budgeting_inf, - routing_predictor, - choking_spots, - is_flat); } else { VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Unknown heap type %d", router_opts.router_heap); } diff --git a/vpr/src/route/route_common.cpp b/vpr/src/route/route_common.cpp index fe9db221ae9..7fd9720e450 100644 --- a/vpr/src/route/route_common.cpp +++ b/vpr/src/route/route_common.cpp @@ -291,6 +291,9 @@ void reset_path_costs(const std::vector& visited_rr_nodes) { route_ctx.rr_node_route_inf[node].path_cost = std::numeric_limits::infinity(); route_ctx.rr_node_route_inf[node].backward_path_cost = std::numeric_limits::infinity(); route_ctx.rr_node_route_inf[node].prev_edge = RREdgeId::INVALID(); + // Note: R_upstream of each node is intentionally not reset here. 
+ // For the reasons and details, please refer to the `Update R_upstream` + // in `evaluate_timing_driven_node_costs` in `connection_router.cpp`. } } @@ -781,7 +784,7 @@ void reserve_locally_used_opins(HeapInterface* heap, float pres_fac, float acc_f int num_local_opin, iconn, num_edges; int iclass, ipin; float cost; - t_heap* heap_head_ptr; + HeapNode heap_head_node; t_physical_tile_type_ptr type; auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -839,22 +842,21 @@ void reserve_locally_used_opins(HeapInterface* heap, float pres_fac, float acc_f //Add the OPIN to the heap according to it's congestion cost cost = get_rr_cong_cost(to_node, pres_fac); - add_node_to_heap(heap, route_ctx.rr_node_route_inf, - to_node, cost, RREdgeId::INVALID(), - 0., 0.); + if (cost < route_ctx.rr_node_route_inf[to_node].path_cost) { + heap->add_to_heap({cost, to_node}); + } } for (ipin = 0; ipin < num_local_opin; ipin++) { //Pop the nodes off the heap. We get them from the heap so we //reserve those pins with lowest congestion cost first. - heap_head_ptr = heap->get_heap_head(); - RRNodeId inode(heap_head_ptr->index); + VTR_ASSERT(heap->try_pop(heap_head_node)); + const RRNodeId& inode = heap_head_node.node; VTR_ASSERT(rr_graph.node_type(inode) == OPIN); adjust_one_rr_occ_and_acc_cost(inode, 1, acc_fac); route_ctx.clb_opins_used_locally[blk_id][iclass][ipin] = inode; - heap->free(heap_head_ptr); } heap->empty_heap(); diff --git a/vpr/src/route/route_common.h b/vpr/src/route/route_common.h index a6f18f3af38..1d6bfb58082 100644 --- a/vpr/src/route/route_common.h +++ b/vpr/src/route/route_common.h @@ -144,98 +144,3 @@ float get_cost_from_lookahead(const RouterLookahead& router_lookahead, float R_upstream, const t_conn_cost_params cost_params, bool is_flat); - -/* Creates a new t_heap object to be placed on the heap, if the new cost * - * given is lower than the current path_cost to this channel segment. The * - * index of its predecessor is stored to make traceback easy. 
The index of * - * the edge used to get from its predecessor to it is also stored to make * - * timing analysis, etc. * - * * - * Returns t_heap suitable for adding to heap or nullptr if node is more * - * expensive than previously explored path. */ -template -t_heap* prepare_to_add_node_to_heap( - T* heap, - const RouteInf& rr_node_route_inf, - RRNodeId inode, - float total_cost, - RREdgeId prev_edge, - float backward_path_cost, - float R_upstream) { - if (total_cost >= rr_node_route_inf[inode].path_cost) - return nullptr; - - t_heap* hptr = heap->alloc(); - - hptr->index = inode; - hptr->cost = total_cost; - hptr->set_prev_edge(prev_edge); - hptr->backward_path_cost = backward_path_cost; - hptr->R_upstream = R_upstream; - return hptr; -} - -/* Puts an rr_node on the heap if it is the cheapest path. */ -template -void add_node_to_heap( - T* heap, - const RouteInf& rr_node_route_inf, - RRNodeId inode, - float total_cost, - RREdgeId prev_edge, - float backward_path_cost, - float R_upstream) { - t_heap* hptr = prepare_to_add_node_to_heap( - heap, - rr_node_route_inf, inode, total_cost, - prev_edge, backward_path_cost, R_upstream); - if (hptr) { - heap->add_to_heap(hptr); - } -} - -/* Puts an rr_node on the heap with the same condition as add_node_to_heap, - * but do not fix heap property yet as that is more efficiently done from - * bottom up with build_heap */ -template -void push_back_node( - T* heap, - const RouteInf& rr_node_route_inf, - RRNodeId inode, - float total_cost, - RREdgeId prev_edge, - float backward_path_cost, - float R_upstream) { - t_heap* hptr = prepare_to_add_node_to_heap( - heap, - rr_node_route_inf, inode, total_cost, prev_edge, - backward_path_cost, R_upstream); - if (hptr) { - heap->push_back(hptr); - } -} - -/* Puts an rr_node on the heap with the same condition as node_to_heap, - * but do not fix heap property yet as that is more efficiently done from - * bottom up with build_heap. 
Certain information is also added */ -template -void push_back_node_with_info( - T* heap, - RRNodeId inode, - float total_cost, - float backward_path_cost, - float R_upstream, - float backward_path_delay, - PathManager* rcv_path_manager) { - t_heap* hptr = heap->alloc(); - rcv_path_manager->alloc_path_struct(hptr->path_data); - - hptr->index = inode; - hptr->cost = total_cost; - hptr->backward_path_cost = backward_path_cost; - hptr->R_upstream = R_upstream; - - hptr->path_data->backward_delay = backward_path_delay; - - heap->push_back(hptr); -} diff --git a/vpr/src/route/route_net.tpp b/vpr/src/route/route_net.tpp index 7004dbb4d5f..0e8c4c268a5 100644 --- a/vpr/src/route/route_net.tpp +++ b/vpr/src/route/route_net.tpp @@ -313,7 +313,7 @@ inline NetResultFlags pre_route_to_clock_root(ConnectionRouter& router, router.clear_modified_rr_node_info(); bool found_path, retry_with_full_bb; - t_heap cheapest; + RTExploredNode cheapest; ConnectionParameters conn_params(net_id, -1, false, @@ -428,7 +428,7 @@ inline NetResultFlags route_sink(ConnectionRouter& router, router.clear_modified_rr_node_info(); bool found_path; - t_heap cheapest; + RTExploredNode cheapest; bool net_is_global = net_list.net_is_global(net_id); bool high_fanout = is_high_fanout(net_list.net_sinks(net_id).size(), router_opts.high_fanout_threshold); @@ -487,8 +487,8 @@ inline NetResultFlags route_sink(ConnectionRouter& router, update_screen(ScreenUpdatePriority::MAJOR, msg.c_str(), ROUTING, nullptr); } - if (budgeting_inf.if_set() && cheapest.path_data != nullptr && cost_params.delay_budget) { - if (cheapest.path_data->backward_delay < cost_params.delay_budget->min_delay) { + if (budgeting_inf.if_set() && cheapest.rcv_path_backward_delay != std::numeric_limits::infinity() && cost_params.delay_budget) { + if (cheapest.rcv_path_backward_delay < cost_params.delay_budget->min_delay) { budgeting_inf.set_should_reroute(net_id, true); } } diff --git a/vpr/src/route/route_path_manager.h 
b/vpr/src/route/route_path_manager.h index c3f69980b67..f1673772193 100644 --- a/vpr/src/route/route_path_manager.h +++ b/vpr/src/route/route_path_manager.h @@ -6,19 +6,19 @@ #include #ifndef _PATH_MANAGER_H -# define _PATH_MANAGER_H +#define _PATH_MANAGER_H -/* Extra path data needed by RCV, seperated from t_heap struct for performance reasons +/* Extra path data needed by RCV, separated from RTExploredNode struct for performance reasons * Can be accessed by a pointer, won't be initialized unless by RCV * Use PathManager class to handle this structure's allocation and deallocation * * path_rr: The entire partial path up until the route tree with the first node being the SOURCE, - * or a part of the route tree that already exists for this net - * + * or a part of the route tree that already exists for this net + * * edge: A list of edges from each node in the partial path to reach the next node - * + * * backward_delay: The delay of the partial path plus the path from route tree to source - * + * * backward_cong: The congestion estimate of the partial path plus the path from route tree to source */ struct t_heap_path { std::vector path_rr; @@ -33,24 +33,24 @@ struct RoutingContext; /* A class to manage the extra data required for RCV * It manages a set containing all the nodes that currently exist in the route tree * This class also manages the extra memory allocation required for the t_heap_path structure - * + * * When RCV is enabled, the router will not always be looking for minimal cost routing * This means nodes that already exist in the current path, or current route tree could be expanded twice. 
* This would result in electrically illegal loops (example below) - * + * * OPIN--|----| |-----------Sink 1 * | |--------X----| <--- The branch intersects with a previous routing * | | * |-------------| Sink 2 - * + * * To stop this, we keep track of the route tree (route_tree_nodes_), and each node keeps track of it's current partial routing up to the route tree * Before expanding a node, we check to see if it exists in either the route tree, or the current partial path to eliminate these scenarios - * - * - * The t_heap_path structure was created to isolate the RCV specific data from the t_heap struct - * Having these in t_heap creates significant performance issues when RCV is disabled - * A t_heap_path pointer is instead stored in t_heap, which is selectively allocated only when RCV is enabled - * + * + * + * The t_heap_path structure was created to isolate the RCV specific data from the RTExploredNode struct + * Having these in RTExploredNode creates significant performance issues when RCV is disabled + * A t_heap_path pointer is instead stored in RTExploredNode, which is selectively allocated only when RCV is enabled + * * If the _is_enabled flag is true, alloc_path_struct allocates t_heap_path structures, otherwise will be a NOOP */ class PathManager { public: @@ -92,7 +92,6 @@ class PathManager { // Put all currently allocated structures into the free_nodes list // This currently does NOT invalidate them - // Ideally used before a t_heap empty_heap() call void empty_heap(); // Clear the route tree nodes set, before moving onto the next net diff --git a/vpr/src/route/route_tree.cpp b/vpr/src/route/route_tree.cpp index daf21bd1eb8..799fa185fbd 100644 --- a/vpr/src/route/route_tree.cpp +++ b/vpr/src/route/route_tree.cpp @@ -478,15 +478,15 @@ void RouteTree::print(void) const { /** Add the most recently finished wire segment to the routing tree, and * update the Tdel, etc. numbers for the rest of the routing tree. 
hptr - * is the heap pointer of the SINK that was reached, and target_net_pin_index + * is the pointer of the SINK that was reached/explored, and target_net_pin_index * is the net pin index corresponding to the SINK that was reached. Usually target_net_pin_index * is a non-negative integer indicating the netlist connection being routed, but it can be OPEN (-1) - * to indicate this is a routing path to a virtual sink which we use when routing to the source of - * dedicated clock networks. + * to indicate this is a routing path to a virtual sink which we use when routing to the source of + * dedicated clock networks. * This routine returns a tuple: RouteTreeNode of the branch it adds to the route tree and * RouteTreeNode of the SINK it adds to the routing. */ std::tuple, vtr::optional> -RouteTree::update_from_heap(t_heap* hptr, int target_net_pin_index, SpatialRouteTreeLookup* spatial_rt_lookup, bool is_flat) { +RouteTree::update_from_heap(RTExploredNode* hptr, int target_net_pin_index, SpatialRouteTreeLookup* spatial_rt_lookup, bool is_flat) { /* Lock the route tree for writing. At least on Linux this shouldn't have an impact on single-threaded code */ std::unique_lock write_lock(_write_mutex); @@ -515,7 +515,7 @@ RouteTree::update_from_heap(t_heap* hptr, int target_net_pin_index, SpatialRoute * to the SINK indicated by hptr. Returns the first (most upstream) new rt_node, * and the rt_node of the new SINK. Traverses up from SINK */ std::tuple, vtr::optional> -RouteTree::add_subtree_from_heap(t_heap* hptr, int target_net_pin_index, bool is_flat) { +RouteTree::add_subtree_from_heap(RTExploredNode* hptr, int target_net_pin_index, bool is_flat) { auto& device_ctx = g_vpr_ctx.device(); const auto& rr_graph = device_ctx.rr_graph; auto& route_ctx = g_vpr_ctx.routing(); @@ -534,7 +534,7 @@ RouteTree::add_subtree_from_heap(t_heap* hptr, int target_net_pin_index, bool is * Here we create two vectors: * new_branch_inodes: [sink, nodeN-1, nodeN-2, ... 
node 1] of length N * and new_branch_iswitches: [N-1->sink, N-2->N-1, ... 2->1, 1->found_node] of length N */ - RREdgeId edge = hptr->prev_edge(); + RREdgeId edge = hptr->prev_edge; RRNodeId new_inode = rr_graph.edge_src_node(edge); RRSwitchId new_iswitch = RRSwitchId(rr_graph.rr_nodes().edge_switch(edge)); diff --git a/vpr/src/route/route_tree.h b/vpr/src/route/route_tree.h index 4991d57f301..37e89db16ae 100644 --- a/vpr/src/route/route_tree.h +++ b/vpr/src/route/route_tree.h @@ -323,6 +323,44 @@ class RouteTreeNode { /** fwd definition for compatibility class in old_traceback.h */ class TracebackCompat; +/** + * @brief Each RTExploredNode element stores the node states for the connection router and represents a partial route. + * + * @note Only `index`, `prev_edge`, and `rcv_path_backward_delay` fields are used as the return value outside the connection router. + */ +class RTExploredNode { + public: + /* Used inside the connection router */ + + ///@brief The cost used to sort heap. For the timing-driven router this is the backward_path_cost + /// plus the expected cost to the target. + float total_cost = std::numeric_limits::infinity(); + ///@brief The "known" cost of the path up to and including this node. + float backward_path_cost = std::numeric_limits::infinity(); + ///@brief Stores the upstream resistance to ground from this node in the path search (connection + /// routing), including the resistance of the node itself (device_ctx.rr_nodes[index].R). + float R_upstream = std::numeric_limits::infinity(); + ///@brief Structure to handle extra RCV structures. Managed by PathManager class. + t_heap_path* path_data = nullptr; + + /* Used outside the connection router as the return values (`index` and `prev_edge` are also used inside the router). */ + + ///@brief The RR node index associated with the costs/R_upstream values. Outside the + /// connection router, this field is mainly used in `RouteTree::update_from_heap` and + /// `RouteTree::add_subtree_from_heap`. 
Inside the connection router, this is used as + /// part of the node info passed as a parameter of some member functions. + RRNodeId index = RRNodeId::INVALID(); + ///@brief The edge from the previous node used to reach the current. Same usage as the + /// `index` field described above. + RREdgeId prev_edge = RREdgeId::INVALID(); + ///@brief The delay of the partial path plus the path from route tree to source. + /// Needed by RCV. Set to infinity if RCV is disabled. This field is used as part + /// of the return value of the route routine, derived from the `path_data` pointer + /// (but not using `path_data` for returning to avoid issues with dynamic memory + /// management). + float rcv_path_backward_delay = std::numeric_limits::infinity(); +}; + /** * @brief Top level route tree used in timing analysis and keeping routing state. * @@ -357,7 +395,7 @@ class RouteTree { * RouteTreeNode of the SINK it adds to the routing. * Locking operation: only one thread can update_from_heap() a RouteTree at a time. */ std::tuple, vtr::optional> - update_from_heap(t_heap* hptr, int target_net_pin_index, SpatialRouteTreeLookup* spatial_rt_lookup, bool is_flat); + update_from_heap(RTExploredNode* hptr, int target_net_pin_index, SpatialRouteTreeLookup* spatial_rt_lookup, bool is_flat); /** Reload timing values (R_upstream, C_downstream, Tdel). * Can take a RouteTreeNode& to do an incremental update. 
@@ -491,7 +529,7 @@ class RouteTree { private: std::tuple, vtr::optional> - add_subtree_from_heap(t_heap* hptr, int target_net_pin_index, bool is_flat); + add_subtree_from_heap(RTExploredNode* hptr, int target_net_pin_index, bool is_flat); void add_non_configurable_nodes(RouteTreeNode* rt_node, bool reached_by_non_configurable_edge, diff --git a/vpr/src/route/route_tree_fwd.h b/vpr/src/route/route_tree_fwd.h index 61b61ae739d..6f48247ef30 100644 --- a/vpr/src/route/route_tree_fwd.h +++ b/vpr/src/route/route_tree_fwd.h @@ -4,3 +4,4 @@ class RouteTree; class RouteTreeNode; +class RTExploredNode; diff --git a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp index e37744ab70a..ae25d5cdf78 100644 --- a/vpr/src/route/router_delay_profiling.cpp +++ b/vpr/src/route/router_delay_profiling.cpp @@ -97,7 +97,7 @@ bool RouterDelayProfiler::calculate_delay(RRNodeId source_node, RouterStats router_stats; bool found_path; - t_heap cheapest; + RTExploredNode cheapest; ConnectionParameters conn_params(ParentNetId::INVALID(), -1, false, @@ -186,7 +186,7 @@ vtr::vector calculate_all_path_delays_from_rr_node(RRNodeId src is_flat); RouterStats router_stats; ConnectionParameters conn_params(ParentNetId::INVALID(), OPEN, false, std::unordered_map()); - vtr::vector shortest_paths = router.timing_driven_find_all_shortest_paths_from_route_tree(tree.root(), + vtr::vector shortest_paths = router.timing_driven_find_all_shortest_paths_from_route_tree(tree.root(), cost_params, bounding_box, router_stats, diff --git a/vpr/src/route/router_delay_profiling.h b/vpr/src/route/router_delay_profiling.h index bda721e1a24..1d5ae1b21c1 100644 --- a/vpr/src/route/router_delay_profiling.h +++ b/vpr/src/route/router_delay_profiling.h @@ -2,8 +2,6 @@ #define ROUTER_DELAY_PROFILING_H_ #include "vpr_types.h" -#include "binary_heap.h" -#include "four_ary_heap.h" #include "connection_router.h" #include diff --git a/vpr/test/test_connection_router.cpp 
b/vpr/test/test_connection_router.cpp index c2ac5329a26..568b2b175f7 100644 --- a/vpr/test/test_connection_router.cpp +++ b/vpr/test/test_connection_router.cpp @@ -67,7 +67,7 @@ static float do_one_route(RRNodeId source_node, // Find the cheapest route if possible. bool found_path; - t_heap cheapest; + RTExploredNode cheapest; ConnectionParameters conn_params(ParentNetId::INVALID(), -1, false, diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_delay_calc_method/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_delay_calc_method/config/golden_results.txt index 66168627a8f..fcf92ec7e8f 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_delay_calc_method/config/golden_results.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_delay_calc_method/config/golden_results.txt @@ -1,5 +1,5 @@ -arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est total_swap accepted_swap rejected_swap aborted_swap place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used 
min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time -stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta_--place_delta_delay_matrix_calculation_method_astar 34.84 vpr 975.36 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success v8.0.0-11333-g6a44da44e release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-213-generic x86_64 2024-09-18T20:37:10 betzgrp-wintermute.eecg.utoronto.ca /home/singera8/vtr-verilog-to-routing/vtr_flow/tasks 998768 10 10 168 178 1 68 30 11 8 88 io auto 952.5 MiB 0.50 358 812 97 660 55 975.4 MiB 0.07 0.00 6.44563 -69.2664 -6.44563 6.44563 3.31 0.000633306 0.000584828 0.014981 0.013961 26 784 31 0 0 125464. 1425.72 1.77 0.217747 0.184211 11500 28430 -1 625 17 282 1013 95514 35394 6.59221 6.59221 -74.0805 -6.59221 0 0 163463. 
1857.53 0.03 0.07 0.09 -1 -1 0.03 0.0275927 0.0245705 -stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta_override_--place_delta_delay_matrix_calculation_method_astar 34.42 vpr 975.53 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success v8.0.0-11333-g6a44da44e release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-213-generic x86_64 2024-09-18T20:37:10 betzgrp-wintermute.eecg.utoronto.ca /home/singera8/vtr-verilog-to-routing/vtr_flow/tasks 998944 10 10 168 178 1 68 30 11 8 88 io auto 952.6 MiB 0.50 365 812 101 651 60 975.5 MiB 0.10 0.00 6.37156 -69.5088 -6.37156 6.37156 3.32 0.000634379 0.000586337 0.015972 0.0149437 24 851 26 0 0 114778. 1304.29 1.37 0.179349 0.152804 11416 27150 -1 691 14 354 1388 135595 52969 6.82221 6.82221 -75.6812 -6.82221 0 0 153433. 1743.56 0.03 0.07 0.09 -1 -1 0.03 0.024931 0.0223273 -stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta_--place_delta_delay_matrix_calculation_method_dijkstra 35.84 vpr 975.38 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success v8.0.0-11333-g6a44da44e release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-213-generic x86_64 2024-09-18T20:37:10 betzgrp-wintermute.eecg.utoronto.ca /home/singera8/vtr-verilog-to-routing/vtr_flow/tasks 998788 10 10 168 178 1 68 30 11 8 88 io auto 952.4 MiB 0.50 367 812 86 668 58 975.4 MiB 0.15 0.00 6.39336 -69.4912 -6.39336 6.39336 4.34 0.000639177 0.000587378 0.017224 0.0162017 22 875 22 0 0 110609. 1256.92 1.66 0.199683 0.169442 11258 24748 -1 730 18 335 1182 109582 46429 6.92426 6.92426 -76.9247 -6.92426 0 0 134428. 
1527.59 0.02 0.07 0.09 -1 -1 0.02 0.0283942 0.0252052 -stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta_override_--place_delta_delay_matrix_calculation_method_dijkstra 35.35 vpr 975.52 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success v8.0.0-11333-g6a44da44e release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-213-generic x86_64 2024-09-18T20:37:10 betzgrp-wintermute.eecg.utoronto.ca /home/singera8/vtr-verilog-to-routing/vtr_flow/tasks 998932 10 10 168 178 1 68 30 11 8 88 io auto 952.8 MiB 0.50 368 812 78 675 59 975.5 MiB 0.07 0.00 6.26392 -68.4373 -6.26392 6.26392 4.33 0.000637702 0.000588521 0.0149562 0.0139792 28 776 45 0 0 134428. 1527.59 1.48 0.227998 0.19302 11590 29630 -1 595 13 254 987 91515 32222 6.61176 6.61176 -72.652 -6.61176 0 0 173354. 1969.93 0.03 0.07 0.10 -1 -1 0.03 0.0241301 0.021664 + arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est total_swap accepted_swap rejected_swap aborted_swap place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time ap_mem ap_time ap_full_legalizer_mem ap_full_legalizer_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total 
logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time + stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta_--place_delta_delay_matrix_calculation_method_astar 27.50 vpr 977.58 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success 0f69adb Release IPO VTR_ASSERT_LEVEL=3 GNU 11.4.0 on Linux-6.5.0-1025-azure x86_64 2024-10-15T16:01:56 fv-az837-567 /home/runner/work/vtr-verilog-to-routing/vtr-verilog-to-routing 1001044 10 10 168 178 1 68 30 11 8 88 io auto 956.2 MiB 0.45 370 858 95 697 66 977.6 MiB 0.04 0.00 6.45248 -69.1493 -6.45248 6.45248 2.68 0.000346945 0.000301901 0.0109124 0.00985616 -1 -1 -1 -1 32 693 33 0 0 153433. 1743.56 1.19 0.127615 0.111696 11830 34246 -1 570 10 235 725 56242 26416 6.94346 6.94346 -73.9579 -6.94346 0 0 205860. 
2339.32 0.06 0.04 0.08 -1 -1 0.06 0.0194505 0.0184001 + stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta_override_--place_delta_delay_matrix_calculation_method_astar 27.82 vpr 977.35 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success 0f69adb Release IPO VTR_ASSERT_LEVEL=3 GNU 11.4.0 on Linux-6.5.0-1025-azure x86_64 2024-10-15T16:01:56 fv-az837-567 /home/runner/work/vtr-verilog-to-routing/vtr-verilog-to-routing 1000804 10 10 168 178 1 68 30 11 8 88 io auto 954.9 MiB 0.45 369 812 82 656 74 977.3 MiB 0.04 0.00 6.45248 -69.2479 -6.45248 6.45248 2.74 0.00035978 0.000313724 0.0101986 0.00925468 -1 -1 -1 -1 32 691 29 0 0 153433. 1743.56 1.24 0.130899 0.114171 11830 34246 -1 553 12 224 697 51846 24062 6.94346 6.94346 -73.4811 -6.94346 0 0 205860. 2339.32 0.06 0.04 0.08 -1 -1 0.06 0.0206713 0.0194697 + stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta_--place_delta_delay_matrix_calculation_method_dijkstra 28.08 vpr 977.66 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success 0f69adb Release IPO VTR_ASSERT_LEVEL=3 GNU 11.4.0 on Linux-6.5.0-1025-azure x86_64 2024-10-15T16:01:56 fv-az837-567 /home/runner/work/vtr-verilog-to-routing/vtr-verilog-to-routing 1001124 10 10 168 178 1 68 30 11 8 88 io auto 955.1 MiB 0.47 370 812 89 663 60 977.7 MiB 0.04 0.00 6.52191 -68.7563 -6.52191 6.52191 3.40 0.000347877 0.0002958 0.010332 0.00933957 -1 -1 -1 -1 22 809 21 0 0 110609. 1256.92 0.45 0.066663 0.0592234 11258 24748 -1 663 14 329 1173 67735 35710 7.04515 7.04515 -76.4932 -7.04515 0 0 134428. 
1527.59 0.04 0.05 0.07 -1 -1 0.04 0.0237505 0.0223282 + stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta_override_--place_delta_delay_matrix_calculation_method_dijkstra 28.29 vpr 977.61 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success 0f69adb Release IPO VTR_ASSERT_LEVEL=3 GNU 11.4.0 on Linux-6.5.0-1025-azure x86_64 2024-10-15T16:01:56 fv-az837-567 /home/runner/work/vtr-verilog-to-routing/vtr-verilog-to-routing 1001072 10 10 168 178 1 68 30 11 8 88 io auto 955.2 MiB 0.45 368 812 95 656 61 977.6 MiB 0.04 0.00 6.34478 -68.8031 -6.34478 6.34478 3.48 0.000358527 0.000311549 0.0101593 0.00922939 -1 -1 -1 -1 28 753 22 0 0 134428. 1527.59 0.44 0.0663655 0.0590372 11590 29630 -1 624 15 260 959 55378 26467 6.64742 6.64742 -72.827 -6.64742 0 0 173354. 1969.93 0.05 0.04 0.07 -1 -1 0.05 0.0225106 0.0210004 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_delay_model/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_delay_model/config/golden_results.txt index cc882260f8f..10c4b944169 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_delay_model/config/golden_results.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_delay_model/config/golden_results.txt @@ -1,3 +1,3 @@ -arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est total_swap accepted_swap rejected_swap aborted_swap place_mem 
place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time -stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta 35.93 vpr 975.56 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success v8.0.0-11333-g1d3eb07f5 release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-213-generic x86_64 2024-09-18T23:07:52 betzgrp-wintermute.eecg.utoronto.ca /home/singera8/vtr-verilog-to-routing/vtr_flow/tasks 998976 10 10 168 178 1 68 30 11 8 88 io auto 952.8 MiB 0.54 420 582 82 470 30 975.6 MiB 0.07 0.00 6.38568 -70.463 -6.38568 6.38568 3.45 0.000645075 0.000592785 0.0119866 0.0112148 20 909 46 0 0 100248. 1139.18 0.84 0.12912 0.111352 11180 23751 -1 803 20 495 1987 182273 69910 6.92851 6.92851 -75.9518 -6.92851 0 0 125464. 
1425.72 0.02 0.10 0.09 -1 -1 0.02 0.0328754 0.0291737 -stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta_override 37.04 vpr 975.73 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success v8.0.0-11333-g1d3eb07f5 release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-213-generic x86_64 2024-09-18T23:07:52 betzgrp-wintermute.eecg.utoronto.ca /home/singera8/vtr-verilog-to-routing/vtr_flow/tasks 999148 10 10 168 178 1 68 30 11 8 88 io auto 952.6 MiB 0.60 395 582 95 453 34 975.7 MiB 0.07 0.00 6.37094 -69.85 -6.37094 6.37094 3.47 0.000638173 0.000588606 0.012516 0.0117144 30 698 21 0 0 144567. 1642.81 1.56 0.195052 0.165386 11730 32605 -1 613 13 256 907 102553 34444 6.74537 6.74537 -72.8995 -6.74537 0 0 194014. 2204.70 0.03 0.07 0.11 -1 -1 0.03 0.0256888 0.0231304 + arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est total_swap accepted_swap rejected_swap aborted_swap place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time ap_mem ap_time ap_full_legalizer_mem ap_full_legalizer_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total 
min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time + stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta 28.29 vpr 977.73 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success 0f69adb Release IPO VTR_ASSERT_LEVEL=3 GNU 11.4.0 on Linux-6.5.0-1025-azure x86_64 2024-10-15T16:01:56 fv-az837-567 /home/runner/work/vtr-verilog-to-routing/vtr-verilog-to-routing 1001196 10 10 168 178 1 68 30 11 8 88 io auto 955.4 MiB 0.43 393 628 105 491 32 977.7 MiB 0.03 0.00 6.51193 -69.1178 -6.51193 6.51193 2.64 0.000368496 0.000316279 0.00897708 0.00821508 -1 -1 -1 -1 20 893 28 0 0 100248. 1139.18 1.58 0.129641 0.112291 11180 23751 -1 831 19 496 1987 121384 60113 6.91414 6.91414 -78.1319 -6.91414 0 0 125464. 
1425.72 0.04 0.06 0.07 -1 -1 0.04 0.0265283 0.0245474 + stratixiv_arch.timing.xml styr.blif common_--place_delay_model_delta_override 28.12 vpr 977.50 MiB -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 10 -1 -1 success 0f69adb Release IPO VTR_ASSERT_LEVEL=3 GNU 11.4.0 on Linux-6.5.0-1025-azure x86_64 2024-10-15T16:01:56 fv-az837-567 /home/runner/work/vtr-verilog-to-routing/vtr-verilog-to-routing 1000956 10 10 168 178 1 68 30 11 8 88 io auto 955.9 MiB 0.54 380 628 91 496 41 977.5 MiB 0.05 0.00 6.52338 -69.1003 -6.52338 6.52338 2.70 0.000355671 0.000305949 0.00939391 0.00863885 -1 -1 -1 -1 30 673 12 0 0 144567. 1642.81 1.15 0.113164 0.0991248 11730 32605 -1 585 9 216 698 45031 21119 6.8993 6.8993 -73.7008 -6.8993 0 0 194014. 2204.70 0.08 0.05 0.08 -1 -1 0.08 0.0197747 0.0187602