From 2dc1a34437be3f55259798f06242a1b23918883c Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Thu, 23 Jul 2020 13:19:38 +0200
Subject: [PATCH 01/18] Updates to lookahead map.

  - New sampling method for lookahead map.
  - New spiral hole filling algorithm
  - Use a lightweight version of PQ_Entry (PQ_Entry_Lite)
  - Use maps in run_dijkstra()
  - Move context of router_lookahead_map_utils inside of util
    namespace.
  - Parallelization of lookahead generation

lookahead: add new extended map lookahead

This lookahead generates a map with a more complex and complete way of
sampling and expanding the nodes to reach all destinations.

The dijkstra expansion is parallelized to overcome the higher number of
samples needed.

It also adds a penalty cost factor based on distance between the origin
and destination nodes.

extended_lookahead: cleaned code and reduced code duplication

Signed-off-by: Dusty DeWeese <dustin.deweese@gmail.com>
Signed-off-by: Keith Rothman <537074+litghost@users.noreply.github.com>
Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 libs/libvtrcapnproto/CMakeLists.txt           |   1 +
 .../extended_map_lookahead.capnp              |  25 +
 vpr/src/base/SetupVPR.cpp                     |   1 +
 vpr/src/base/ShowSetup.cpp                    |   6 +
 vpr/src/base/echo_files.cpp                   |   1 +
 vpr/src/base/echo_files.h                     |   1 +
 vpr/src/base/read_options.cpp                 |  14 +-
 vpr/src/base/vpr_context.h                    |   3 +-
 vpr/src/base/vpr_types.h                      |   7 +-
 vpr/src/route/route_timing.cpp                |   1 +
 vpr/src/route/route_timing.h                  |   2 +
 vpr/src/route/router_lookahead.cpp            |   3 +
 vpr/src/route/router_lookahead_cost_map.cpp   | 408 ++++++++++++
 vpr/src/route/router_lookahead_cost_map.h     |  30 +
 .../route/router_lookahead_extended_map.cpp   | 566 ++++++++++++++++
 vpr/src/route/router_lookahead_extended_map.h |  46 ++
 vpr/src/route/router_lookahead_map.cpp        | 275 +-------
 vpr/src/route/router_lookahead_map.h          |  13 +-
 vpr/src/route/router_lookahead_map_utils.cpp  | 619 ++++++++++++++++++
 vpr/src/route/router_lookahead_map_utils.h    | 268 ++++++++
 vpr/src/route/router_lookahead_sampling.cpp   | 229 +++++++
 vpr/src/route/router_lookahead_sampling.h     |  35 +
 22 files changed, 2282 insertions(+), 272 deletions(-)
 create mode 100644 libs/libvtrcapnproto/extended_map_lookahead.capnp
 create mode 100644 vpr/src/route/router_lookahead_cost_map.cpp
 create mode 100644 vpr/src/route/router_lookahead_cost_map.h
 create mode 100644 vpr/src/route/router_lookahead_extended_map.cpp
 create mode 100644 vpr/src/route/router_lookahead_extended_map.h
 create mode 100644 vpr/src/route/router_lookahead_map_utils.cpp
 create mode 100644 vpr/src/route/router_lookahead_map_utils.h
 create mode 100644 vpr/src/route/router_lookahead_sampling.cpp
 create mode 100644 vpr/src/route/router_lookahead_sampling.h

diff --git a/libs/libvtrcapnproto/CMakeLists.txt b/libs/libvtrcapnproto/CMakeLists.txt
index 91a77b08014..8067ab14c65 100644
--- a/libs/libvtrcapnproto/CMakeLists.txt
+++ b/libs/libvtrcapnproto/CMakeLists.txt
@@ -22,6 +22,7 @@ set(CAPNP_DEFS
     matrix.capnp
     gen/rr_graph_uxsdcxx.capnp
     map_lookahead.capnp
+    extended_map_lookahead.capnp
     )
 capnp_generate_cpp(CAPNP_SRCS CAPNP_HDRS
     ${CAPNP_DEFS}
diff --git a/libs/libvtrcapnproto/extended_map_lookahead.capnp b/libs/libvtrcapnproto/extended_map_lookahead.capnp
new file mode 100644
index 00000000000..874b39822dd
--- /dev/null
+++ b/libs/libvtrcapnproto/extended_map_lookahead.capnp
@@ -0,0 +1,25 @@
+@0x876ec83c2fea5a18;
+
+using Matrix = import "matrix.capnp";
+
+struct VprCostEntry {
+    delay @0 :Float32;
+    congestion @1 :Float32;
+    fill @2 :Bool;
+}
+
+struct VprVector2D {
+    x @0 :Int64;
+    y @1 :Int64;
+}
+
+struct VprFloatEntry {
+    value @0 :Float32;
+}
+
+struct VprCostMap {
+    costMap @0 :Matrix.Matrix((Matrix.Matrix(VprCostEntry)));
+    offset @1 :Matrix.Matrix(VprVector2D);
+    depField @2 :List(Int64);
+    penalty @3 :Matrix.Matrix(VprFloatEntry);
+}
diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp
index 8bbdaee4989..c5d94465a83 100644
--- a/vpr/src/base/SetupVPR.cpp
+++ b/vpr/src/base/SetupVPR.cpp
@@ -160,6 +160,7 @@ void SetupVPR(const t_options* Options,
     }
 
     Segments = Arch->Segments;
+    device_ctx.segment_inf = Arch->Segments;
 
     SetupSwitches(*Arch, RoutingArch, Arch->Switches, Arch->num_switches);
     SetupRoutingArch(*Arch, RoutingArch);
diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp
index 6fbc3ad28d1..fb065b6d071 100644
--- a/vpr/src/base/ShowSetup.cpp
+++ b/vpr/src/base/ShowSetup.cpp
@@ -307,6 +307,9 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
                 case e_router_lookahead::MAP:
                     VTR_LOG("MAP\n");
                     break;
+                case e_router_lookahead::EXTENDED_MAP:
+                    VTR_LOG("EXTENDED_MAP\n");
+                    break;
                 case e_router_lookahead::NO_OP:
                     VTR_LOG("NO_OP\n");
                     break;
@@ -444,6 +447,9 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
                 case e_router_lookahead::MAP:
                     VTR_LOG("MAP\n");
                     break;
+                case e_router_lookahead::EXTENDED_MAP:
+                    VTR_LOG("EXTENDED_MAP\n");
+                    break;
                 case e_router_lookahead::NO_OP:
                     VTR_LOG("NO_OP\n");
                     break;
diff --git a/vpr/src/base/echo_files.cpp b/vpr/src/base/echo_files.cpp
index 6f8de45933b..602004266f6 100644
--- a/vpr/src/base/echo_files.cpp
+++ b/vpr/src/base/echo_files.cpp
@@ -114,6 +114,7 @@ void alloc_and_load_echo_file_info() {
     setEchoFileName(E_ECHO_CHAN_DETAILS, "chan_details.txt");
     setEchoFileName(E_ECHO_SBLOCK_PATTERN, "sblock_pattern.txt");
     setEchoFileName(E_ECHO_ENDPOINT_TIMING, "endpoint_timing.echo.json");
+    setEchoFileName(E_ECHO_LOOKAHEAD_MAP, "lookahead_map.echo");
 }
 
 void free_echo_file_info() {
diff --git a/vpr/src/base/echo_files.h b/vpr/src/base/echo_files.h
index 70df3a3d962..d273c575d50 100644
--- a/vpr/src/base/echo_files.h
+++ b/vpr/src/base/echo_files.h
@@ -46,6 +46,7 @@ enum e_echo_files {
     E_ECHO_CHAN_DETAILS,
     E_ECHO_SBLOCK_PATTERN,
     E_ECHO_ENDPOINT_TIMING,
+    E_ECHO_LOOKAHEAD_MAP,
 
     //Timing Graphs
     E_ECHO_PRE_PACKING_TIMING_GRAPH,
diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
index 24177616e6b..2e25983f493 100644
--- a/vpr/src/base/read_options.cpp
+++ b/vpr/src/base/read_options.cpp
@@ -773,6 +773,8 @@ struct ParseRouterLookahead {
             conv_value.set_value(e_router_lookahead::CLASSIC);
         else if (str == "map")
             conv_value.set_value(e_router_lookahead::MAP);
+        else if (str == "extended_map")
+            conv_value.set_value(e_router_lookahead::EXTENDED_MAP);
         else {
             std::stringstream msg;
             msg << "Invalid conversion from '"
@@ -788,15 +790,17 @@ struct ParseRouterLookahead {
         ConvertedValue<std::string> conv_value;
         if (val == e_router_lookahead::CLASSIC)
             conv_value.set_value("classic");
-        else {
-            VTR_ASSERT(val == e_router_lookahead::MAP);
+        else if (val == e_router_lookahead::MAP) {
             conv_value.set_value("map");
+        } else {
+            VTR_ASSERT(val == e_router_lookahead::EXTENDED_MAP);
+            conv_value.set_value("extended_map");
         }
         return conv_value;
     }
 
     std::vector<std::string> default_choices() {
-        return {"classic", "map"};
+        return {"classic", "map", "extended_map"};
     }
 };
 
@@ -2093,7 +2097,9 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
             "Controls what lookahead the router uses to calculate cost of completing a connection.\n"
             " * classic: The classic VPR lookahead (may perform better on un-buffered routing\n"
             "            architectures)\n"
-            " * map: A more advanced lookahead which accounts for diverse wire type\n")
+            " * map: An advanced lookahead which accounts for diverse wire type\n"
+            " * extended_map: A more advanced and extended lookahead which accounts for a more\n"
+            "                 exhaustive node sampling method")
         .default_value("map")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
diff --git a/vpr/src/base/vpr_context.h b/vpr/src/base/vpr_context.h
index b7048480a27..d353e1109ea 100644
--- a/vpr/src/base/vpr_context.h
+++ b/vpr/src/base/vpr_context.h
@@ -161,7 +161,8 @@ struct DeviceContext : public Context {
     ///@brief The indicies of rr nodes of a given type at a specific x,y grid location
     t_rr_node_indices rr_node_indices; // [0..NUM_RR_TYPES-1][0..grid.width()-1][0..grid.width()-1][0..size-1]
 
-    std::vector<t_rr_switch_inf> rr_switch_inf; // autogenerated in build_rr_graph based on switch fan-in. [0..(num_rr_switches-1)]
+    std::vector<t_rr_switch_inf> rr_switch_inf; /* autogenerated in build_rr_graph based on switch fan-in. [0..(num_rr_switches-1)] */
+    std::vector<t_segment_inf> segment_inf;
 
     ///@brief Wire segment types in RR graph
     std::vector<t_segment_inf> rr_segments;
diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h
index c8734009f7b..00a215ee681 100644
--- a/vpr/src/base/vpr_types.h
+++ b/vpr/src/base/vpr_types.h
@@ -110,9 +110,10 @@ constexpr const char* EMPTY_BLOCK_NAME = "EMPTY";
 #endif
 
 enum class e_router_lookahead {
-    CLASSIC, ///<VPR's classic lookahead (assumes uniform wire types)
-    MAP,     ///<Lookahead considering different wire types (see Oleg Petelin's MASc Thesis)
-    NO_OP    ///<A no-operation lookahead which always returns zero
+    CLASSIC,      ///<VPR's classic lookahead (assumes uniform wire types)
+    MAP,          ///<Lookahead considering different wire types (see Oleg Petelin's MASc Thesis)
+    EXTENDED_MAP, ///<Lookahead with a more extensive node sampling method
+    NO_OP         ///<A no-operation lookahead which always returns zero
 };
 
 enum class e_route_bb_update {
diff --git a/vpr/src/route/route_timing.cpp b/vpr/src/route/route_timing.cpp
index 1c2ed63e401..00e0d16ca45 100644
--- a/vpr/src/route/route_timing.cpp
+++ b/vpr/src/route/route_timing.cpp
@@ -40,6 +40,7 @@
 #include "connection_router.h"
 
 #include "router_lookahead_map.h"
+#include "router_lookahead_extended_map.h"
 
 #include "tatum/TimingReporter.hpp"
 #include "overuse_report.h"
diff --git a/vpr/src/route/route_timing.h b/vpr/src/route/route_timing.h
index d8827e38307..90756a02e19 100644
--- a/vpr/src/route/route_timing.h
+++ b/vpr/src/route/route_timing.h
@@ -14,6 +14,8 @@
 #include "heap_type.h"
 #include "routing_predictor.h"
 
+extern bool f_router_debug;
+
 int get_max_pins_per_net();
 
 bool try_timing_driven_route(const t_router_opts& router_opts,
diff --git a/vpr/src/route/router_lookahead.cpp b/vpr/src/route/router_lookahead.cpp
index 839c92aaa70..e38f48ee727 100644
--- a/vpr/src/route/router_lookahead.cpp
+++ b/vpr/src/route/router_lookahead.cpp
@@ -1,6 +1,7 @@
 #include "router_lookahead.h"
 
 #include "router_lookahead_map.h"
+#include "router_lookahead_extended_map.h"
 #include "vpr_error.h"
 #include "globals.h"
 #include "route_timing.h"
@@ -13,6 +14,8 @@ static std::unique_ptr<RouterLookahead> make_router_lookahead_object(e_router_lo
         return std::make_unique<ClassicLookahead>();
     } else if (router_lookahead_type == e_router_lookahead::MAP) {
         return std::make_unique<MapLookahead>();
+    } else if (router_lookahead_type == e_router_lookahead::EXTENDED_MAP) {
+        return std::make_unique<ExtendedMapLookahead>();
     } else if (router_lookahead_type == e_router_lookahead::NO_OP) {
         return std::make_unique<NoOpLookahead>();
     }
diff --git a/vpr/src/route/router_lookahead_cost_map.cpp b/vpr/src/route/router_lookahead_cost_map.cpp
new file mode 100644
index 00000000000..16ae8dab526
--- /dev/null
+++ b/vpr/src/route/router_lookahead_cost_map.cpp
@@ -0,0 +1,408 @@
+#include "router_lookahead_cost_map.h"
+
+#include "router_lookahead_map_utils.h"
+#include "globals.h"
+#include "echo_files.h"
+
+#ifdef VTR_ENABLE_CAPNPROTO
+#    include "capnp/serialize.h"
+#    include "extended_map_lookahead.capnp.h"
+#    include "ndmatrix_serdes.h"
+#    include "mmap_file.h"
+#    include "serdes_utils.h"
+#endif
+
+// Distance penalties filling are calculated based on available samples, but can be adjusted with this factor.
+static constexpr float PENALTY_FACTOR = 1.f;
+static constexpr float PENALTY_MIN = 1e-12f;
+
+// also known as the L1 norm
+static int manhattan_distance(const vtr::Point<int>& a, const vtr::Point<int>& b) {
+    return abs(b.x() - a.x()) + abs(b.y() - a.y());
+}
+
+template<class T>
+static constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
+    return std::min(std::max(v, lo), hi);
+}
+
+template<typename T>
+static vtr::Point<T> closest_point_in_rect(const vtr::Rect<T>& r, const vtr::Point<T>& p) {
+    if (r.empty()) {
+        return vtr::Point<T>(0, 0);
+    } else {
+        return vtr::Point<T>(clamp<T>(p.x(), r.xmin(), r.xmax() - 1),
+                             clamp<T>(p.y(), r.ymin(), r.ymax() - 1));
+    }
+}
+
+// build the segment map
+void CostMap::build_segment_map() {
+    const auto& device_ctx = g_vpr_ctx.device();
+    segment_map_.resize(device_ctx.rr_nodes.size());
+    for (size_t i = 0; i < segment_map_.size(); ++i) {
+        auto& from_node = device_ctx.rr_nodes[i];
+
+        int from_cost_index = from_node.cost_index();
+        int from_seg_index = device_ctx.rr_indexed_data[from_cost_index].seg_index;
+
+        segment_map_[i] = from_seg_index;
+    }
+}
+
+// resize internal data structures
+void CostMap::set_counts(size_t seg_count) {
+    cost_map_.clear();
+    offset_.clear();
+    penalty_.clear();
+    cost_map_.resize({1, seg_count});
+    offset_.resize({1, seg_count});
+    penalty_.resize({1, seg_count});
+    seg_count_ = seg_count;
+}
+
+// cached node -> segment map
+int CostMap::node_to_segment(int from_node_ind) const {
+    return segment_map_[from_node_ind];
+}
+
+static util::Cost_Entry penalize(const util::Cost_Entry& entry, int distance, float penalty) {
+    penalty = std::max(penalty, PENALTY_MIN);
+    return util::Cost_Entry(entry.delay + distance * penalty * PENALTY_FACTOR,
+                            entry.congestion, entry.fill);
+}
+
+// get a cost entry for a segment type, chan type, and offset
+util::Cost_Entry CostMap::find_cost(int from_seg_index, int delta_x, int delta_y) const {
+    VTR_ASSERT(from_seg_index >= 0 && from_seg_index < (ssize_t)offset_.size());
+    const auto& cost_map = cost_map_[0][from_seg_index];
+    if (cost_map.dim_size(0) == 0 || cost_map.dim_size(1) == 0) {
+        return util::Cost_Entry();
+    }
+
+    vtr::Point<int> coord(delta_x - offset_[0][from_seg_index].first,
+                          delta_y - offset_[0][from_seg_index].second);
+    vtr::Rect<int> bounds(0, 0, cost_map.dim_size(0), cost_map.dim_size(1));
+    auto closest = closest_point_in_rect(bounds, coord);
+    auto cost = cost_map_[0][from_seg_index][closest.x()][closest.y()];
+    float penalty = penalty_[0][from_seg_index];
+    auto distance = manhattan_distance(closest, coord);
+    return penalize(cost, distance, penalty);
+}
+
+// set the cost map for a segment type and chan type, filling holes
+void CostMap::set_cost_map(const util::RoutingCosts& delay_costs, const util::RoutingCosts& base_costs) {
+    // calculate the bounding boxes
+    vtr::Matrix<vtr::Rect<int>> bounds({1, seg_count_});
+    for (const auto& entry : delay_costs) {
+        bounds[0][entry.first.seg_index].expand_bounding_box(vtr::Rect<int>(entry.first.delta));
+    }
+    for (const auto& entry : base_costs) {
+        bounds[0][entry.first.seg_index].expand_bounding_box(vtr::Rect<int>(entry.first.delta));
+    }
+
+    // store bounds
+    for (size_t seg = 0; seg < seg_count_; seg++) {
+        const auto& seg_bounds = bounds[0][seg];
+        if (seg_bounds.empty()) {
+            // Didn't find any sample routes, so routing isn't possible between these segment/chan types.
+            offset_[0][seg] = std::make_pair(0, 0);
+            cost_map_[0][seg] = vtr::NdMatrix<util::Cost_Entry, 2>(
+                {size_t(0), size_t(0)});
+            continue;
+        } else {
+            offset_[0][seg] = std::make_pair(seg_bounds.xmin(), seg_bounds.ymin());
+            cost_map_[0][seg] = vtr::NdMatrix<util::Cost_Entry, 2>(
+                {size_t(seg_bounds.width()), size_t(seg_bounds.height())});
+        }
+    }
+
+    // store entries into the matrices
+    for (const auto& entry : delay_costs) {
+        int seg = entry.first.seg_index;
+        const auto& seg_bounds = bounds[0][seg];
+        int x = entry.first.delta.x() - seg_bounds.xmin();
+        int y = entry.first.delta.y() - seg_bounds.ymin();
+        cost_map_[0][seg][x][y].delay = entry.second;
+    }
+    for (const auto& entry : base_costs) {
+        int seg = entry.first.seg_index;
+        const auto& seg_bounds = bounds[0][seg];
+        int x = entry.first.delta.x() - seg_bounds.xmin();
+        int y = entry.first.delta.y() - seg_bounds.ymin();
+        cost_map_[0][seg][x][y].congestion = entry.second;
+    }
+
+    // fill the holes
+    for (size_t seg = 0; seg < seg_count_; seg++) {
+        penalty_[0][seg] = std::numeric_limits<float>::infinity();
+        const auto& seg_bounds = bounds[0][seg];
+        if (seg_bounds.empty()) {
+            continue;
+        }
+        auto& matrix = cost_map_[0][seg];
+
+        // calculate delay penalty
+        float min_delay = std::numeric_limits<float>::infinity(), max_delay = 0.f;
+        vtr::Point<int> min_location(0, 0), max_location(0, 0);
+        for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
+            for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
+                util::Cost_Entry& cost_entry = matrix[ix][iy];
+                if (cost_entry.valid()) {
+                    if (cost_entry.delay < min_delay) {
+                        min_delay = cost_entry.delay;
+                        min_location = vtr::Point<int>(ix, iy);
+                    }
+                    if (cost_entry.delay > max_delay) {
+                        max_delay = cost_entry.delay;
+                        max_location = vtr::Point<int>(ix, iy);
+                    }
+                }
+            }
+        }
+        float delay_penalty = (max_delay - min_delay) / static_cast<float>(std::max(1, manhattan_distance(max_location, min_location)));
+        penalty_[0][seg] = delay_penalty;
+
+        // find missing cost entries and fill them in by copying a nearby cost entry
+        std::vector<std::tuple<unsigned, unsigned, util::Cost_Entry>> missing;
+        bool couldnt_fill = false;
+        auto shifted_bounds = vtr::Rect<int>(0, 0, seg_bounds.width(), seg_bounds.height());
+        int max_fill = 0;
+        for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
+            for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
+                util::Cost_Entry& cost_entry = matrix[ix][iy];
+                if (!cost_entry.valid()) {
+                    // maximum search radius
+                    util::Cost_Entry filler;
+                    int distance;
+                    std::tie(filler, distance) = get_nearby_cost_entry(matrix, ix, iy, shifted_bounds);
+                    if (filler.valid()) {
+                        missing.push_back(std::make_tuple(ix, iy, penalize(filler, distance, delay_penalty)));
+                        max_fill = std::max(max_fill, distance);
+                    } else {
+                        couldnt_fill = true;
+                    }
+                }
+            }
+            if (couldnt_fill) {
+                // give up trying to fill an empty matrix
+                break;
+            }
+        }
+
+        if (!couldnt_fill) {
+            VTR_LOG("At %d: max_fill = %d, delay_penalty = %e\n", seg, max_fill, delay_penalty);
+        }
+
+        // write back the missing entries
+        for (auto& xy_entry : missing) {
+            matrix[std::get<0>(xy_entry)][std::get<1>(xy_entry)] = std::get<2>(xy_entry);
+        }
+
+        if (couldnt_fill) {
+            VTR_LOG_WARN("Couldn't fill holes in the cost matrix for %ld, %d x %d bounding box\n",
+                         seg, seg_bounds.width(), seg_bounds.height());
+            for (unsigned y = 0; y < matrix.dim_size(1); y++) {
+                for (unsigned x = 0; x < matrix.dim_size(0); x++) {
+                    VTR_ASSERT(!matrix[x][y].valid());
+                }
+            }
+        }
+    }
+}
+
+// prints an ASCII diagram of each cost map for a segment type (debug)
+// o => above average
+// . => at or below average
+// * => invalid (missing)
+void CostMap::print(int iseg) const {
+    auto& matrix = cost_map_[0][iseg];
+    if (matrix.dim_size(0) == 0 || matrix.dim_size(1) == 0) {
+        VTR_LOG("cost EMPTY");
+    }
+    double sum = 0.0;
+    for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
+        for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
+            const auto& entry = matrix[ix][iy];
+            if (entry.valid()) {
+                sum += entry.delay;
+            }
+        }
+    }
+    double avg = sum / ((double)matrix.dim_size(0) * (double)matrix.dim_size(1));
+    for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
+        for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
+            const auto& entry = matrix[ix][iy];
+            if (!entry.valid()) {
+                VTR_LOG("*");
+            } else if (entry.delay * 4 > avg * 5) {
+                VTR_LOG("O");
+            } else if (entry.delay > avg) {
+                VTR_LOG("o");
+            } else if (entry.delay * 4 > avg * 3) {
+                VTR_LOG(".");
+            } else {
+                VTR_LOG(" ");
+            }
+        }
+        VTR_LOG("\n");
+    }
+}
+
+// list segment type and chan type pairs that have empty cost maps (debug)
+std::vector<std::pair<int, int>> CostMap::list_empty() const {
+    std::vector<std::pair<int, int>> results;
+    for (int iseg = 0; iseg < (int)cost_map_.dim_size(0); iseg++) {
+        auto& matrix = cost_map_[0][iseg];
+        if (matrix.dim_size(0) == 0 || matrix.dim_size(1) == 0) results.push_back(std::make_pair(0, iseg));
+    }
+
+    return results;
+}
+
+static void assign_min_entry(util::Cost_Entry* dst, const util::Cost_Entry& src) {
+    if (src.delay < dst->delay) {
+        dst->delay = src.delay;
+    }
+    if (src.congestion < dst->congestion) {
+        dst->congestion = src.congestion;
+    }
+}
+
+// find the minimum cost entry from the nearest manhattan distance neighbor
+std::pair<util::Cost_Entry, int> CostMap::get_nearby_cost_entry(const vtr::NdMatrix<util::Cost_Entry, 2>& matrix,
+                                                                int cx,
+                                                                int cy,
+                                                                const vtr::Rect<int>& bounds) {
+    // spiral around (cx, cy) looking for a nearby entry
+    bool in_bounds = bounds.contains(vtr::Point<int>(cx, cy));
+    if (!in_bounds) {
+        return std::make_pair(util::Cost_Entry(), 0);
+    }
+    int n = 0;
+    util::Cost_Entry fill(matrix[cx][cy]);
+    fill.fill = true;
+
+    while (in_bounds && !fill.valid()) {
+        n++;
+        in_bounds = false;
+        util::Cost_Entry min_entry;
+        for (int ox = -n; ox <= n; ox++) {
+            int x = cx + ox;
+            int oy = n - abs(ox);
+            int yp = cy + oy;
+            int yn = cy - oy;
+            if (bounds.contains(vtr::Point<int>(x, yp))) {
+                assign_min_entry(&min_entry, matrix[x][yp]);
+                in_bounds = true;
+            }
+            if (bounds.contains(vtr::Point<int>(x, yn))) {
+                assign_min_entry(&min_entry, matrix[x][yn]);
+                in_bounds = true;
+            }
+        }
+        if (!std::isfinite(fill.delay)) {
+            fill.delay = min_entry.delay;
+        }
+        if (!std::isfinite(fill.congestion)) {
+            fill.congestion = min_entry.congestion;
+        }
+    }
+    return std::make_pair(fill, n);
+}
+
+static void ToCostEntry(util::Cost_Entry* out, const VprCostEntry::Reader& in) {
+    out->delay = in.getDelay();
+    out->congestion = in.getCongestion();
+    out->fill = in.getFill();
+}
+
+static void FromCostEntry(VprCostEntry::Builder* out, const util::Cost_Entry& in) {
+    out->setDelay(in.delay);
+    out->setCongestion(in.congestion);
+    out->setFill(in.fill);
+}
+
+static void ToVprVector2D(std::pair<int, int>* out, const VprVector2D::Reader& in) {
+    *out = std::make_pair(in.getX(), in.getY());
+}
+
+static void FromVprVector2D(VprVector2D::Builder* out, const std::pair<int, int>& in) {
+    out->setX(in.first);
+    out->setY(in.second);
+}
+
+static void ToMatrixCostEntry(vtr::NdMatrix<util::Cost_Entry, 2>* out,
+                              const Matrix<VprCostEntry>::Reader& in) {
+    ToNdMatrix<2, VprCostEntry, util::Cost_Entry>(out, in, ToCostEntry);
+}
+
+static void FromMatrixCostEntry(
+    Matrix<VprCostEntry>::Builder* out,
+    const vtr::NdMatrix<util::Cost_Entry, 2>& in) {
+    FromNdMatrix<2, VprCostEntry, util::Cost_Entry>(
+        out, in, FromCostEntry);
+}
+
+static void ToFloat(float* out, const VprFloatEntry::Reader& in) {
+    // Getting a scalar field is always "get<field name>()".
+    *out = in.getValue();
+}
+
+static void FromFloat(VprFloatEntry::Builder* out, const float& in) {
+    // Setting a scalar field is always "set<field name>(value)".
+    out->setValue(in);
+}
+
+void CostMap::read(const std::string& file) {
+    build_segment_map();
+    MmapFile f(file);
+
+    ::capnp::ReaderOptions opts = default_large_capnp_opts();
+    ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
+
+    auto cost_map = reader.getRoot<VprCostMap>();
+    {
+        const auto& offset = cost_map.getOffset();
+        ToNdMatrix<2, VprVector2D, std::pair<int, int>>(
+            &offset_, offset, ToVprVector2D);
+    }
+
+    {
+        const auto& cost_maps = cost_map.getCostMap();
+        ToNdMatrix<2, Matrix<VprCostEntry>, vtr::NdMatrix<util::Cost_Entry, 2>>(
+            &cost_map_, cost_maps, ToMatrixCostEntry);
+    }
+
+    {
+        const auto& penalty = cost_map.getPenalty();
+        ToNdMatrix<2, VprFloatEntry, float>(
+            &penalty_, penalty, ToFloat);
+    }
+}
+
+void CostMap::write(const std::string& file) const {
+    ::capnp::MallocMessageBuilder builder;
+
+    auto cost_map = builder.initRoot<VprCostMap>();
+
+    {
+        auto offset = cost_map.initOffset();
+        FromNdMatrix<2, VprVector2D, std::pair<int, int>>(
+            &offset, offset_, FromVprVector2D);
+    }
+
+    {
+        auto cost_maps = cost_map.initCostMap();
+        FromNdMatrix<2, Matrix<VprCostEntry>, vtr::NdMatrix<util::Cost_Entry, 2>>(
+            &cost_maps, cost_map_, FromMatrixCostEntry);
+    }
+
+    {
+        auto penalty = cost_map.initPenalty();
+        FromNdMatrix<2, VprFloatEntry, float>(
+            &penalty, penalty_, FromFloat);
+    }
+
+    writeMessageToFile(file, &builder);
+}
diff --git a/vpr/src/route/router_lookahead_cost_map.h b/vpr/src/route/router_lookahead_cost_map.h
new file mode 100644
index 00000000000..b2620335498
--- /dev/null
+++ b/vpr/src/route/router_lookahead_cost_map.h
@@ -0,0 +1,30 @@
+#ifndef ROUTER_LOOKAHEAD_COST_MAP_H_
+#define ROUTER_LOOKAHEAD_COST_MAP_H_
+
+#include <vector>
+#include "router_lookahead_map_utils.h"
+#include "vtr_geometry.h"
+
+// Dense cost maps per source segment and destination chan types
+class CostMap {
+  public:
+    void set_counts(size_t seg_count);
+    void build_segment_map();
+    int node_to_segment(int from_node_ind) const;
+    util::Cost_Entry find_cost(int from_seg_index, int delta_x, int delta_y) const;
+    void set_cost_map(const util::RoutingCosts& delay_costs, const util::RoutingCosts& base_costs);
+    std::pair<util::Cost_Entry, int> get_nearby_cost_entry(const vtr::NdMatrix<util::Cost_Entry, 2>& matrix, int cx, int cy, const vtr::Rect<int>& bounds);
+    void read(const std::string& file);
+    void write(const std::string& file) const;
+    void print(int iseg) const;
+    std::vector<std::pair<int, int>> list_empty() const;
+
+  private:
+    vtr::Matrix<vtr::Matrix<util::Cost_Entry>> cost_map_;
+    vtr::Matrix<std::pair<int, int>> offset_;
+    vtr::Matrix<float> penalty_;
+    std::vector<int> segment_map_;
+    size_t seg_count_;
+};
+
+#endif
diff --git a/vpr/src/route/router_lookahead_extended_map.cpp b/vpr/src/route/router_lookahead_extended_map.cpp
new file mode 100644
index 00000000000..d4c769b4655
--- /dev/null
+++ b/vpr/src/route/router_lookahead_extended_map.cpp
@@ -0,0 +1,566 @@
+#include "router_lookahead_extended_map.h"
+
+#include <vector>
+#include <queue>
+
+#include "rr_node.h"
+#include "router_lookahead_map_utils.h"
+#include "router_lookahead_sampling.h"
+#include "globals.h"
+#include "vtr_math.h"
+#include "vtr_time.h"
+#include "vtr_geometry.h"
+#include "echo_files.h"
+#include "rr_graph.h"
+
+#include "route_timing.h"
+#include "route_common.h"
+
+#ifdef VTR_ENABLE_CAPNPROTO
+#    include "capnp/serialize.h"
+#    include "extended_map_lookahead.capnp.h"
+#    include "ndmatrix_serdes.h"
+#    include "mmap_file.h"
+#    include "serdes_utils.h"
+#endif
+
+#if defined(VPR_USE_TBB)
+#    include <tbb/parallel_for_each.h>
+#    include <tbb/mutex.h>
+#endif
+
+/* we're profiling routing cost over many tracks for each wire type, so we'll
+ * have many cost entries at each |dx|,|dy| offset. There are many ways to
+ * "boil down" the many costs at each offset to a single entry for a given
+ * (wire type, chan_type) combination we can take the smallest cost, the
+ * average, median, etc. This define selects the method we use.
+ *
+ * NOTE: Currently, only SMALLEST is supported.
+ *
+ * See e_representative_entry_method */
+#define REPRESENTATIVE_ENTRY_METHOD util::SMALLEST
+
+#define CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_MAPS
+
+// Don't continue storing a path after hitting a lower-or-same cost entry.
+static constexpr bool BREAK_ON_MISS = false;
+
+// Distance penalties filling are calculated based on available samples, but can be adjusted with this factor.
+static constexpr float PENALTY_FACTOR = 1.f;
+static constexpr float PENALTY_MIN = 1e-12f;
+
+static constexpr int MIN_PATH_COUNT = 1000;
+
+template<typename Entry>
+static std::pair<float, int> run_dijkstra(int start_node_ind,
+                                          std::vector<bool>* node_expanded,
+                                          std::vector<util::Search_Path>* paths,
+                                          util::RoutingCosts* routing_costs);
+
+template<class T>
+constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
+    return std::min(std::max(v, lo), hi);
+}
+
+template<typename T>
+static vtr::Point<T> closest_point_in_rect(const vtr::Rect<T>& r, const vtr::Point<T>& p) {
+    if (r.empty()) {
+        return vtr::Point<T>(0, 0);
+    } else {
+        return vtr::Point<T>(clamp<T>(p.x(), r.xmin(), r.xmax() - 1),
+                             clamp<T>(p.y(), r.ymin(), r.ymax() - 1));
+    }
+}
+
+std::pair<float, float> ExtendedMapLookahead::get_src_opin_delays(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const {
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
+
+    e_rr_type from_type = rr_graph.node_type(from_node);
+    if (from_type == SOURCE || from_type == OPIN) {
+        //When estimating costs from a SOURCE/OPIN we look-up to find which wire types (and the
+        //cost to reach them) in f_src_opin_delays. Once we know what wire types are
+        //reachable, we query the f_wire_cost_map (i.e. the wire lookahead) to get the final
+        //delay to reach the sink.
+
+        t_physical_tile_type_ptr tile_type = device_ctx.grid[rr_graph.node_xlow(from_node)][rr_graph.node_ylow(from_node)].type;
+        auto tile_index = std::distance(&device_ctx.physical_tile_types[0], tile_type);
+
+        auto from_ptc = rr_graph.node_ptc_num(from_node);
+
+        if (this->src_opin_delays[tile_index][from_ptc].empty()) {
+            //During lookahead profiling we were unable to find any wires which connected
+            //to this PTC.
+            //
+            //This can sometimes occur at very low channel widths (e.g. during min W search on
+            //small designs) where W discretization combined with fraction Fc may cause some
+            //pins/sources to be left disconnected.
+            //
+            //Such RR graphs are of course unroutable, but that should be determined by the
+            //router. So just return an arbitrary value here rather than error.
+
+            return std::pair<float, float>(0.f, 0.f);
+        } else {
+            //From the current SOURCE/OPIN we look-up the wiretypes which are reachable
+            //and then add the estimates from those wire types for the distance of interest.
+            //If there are multiple options we use the minimum value.
+
+            float delay = 0;
+            float congestion = 0;
+            float expected_cost = std::numeric_limits<float>::infinity();
+
+            for (const auto& kv : this->src_opin_delays[tile_index][from_ptc]) {
+                const util::t_reachable_wire_inf& reachable_wire_inf = kv.second;
+
+                util::Cost_Entry cost_entry;
+                if (reachable_wire_inf.wire_rr_type == SINK) {
+                    //Some pins maybe reachable via a direct (OPIN -> IPIN) connection.
+                    //In the lookahead, we treat such connections as 'special' wire types
+                    //with no delay/congestion cost
+                    cost_entry.delay = 0;
+                    cost_entry.congestion = 0;
+                } else {
+                    //For an actual accessible wire, we query the wire look-up to get it's
+                    //delay and congestion cost estimates
+                    cost_entry = cost_map_.find_cost(reachable_wire_inf.wire_seg_index, delta_x, delta_y);
+                }
+
+                float this_cost = (criticality_fac) * (reachable_wire_inf.delay + cost_entry.delay)
+                                  + (1. - criticality_fac) * (reachable_wire_inf.congestion + cost_entry.congestion);
+
+                if (this_cost < expected_cost) {
+                    delay = reachable_wire_inf.delay;
+                    congestion = reachable_wire_inf.congestion;
+                }
+            }
+
+            return std::pair<float, float>(delay, congestion);
+        }
+
+        VTR_ASSERT_SAFE_MSG(false,
+                            vtr::string_fmt("Lookahead failed to estimate cost from %s: %s",
+                                            rr_node_arch_name(size_t(from_node)).c_str(),
+                                            describe_rr_node(size_t(from_node)).c_str())
+                                .c_str());
+    }
+
+    return std::pair<float, float>(0.f, 0.f);
+}
+
+float ExtendedMapLookahead::get_chan_ipin_delays(RRNodeId to_node) const {
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
+
+    auto to_tile_type = device_ctx.grid[rr_graph.node_xlow(to_node)][rr_graph.node_ylow(to_node)].type;
+    auto to_tile_index = to_tile_type->index;
+
+    auto to_ptc = rr_graph.node_ptc_num(to_node);
+
+    float site_pin_delay = 0.f;
+    if (this->chan_ipins_delays[to_tile_index].size() != 0) {
+        auto reachable_wire_inf = this->chan_ipins_delays[to_tile_index][to_ptc];
+
+        site_pin_delay = reachable_wire_inf.delay;
+    }
+
+    return site_pin_delay;
+}
+
+// derive a cost from the map between two nodes
+float ExtendedMapLookahead::get_map_cost(int from_node_ind,
+                                         int to_node_ind,
+                                         float criticality_fac) const {
+    if (from_node_ind == to_node_ind) {
+        return 0.f;
+    }
+
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
+
+    RRNodeId to_node(to_node_ind);
+    RRNodeId from_node(from_node_ind);
+
+    int from_x = rr_graph.node_xlow(from_node);
+    int from_y = rr_graph.node_ylow(from_node);
+
+    int to_x = rr_graph.node_xlow(to_node);
+    int to_y = rr_graph.node_ylow(to_node);
+
+    int dx, dy;
+    dx = to_x - from_x;
+    dy = to_y - from_y;
+
+    float extra_delay, extra_congestion;
+    std::tie(extra_delay, extra_congestion) = this->get_src_opin_delays(from_node, dx, dy, criticality_fac);
+
+    int from_seg_index = cost_map_.node_to_segment(from_node_ind);
+    util::Cost_Entry cost_entry = cost_map_.find_cost(from_seg_index, dx, dy);
+
+    if (!cost_entry.valid()) {
+        // there is no route
+        VTR_LOGV_DEBUG(f_router_debug,
+                       "Not connected %d (%s, %d) -> %d (%s)\n",
+                       from_node_ind, device_ctx.rr_nodes[from_node_ind].type_string(), from_seg_index,
+                       to_node_ind, device_ctx.rr_nodes[to_node_ind].type_string());
+        return std::numeric_limits<float>::infinity();
+    }
+
+    float expected_delay = cost_entry.delay + extra_delay;
+    float expected_congestion = cost_entry.congestion + extra_congestion;
+
+    float site_pin_delay = this->get_chan_ipin_delays(to_node);
+    expected_delay += site_pin_delay;
+
+    float expected_cost = criticality_fac * expected_delay + (1.0 - criticality_fac) * expected_congestion;
+
+    VTR_LOGV_DEBUG(f_router_debug, "Requested lookahead from node %d to %d\n", from_node_ind, to_node_ind);
+    const std::string& segment_name = device_ctx.segment_inf[from_seg_index].name;
+    VTR_LOGV_DEBUG(f_router_debug, "Lookahead returned %s (%d) with distance (%zd, %zd)\n",
+                   segment_name.c_str(), from_seg_index,
+                   dx, dy);
+    VTR_LOGV_DEBUG(f_router_debug, "Lookahead delay: %g\n", expected_delay);
+    VTR_LOGV_DEBUG(f_router_debug, "Lookahead congestion: %g\n", expected_congestion);
+    VTR_LOGV_DEBUG(f_router_debug, "Criticality: %g\n", criticality_fac);
+    VTR_LOGV_DEBUG(f_router_debug, "Lookahead cost: %g\n", expected_cost);
+    VTR_LOGV_DEBUG(f_router_debug, "Site pin delay: %g\n", site_pin_delay);
+
+    if (!std::isfinite(expected_cost)) {
+        VTR_LOG_ERROR("infinite cost for segment %d at (%d, %d)\n", from_seg_index, (int)dx, (int)dy);
+        VTR_ASSERT(0);
+    }
+
+    if (expected_cost < 0.f) {
+        VTR_LOG_ERROR("negative cost for segment %d at (%d, %d)\n", from_seg_index, (int)dx, (int)dy);
+        VTR_ASSERT(0);
+    }
+
+    return expected_cost;
+}
+
+// add a best cost routing path from start_node_ind to node_ind to routing costs
+template<typename Entry>
+bool ExtendedMapLookahead::add_paths(int start_node_ind,
+                                     Entry current,
+                                     const std::vector<util::Search_Path>& paths,
+                                     util::RoutingCosts* routing_costs) {
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
+
+    int node_ind = current.rr_node_ind;
+    RRNodeId node(node_ind);
+
+    bool new_sample_found = false;
+
+    float site_pin_delay = this->get_chan_ipin_delays(node);
+
+    // reconstruct the path
+    std::vector<int> path;
+    for (int i = paths[node_ind].parent; i != start_node_ind; i = paths[i].parent) {
+        VTR_ASSERT(i != -1);
+        path.push_back(i);
+    }
+    path.push_back(start_node_ind);
+
+    current.adjust_Tsw(-site_pin_delay);
+
+    // add each node along the path subtracting the incremental costs from the current costs
+    Entry start_to_here(start_node_ind, UNDEFINED, nullptr);
+    int parent = start_node_ind;
+    for (auto it = path.rbegin(); it != path.rend(); it++) {
+        RRNodeId this_node(*it);
+        auto& here = device_ctx.rr_nodes[*it];
+        int seg_index = device_ctx.rr_indexed_data[here.cost_index()].seg_index;
+
+        int from_x = rr_graph.node_xlow(this_node);
+        int from_y = rr_graph.node_ylow(this_node);
+
+        int to_x = rr_graph.node_xlow(node);
+        int to_y = rr_graph.node_ylow(node);
+
+        int delta_x, delta_y;
+        delta_x = to_x - from_x;
+        delta_y = to_y - from_y;
+
+        vtr::Point<int> delta(delta_x, delta_y);
+
+        util::RoutingCostKey key = {
+            seg_index,
+            delta};
+
+        if (*it != start_node_ind) {
+            auto& parent_node = device_ctx.rr_nodes[parent];
+            start_to_here = Entry(*it, parent_node.edge_switch(paths[*it].edge), &start_to_here);
+            parent = *it;
+        }
+
+        float cost = current.cost() - start_to_here.cost();
+        if (cost < 0.f && cost > -10e-15 /* 10 femtosecond */) {
+            cost = 0.f;
+        }
+
+        VTR_ASSERT(cost >= 0.f);
+
+        // NOTE: implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST
+        auto result = routing_costs->insert(std::make_pair(key, cost));
+        if (!result.second) {
+            if (cost < result.first->second) {
+                result.first->second = cost;
+                new_sample_found = true;
+            } else if (BREAK_ON_MISS) {
+                break;
+            }
+        } else {
+            new_sample_found = true;
+        }
+    }
+    return new_sample_found;
+}
+
+/* Runs Dijkstra's algorithm from specified node until all nodes have been
+ * visited. Each time a pin is visited, the delay/congestion information
+ * to that pin is stored to an entry in the routing_cost_map.
+ *
+ * Returns the maximum (last) minimum cost path stored, and
+ * the number of paths from start_node_ind stored. */
+template<typename Entry>
+std::pair<float, int> ExtendedMapLookahead::run_dijkstra(int start_node_ind,
+                                                         std::vector<bool>* node_expanded,
+                                                         std::vector<util::Search_Path>* paths,
+                                                         util::RoutingCosts* routing_costs) {
+    auto& device_ctx = g_vpr_ctx.device();
+    int path_count = 0;
+
+    /* a list of boolean flags (one for each rr node) to figure out if a
+     * certain node has already been expanded */
+    std::fill(node_expanded->begin(), node_expanded->end(), false);
+    /* For each node keep a list of the cost with which that node has been
+     * visited (used to determine whether to push a candidate node onto the
+     * expansion queue.
+     * Also store the parent node so we can reconstruct a specific path. */
+    std::fill(paths->begin(), paths->end(), util::Search_Path{std::numeric_limits<float>::infinity(), -1, -1});
+    /* a priority queue for expansion */
+    std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> pq;
+
+    /* first entry has no upstream delay or congestion */
+    Entry first_entry(start_node_ind, UNDEFINED, nullptr);
+    float max_cost = first_entry.cost();
+
+    pq.push(first_entry);
+
+    /* now do routing */
+    while (!pq.empty()) {
+        auto current = pq.top();
+        pq.pop();
+
+        int node_ind = current.rr_node_ind;
+
+        /* check that we haven't already expanded from this node */
+        if ((*node_expanded)[node_ind]) {
+            continue;
+        }
+
+        /* if this node is an ipin record its congestion/delay in the routing_cost_map */
+        if (device_ctx.rr_nodes[node_ind].type() == IPIN) {
+            // the last cost should be the highest
+            max_cost = current.cost();
+
+            path_count++;
+            this->add_paths<Entry>(start_node_ind, current, *paths, routing_costs);
+        } else {
+            util::expand_dijkstra_neighbours(device_ctx.rr_nodes,
+                                             current, paths, node_expanded, &pq);
+            (*node_expanded)[node_ind] = true;
+        }
+    }
+    return std::make_pair(max_cost, path_count);
+}
+
+// compute the cost maps for lookahead
+void ExtendedMapLookahead::compute(const std::vector<t_segment_inf>& segment_inf) {
+    this->src_opin_delays = util::compute_router_src_opin_lookahead();
+    this->chan_ipins_delays = util::compute_router_chan_ipin_lookahead();
+
+    vtr::ScopedStartFinishTimer timer("Computing connection box lookahead map");
+
+    // Initialize rr_node_route_inf if not already
+    alloc_and_load_rr_node_route_structs();
+
+    size_t num_segments = segment_inf.size();
+    std::vector<SampleRegion> sample_regions = find_sample_regions(num_segments);
+
+    /* free previous delay map and allocate new one */
+    auto& device_ctx = g_vpr_ctx.device();
+    cost_map_.set_counts(segment_inf.size());
+    cost_map_.build_segment_map();
+
+    VTR_ASSERT(REPRESENTATIVE_ENTRY_METHOD == util::SMALLEST);
+    util::RoutingCosts all_delay_costs;
+    util::RoutingCosts all_base_costs;
+
+    /* run Dijkstra's algorithm for each segment type & channel type combination */
+#if defined(VPR_USE_TBB)
+    tbb::mutex all_costs_mutex;
+    tbb::parallel_for_each(sample_regions, [&](const SampleRegion& region) {
+#else
+    for (const auto& region : sample_regions) {
+#endif
+        // holds the cost entries for a run
+        util::RoutingCosts delay_costs;
+        util::RoutingCosts base_costs;
+        int total_path_count = 0;
+        std::vector<bool> node_expanded(device_ctx.rr_nodes.size());
+        std::vector<util::Search_Path> paths(device_ctx.rr_nodes.size());
+
+        for (auto& point : region.points) {
+            // statistics
+            vtr::Timer run_timer;
+            float max_delay_cost = 0.f;
+            float max_base_cost = 0.f;
+            int path_count = 0;
+            for (auto node_ind : point.nodes) {
+                {
+                    auto result = run_dijkstra<util::PQ_Entry_Delay>(node_ind, &node_expanded, &paths, &delay_costs);
+                    max_delay_cost = std::max(max_delay_cost, result.first);
+                    path_count += result.second;
+                }
+                {
+                    auto result = run_dijkstra<util::PQ_Entry_Base_Cost>(node_ind, &node_expanded, &paths, &base_costs);
+                    max_base_cost = std::max(max_base_cost, result.first);
+                    path_count += result.second;
+                }
+            }
+
+            if (path_count > 0) {
+                VTR_LOG("Expanded %d paths of segment type %s(%d) starting at (%d, %d) from %d segments, max_cost %e %e (%g paths/sec)\n",
+                        path_count, segment_inf[region.segment_type].name.c_str(), region.segment_type,
+                        point.location.x(), point.location.y(),
+                        (int)point.nodes.size(),
+                        max_delay_cost, max_base_cost,
+                        path_count / run_timer.elapsed_sec());
+            }
+
+            /*
+             * if (path_count == 0) {
+             * for (auto node_ind : point.nodes) {
+             * VTR_LOG("Expanded node %s\n", describe_rr_node(node_ind).c_str());
+             * }
+             * }
+             */
+
+            total_path_count += path_count;
+            if (total_path_count > MIN_PATH_COUNT) {
+                break;
+            }
+        }
+
+#if defined(VPR_USE_TBB)
+        all_costs_mutex.lock();
+#endif
+
+        if (total_path_count == 0) {
+            VTR_LOG_WARN("No paths found for sample region %s(%d, %d)\n",
+                         segment_inf[region.segment_type].name.c_str(), region.grid_location.x(), region.grid_location.y());
+        }
+
+        // combine the cost map from this run with the final cost maps for each segment
+        for (const auto& cost : delay_costs) {
+            const auto& val = cost.second;
+            auto result = all_delay_costs.insert(std::make_pair(cost.first, val));
+            if (!result.second) {
+                // implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST
+                result.first->second = std::min(result.first->second, val);
+            }
+        }
+        for (const auto& cost : base_costs) {
+            const auto& val = cost.second;
+            auto result = all_base_costs.insert(std::make_pair(cost.first, val));
+            if (!result.second) {
+                // implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST
+                result.first->second = std::min(result.first->second, val);
+            }
+        }
+
+#if defined(VPR_USE_TBB)
+        all_costs_mutex.unlock();
+    });
+#else
+    }
+#endif
+
+    VTR_LOG("Combining results\n");
+    /* boil down the cost list in routing_cost_map at each coordinate to a
+     * representative cost entry and store it in the lookahead cost map */
+    cost_map_.set_cost_map(all_delay_costs, all_base_costs);
+
+// diagnostics
+#if defined(CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_ENTRIES)
+    for (auto& cost : all_costs) {
+        const auto& key = cost.first;
+        const auto& val = cost.second;
+        VTR_LOG("%d -> %d (%d, %d): %g, %g\n",
+                val.from_node, val.to_node,
+                key.delta.x(), key.delta.y(),
+                val.cost_entry.delay, val.cost_entry.congestion);
+    }
+#endif
+
+#if defined(CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_MAPS)
+    for (int iseg = 0; iseg < (ssize_t)num_segments; iseg++) {
+        VTR_LOG("cost map for %s(%d)\n",
+                segment_inf[iseg].name.c_str(), iseg);
+        cost_map_.print(iseg);
+    }
+#endif
+
+#if defined(CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_EMPTY_MAPS)
+    for (std::pair<int, int> p : cost_map_.list_empty()) {
+        int ichan, iseg;
+        std::tie(ichan, iseg) = p;
+        VTR_LOG("cost map for %s(%d), chan %d EMPTY\n",
+                segment_inf[iseg].name.c_str(), iseg, box_id);
+    }
+#endif
+}
+
+// get an expected minimum cost for routing from the current node to the target node
+float ExtendedMapLookahead::get_expected_cost(
+    int current_node,
+    int target_node,
+    const t_conn_cost_params& params,
+    float /*R_upstream*/) const {
+    auto& device_ctx = g_vpr_ctx.device();
+
+    t_rr_type rr_type = device_ctx.rr_nodes[current_node].type();
+
+    if (rr_type == CHANX || rr_type == CHANY) {
+        return get_map_cost(
+            current_node, target_node, params.criticality);
+    } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */
+        return (device_ctx.rr_indexed_data[SINK_COST_INDEX].base_cost);
+    } else { /* Change this if you want to investigate route-throughs */
+        return (0.);
+    }
+}
+
+#ifndef VTR_ENABLE_CAPNPROTO
+
+void ExtendedMapLookahead::read(const std::string& file) {
+    VPR_THROW(VPR_ERROR_ROUTE, "MapLookahead::read not implemented");
+}
+void ExtendedMapLookahead::write(const std::string& file) const {
+    VPR_THROW(VPR_ERROR_ROUTE, "MapLookahead::write not implemented");
+}
+
+#else
+
+void ExtendedMapLookahead::read(const std::string& file) {
+    cost_map_.read(file);
+
+    this->src_opin_delays = util::compute_router_src_opin_lookahead();
+    this->chan_ipins_delays = util::compute_router_chan_ipin_lookahead();
+}
+void ExtendedMapLookahead::write(const std::string& file) const {
+    cost_map_.write(file);
+}
+
+#endif
diff --git a/vpr/src/route/router_lookahead_extended_map.h b/vpr/src/route/router_lookahead_extended_map.h
new file mode 100644
index 00000000000..dd0e7fb2519
--- /dev/null
+++ b/vpr/src/route/router_lookahead_extended_map.h
@@ -0,0 +1,46 @@
+#ifndef EXTENDED_MAP_LOOKAHEAD_H_
+#define EXTENDED_MAP_LOOKAHEAD_H_
+
+#include <vector>
+#include "physical_types.h"
+#include "router_lookahead.h"
+#include "router_lookahead_map_utils.h"
+#include "router_lookahead_cost_map.h"
+#include "vtr_geometry.h"
+
+// Implementation of RouterLookahead based on source segment and destination connection box types
+class ExtendedMapLookahead : public RouterLookahead {
+  private:
+    //Look-up table from SOURCE/OPIN to CHANX/CHANY of various types
+    util::t_src_opin_delays src_opin_delays;
+
+    //Look-up table from CHANX/CHANY to SINK/IPIN of various types
+    util::t_chan_ipins_delays chan_ipins_delays;
+
+    std::pair<float, float> get_src_opin_delays(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const;
+    float get_chan_ipin_delays(RRNodeId to_node) const;
+
+    template<typename Entry>
+    bool add_paths(int start_node_ind,
+                   Entry current,
+                   const std::vector<util::Search_Path>& paths,
+                   util::RoutingCosts* routing_costs);
+
+    template<typename Entry>
+    std::pair<float, int> run_dijkstra(int start_node_ind,
+                                       std::vector<bool>* node_expanded,
+                                       std::vector<util::Search_Path>* paths,
+                                       util::RoutingCosts* routing_costs);
+
+  public:
+    float get_expected_cost(int node, int target_node, const t_conn_cost_params& params, float R_upstream) const override;
+    float get_map_cost(int from_node_ind, int to_node_ind, float criticality_fac) const;
+    void compute(const std::vector<t_segment_inf>& segment_inf) override;
+
+    void read(const std::string& file) override;
+    void write(const std::string& file) const override;
+
+    CostMap cost_map_;
+};
+
+#endif
diff --git a/vpr/src/route/router_lookahead_map.cpp b/vpr/src/route/router_lookahead_map.cpp
index 7d2217d84be..2f1d955a20e 100644
--- a/vpr/src/route/router_lookahead_map.cpp
+++ b/vpr/src/route/router_lookahead_map.cpp
@@ -14,7 +14,7 @@
  * in Oleg Petelin's MASc thesis (2016) for more discussion.
  *
  * To handle estimates starting from SOURCE/OPIN's the lookahead also creates a small side look-up table of the wire types
- * which are reachable from each physical tile type's SOURCEs/OPINs (f_src_opin_reachable_wires). This is used for
+ * which are reachable from each physical tile type's SOURCEs/OPINs (f_src_opin_delays). This is used for
  * SRC/OPIN -> CHANX/CHANY estimates.
  *
  * In the case of SRC/OPIN -> SINK estimates the resuls from the two look-ups are added together (and the minimum taken
@@ -35,6 +35,7 @@
 #include "vtr_time.h"
 #include "vtr_geometry.h"
 #include "router_lookahead_map.h"
+#include "router_lookahead_map_utils.h"
 #include "rr_graph2.h"
 #include "rr_graph.h"
 #include "route_common.h"
@@ -182,26 +183,10 @@ class PQ_Entry {
     }
 };
 
-struct t_reachable_wire_inf {
-    e_rr_type wire_rr_type;
-    int wire_seg_index;
-
-    //Costs to reach the wire type from the current node
-    float congestion;
-    float delay;
-};
-
 /* used during Dijkstra expansion to store delay/congestion info lists for each relative coordinate for a given segment and channel type.
  * the list at each coordinate is later boiled down to a single representative cost entry to be stored in the final cost map */
 typedef vtr::Matrix<Expansion_Cost_Entry> t_routing_cost_map; //[0..device_ctx.grid.width()-1][0..device_ctx.grid.height()-1]
 
-typedef std::vector<std::vector<std::map<int, t_reachable_wire_inf>>> t_src_opin_reachable_wires; //[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1][wire_seg_index]
-                                                                                                  // ^                                           ^             ^
-                                                                                                  // |                                           |             |
-                                                                                                  // physical block type index                   |             Reachable wire info
-                                                                                                  //                                             |
-                                                                                                  //                                             SOURCE/OPIN ptc
-
 struct t_dijkstra_data {
     /* a list of boolean flags (one for each rr node) to figure out if a certain node has already been expanded */
     vtr::vector<RRNodeId, bool> node_expanded;
@@ -214,20 +199,12 @@ struct t_dijkstra_data {
 
 /******** File-Scope Variables ********/
 
-constexpr int DIRECT_CONNECT_SPECIAL_SEG_TYPE = -1;
-
 //Look-up table from CHANX/CHANY (to SINKs) for various distances
 t_wire_cost_map f_wire_cost_map;
 
-//Look-up table from SOURCE/OPIN to CHANX/CHANY of various types
-t_src_opin_reachable_wires f_src_opin_reachable_wires;
-
 /******** File-Scope Functions ********/
 Cost_Entry get_wire_cost_entry(e_rr_type rr_type, int seg_index, int delta_x, int delta_y);
 static void compute_router_wire_lookahead(const std::vector<t_segment_inf>& segment_inf);
-static void compute_router_src_opin_lookahead();
-static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point<int> start);
-void dijkstra_flood_to_wires(int itile, RRNodeId inode, t_src_opin_reachable_wires& src_opin_reachable_wires);
 
 /* returns index of a node from which to start routing */
 static RRNodeId get_start_node(int start_x, int start_y, int target_x, int target_y, t_rr_type rr_type, int seg_index, int track_offset);
@@ -257,9 +234,6 @@ float MapLookahead::get_expected_cost(int current_node, int target_node, const t
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    RRNodeId current(current_node);
-    RRNodeId target(target_node);
-
     t_rr_type rr_type = rr_graph.node_type(current);
 
     if (rr_type == CHANX || rr_type == CHANY || rr_type == SOURCE || rr_type == OPIN) {
@@ -275,6 +249,9 @@ float MapLookahead::get_expected_cost(int current_node, int target_node, const t
     }
 }
 
+/******** Function Definitions ********/
+/* queries the lookahead_map (should have been computed prior to routing) to get the expected cost
+ * from the specified source to the specified target */
 std::pair<float, float> MapLookahead::get_expected_delay_and_cong(int inode, int target_node, const t_conn_cost_params& params, float /*R_upstream*/) const {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
@@ -293,7 +270,7 @@ std::pair<float, float> MapLookahead::get_expected_delay_and_cong(int inode, int
     e_rr_type from_type = rr_graph.node_type(from_node);
     if (from_type == SOURCE || from_type == OPIN) {
         //When estimating costs from a SOURCE/OPIN we look-up to find which wire types (and the
-        //cost to reach them) in f_src_opin_reachable_wires. Once we know what wire types are
+        //cost to reach them) in src_opin_delays. Once we know what wire types are
         //reachable, we query the f_wire_cost_map (i.e. the wire lookahead) to get the final
         //delay to reach the sink.
 
@@ -302,7 +279,7 @@ std::pair<float, float> MapLookahead::get_expected_delay_and_cong(int inode, int
 
         auto from_ptc = rr_graph.node_ptc_num(from_node);
 
-        if (f_src_opin_reachable_wires[tile_index][from_ptc].empty()) {
+        if (this->src_opin_delays[tile_index][from_ptc].empty()) {
             //During lookahead profiling we were unable to find any wires which connected
             //to this PTC.
             //
@@ -326,8 +303,8 @@ std::pair<float, float> MapLookahead::get_expected_delay_and_cong(int inode, int
             //From the current SOURCE/OPIN we look-up the wiretypes which are reachable
             //and then add the estimates from those wire types for the distance of interest.
             //If there are multiple options we use the minimum value.
-            for (const auto& kv : f_src_opin_reachable_wires[tile_index][from_ptc]) {
-                const t_reachable_wire_inf& reachable_wire_inf = kv.second;
+            for (const auto& kv : this->src_opin_delays[tile_index][from_ptc]) {
+                const util::t_reachable_wire_inf& reachable_wire_inf = kv.second;
 
                 Cost_Entry wire_cost_entry;
                 if (reachable_wire_inf.wire_rr_type == SINK) {
@@ -389,7 +366,15 @@ std::pair<float, float> MapLookahead::get_expected_delay_and_cong(int inode, int
 }
 
 void MapLookahead::compute(const std::vector<t_segment_inf>& segment_inf) {
-    compute_router_lookahead(segment_inf);
+    vtr::ScopedStartFinishTimer timer("Computing router lookahead map");
+
+    //First compute the delay map when starting from the various wire types
+    //(CHANX/CHANY)in the routing architecture
+    compute_router_wire_lookahead(segment_inf);
+
+    //Next, compute which wire types are accessible (and the cost to reach them)
+    //from the different physical tile type's SOURCEs & OPINs
+    this->src_opin_delays = util::compute_router_src_opin_lookahead();
 }
 
 void MapLookahead::read(const std::string& file) {
@@ -397,7 +382,7 @@ void MapLookahead::read(const std::string& file) {
 
     //Next, compute which wire types are accessible (and the cost to reach them)
     //from the different physical tile type's SOURCEs & OPINs
-    compute_router_src_opin_lookahead();
+    this->src_opin_delays = util::compute_router_src_opin_lookahead();
 }
 
 void MapLookahead::write(const std::string& file) const {
@@ -420,20 +405,6 @@ Cost_Entry get_wire_cost_entry(e_rr_type rr_type, int seg_index, int delta_x, in
     return f_wire_cost_map[chan_index][seg_index][delta_x][delta_y];
 }
 
-/* Computes the lookahead map to be used by the router. If a map was computed prior to this, a new one will not be computed again.
- * The rr graph must have been built before calling this function. */
-void compute_router_lookahead(const std::vector<t_segment_inf>& segment_inf) {
-    vtr::ScopedStartFinishTimer timer("Computing router lookahead map");
-
-    //First compute the delay map when starting from the various wire types
-    //(CHANX/CHANY)in the routing architecture
-    compute_router_wire_lookahead(segment_inf);
-
-    //Next, compute which wire types are accessible (and the cost to reach them)
-    //from the different physical tile type's SOURCEs & OPINs
-    compute_router_src_opin_lookahead();
-}
-
 static void compute_router_wire_lookahead(const std::vector<t_segment_inf>& segment_inf) {
     vtr::ScopedStartFinishTimer timer("Computing wire lookahead");
 
@@ -571,214 +542,6 @@ static void compute_router_wire_lookahead(const std::vector<t_segment_inf>& segm
     if (false) print_wire_cost_map(segment_inf);
 }
 
-static void compute_router_src_opin_lookahead() {
-    vtr::ScopedStartFinishTimer timer("Computing src/opin lookahead");
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& rr_graph = device_ctx.rr_nodes;
-
-    f_src_opin_reachable_wires.clear();
-
-    f_src_opin_reachable_wires.resize(device_ctx.physical_tile_types.size());
-
-    std::vector<int> rr_nodes_at_loc;
-
-    //We assume that the routing connectivity of each instance of a physical tile is the same,
-    //and so only measure one instance of each type
-    for (size_t itile = 0; itile < device_ctx.physical_tile_types.size(); ++itile) {
-        for (e_rr_type rr_type : {SOURCE, OPIN}) {
-            vtr::Point<int> sample_loc(-1, -1);
-
-            size_t num_sampled_locs = 0;
-            bool ptcs_with_no_reachable_wires = true;
-            while (ptcs_with_no_reachable_wires) { //Haven't found wire connected to ptc
-                ptcs_with_no_reachable_wires = false;
-
-                sample_loc = pick_sample_tile(&device_ctx.physical_tile_types[itile], sample_loc);
-
-                if (sample_loc.x() == -1 && sample_loc.y() == -1) {
-                    //No untried instances of the current tile type left
-                    VTR_LOG_WARN("Found no %ssample locations for %s in %s\n",
-                                 (num_sampled_locs == 0) ? "" : "more ",
-                                 rr_node_typename[rr_type],
-                                 device_ctx.physical_tile_types[itile].name);
-                    break;
-                }
-
-                //VTR_LOG("Sampling %s at (%d,%d)\n", device_ctx.physical_tile_types[itile].name, sample_loc.x(), sample_loc.y());
-
-                rr_nodes_at_loc.clear();
-
-                get_rr_node_indices(device_ctx.rr_node_indices, sample_loc.x(), sample_loc.y(), rr_type, &rr_nodes_at_loc);
-                for (int inode : rr_nodes_at_loc) {
-                    if (inode < 0) continue;
-
-                    RRNodeId node_id(inode);
-
-                    int ptc = rr_graph.node_ptc_num(node_id);
-
-                    if (ptc >= int(f_src_opin_reachable_wires[itile].size())) {
-                        f_src_opin_reachable_wires[itile].resize(ptc + 1); //Inefficient but functional...
-                    }
-
-                    //Find the wire types which are reachable from inode and record them and
-                    //the cost to reach them
-                    dijkstra_flood_to_wires(itile, node_id, f_src_opin_reachable_wires);
-
-                    if (f_src_opin_reachable_wires[itile][ptc].empty()) {
-                        VTR_LOG_WARN("Found no reachable wires from %s (%s) at (%d,%d)\n",
-                                     rr_node_typename[rr_type],
-                                     rr_node_arch_name(inode).c_str(),
-                                     sample_loc.x(),
-                                     sample_loc.y());
-
-                        ptcs_with_no_reachable_wires = true;
-                    }
-                }
-
-                ++num_sampled_locs;
-            }
-            if (ptcs_with_no_reachable_wires) {
-                VPR_ERROR(VPR_ERROR_ROUTE, "Some SOURCE/OPINs have no reachable wires\n");
-            }
-        }
-    }
-}
-
-static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point<int> prev) {
-    //Very simple for now, just pick the fist matching tile found
-    vtr::Point<int> loc(OPEN, OPEN);
-
-    //VTR_LOG("Prev: %d,%d\n", prev.x(), prev.y());
-
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& grid = device_ctx.grid;
-
-    int y_init = prev.y() + 1; //Start searching next element above prev
-
-    for (int x = prev.x(); x < int(grid.width()); ++x) {
-        if (x < 0) continue;
-
-        //VTR_LOG("  x: %d\n", x);
-
-        for (int y = y_init; y < int(grid.height()); ++y) {
-            if (y < 0) continue;
-
-            //VTR_LOG("   y: %d\n", y);
-            if (grid[x][y].type == tile_type) {
-                loc.set_x(x);
-                loc.set_y(y);
-                break;
-            }
-        }
-
-        if (loc.x() != OPEN && loc.y() != OPEN) {
-            break;
-        } else {
-            y_init = 0; //Prepare to search next column
-        }
-    }
-    //VTR_LOG("Next: %d,%d\n", loc.x(), loc.y());
-
-    return loc;
-}
-
-void dijkstra_flood_to_wires(int itile, RRNodeId node, t_src_opin_reachable_wires& src_opin_reachable_wires) {
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& rr_graph = device_ctx.rr_nodes;
-
-    struct t_pq_entry {
-        float delay;
-        float congestion;
-        RRNodeId node;
-
-        bool operator<(const t_pq_entry& rhs) const {
-            return this->delay < rhs.delay;
-        }
-    };
-
-    std::priority_queue<t_pq_entry> pq;
-
-    t_pq_entry root;
-    root.congestion = 0.;
-    root.delay = 0.;
-    root.node = node;
-
-    int ptc = rr_graph.node_ptc_num(node);
-
-    /*
-     * Perform Djikstra from the SOURCE/OPIN of interest, stopping at the the first
-     * reachable wires (i.e until we hit the inter-block routing network), or a SINK
-     * (via a direct-connect).
-     *
-     * Note that typical RR graphs are structured :
-     *
-     *      SOURCE ---> OPIN --> CHANX/CHANY
-     *              |
-     *              --> OPIN --> CHANX/CHANY
-     *              |
-     *             ...
-     *
-     *   possibly with direct-connects of the form:
-     *
-     *      SOURCE --> OPIN --> IPIN --> SINK
-     *
-     * and there is a small number of fixed hops from SOURCE/OPIN to CHANX/CHANY or
-     * to a SINK (via a direct-connect), so this runs very fast (i.e. O(1))
-     */
-    pq.push(root);
-    while (!pq.empty()) {
-        t_pq_entry curr = pq.top();
-        pq.pop();
-
-        e_rr_type curr_rr_type = rr_graph.node_type(curr.node);
-        if (curr_rr_type == CHANX || curr_rr_type == CHANY || curr_rr_type == SINK) {
-            //We stop expansion at any CHANX/CHANY/SINK
-            int seg_index;
-            if (curr_rr_type != SINK) {
-                //It's a wire, figure out its type
-                int cost_index = rr_graph.node_cost_index(curr.node);
-                seg_index = device_ctx.rr_indexed_data[cost_index].seg_index;
-            } else {
-                //This is a direct-connect path between an IPIN and OPIN,
-                //which terminated at a SINK.
-                //
-                //We treat this as a 'special' wire type
-                seg_index = DIRECT_CONNECT_SPECIAL_SEG_TYPE;
-            }
-
-            //Keep costs of the best path to reach each wire type
-            if (!src_opin_reachable_wires[itile][ptc].count(seg_index)
-                || curr.delay < src_opin_reachable_wires[itile][ptc][seg_index].delay) {
-                src_opin_reachable_wires[itile][ptc][seg_index].wire_rr_type = curr_rr_type;
-                src_opin_reachable_wires[itile][ptc][seg_index].wire_seg_index = seg_index;
-                src_opin_reachable_wires[itile][ptc][seg_index].delay = curr.delay;
-                src_opin_reachable_wires[itile][ptc][seg_index].congestion = curr.congestion;
-            }
-
-        } else if (curr_rr_type == SOURCE || curr_rr_type == OPIN || curr_rr_type == IPIN) {
-            //We allow expansion through SOURCE/OPIN/IPIN types
-            int cost_index = rr_graph.node_cost_index(curr.node);
-            float incr_cong = device_ctx.rr_indexed_data[cost_index].base_cost; //Current nodes congestion cost
-
-            for (RREdgeId edge : rr_graph.edge_range(curr.node)) {
-                int iswitch = rr_graph.edge_switch(edge);
-                float incr_delay = device_ctx.rr_switch_inf[iswitch].Tdel;
-
-                RRNodeId next_node = rr_graph.edge_sink_node(edge);
-
-                t_pq_entry next;
-                next.congestion = curr.congestion + incr_cong; //Of current node
-                next.delay = curr.delay + incr_delay;          //To reach next node
-                next.node = next_node;
-
-                pq.push(next);
-            }
-        } else {
-            VPR_ERROR(VPR_ERROR_ROUTE, "Unrecognized RR type");
-        }
-    }
-}
-
 /* returns index of a node from which to start routing */
 static RRNodeId get_start_node(int start_x, int start_y, int target_x, int target_y, t_rr_type rr_type, int seg_index, int track_offset) {
     auto& device_ctx = g_vpr_ctx.device();
diff --git a/vpr/src/route/router_lookahead_map.h b/vpr/src/route/router_lookahead_map.h
index 19d43f332fc..dfb5d1c768d 100644
--- a/vpr/src/route/router_lookahead_map.h
+++ b/vpr/src/route/router_lookahead_map.h
@@ -4,8 +4,13 @@
 #include <limits>
 #include "vtr_ndmatrix.h"
 #include "router_lookahead.h"
+#include "router_lookahead_map_utils.h"
 
 class MapLookahead : public RouterLookahead {
+  private:
+    //Look-up table from SOURCE/OPIN to CHANX/CHANY of various types
+    util::t_src_opin_delays src_opin_delays;
+
   protected:
     float get_expected_cost(int node, int target_node, const t_conn_cost_params& params, float R_upstream) const override;
     std::pair<float, float> get_expected_delay_and_cong(int inode, int target_node, const t_conn_cost_params& params, float R_upstream) const override;
@@ -40,11 +45,3 @@ typedef vtr::NdMatrix<Cost_Entry, 4> t_wire_cost_map; //[0..1][[0..num_seg_types
 
 void read_router_lookahead(const std::string& file);
 void write_router_lookahead(const std::string& file);
-
-/* Computes the lookahead map to be used by the router. If a map was computed prior to this, a new one will not be computed again.
- * The rr graph must have been built before calling this function. */
-void compute_router_lookahead(const std::vector<t_segment_inf>& segment_inf);
-
-/* queries the lookahead_map (should have been computed prior to routing) to get the expected cost
- * from the specified source to the specified target */
-float get_lookahead_map_cost(RRNodeId from_node, RRNodeId to_node, float criticality_fac);
diff --git a/vpr/src/route/router_lookahead_map_utils.cpp b/vpr/src/route/router_lookahead_map_utils.cpp
new file mode 100644
index 00000000000..e01ebba26bd
--- /dev/null
+++ b/vpr/src/route/router_lookahead_map_utils.cpp
@@ -0,0 +1,619 @@
+#include "router_lookahead_map_utils.h"
+
+#include "globals.h"
+#include "vpr_context.h"
+#include "vtr_math.h"
+#include "vtr_time.h"
+#include "route_common.h"
+#include "route_timing.h"
+
+static void dijkstra_flood_to_wires(int itile, RRNodeId inode, util::t_src_opin_delays& src_opin_delays);
+static void dijkstra_flood_to_ipins(RRNodeId node, util::t_chan_ipins_delays& chan_ipins_delays);
+
+static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point<int> start);
+
+// Constants needed to reduce the bounding box when expanding CHAN wires to reach the IPINs
+#define X_OFFSET 2
+#define Y_OFFSET 2
+
+// Maximum dijkstra expansions when exploring the CHAN --> IPIN connections
+#define MAX_EXPANSION_LEVEL 1
+
+#define DIRECT_CONNECT_SPECIAL_SEG_TYPE -1;
+
+namespace util {
+
+PQ_Entry::PQ_Entry(
+    int set_rr_node_ind,
+    int switch_ind,
+    float parent_delay,
+    float parent_R_upstream,
+    float parent_congestion_upstream,
+    bool starting_node,
+    float Tsw_adjust) {
+    this->rr_node_ind = set_rr_node_ind;
+
+    auto& device_ctx = g_vpr_ctx.device();
+    this->delay = parent_delay;
+    this->congestion_upstream = parent_congestion_upstream;
+    this->R_upstream = parent_R_upstream;
+    if (!starting_node) {
+        float Tsw = device_ctx.rr_switch_inf[switch_ind].Tdel;
+        Tsw += Tsw_adjust;
+        VTR_ASSERT(Tsw >= 0.f);
+        float Rsw = device_ctx.rr_switch_inf[switch_ind].R;
+        float Cnode = device_ctx.rr_nodes[set_rr_node_ind].C();
+        float Rnode = device_ctx.rr_nodes[set_rr_node_ind].R();
+
+        float T_linear = 0.f;
+        if (device_ctx.rr_switch_inf[switch_ind].buffered()) {
+            T_linear = Tsw + Rsw * Cnode + 0.5 * Rnode * Cnode;
+        } else { /* Pass transistor */
+            T_linear = Tsw + 0.5 * Rsw * Cnode;
+        }
+
+        float base_cost = 0.f;
+        if (device_ctx.rr_switch_inf[switch_ind].configurable()) {
+            base_cost = get_single_rr_cong_base_cost(set_rr_node_ind);
+        }
+
+        VTR_ASSERT(T_linear >= 0.);
+        VTR_ASSERT(base_cost >= 0.);
+        this->delay += T_linear;
+
+        this->congestion_upstream += base_cost;
+    }
+
+    /* set the cost of this node */
+    this->cost = this->delay;
+}
+
+util::PQ_Entry_Delay::PQ_Entry_Delay(
+    int set_rr_node_ind,
+    int switch_ind,
+    const util::PQ_Entry_Delay* parent) {
+    this->rr_node_ind = set_rr_node_ind;
+
+    if (parent != nullptr) {
+        auto& device_ctx = g_vpr_ctx.device();
+        float Tsw = device_ctx.rr_switch_inf[switch_ind].Tdel;
+        float Rsw = device_ctx.rr_switch_inf[switch_ind].R;
+        float Cnode = device_ctx.rr_nodes[set_rr_node_ind].C();
+        float Rnode = device_ctx.rr_nodes[set_rr_node_ind].R();
+
+        float T_linear = 0.f;
+        if (device_ctx.rr_switch_inf[switch_ind].buffered()) {
+            T_linear = Tsw + Rsw * Cnode + 0.5 * Rnode * Cnode;
+        } else { /* Pass transistor */
+            T_linear = Tsw + 0.5 * Rsw * Cnode;
+        }
+
+        VTR_ASSERT(T_linear >= 0.);
+        this->delay_cost = parent->delay_cost + T_linear;
+    } else {
+        this->delay_cost = 0.f;
+    }
+}
+
+util::PQ_Entry_Base_Cost::PQ_Entry_Base_Cost(
+    int set_rr_node_ind,
+    int switch_ind,
+    const util::PQ_Entry_Base_Cost* parent) {
+    this->rr_node_ind = set_rr_node_ind;
+
+    if (parent != nullptr) {
+        auto& device_ctx = g_vpr_ctx.device();
+        if (device_ctx.rr_switch_inf[switch_ind].configurable()) {
+            this->base_cost = parent->base_cost + get_single_rr_cong_base_cost(set_rr_node_ind);
+        } else {
+            this->base_cost = parent->base_cost;
+        }
+    } else {
+        this->base_cost = 0.f;
+    }
+}
+
+/* returns cost entry with the smallest delay */
+util::Cost_Entry util::Expansion_Cost_Entry::get_smallest_entry() const {
+    util::Cost_Entry smallest_entry;
+
+    for (auto entry : this->cost_vector) {
+        if (!smallest_entry.valid() || entry.delay < smallest_entry.delay) {
+            smallest_entry = entry;
+        }
+    }
+
+    return smallest_entry;
+}
+
+/* returns a cost entry that represents the average of all the recorded entries */
+util::Cost_Entry util::Expansion_Cost_Entry::get_average_entry() const {
+    float avg_delay = 0;
+    float avg_congestion = 0;
+
+    for (auto cost_entry : this->cost_vector) {
+        avg_delay += cost_entry.delay;
+        avg_congestion += cost_entry.congestion;
+    }
+
+    avg_delay /= (float)this->cost_vector.size();
+    avg_congestion /= (float)this->cost_vector.size();
+
+    return util::Cost_Entry(avg_delay, avg_congestion);
+}
+
+/* returns a cost entry that represents the geomean of all the recorded entries */
+util::Cost_Entry util::Expansion_Cost_Entry::get_geomean_entry() const {
+    float geomean_delay = 0;
+    float geomean_cong = 0;
+    for (auto cost_entry : this->cost_vector) {
+        geomean_delay += log(cost_entry.delay);
+        geomean_cong += log(cost_entry.congestion);
+    }
+
+    geomean_delay = exp(geomean_delay / (float)this->cost_vector.size());
+    geomean_cong = exp(geomean_cong / (float)this->cost_vector.size());
+
+    return util::Cost_Entry(geomean_delay, geomean_cong);
+}
+
+/* returns a cost entry that represents the medial of all recorded entries */
+util::Cost_Entry util::Expansion_Cost_Entry::get_median_entry() const {
+    /* find median by binning the delays of all entries and then chosing the bin
+     * with the largest number of entries */
+
+    int num_bins = 10;
+
+    /* find entries with smallest and largest delays */
+    util::Cost_Entry min_del_entry;
+    util::Cost_Entry max_del_entry;
+    for (auto entry : this->cost_vector) {
+        if (!min_del_entry.valid() || entry.delay < min_del_entry.delay) {
+            min_del_entry = entry;
+        }
+        if (!max_del_entry.valid() || entry.delay > max_del_entry.delay) {
+            max_del_entry = entry;
+        }
+    }
+
+    /* get the bin size */
+    float delay_diff = max_del_entry.delay - min_del_entry.delay;
+    float bin_size = delay_diff / (float)num_bins;
+
+    /* sort the cost entries into bins */
+    std::vector<std::vector<util::Cost_Entry>> entry_bins(num_bins, std::vector<util::Cost_Entry>());
+    for (auto entry : this->cost_vector) {
+        float bin_num = floor((entry.delay - min_del_entry.delay) / bin_size);
+
+        VTR_ASSERT(vtr::nint(bin_num) >= 0 && vtr::nint(bin_num) <= num_bins);
+        if (vtr::nint(bin_num) == num_bins) {
+            /* largest entry will otherwise have an out-of-bounds bin number */
+            bin_num -= 1;
+        }
+        entry_bins[vtr::nint(bin_num)].push_back(entry);
+    }
+
+    /* find the bin with the largest number of elements */
+    int largest_bin = 0;
+    int largest_size = 0;
+    for (int ibin = 0; ibin < num_bins; ibin++) {
+        if (entry_bins[ibin].size() > (unsigned)largest_size) {
+            largest_bin = ibin;
+            largest_size = (unsigned)entry_bins[ibin].size();
+        }
+    }
+
+    /* get the representative delay of the largest bin */
+    util::Cost_Entry representative_entry = entry_bins[largest_bin][0];
+
+    return representative_entry;
+}
+
+template<typename Entry>
+void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes,
+                                const Entry& parent_entry,
+                                std::vector<util::Search_Path>* paths,
+                                std::vector<bool>* node_expanded,
+                                std::priority_queue<Entry,
+                                                    std::vector<Entry>,
+                                                    std::greater<Entry>>* pq) {
+    int parent_ind = size_t(parent_entry.rr_node_ind);
+
+    auto& parent_node = rr_nodes[parent_ind];
+
+    for (int iedge = 0; iedge < parent_node.num_edges(); iedge++) {
+        int child_node_ind = parent_node.edge_sink_node(iedge);
+        int switch_ind = parent_node.edge_switch(iedge);
+
+        /* skip this child if it has already been expanded from */
+        if ((*node_expanded)[child_node_ind]) {
+            continue;
+        }
+
+        Entry child_entry(child_node_ind, switch_ind, &parent_entry);
+        VTR_ASSERT(child_entry.cost() >= 0);
+
+        /* Create (if it doesn't exist) or update (if the new cost is lower)
+         * to specified node */
+        Search_Path path_entry = {child_entry.cost(), parent_ind, iedge};
+        auto& path = (*paths)[child_node_ind];
+        if (path_entry.cost < path.cost) {
+            pq->push(child_entry);
+            path = path_entry;
+        }
+    }
+}
+
+template void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes,
+                                         const PQ_Entry_Delay& parent_entry,
+                                         std::vector<Search_Path>* paths,
+                                         std::vector<bool>* node_expanded,
+                                         std::priority_queue<PQ_Entry_Delay,
+                                                             std::vector<PQ_Entry_Delay>,
+                                                             std::greater<PQ_Entry_Delay>>* pq);
+template void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes,
+                                         const PQ_Entry_Base_Cost& parent_entry,
+                                         std::vector<Search_Path>* paths,
+                                         std::vector<bool>* node_expanded,
+                                         std::priority_queue<PQ_Entry_Base_Cost,
+                                                             std::vector<PQ_Entry_Base_Cost>,
+                                                             std::greater<PQ_Entry_Base_Cost>>* pq);
+
+t_src_opin_delays compute_router_src_opin_lookahead() {
+    vtr::ScopedStartFinishTimer timer("Computing src/opin lookahead");
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
+
+    t_src_opin_delays src_opin_delays;
+
+    src_opin_delays.resize(device_ctx.physical_tile_types.size());
+
+    std::vector<int> rr_nodes_at_loc;
+
+    //We assume that the routing connectivity of each instance of a physical tile is the same,
+    //and so only measure one instance of each type
+    for (size_t itile = 0; itile < device_ctx.physical_tile_types.size(); ++itile) {
+        for (e_rr_type rr_type : {SOURCE, OPIN}) {
+            vtr::Point<int> sample_loc(-1, -1);
+
+            size_t num_sampled_locs = 0;
+            bool ptcs_with_no_delays = true;
+            while (ptcs_with_no_delays) { //Haven't found wire connected to ptc
+                ptcs_with_no_delays = false;
+
+                sample_loc = pick_sample_tile(&device_ctx.physical_tile_types[itile], sample_loc);
+
+                if (sample_loc.x() == -1 && sample_loc.y() == -1) {
+                    //No untried instances of the current tile type left
+                    VTR_LOG_WARN("Found no %ssample locations for %s in %s\n",
+                                 (num_sampled_locs == 0) ? "" : "more ",
+                                 rr_node_typename[rr_type],
+                                 device_ctx.physical_tile_types[itile].name);
+                    break;
+                }
+
+                //VTR_LOG("Sampling %s at (%d,%d)\n", device_ctx.physical_tile_types[itile].name, sample_loc.x(), sample_loc.y());
+
+                rr_nodes_at_loc.clear();
+
+                get_rr_node_indices(device_ctx.rr_node_indices, sample_loc.x(), sample_loc.y(), rr_type, &rr_nodes_at_loc);
+                for (int inode : rr_nodes_at_loc) {
+                    if (inode < 0) continue;
+
+                    RRNodeId node_id(inode);
+
+                    int ptc = rr_graph.node_ptc_num(node_id);
+
+                    if (ptc >= int(src_opin_delays[itile].size())) {
+                        src_opin_delays[itile].resize(ptc + 1); //Inefficient but functional...
+                    }
+
+                    //Find the wire types which are reachable from inode and record them and
+                    //the cost to reach them
+                    dijkstra_flood_to_wires(itile, node_id, src_opin_delays);
+
+                    if (src_opin_delays[itile][ptc].empty()) {
+                        VTR_LOGV_DEBUG(f_router_debug, "Found no reachable wires from %s (%s) at (%d,%d)\n",
+                                       rr_node_typename[rr_type],
+                                       rr_node_arch_name(inode).c_str(),
+                                       sample_loc.x(),
+                                       sample_loc.y());
+
+                        ptcs_with_no_delays = true;
+                    }
+                }
+
+                ++num_sampled_locs;
+            }
+            if (ptcs_with_no_delays) {
+                VPR_ERROR(VPR_ERROR_ROUTE, "Some SOURCE/OPINs have no reachable wires\n");
+            }
+        }
+    }
+
+    return src_opin_delays;
+}
+
+t_chan_ipins_delays compute_router_chan_ipin_lookahead() {
+    vtr::ScopedStartFinishTimer timer("Computing chan/ipin lookahead");
+    auto& device_ctx = g_vpr_ctx.device();
+
+    t_chan_ipins_delays chan_ipins_delays;
+
+    chan_ipins_delays.resize(device_ctx.physical_tile_types.size());
+
+    std::vector<int> rr_nodes_at_loc;
+
+    //We assume that the routing connectivity of each instance of a physical tile is the same,
+    //and so only measure one instance of each type
+    for (auto tile_type : device_ctx.physical_tile_types) {
+        vtr::Point<int> sample_loc(-1, -1);
+
+        sample_loc = pick_sample_tile(&tile_type, sample_loc);
+
+        if (sample_loc.x() == -1 && sample_loc.y() == -1) {
+            //No untried instances of the current tile type left
+            VTR_LOG_WARN("Found no sample locations for %s\n",
+                         tile_type.name);
+            continue;
+        }
+
+        int min_x = std::max(0, sample_loc.x() - X_OFFSET);
+        int min_y = std::max(0, sample_loc.y() - Y_OFFSET);
+        int max_x = std::min(int(device_ctx.grid.width()), sample_loc.x() + X_OFFSET);
+        int max_y = std::min(int(device_ctx.grid.height()), sample_loc.y() + Y_OFFSET);
+
+        for (int ix = min_x; ix < max_x; ix++) {
+            for (int iy = min_y; iy < max_y; iy++) {
+                for (auto rr_type : {CHANX, CHANY}) {
+                    rr_nodes_at_loc.clear();
+
+                    get_rr_node_indices(device_ctx.rr_node_indices, ix, iy, rr_type, &rr_nodes_at_loc);
+                    for (int inode : rr_nodes_at_loc) {
+                        if (inode < 0) continue;
+
+                        RRNodeId node_id(inode);
+
+                        //Find the IPINs which are reachable from the wires within the bounding box
+                        //around the selected tile location
+                        dijkstra_flood_to_ipins(node_id, chan_ipins_delays);
+                    }
+                }
+            }
+        }
+    }
+
+    return chan_ipins_delays;
+}
+
+} // namespace util
+
+static void dijkstra_flood_to_wires(int itile, RRNodeId node, util::t_src_opin_delays& src_opin_delays) {
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
+
+    struct t_pq_entry {
+        float delay;
+        float congestion;
+        RRNodeId node;
+
+        bool operator<(const t_pq_entry& rhs) const {
+            return this->delay < rhs.delay;
+        }
+    };
+
+    std::priority_queue<t_pq_entry> pq;
+
+    t_pq_entry root;
+    root.congestion = 0.;
+    root.delay = 0.;
+    root.node = node;
+
+    int ptc = rr_graph.node_ptc_num(node);
+
+    /*
+     * Perform Djikstra from the SOURCE/OPIN of interest, stopping at the the first
+     * reachable wires (i.e until we hit the inter-block routing network), or a SINK
+     * (via a direct-connect).
+     *
+     * Note that typical RR graphs are structured :
+     *
+     *      SOURCE ---> OPIN --> CHANX/CHANY
+     *              |
+     *              --> OPIN --> CHANX/CHANY
+     *              |
+     *             ...
+     *
+     *   possibly with direct-connects of the form:
+     *
+     *      SOURCE --> OPIN --> IPIN --> SINK
+     *
+     * and there is a small number of fixed hops from SOURCE/OPIN to CHANX/CHANY or
+     * to a SINK (via a direct-connect), so this runs very fast (i.e. O(1))
+     */
+    pq.push(root);
+    while (!pq.empty()) {
+        t_pq_entry curr = pq.top();
+        pq.pop();
+
+        e_rr_type curr_rr_type = rr_graph.node_type(curr.node);
+        if (curr_rr_type == CHANX || curr_rr_type == CHANY || curr_rr_type == SINK) {
+            //We stop expansion at any CHANX/CHANY/SINK
+            int seg_index;
+            if (curr_rr_type != SINK) {
+                //It's a wire, figure out its type
+                int cost_index = rr_graph.node_cost_index(curr.node);
+                seg_index = device_ctx.rr_indexed_data[cost_index].seg_index;
+            } else {
+                //This is a direct-connect path between an IPIN and OPIN,
+                //which terminated at a SINK.
+                //
+                //We treat this as a 'special' wire type
+                seg_index = DIRECT_CONNECT_SPECIAL_SEG_TYPE;
+            }
+
+            //Keep costs of the best path to reach each wire type
+            if (!src_opin_delays[itile][ptc].count(seg_index)
+                || curr.delay < src_opin_delays[itile][ptc][seg_index].delay) {
+                src_opin_delays[itile][ptc][seg_index].wire_rr_type = curr_rr_type;
+                src_opin_delays[itile][ptc][seg_index].wire_seg_index = seg_index;
+                src_opin_delays[itile][ptc][seg_index].delay = curr.delay;
+                src_opin_delays[itile][ptc][seg_index].congestion = curr.congestion;
+            }
+
+        } else if (curr_rr_type == SOURCE || curr_rr_type == OPIN || curr_rr_type == IPIN) {
+            //We allow expansion through SOURCE/OPIN/IPIN types
+            int cost_index = rr_graph.node_cost_index(curr.node);
+            float incr_cong = device_ctx.rr_indexed_data[cost_index].base_cost; //Current nodes congestion cost
+
+            for (RREdgeId edge : rr_graph.edge_range(curr.node)) {
+                int iswitch = rr_graph.edge_switch(edge);
+                float incr_delay = device_ctx.rr_switch_inf[iswitch].Tdel;
+
+                RRNodeId next_node = rr_graph.edge_sink_node(edge);
+
+                t_pq_entry next;
+                next.congestion = curr.congestion + incr_cong; //Of current node
+                next.delay = curr.delay + incr_delay;          //To reach next node
+                next.node = next_node;
+
+                pq.push(next);
+            }
+        } else {
+            VPR_ERROR(VPR_ERROR_ROUTE, "Unrecognized RR type");
+        }
+    }
+}
+
+static void dijkstra_flood_to_ipins(RRNodeId node, util::t_chan_ipins_delays& chan_ipins_delays) {
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
+
+    struct t_pq_entry {
+        float delay;
+        float congestion;
+        RRNodeId node;
+        int level;
+
+        bool operator<(const t_pq_entry& rhs) const {
+            return this->delay < rhs.delay;
+        }
+    };
+
+    std::priority_queue<t_pq_entry> pq;
+
+    t_pq_entry root;
+    root.congestion = 0.;
+    root.delay = 0.;
+    root.node = node;
+    root.level = 0;
+
+    /*
+     * Perform Djikstra from the CHAN of interest, stopping at the the first
+     * reachable IPIN
+     *
+     * Note that typical RR graphs are structured :
+     *
+     *      CHANX/CHANY --> CHANX/CHANY --> ... --> CHANX/CHANY --> IPIN --> SINK
+     *                  |
+     *                  --> CHANX/CHANY --> ... --> CHANX/CHANY --> IPIN --> SINK
+     *                  |
+     *                  ...
+     *
+     * and there is a variable number of hops from a given CHANX/CHANY  to IPIN.
+     * To avoid impacting on run-time, a fixed number of hops is performed. This
+     * should be enough to find the delay from the last CAHN to IPIN connection.
+     */
+    pq.push(root);
+
+    float site_pin_delay = std::numeric_limits<float>::infinity();
+
+    while (!pq.empty()) {
+        t_pq_entry curr = pq.top();
+        pq.pop();
+
+        e_rr_type curr_rr_type = rr_graph.node_type(curr.node);
+        if (curr_rr_type == IPIN) {
+            int node_x = rr_graph.node_xlow(curr.node);
+            int node_y = rr_graph.node_ylow(curr.node);
+
+            auto tile_type = device_ctx.grid[node_x][node_y].type;
+            int itile = tile_type->index;
+
+            int ptc = rr_graph.node_ptc_num(curr.node);
+
+            if (ptc >= int(chan_ipins_delays[itile].size())) {
+                chan_ipins_delays[itile].resize(ptc + 1); //Inefficient but functional...
+            }
+
+            site_pin_delay = std::min(curr.delay, site_pin_delay);
+            //Keep costs of the best path to reach each wire type
+            chan_ipins_delays[itile][ptc].wire_rr_type = curr_rr_type;
+            chan_ipins_delays[itile][ptc].delay = site_pin_delay;
+            chan_ipins_delays[itile][ptc].congestion = curr.congestion;
+        } else if (curr_rr_type == CHANX || curr_rr_type == CHANY) {
+            if (curr.level >= MAX_EXPANSION_LEVEL) {
+                continue;
+            }
+
+            //We allow expansion through SOURCE/OPIN/IPIN types
+            int cost_index = rr_graph.node_cost_index(curr.node);
+            float new_cong = device_ctx.rr_indexed_data[cost_index].base_cost; //Current nodes congestion cost
+
+            for (RREdgeId edge : rr_graph.edge_range(curr.node)) {
+                int iswitch = rr_graph.edge_switch(edge);
+                float new_delay = device_ctx.rr_switch_inf[iswitch].Tdel;
+
+                RRNodeId next_node = rr_graph.edge_sink_node(edge);
+
+                t_pq_entry next;
+                next.congestion = new_cong; //Of current node
+                next.delay = new_delay;     //To reach next node
+                next.node = next_node;
+                next.level = curr.level + 1;
+
+                pq.push(next);
+            }
+        } else {
+            VPR_ERROR(VPR_ERROR_ROUTE, "Unrecognized RR type");
+        }
+    }
+}
+
+static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point<int> prev) {
+    //Very simple for now, just pick the fist matching tile found
+    vtr::Point<int> loc(OPEN, OPEN);
+
+    //VTR_LOG("Prev: %d,%d\n", prev.x(), prev.y());
+
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& grid = device_ctx.grid;
+
+    int y_init = prev.y() + 1; //Start searching next element above prev
+
+    for (int x = prev.x(); x < int(grid.width()); ++x) {
+        if (x < 0) continue;
+
+        //VTR_LOG("  x: %d\n", x);
+
+        for (int y = y_init; y < int(grid.height()); ++y) {
+            if (y < 0) continue;
+
+            //VTR_LOG("   y: %d\n", y);
+            if (grid[x][y].type == tile_type) {
+                loc.set_x(x);
+                loc.set_y(y);
+                break;
+            }
+        }
+
+        if (loc.x() != OPEN && loc.y() != OPEN) {
+            break;
+        } else {
+            y_init = 0; //Prepare to search next column
+        }
+    }
+    //VTR_LOG("Next: %d,%d\n", loc.x(), loc.y());
+
+    return loc;
+}
diff --git a/vpr/src/route/router_lookahead_map_utils.h b/vpr/src/route/router_lookahead_map_utils.h
new file mode 100644
index 00000000000..56b2d564d96
--- /dev/null
+++ b/vpr/src/route/router_lookahead_map_utils.h
@@ -0,0 +1,268 @@
+#ifndef ROUTER_LOOKAHEAD_MAP_UTILS_H_
+#define ROUTER_LOOKAHEAD_MAP_UTILS_H_
+/*
+ * The router lookahead provides an estimate of the cost from an intermediate node to the target node
+ * during directed (A*-like) routing.
+ *
+ * The VPR 7.0 lookahead (route/route_timing.c ==> get_timing_driven_expected_cost) lower-bounds the remaining delay and
+ * congestion by assuming that a minimum number of wires, of the same type as the current node being expanded, can be used
+ * to complete the route. While this method is efficient, it can run into trouble with architectures that use
+ * multiple interconnected wire types.
+ *
+ * The lookahead in this file pre-computes delay/congestion costs up and to the right of a starting tile. This generates
+ * delay/congestion tables for {CHANX, CHANY} channel types, over all wire types defined in the architecture file.
+ * See Section 3.2.4 in Oleg Petelin's MASc thesis (2016) for more discussion.
+ *
+ */
+
+#include <cmath>
+#include <limits>
+#include <vector>
+#include <queue>
+#include <unordered_map>
+#include "vpr_types.h"
+#include "vtr_geometry.h"
+#include "rr_node.h"
+
+namespace util {
+
+/* when a list of delay/congestion entries at a coordinate in Cost_Entry is boiled down to a single
+ * representative entry, this enum is passed-in to specify how that representative entry should be
+ * calculated */
+enum e_representative_entry_method {
+    FIRST = 0, //the first cost that was recorded
+    SMALLEST,  //the smallest-delay cost recorded
+    AVERAGE,
+    GEOMEAN,
+    MEDIAN
+};
+
+/* f_cost_map is an array of these cost entries that specifies delay/congestion estimates
+ * to travel relative x/y distances */
+class Cost_Entry {
+  public:
+    float delay;
+    float congestion;
+    bool fill;
+
+    Cost_Entry() {
+        delay = std::numeric_limits<float>::infinity();
+        congestion = std::numeric_limits<float>::infinity();
+        fill = false;
+    }
+    Cost_Entry(float set_delay, float set_congestion)
+        : delay(set_delay)
+        , congestion(set_congestion)
+        , fill(false) {}
+    Cost_Entry(float set_delay, float set_congestion, bool set_fill)
+        : delay(set_delay)
+        , congestion(set_congestion)
+        , fill(set_fill) {}
+    bool valid() const {
+        return std::isfinite(delay) && std::isfinite(congestion);
+    }
+};
+
+/* a class that stores delay/congestion information for a given relative coordinate during the Dijkstra expansion.
+ * since it stores multiple cost entries, it is later boiled down to a single representative cost entry to be stored
+ * in the final lookahead cost map */
+class Expansion_Cost_Entry {
+  private:
+    std::vector<Cost_Entry> cost_vector;
+
+    Cost_Entry get_smallest_entry() const;
+    Cost_Entry get_average_entry() const;
+    Cost_Entry get_geomean_entry() const;
+    Cost_Entry get_median_entry() const;
+
+  public:
+    void add_cost_entry(e_representative_entry_method method,
+                        float add_delay,
+                        float add_congestion) {
+        Cost_Entry cost_entry(add_delay, add_congestion);
+        if (method == SMALLEST) {
+            /* taking the smallest-delay entry anyway, so no need to push back multple entries */
+            if (this->cost_vector.empty()) {
+                this->cost_vector.push_back(cost_entry);
+            } else {
+                if (add_delay < this->cost_vector[0].delay) {
+                    this->cost_vector[0] = cost_entry;
+                }
+            }
+        } else {
+            this->cost_vector.push_back(cost_entry);
+        }
+    }
+    void clear_cost_entries() {
+        this->cost_vector.clear();
+    }
+
+    Cost_Entry get_representative_cost_entry(e_representative_entry_method method) const {
+        Cost_Entry entry;
+
+        if (!cost_vector.empty()) {
+            switch (method) {
+                case FIRST:
+                    entry = cost_vector[0];
+                    break;
+                case SMALLEST:
+                    entry = this->get_smallest_entry();
+                    break;
+                case AVERAGE:
+                    entry = this->get_average_entry();
+                    break;
+                case GEOMEAN:
+                    entry = this->get_geomean_entry();
+                    break;
+                case MEDIAN:
+                    entry = this->get_median_entry();
+                    break;
+                default:
+                    break;
+            }
+        }
+        return entry;
+    }
+};
+
+// Keys in the RoutingCosts map
+struct RoutingCostKey {
+    // segment type index
+    int seg_index;
+
+    // offset of the destination connection box from the starting segment
+    vtr::Point<int> delta;
+
+    RoutingCostKey()
+        : seg_index(-1)
+        , delta(0, 0) {}
+    RoutingCostKey(int seg_index_arg, vtr::Point<int> delta_arg)
+        : seg_index(seg_index_arg)
+        , delta(delta_arg) {}
+
+    bool operator==(const RoutingCostKey& other) const {
+        return seg_index == other.seg_index && delta == other.delta;
+    }
+};
+
+// hash implementation for RoutingCostKey
+struct HashRoutingCostKey {
+    std::size_t operator()(RoutingCostKey const& key) const noexcept {
+        std::size_t hash = std::hash<int>{}(key.seg_index);
+        vtr::hash_combine(hash, key.delta.x());
+        vtr::hash_combine(hash, key.delta.y());
+        return hash;
+    }
+};
+
+// Map used to store intermediate routing costs
+typedef std::unordered_map<RoutingCostKey, float, HashRoutingCostKey> RoutingCosts;
+
+/* a class that represents an entry in the Dijkstra expansion priority queue */
+class PQ_Entry {
+  public:
+    int rr_node_ind; //index in device_ctx.rr_nodes that this entry represents
+    float cost;      //the cost of the path to get to this node
+
+    /* store backward delay, R and congestion info */
+    float delay;
+    float R_upstream;
+    float congestion_upstream;
+
+    PQ_Entry(int set_rr_node_ind, int /*switch_ind*/, float parent_delay, float parent_R_upstream, float parent_congestion_upstream, bool starting_node, float Tsw_adjust);
+
+    bool operator<(const PQ_Entry& obj) const {
+        /* inserted into max priority queue so want queue entries with a lower cost to be greater */
+        return (this->cost > obj.cost);
+    }
+};
+
+// A version of PQ_Entry that only calculates and stores the delay.
+class PQ_Entry_Delay {
+  public:
+    int rr_node_ind;  //index in device_ctx.rr_nodes that this entry represents
+    float delay_cost; //the cost of the path to get to this node
+
+    PQ_Entry_Delay(int set_rr_node_ind, int /*switch_ind*/, const PQ_Entry_Delay* parent);
+
+    float cost() const {
+        return delay_cost;
+    }
+
+    void adjust_Tsw(float amount) {
+        delay_cost += amount;
+    }
+
+    bool operator>(const PQ_Entry_Delay& obj) const {
+        return (this->delay_cost > obj.delay_cost);
+    }
+};
+
+// A version of PQ_Entry that only calculates and stores the base cost.
+class PQ_Entry_Base_Cost {
+  public:
+    int rr_node_ind; //index in device_ctx.rr_nodes that this entry represents
+    float base_cost;
+
+    PQ_Entry_Base_Cost(int set_rr_node_ind, int /*switch_ind*/, const PQ_Entry_Base_Cost* parent);
+
+    float cost() const {
+        return base_cost;
+    }
+
+    void adjust_Tsw(float /* amount */) {
+        // do nothing
+    }
+
+    bool operator>(const PQ_Entry_Base_Cost& obj) const {
+        return (this->base_cost > obj.base_cost);
+    }
+};
+
+struct Search_Path {
+    float cost;
+    int parent;
+    int edge;
+};
+
+/* iterates over the children of the specified node and selectively pushes them onto the priority queue */
+template<typename Entry>
+void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes,
+                                const Entry& parent_entry,
+                                std::vector<Search_Path>* paths,
+                                std::vector<bool>* node_expanded,
+                                std::priority_queue<Entry,
+                                                    std::vector<Entry>,
+                                                    std::greater<Entry>>* pq);
+
+struct t_reachable_wire_inf {
+    e_rr_type wire_rr_type;
+    int wire_seg_index;
+
+    //Costs to reach the wire type from the current node
+    float congestion;
+    float delay;
+};
+
+//[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1][wire_seg_index]
+// ^                                           ^             ^
+// |                                           |             |
+// physical block type index                   |             Reachable wire info
+//                                             |
+//                                             SOURCE/OPIN ptc
+typedef std::vector<std::vector<std::map<int, t_reachable_wire_inf>>> t_src_opin_delays;
+
+//[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1]
+// ^                                           ^
+// |                                           |
+// physical block type index                   |
+//                                             |
+//                                             SINK/IPIN ptc
+typedef std::vector<std::vector<t_reachable_wire_inf>> t_chan_ipins_delays;
+
+t_src_opin_delays compute_router_src_opin_lookahead();
+t_chan_ipins_delays compute_router_chan_ipin_lookahead();
+
+} // namespace util
+
+#endif
diff --git a/vpr/src/route/router_lookahead_sampling.cpp b/vpr/src/route/router_lookahead_sampling.cpp
new file mode 100644
index 00000000000..568d5134602
--- /dev/null
+++ b/vpr/src/route/router_lookahead_sampling.cpp
@@ -0,0 +1,229 @@
+#include "router_lookahead_sampling.h"
+
+#include <vector>
+
+#include "globals.h"
+#include "vtr_math.h"
+#include "vtr_geometry.h"
+#include "vtr_time.h"
+
+// Sample based an NxN grid of starting segments, where N = SAMPLE_GRID_SIZE
+static constexpr int SAMPLE_GRID_SIZE = 2;
+
+// quantiles (like percentiles but 0-1) of segment count to use as a selection criteria
+// choose locations with higher, but not extreme, counts of each segment type
+static constexpr double kSamplingCountLowerQuantile = 0.5;
+static constexpr double kSamplingCountUpperQuantile = 0.7;
+
+// also known as the L1 norm
+static int manhattan_distance(const vtr::Point<int>& a, const vtr::Point<int>& b) {
+    return abs(b.x() - a.x()) + abs(b.y() - a.y());
+}
+
+// the smallest bounding box containing a node
+static vtr::Rect<int> bounding_box_for_node(int node_ind) {
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
+    int x = rr_graph.node_xlow(RRNodeId(node_ind));
+    int y = rr_graph.node_ylow(RRNodeId(node_ind));
+
+    return vtr::Rect<int>(vtr::Point<int>(x, y));
+}
+
+static vtr::Rect<int> sample_window(const vtr::Rect<int>& bounding_box, int sx, int sy, int n) {
+    return vtr::Rect<int>(sample(bounding_box, sx, sy, n),
+                          sample(bounding_box, sx + 1, sy + 1, n));
+}
+
+static std::vector<SamplePoint> choose_points(const vtr::Matrix<int>& counts,
+                                              const vtr::Rect<int>& window,
+                                              int min_count,
+                                              int max_count) {
+    VTR_ASSERT(min_count <= max_count);
+    std::vector<SamplePoint> points;
+    for (int y = window.ymin(); y < window.ymax(); y++) {
+        for (int x = window.xmin(); x < window.xmax(); x++) {
+            if (counts[x][y] >= min_count && counts[x][y] <= max_count) {
+                points.push_back(SamplePoint{/* .location = */ vtr::Point<int>(x, y),
+                                             /* .nodes = */ {}});
+            }
+        }
+    }
+
+    vtr::Point<int> center = sample(window, 1, 1, 2);
+
+    // sort by distance from center
+    std::sort(points.begin(), points.end(),
+              [&](const SamplePoint& a, const SamplePoint& b) {
+                  return manhattan_distance(a.location, center) < manhattan_distance(b.location, center);
+              });
+
+    return points;
+}
+
+// histogram is a map from segment count to number of locations having that count
+static int quantile(const std::map<int, int>& histogram, float ratio) {
+    if (histogram.empty()) {
+        return 0;
+    }
+    int sum = 0;
+    for (const auto& entry : histogram) {
+        sum += entry.second;
+    }
+    int limit = std::ceil(sum * ratio);
+    for (const auto& entry : histogram) {
+        limit -= entry.second;
+        if (limit <= 0) {
+            return entry.first;
+        }
+    }
+    return 0;
+}
+
+// select a good number of segments to find
+static std::map<int, int> count_histogram(const vtr::Rect<int>& box, const vtr::Matrix<int>& counts) {
+    std::map<int, int> histogram;
+    for (int y = box.ymin(); y < box.ymax(); y++) {
+        for (int x = box.xmin(); x < box.xmax(); x++) {
+            int count = counts[x][y];
+            if (count > 0) {
+                ++histogram[count];
+            }
+        }
+    }
+    return histogram;
+}
+
+// Used to calculate each region's `order.'
+// A space-filling curve will order the regions so that
+// nearby points stay close in order. A Hilbert curve might
+// be better, but a Morton (Z)-order curve is easy to compute,
+// because it's just interleaving binary bits, so this
+// function interleaves with 0's so that the X and Y
+// dimensions can then be OR'ed together.
+static uint64_t interleave(uint32_t x) {
+    uint64_t i = x;
+    i = (i ^ (i << 16)) & 0x0000ffff0000ffff;
+    i = (i ^ (i << 8)) & 0x00ff00ff00ff00ff;
+    i = (i ^ (i << 4)) & 0x0f0f0f0f0f0f0f0f;
+    i = (i ^ (i << 2)) & 0x3333333333333333;
+    i = (i ^ (i << 1)) & 0x5555555555555555;
+    return i;
+}
+
+// for each segment type, find the nearest nodes to an equally spaced grid of points
+// within the bounding box for that segment type
+std::vector<SampleRegion> find_sample_regions(int num_segments) {
+    vtr::ScopedStartFinishTimer timer("finding sample regions");
+    std::vector<SampleRegion> sample_regions;
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_nodes = device_ctx.rr_nodes;
+    std::vector<vtr::Matrix<int>> segment_counts(num_segments);
+
+    // compute bounding boxes for each segment type
+    std::vector<vtr::Rect<int>> bounding_box_for_segment(num_segments, vtr::Rect<int>());
+    for (size_t i = 0; i < rr_nodes.size(); i++) {
+        auto& node = rr_nodes[i];
+        if (node.type() != CHANX && node.type() != CHANY) continue;
+        if (node.capacity() == 0 || node.num_edges() == 0) continue;
+        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
+
+        VTR_ASSERT(seg_index != OPEN);
+        VTR_ASSERT(seg_index < num_segments);
+
+        bounding_box_for_segment[seg_index].expand_bounding_box(bounding_box_for_node(i));
+    }
+
+    // initialize counts
+    for (int seg = 0; seg < num_segments; seg++) {
+        const auto& box = bounding_box_for_segment[seg];
+        segment_counts[seg] = vtr::Matrix<int>({size_t(box.width()), size_t(box.height())}, 0);
+    }
+
+    // count sample points
+    for (size_t i = 0; i < rr_nodes.size(); i++) {
+        auto& node = rr_nodes[i];
+        if (node.type() != CHANX && node.type() != CHANY) continue;
+        if (node.capacity() == 0 || node.num_edges() == 0) continue;
+        int x = rr_nodes.node_xlow(RRNodeId(i));
+        int y = rr_nodes.node_ylow(RRNodeId(i));
+
+        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
+        segment_counts[seg_index][x][y] += 1;
+
+        VTR_ASSERT(seg_index != OPEN);
+        VTR_ASSERT(seg_index < num_segments);
+    }
+
+    // select sample points
+    for (int i = 0; i < num_segments; i++) {
+        const auto& counts = segment_counts[i];
+        const auto& bounding_box = bounding_box_for_segment[i];
+        if (bounding_box.empty()) continue;
+        for (int y = 0; y < SAMPLE_GRID_SIZE; y++) {
+            for (int x = 0; x < SAMPLE_GRID_SIZE; x++) {
+                vtr::Rect<int> window = sample_window(bounding_box, x, y, SAMPLE_GRID_SIZE);
+                if (window.empty()) continue;
+
+                auto histogram = count_histogram(window, segment_counts[i]);
+                SampleRegion region = {
+                    /* .segment_type = */ i,
+                    /* .grid_location = */ vtr::Point<int>(x, y),
+                    /* .points = */ choose_points(counts, window, quantile(histogram, kSamplingCountLowerQuantile), quantile(histogram, kSamplingCountUpperQuantile)),
+                    /* .order = */ 0};
+                if (!region.points.empty()) {
+                    /* In order to improve caching, the list of sample points are
+                     * sorted to keep points that are nearby on the Euclidean plane also
+                     * nearby in the vector of sample points.
+                     *
+                     * This means subsequent expansions on the same thread are likely
+                     * to cover a similar set of nodes, so they are more likely to be
+                     * cached. This improves performance by about 7%, which isn't a lot,
+                     * but not a bad improvement for a few lines of code. */
+                    vtr::Point<int> location = region.points[0].location;
+
+                    // interleave bits of X and Y for a Z-curve ordering.
+                    region.order = interleave(location.x()) | (interleave(location.y()) << 1);
+
+                    sample_regions.push_back(region);
+                }
+            }
+        }
+    }
+
+    // sort regions
+    std::sort(sample_regions.begin(), sample_regions.end(),
+              [](const SampleRegion& a, const SampleRegion& b) {
+                  return a.order < b.order;
+              });
+
+    // build an index of sample points on segment type and location
+    std::map<std::tuple<int, int, int>, SamplePoint*> sample_point_index;
+    for (auto& region : sample_regions) {
+        for (auto& point : region.points) {
+            sample_point_index[std::make_tuple(region.segment_type, point.location.x(), point.location.y())] = &point;
+        }
+    }
+
+    // collect the node indices for each segment type at the selected sample points
+    for (size_t i = 0; i < rr_nodes.size(); i++) {
+        auto& node = rr_nodes[i];
+        if (node.type() != CHANX && node.type() != CHANY) continue;
+        if (node.capacity() == 0 || node.num_edges() == 0) continue;
+
+        int x = rr_nodes.node_xlow(RRNodeId(i));
+        int y = rr_nodes.node_ylow(RRNodeId(i));
+
+        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
+
+        VTR_ASSERT(seg_index != OPEN);
+        VTR_ASSERT(seg_index < num_segments);
+
+        auto point = sample_point_index.find(std::make_tuple(seg_index, x, y));
+        if (point != sample_point_index.end()) {
+            point->second->nodes.push_back(i);
+        }
+    }
+
+    return sample_regions;
+}
diff --git a/vpr/src/route/router_lookahead_sampling.h b/vpr/src/route/router_lookahead_sampling.h
new file mode 100644
index 00000000000..6668792e9c6
--- /dev/null
+++ b/vpr/src/route/router_lookahead_sampling.h
@@ -0,0 +1,35 @@
+#ifndef ROUTER_LOOKAHEAD_SAMPLING_H
+#define ROUTER_LOOKAHEAD_SAMPLING_H
+
+#include <vector>
+#include "vtr_geometry.h"
+#include "globals.h"
+
+// a sample point for a segment type, contains all segments at the VPR location
+struct SamplePoint {
+    // canonical location
+    vtr::Point<int> location;
+
+    // nodes to expand
+    std::vector<ssize_t> nodes;
+};
+
+struct SampleRegion {
+    // all nodes in `points' have this segment type
+    int segment_type;
+
+    // location on the sample grid
+    vtr::Point<int> grid_location;
+
+    // locations to try
+    // The computation will keep expanding each of the points
+    // until a number of paths (segment -> connection box) are found.
+    std::vector<SamplePoint> points;
+
+    // used to sort the regions to improve caching
+    uint64_t order;
+};
+
+std::vector<SampleRegion> find_sample_regions(int num_segments);
+
+#endif

From 99d3b94c16feb5823e7534339fadeaa96bf88566 Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Wed, 19 Aug 2020 12:56:45 +0200
Subject: [PATCH 02/18] tests: added extended_map to router_lookahead strong
 test

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 .../strong_router_lookahead/config/config.txt              | 1 +
 .../strong_router_lookahead/config/golden_results.txt      | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/config.txt
index a3852ac7ed5..faeaffeb119 100644
--- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/config.txt
+++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/config.txt
@@ -27,3 +27,4 @@ pass_requirements_file=pass_requirements_fixed_chan_width.txt
 script_params_common = -starting_stage vpr --route_chan_width 60
 script_params_list_add = --router_lookahead classic
 script_params_list_add = --router_lookahead map
+script_params_list_add = --router_lookahead extended_map
diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/golden_results.txt
index 1e62b8fda48..3a9851af2ab 100644
--- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/golden_results.txt
+++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/golden_results.txt
@@ -1,3 +1,4 @@
-arch                  	circuit  	script_params                    	vtr_flow_elapsed_time	error	odin_synth_time	max_odin_mem	abc_depth	abc_synth_time	abc_cec_time	abc_sec_time	max_abc_mem	ace_time	max_ace_mem	num_clb	num_io	num_memories	num_mult	vpr_status	vpr_revision          	vpr_build_info                	vpr_compiler                               	vpr_compiled       	hostname                           	rundir                                                                                                                                                                  	max_vpr_mem	num_primary_inputs	num_primary_outputs	num_pre_packed_nets	num_pre_packed_blocks	num_netlist_clocks	num_post_packed_nets	num_post_packed_blocks	device_width	device_height	device_grid_tiles	device_limiting_resources	device_name	pack_time	placed_wirelength_est	place_time	place_quench_time	placed_CPD_est	placed_setup_TNS_est	placed_setup_WNS_est	placed_geomean_nonvirtual_intradomain_critical_path_delay_est	place_quench_timing_analysis_time	place_quench_sta_time	place_total_timing_analysis_time	place_total_sta_time	routed_wirelength	total_nets_routed	total_connections_routed	total_heap_pushes	total_heap_pops	logic_block_area_total	logic_block_area_used	routing_area_total	routing_area_per_tile	crit_path_route_success_iteration	critical_path_delay	geomean_nonvirtual_intradomain_critical_path_delay	setup_TNS	setup_WNS	hold_TNS	hold_WNS	crit_path_route_time	crit_path_total_timing_analysis_time	crit_path_total_sta_time
-k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_classic	1.39                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-1877-g77d3b9ae4	release IPO VTR_ASSERT_LEVEL=2	GNU 7.5.0 on Linux-4.15.0-60-generic x86_64	2020-06-02T18:06:14	betzgrp-wintermute.eecg.utoronto.ca	/home/kmurray/trees/vtr/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run014/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_classic	38504      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.29     	4714                 	0.52      	0.00             	3.70871       	-159.069            	-3.70871            	nan                                                          	0.000962558                      	0.000710169          	0.147775                        	0.113669            	6802             	4803             	18126                   	1410663          	241971         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	31                               	4.15097            	nan                                               	-183.347 	-4.15097 	0       	0       	0.29                	0.246723                            	0.200214                
-k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_map    	1.28                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-1877-g77d3b9ae4	release IPO VTR_ASSERT_LEVEL=2	GNU 7.5.0 on Linux-4.15.0-60-generic x86_64	2020-06-02T18:06:14	betzgrp-wintermute.eecg.utoronto.ca	/home/kmurray/trees/vtr/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run014/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_map    	37744      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.27     	4774                 	0.45      	0.00             	3.68749       	-162.239            	-3.68749            	nan                                                          	0.000979309                      	0.000718807          	0.123995                        	0.094656            	6885             	4550             	17546                   	783482           	128874         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	25                               	4.28104            	nan                                               	-189.84  	-4.28104 	0       	0       	0.21                	0.206307                            	0.167057                
+arch                  	circuit  	script_params                         	vtr_flow_elapsed_time	error	odin_synth_time	max_odin_mem	abc_depth	abc_synth_time	abc_cec_time	abc_sec_time	max_abc_mem	ace_time	max_ace_mem	num_clb	num_io	num_memories	num_mult	vpr_status	vpr_revision                    	vpr_build_info                          	vpr_compiler                                	vpr_compiled       	hostname	rundir                                                                                                                                                                   	max_vpr_mem	num_primary_inputs	num_primary_outputs	num_pre_packed_nets	num_pre_packed_blocks	num_netlist_clocks	num_post_packed_nets	num_post_packed_blocks	device_width	device_height	device_grid_tiles	device_limiting_resources	device_name	pack_time	placed_wirelength_est	place_time	place_quench_time	placed_CPD_est	placed_setup_TNS_est	placed_setup_WNS_est	placed_geomean_nonvirtual_intradomain_critical_path_delay_est	place_quench_timing_analysis_time	place_quench_sta_time	place_total_timing_analysis_time	place_total_sta_time	routed_wirelength	total_nets_routed	total_connections_routed	total_heap_pushes	total_heap_pops	logic_block_area_total	logic_block_area_used	routing_area_total	routing_area_per_tile	crit_path_route_success_iteration	critical_path_delay	geomean_nonvirtual_intradomain_critical_path_delay	setup_TNS	setup_WNS	hold_TNS	hold_WNS	crit_path_route_time	crit_path_total_timing_analysis_time	crit_path_total_sta_time
+k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_classic     	1.36                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-rc2-2438-ge5e1da843-dirty	release VTR_ASSERT_LEVEL=2 debug_logging	GNU 9.2.1 on Linux-4.15.0-101-generic x86_64	2020-08-19T12:52:35	acomodi 	/data/vtr-symbiflow/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run023/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_classic     	76636      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.31     	4714                 	0.51      	0.00             	3.70871       	-159.069            	-3.70871            	nan                                                          	0.00081656                       	0.000697153          	0.113819                        	0.0986893           	6802             	4803             	18126                   	1410663          	241971         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	31                               	4.15097            	nan                                               	-183.347 	-4.15097 	0       	0       	0.26                	0.16699                             	0.143514                
+k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_map         	1.40                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-rc2-2438-ge5e1da843-dirty	release VTR_ASSERT_LEVEL=2 debug_logging	GNU 9.2.1 on Linux-4.15.0-101-generic x86_64	2020-08-19T12:52:35	acomodi 	/data/vtr-symbiflow/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run023/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_map         	74280      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.31     	4774                 	0.50      	0.00             	3.68749       	-162.239            	-3.68749            	nan                                                          	0.00107059                       	0.000930556          	0.0963918                       	0.0798913           	6885             	4550             	17546                   	783482           	128874         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	25                               	4.28104            	nan                                               	-189.84  	-4.28104 	0       	0       	0.19                	0.142017                            	0.116339                
+k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_extended_map	1.57                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-rc2-2438-ge5e1da843-dirty	release VTR_ASSERT_LEVEL=2 debug_logging	GNU 9.2.1 on Linux-4.15.0-101-generic x86_64	2020-08-19T12:52:35	acomodi 	/data/vtr-symbiflow/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run023/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_extended_map	78432      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.32     	4756                 	0.48      	0.00             	3.96235       	-167.937            	-3.96235            	nan                                                          	0.000742545                      	0.000635633          	0.104159                        	0.091287            	6963             	3821             	13362                   	1508440          	271241         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	25                               	4.68054            	nan                                               	-193.587 	-4.68054 	0       	0       	0.25                	0.149959                            	0.129748                

From 53f2dd16ed29f6901806f1a5600bd910bc3d63ce Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Thu, 3 Sep 2020 17:41:36 +0200
Subject: [PATCH 03/18] context: removed not needed segment_inf member from the
 device ctx

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 vpr/src/base/SetupVPR.cpp                       | 1 -
 vpr/src/base/vpr_context.h                      | 4 ++--
 vpr/src/route/router_lookahead_extended_map.cpp | 6 +-----
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp
index c5d94465a83..8bbdaee4989 100644
--- a/vpr/src/base/SetupVPR.cpp
+++ b/vpr/src/base/SetupVPR.cpp
@@ -160,7 +160,6 @@ void SetupVPR(const t_options* Options,
     }
 
     Segments = Arch->Segments;
-    device_ctx.segment_inf = Arch->Segments;
 
     SetupSwitches(*Arch, RoutingArch, Arch->Switches, Arch->num_switches);
     SetupRoutingArch(*Arch, RoutingArch);
diff --git a/vpr/src/base/vpr_context.h b/vpr/src/base/vpr_context.h
index d353e1109ea..752c96cc331 100644
--- a/vpr/src/base/vpr_context.h
+++ b/vpr/src/base/vpr_context.h
@@ -161,8 +161,8 @@ struct DeviceContext : public Context {
     ///@brief The indicies of rr nodes of a given type at a specific x,y grid location
     t_rr_node_indices rr_node_indices; // [0..NUM_RR_TYPES-1][0..grid.width()-1][0..grid.width()-1][0..size-1]
 
-    std::vector<t_rr_switch_inf> rr_switch_inf; /* autogenerated in build_rr_graph based on switch fan-in. [0..(num_rr_switches-1)] */
-    std::vector<t_segment_inf> segment_inf;
+    ///@brief Autogenerated in build_rr_graph based on switch fan-in. [0..(num_rr_switches-1)]
+    std::vector<t_rr_switch_inf> rr_switch_inf;
 
     ///@brief Wire segment types in RR graph
     std::vector<t_segment_inf> rr_segments;
diff --git a/vpr/src/route/router_lookahead_extended_map.cpp b/vpr/src/route/router_lookahead_extended_map.cpp
index d4c769b4655..b8726f7a3b9 100644
--- a/vpr/src/route/router_lookahead_extended_map.cpp
+++ b/vpr/src/route/router_lookahead_extended_map.cpp
@@ -45,10 +45,6 @@
 // Don't continue storing a path after hitting a lower-or-same cost entry.
 static constexpr bool BREAK_ON_MISS = false;
 
-// Distance penalties filling are calculated based on available samples, but can be adjusted with this factor.
-static constexpr float PENALTY_FACTOR = 1.f;
-static constexpr float PENALTY_MIN = 1e-12f;
-
 static constexpr int MIN_PATH_COUNT = 1000;
 
 template<typename Entry>
@@ -214,7 +210,7 @@ float ExtendedMapLookahead::get_map_cost(int from_node_ind,
     float expected_cost = criticality_fac * expected_delay + (1.0 - criticality_fac) * expected_congestion;
 
     VTR_LOGV_DEBUG(f_router_debug, "Requested lookahead from node %d to %d\n", from_node_ind, to_node_ind);
-    const std::string& segment_name = device_ctx.segment_inf[from_seg_index].name;
+    const std::string& segment_name = device_ctx.rr_segments[from_seg_index].name;
     VTR_LOGV_DEBUG(f_router_debug, "Lookahead returned %s (%d) with distance (%zd, %zd)\n",
                    segment_name.c_str(), from_seg_index,
                    dx, dy);

From c1d21d33c56d508758e71ffadcae063ca004c7aa Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Thu, 3 Sep 2020 17:42:12 +0200
Subject: [PATCH 04/18] lookahead cost map: refactor and commented

- The cost map filling routine has been simplified and the relevant code
extracted in two different sub-routines (penalty calculation, cost
filling)
- Commented used Doxygen compatible style some of the important methods
and class members

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 vpr/src/route/router_lookahead_cost_map.cpp | 210 +++++++++++++-------
 vpr/src/route/router_lookahead_cost_map.h   | 110 +++++++++-
 2 files changed, 244 insertions(+), 76 deletions(-)

diff --git a/vpr/src/route/router_lookahead_cost_map.cpp b/vpr/src/route/router_lookahead_cost_map.cpp
index 16ae8dab526..1f036569bc8 100644
--- a/vpr/src/route/router_lookahead_cost_map.cpp
+++ b/vpr/src/route/router_lookahead_cost_map.cpp
@@ -12,8 +12,17 @@
 #    include "serdes_utils.h"
 #endif
 
-// Distance penalties filling are calculated based on available samples, but can be adjusted with this factor.
+// Lookahead penalties constants
+// Penalities are added for deltas that are outside of a specific segment bounding box region.
+// These penalties are calculated based on the distance of the requested delta to a valid closest point of the bounding
+// box.
+
+///@brief Factor to adjust the penalty calculation for deltas outside the segment bounding box:
+//      factor < 1.0: penalty has less impact on the final returned delay
+//      factor > 1.0: penalty has more impact on the final returned delay
 static constexpr float PENALTY_FACTOR = 1.f;
+
+///@brief Minimum penalty cost that is added when penalizing a delta outside the segment bounding box.
 static constexpr float PENALTY_MIN = 1e-12f;
 
 // also known as the L1 norm
@@ -21,6 +30,7 @@ static int manhattan_distance(const vtr::Point<int>& a, const vtr::Point<int>& b
     return abs(b.x() - a.x()) + abs(b.y() - a.y());
 }
 
+// clamps v to be between low (lo) and high (hi), inclusive.
 template<class T>
 static constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
     return std::min(std::max(v, lo), hi);
@@ -72,27 +82,135 @@ static util::Cost_Entry penalize(const util::Cost_Entry& entry, int distance, fl
                             entry.congestion, entry.fill);
 }
 
-// get a cost entry for a segment type, chan type, and offset
+/**
+ * @brief Gets a cost entry for a segment type, and delta_x, delta_y coordinates.
+ * @note Cost entries are pre-computed during the lookahead map generation.
+ *
+ *   @param from_seg_index Index of the segment corresponding to the current expansion node
+ *   @param delta_x x coordinate corresponding to the distance between the source and destination nodes
+ *   @param delta_y y coordinate corresponding to the distance between the source and destination nodes
+ *   @return The requested cost entry in the lookahead map.
+ *
+ * */
 util::Cost_Entry CostMap::find_cost(int from_seg_index, int delta_x, int delta_y) const {
     VTR_ASSERT(from_seg_index >= 0 && from_seg_index < (ssize_t)offset_.size());
     const auto& cost_map = cost_map_[0][from_seg_index];
+    // Check whether the cost map corresponding to the input segment is empty.
+    // This can be due to an absence of samples during the lookahead generation.
+    // This check is required to avoid unexpected behavior when querying an empty map.
     if (cost_map.dim_size(0) == 0 || cost_map.dim_size(1) == 0) {
         return util::Cost_Entry();
     }
 
+    // Delta coordinate with the offset adjusted to fit the segment bounding box
     vtr::Point<int> coord(delta_x - offset_[0][from_seg_index].first,
                           delta_y - offset_[0][from_seg_index].second);
     vtr::Rect<int> bounds(0, 0, cost_map.dim_size(0), cost_map.dim_size(1));
+
+    // Get the closest point in the bounding box:
+    //      - If the coordinate is within the bounding box, the closest point will coincide with the original coordinates.
+    //      - If the coordinate is outside the bounding box, the chosen point will be the one within the bounding box that is
+    //        closest to the original coordinates.
     auto closest = closest_point_in_rect(bounds, coord);
+
+    // Get the cost entry from the cost map at the deltas values
     auto cost = cost_map_[0][from_seg_index][closest.x()][closest.y()];
+
+    // Get the base penalty corresponding to the current segment.
     float penalty = penalty_[0][from_seg_index];
+
+    // Get the distance between the closest point in the bounding box and the original coordinates.
+    // Note that if the original coordinates are within the bounding box, the distance will be equal to zero.
     auto distance = manhattan_distance(closest, coord);
+
+    // Return the penalized cost (no penalty is added if the coordinates are within the bounding box).
     return penalize(cost, distance, penalty);
 }
 
+// finds the penalty delay corresponding to a segment
+float CostMap::get_penalty(vtr::NdMatrix<util::Cost_Entry, 2>& matrix) const {
+    float min_delay = std::numeric_limits<float>::infinity(), max_delay = 0.f;
+    vtr::Point<int> min_location(0, 0), max_location(0, 0);
+    for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
+        for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
+            util::Cost_Entry& cost_entry = matrix[ix][iy];
+            if (cost_entry.valid()) {
+                if (cost_entry.delay < min_delay) {
+                    min_delay = cost_entry.delay;
+                    min_location = vtr::Point<int>(ix, iy);
+                }
+                if (cost_entry.delay > max_delay) {
+                    max_delay = cost_entry.delay;
+                    max_location = vtr::Point<int>(ix, iy);
+                }
+            }
+        }
+    }
+
+    float delay_penalty = (max_delay - min_delay) / static_cast<float>(std::max(1, manhattan_distance(max_location, min_location)));
+
+    return delay_penalty;
+}
+
+// fills the holes in the cost map matrix
+void CostMap::fill_holes(vtr::NdMatrix<util::Cost_Entry, 2>& matrix, int seg_index, int bounding_box_width, int bounding_box_height, float delay_penalty) {
+    // find missing cost entries and fill them in by copying a nearby cost entry
+    std::vector<std::tuple<unsigned, unsigned, util::Cost_Entry>> missing;
+    bool couldnt_fill = false;
+    auto shifted_bounds = vtr::Rect<int>(0, 0, bounding_box_width, bounding_box_height);
+    int max_fill = 0;
+    for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
+        for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
+            util::Cost_Entry& cost_entry = matrix[ix][iy];
+            if (!cost_entry.valid()) {
+                // maximum search radius
+                util::Cost_Entry filler;
+                int distance;
+                std::tie(filler, distance) = get_nearby_cost_entry(matrix, ix, iy, shifted_bounds);
+                if (filler.valid()) {
+                    missing.push_back(std::make_tuple(ix, iy, penalize(filler, distance, delay_penalty)));
+                    max_fill = std::max(max_fill, distance);
+                } else {
+                    couldnt_fill = true;
+                }
+            }
+        }
+        if (couldnt_fill) {
+            // give up trying to fill an empty matrix
+            break;
+        }
+    }
+
+    if (!couldnt_fill) {
+        VTR_LOG("At %d: max_fill = %d, delay_penalty = %e\n", seg_index, max_fill, delay_penalty);
+    }
+
+    // write back the missing entries
+    for (auto& xy_entry : missing) {
+        matrix[std::get<0>(xy_entry)][std::get<1>(xy_entry)] = std::get<2>(xy_entry);
+    }
+
+    if (couldnt_fill) {
+        VTR_LOG_WARN("Couldn't fill holes in the cost matrix for %ld, %d x %d bounding box\n",
+                     seg_index, bounding_box_width, bounding_box_height);
+        for (unsigned y = 0; y < matrix.dim_size(1); y++) {
+            for (unsigned x = 0; x < matrix.dim_size(0); x++) {
+                VTR_ASSERT(!matrix[x][y].valid());
+            }
+        }
+    }
+}
+
 // set the cost map for a segment type and chan type, filling holes
 void CostMap::set_cost_map(const util::RoutingCosts& delay_costs, const util::RoutingCosts& base_costs) {
-    // calculate the bounding boxes
+    // Calculate the bounding boxes
+    // Bounding boxes are used to reduce the cost map size. They are generated based on the minimum
+    // and maximum coordinates of the x/y delta delays obtained for each segment.
+    //
+    // There is one bounding box for each segment type.
+    //
+    // In case the lookahead is queried to return a cost that is outside of the bounding box, the closest
+    // cost within the bounding box is returned, with the addition of a calculated penalty cost.
     vtr::Matrix<vtr::Rect<int>> bounds({1, seg_count_});
     for (const auto& entry : delay_costs) {
         bounds[0][entry.first.seg_index].expand_bounding_box(vtr::Rect<int>(entry.first.delta));
@@ -101,7 +219,9 @@ void CostMap::set_cost_map(const util::RoutingCosts& delay_costs, const util::Ro
         bounds[0][entry.first.seg_index].expand_bounding_box(vtr::Rect<int>(entry.first.delta));
     }
 
-    // store bounds
+    // Adds the above generated bounds to the cost map.
+    // Also the offset_ data is stored, to allow the application of the adjustment factor to the
+    // delta x/y coordinates.
     for (size_t seg = 0; seg < seg_count_; seg++) {
         const auto& seg_bounds = bounds[0][seg];
         if (seg_bounds.empty()) {
@@ -117,7 +237,7 @@ void CostMap::set_cost_map(const util::RoutingCosts& delay_costs, const util::Ro
         }
     }
 
-    // store entries into the matrices
+    // Fill the cost map entries with the delay and congestion costs obtained in the dijkstra expansion step.
     for (const auto& entry : delay_costs) {
         int seg = entry.first.seg_index;
         const auto& seg_bounds = bounds[0][seg];
@@ -133,7 +253,12 @@ void CostMap::set_cost_map(const util::RoutingCosts& delay_costs, const util::Ro
         cost_map_[0][seg][x][y].congestion = entry.second;
     }
 
-    // fill the holes
+    // Adjust the cost map in two steps.
+    //      1. penalty calculation: value used to add a penalty cost to the delta x/y that fall
+    //                              outside of the bounding box for a specific segment
+    //      2. holes filling: some entries might miss delay/congestion data. These holes are being
+    //                        filled in a spiral fashion, starting from the hole, to find the nearby
+    //                        valid cost.
     for (size_t seg = 0; seg < seg_count_; seg++) {
         penalty_[0][seg] = std::numeric_limits<float>::infinity();
         const auto& seg_bounds = bounds[0][seg];
@@ -142,72 +267,12 @@ void CostMap::set_cost_map(const util::RoutingCosts& delay_costs, const util::Ro
         }
         auto& matrix = cost_map_[0][seg];
 
-        // calculate delay penalty
-        float min_delay = std::numeric_limits<float>::infinity(), max_delay = 0.f;
-        vtr::Point<int> min_location(0, 0), max_location(0, 0);
-        for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
-            for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
-                util::Cost_Entry& cost_entry = matrix[ix][iy];
-                if (cost_entry.valid()) {
-                    if (cost_entry.delay < min_delay) {
-                        min_delay = cost_entry.delay;
-                        min_location = vtr::Point<int>(ix, iy);
-                    }
-                    if (cost_entry.delay > max_delay) {
-                        max_delay = cost_entry.delay;
-                        max_location = vtr::Point<int>(ix, iy);
-                    }
-                }
-            }
-        }
-        float delay_penalty = (max_delay - min_delay) / static_cast<float>(std::max(1, manhattan_distance(max_location, min_location)));
+        // Penalty factor calculation for the current segment
+        float delay_penalty = this->get_penalty(matrix);
         penalty_[0][seg] = delay_penalty;
 
-        // find missing cost entries and fill them in by copying a nearby cost entry
-        std::vector<std::tuple<unsigned, unsigned, util::Cost_Entry>> missing;
-        bool couldnt_fill = false;
-        auto shifted_bounds = vtr::Rect<int>(0, 0, seg_bounds.width(), seg_bounds.height());
-        int max_fill = 0;
-        for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
-            for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
-                util::Cost_Entry& cost_entry = matrix[ix][iy];
-                if (!cost_entry.valid()) {
-                    // maximum search radius
-                    util::Cost_Entry filler;
-                    int distance;
-                    std::tie(filler, distance) = get_nearby_cost_entry(matrix, ix, iy, shifted_bounds);
-                    if (filler.valid()) {
-                        missing.push_back(std::make_tuple(ix, iy, penalize(filler, distance, delay_penalty)));
-                        max_fill = std::max(max_fill, distance);
-                    } else {
-                        couldnt_fill = true;
-                    }
-                }
-            }
-            if (couldnt_fill) {
-                // give up trying to fill an empty matrix
-                break;
-            }
-        }
-
-        if (!couldnt_fill) {
-            VTR_LOG("At %d: max_fill = %d, delay_penalty = %e\n", seg, max_fill, delay_penalty);
-        }
-
-        // write back the missing entries
-        for (auto& xy_entry : missing) {
-            matrix[std::get<0>(xy_entry)][std::get<1>(xy_entry)] = std::get<2>(xy_entry);
-        }
-
-        if (couldnt_fill) {
-            VTR_LOG_WARN("Couldn't fill holes in the cost matrix for %ld, %d x %d bounding box\n",
-                         seg, seg_bounds.width(), seg_bounds.height());
-            for (unsigned y = 0; y < matrix.dim_size(1); y++) {
-                for (unsigned x = 0; x < matrix.dim_size(0); x++) {
-                    VTR_ASSERT(!matrix[x][y].valid());
-                }
-            }
-        }
+        // Holes filling
+        this->fill_holes(matrix, seg, seg_bounds.width(), seg_bounds.height(), delay_penalty);
     }
 }
 
@@ -311,6 +376,11 @@ std::pair<util::Cost_Entry, int> CostMap::get_nearby_cost_entry(const vtr::NdMat
     return std::make_pair(fill, n);
 }
 
+/*
+ * The following static functions are used to have a fast read and write access to
+ * the cost map data structures, exploiting the capnp serialization.
+ */
+
 static void ToCostEntry(util::Cost_Entry* out, const VprCostEntry::Reader& in) {
     out->delay = in.getDelay();
     out->congestion = in.getCongestion();
diff --git a/vpr/src/route/router_lookahead_cost_map.h b/vpr/src/route/router_lookahead_cost_map.h
index b2620335498..90990c3d62b 100644
--- a/vpr/src/route/router_lookahead_cost_map.h
+++ b/vpr/src/route/router_lookahead_cost_map.h
@@ -5,26 +5,124 @@
 #include "router_lookahead_map_utils.h"
 #include "vtr_geometry.h"
 
-// Dense cost maps per source segment and destination chan types
+/**
+ * @brief Dense cost maps per source segment and destination nodes
+ *
+ * This class is intended to collect all methods and members that act on the lookahead cost map.
+ * All methods that modify or access the cost maps should belong to this class.
+ *
+ * The main cost map, containing the cost entries to reach a specific destination in the device is indexed as follows:
+ *
+ * cost_map_[0][segment_index][delta_x][delta_y]
+ *
+ * Originally, the first dimension was used to differentiate the lookahead between CHANX and CHANY nodes. It turns out
+ * that this distinction leads to worse results when using the extended map lookahead, therefore, this dimension of the cost map
+ * has been collapsed, therefore this first dimention is virtually unused, but is kept to have the extended map lookahead as close
+ * as possible to the original extended map.
+ */
 class CostMap {
   public:
+    /**
+     * @briefSets the total number of segments
+     *
+     * @param seg_count total segment count.
+     */
     void set_counts(size_t seg_count);
+
+    /**
+     * @brief Builds a node to segments map for a fast lookup
+     */
     void build_segment_map();
+
+    /**
+     * @brief Queries the segment map to get the segment index given the corresponding node index
+     *
+     * @param from_node_ind index of the node to search in the segment map
+     * @return The index of the segment corresponding to the input node id
+     */
     int node_to_segment(int from_node_ind) const;
+
+    /**
+     * @brief Queries the cost map to get a lookahead delay cost given the segment index and the x/y deltas to the destination node
+     *
+     * @param from_seg_index segment index to query the cost map corresponding to the current segment
+     * @param delta_x x delta value that is the distance between the current node to the target node
+     * @param delta_y y delta value that is the distance between the current node to the target node
+     * @return The cost entry corresponding to the current segment and the delta x,y.
+     */
     util::Cost_Entry find_cost(int from_seg_index, int delta_x, int delta_y) const;
+
+    /**
+     * @brief Sets the cost map given the delay and congestion routing costs
+     *
+     * @param delay_costs delay costs calculated with the dijkstra expansions
+     * @param base_costs congestion costs calculated with the dijkstra expansions
+     */
     void set_cost_map(const util::RoutingCosts& delay_costs, const util::RoutingCosts& base_costs);
+
+    /**
+     * @brief Gets the cost of an entry that is close to a hole in the cost map
+     *
+     * @param matrix cost map corresponding to a specific segment index
+     * @param cx x location of the cost map entry that needs to be filled
+     * @param cy y location of the cost map entry that needs to be filled
+     * @param bounds bounds of the cost map corresponding to the current segment
+     * @return A pair containing the nearby cost entry and the distance from the current location to the narby cost entry found.
+     */
     std::pair<util::Cost_Entry, int> get_nearby_cost_entry(const vtr::NdMatrix<util::Cost_Entry, 2>& matrix, int cx, int cy, const vtr::Rect<int>& bounds);
+
+    /**
+     * @brief Reads the lookahead file
+     *
+     * @param file input lookahead file
+     */
     void read(const std::string& file);
+
+    /**
+     * @brief Writes the lookahead file
+     *
+     * @param file output lookahead file
+     */
     void write(const std::string& file) const;
+
     void print(int iseg) const;
     std::vector<std::pair<int, int>> list_empty() const;
 
   private:
-    vtr::Matrix<vtr::Matrix<util::Cost_Entry>> cost_map_;
-    vtr::Matrix<std::pair<int, int>> offset_;
-    vtr::Matrix<float> penalty_;
-    std::vector<int> segment_map_;
-    size_t seg_count_;
+    vtr::Matrix<vtr::Matrix<util::Cost_Entry>> cost_map_; ///<Cost map containing all the costs computed during the lookahead generation.
+                                                          ///<It is indexed as follows: cost_map_[0][segment_index][delta_x][delta_y]
+                                                          ///<The first index is always 0 and it is kept to allow future specializations of
+                                                          ///<the cost map based on other possible indices.
+
+    vtr::Matrix<std::pair<int, int>> offset_; ///<Offset to specify the bounds of a specific segment map.
+                                              ///<It is used to adjust delta values to the segment bounding box.
+                                              ///<The offset map is addressed as follows: offset_[0][segment_index]
+                                              ///<The values of the matrix are pairs of corresponding to X and Y offsets.
+
+    vtr::Matrix<float> penalty_; ///<Penalty value corresponding to each segment type and used to penalize
+                                 ///<delta locations that fall outside of a segment's bounding box.
+                                 ///<The penalty map is addressed as follows penalty_[0][segment_index]
+
+    std::vector<int> segment_map_; ///<Segment map to lookup to the segment index given the node index
+
+    size_t seg_count_; ///<Total segment count in the architecture
+
+    /**
+     * @brief Get penalty delay for a segment type
+     * @param matrix cost map for a specific segment type
+     * @return Penalty delay value
+     */
+    float get_penalty(vtr::NdMatrix<util::Cost_Entry, 2>& matrix) const;
+
+    /**
+     * @brief Fills the holes in the cost map matrix
+     * @param matrix cost map for a specific segment type
+     * @param seg_index index of the current segment
+     * @param bounding_box_width width of the segment type cost map bounding box
+     * @param bounding_box_height height of the segment type cost map bounding box
+     * @param delay_penalty penalty corresponding to the current segment cost map
+     */
+    void fill_holes(vtr::NdMatrix<util::Cost_Entry, 2>& matrix, int seg_index, int bounding_box_width, int bounding_box_height, float delay_penalty);
 };
 
 #endif

From 894ec9f7e8c43c59fc72759e827255f4e1d0b1a9 Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Thu, 3 Sep 2020 19:17:25 +0200
Subject: [PATCH 05/18] extended_lookahead: fix SRC/OPIN delay

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 .../route/router_lookahead_extended_map.cpp   | 161 ++++++++----------
 vpr/src/route/router_lookahead_extended_map.h |   2 +-
 2 files changed, 70 insertions(+), 93 deletions(-)

diff --git a/vpr/src/route/router_lookahead_extended_map.cpp b/vpr/src/route/router_lookahead_extended_map.cpp
index b8726f7a3b9..1d6c623a7ce 100644
--- a/vpr/src/route/router_lookahead_extended_map.cpp
+++ b/vpr/src/route/router_lookahead_extended_map.cpp
@@ -45,6 +45,7 @@
 // Don't continue storing a path after hitting a lower-or-same cost entry.
 static constexpr bool BREAK_ON_MISS = false;
 
+///@brief Threshold to set a limit the paths exploration during the dijkstra expansion for a particualr sample region.
 static constexpr int MIN_PATH_COUNT = 1000;
 
 template<typename Entry>
@@ -53,94 +54,76 @@ static std::pair<float, int> run_dijkstra(int start_node_ind,
                                           std::vector<util::Search_Path>* paths,
                                           util::RoutingCosts* routing_costs);
 
-template<class T>
-constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
-    return std::min(std::max(v, lo), hi);
-}
-
-template<typename T>
-static vtr::Point<T> closest_point_in_rect(const vtr::Rect<T>& r, const vtr::Point<T>& p) {
-    if (r.empty()) {
-        return vtr::Point<T>(0, 0);
-    } else {
-        return vtr::Point<T>(clamp<T>(p.x(), r.xmin(), r.xmax() - 1),
-                             clamp<T>(p.y(), r.ymin(), r.ymax() - 1));
-    }
-}
-
-std::pair<float, float> ExtendedMapLookahead::get_src_opin_delays(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const {
+float ExtendedMapLookahead::get_src_opin_cost(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    e_rr_type from_type = rr_graph.node_type(from_node);
-    if (from_type == SOURCE || from_type == OPIN) {
-        //When estimating costs from a SOURCE/OPIN we look-up to find which wire types (and the
-        //cost to reach them) in f_src_opin_delays. Once we know what wire types are
-        //reachable, we query the f_wire_cost_map (i.e. the wire lookahead) to get the final
-        //delay to reach the sink.
-
-        t_physical_tile_type_ptr tile_type = device_ctx.grid[rr_graph.node_xlow(from_node)][rr_graph.node_ylow(from_node)].type;
-        auto tile_index = std::distance(&device_ctx.physical_tile_types[0], tile_type);
-
-        auto from_ptc = rr_graph.node_ptc_num(from_node);
-
-        if (this->src_opin_delays[tile_index][from_ptc].empty()) {
-            //During lookahead profiling we were unable to find any wires which connected
-            //to this PTC.
-            //
-            //This can sometimes occur at very low channel widths (e.g. during min W search on
-            //small designs) where W discretization combined with fraction Fc may cause some
-            //pins/sources to be left disconnected.
-            //
-            //Such RR graphs are of course unroutable, but that should be determined by the
-            //router. So just return an arbitrary value here rather than error.
-
-            return std::pair<float, float>(0.f, 0.f);
-        } else {
-            //From the current SOURCE/OPIN we look-up the wiretypes which are reachable
-            //and then add the estimates from those wire types for the distance of interest.
-            //If there are multiple options we use the minimum value.
-
-            float delay = 0;
-            float congestion = 0;
-            float expected_cost = std::numeric_limits<float>::infinity();
-
-            for (const auto& kv : this->src_opin_delays[tile_index][from_ptc]) {
-                const util::t_reachable_wire_inf& reachable_wire_inf = kv.second;
-
-                util::Cost_Entry cost_entry;
-                if (reachable_wire_inf.wire_rr_type == SINK) {
-                    //Some pins maybe reachable via a direct (OPIN -> IPIN) connection.
-                    //In the lookahead, we treat such connections as 'special' wire types
-                    //with no delay/congestion cost
-                    cost_entry.delay = 0;
-                    cost_entry.congestion = 0;
-                } else {
-                    //For an actual accessible wire, we query the wire look-up to get it's
-                    //delay and congestion cost estimates
-                    cost_entry = cost_map_.find_cost(reachable_wire_inf.wire_seg_index, delta_x, delta_y);
-                }
-
-                float this_cost = (criticality_fac) * (reachable_wire_inf.delay + cost_entry.delay)
-                                  + (1. - criticality_fac) * (reachable_wire_inf.congestion + cost_entry.congestion);
-
-                if (this_cost < expected_cost) {
-                    delay = reachable_wire_inf.delay;
-                    congestion = reachable_wire_inf.congestion;
-                }
+    //When estimating costs from a SOURCE/OPIN we look-up to find which wire types (and the
+    //cost to reach them) in f_src_opin_delays. Once we know what wire types are
+    //reachable, we query the f_wire_cost_map (i.e. the wire lookahead) to get the final
+    //delay to reach the sink.
+
+    t_physical_tile_type_ptr tile_type = device_ctx.grid[rr_graph.node_xlow(from_node)][rr_graph.node_ylow(from_node)].type;
+    auto tile_index = std::distance(&device_ctx.physical_tile_types[0], tile_type);
+
+    auto from_ptc = rr_graph.node_ptc_num(from_node);
+
+    if (this->src_opin_delays[tile_index][from_ptc].empty()) {
+        //During lookahead profiling we were unable to find any wires which connected
+        //to this PTC.
+        //
+        //This can sometimes occur at very low channel widths (e.g. during min W search on
+        //small designs) where W discretization combined with fraction Fc may cause some
+        //pins/sources to be left disconnected.
+        //
+        //Such RR graphs are of course unroutable, but that should be determined by the
+        //router. So just return an arbitrary value here rather than error.
+
+        //We choose to return the largest (non-infinite) value possible, but scaled
+        //down by a large factor to maintain some dynaimc range in case this value ends
+        //up being processed (e.g. by the timing analyzer).
+        //
+        //The cost estimate should still be *extremely* large compared to a typical delay, and
+        //so should ensure that the router de-prioritizes exploring this path, but does not
+        //forbid the router from trying.
+        return std::numeric_limits<float>::max() / 1e12;
+    } else {
+        //From the current SOURCE/OPIN we look-up the wiretypes which are reachable
+        //and then add the estimates from those wire types for the distance of interest.
+        //If there are multiple options we use the minimum value.
+        float expected_cost = std::numeric_limits<float>::infinity();
+
+        for (const auto& kv : this->src_opin_delays[tile_index][from_ptc]) {
+            const util::t_reachable_wire_inf& reachable_wire_inf = kv.second;
+
+            util::Cost_Entry cost_entry;
+            if (reachable_wire_inf.wire_rr_type == SINK) {
+                //Some pins maybe reachable via a direct (OPIN -> IPIN) connection.
+                //In the lookahead, we treat such connections as 'special' wire types
+                //with no delay/congestion cost
+                cost_entry.delay = 0;
+                cost_entry.congestion = 0;
+            } else {
+                //For an actual accessible wire, we query the wire look-up to get it's
+                //delay and congestion cost estimates
+                cost_entry = cost_map_.find_cost(reachable_wire_inf.wire_seg_index, delta_x, delta_y);
             }
 
-            return std::pair<float, float>(delay, congestion);
+            float this_cost = (criticality_fac) * (reachable_wire_inf.congestion + cost_entry.congestion)
+                              + (1. - criticality_fac) * (reachable_wire_inf.delay + cost_entry.delay);
+
+            expected_cost = std::min(this_cost, expected_cost);
         }
 
-        VTR_ASSERT_SAFE_MSG(false,
-                            vtr::string_fmt("Lookahead failed to estimate cost from %s: %s",
-                                            rr_node_arch_name(size_t(from_node)).c_str(),
-                                            describe_rr_node(size_t(from_node)).c_str())
-                                .c_str());
+        VTR_ASSERT(std::isfinite(expected_cost));
+        return expected_cost;
     }
 
-    return std::pair<float, float>(0.f, 0.f);
+    VTR_ASSERT_SAFE_MSG(false,
+                        vtr::string_fmt("Lookahead failed to estimate cost from %s: %s",
+                                        rr_node_arch_name(size_t(from_node)).c_str(),
+                                        describe_rr_node(size_t(from_node)).c_str())
+                            .c_str());
 }
 
 float ExtendedMapLookahead::get_chan_ipin_delays(RRNodeId to_node) const {
@@ -186,8 +169,10 @@ float ExtendedMapLookahead::get_map_cost(int from_node_ind,
     dx = to_x - from_x;
     dy = to_y - from_y;
 
-    float extra_delay, extra_congestion;
-    std::tie(extra_delay, extra_congestion) = this->get_src_opin_delays(from_node, dx, dy, criticality_fac);
+    e_rr_type from_type = rr_graph.node_type(from_node);
+    if (from_type == SOURCE || from_type == OPIN) {
+        return this->get_src_opin_cost(from_node, dx, dy, criticality_fac);
+    }
 
     int from_seg_index = cost_map_.node_to_segment(from_node_ind);
     util::Cost_Entry cost_entry = cost_map_.find_cost(from_seg_index, dx, dy);
@@ -201,8 +186,8 @@ float ExtendedMapLookahead::get_map_cost(int from_node_ind,
         return std::numeric_limits<float>::infinity();
     }
 
-    float expected_delay = cost_entry.delay + extra_delay;
-    float expected_congestion = cost_entry.congestion + extra_congestion;
+    float expected_delay = cost_entry.delay;
+    float expected_congestion = cost_entry.congestion;
 
     float site_pin_delay = this->get_chan_ipin_delays(to_node);
     expected_delay += site_pin_delay;
@@ -435,14 +420,6 @@ void ExtendedMapLookahead::compute(const std::vector<t_segment_inf>& segment_inf
                         path_count / run_timer.elapsed_sec());
             }
 
-            /*
-             * if (path_count == 0) {
-             * for (auto node_ind : point.nodes) {
-             * VTR_LOG("Expanded node %s\n", describe_rr_node(node_ind).c_str());
-             * }
-             * }
-             */
-
             total_path_count += path_count;
             if (total_path_count > MIN_PATH_COUNT) {
                 break;
@@ -528,7 +505,7 @@ float ExtendedMapLookahead::get_expected_cost(
 
     t_rr_type rr_type = device_ctx.rr_nodes[current_node].type();
 
-    if (rr_type == CHANX || rr_type == CHANY) {
+    if (rr_type == CHANX || rr_type == CHANY || rr_type == SOURCE || rr_type == OPIN) {
         return get_map_cost(
             current_node, target_node, params.criticality);
     } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */
diff --git a/vpr/src/route/router_lookahead_extended_map.h b/vpr/src/route/router_lookahead_extended_map.h
index dd0e7fb2519..413d5b7ecaf 100644
--- a/vpr/src/route/router_lookahead_extended_map.h
+++ b/vpr/src/route/router_lookahead_extended_map.h
@@ -17,7 +17,7 @@ class ExtendedMapLookahead : public RouterLookahead {
     //Look-up table from CHANX/CHANY to SINK/IPIN of various types
     util::t_chan_ipins_delays chan_ipins_delays;
 
-    std::pair<float, float> get_src_opin_delays(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const;
+    float get_src_opin_cost(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const;
     float get_chan_ipin_delays(RRNodeId to_node) const;
 
     template<typename Entry>

From 2472366de629b9f746048a55232f6f58c5ae136f Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Fri, 4 Sep 2020 13:41:33 +0200
Subject: [PATCH 06/18] extended_map: added in-code comments

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 .../route/router_lookahead_extended_map.cpp   | 57 +++++++++++++++++--
 vpr/src/route/router_lookahead_extended_map.h | 40 +++++++++++--
 2 files changed, 85 insertions(+), 12 deletions(-)

diff --git a/vpr/src/route/router_lookahead_extended_map.cpp b/vpr/src/route/router_lookahead_extended_map.cpp
index 1d6c623a7ce..7ece5359198 100644
--- a/vpr/src/route/router_lookahead_extended_map.cpp
+++ b/vpr/src/route/router_lookahead_extended_map.cpp
@@ -45,7 +45,7 @@
 // Don't continue storing a path after hitting a lower-or-same cost entry.
 static constexpr bool BREAK_ON_MISS = false;
 
-///@brief Threshold to set a limit the paths exploration during the dijkstra expansion for a particualr sample region.
+///@brief Threshold indicating the minimum number of paths to sample for each point in a sample region
 static constexpr int MIN_PATH_COUNT = 1000;
 
 template<typename Entry>
@@ -64,7 +64,7 @@ float ExtendedMapLookahead::get_src_opin_cost(RRNodeId from_node, int delta_x, i
     //delay to reach the sink.
 
     t_physical_tile_type_ptr tile_type = device_ctx.grid[rr_graph.node_xlow(from_node)][rr_graph.node_ylow(from_node)].type;
-    auto tile_index = std::distance(&device_ctx.physical_tile_types[0], tile_type);
+    auto tile_index = tile_type->index;
 
     auto from_ptc = rr_graph.node_ptc_num(from_node);
 
@@ -146,6 +146,13 @@ float ExtendedMapLookahead::get_chan_ipin_delays(RRNodeId to_node) const {
 }
 
 // derive a cost from the map between two nodes
+// The criticality factor can range between 0 and 1 and weights the delay versus the congestion costs:
+//      - value == 0    : congestion is the only cost factor that gets considered
+//      - 0 < value < 1 : different weight on both delay and congestion
+//      - value == 1    : delay is the only cost factor that gets considered
+//
+//  The from_node_ind can be of one of the following types: CHANX, CHANY, SOURCE, OPIN
+//  The to_node_ind is always a SINK
 float ExtendedMapLookahead::get_map_cost(int from_node_ind,
                                          int to_node_ind,
                                          float criticality_fac) const {
@@ -218,7 +225,12 @@ float ExtendedMapLookahead::get_map_cost(int from_node_ind,
     return expected_cost;
 }
 
-// add a best cost routing path from start_node_ind to node_ind to routing costs
+// Adds a best cost routing path from start_node_ind to node_ind to routing costs
+//
+// This routine performs a backtrace from a destination node to the starting node of each path found during the dijkstra expansion.
+//
+// The Routing Costs table is updated for every partial path found between the destination and start nodes, only if the cost to reach
+// the intermediate node is lower than the one already stored in the Routing Costs table.
 template<typename Entry>
 bool ExtendedMapLookahead::add_paths(int start_node_ind,
                                      Entry current,
@@ -242,6 +254,11 @@ bool ExtendedMapLookahead::add_paths(int start_node_ind,
     }
     path.push_back(start_node_ind);
 
+    // The site_pin_delay gets subtracted from the current path as it might prevent the addition
+    // of the path to the final routing costs.
+    //
+    // The site pin delay (or CHAN -> IPIN delay) is a constant adjustment that is added when querying
+    // the lookahead.
     current.adjust_Tsw(-site_pin_delay);
 
     // add each node along the path subtracting the incremental costs from the current costs
@@ -282,12 +299,21 @@ bool ExtendedMapLookahead::add_paths(int start_node_ind,
         VTR_ASSERT(cost >= 0.f);
 
         // NOTE: implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST
+        // This updates the Cost Entry relative to the current key.
+        // The unique key is composed by:
+        //      - segment_id
+        //      - delta_x/y
         auto result = routing_costs->insert(std::make_pair(key, cost));
         if (!result.second) {
             if (cost < result.first->second) {
                 result.first->second = cost;
                 new_sample_found = true;
             } else if (BREAK_ON_MISS) {
+                // This is an experimental feature to reduce CPU time.
+                // Storing all partial paths generates a lot of extra data and map lookups and, if BREAK_ON_MISS is active,
+                // the search for a minimum sample is stopped whenever a better partial path is not found.
+                //
+                // This can potentially prevent this routine to find the minimal sample though.
                 break;
             }
         } else {
@@ -379,10 +405,10 @@ void ExtendedMapLookahead::compute(const std::vector<t_segment_inf>& segment_inf
     util::RoutingCosts all_base_costs;
 
     /* run Dijkstra's algorithm for each segment type & channel type combination */
-#if defined(VPR_USE_TBB)
+#if defined(VPR_USE_TBB) // Run parallely
     tbb::mutex all_costs_mutex;
     tbb::parallel_for_each(sample_regions, [&](const SampleRegion& region) {
-#else
+#else // Run serially
     for (const auto& region : sample_regions) {
 #endif
         // holds the cost entries for a run
@@ -392,6 +418,11 @@ void ExtendedMapLookahead::compute(const std::vector<t_segment_inf>& segment_inf
         std::vector<bool> node_expanded(device_ctx.rr_nodes.size());
         std::vector<util::Search_Path> paths(device_ctx.rr_nodes.size());
 
+        // Each point in a sample region contains a set of nodes. Each node becomes a starting node
+        // for the dijkstra expansions, and different paths are explored to reach different locations.
+        //
+        // If the nodes get exhausted or the minimum number of paths is reached (set by MIN_PATH_COUNT)
+        // the routine exits and the costs added to the collection of all the costs found so far.
         for (auto& point : region.points) {
             // statistics
             vtr::Timer run_timer;
@@ -399,6 +430,13 @@ void ExtendedMapLookahead::compute(const std::vector<t_segment_inf>& segment_inf
             float max_base_cost = 0.f;
             int path_count = 0;
             for (auto node_ind : point.nodes) {
+                // For each starting node, two different expansions are performed, one to find minimum delay
+                // and the second to find minimum base costs.
+                //
+                // NOTE: Doing two separate dijkstra expansions, each finding a minimum value leads to have an optimistic lookahead,
+                //       given that delay and base costs may not be simultaneously achievemnts.
+                //       Experiments have shown that the having two separate expansions lead to better results for Series 7 devices, but
+                //       this might not be true for Stratix ones.
                 {
                     auto result = run_dijkstra<util::PQ_Entry_Delay>(node_ind, &node_expanded, &paths, &delay_costs);
                     max_delay_cost = std::max(max_delay_cost, result.first);
@@ -427,6 +465,13 @@ void ExtendedMapLookahead::compute(const std::vector<t_segment_inf>& segment_inf
         }
 
 #if defined(VPR_USE_TBB)
+        // The mutex locks the common data structures that each worker uses to store the routing
+        // expansion data.
+        //
+        // This because delay_costs and base_costs are in the worker's scope, so they can be run parallely and
+        // no data is shared among workers.
+        // Instead, all_delay_costs and all_base_costs is a shared object between all the workers, hence the
+        // need of a mutex when modifying their entries.
         all_costs_mutex.lock();
 #endif
 
@@ -505,7 +550,7 @@ float ExtendedMapLookahead::get_expected_cost(
 
     t_rr_type rr_type = device_ctx.rr_nodes[current_node].type();
 
-    if (rr_type == CHANX || rr_type == CHANY || rr_type == SOURCE || rr_type == OPIN) {
+    if (rr_type == CHANX || rr_type == CHANY) {
         return get_map_cost(
             current_node, target_node, params.criticality);
     } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */
diff --git a/vpr/src/route/router_lookahead_extended_map.h b/vpr/src/route/router_lookahead_extended_map.h
index 413d5b7ecaf..0af50581adf 100644
--- a/vpr/src/route/router_lookahead_extended_map.h
+++ b/vpr/src/route/router_lookahead_extended_map.h
@@ -11,13 +11,27 @@
 // Implementation of RouterLookahead based on source segment and destination connection box types
 class ExtendedMapLookahead : public RouterLookahead {
   private:
-    //Look-up table from SOURCE/OPIN to CHANX/CHANY of various types
+    ///<Look-up table from SOURCE/OPIN to CHANX/CHANY of various types
     util::t_src_opin_delays src_opin_delays;
 
-    //Look-up table from CHANX/CHANY to SINK/IPIN of various types
+    ///<Look-up table from CHANX/CHANY to SINK/IPIN of various types
     util::t_chan_ipins_delays chan_ipins_delays;
 
+    /**
+     * @brief Returns the cost to get to a input delta_x/y location when starting from a SOURCE or OPIN node
+     * @param from_node starting node
+     * @param delta_x x delta value that is the distance between the current node to the target node 
+     * @param delta_y y delta value that is the distance between the current node to the target node 
+     * @param criticality_fac criticality of the current connection
+     * @return expected cost to get to the destination
+     */
     float get_src_opin_cost(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const;
+
+    /**
+     * @brief Returns the CHAN -> IPIN delay that gets added to the final expected delay
+     * @param to_node target node for which we query the IPIN delay
+     * @return IPIN delay relative to the input destination node
+     */
     float get_chan_ipin_delays(RRNodeId to_node) const;
 
     template<typename Entry>
@@ -32,15 +46,29 @@ class ExtendedMapLookahead : public RouterLookahead {
                                        std::vector<util::Search_Path>* paths,
                                        util::RoutingCosts* routing_costs);
 
+    float get_map_cost(int from_node_ind, int to_node_ind, float criticality_fac) const;
+
+    CostMap cost_map_; ///<Cost map containing all data to extract the entry cost when querying the lookahead.
   public:
+    /**
+     * @brief Returns the expected cost to get to a destination node
+     */
     float get_expected_cost(int node, int target_node, const t_conn_cost_params& params, float R_upstream) const override;
-    float get_map_cost(int from_node_ind, int to_node_ind, float criticality_fac) const;
+   
+    /**
+     * @brief Computes the extended lookahead map 
+     */
     void compute(const std::vector<t_segment_inf>& segment_inf) override;
-
+    
+    /**
+     * @brief Reads the extended lookahead map 
+     */
     void read(const std::string& file) override;
+    
+    /**
+     * @brief Writes the extended lookahead map 
+     */
     void write(const std::string& file) const override;
-
-    CostMap cost_map_;
 };
 
 #endif

From c2546274615bb86c5e85205e90701391b8feadcb Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Fri, 4 Sep 2020 14:56:27 +0200
Subject: [PATCH 07/18] lookahead_map_utils: added comments

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 vpr/src/route/router_lookahead_map_utils.cpp | 38 +++++++++++++++++-
 vpr/src/route/router_lookahead_map_utils.h   | 42 +++++++++++++++-----
 2 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/vpr/src/route/router_lookahead_map_utils.cpp b/vpr/src/route/router_lookahead_map_utils.cpp
index e01ebba26bd..dc35afef09d 100644
--- a/vpr/src/route/router_lookahead_map_utils.cpp
+++ b/vpr/src/route/router_lookahead_map_utils.cpp
@@ -1,5 +1,17 @@
 #include "router_lookahead_map_utils.h"
 
+/*
+ * This file contains utility functions that can be shared among different
+ * lookahead computation strategies.
+ *
+ * In general, this utility library contains:
+ *
+ * - Different dijkstra expansion alogrithms used to perform specific tasks, such as computing the SROURCE/OPIN --> CHAN lookup tables
+ * - Cost Entries definitions used when generating and querying the lookahead
+ *
+ * To access the utility functions, the util namespace needs to be used.
+ */
+
 #include "globals.h"
 #include "vpr_context.h"
 #include "vtr_math.h"
@@ -12,11 +24,30 @@ static void dijkstra_flood_to_ipins(RRNodeId node, util::t_chan_ipins_delays& ch
 
 static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point<int> start);
 
-// Constants needed to reduce the bounding box when expanding CHAN wires to reach the IPINs
+// Constants needed to reduce the bounding box when expanding CHAN wires to reach the IPINs.
+// These are used when finding all the delays to get to the IPINs of all the different tile types
+// of the device.
+//
+// The node expansions to get to the center of the bounding box (where the IPINs are located) start from
+// the edge of the bounding box defined by these constants.
+//
+// E.g.: IPIN locations is found at (3,5). A bounding box around (3, 5) is generated with the following
+//       corners.
+//          x_min: 1 (x - X_OFFSET)
+//          x_max: 5 (x + X_OFFSET)
+//          y_min: 3 (y - Y_OFFSET)
+//          y_max: 7 (y + Y_OFFSET)
 #define X_OFFSET 2
 #define Y_OFFSET 2
 
 // Maximum dijkstra expansions when exploring the CHAN --> IPIN connections
+// This sets a limit on the dijkstra expansions to reach an IPIN connection. Given that we
+// want to have the data on the last delay to get to an IPIN, we do not want to have an unbounded
+// number of hops to get to the IPIN, as this would result in high run-times.
+//
+// E.g.: if the constant value is set to 2, the following expansions are perfomed:
+//          - CHANX --> CHANX --> exploration interrupted: Maximum expansion level reached
+//          - CHANX --> IPIN --> exploration interrupted: IPIN found, no need to expand further
 #define MAX_EXPANSION_LEVEL 1
 
 #define DIRECT_CONNECT_SPECIAL_SEG_TYPE -1;
@@ -448,7 +479,10 @@ static void dijkstra_flood_to_wires(int itile, RRNodeId node, util::t_src_opin_d
                 //This is a direct-connect path between an IPIN and OPIN,
                 //which terminated at a SINK.
                 //
-                //We treat this as a 'special' wire type
+                //We treat this as a 'special' wire type.
+                //When the src_opin_delays data structure is queried and a SINK rr_type
+                //is found, the lookahead is not accessed to get the cost entries
+                //as this is a special case of direct connections between OPIN and IPIN.
                 seg_index = DIRECT_CONNECT_SPECIAL_SEG_TYPE;
             }
 
diff --git a/vpr/src/route/router_lookahead_map_utils.h b/vpr/src/route/router_lookahead_map_utils.h
index 56b2d564d96..5de54d96a76 100644
--- a/vpr/src/route/router_lookahead_map_utils.h
+++ b/vpr/src/route/router_lookahead_map_utils.h
@@ -4,7 +4,7 @@
  * The router lookahead provides an estimate of the cost from an intermediate node to the target node
  * during directed (A*-like) routing.
  *
- * The VPR 7.0 lookahead (route/route_timing.c ==> get_timing_driven_expected_cost) lower-bounds the remaining delay and
+ * The VPR 7.0 classic lookahead (route/route_timing.c ==> get_timing_driven_expected_cost) lower-bounds the remaining delay and
  * congestion by assuming that a minimum number of wires, of the same type as the current node being expanded, can be used
  * to complete the route. While this method is efficient, it can run into trouble with architectures that use
  * multiple interconnected wire types.
@@ -37,13 +37,15 @@ enum e_representative_entry_method {
     MEDIAN
 };
 
-/* f_cost_map is an array of these cost entries that specifies delay/congestion estimates
- * to travel relative x/y distances */
+/**
+ * @brief f_cost_map is an array of these cost entries that specifies delay/congestion estimates to travel relative x/y distances
+ */
 class Cost_Entry {
   public:
-    float delay;
-    float congestion;
-    bool fill;
+    float delay; ///<Delay value of the cost entry
+    float congestion; ///<Base cost value of the cost entry
+    bool fill; ///<Boolean specifying whether this Entry was created as a result of the cost map
+               ///<holes filling procedure
 
     Cost_Entry() {
         delay = std::numeric_limits<float>::infinity();
@@ -63,12 +65,21 @@ class Cost_Entry {
     }
 };
 
-/* a class that stores delay/congestion information for a given relative coordinate during the Dijkstra expansion.
- * since it stores multiple cost entries, it is later boiled down to a single representative cost entry to be stored
- * in the final lookahead cost map */
+/** 
+ * @brief a class that stores delay/congestion information for a given relative coordinate during the Dijkstra expansion.
+ * 
+ * Since it stores multiple cost entries, it is later boiled down to a single representative cost entry to be stored
+ * in the final lookahead cost map
+ *
+ * This class is used for the lookahead map implementation only
+ */
 class Expansion_Cost_Entry {
   private:
-    std::vector<Cost_Entry> cost_vector;
+    std::vector<Cost_Entry> cost_vector; ///<This vector is used to store different cost entries that are treated
+                                         ///<differently based on the below representative cost entry method computations
+                                         ///<Currently, only the smallest entry is stored in the vector, but this prepares
+                                         ///<the ground for other possible representative cost entry calculation methods.
+                                         ///<The vector is filled for a specific (chan_index, segment_index, delta_x, delta_y) cost entry.
 
     Cost_Entry get_smallest_entry() const;
     Cost_Entry get_average_entry() const;
@@ -250,6 +261,12 @@ struct t_reachable_wire_inf {
 // physical block type index                   |             Reachable wire info
 //                                             |
 //                                             SOURCE/OPIN ptc
+//
+// This data structure stores a set of delay and congestion values for each wire that can be reached from a given
+// SOURCE/OPIN of a given tile type.
+//
+// When querying this data structure, the minimum cost is computed for each delay/congestion pair, and returned
+// as the lookahead expected cost.
 typedef std::vector<std::vector<std::map<int, t_reachable_wire_inf>>> t_src_opin_delays;
 
 //[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1]
@@ -257,7 +274,10 @@ typedef std::vector<std::vector<std::map<int, t_reachable_wire_inf>>> t_src_opin
 // |                                           |
 // physical block type index                   |
 //                                             |
-//                                             SINK/IPIN ptc
+//                                             SINK ptc
+//
+// This data structure stores the minimum delay to reach a specific SINK from the last connection between the wire (CHANX/CHANY)
+// and the tile's IPIN. If there are many connections to the same IPIN, the one with the minimum delay is selected.
 typedef std::vector<std::vector<t_reachable_wire_inf>> t_chan_ipins_delays;
 
 t_src_opin_delays compute_router_src_opin_lookahead();

From b3e6f83ac28bc8e9733ddb0333818762141a6b9a Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Fri, 4 Sep 2020 16:55:03 +0200
Subject: [PATCH 08/18] lookahead: wider usage of RRNodeId where possible

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 .../route/router_lookahead_extended_map.cpp   | 72 +++++++++----------
 vpr/src/route/router_lookahead_extended_map.h | 22 +++---
 vpr/src/route/router_lookahead_map_utils.cpp  | 32 ++++-----
 vpr/src/route/router_lookahead_map_utils.h    | 20 +++---
 vpr/src/route/router_lookahead_sampling.cpp   | 42 ++++++-----
 vpr/src/route/router_lookahead_sampling.h     |  2 +-
 6 files changed, 98 insertions(+), 92 deletions(-)

diff --git a/vpr/src/route/router_lookahead_extended_map.cpp b/vpr/src/route/router_lookahead_extended_map.cpp
index 7ece5359198..5c1b7232253 100644
--- a/vpr/src/route/router_lookahead_extended_map.cpp
+++ b/vpr/src/route/router_lookahead_extended_map.cpp
@@ -49,7 +49,7 @@ static constexpr bool BREAK_ON_MISS = false;
 static constexpr int MIN_PATH_COUNT = 1000;
 
 template<typename Entry>
-static std::pair<float, int> run_dijkstra(int start_node_ind,
+static std::pair<float, int> run_dijkstra(RRNodeId start_node,
                                           std::vector<bool>* node_expanded,
                                           std::vector<util::Search_Path>* paths,
                                           util::RoutingCosts* routing_costs);
@@ -151,21 +151,18 @@ float ExtendedMapLookahead::get_chan_ipin_delays(RRNodeId to_node) const {
 //      - 0 < value < 1 : different weight on both delay and congestion
 //      - value == 1    : delay is the only cost factor that gets considered
 //
-//  The from_node_ind can be of one of the following types: CHANX, CHANY, SOURCE, OPIN
-//  The to_node_ind is always a SINK
-float ExtendedMapLookahead::get_map_cost(int from_node_ind,
-                                         int to_node_ind,
+//  The from_node can be of one of the following types: CHANX, CHANY, SOURCE, OPIN
+//  The to_node is always a SINK
+float ExtendedMapLookahead::get_map_cost(RRNodeId from_node,
+                                         RRNodeId to_node,
                                          float criticality_fac) const {
-    if (from_node_ind == to_node_ind) {
+    if (from_node == to_node) {
         return 0.f;
     }
 
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    RRNodeId to_node(to_node_ind);
-    RRNodeId from_node(from_node_ind);
-
     int from_x = rr_graph.node_xlow(from_node);
     int from_y = rr_graph.node_ylow(from_node);
 
@@ -181,15 +178,15 @@ float ExtendedMapLookahead::get_map_cost(int from_node_ind,
         return this->get_src_opin_cost(from_node, dx, dy, criticality_fac);
     }
 
-    int from_seg_index = cost_map_.node_to_segment(from_node_ind);
+    int from_seg_index = cost_map_.node_to_segment(size_t(from_node));
     util::Cost_Entry cost_entry = cost_map_.find_cost(from_seg_index, dx, dy);
 
     if (!cost_entry.valid()) {
         // there is no route
         VTR_LOGV_DEBUG(f_router_debug,
                        "Not connected %d (%s, %d) -> %d (%s)\n",
-                       from_node_ind, device_ctx.rr_nodes[from_node_ind].type_string(), from_seg_index,
-                       to_node_ind, device_ctx.rr_nodes[to_node_ind].type_string());
+                       size_t(from_node), device_ctx.rr_nodes[size_t(from_node)].type_string(), from_seg_index,
+                       size_t(to_node), device_ctx.rr_nodes[size_t(to_node)].type_string());
         return std::numeric_limits<float>::infinity();
     }
 
@@ -201,7 +198,7 @@ float ExtendedMapLookahead::get_map_cost(int from_node_ind,
 
     float expected_cost = criticality_fac * expected_delay + (1.0 - criticality_fac) * expected_congestion;
 
-    VTR_LOGV_DEBUG(f_router_debug, "Requested lookahead from node %d to %d\n", from_node_ind, to_node_ind);
+    VTR_LOGV_DEBUG(f_router_debug, "Requested lookahead from node %d to %d\n", size_t(from_node), size_t(to_node));
     const std::string& segment_name = device_ctx.rr_segments[from_seg_index].name;
     VTR_LOGV_DEBUG(f_router_debug, "Lookahead returned %s (%d) with distance (%zd, %zd)\n",
                    segment_name.c_str(), from_seg_index,
@@ -232,15 +229,14 @@ float ExtendedMapLookahead::get_map_cost(int from_node_ind,
 // The Routing Costs table is updated for every partial path found between the destination and start nodes, only if the cost to reach
 // the intermediate node is lower than the one already stored in the Routing Costs table.
 template<typename Entry>
-bool ExtendedMapLookahead::add_paths(int start_node_ind,
+bool ExtendedMapLookahead::add_paths(RRNodeId start_node,
                                      Entry current,
                                      const std::vector<util::Search_Path>& paths,
                                      util::RoutingCosts* routing_costs) {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    int node_ind = current.rr_node_ind;
-    RRNodeId node(node_ind);
+    RRNodeId node = current.rr_node;
 
     bool new_sample_found = false;
 
@@ -248,11 +244,11 @@ bool ExtendedMapLookahead::add_paths(int start_node_ind,
 
     // reconstruct the path
     std::vector<int> path;
-    for (int i = paths[node_ind].parent; i != start_node_ind; i = paths[i].parent) {
-        VTR_ASSERT(i != -1);
+    for (size_t i = paths[size_t(node)].parent; i != size_t(start_node); i = paths[i].parent) {
+        VTR_ASSERT(i != std::numeric_limits<size_t>::max());
         path.push_back(i);
     }
-    path.push_back(start_node_ind);
+    path.push_back(size_t(start_node));
 
     // The site_pin_delay gets subtracted from the current path as it might prevent the addition
     // of the path to the final routing costs.
@@ -262,8 +258,8 @@ bool ExtendedMapLookahead::add_paths(int start_node_ind,
     current.adjust_Tsw(-site_pin_delay);
 
     // add each node along the path subtracting the incremental costs from the current costs
-    Entry start_to_here(start_node_ind, UNDEFINED, nullptr);
-    int parent = start_node_ind;
+    Entry start_to_here(start_node, UNDEFINED, nullptr);
+    auto parent = start_node;
     for (auto it = path.rbegin(); it != path.rend(); it++) {
         RRNodeId this_node(*it);
         auto& here = device_ctx.rr_nodes[*it];
@@ -285,10 +281,10 @@ bool ExtendedMapLookahead::add_paths(int start_node_ind,
             seg_index,
             delta};
 
-        if (*it != start_node_ind) {
-            auto& parent_node = device_ctx.rr_nodes[parent];
-            start_to_here = Entry(*it, parent_node.edge_switch(paths[*it].edge), &start_to_here);
-            parent = *it;
+        if (size_t(this_node) != size_t(start_node)) {
+            auto& parent_node = device_ctx.rr_nodes[size_t(parent)];
+            start_to_here = Entry(this_node, parent_node.edge_switch(paths[*it].edge), &start_to_here);
+            parent = this_node;
         }
 
         float cost = current.cost() - start_to_here.cost();
@@ -328,9 +324,9 @@ bool ExtendedMapLookahead::add_paths(int start_node_ind,
  * to that pin is stored to an entry in the routing_cost_map.
  *
  * Returns the maximum (last) minimum cost path stored, and
- * the number of paths from start_node_ind stored. */
+ * the number of paths from start_node stored. */
 template<typename Entry>
-std::pair<float, int> ExtendedMapLookahead::run_dijkstra(int start_node_ind,
+std::pair<float, int> ExtendedMapLookahead::run_dijkstra(RRNodeId start_node,
                                                          std::vector<bool>* node_expanded,
                                                          std::vector<util::Search_Path>* paths,
                                                          util::RoutingCosts* routing_costs) {
@@ -344,12 +340,12 @@ std::pair<float, int> ExtendedMapLookahead::run_dijkstra(int start_node_ind,
      * visited (used to determine whether to push a candidate node onto the
      * expansion queue.
      * Also store the parent node so we can reconstruct a specific path. */
-    std::fill(paths->begin(), paths->end(), util::Search_Path{std::numeric_limits<float>::infinity(), -1, -1});
+    std::fill(paths->begin(), paths->end(), util::Search_Path{std::numeric_limits<float>::infinity(), std::numeric_limits<size_t>::max(), -1});
     /* a priority queue for expansion */
     std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> pq;
 
     /* first entry has no upstream delay or congestion */
-    Entry first_entry(start_node_ind, UNDEFINED, nullptr);
+    Entry first_entry(start_node, UNDEFINED, nullptr);
     float max_cost = first_entry.cost();
 
     pq.push(first_entry);
@@ -359,24 +355,24 @@ std::pair<float, int> ExtendedMapLookahead::run_dijkstra(int start_node_ind,
         auto current = pq.top();
         pq.pop();
 
-        int node_ind = current.rr_node_ind;
+        RRNodeId node = current.rr_node;
 
         /* check that we haven't already expanded from this node */
-        if ((*node_expanded)[node_ind]) {
+        if ((*node_expanded)[size_t(node)]) {
             continue;
         }
 
         /* if this node is an ipin record its congestion/delay in the routing_cost_map */
-        if (device_ctx.rr_nodes[node_ind].type() == IPIN) {
+        if (device_ctx.rr_nodes[size_t(node)].type() == IPIN) {
             // the last cost should be the highest
             max_cost = current.cost();
 
             path_count++;
-            this->add_paths<Entry>(start_node_ind, current, *paths, routing_costs);
+            this->add_paths<Entry>(start_node, current, *paths, routing_costs);
         } else {
             util::expand_dijkstra_neighbours(device_ctx.rr_nodes,
                                              current, paths, node_expanded, &pq);
-            (*node_expanded)[node_ind] = true;
+            (*node_expanded)[size_t(node)] = true;
         }
     }
     return std::make_pair(max_cost, path_count);
@@ -429,7 +425,7 @@ void ExtendedMapLookahead::compute(const std::vector<t_segment_inf>& segment_inf
             float max_delay_cost = 0.f;
             float max_base_cost = 0.f;
             int path_count = 0;
-            for (auto node_ind : point.nodes) {
+            for (auto node : point.nodes) {
                 // For each starting node, two different expansions are performed, one to find minimum delay
                 // and the second to find minimum base costs.
                 //
@@ -438,12 +434,12 @@ void ExtendedMapLookahead::compute(const std::vector<t_segment_inf>& segment_inf
                 //       Experiments have shown that the having two separate expansions lead to better results for Series 7 devices, but
                 //       this might not be true for Stratix ones.
                 {
-                    auto result = run_dijkstra<util::PQ_Entry_Delay>(node_ind, &node_expanded, &paths, &delay_costs);
+                    auto result = run_dijkstra<util::PQ_Entry_Delay>(node, &node_expanded, &paths, &delay_costs);
                     max_delay_cost = std::max(max_delay_cost, result.first);
                     path_count += result.second;
                 }
                 {
-                    auto result = run_dijkstra<util::PQ_Entry_Base_Cost>(node_ind, &node_expanded, &paths, &base_costs);
+                    auto result = run_dijkstra<util::PQ_Entry_Base_Cost>(node, &node_expanded, &paths, &base_costs);
                     max_base_cost = std::max(max_base_cost, result.first);
                     path_count += result.second;
                 }
@@ -552,7 +548,7 @@ float ExtendedMapLookahead::get_expected_cost(
 
     if (rr_type == CHANX || rr_type == CHANY) {
         return get_map_cost(
-            current_node, target_node, params.criticality);
+            RRNodeId(current_node), RRNodeId(target_node), params.criticality);
     } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */
         return (device_ctx.rr_indexed_data[SINK_COST_INDEX].base_cost);
     } else { /* Change this if you want to investigate route-throughs */
diff --git a/vpr/src/route/router_lookahead_extended_map.h b/vpr/src/route/router_lookahead_extended_map.h
index 0af50581adf..c6123c80c13 100644
--- a/vpr/src/route/router_lookahead_extended_map.h
+++ b/vpr/src/route/router_lookahead_extended_map.h
@@ -20,8 +20,8 @@ class ExtendedMapLookahead : public RouterLookahead {
     /**
      * @brief Returns the cost to get to a input delta_x/y location when starting from a SOURCE or OPIN node
      * @param from_node starting node
-     * @param delta_x x delta value that is the distance between the current node to the target node 
-     * @param delta_y y delta value that is the distance between the current node to the target node 
+     * @param delta_x x delta value that is the distance between the current node to the target node
+     * @param delta_y y delta value that is the distance between the current node to the target node
      * @param criticality_fac criticality of the current connection
      * @return expected cost to get to the destination
      */
@@ -35,18 +35,18 @@ class ExtendedMapLookahead : public RouterLookahead {
     float get_chan_ipin_delays(RRNodeId to_node) const;
 
     template<typename Entry>
-    bool add_paths(int start_node_ind,
+    bool add_paths(RRNodeId start_node,
                    Entry current,
                    const std::vector<util::Search_Path>& paths,
                    util::RoutingCosts* routing_costs);
 
     template<typename Entry>
-    std::pair<float, int> run_dijkstra(int start_node_ind,
+    std::pair<float, int> run_dijkstra(RRNodeId start_node,
                                        std::vector<bool>* node_expanded,
                                        std::vector<util::Search_Path>* paths,
                                        util::RoutingCosts* routing_costs);
 
-    float get_map_cost(int from_node_ind, int to_node_ind, float criticality_fac) const;
+    float get_map_cost(RRNodeId from_node, RRNodeId to_node, float criticality_fac) const;
 
     CostMap cost_map_; ///<Cost map containing all data to extract the entry cost when querying the lookahead.
   public:
@@ -54,19 +54,19 @@ class ExtendedMapLookahead : public RouterLookahead {
      * @brief Returns the expected cost to get to a destination node
      */
     float get_expected_cost(int node, int target_node, const t_conn_cost_params& params, float R_upstream) const override;
-   
+
     /**
-     * @brief Computes the extended lookahead map 
+     * @brief Computes the extended lookahead map
      */
     void compute(const std::vector<t_segment_inf>& segment_inf) override;
-    
+
     /**
-     * @brief Reads the extended lookahead map 
+     * @brief Reads the extended lookahead map
      */
     void read(const std::string& file) override;
-    
+
     /**
-     * @brief Writes the extended lookahead map 
+     * @brief Writes the extended lookahead map
      */
     void write(const std::string& file) const override;
 };
diff --git a/vpr/src/route/router_lookahead_map_utils.cpp b/vpr/src/route/router_lookahead_map_utils.cpp
index dc35afef09d..ec4fa210b95 100644
--- a/vpr/src/route/router_lookahead_map_utils.cpp
+++ b/vpr/src/route/router_lookahead_map_utils.cpp
@@ -55,14 +55,14 @@ static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr:
 namespace util {
 
 PQ_Entry::PQ_Entry(
-    int set_rr_node_ind,
+    RRNodeId set_rr_node,
     int switch_ind,
     float parent_delay,
     float parent_R_upstream,
     float parent_congestion_upstream,
     bool starting_node,
     float Tsw_adjust) {
-    this->rr_node_ind = set_rr_node_ind;
+    this->rr_node = set_rr_node;
 
     auto& device_ctx = g_vpr_ctx.device();
     this->delay = parent_delay;
@@ -73,8 +73,8 @@ PQ_Entry::PQ_Entry(
         Tsw += Tsw_adjust;
         VTR_ASSERT(Tsw >= 0.f);
         float Rsw = device_ctx.rr_switch_inf[switch_ind].R;
-        float Cnode = device_ctx.rr_nodes[set_rr_node_ind].C();
-        float Rnode = device_ctx.rr_nodes[set_rr_node_ind].R();
+        float Cnode = device_ctx.rr_nodes[size_t(set_rr_node)].C();
+        float Rnode = device_ctx.rr_nodes[size_t(set_rr_node)].R();
 
         float T_linear = 0.f;
         if (device_ctx.rr_switch_inf[switch_ind].buffered()) {
@@ -85,7 +85,7 @@ PQ_Entry::PQ_Entry(
 
         float base_cost = 0.f;
         if (device_ctx.rr_switch_inf[switch_ind].configurable()) {
-            base_cost = get_single_rr_cong_base_cost(set_rr_node_ind);
+            base_cost = get_single_rr_cong_base_cost(size_t(set_rr_node));
         }
 
         VTR_ASSERT(T_linear >= 0.);
@@ -100,17 +100,17 @@ PQ_Entry::PQ_Entry(
 }
 
 util::PQ_Entry_Delay::PQ_Entry_Delay(
-    int set_rr_node_ind,
+    RRNodeId set_rr_node,
     int switch_ind,
     const util::PQ_Entry_Delay* parent) {
-    this->rr_node_ind = set_rr_node_ind;
+    this->rr_node = set_rr_node;
 
     if (parent != nullptr) {
         auto& device_ctx = g_vpr_ctx.device();
         float Tsw = device_ctx.rr_switch_inf[switch_ind].Tdel;
         float Rsw = device_ctx.rr_switch_inf[switch_ind].R;
-        float Cnode = device_ctx.rr_nodes[set_rr_node_ind].C();
-        float Rnode = device_ctx.rr_nodes[set_rr_node_ind].R();
+        float Cnode = device_ctx.rr_nodes[size_t(set_rr_node)].C();
+        float Rnode = device_ctx.rr_nodes[size_t(set_rr_node)].R();
 
         float T_linear = 0.f;
         if (device_ctx.rr_switch_inf[switch_ind].buffered()) {
@@ -127,15 +127,15 @@ util::PQ_Entry_Delay::PQ_Entry_Delay(
 }
 
 util::PQ_Entry_Base_Cost::PQ_Entry_Base_Cost(
-    int set_rr_node_ind,
+    RRNodeId set_rr_node,
     int switch_ind,
     const util::PQ_Entry_Base_Cost* parent) {
-    this->rr_node_ind = set_rr_node_ind;
+    this->rr_node = set_rr_node;
 
     if (parent != nullptr) {
         auto& device_ctx = g_vpr_ctx.device();
         if (device_ctx.rr_switch_inf[switch_ind].configurable()) {
-            this->base_cost = parent->base_cost + get_single_rr_cong_base_cost(set_rr_node_ind);
+            this->base_cost = parent->base_cost + get_single_rr_cong_base_cost(size_t(set_rr_node));
         } else {
             this->base_cost = parent->base_cost;
         }
@@ -248,9 +248,9 @@ void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes,
                                 std::priority_queue<Entry,
                                                     std::vector<Entry>,
                                                     std::greater<Entry>>* pq) {
-    int parent_ind = size_t(parent_entry.rr_node_ind);
+    RRNodeId parent = parent_entry.rr_node;
 
-    auto& parent_node = rr_nodes[parent_ind];
+    auto& parent_node = rr_nodes[size_t(parent)];
 
     for (int iedge = 0; iedge < parent_node.num_edges(); iedge++) {
         int child_node_ind = parent_node.edge_sink_node(iedge);
@@ -261,12 +261,12 @@ void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes,
             continue;
         }
 
-        Entry child_entry(child_node_ind, switch_ind, &parent_entry);
+        Entry child_entry(RRNodeId(child_node_ind), switch_ind, &parent_entry);
         VTR_ASSERT(child_entry.cost() >= 0);
 
         /* Create (if it doesn't exist) or update (if the new cost is lower)
          * to specified node */
-        Search_Path path_entry = {child_entry.cost(), parent_ind, iedge};
+        Search_Path path_entry = {child_entry.cost(), size_t(parent), iedge};
         auto& path = (*paths)[child_node_ind];
         if (path_entry.cost < path.cost) {
             pq->push(child_entry);
diff --git a/vpr/src/route/router_lookahead_map_utils.h b/vpr/src/route/router_lookahead_map_utils.h
index 5de54d96a76..f39660843dd 100644
--- a/vpr/src/route/router_lookahead_map_utils.h
+++ b/vpr/src/route/router_lookahead_map_utils.h
@@ -65,9 +65,9 @@ class Cost_Entry {
     }
 };
 
-/** 
+/**
  * @brief a class that stores delay/congestion information for a given relative coordinate during the Dijkstra expansion.
- * 
+ *
  * Since it stores multiple cost entries, it is later boiled down to a single representative cost entry to be stored
  * in the final lookahead cost map
  *
@@ -172,15 +172,15 @@ typedef std::unordered_map<RoutingCostKey, float, HashRoutingCostKey> RoutingCos
 /* a class that represents an entry in the Dijkstra expansion priority queue */
 class PQ_Entry {
   public:
-    int rr_node_ind; //index in device_ctx.rr_nodes that this entry represents
-    float cost;      //the cost of the path to get to this node
+    RRNodeId rr_node; //index in device_ctx.rr_nodes that this entry represents
+    float cost;       //the cost of the path to get to this node
 
     /* store backward delay, R and congestion info */
     float delay;
     float R_upstream;
     float congestion_upstream;
 
-    PQ_Entry(int set_rr_node_ind, int /*switch_ind*/, float parent_delay, float parent_R_upstream, float parent_congestion_upstream, bool starting_node, float Tsw_adjust);
+    PQ_Entry(RRNodeId set_rr_node, int /*switch_ind*/, float parent_delay, float parent_R_upstream, float parent_congestion_upstream, bool starting_node, float Tsw_adjust);
 
     bool operator<(const PQ_Entry& obj) const {
         /* inserted into max priority queue so want queue entries with a lower cost to be greater */
@@ -191,10 +191,10 @@ class PQ_Entry {
 // A version of PQ_Entry that only calculates and stores the delay.
 class PQ_Entry_Delay {
   public:
-    int rr_node_ind;  //index in device_ctx.rr_nodes that this entry represents
+    RRNodeId rr_node;  //index in device_ctx.rr_nodes that this entry represents
     float delay_cost; //the cost of the path to get to this node
 
-    PQ_Entry_Delay(int set_rr_node_ind, int /*switch_ind*/, const PQ_Entry_Delay* parent);
+    PQ_Entry_Delay(RRNodeId set_rr_node, int /*switch_ind*/, const PQ_Entry_Delay* parent);
 
     float cost() const {
         return delay_cost;
@@ -212,10 +212,10 @@ class PQ_Entry_Delay {
 // A version of PQ_Entry that only calculates and stores the base cost.
 class PQ_Entry_Base_Cost {
   public:
-    int rr_node_ind; //index in device_ctx.rr_nodes that this entry represents
+    RRNodeId rr_node; //index in device_ctx.rr_nodes that this entry represents
     float base_cost;
 
-    PQ_Entry_Base_Cost(int set_rr_node_ind, int /*switch_ind*/, const PQ_Entry_Base_Cost* parent);
+    PQ_Entry_Base_Cost(RRNodeId set_rr_node, int /*switch_ind*/, const PQ_Entry_Base_Cost* parent);
 
     float cost() const {
         return base_cost;
@@ -232,7 +232,7 @@ class PQ_Entry_Base_Cost {
 
 struct Search_Path {
     float cost;
-    int parent;
+    size_t parent;
     int edge;
 };
 
diff --git a/vpr/src/route/router_lookahead_sampling.cpp b/vpr/src/route/router_lookahead_sampling.cpp
index 568d5134602..53dec653273 100644
--- a/vpr/src/route/router_lookahead_sampling.cpp
+++ b/vpr/src/route/router_lookahead_sampling.cpp
@@ -21,20 +21,26 @@ static int manhattan_distance(const vtr::Point<int>& a, const vtr::Point<int>& b
 }
 
 // the smallest bounding box containing a node
-static vtr::Rect<int> bounding_box_for_node(int node_ind) {
+// This builds a rectangle from (x, y) to (x+1, y+1)
+static vtr::Rect<int> bounding_box_for_node(RRNodeId node) {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
-    int x = rr_graph.node_xlow(RRNodeId(node_ind));
-    int y = rr_graph.node_ylow(RRNodeId(node_ind));
+    int x = rr_graph.node_xlow(node);
+    int y = rr_graph.node_ylow(node);
 
     return vtr::Rect<int>(vtr::Point<int>(x, y));
 }
 
+// Returns a sub-rectangle from bounding_box split into an n x n grid,
+// with sx and sy in [0, n-1] selecting the column and row, respectively.
 static vtr::Rect<int> sample_window(const vtr::Rect<int>& bounding_box, int sx, int sy, int n) {
     return vtr::Rect<int>(sample(bounding_box, sx, sy, n),
                           sample(bounding_box, sx + 1, sy + 1, n));
 }
 
+// Chooses all points within the sample window within the given count range,
+// sorted outward from the center by Manhattan distance, the result of which
+// is stored in the points member of a SampleRegion.
 static std::vector<SamplePoint> choose_points(const vtr::Matrix<int>& counts,
                                               const vtr::Rect<int>& window,
                                               int min_count,
@@ -80,7 +86,10 @@ static int quantile(const std::map<int, int>& histogram, float ratio) {
     return 0;
 }
 
-// select a good number of segments to find
+// Computes a histogram of counts within the box, where counts are nodes of a given segment type in this case.
+//
+// This is used to avoid choosing starting points where there extremely low or extremely high counts,
+// which lead to bad sampling or high runtime for little gain.
 static std::map<int, int> count_histogram(const vtr::Rect<int>& box, const vtr::Matrix<int>& counts) {
     std::map<int, int> histogram;
     for (int y = box.ymin(); y < box.ymax(); y++) {
@@ -101,6 +110,9 @@ static std::map<int, int> count_histogram(const vtr::Rect<int>& box, const vtr::
 // because it's just interleaving binary bits, so this
 // function interleaves with 0's so that the X and Y
 // dimensions can then be OR'ed together.
+//
+// This is used to order sample points to increase cache locality.
+// It achieves a ~5-10% run-time improvement.
 static uint64_t interleave(uint32_t x) {
     uint64_t i = x;
     i = (i ^ (i << 16)) & 0x0000ffff0000ffff;
@@ -122,8 +134,7 @@ std::vector<SampleRegion> find_sample_regions(int num_segments) {
 
     // compute bounding boxes for each segment type
     std::vector<vtr::Rect<int>> bounding_box_for_segment(num_segments, vtr::Rect<int>());
-    for (size_t i = 0; i < rr_nodes.size(); i++) {
-        auto& node = rr_nodes[i];
+    for (auto& node : rr_nodes) {
         if (node.type() != CHANX && node.type() != CHANY) continue;
         if (node.capacity() == 0 || node.num_edges() == 0) continue;
         int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
@@ -131,7 +142,7 @@ std::vector<SampleRegion> find_sample_regions(int num_segments) {
         VTR_ASSERT(seg_index != OPEN);
         VTR_ASSERT(seg_index < num_segments);
 
-        bounding_box_for_segment[seg_index].expand_bounding_box(bounding_box_for_node(i));
+        bounding_box_for_segment[seg_index].expand_bounding_box(bounding_box_for_node(node.id()));
     }
 
     // initialize counts
@@ -141,12 +152,12 @@ std::vector<SampleRegion> find_sample_regions(int num_segments) {
     }
 
     // count sample points
-    for (size_t i = 0; i < rr_nodes.size(); i++) {
-        auto& node = rr_nodes[i];
+    for (auto& node : rr_nodes) {
         if (node.type() != CHANX && node.type() != CHANY) continue;
         if (node.capacity() == 0 || node.num_edges() == 0) continue;
-        int x = rr_nodes.node_xlow(RRNodeId(i));
-        int y = rr_nodes.node_ylow(RRNodeId(i));
+
+        int x = rr_nodes.node_xlow(node.id());
+        int y = rr_nodes.node_ylow(node.id());
 
         int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
         segment_counts[seg_index][x][y] += 1;
@@ -206,13 +217,12 @@ std::vector<SampleRegion> find_sample_regions(int num_segments) {
     }
 
     // collect the node indices for each segment type at the selected sample points
-    for (size_t i = 0; i < rr_nodes.size(); i++) {
-        auto& node = rr_nodes[i];
+    for (auto& node : rr_nodes) {
         if (node.type() != CHANX && node.type() != CHANY) continue;
         if (node.capacity() == 0 || node.num_edges() == 0) continue;
 
-        int x = rr_nodes.node_xlow(RRNodeId(i));
-        int y = rr_nodes.node_ylow(RRNodeId(i));
+        int x = rr_nodes.node_xlow(node.id());
+        int y = rr_nodes.node_ylow(node.id());
 
         int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
 
@@ -221,7 +231,7 @@ std::vector<SampleRegion> find_sample_regions(int num_segments) {
 
         auto point = sample_point_index.find(std::make_tuple(seg_index, x, y));
         if (point != sample_point_index.end()) {
-            point->second->nodes.push_back(i);
+            point->second->nodes.push_back(node.id());
         }
     }
 
diff --git a/vpr/src/route/router_lookahead_sampling.h b/vpr/src/route/router_lookahead_sampling.h
index 6668792e9c6..ab08ae9404d 100644
--- a/vpr/src/route/router_lookahead_sampling.h
+++ b/vpr/src/route/router_lookahead_sampling.h
@@ -11,7 +11,7 @@ struct SamplePoint {
     vtr::Point<int> location;
 
     // nodes to expand
-    std::vector<ssize_t> nodes;
+    std::vector<RRNodeId> nodes;
 };
 
 struct SampleRegion {

From 98e56f8e7dcd061d5e55d4b716de60fd7b2faec2 Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Fri, 4 Sep 2020 16:55:37 +0200
Subject: [PATCH 09/18] run make format

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 vpr/src/route/router_lookahead_map_utils.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vpr/src/route/router_lookahead_map_utils.h b/vpr/src/route/router_lookahead_map_utils.h
index f39660843dd..5a3fe3b3bd5 100644
--- a/vpr/src/route/router_lookahead_map_utils.h
+++ b/vpr/src/route/router_lookahead_map_utils.h
@@ -42,10 +42,10 @@ enum e_representative_entry_method {
  */
 class Cost_Entry {
   public:
-    float delay; ///<Delay value of the cost entry
+    float delay;      ///<Delay value of the cost entry
     float congestion; ///<Base cost value of the cost entry
-    bool fill; ///<Boolean specifying whether this Entry was created as a result of the cost map
-               ///<holes filling procedure
+    bool fill;        ///<Boolean specifying whether this Entry was created as a result of the cost map
+                      ///<holes filling procedure
 
     Cost_Entry() {
         delay = std::numeric_limits<float>::infinity();
@@ -191,7 +191,7 @@ class PQ_Entry {
 // A version of PQ_Entry that only calculates and stores the delay.
 class PQ_Entry_Delay {
   public:
-    RRNodeId rr_node;  //index in device_ctx.rr_nodes that this entry represents
+    RRNodeId rr_node; //index in device_ctx.rr_nodes that this entry represents
     float delay_cost; //the cost of the path to get to this node
 
     PQ_Entry_Delay(RRNodeId set_rr_node, int /*switch_ind*/, const PQ_Entry_Delay* parent);

From d9d1ed8ac47bba0bcd16d0f902608b03dab43806 Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Fri, 4 Sep 2020 17:42:20 +0200
Subject: [PATCH 10/18] lookahead_cost: avoid building the segment map

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 vpr/src/route/router_lookahead_cost_map.cpp   | 22 +++++--------------
 vpr/src/route/router_lookahead_cost_map.h     |  9 +-------
 .../route/router_lookahead_extended_map.cpp   |  1 -
 3 files changed, 7 insertions(+), 25 deletions(-)

diff --git a/vpr/src/route/router_lookahead_cost_map.cpp b/vpr/src/route/router_lookahead_cost_map.cpp
index 1f036569bc8..555e3c9da0b 100644
--- a/vpr/src/route/router_lookahead_cost_map.cpp
+++ b/vpr/src/route/router_lookahead_cost_map.cpp
@@ -46,20 +46,6 @@ static vtr::Point<T> closest_point_in_rect(const vtr::Rect<T>& r, const vtr::Poi
     }
 }
 
-// build the segment map
-void CostMap::build_segment_map() {
-    const auto& device_ctx = g_vpr_ctx.device();
-    segment_map_.resize(device_ctx.rr_nodes.size());
-    for (size_t i = 0; i < segment_map_.size(); ++i) {
-        auto& from_node = device_ctx.rr_nodes[i];
-
-        int from_cost_index = from_node.cost_index();
-        int from_seg_index = device_ctx.rr_indexed_data[from_cost_index].seg_index;
-
-        segment_map_[i] = from_seg_index;
-    }
-}
-
 // resize internal data structures
 void CostMap::set_counts(size_t seg_count) {
     cost_map_.clear();
@@ -73,7 +59,12 @@ void CostMap::set_counts(size_t seg_count) {
 
 // cached node -> segment map
 int CostMap::node_to_segment(int from_node_ind) const {
-    return segment_map_[from_node_ind];
+    const auto& device_ctx = g_vpr_ctx.device();
+
+    auto& from_node = device_ctx.rr_nodes[from_node_ind];
+
+    int from_cost_index = from_node.cost_index();
+    return device_ctx.rr_indexed_data[from_cost_index].seg_index;
 }
 
 static util::Cost_Entry penalize(const util::Cost_Entry& entry, int distance, float penalty) {
@@ -425,7 +416,6 @@ static void FromFloat(VprFloatEntry::Builder* out, const float& in) {
 }
 
 void CostMap::read(const std::string& file) {
-    build_segment_map();
     MmapFile f(file);
 
     ::capnp::ReaderOptions opts = default_large_capnp_opts();
diff --git a/vpr/src/route/router_lookahead_cost_map.h b/vpr/src/route/router_lookahead_cost_map.h
index 90990c3d62b..6c314f05ee8 100644
--- a/vpr/src/route/router_lookahead_cost_map.h
+++ b/vpr/src/route/router_lookahead_cost_map.h
@@ -30,12 +30,7 @@ class CostMap {
     void set_counts(size_t seg_count);
 
     /**
-     * @brief Builds a node to segments map for a fast lookup
-     */
-    void build_segment_map();
-
-    /**
-     * @brief Queries the segment map to get the segment index given the corresponding node index
+     * @brief Gets the segment index relative to the input node index
      *
      * @param from_node_ind index of the node to search in the segment map
      * @return The index of the segment corresponding to the input node id
@@ -103,8 +98,6 @@ class CostMap {
                                  ///<delta locations that fall outside of a segment's bounding box.
                                  ///<The penalty map is addressed as follows penalty_[0][segment_index]
 
-    std::vector<int> segment_map_; ///<Segment map to lookup to the segment index given the node index
-
     size_t seg_count_; ///<Total segment count in the architecture
 
     /**
diff --git a/vpr/src/route/router_lookahead_extended_map.cpp b/vpr/src/route/router_lookahead_extended_map.cpp
index 5c1b7232253..52b410330b2 100644
--- a/vpr/src/route/router_lookahead_extended_map.cpp
+++ b/vpr/src/route/router_lookahead_extended_map.cpp
@@ -394,7 +394,6 @@ void ExtendedMapLookahead::compute(const std::vector<t_segment_inf>& segment_inf
     /* free previous delay map and allocate new one */
     auto& device_ctx = g_vpr_ctx.device();
     cost_map_.set_counts(segment_inf.size());
-    cost_map_.build_segment_map();
 
     VTR_ASSERT(REPRESENTATIVE_ENTRY_METHOD == util::SMALLEST);
     util::RoutingCosts all_delay_costs;

From aefe2f125c53f69b1286974dc82b846a829a0bd2 Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Fri, 4 Sep 2020 17:54:31 +0200
Subject: [PATCH 11/18] address review comments

- Added better description on the user options
- Added test with random_shuffle nodes reordering

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 vpr/src/base/read_options.cpp                              | 7 ++++++-
 .../strong_router_lookahead/config/config.txt              | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
index 2e25983f493..7c9e23f29ee 100644
--- a/vpr/src/base/read_options.cpp
+++ b/vpr/src/base/read_options.cpp
@@ -2099,7 +2099,12 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
             "            architectures)\n"
             " * map: An advanced lookahead which accounts for diverse wire type\n"
             " * extended_map: A more advanced and extended lookahead which accounts for a more\n"
-            "                 exhaustive node sampling method")
+            "                 exhaustive node sampling method\n"
+            "\n"
+            " The extended map differs from the map lookahead in the lookahead computation.\n"
+            " It is better suited for architectures that have specialized routing for specific\n"
+            " kinds of connections, but note that the time and memory necessary to compute the\n"
+            " extended lookahead map are greater than the basic lookahead map.\n")
         .default_value("map")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/config.txt
index faeaffeb119..91b148a9d7c 100644
--- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/config.txt
+++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/config.txt
@@ -28,3 +28,4 @@ script_params_common = -starting_stage vpr --route_chan_width 60
 script_params_list_add = --router_lookahead classic
 script_params_list_add = --router_lookahead map
 script_params_list_add = --router_lookahead extended_map
+script_params_list_add = --router_lookahead extended_map --reorder_rr_graph_nodes_algorithm random_shuffle

From 39a5524a65ad761cae6f35147ea89a0bd5c250df Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Fri, 4 Sep 2020 18:23:06 +0200
Subject: [PATCH 12/18] lookahead_sampling: simplified find_sample_regions code

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 vpr/src/route/router_lookahead_sampling.cpp | 106 ++++++++++++--------
 1 file changed, 64 insertions(+), 42 deletions(-)

diff --git a/vpr/src/route/router_lookahead_sampling.cpp b/vpr/src/route/router_lookahead_sampling.cpp
index 53dec653273..f8ae27a4a10 100644
--- a/vpr/src/route/router_lookahead_sampling.cpp
+++ b/vpr/src/route/router_lookahead_sampling.cpp
@@ -123,49 +123,36 @@ static uint64_t interleave(uint32_t x) {
     return i;
 }
 
-// for each segment type, find the nearest nodes to an equally spaced grid of points
-// within the bounding box for that segment type
-std::vector<SampleRegion> find_sample_regions(int num_segments) {
-    vtr::ScopedStartFinishTimer timer("finding sample regions");
-    std::vector<SampleRegion> sample_regions;
+// Used to get a valid segment index given an input rr_node.
+// If the node is not interesting an invalid value is returned.
+static std::tuple<int, int, int> get_node_info(const t_rr_node& node, int num_segments) {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_nodes = device_ctx.rr_nodes;
-    std::vector<vtr::Matrix<int>> segment_counts(num_segments);
 
-    // compute bounding boxes for each segment type
-    std::vector<vtr::Rect<int>> bounding_box_for_segment(num_segments, vtr::Rect<int>());
-    for (auto& node : rr_nodes) {
-        if (node.type() != CHANX && node.type() != CHANY) continue;
-        if (node.capacity() == 0 || node.num_edges() == 0) continue;
-        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
-
-        VTR_ASSERT(seg_index != OPEN);
-        VTR_ASSERT(seg_index < num_segments);
-
-        bounding_box_for_segment[seg_index].expand_bounding_box(bounding_box_for_node(node.id()));
+    if (node.type() != CHANX && node.type() != CHANY) {
+        return std::tuple<int, int, int>(OPEN, OPEN, OPEN);
     }
 
-    // initialize counts
-    for (int seg = 0; seg < num_segments; seg++) {
-        const auto& box = bounding_box_for_segment[seg];
-        segment_counts[seg] = vtr::Matrix<int>({size_t(box.width()), size_t(box.height())}, 0);
+    if (node.capacity() == 0 || node.num_edges() == 0) {
+        return std::tuple<int, int, int>(OPEN, OPEN, OPEN);
     }
 
-    // count sample points
-    for (auto& node : rr_nodes) {
-        if (node.type() != CHANX && node.type() != CHANY) continue;
-        if (node.capacity() == 0 || node.num_edges() == 0) continue;
+    int x = rr_nodes.node_xlow(node.id());
+    int y = rr_nodes.node_ylow(node.id());
 
-        int x = rr_nodes.node_xlow(node.id());
-        int y = rr_nodes.node_ylow(node.id());
+    int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
 
-        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
-        segment_counts[seg_index][x][y] += 1;
+    VTR_ASSERT(seg_index != OPEN);
+    VTR_ASSERT(seg_index < num_segments);
 
-        VTR_ASSERT(seg_index != OPEN);
-        VTR_ASSERT(seg_index < num_segments);
-    }
+    return std::tuple<int, int, int>(seg_index, x, y);
+}
 
+// Fills the sample_regions vector
+static void compute_sample_regions(std::vector<SampleRegion>& sample_regions,
+                                   std::vector<vtr::Matrix<int>>& segment_counts,
+                                   std::vector<vtr::Rect<int>>& bounding_box_for_segment,
+                                   int num_segments) {
     // select sample points
     for (int i = 0; i < num_segments; i++) {
         const auto& counts = segment_counts[i];
@@ -201,6 +188,47 @@ std::vector<SampleRegion> find_sample_regions(int num_segments) {
             }
         }
     }
+}
+
+// for each segment type, find the nearest nodes to an equally spaced grid of points
+// within the bounding box for that segment type
+std::vector<SampleRegion> find_sample_regions(int num_segments) {
+    vtr::ScopedStartFinishTimer timer("finding sample regions");
+    std::vector<SampleRegion> sample_regions;
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_nodes = device_ctx.rr_nodes;
+    std::vector<vtr::Matrix<int>> segment_counts(num_segments);
+
+    // compute bounding boxes for each segment type
+    std::vector<vtr::Rect<int>> bounding_box_for_segment(num_segments, vtr::Rect<int>());
+    for (auto& node : rr_nodes) {
+        if (node.type() != CHANX && node.type() != CHANY) continue;
+        if (node.capacity() == 0 || node.num_edges() == 0) continue;
+        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
+
+        VTR_ASSERT(seg_index != OPEN);
+        VTR_ASSERT(seg_index < num_segments);
+
+        bounding_box_for_segment[seg_index].expand_bounding_box(bounding_box_for_node(node.id()));
+    }
+
+    // initialize counts
+    for (int seg = 0; seg < num_segments; seg++) {
+        const auto& box = bounding_box_for_segment[seg];
+        segment_counts[seg] = vtr::Matrix<int>({size_t(box.width()), size_t(box.height())}, 0);
+    }
+
+    // count sample points
+    for (const auto& node : rr_nodes) {
+        int seg_index, x, y;
+        std::tie(seg_index, x, y) = get_node_info(node, num_segments);
+
+        if (seg_index == OPEN) continue;
+
+        segment_counts[seg_index][x][y] += 1;
+    }
+
+    compute_sample_regions(sample_regions, segment_counts, bounding_box_for_segment, num_segments);
 
     // sort regions
     std::sort(sample_regions.begin(), sample_regions.end(),
@@ -217,17 +245,11 @@ std::vector<SampleRegion> find_sample_regions(int num_segments) {
     }
 
     // collect the node indices for each segment type at the selected sample points
-    for (auto& node : rr_nodes) {
-        if (node.type() != CHANX && node.type() != CHANY) continue;
-        if (node.capacity() == 0 || node.num_edges() == 0) continue;
-
-        int x = rr_nodes.node_xlow(node.id());
-        int y = rr_nodes.node_ylow(node.id());
+    for (const auto& node : rr_nodes) {
+        int seg_index, x, y;
+        std::tie(seg_index, x, y) = get_node_info(node, num_segments);
 
-        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
-
-        VTR_ASSERT(seg_index != OPEN);
-        VTR_ASSERT(seg_index < num_segments);
+        if (seg_index == OPEN) continue;
 
         auto point = sample_point_index.find(std::make_tuple(seg_index, x, y));
         if (point != sample_point_index.end()) {

From 392996a46d5c58a186e481f0db2e4320ca176372 Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Sun, 6 Sep 2020 19:51:03 +0200
Subject: [PATCH 13/18] reg_test: added missing golden results

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 .../strong_router_lookahead/config/golden_results.txt    | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/golden_results.txt
index 3a9851af2ab..518fd3fcac8 100644
--- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/golden_results.txt
+++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/config/golden_results.txt
@@ -1,4 +1,5 @@
-arch                  	circuit  	script_params                         	vtr_flow_elapsed_time	error	odin_synth_time	max_odin_mem	abc_depth	abc_synth_time	abc_cec_time	abc_sec_time	max_abc_mem	ace_time	max_ace_mem	num_clb	num_io	num_memories	num_mult	vpr_status	vpr_revision                    	vpr_build_info                          	vpr_compiler                                	vpr_compiled       	hostname	rundir                                                                                                                                                                   	max_vpr_mem	num_primary_inputs	num_primary_outputs	num_pre_packed_nets	num_pre_packed_blocks	num_netlist_clocks	num_post_packed_nets	num_post_packed_blocks	device_width	device_height	device_grid_tiles	device_limiting_resources	device_name	pack_time	placed_wirelength_est	place_time	place_quench_time	placed_CPD_est	placed_setup_TNS_est	placed_setup_WNS_est	placed_geomean_nonvirtual_intradomain_critical_path_delay_est	place_quench_timing_analysis_time	place_quench_sta_time	place_total_timing_analysis_time	place_total_sta_time	routed_wirelength	total_nets_routed	total_connections_routed	total_heap_pushes	total_heap_pops	logic_block_area_total	logic_block_area_used	routing_area_total	routing_area_per_tile	crit_path_route_success_iteration	critical_path_delay	geomean_nonvirtual_intradomain_critical_path_delay	setup_TNS	setup_WNS	hold_TNS	hold_WNS	crit_path_route_time	crit_path_total_timing_analysis_time	crit_path_total_sta_time
-k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_classic     	1.36                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-rc2-2438-ge5e1da843-dirty	release VTR_ASSERT_LEVEL=2 debug_logging	GNU 9.2.1 on Linux-4.15.0-101-generic x86_64	2020-08-19T12:52:35	acomodi 	/data/vtr-symbiflow/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run023/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_classic     	76636      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.31     	4714                 	0.51      	0.00             	3.70871       	-159.069            	-3.70871            	nan                                                          	0.00081656                       	0.000697153          	0.113819                        	0.0986893           	6802             	4803             	18126                   	1410663          	241971         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	31                               	4.15097            	nan                                               	-183.347 	-4.15097 	0       	0       	0.26                	0.16699                             	0.143514                
-k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_map         	1.40                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-rc2-2438-ge5e1da843-dirty	release VTR_ASSERT_LEVEL=2 debug_logging	GNU 9.2.1 on Linux-4.15.0-101-generic x86_64	2020-08-19T12:52:35	acomodi 	/data/vtr-symbiflow/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run023/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_map         	74280      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.31     	4774                 	0.50      	0.00             	3.68749       	-162.239            	-3.68749            	nan                                                          	0.00107059                       	0.000930556          	0.0963918                       	0.0798913           	6885             	4550             	17546                   	783482           	128874         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	25                               	4.28104            	nan                                               	-189.84  	-4.28104 	0       	0       	0.19                	0.142017                            	0.116339                
-k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_extended_map	1.57                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-rc2-2438-ge5e1da843-dirty	release VTR_ASSERT_LEVEL=2 debug_logging	GNU 9.2.1 on Linux-4.15.0-101-generic x86_64	2020-08-19T12:52:35	acomodi 	/data/vtr-symbiflow/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run023/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_extended_map	78432      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.32     	4756                 	0.48      	0.00             	3.96235       	-167.937            	-3.96235            	nan                                                          	0.000742545                      	0.000635633          	0.104159                        	0.091287            	6963             	3821             	13362                   	1508440          	271241         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	25                               	4.68054            	nan                                               	-193.587 	-4.68054 	0       	0       	0.25                	0.149959                            	0.129748                
+arch                  	circuit  	script_params                                                                           	vtr_flow_elapsed_time	error	odin_synth_time	max_odin_mem	abc_depth	abc_synth_time	abc_cec_time	abc_sec_time	max_abc_mem	ace_time	max_ace_mem	num_clb	num_io	num_memories	num_mult	vpr_status	vpr_revision          	vpr_build_info                	vpr_compiler                                	vpr_compiled       	hostname	rundir                                                                                                                                                                                                                     	max_vpr_mem	num_primary_inputs	num_primary_outputs	num_pre_packed_nets	num_pre_packed_blocks	num_netlist_clocks	num_post_packed_nets	num_post_packed_blocks	device_width	device_height	device_grid_tiles	device_limiting_resources	device_name	pack_time	placed_wirelength_est	place_time	place_quench_time	placed_CPD_est	placed_setup_TNS_est	placed_setup_WNS_est	placed_geomean_nonvirtual_intradomain_critical_path_delay_est	place_quench_timing_analysis_time	place_quench_sta_time	place_total_timing_analysis_time	place_total_sta_time	routed_wirelength	total_nets_routed	total_connections_routed	total_heap_pushes	total_heap_pops	logic_block_area_total	logic_block_area_used	routing_area_total	routing_area_per_tile	crit_path_route_success_iteration	critical_path_delay	geomean_nonvirtual_intradomain_critical_path_delay	setup_TNS	setup_WNS	hold_TNS	hold_WNS	crit_path_route_time	crit_path_total_timing_analysis_time	crit_path_total_sta_time
+k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_classic                                                       	1.37                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-2584-g5f6578a34	release IPO VTR_ASSERT_LEVEL=2	GNU 9.2.1 on Linux-4.15.0-101-generic x86_64	2020-09-06T19:44:18	acomodi 	/data/vtr-symbiflow/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run054/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_classic                                                       	40544      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.31     	4714                 	0.57      	0.00             	3.70871       	-159.069            	-3.70871            	nan                                                          	0.000595846                      	0.000483397          	0.093679                        	0.0767406           	6802             	4803             	18126                   	1410663          	241971         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	31                               	4.15097            	nan                                               	-183.347 	-4.15097 	0       	0       	0.25                	0.133699                            	0.10838                 
+k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_map                                                           	1.34                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-2584-g5f6578a34	release IPO VTR_ASSERT_LEVEL=2	GNU 9.2.1 on Linux-4.15.0-101-generic x86_64	2020-09-06T19:44:18	acomodi 	/data/vtr-symbiflow/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run054/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_map                                                           	40116      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.30     	4774                 	0.49      	0.00             	3.68749       	-162.239            	-3.68749            	nan                                                          	0.000750225                      	0.000628302          	0.0994842                       	0.0818529           	6885             	4550             	17546                   	783482           	128874         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	25                               	4.28104            	nan                                               	-189.84  	-4.28104 	0       	0       	0.18                	0.142113                            	0.116707                
+k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_extended_map                                                  	1.65                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-2584-g5f6578a34	release IPO VTR_ASSERT_LEVEL=2	GNU 9.2.1 on Linux-4.15.0-101-generic x86_64	2020-09-06T19:44:18	acomodi 	/data/vtr-symbiflow/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run054/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_extended_map                                                  	42564      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.33     	4756                 	0.52      	0.01             	3.96235       	-167.937            	-3.96235            	nan                                                          	0.000771353                      	0.000639451          	0.0941602                       	0.079945            	6963             	3821             	13362                   	1508440          	271241         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	25                               	4.68054            	nan                                               	-193.587 	-4.68054 	0       	0       	0.22                	0.133333                            	0.110696                
+k6_N10_mem32K_40nm.xml	ex5p.blif	common_--router_lookahead_extended_map_--reorder_rr_graph_nodes_algorithm_random_shuffle	1.49                 	     	-1             	-1          	-1       	-1            	-1          	-1          	-1         	-1      	-1         	80     	8     	0           	0       	success   	v8.0.0-2584-g5f6578a34	release IPO VTR_ASSERT_LEVEL=2	GNU 9.2.1 on Linux-4.15.0-101-generic x86_64	2020-09-06T19:44:18	acomodi 	/data/vtr-symbiflow/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_router_lookahead/run054/k6_N10_mem32K_40nm.xml/ex5p.blif/common_--router_lookahead_extended_map_--reorder_rr_graph_nodes_algorithm_random_shuffle	43940      	8                 	63                 	748                	811                  	0                 	474                 	151                   	13          	13           	169              	clb                      	auto       	0.28     	4756                 	0.45      	0.00             	3.96235       	-167.937            	-3.96235            	nan                                                          	0.000597938                      	0.000489964          	0.0821085                       	0.0685745           	6963             	3821             	13362                   	1508440          	271241         	6.63067e+06           	4.31152e+06          	558096.           	3302.35              	25                               	4.68054            	nan                                               	-193.587 	-4.68054 	0       	0       	0.22                	0.118205                            	0.0971101               

From 070bb74fb7e6bed8da07b99b1e89ae8ffbad3aae Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Wed, 9 Sep 2020 10:58:36 +0200
Subject: [PATCH 14/18] extended_lookahead: re-enable SRC/OPIN lookahead cost

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 vpr/src/route/router_lookahead_extended_map.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vpr/src/route/router_lookahead_extended_map.cpp b/vpr/src/route/router_lookahead_extended_map.cpp
index 52b410330b2..88cd12fca72 100644
--- a/vpr/src/route/router_lookahead_extended_map.cpp
+++ b/vpr/src/route/router_lookahead_extended_map.cpp
@@ -545,7 +545,7 @@ float ExtendedMapLookahead::get_expected_cost(
 
     t_rr_type rr_type = device_ctx.rr_nodes[current_node].type();
 
-    if (rr_type == CHANX || rr_type == CHANY) {
+    if (rr_type == CHANX || rr_type == CHANY || rr_type == SOURCE || rr_type == OPIN) {
         return get_map_cost(
             RRNodeId(current_node), RRNodeId(target_node), params.criticality);
     } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */

From 98cc4d821b186caff13ab08b80eda15a9b8d8e6e Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Wed, 9 Sep 2020 13:04:20 +0200
Subject: [PATCH 15/18] lookahead: addressed review comments

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 libs/libvtrutil/src/vtr_geometry.h            |  6 +++
 vpr/src/route/router_lookahead_cost_map.cpp   | 46 +++++++++++++++----
 vpr/src/route/router_lookahead_cost_map.h     |  2 +
 .../route/router_lookahead_extended_map.cpp   | 24 +++++++++-
 vpr/src/route/router_lookahead_extended_map.h | 10 +++-
 5 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/libs/libvtrutil/src/vtr_geometry.h b/libs/libvtrutil/src/vtr_geometry.h
index d9d82fd45a2..a055b256350 100644
--- a/libs/libvtrutil/src/vtr_geometry.h
+++ b/libs/libvtrutil/src/vtr_geometry.h
@@ -149,6 +149,12 @@ Rect<T> bounding_box(const Rect<T>& lhs, const Rect<T>& rhs);
 template<typename T, typename std::enable_if<std::is_integral<T>::value>::type...>
 Point<T> sample(const vtr::Rect<T>& r, T x, T y, T d);
 
+// clamps v to be between low (lo) and high (hi), inclusive.
+template<class T>
+static constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
+    return std::min(std::max(v, lo), hi);
+}
+
 //A 2D line
 template<class T>
 class Line {
diff --git a/vpr/src/route/router_lookahead_cost_map.cpp b/vpr/src/route/router_lookahead_cost_map.cpp
index 555e3c9da0b..b15f605f21c 100644
--- a/vpr/src/route/router_lookahead_cost_map.cpp
+++ b/vpr/src/route/router_lookahead_cost_map.cpp
@@ -3,6 +3,7 @@
 #include "router_lookahead_map_utils.h"
 #include "globals.h"
 #include "echo_files.h"
+#include "vtr_geometry.h"
 
 #ifdef VTR_ENABLE_CAPNPROTO
 #    include "capnp/serialize.h"
@@ -30,19 +31,13 @@ static int manhattan_distance(const vtr::Point<int>& a, const vtr::Point<int>& b
     return abs(b.x() - a.x()) + abs(b.y() - a.y());
 }
 
-// clamps v to be between low (lo) and high (hi), inclusive.
-template<class T>
-static constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
-    return std::min(std::max(v, lo), hi);
-}
-
 template<typename T>
 static vtr::Point<T> closest_point_in_rect(const vtr::Rect<T>& r, const vtr::Point<T>& p) {
     if (r.empty()) {
         return vtr::Point<T>(0, 0);
     } else {
-        return vtr::Point<T>(clamp<T>(p.x(), r.xmin(), r.xmax() - 1),
-                             clamp<T>(p.y(), r.ymin(), r.ymax() - 1));
+        return vtr::Point<T>(vtr::clamp<T>(p.x(), r.xmin(), r.xmax() - 1),
+                             vtr::clamp<T>(p.y(), r.ymin(), r.ymax() - 1));
     }
 }
 
@@ -57,7 +52,11 @@ void CostMap::set_counts(size_t seg_count) {
     seg_count_ = seg_count;
 }
 
-// cached node -> segment map
+/**
+ * @brief Gets the segment index corresponding to a node index
+ * @param from_node_ind node index
+ * @return segment index corresponding to the input node
+ */
 int CostMap::node_to_segment(int from_node_ind) const {
     const auto& device_ctx = g_vpr_ctx.device();
 
@@ -67,6 +66,26 @@ int CostMap::node_to_segment(int from_node_ind) const {
     return device_ctx.rr_indexed_data[from_cost_index].seg_index;
 }
 
+/**
+ * @brief Penalizes a specific cost entry based on its distance to the cost map bounds.
+ * @param entry cost entry to penalize
+ * @param distance distance to the cost map bounds (can be zero in case the entry is within the bounds)
+ * @param penalty penalty factor relative to the current segment type
+ *
+ * If a specific (delta_x, delta_y) coordinates pair falls out of a segment bounding box, the returned cost is a result of the following equation:
+ *
+ * delay + distance * penalty * PENALTY_FACTOR
+ *
+ * delay    : delay of the closest point in the bounding box for the specific wire segment
+ * distance : this can assume two values:
+ *              - 0     : in case the deltas fall within the bounding box (in this case no penalty is added)
+ *              - non-0 : in case the point is out of the bounding box, the value is the manhattan distance between the point and the closest point within the bounding box.
+ * penalty  : value determined when building the lookahead and relative to a specific segment type (meaning there is one penalty value for each segment type).
+ *            The base delay_penalty value is a calculated as follows:
+ *              (max_delay - min_delay) / max(1, manhattan_distance(max_location, min_location))
+ *
+ * PENALTY_FACTOR: impact of the penalty on the total delay cost.
+ */
 static util::Cost_Entry penalize(const util::Cost_Entry& entry, int distance, float penalty) {
     penalty = std::max(penalty, PENALTY_MIN);
     return util::Cost_Entry(entry.delay + distance * penalty * PENALTY_FACTOR,
@@ -202,6 +221,15 @@ void CostMap::set_cost_map(const util::RoutingCosts& delay_costs, const util::Ro
     //
     // In case the lookahead is queried to return a cost that is outside of the bounding box, the closest
     // cost within the bounding box is returned, with the addition of a calculated penalty cost.
+    //
+    // The bounding boxes do have two dimensions and are specified as matrices.
+    // The first dimension though is unused and set to be constant as it is required to keep consistency
+    // with the lookahead map. The extended lookahead map and the normal one index the various costs as follows:
+    //      - Lookahead map         : [0..chan_index][seg_index][dx][dy]
+    //      - Extended lookahead map: [0][seg_index][dx][dy]
+    //
+    // The extended lookahead map does not require the first channel index distinction, therefore the first dimension
+    // remains unused.
     vtr::Matrix<vtr::Rect<int>> bounds({1, seg_count_});
     for (const auto& entry : delay_costs) {
         bounds[0][entry.first.seg_index].expand_bounding_box(vtr::Rect<int>(entry.first.delta));
diff --git a/vpr/src/route/router_lookahead_cost_map.h b/vpr/src/route/router_lookahead_cost_map.h
index 6c314f05ee8..1329a6007c8 100644
--- a/vpr/src/route/router_lookahead_cost_map.h
+++ b/vpr/src/route/router_lookahead_cost_map.h
@@ -63,6 +63,8 @@ class CostMap {
      * @param cy y location of the cost map entry that needs to be filled
      * @param bounds bounds of the cost map corresponding to the current segment
      * @return A pair containing the nearby cost entry and the distance from the current location to the narby cost entry found.
+     *
+     * The coordinates identifying the cost map location to fill (cx, cy) need to fall within the bounding box provided as input (bounds). If the (cx, cy) point falls out of the bounds, a default cost entry is returned instead.
      */
     std::pair<util::Cost_Entry, int> get_nearby_cost_entry(const vtr::NdMatrix<util::Cost_Entry, 2>& matrix, int cx, int cy, const vtr::Rect<int>& bounds);
 
diff --git a/vpr/src/route/router_lookahead_extended_map.cpp b/vpr/src/route/router_lookahead_extended_map.cpp
index 88cd12fca72..3eea5617e34 100644
--- a/vpr/src/route/router_lookahead_extended_map.cpp
+++ b/vpr/src/route/router_lookahead_extended_map.cpp
@@ -45,7 +45,17 @@
 // Don't continue storing a path after hitting a lower-or-same cost entry.
 static constexpr bool BREAK_ON_MISS = false;
 
-///@brief Threshold indicating the minimum number of paths to sample for each point in a sample region
+/**
+ * @brief Threshold indicating the minimum number of paths to sample for each point in a sample region
+ *
+ * Each sample region contains a set of points containing the starting nodes from which the Dijkstra expansions
+ * are performed.
+ * Each node produces a number N of paths that are being explored exhaustively until no more expansions are possible,
+ * and the number of paths for each Dijkstra run is added to the total number of paths found when computing a sample region
+ * In case there are still points in a region that were not picked to perform the Dijkstra expansion, and the
+ * MIN_PATH_COUNT threshold has been reached, the sample region is intended to be complete and the computation of a next
+ * sample region begins.
+ */
 static constexpr int MIN_PATH_COUNT = 1000;
 
 template<typename Entry>
@@ -193,6 +203,16 @@ float ExtendedMapLookahead::get_map_cost(RRNodeId from_node,
     float expected_delay = cost_entry.delay;
     float expected_congestion = cost_entry.congestion;
 
+    // The CHAN -> IPIN delay was removed from the cost map calculation, as it might tamper the addition
+    // of a smaller cost to this node location. This might happen as not all the CHAN -> IPIN connection
+    // have a delay, therefore, a different cost than the correct one might have been added to the cost
+    // map location of the input node.
+    //
+    // The CHAN -> IPIN delay gets re-added to the final calculation as it effectively is a valid delay
+    // to reach the destination.
+    //
+    // TODO: Capture this delay as a funtion of both the current wire type and the ipin to have a more
+    //       realistic excpected cost returned.
     float site_pin_delay = this->get_chan_ipin_delays(to_node);
     expected_delay += site_pin_delay;
 
@@ -549,6 +569,8 @@ float ExtendedMapLookahead::get_expected_cost(
         return get_map_cost(
             RRNodeId(current_node), RRNodeId(target_node), params.criticality);
     } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */
+        // This is to return only the cost between the IPIN and SINK. No need to
+        // query the cost map, as the routing of this connection is almost done.
         return (device_ctx.rr_indexed_data[SINK_COST_INDEX].base_cost);
     } else { /* Change this if you want to investigate route-throughs */
         return (0.);
diff --git a/vpr/src/route/router_lookahead_extended_map.h b/vpr/src/route/router_lookahead_extended_map.h
index c6123c80c13..ad23f035619 100644
--- a/vpr/src/route/router_lookahead_extended_map.h
+++ b/vpr/src/route/router_lookahead_extended_map.h
@@ -22,7 +22,7 @@ class ExtendedMapLookahead : public RouterLookahead {
      * @param from_node starting node
      * @param delta_x x delta value that is the distance between the current node to the target node
      * @param delta_y y delta value that is the distance between the current node to the target node
-     * @param criticality_fac criticality of the current connection
+     * @param criticality_fac criticality of the current connection between 0 (all congestion) and 1 (all timing)
      * @return expected cost to get to the destination
      */
     float get_src_opin_cost(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const;
@@ -31,6 +31,14 @@ class ExtendedMapLookahead : public RouterLookahead {
      * @brief Returns the CHAN -> IPIN delay that gets added to the final expected delay
      * @param to_node target node for which we query the IPIN delay
      * @return IPIN delay relative to the input destination node
+     *
+     * The CHAN -> IPIN delay was removed from the cost map calculation, as it might tamper the addition
+     * of a smaller cost to this node location. This might happen as not all the CHAN -> IPIN connection
+     * have a delay, therefore, a different cost than the correct one might have been added to the cost
+     * map location of the input node.
+     * 
+     * The CHAN -> IPIN delay gets re-added to the final calculation as it effectively is a valid delay
+     * to reach the destination.
      */
     float get_chan_ipin_delays(RRNodeId to_node) const;
 

From f4b7155e378b72551c09d864ff3f578f695745ce Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Thu, 10 Sep 2020 17:38:51 +0200
Subject: [PATCH 16/18] lookahead: added comment to the special segment type
 constant index

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 vpr/src/route/router_lookahead_map_utils.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vpr/src/route/router_lookahead_map_utils.cpp b/vpr/src/route/router_lookahead_map_utils.cpp
index ec4fa210b95..6346ba00006 100644
--- a/vpr/src/route/router_lookahead_map_utils.cpp
+++ b/vpr/src/route/router_lookahead_map_utils.cpp
@@ -50,6 +50,11 @@ static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr:
 //          - CHANX --> IPIN --> exploration interrupted: IPIN found, no need to expand further
 #define MAX_EXPANSION_LEVEL 1
 
+// The special segment type index used to identify a direct connection between an OPIN to IPIN that
+// does not go through the CHANX/CHANY nodes.
+//
+// This is used when building the SROURCE/OPIN --> CHAN lookup table that contains additional delay and
+// congestion data to reach the CHANX/CHANY nodes which is not present in the lookahead cost map.
 #define DIRECT_CONNECT_SPECIAL_SEG_TYPE -1;
 
 namespace util {

From fb214f2978b613b842a810e7bd70a1dc074005da Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Thu, 10 Sep 2020 19:18:21 +0200
Subject: [PATCH 17/18] route_timing: remove un-necessary headers

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 vpr/src/route/route_timing.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vpr/src/route/route_timing.cpp b/vpr/src/route/route_timing.cpp
index 00e0d16ca45..a3910e06fe4 100644
--- a/vpr/src/route/route_timing.cpp
+++ b/vpr/src/route/route_timing.cpp
@@ -39,9 +39,6 @@
 #include "bucket.h"
 #include "connection_router.h"
 
-#include "router_lookahead_map.h"
-#include "router_lookahead_extended_map.h"
-
 #include "tatum/TimingReporter.hpp"
 #include "overuse_report.h"
 

From 31115000688be15d6c8840053cd6bc4bb2d301f5 Mon Sep 17 00:00:00 2001
From: Alessandro Comodi <acomodi@antmicro.com>
Date: Fri, 11 Sep 2020 12:14:18 +0200
Subject: [PATCH 18/18] lookahead: fixed rebase conflicts

Signed-off-by: Alessandro Comodi <acomodi@antmicro.com>
---
 .../route/router_lookahead_extended_map.cpp   | 63 +++++++++++--------
 vpr/src/route/router_lookahead_extended_map.h |  9 ++-
 vpr/src/route/router_lookahead_map.cpp        |  2 +-
 3 files changed, 45 insertions(+), 29 deletions(-)

diff --git a/vpr/src/route/router_lookahead_extended_map.cpp b/vpr/src/route/router_lookahead_extended_map.cpp
index 3eea5617e34..6431dfbf375 100644
--- a/vpr/src/route/router_lookahead_extended_map.cpp
+++ b/vpr/src/route/router_lookahead_extended_map.cpp
@@ -64,7 +64,7 @@ static std::pair<float, int> run_dijkstra(RRNodeId start_node,
                                           std::vector<util::Search_Path>* paths,
                                           util::RoutingCosts* routing_costs);
 
-float ExtendedMapLookahead::get_src_opin_cost(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const {
+std::pair<float, float> ExtendedMapLookahead::get_src_opin_cost(RRNodeId from_node, int delta_x, int delta_y, const t_conn_cost_params& params) const {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
@@ -96,12 +96,14 @@ float ExtendedMapLookahead::get_src_opin_cost(RRNodeId from_node, int delta_x, i
         //The cost estimate should still be *extremely* large compared to a typical delay, and
         //so should ensure that the router de-prioritizes exploring this path, but does not
         //forbid the router from trying.
-        return std::numeric_limits<float>::max() / 1e12;
+        float max = std::numeric_limits<float>::max() / 1e12;
+        return std::make_pair(max, max);
     } else {
         //From the current SOURCE/OPIN we look-up the wiretypes which are reachable
         //and then add the estimates from those wire types for the distance of interest.
         //If there are multiple options we use the minimum value.
-        float expected_cost = std::numeric_limits<float>::infinity();
+        float expected_delay_cost = std::numeric_limits<float>::infinity();
+        float expected_cong_cost = std::numeric_limits<float>::infinity();
 
         for (const auto& kv : this->src_opin_delays[tile_index][from_ptc]) {
             const util::t_reachable_wire_inf& reachable_wire_inf = kv.second;
@@ -119,14 +121,15 @@ float ExtendedMapLookahead::get_src_opin_cost(RRNodeId from_node, int delta_x, i
                 cost_entry = cost_map_.find_cost(reachable_wire_inf.wire_seg_index, delta_x, delta_y);
             }
 
-            float this_cost = (criticality_fac) * (reachable_wire_inf.congestion + cost_entry.congestion)
-                              + (1. - criticality_fac) * (reachable_wire_inf.delay + cost_entry.delay);
+            float this_delay_cost = (1. - params.criticality) * (reachable_wire_inf.delay + cost_entry.delay);
+            float this_cong_cost = (params.criticality) * (reachable_wire_inf.congestion + cost_entry.congestion);
 
-            expected_cost = std::min(this_cost, expected_cost);
+            expected_delay_cost = std::min(expected_delay_cost, this_delay_cost);
+            expected_cong_cost = std::min(expected_cong_cost, this_cong_cost);
         }
 
-        VTR_ASSERT(std::isfinite(expected_cost));
-        return expected_cost;
+        VTR_ASSERT(std::isfinite(expected_delay_cost) && std::isfinite(expected_cong_cost));
+        return std::make_pair(expected_delay_cost, expected_cong_cost);
     }
 
     VTR_ASSERT_SAFE_MSG(false,
@@ -163,13 +166,14 @@ float ExtendedMapLookahead::get_chan_ipin_delays(RRNodeId to_node) const {
 //
 //  The from_node can be of one of the following types: CHANX, CHANY, SOURCE, OPIN
 //  The to_node is always a SINK
-float ExtendedMapLookahead::get_map_cost(RRNodeId from_node,
-                                         RRNodeId to_node,
-                                         float criticality_fac) const {
-    if (from_node == to_node) {
-        return 0.f;
+std::pair<float, float> ExtendedMapLookahead::get_expected_delay_and_cong(int inode, int target_node, const t_conn_cost_params& params, float /*R_upstream*/) const {
+    if (inode == target_node) {
+        return std::make_pair(0., 0.);
     }
 
+    RRNodeId from_node(inode);
+    RRNodeId to_node(target_node);
+
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
@@ -185,7 +189,9 @@ float ExtendedMapLookahead::get_map_cost(RRNodeId from_node,
 
     e_rr_type from_type = rr_graph.node_type(from_node);
     if (from_type == SOURCE || from_type == OPIN) {
-        return this->get_src_opin_cost(from_node, dx, dy, criticality_fac);
+        return this->get_src_opin_cost(from_node, dx, dy, params);
+    } else if (from_type == IPIN) {
+        return std::make_pair(0., 0.);
     }
 
     int from_seg_index = cost_map_.node_to_segment(size_t(from_node));
@@ -197,7 +203,8 @@ float ExtendedMapLookahead::get_map_cost(RRNodeId from_node,
                        "Not connected %d (%s, %d) -> %d (%s)\n",
                        size_t(from_node), device_ctx.rr_nodes[size_t(from_node)].type_string(), from_seg_index,
                        size_t(to_node), device_ctx.rr_nodes[size_t(to_node)].type_string());
-        return std::numeric_limits<float>::infinity();
+        float infinity = std::numeric_limits<float>::infinity();
+        return std::make_pair(infinity, infinity);
     }
 
     float expected_delay = cost_entry.delay;
@@ -216,16 +223,19 @@ float ExtendedMapLookahead::get_map_cost(RRNodeId from_node,
     float site_pin_delay = this->get_chan_ipin_delays(to_node);
     expected_delay += site_pin_delay;
 
-    float expected_cost = criticality_fac * expected_delay + (1.0 - criticality_fac) * expected_congestion;
+    float expected_delay_cost = params.criticality * expected_delay;
+    float expected_cong_cost = (1.0 - params.criticality) * expected_congestion;
+
+    float expected_cost = expected_delay_cost + expected_cong_cost;
 
     VTR_LOGV_DEBUG(f_router_debug, "Requested lookahead from node %d to %d\n", size_t(from_node), size_t(to_node));
     const std::string& segment_name = device_ctx.rr_segments[from_seg_index].name;
     VTR_LOGV_DEBUG(f_router_debug, "Lookahead returned %s (%d) with distance (%zd, %zd)\n",
                    segment_name.c_str(), from_seg_index,
                    dx, dy);
-    VTR_LOGV_DEBUG(f_router_debug, "Lookahead delay: %g\n", expected_delay);
-    VTR_LOGV_DEBUG(f_router_debug, "Lookahead congestion: %g\n", expected_congestion);
-    VTR_LOGV_DEBUG(f_router_debug, "Criticality: %g\n", criticality_fac);
+    VTR_LOGV_DEBUG(f_router_debug, "Lookahead delay: %g\n", expected_delay_cost);
+    VTR_LOGV_DEBUG(f_router_debug, "Lookahead congestion: %g\n", expected_cong_cost);
+    VTR_LOGV_DEBUG(f_router_debug, "Criticality: %g\n", params.criticality);
     VTR_LOGV_DEBUG(f_router_debug, "Lookahead cost: %g\n", expected_cost);
     VTR_LOGV_DEBUG(f_router_debug, "Site pin delay: %g\n", site_pin_delay);
 
@@ -239,7 +249,7 @@ float ExtendedMapLookahead::get_map_cost(RRNodeId from_node,
         VTR_ASSERT(0);
     }
 
-    return expected_cost;
+    return std::make_pair(expected_delay_cost, expected_cong_cost);
 }
 
 // Adds a best cost routing path from start_node_ind to node_ind to routing costs
@@ -560,20 +570,23 @@ float ExtendedMapLookahead::get_expected_cost(
     int current_node,
     int target_node,
     const t_conn_cost_params& params,
-    float /*R_upstream*/) const {
+    float R_upstream) const {
     auto& device_ctx = g_vpr_ctx.device();
 
     t_rr_type rr_type = device_ctx.rr_nodes[current_node].type();
 
     if (rr_type == CHANX || rr_type == CHANY || rr_type == SOURCE || rr_type == OPIN) {
-        return get_map_cost(
-            RRNodeId(current_node), RRNodeId(target_node), params.criticality);
+        float delay_cost, cong_cost;
+
+        // Get the total cost using the combined delay and congestion costs
+        std::tie(delay_cost, cong_cost) = get_expected_delay_and_cong(current_node, target_node, params, R_upstream);
+        return delay_cost + cong_cost;
     } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */
         // This is to return only the cost between the IPIN and SINK. No need to
         // query the cost map, as the routing of this connection is almost done.
-        return (device_ctx.rr_indexed_data[SINK_COST_INDEX].base_cost);
+        return device_ctx.rr_indexed_data[SINK_COST_INDEX].base_cost;
     } else { /* Change this if you want to investigate route-throughs */
-        return (0.);
+        return 0.;
     }
 }
 
diff --git a/vpr/src/route/router_lookahead_extended_map.h b/vpr/src/route/router_lookahead_extended_map.h
index ad23f035619..5e57cd340ed 100644
--- a/vpr/src/route/router_lookahead_extended_map.h
+++ b/vpr/src/route/router_lookahead_extended_map.h
@@ -25,7 +25,7 @@ class ExtendedMapLookahead : public RouterLookahead {
      * @param criticality_fac criticality of the current connection between 0 (all congestion) and 1 (all timing)
      * @return expected cost to get to the destination
      */
-    float get_src_opin_cost(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const;
+    std::pair<float, float> get_src_opin_cost(RRNodeId from_node, int delta_x, int delta_y, const t_conn_cost_params& params) const;
 
     /**
      * @brief Returns the CHAN -> IPIN delay that gets added to the final expected delay
@@ -54,8 +54,6 @@ class ExtendedMapLookahead : public RouterLookahead {
                                        std::vector<util::Search_Path>* paths,
                                        util::RoutingCosts* routing_costs);
 
-    float get_map_cost(RRNodeId from_node, RRNodeId to_node, float criticality_fac) const;
-
     CostMap cost_map_; ///<Cost map containing all data to extract the entry cost when querying the lookahead.
   public:
     /**
@@ -63,6 +61,11 @@ class ExtendedMapLookahead : public RouterLookahead {
      */
     float get_expected_cost(int node, int target_node, const t_conn_cost_params& params, float R_upstream) const override;
 
+    /**
+     * @brief Returns a pair of expected delay and congestion
+     */
+    std::pair<float, float> get_expected_delay_and_cong(int inode, int target_node, const t_conn_cost_params& params, float R_upstream) const override;
+
     /**
      * @brief Computes the extended lookahead map
      */
diff --git a/vpr/src/route/router_lookahead_map.cpp b/vpr/src/route/router_lookahead_map.cpp
index 2f1d955a20e..aafae1d14d5 100644
--- a/vpr/src/route/router_lookahead_map.cpp
+++ b/vpr/src/route/router_lookahead_map.cpp
@@ -234,7 +234,7 @@ float MapLookahead::get_expected_cost(int current_node, int target_node, const t
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    t_rr_type rr_type = rr_graph.node_type(current);
+    t_rr_type rr_type = rr_graph.node_type(RRNodeId(current_node));
 
     if (rr_type == CHANX || rr_type == CHANY || rr_type == SOURCE || rr_type == OPIN) {
         float delay_cost, cong_cost;