diff --git a/libs/libvtrcapnproto/CMakeLists.txt b/libs/libvtrcapnproto/CMakeLists.txt
index 91a77b08014..d0d50dd50e9 100644
--- a/libs/libvtrcapnproto/CMakeLists.txt
+++ b/libs/libvtrcapnproto/CMakeLists.txt
@@ -19,6 +19,7 @@ endif()
 # Each schema used should appear here.
 set(CAPNP_DEFS
     place_delay_model.capnp
+    connection_map.capnp
     matrix.capnp
     gen/rr_graph_uxsdcxx.capnp
     map_lookahead.capnp
diff --git a/libs/libvtrcapnproto/connection_map.capnp b/libs/libvtrcapnproto/connection_map.capnp
new file mode 100644
index 00000000000..874b39822dd
--- /dev/null
+++ b/libs/libvtrcapnproto/connection_map.capnp
@@ -0,0 +1,25 @@
+@0x876ec83c2fea5a18;
+
+using Matrix = import "matrix.capnp";
+
+struct VprCostEntry {
+    delay @0 :Float32;
+    congestion @1 :Float32;
+    fill @2 :Bool;
+}
+
+struct VprVector2D {
+    x @0 :Int64;
+    y @1 :Int64;
+}
+
+struct VprFloatEntry {
+    value @0 :Float32;
+}
+
+struct VprCostMap {
+    costMap @0 :Matrix.Matrix((Matrix.Matrix(VprCostEntry)));
+    offset @1 :Matrix.Matrix(VprVector2D);
+    depField @2 :List(Int64);
+    penalty @3 :Matrix.Matrix(VprFloatEntry);
+}
diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp
index 170bb9f85bd..d2efc837f80 100644
--- a/vpr/src/base/SetupVPR.cpp
+++ b/vpr/src/base/SetupVPR.cpp
@@ -156,6 +156,7 @@ void SetupVPR(const t_options* Options,
     }
 
     Segments = Arch->Segments;
+    device_ctx.segment_inf = Arch->Segments;
 
     SetupSwitches(*Arch, RoutingArch, Arch->Switches, Arch->num_switches);
     SetupRoutingArch(*Arch, RoutingArch);
diff --git a/vpr/src/base/echo_files.cpp b/vpr/src/base/echo_files.cpp
index 6f8de45933b..cd4e2566f9b 100644
--- a/vpr/src/base/echo_files.cpp
+++ b/vpr/src/base/echo_files.cpp
@@ -114,6 +114,8 @@ void alloc_and_load_echo_file_info() {
     setEchoFileName(E_ECHO_CHAN_DETAILS, "chan_details.txt");
     setEchoFileName(E_ECHO_SBLOCK_PATTERN, "sblock_pattern.txt");
     setEchoFileName(E_ECHO_ENDPOINT_TIMING, "endpoint_timing.echo.json");
+
+    setEchoFileName(E_ECHO_LOOKAHEAD_MAP, "lookahead_map.echo");
 }
 
 void free_echo_file_info() {
diff --git a/vpr/src/base/echo_files.h b/vpr/src/base/echo_files.h
index 70df3a3d962..d273c575d50 100644
--- a/vpr/src/base/echo_files.h
+++ b/vpr/src/base/echo_files.h
@@ -46,6 +46,7 @@ enum e_echo_files {
     E_ECHO_CHAN_DETAILS,
     E_ECHO_SBLOCK_PATTERN,
     E_ECHO_ENDPOINT_TIMING,
+    E_ECHO_LOOKAHEAD_MAP,
 
     //Timing Graphs
     E_ECHO_PRE_PACKING_TIMING_GRAPH,
diff --git a/vpr/src/base/vpr_context.h b/vpr/src/base/vpr_context.h
index 302a93b6b8e..ff43a23345f 100644
--- a/vpr/src/base/vpr_context.h
+++ b/vpr/src/base/vpr_context.h
@@ -137,6 +137,7 @@ struct DeviceContext : public Context {
     t_rr_node_indices rr_node_indices; //[0..NUM_RR_TYPES-1][0..grid.width()-1][0..grid.width()-1][0..size-1]
 
     std::vector<t_rr_switch_inf> rr_switch_inf; /* autogenerated in build_rr_graph based on switch fan-in. [0..(num_rr_switches-1)] */
+    std::vector<t_segment_inf> segment_inf;
 
     //Wire segment types in RR graph
     std::vector<t_segment_inf> rr_segments;
diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing_place.h
index b2d0ff16bdb..68716fcce64 100644
--- a/vpr/src/place/timing_place.h
+++ b/vpr/src/place/timing_place.h
@@ -190,10 +190,11 @@ class PlacerTimingCosts {
         num_levels_ = ilevel + 1;
 
         size_t num_leaves = num_nodes_in_level(ilevel);
-        size_t num_level_before_leaves = num_nodes_in_level(ilevel - 1);
+        //size_t num_level_before_leaves = num_nodes_in_level(ilevel - 1);
 
         VTR_ASSERT_MSG(num_leaves >= num_connections, "Need at least as many leaves as connections");
-        VTR_ASSERT_MSG(num_connections == 0 || num_level_before_leaves < num_connections, "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)");
+        // XXX: Temporarily disabling this assertion as num_level_befor_leaves
+        //VTR_ASSERT_MSG(num_connections == 0 || num_level_before_leaves < num_connections, "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)");
 
         //We don't need to store all possible leaves if we have fewer connections
         //(i.e. bottom-right of tree is empty)
diff --git a/vpr/src/route/route_profiling.cpp b/vpr/src/route/route_profiling.cpp
index c5e5737caca..b393711eba9 100644
--- a/vpr/src/route/route_profiling.cpp
+++ b/vpr/src/route/route_profiling.cpp
@@ -2,6 +2,7 @@
 #include "globals.h"
 #include "vpr_types.h"
 #include "route_profiling.h"
+#include "rr_graph.h"
 
 namespace profiling {
 
@@ -29,6 +30,10 @@ void time_on_fanout_analysis() {}
 
 void profiling_initialization(unsigned /*max_net_fanout*/) {}
 
+void conn_start() {}
+void conn_finish(int /*src_rr*/, int /*sink_rr*/, float /*criticality*/) {}
+void net_finish() {}
+
 #else
 
 constexpr unsigned int fanout_per_bin = 1;
@@ -181,6 +186,12 @@ void congestion_analysis() {
 #    endif
 }
 
+static clock_t conn_start_time;
+static float worst_conn_time = 0.f;
+static int worst_src_rr;
+static int worst_sink_rr;
+static float worst_crit;
+
 void profiling_initialization(unsigned max_fanout) {
     // add 1 so that indexing on the max fanout would still be valid
     time_on_fanout.resize((max_fanout / fanout_per_bin) + 1, 0);
@@ -195,8 +206,37 @@ void profiling_initialization(unsigned max_fanout) {
     part_tree_preserved = 0;
     connections_forced_to_reroute = 0;
     connections_rerouted_due_to_forcing = 0;
+    worst_conn_time = 0.f;
     return;
 }
+
+void conn_start() {
+    conn_start_time = clock();
+}
+void conn_finish(int src_rr, int sink_rr, float criticality) {
+    float route_time = static_cast<float>(clock() - conn_start_time) / CLOCKS_PER_SEC;
+    if (route_time > worst_conn_time) {
+        worst_src_rr = src_rr;
+        worst_sink_rr = sink_rr;
+        worst_conn_time = route_time;
+        worst_crit = criticality;
+    }
+
+    VTR_LOG("%s to %s (crit: %f) took %f\n",
+            describe_rr_node(src_rr).c_str(),
+            describe_rr_node(sink_rr).c_str(),
+            criticality,
+            route_time);
+}
+void net_finish() {
+    if (worst_conn_time > 0.f) {
+        VTR_LOG("Worst conn was %s to %s (crit: %f) took %f\n",
+                describe_rr_node(worst_src_rr).c_str(),
+                describe_rr_node(worst_sink_rr).c_str(),
+                worst_crit,
+                worst_conn_time);
+    }
+}
 #endif
 
 } // end namespace profiling
diff --git a/vpr/src/route/route_profiling.h b/vpr/src/route/route_profiling.h
index 52a73655214..3cc1ec377a2 100644
--- a/vpr/src/route/route_profiling.h
+++ b/vpr/src/route/route_profiling.h
@@ -30,6 +30,10 @@ void congestion_analysis();
 void time_on_criticality_analysis();
 void time_on_fanout_analysis();
 
+void conn_start();
+void conn_finish(int src_rr, int sink_rr, float criticality);
+void net_finish();
+
 void profiling_initialization(unsigned max_net_fanout);
 
 } // end namespace profiling
diff --git a/vpr/src/route/route_timing.cpp b/vpr/src/route/route_timing.cpp
index e7eef6461c6..f58672606bf 100644
--- a/vpr/src/route/route_timing.cpp
+++ b/vpr/src/route/route_timing.cpp
@@ -1096,6 +1096,8 @@ bool timing_driven_route_net(ConnectionRouter& router,
             conn_delay_budget.short_path_criticality = budgeting_inf.get_crit_short_path(net_id, target_pin);
         }
 
+        profiling::conn_start();
+
         // build a branch in the route tree to the target
         if (!timing_driven_route_sink(router,
                                       net_id,
@@ -1109,10 +1111,15 @@ bool timing_driven_route_net(ConnectionRouter& router,
                                       router_stats))
             return false;
 
+        profiling::conn_finish(route_ctx.net_rr_terminals[net_id][0],
+                               sink_rr,
+                               pin_criticality[target_pin]);
+
         ++router_stats.connections_routed;
     } // finished all sinks
 
     ++router_stats.nets_routed;
+    profiling::net_finish();
 
     /* For later timing analysis. */
 
diff --git a/vpr/src/route/route_timing.h b/vpr/src/route/route_timing.h
index 5bc4653035a..3974f3c6405 100644
--- a/vpr/src/route/route_timing.h
+++ b/vpr/src/route/route_timing.h
@@ -13,6 +13,8 @@
 #include "connection_router_interface.h"
 #include "heap_type.h"
 
+extern bool f_router_debug;
+
 int get_max_pins_per_net();
 
 bool try_timing_driven_route(const t_router_opts& router_opts,
diff --git a/vpr/src/route/router_lookahead_map.cpp b/vpr/src/route/router_lookahead_map.cpp
index 0b9772e2e50..6aad0be6eb5 100644
--- a/vpr/src/route/router_lookahead_map.cpp
+++ b/vpr/src/route/router_lookahead_map.cpp
@@ -1,187 +1,95 @@
-/*
- * The router lookahead provides an estimate of the cost from an intermediate node to the target node
- * during directed (A*-like) routing.
- *
- * The VPR 7.0 lookahead (route/route_timing.c ==> get_timing_driven_expected_cost) lower-bounds the remaining delay and
- * congestion by assuming that a minimum number of wires, of the same type as the current node being expanded, can be used
- * to complete the route. While this method is efficient, it can run into trouble with architectures that use
- * multiple interconnected wire types.
- *
- * The lookahead in this file performs undirected Dijkstra searches to evaluate many paths through the routing network,
- * starting from all the different wire types in the routing architecture. This ensures the lookahead captures the 
- * effect of inter-wire connectivity. This information is then reduced into a delta_x delta_y based lookup table for 
- * reach source wire type (f_cost_map). This is used for estimates from CHANX/CHANY -> SINK nodes. See Section 3.2.4 
- * in Oleg Petelin's MASc thesis (2016) for more discussion.
- *
- * To handle estimates starting from SOURCE/OPIN's the lookahead also creates a small side look-up table of the wire types
- * which are reachable from each physical tile type's SOURCEs/OPINs (f_src_opin_reachable_wires). This is used for
- * SRC/OPIN -> CHANX/CHANY estimates.
- *
- * In the case of SRC/OPIN -> SINK estimates the resuls from the two look-ups are added together (and the minimum taken
- * if there are multiple possiblities).
- */
+#include "router_lookahead_map.h"
 
-#include <cmath>
 #include <vector>
 #include <queue>
-#include <ctime>
-#include "vpr_types.h"
-#include "vpr_error.h"
-#include "vpr_utils.h"
+
+#include "rr_node.h"
+#include "router_lookahead_map_utils.h"
+#include "router_lookahead_sampling.h"
 #include "globals.h"
 #include "vtr_math.h"
-#include "vtr_log.h"
-#include "vtr_assert.h"
 #include "vtr_time.h"
 #include "vtr_geometry.h"
-#include "router_lookahead_map.h"
-#include "rr_graph2.h"
+#include "echo_files.h"
 #include "rr_graph.h"
-#include "route_common.h"
+
 #include "route_timing.h"
+#include "route_common.h"
 
 #ifdef VTR_ENABLE_CAPNPROTO
 #    include "capnp/serialize.h"
-#    include "map_lookahead.capnp.h"
+#    include "connection_map.capnp.h"
 #    include "ndmatrix_serdes.h"
 #    include "mmap_file.h"
 #    include "serdes_utils.h"
-#endif /* VTR_ENABLE_CAPNPROTO */
+#endif
 
-/* we will profile delay/congestion using this many tracks for each wire type */
-#define MAX_TRACK_OFFSET 16
+#if defined(VPR_USE_TBB)
+#    include <tbb/parallel_for_each.h>
+#    include <tbb/mutex.h>
+#endif
 
-/* we're profiling routing cost over many tracks for each wire type, so we'll have many cost entries at each |dx|,|dy| offset.
- * there are many ways to "boil down" the many costs at each offset to a single entry for a given (wire type, chan_type) combination --
- * we can take the smallest cost, the average, median, etc. This define selects the method we use.
+/* we're profiling routing cost over many tracks for each wire type, so we'll
+ * have many cost entries at each |dx|,|dy| offset. There are many ways to
+ * "boil down" the many costs at each offset to a single entry for a given
+ * (wire type, chan_type) combination we can take the smallest cost, the
+ * average, median, etc. This define selects the method we use.
+ *
+ * NOTE: Currently, only SMALLEST is supported.
+ *
  * See e_representative_entry_method */
-#define REPRESENTATIVE_ENTRY_METHOD SMALLEST
-
-/* when a list of delay/congestion entries at a coordinate in Cost_Entry is boiled down to a single
- * representative entry, this enum is passed-in to specify how that representative entry should be
- * calculated */
-enum e_representative_entry_method {
-    FIRST = 0, //the first cost that was recorded
-    SMALLEST,  //the smallest-delay cost recorded
-    AVERAGE,
-    GEOMEAN,
-    MEDIAN
-};
+#define REPRESENTATIVE_ENTRY_METHOD util::SMALLEST
 
-/* a class that stores delay/congestion information for a given relative coordinate during the Dijkstra expansion.
- * since it stores multiple cost entries, it is later boiled down to a single representative cost entry to be stored
- * in the final lookahead cost map */
-class Expansion_Cost_Entry {
-  public:
-    std::vector<Cost_Entry> cost_vector;
-
-  private:
-    Cost_Entry get_smallest_entry();
-    Cost_Entry get_average_entry();
-    Cost_Entry get_geomean_entry();
-    Cost_Entry get_median_entry();
-
-  public:
-    void add_cost_entry(float add_delay, float add_congestion) {
-        Cost_Entry cost_entry(add_delay, add_congestion);
-        if (REPRESENTATIVE_ENTRY_METHOD == SMALLEST) {
-            /* taking the smallest-delay entry anyway, so no need to push back multple entries */
-            if (this->cost_vector.empty()) {
-                this->cost_vector.push_back(cost_entry);
-            } else {
-                if (add_delay < this->cost_vector[0].delay) {
-                    this->cost_vector[0] = cost_entry;
-                }
-            }
-        } else {
-            this->cost_vector.push_back(cost_entry);
-        }
-    }
-    void clear_cost_entries() {
-        this->cost_vector.clear();
-    }
+#define CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_MAPS
 
-    Cost_Entry get_representative_cost_entry(e_representative_entry_method method) {
-        float nan = std::numeric_limits<float>::quiet_NaN();
-        Cost_Entry entry(nan, nan);
+#define X_OFFSET 2
+#define Y_OFFSET 2
 
-        if (!cost_vector.empty()) {
-            switch (method) {
-                case FIRST:
-                    entry = cost_vector[0];
-                    break;
-                case SMALLEST:
-                    entry = this->get_smallest_entry();
-                    break;
-                case AVERAGE:
-                    entry = this->get_average_entry();
-                    break;
-                case GEOMEAN:
-                    entry = this->get_geomean_entry();
-                    break;
-                case MEDIAN:
-                    entry = this->get_median_entry();
-                    break;
-                default:
-                    break;
-            }
-        }
-        return entry;
-    }
-};
+#define MAX_EXPANSION_LEVEL 1
 
-/* a class that represents an entry in the Dijkstra expansion priority queue */
-class PQ_Entry {
-  public:
-    RRNodeId rr_node; //index in device_ctx.rr_nodes that this entry represents
-    float cost;       //the cost of the path to get to this node
+// Don't continue storing a path after hitting a lower-or-same cost entry.
+static constexpr bool BREAK_ON_MISS = false;
 
-    /* store backward delay, R and congestion info */
-    float delay;
-    float R_upstream;
-    float congestion_upstream;
-
-    PQ_Entry(RRNodeId set_rr_node, int /*switch_ind*/, float parent_delay, float parent_R_upstream, float parent_congestion_upstream, bool starting_node) {
-        this->rr_node = set_rr_node;
-
-        auto& device_ctx = g_vpr_ctx.device();
-        this->delay = parent_delay;
-        this->congestion_upstream = parent_congestion_upstream;
-        this->R_upstream = parent_R_upstream;
-        if (!starting_node) {
-            int cost_index = device_ctx.rr_nodes.node_cost_index(RRNodeId(set_rr_node));
-            //this->delay += g_rr_nodes[set_rr_node].C() * (g_rr_switch_inf[switch_ind].R + 0.5*g_rr_nodes[set_rr_node].R()) +
-            //              g_rr_switch_inf[switch_ind].Tdel;
-
-            //FIXME going to use the delay data that the VPR7 lookahead uses. For some reason the delay calculation above calculates
-            //    a value that's just a little smaller compared to what VPR7 lookahead gets. While the above calculation should be more accurate,
-            //    I have found that it produces the same CPD results but with worse runtime.
-            //
-            //    TODO: figure out whether anything's wrong with the calculation above and use that instead. If not, add the other
-            //          terms like T_quadratic and R_upstream to the calculation below (they are == 0 or UNDEFINED for buffered archs I think)
+// Distance penalties filling are calculated based on available samples, but can be adjusted with this factor.
+static constexpr float PENALTY_FACTOR = 1.f;
+static constexpr float PENALTY_MIN = 1e-12f;
 
-            //NOTE: We neglect the T_quadratic and C_load terms and Switch R, so this lookahead is likely
-            //      less accurate on unbuffered (e.g. pass-gate) architectures
+static constexpr int MIN_PATH_COUNT = 1000;
 
-            this->delay += device_ctx.rr_indexed_data[cost_index].T_linear;
+template<typename Entry>
+static std::pair<float, int> run_dijkstra(int start_node_ind,
+                                          std::vector<bool>* node_expanded,
+                                          std::vector<util::Search_Path>* paths,
+                                          RoutingCosts* routing_costs);
 
-            this->congestion_upstream += device_ctx.rr_indexed_data[cost_index].base_cost;
-        }
+// also known as the L1 norm
+static int manhattan_distance(const vtr::Point<int>& a, const vtr::Point<int>& b) {
+    return abs(b.x() - a.x()) + abs(b.y() - a.y());
+}
 
-        if (this->delay < 0) {
-            VTR_LOG("NEGATIVE DELAY!\n");
-        }
+template<class T>
+constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
+    return std::min(std::max(v, lo), hi);
+}
 
-        /* set the cost of this node */
-        this->cost = this->delay;
+template<typename T>
+static vtr::Point<T> closest_point_in_rect(const vtr::Rect<T>& r, const vtr::Point<T>& p) {
+    if (r.empty()) {
+        return vtr::Point<T>(0, 0);
+    } else {
+        return vtr::Point<T>(clamp<T>(p.x(), r.xmin(), r.xmax() - 1),
+                             clamp<T>(p.y(), r.ymin(), r.ymax() - 1));
     }
+}
 
-    bool operator<(const PQ_Entry& obj) const {
-        /* inserted into max priority queue so want queue entries with a lower cost to be greater */
-        return (this->cost > obj.cost);
-    }
-};
+// deltas calculation
+static void get_xy_deltas(const RRNodeId from_node, const RRNodeId to_node, int* delta_x, int* delta_y);
+static void adjust_rr_position(const RRNodeId rr, int& x, int& y);
+static void adjust_rr_pin_position(const RRNodeId rr, int& x, int& y);
+static void adjust_rr_wire_position(const RRNodeId rr, int& x, int& y);
+static void adjust_rr_src_sink_position(const RRNodeId rr, int& x, int& y);
 
+// additional lookups for IPIN/OPIN missing delays
 struct t_reachable_wire_inf {
     e_rr_type wire_rr_type;
     int wire_seg_index;
@@ -191,120 +99,335 @@ struct t_reachable_wire_inf {
     float delay;
 };
 
-/* used during Dijkstra expansion to store delay/congestion info lists for each relative coordinate for a given segment and channel type.
- * the list at each coordinate is later boiled down to a single representative cost entry to be stored in the final cost map */
-typedef vtr::Matrix<Expansion_Cost_Entry> t_routing_cost_map; //[0..device_ctx.grid.width()-1][0..device_ctx.grid.height()-1]
-
-typedef std::vector<std::vector<std::map<int, t_reachable_wire_inf>>> t_src_opin_reachable_wires; //[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1][wire_seg_index]
-                                                                                                  // ^                                           ^             ^
-                                                                                                  // |                                           |             |
-                                                                                                  // physical block type index                   |             Reachable wire info
-                                                                                                  //                                             |
-                                                                                                  //                                             SOURCE/OPIN ptc
-
-struct t_dijkstra_data {
-    /* a list of boolean flags (one for each rr node) to figure out if a certain node has already been expanded */
-    vtr::vector<RRNodeId, bool> node_expanded;
-    /* for each node keep a list of the cost with which that node has been visited (used to determine whether to push
-     * a candidate node onto the expansion queue */
-    vtr::vector<RRNodeId, float> node_visited_costs;
-    /* a priority queue for expansion */
-    std::priority_queue<PQ_Entry> pq;
-};
+typedef std::vector<std::vector<std::map<int, t_reachable_wire_inf>>> t_src_opin_delays; //[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1][wire_seg_index]
+                                                                                         // ^                                           ^             ^
+                                                                                         // |                                           |             |
+                                                                                         // physical block type index                   |             Reachable wire info
+                                                                                         //                                             |
+                                                                                         //                                             SOURCE/OPIN ptc
 
-/******** File-Scope Variables ********/
+typedef std::vector<std::vector<std::map<int, t_reachable_wire_inf>>> t_chan_ipins_delays; //[0..device_ctx.physical_tile_types.size()-1][0..max_ptc-1][wire_seg_index]
+                                                                                           // ^                                           ^             ^
+                                                                                           // |                                           |             |
+                                                                                           // physical block type index                   |             Wire to IPIN segment info
+                                                                                           //                                             |
+                                                                                           //                                             SINK/IPIN ptc
 
-constexpr int DIRECT_CONNECT_SPECIAL_SEG_TYPE = -1;
+//Look-up table from SOURCE/OPIN to CHANX/CHANY of various types
+t_src_opin_delays f_src_opin_delays;
 
-//Look-up table from CHANX/CHANY (to SINKs) for various distances
-t_wire_cost_map f_wire_cost_map;
+//Look-up table from CHANX/CHANY to SINK/IPIN of various types
+t_chan_ipins_delays f_chan_ipins_delays;
 
-//Look-up table from SOURCE/OPIN to CHANX/CHANY of various types
-t_src_opin_reachable_wires f_src_opin_reachable_wires;
+constexpr int DIRECT_CONNECT_SPECIAL_SEG_TYPE = -1;
 
-/******** File-Scope Functions ********/
-Cost_Entry get_wire_cost_entry(e_rr_type rr_type, int seg_index, int delta_x, int delta_y);
-static void compute_router_wire_lookahead(const std::vector<t_segment_inf>& segment_inf);
 static void compute_router_src_opin_lookahead();
-static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point<int> start);
-void dijkstra_flood_to_wires(int itile, RRNodeId inode, t_src_opin_reachable_wires& src_opin_reachable_wires);
-
-/* returns index of a node from which to start routing */
-static RRNodeId get_start_node(int start_x, int start_y, int target_x, int target_y, t_rr_type rr_type, int seg_index, int track_offset);
-/* runs Dijkstra's algorithm from specified node until all nodes have been visited. Each time a pin is visited, the delay/congestion information
- * to that pin is stored is added to an entry in the routing_cost_map */
-static void run_dijkstra(RRNodeId start_node, int start_x, int start_y, t_routing_cost_map& routing_cost_map, t_dijkstra_data* data);
-/* iterates over the children of the specified node and selectively pushes them onto the priority queue */
-static void expand_dijkstra_neighbours(PQ_Entry parent_entry, vtr::vector<RRNodeId, float>& node_visited_costs, vtr::vector<RRNodeId, bool>& node_expanded, std::priority_queue<PQ_Entry>& pq);
-/* sets the lookahead cost map entries based on representative cost entries from routing_cost_map */
-static void set_lookahead_map_costs(int segment_index, e_rr_type chan_type, t_routing_cost_map& routing_cost_map);
-/* fills in missing lookahead map entries by copying the cost of the closest valid entry */
-static void fill_in_missing_lookahead_entries(int segment_index, e_rr_type chan_type);
-/* returns a cost entry in the f_wire_cost_map that is near the specified coordinates (and preferably towards (0,0)) */
-static Cost_Entry get_nearby_cost_entry(int x, int y, int segment_index, int chan_index);
-/* returns the absolute delta_x and delta_y offset required to reach to_node from from_node */
-static void get_xy_deltas(const RRNodeId from_node, const RRNodeId to_node, int* delta_x, int* delta_y);
-static void adjust_rr_position(const RRNodeId rr, int& x, int& y);
-static void adjust_rr_pin_position(const RRNodeId rr, int& x, int& y);
-static void adjust_rr_wire_position(const RRNodeId rr, int& x, int& y);
-static void adjust_rr_src_sink_position(const RRNodeId rr, int& x, int& y);
+static void compute_router_chan_ipin_lookahead();
+static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point<int> prev);
+static void dijkstra_flood_to_wires(int itile, RRNodeId node, t_src_opin_delays& src_opin_delays);
+static void dijkstra_flood_to_ipins(RRNodeId node, t_chan_ipins_delays& chan_ipins_delays);
+
+// build the segment map
+void CostMap::build_segment_map() {
+    const auto& device_ctx = g_vpr_ctx.device();
+    segment_map_.resize(device_ctx.rr_nodes.size());
+    for (size_t i = 0; i < segment_map_.size(); ++i) {
+        auto& from_node = device_ctx.rr_nodes[i];
+
+        int from_cost_index = from_node.cost_index();
+        int from_seg_index = device_ctx.rr_indexed_data[from_cost_index].seg_index;
 
-static void print_wire_cost_map(const std::vector<t_segment_inf>& segment_inf);
-static void print_router_cost_map(const t_routing_cost_map& router_cost_map);
+        segment_map_[i] = from_seg_index;
+    }
+}
 
-/******** Interface class member function definitions ********/
-float MapLookahead::get_expected_cost(int current_node, int target_node, const t_conn_cost_params& params, float /*R_upstream*/) const {
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& rr_graph = device_ctx.rr_nodes;
+// resize internal data structures
+void CostMap::set_counts(size_t seg_count) {
+    cost_map_.clear();
+    offset_.clear();
+    penalty_.clear();
+    cost_map_.resize({2, seg_count});
+    offset_.resize({2, seg_count});
+    penalty_.resize({2, seg_count});
+    seg_count_ = seg_count;
+}
 
-    RRNodeId current(current_node);
-    RRNodeId target(target_node);
+// cached node -> segment map
+int CostMap::node_to_segment(int from_node_ind) const {
+    return segment_map_[from_node_ind];
+}
 
-    t_rr_type rr_type = rr_graph.node_type(current);
+static util::Cost_Entry penalize(const util::Cost_Entry& entry, int distance, float penalty) {
+    penalty = std::max(penalty, PENALTY_MIN);
+    return util::Cost_Entry(entry.delay + distance * penalty * PENALTY_FACTOR,
+                            entry.congestion, entry.fill);
+}
 
-    if (rr_type == CHANX || rr_type == CHANY || rr_type == SOURCE || rr_type == OPIN) {
-        return get_lookahead_map_cost(current, target, params.criticality);
-    } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */
-        return (device_ctx.rr_indexed_data[SINK_COST_INDEX].base_cost);
-    } else { /* Change this if you want to investigate route-throughs */
-        return (0.);
+// get a cost entry for a segment type, connection box type, and offset
+util::Cost_Entry CostMap::find_cost(int from_seg_index, e_rr_type rr_type, int delta_x, int delta_y) const {
+    int chan_index = 0;
+    if (rr_type == CHANY) {
+        chan_index = 1;
+    }
+
+    VTR_ASSERT(from_seg_index >= 0 && from_seg_index < (ssize_t)offset_.size());
+    const auto& cost_map = cost_map_[chan_index][from_seg_index];
+    if (cost_map.dim_size(0) == 0 || cost_map.dim_size(1) == 0) {
+        return util::Cost_Entry();
     }
+
+    vtr::Point<int> coord(delta_x - offset_[chan_index][from_seg_index].first,
+                          delta_y - offset_[chan_index][from_seg_index].second);
+    vtr::Rect<int> bounds(0, 0, cost_map.dim_size(0), cost_map.dim_size(1));
+    auto closest = closest_point_in_rect(bounds, coord);
+    auto cost = cost_map_[chan_index][from_seg_index][closest.x()][closest.y()];
+    float penalty = penalty_[chan_index][from_seg_index];
+    auto distance = manhattan_distance(closest, coord);
+    return penalize(cost, distance, penalty);
 }
 
-void MapLookahead::compute(const std::vector<t_segment_inf>& segment_inf) {
-    compute_router_lookahead(segment_inf);
+// set the cost map for a segment type and connection box type, filling holes
+void CostMap::set_cost_map(const RoutingCosts& delay_costs, const RoutingCosts& base_costs) {
+    // calculate the bounding boxes
+    vtr::Matrix<vtr::Rect<int>> bounds({2, seg_count_});
+    for (const auto& entry : delay_costs) {
+        bounds[entry.first.chan_index][entry.first.seg_index].expand_bounding_box(vtr::Rect<int>(entry.first.delta));
+    }
+    for (const auto& entry : base_costs) {
+        bounds[entry.first.chan_index][entry.first.seg_index].expand_bounding_box(vtr::Rect<int>(entry.first.delta));
+    }
+
+    // store bounds
+    for (size_t chan = 0; chan < 2; chan++) {
+        for (size_t seg = 0; seg < seg_count_; seg++) {
+            const auto& chan_seg_bounds = bounds[chan][seg];
+            if (chan_seg_bounds.empty()) {
+                // Didn't find any sample routes, so routing isn't possible between these segment/connection box types.
+                offset_[chan][seg] = std::make_pair(0, 0);
+                cost_map_[chan][seg] = vtr::NdMatrix<util::Cost_Entry, 2>(
+                    {size_t(0), size_t(0)});
+                continue;
+            } else {
+                offset_[chan][seg] = std::make_pair(chan_seg_bounds.xmin(), chan_seg_bounds.ymin());
+                cost_map_[chan][seg] = vtr::NdMatrix<util::Cost_Entry, 2>(
+                    {size_t(chan_seg_bounds.width()), size_t(chan_seg_bounds.height())});
+            }
+        }
+    }
+
+    // store entries into the matrices
+    for (const auto& entry : delay_costs) {
+        int seg = entry.first.seg_index;
+        int chan = entry.first.chan_index;
+        const auto& chan_seg_bounds = bounds[chan][seg];
+        int x = entry.first.delta.x() - chan_seg_bounds.xmin();
+        int y = entry.first.delta.y() - chan_seg_bounds.ymin();
+        cost_map_[chan][seg][x][y].delay = entry.second;
+    }
+    for (const auto& entry : base_costs) {
+        int seg = entry.first.seg_index;
+        int chan = entry.first.chan_index;
+        const auto& chan_seg_bounds = bounds[chan][seg];
+        int x = entry.first.delta.x() - chan_seg_bounds.xmin();
+        int y = entry.first.delta.y() - chan_seg_bounds.ymin();
+        cost_map_[chan][seg][x][y].congestion = entry.second;
+    }
+
+    // fill the holes
+    for (size_t chan = 0; chan < 2; chan++) {
+        for (size_t seg = 0; seg < seg_count_; seg++) {
+            penalty_[chan][seg] = std::numeric_limits<float>::infinity();
+            const auto& chan_seg_bounds = bounds[chan][seg];
+            if (chan_seg_bounds.empty()) {
+                continue;
+            }
+            auto& matrix = cost_map_[chan][seg];
+
+            // calculate delay penalty
+            float min_delay = std::numeric_limits<float>::infinity(), max_delay = 0.f;
+            vtr::Point<int> min_location(0, 0), max_location(0, 0);
+            for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
+                for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
+                    util::Cost_Entry& cost_entry = matrix[ix][iy];
+                    if (cost_entry.valid()) {
+                        if (cost_entry.delay < min_delay) {
+                            min_delay = cost_entry.delay;
+                            min_location = vtr::Point<int>(ix, iy);
+                        }
+                        if (cost_entry.delay > max_delay) {
+                            max_delay = cost_entry.delay;
+                            max_location = vtr::Point<int>(ix, iy);
+                        }
+                    }
+                }
+            }
+            float delay_penalty = (max_delay - min_delay) / static_cast<float>(std::max(1, manhattan_distance(max_location, min_location)));
+            penalty_[chan][seg] = delay_penalty;
+
+            // find missing cost entries and fill them in by copying a nearby cost entry
+            std::vector<std::tuple<unsigned, unsigned, util::Cost_Entry>> missing;
+            bool couldnt_fill = false;
+            auto shifted_bounds = vtr::Rect<int>(0, 0, chan_seg_bounds.width(), chan_seg_bounds.height());
+            int max_fill = 0;
+            for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
+                for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
+                    util::Cost_Entry& cost_entry = matrix[ix][iy];
+                    if (!cost_entry.valid()) {
+                        // maximum search radius
+                        util::Cost_Entry filler;
+                        int distance;
+                        std::tie(filler, distance) = get_nearby_cost_entry(matrix, ix, iy, shifted_bounds);
+                        if (filler.valid()) {
+                            missing.push_back(std::make_tuple(ix, iy, penalize(filler, distance, delay_penalty)));
+                            max_fill = std::max(max_fill, distance);
+                        } else {
+                            couldnt_fill = true;
+                        }
+                    }
+                }
+                if (couldnt_fill) {
+                    // give up trying to fill an empty matrix
+                    break;
+                }
+            }
+
+            if (!couldnt_fill) {
+                VTR_LOG("At %d: max_fill = %d, delay_penalty = %e\n", seg, max_fill, delay_penalty);
+            }
+
+            // write back the missing entries
+            for (auto& xy_entry : missing) {
+                matrix[std::get<0>(xy_entry)][std::get<1>(xy_entry)] = std::get<2>(xy_entry);
+            }
+
+            if (couldnt_fill) {
+                VTR_LOG_WARN("Couldn't fill holes in the cost matrix for %d -> %ld, %d x %d bounding box\n",
+                             chan, seg, chan_seg_bounds.width(), chan_seg_bounds.height());
+                for (unsigned y = 0; y < matrix.dim_size(1); y++) {
+                    for (unsigned x = 0; x < matrix.dim_size(0); x++) {
+                        VTR_ASSERT(!matrix[x][y].valid());
+                    }
+                }
+            }
+        }
+    }
 }
 
-void MapLookahead::read(const std::string& file) {
-    read_router_lookahead(file);
+// prints an ASCII diagram of each cost map for a segment type (debug)
+// o => above average
+// . => at or below average
+// * => invalid (missing)
+void CostMap::print(int iseg) const {
+    for (size_t chan_index = 0;
+         chan_index < 2;
+         chan_index++) {
+        auto& matrix = cost_map_[chan_index][iseg];
+        if (matrix.dim_size(0) == 0 || matrix.dim_size(1) == 0) {
+            VTR_LOG("cost EMPTY for chan_index = %lu\n", chan_index);
+            continue;
+        }
+        VTR_LOG("cost for chan_index = %lu (%zu, %zu)\n", chan_index, matrix.dim_size(0), matrix.dim_size(1));
+        double sum = 0.0;
+        for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
+            for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
+                const auto& entry = matrix[ix][iy];
+                if (entry.valid()) {
+                    sum += entry.delay;
+                }
+            }
+        }
+        double avg = sum / ((double)matrix.dim_size(0) * (double)matrix.dim_size(1));
+        for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
+            for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
+                const auto& entry = matrix[ix][iy];
+                if (!entry.valid()) {
+                    VTR_LOG("*");
+                } else if (entry.delay * 4 > avg * 5) {
+                    VTR_LOG("O");
+                } else if (entry.delay > avg) {
+                    VTR_LOG("o");
+                } else if (entry.delay * 4 > avg * 3) {
+                    VTR_LOG(".");
+                } else {
+                    VTR_LOG(" ");
+                }
+            }
+            VTR_LOG("\n");
+        }
+    }
+}
 
-    //Next, compute which wire types are accessible (and the cost to reach them)
-    //from the different physical tile type's SOURCEs & OPINs
-    compute_router_src_opin_lookahead();
+// list segment type and connection box type pairs that have empty cost maps (debug)
+std::vector<std::pair<int, int>> CostMap::list_empty() const {
+    std::vector<std::pair<int, int>> results;
+    for (int chan_index = 0; chan_index < (int)cost_map_.dim_size(1); chan_index++) {
+        for (int iseg = 0; iseg < (int)cost_map_.dim_size(0); iseg++) {
+            auto& matrix = cost_map_[chan_index][iseg];
+            if (matrix.dim_size(0) == 0 || matrix.dim_size(1) == 0) results.push_back(std::make_pair(chan_index, iseg));
+        }
+    }
+    return results;
 }
 
-void MapLookahead::write(const std::string& file) const {
-    write_router_lookahead(file);
+static void assign_min_entry(util::Cost_Entry* dst, const util::Cost_Entry& src) {
+    if (src.delay < dst->delay) {
+        dst->delay = src.delay;
+    }
+    if (src.congestion < dst->congestion) {
+        dst->congestion = src.congestion;
+    }
+}
+
+// find the minimum cost entry from the nearest manhattan distance neighbor
+std::pair<util::Cost_Entry, int> CostMap::get_nearby_cost_entry(const vtr::NdMatrix<util::Cost_Entry, 2>& matrix,
+                                                                int cx,
+                                                                int cy,
+                                                                const vtr::Rect<int>& bounds) {
+    // spiral around (cx, cy) looking for a nearby entry
+    bool in_bounds = bounds.contains(vtr::Point<int>(cx, cy));
+    if (!in_bounds) {
+        return std::make_pair(util::Cost_Entry(), 0);
+    }
+    int n = 0;
+    util::Cost_Entry fill(matrix[cx][cy]);
+    fill.fill = true;
+
+    while (in_bounds && !fill.valid()) {
+        n++;
+        in_bounds = false;
+        util::Cost_Entry min_entry;
+        for (int ox = -n; ox <= n; ox++) {
+            int x = cx + ox;
+            int oy = n - abs(ox);
+            int yp = cy + oy;
+            int yn = cy - oy;
+            if (bounds.contains(vtr::Point<int>(x, yp))) {
+                assign_min_entry(&min_entry, matrix[x][yp]);
+                in_bounds = true;
+            }
+            if (bounds.contains(vtr::Point<int>(x, yn))) {
+                assign_min_entry(&min_entry, matrix[x][yn]);
+                in_bounds = true;
+            }
+        }
+        if (!std::isfinite(fill.delay)) {
+            fill.delay = min_entry.delay;
+        }
+        if (!std::isfinite(fill.congestion)) {
+            fill.congestion = min_entry.congestion;
+        }
+    }
+    return std::make_pair(fill, n);
 }
 
-/******** Function Definitions ********/
-/* queries the lookahead_map (should have been computed prior to routing) to get the expected cost
- * from the specified source to the specified target */
-float get_lookahead_map_cost(RRNodeId from_node, RRNodeId to_node, float criticality_fac) {
+std::pair<float, float> MapLookahead::get_src_opin_delays(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    int delta_x, delta_y;
-    get_xy_deltas(from_node, to_node, &delta_x, &delta_y);
-    delta_x = abs(delta_x);
-    delta_y = abs(delta_y);
-
-    float expected_cost = std::numeric_limits<float>::infinity();
-
     e_rr_type from_type = rr_graph.node_type(from_node);
     if (from_type == SOURCE || from_type == OPIN) {
         //When estimating costs from a SOURCE/OPIN we look-up to find which wire types (and the
-        //cost to reach them) in f_src_opin_reachable_wires. Once we know what wire types are
+        //cost to reach them) in f_src_opin_delays. Once we know what wire types are
         //reachable, we query the f_wire_cost_map (i.e. the wire lookahead) to get the final
         //delay to reach the sink.
 
@@ -313,7 +436,7 @@ float get_lookahead_map_cost(RRNodeId from_node, RRNodeId to_node, float critica
 
         auto from_ptc = rr_graph.node_ptc_num(from_node);
 
-        if (f_src_opin_reachable_wires[tile_index][from_ptc].empty()) {
+        if (f_src_opin_delays[tile_index][from_ptc].empty()) {
             //During lookahead profiling we were unable to find any wires which connected
             //to this PTC.
             //
@@ -324,1062 +447,1129 @@ float get_lookahead_map_cost(RRNodeId from_node, RRNodeId to_node, float critica
             //Such RR graphs are of course unroutable, but that should be determined by the
             //router. So just return an arbitrary value here rather than error.
 
-            //We choose to return the largest (non-infinite) value possible, but scaled
-            //down by a large factor to maintain some dynaimc range in case this value ends
-            //up being processed (e.g. by the timing analyzer).
-            //
-            //The cost estimate should still be *extremely* large compared to a typical delay, and
-            //so should ensure that the router de-prioritizes exploring this path, but does not
-            //forbid the router from trying.
-            expected_cost = std::numeric_limits<float>::max() / 1e12;
+            return std::pair<float, float>(0.f, 0.f);
         } else {
             //From the current SOURCE/OPIN we look-up the wiretypes which are reachable
             //and then add the estimates from those wire types for the distance of interest.
             //If there are multiple options we use the minimum value.
-            for (const auto& kv : f_src_opin_reachable_wires[tile_index][from_ptc]) {
+
+            float delay = 0;
+            float congestion = 0;
+            float expected_cost = std::numeric_limits<float>::infinity();
+
+            for (const auto& kv : f_src_opin_delays[tile_index][from_ptc]) {
                 const t_reachable_wire_inf& reachable_wire_inf = kv.second;
 
-                Cost_Entry wire_cost_entry;
+                util::Cost_Entry cost_entry;
                 if (reachable_wire_inf.wire_rr_type == SINK) {
                     //Some pins maybe reachable via a direct (OPIN -> IPIN) connection.
                     //In the lookahead, we treat such connections as 'special' wire types
                     //with no delay/congestion cost
-                    wire_cost_entry.delay = 0;
-                    wire_cost_entry.congestion = 0;
+                    cost_entry.delay = 0;
+                    cost_entry.congestion = 0;
                 } else {
                     //For an actual accessible wire, we query the wire look-up to get it's
                     //delay and congestion cost estimates
-                    wire_cost_entry = get_wire_cost_entry(reachable_wire_inf.wire_rr_type, reachable_wire_inf.wire_seg_index, delta_x, delta_y);
+                    cost_entry = cost_map_.find_cost(reachable_wire_inf.wire_seg_index, reachable_wire_inf.wire_rr_type, delta_x, delta_y);
                 }
 
-                float this_cost = (criticality_fac) * (reachable_wire_inf.congestion + wire_cost_entry.congestion)
-                                  + (1. - criticality_fac) * (reachable_wire_inf.delay + wire_cost_entry.delay);
+                float this_cost = (criticality_fac) * (reachable_wire_inf.delay + cost_entry.delay)
+                                  + (1. - criticality_fac) * (reachable_wire_inf.congestion + cost_entry.congestion);
 
-                expected_cost = std::min(expected_cost, this_cost);
+                if (this_cost < expected_cost) {
+                    delay = reachable_wire_inf.delay;
+                    congestion = reachable_wire_inf.congestion;
+                }
             }
-        }
-
-        VTR_ASSERT_SAFE_MSG(std::isfinite(expected_cost),
-                            vtr::string_fmt("Lookahead failed to estimate cost from %s: %s",
-                                            rr_node_arch_name(size_t(from_node)).c_str(),
-                                            describe_rr_node(size_t(from_node)).c_str())
-                                .c_str());
-
-    } else {
-        VTR_ASSERT_SAFE(from_type == CHANX || from_type == CHANY);
-        //When estimating costs from a wire, we directly look-up the result in the wire lookahead (f_wire_cost_map)
-
-        int from_cost_index = rr_graph.node_cost_index(from_node);
-        int from_seg_index = device_ctx.rr_indexed_data[from_cost_index].seg_index;
-
-        VTR_ASSERT(from_seg_index >= 0);
 
-        /* now get the expected cost from our lookahead map */
-        Cost_Entry cost_entry = get_wire_cost_entry(from_type, from_seg_index, delta_x, delta_y);
-
-        float expected_delay = cost_entry.delay;
-        float expected_congestion = cost_entry.congestion;
-
-        expected_cost = criticality_fac * expected_delay + (1.0 - criticality_fac) * expected_congestion;
+            return std::pair<float, float>(delay, congestion);
+        }
 
-        VTR_ASSERT_SAFE_MSG(std::isfinite(expected_cost),
+        VTR_ASSERT_SAFE_MSG(false,
                             vtr::string_fmt("Lookahead failed to estimate cost from %s: %s",
                                             rr_node_arch_name(size_t(from_node)).c_str(),
                                             describe_rr_node(size_t(from_node)).c_str())
                                 .c_str());
     }
 
-    return expected_cost;
+    return std::pair<float, float>(0.f, 0.f);
 }
 
-Cost_Entry get_wire_cost_entry(e_rr_type rr_type, int seg_index, int delta_x, int delta_y) {
-    VTR_ASSERT_SAFE(rr_type == CHANX || rr_type == CHANY);
-
-    int chan_index = 0;
-    if (rr_type == CHANY) {
-        chan_index = 1;
+// derive a cost from the map between two nodes
+float MapLookahead::get_map_cost(int from_node_ind,
+                                 int to_node_ind,
+                                 float criticality_fac) const {
+    if (from_node_ind == to_node_ind) {
+        return 0.f;
     }
 
-    VTR_ASSERT_SAFE(delta_x < (int)f_wire_cost_map.dim_size(2));
-    VTR_ASSERT_SAFE(delta_y < (int)f_wire_cost_map.dim_size(3));
-
-    return f_wire_cost_map[chan_index][seg_index][delta_x][delta_y];
-}
-
-/* Computes the lookahead map to be used by the router. If a map was computed prior to this, a new one will not be computed again.
- * The rr graph must have been built before calling this function. */
-void compute_router_lookahead(const std::vector<t_segment_inf>& segment_inf) {
-    vtr::ScopedStartFinishTimer timer("Computing router lookahead map");
-
-    //First compute the delay map when starting from the various wire types
-    //(CHANX/CHANY)in the routing architecture
-    compute_router_wire_lookahead(segment_inf);
-
-    //Next, compute which wire types are accessible (and the cost to reach them)
-    //from the different physical tile type's SOURCEs & OPINs
-    compute_router_src_opin_lookahead();
-}
-
-static void compute_router_wire_lookahead(const std::vector<t_segment_inf>& segment_inf) {
-    vtr::ScopedStartFinishTimer timer("Computing wire lookahead");
-
     auto& device_ctx = g_vpr_ctx.device();
-    auto& grid = device_ctx.grid;
     auto& rr_graph = device_ctx.rr_nodes;
 
-    //Re-allocate
-    f_wire_cost_map = t_wire_cost_map({2, segment_inf.size(), device_ctx.grid.width(), device_ctx.grid.height()});
+    auto from_node_type = device_ctx.rr_nodes[to_node_ind].type();
 
-    int longest_length = 0;
-    for (const auto& seg_inf : segment_inf) {
-        longest_length = std::max(longest_length, seg_inf.length);
-    }
+    RRNodeId to_node(to_node_ind);
+    RRNodeId from_node(from_node_ind);
 
-    //Start sampling at the lower left non-corner
-    int ref_x = 1;
-    int ref_y = 1;
-
-    //Sample from locations near the reference location (to capture maximum distance paths)
-    //Also sample from locations at least the longest wire length away from the edge (to avoid
-    //edge effects for shorter distances)
-    std::vector<int> ref_increments = {0, 1,
-                                       longest_length, longest_length + 1};
-
-    //Uniquify the increments (avoid sampling the same locations repeatedly if they happen to
-    //overlap)
-    std::sort(ref_increments.begin(), ref_increments.end());
-    ref_increments.erase(std::unique(ref_increments.begin(), ref_increments.end()), ref_increments.end());
-
-    //Upper right non-corner
-    int target_x = device_ctx.grid.width() - 2;
-    int target_y = device_ctx.grid.height() - 2;
-
-    //Profile each wire segment type
-    for (int iseg = 0; iseg < int(segment_inf.size()); iseg++) {
-        //First try to pick good representative sample locations for each type
-        std::map<t_rr_type, std::vector<RRNodeId>> sample_nodes;
-        for (e_rr_type chan_type : {CHANX, CHANY}) {
-            for (int ref_inc : ref_increments) {
-                int sample_x = ref_x + ref_inc;
-                int sample_y = ref_y + ref_inc;
-
-                if (sample_x >= int(grid.width())) continue;
-                if (sample_y >= int(grid.height())) continue;
-
-                for (int track_offset = 0; track_offset < MAX_TRACK_OFFSET; track_offset += 2) {
-                    /* get the rr node index from which to start routing */
-                    RRNodeId start_node = get_start_node(sample_x, sample_y,
-                                                         target_x, target_y, //non-corner upper right
-                                                         chan_type, iseg, track_offset);
-
-                    if (!start_node) {
-                        continue;
-                    }
+    int dx, dy;
+    get_xy_deltas(from_node, to_node, &dx, &dy);
 
-                    sample_nodes[chan_type].push_back(RRNodeId(start_node));
-                }
-            }
-        }
+    float extra_delay, extra_congestion;
+    std::tie(extra_delay, extra_congestion) = this->get_src_opin_delays(from_node, dx, dy, criticality_fac);
 
-        //If we failed to find any representative sample locations, search exhaustively
-        //
-        //This is to ensure we sample 'unusual' wire types which may not exist in all channels
-        //(e.g. clock routing)
-        for (e_rr_type chan_type : {CHANX, CHANY}) {
-            if (!sample_nodes[chan_type].empty()) continue;
+    int from_seg_index = cost_map_.node_to_segment(from_node_ind);
+    util::Cost_Entry cost_entry = cost_map_.find_cost(from_seg_index, from_node_type, dx, dy);
 
-            //Try an exhaustive search to find a suitable sample point
-            for (int inode = 0; inode < int(device_ctx.rr_nodes.size()); ++inode) {
-                auto rr_node = RRNodeId(inode);
-                auto rr_type = rr_graph.node_type(rr_node);
-                if (rr_type != chan_type) continue;
+    if (!cost_entry.valid()) {
+        // there is no route
+        VTR_LOGV_DEBUG(f_router_debug,
+                       "Not connected %d (%s, %d) -> %d (%s)\n",
+                       from_node_ind, device_ctx.rr_nodes[from_node_ind].type_string(), from_seg_index,
+                       to_node_ind, device_ctx.rr_nodes[to_node_ind].type_string());
+        return std::numeric_limits<float>::infinity();
+    }
 
-                int cost_index = rr_graph.node_cost_index(rr_node);
-                VTR_ASSERT(cost_index != OPEN);
+    auto to_tile_type = device_ctx.grid[rr_graph.node_xlow(to_node)][rr_graph.node_ylow(to_node)].type;
+    auto to_tile_index = to_tile_type->index;
 
-                int seg_index = device_ctx.rr_indexed_data[cost_index].seg_index;
+    auto to_ptc = rr_graph.node_ptc_num(to_node);
 
-                if (seg_index == iseg) {
-                    sample_nodes[chan_type].push_back(RRNodeId(inode));
-                }
+    float site_pin_delay = std::numeric_limits<float>::infinity();
+    if (f_chan_ipins_delays[to_tile_index].size() != 0) {
+        for (const auto& kv : f_chan_ipins_delays[to_tile_index][to_ptc]) {
+            const t_reachable_wire_inf& reachable_wire_inf = kv.second;
 
-                if (sample_nodes[chan_type].size() >= ref_increments.size()) {
-                    break;
-                }
-            }
+            float this_delay = reachable_wire_inf.delay;
+            site_pin_delay = std::min(this_delay, site_pin_delay);
         }
+    }
 
-        //Finally, now that we have a list of sample locations, run a Djikstra flood from
-        //each sample location to profile the routing network from this type
-
-        t_dijkstra_data dijkstra_data;
-        t_routing_cost_map routing_cost_map({device_ctx.grid.width(), device_ctx.grid.height()});
-
-        for (e_rr_type chan_type : {CHANX, CHANY}) {
-            if (sample_nodes[chan_type].empty()) {
-                VTR_LOG_WARN("Unable to find any sample location for segment %s type '%s' (length %d)\n",
-                             rr_node_typename[chan_type],
-                             segment_inf[iseg].name.c_str(),
-                             segment_inf[iseg].length);
-            } else {
-                //reset cost for this segment
-                routing_cost_map.fill(Expansion_Cost_Entry());
-
-                for (RRNodeId sample_node : sample_nodes[chan_type]) {
-                    int sample_x = rr_graph.node_xlow(sample_node);
-                    int sample_y = rr_graph.node_ylow(sample_node);
-
-                    if (rr_graph.node_direction(sample_node) == DEC_DIRECTION) {
-                        sample_x = rr_graph.node_xhigh(sample_node);
-                        sample_y = rr_graph.node_yhigh(sample_node);
-                    }
-
-                    run_dijkstra(sample_node,
-                                 sample_x,
-                                 sample_y,
-                                 routing_cost_map,
-                                 &dijkstra_data);
-                }
-
-                if (false) print_router_cost_map(routing_cost_map);
-
-                /* boil down the cost list in routing_cost_map at each coordinate to a representative cost entry and store it in the lookahead
-                 * cost map */
-                set_lookahead_map_costs(iseg, chan_type, routing_cost_map);
-
-                /* fill in missing entries in the lookahead cost map by copying the closest cost entries (cost map was computed based on
-                 * a reference coordinate > (0,0) so some entries that represent a cross-chip distance have not been computed) */
-                fill_in_missing_lookahead_entries(iseg, chan_type);
-            }
-        }
+    float expected_delay = cost_entry.delay + extra_delay;
+    float expected_congestion = cost_entry.congestion + extra_congestion;
+
+    expected_delay += site_pin_delay;
+
+    float expected_cost = criticality_fac * expected_delay + (1.0 - criticality_fac) * expected_congestion;
+
+    VTR_LOGV_DEBUG(f_router_debug, "Requested lookahead from node \n\t%s to \n\t%s\n",
+                   describe_rr_node(from_node_ind).c_str(),
+                   describe_rr_node(to_node_ind).c_str());
+    const std::string& segment_name = device_ctx.segment_inf[from_seg_index].name;
+    VTR_LOGV_DEBUG(f_router_debug, "Lookahead returned %s (%d) with distance (%zd, %zd)\n",
+                   segment_name.c_str(), from_seg_index,
+                   dx, dy);
+    VTR_LOGV_DEBUG(f_router_debug, "Lookahead delay: %g\n", expected_delay);
+    VTR_LOGV_DEBUG(f_router_debug, "Lookahead congestion: %g\n", expected_congestion);
+    VTR_LOGV_DEBUG(f_router_debug, "Criticality: %g\n", criticality_fac);
+    VTR_LOGV_DEBUG(f_router_debug, "Lookahead cost: %g\n", expected_cost);
+    VTR_LOGV_DEBUG(f_router_debug, "Site pin delay: %g\n", site_pin_delay);
+
+    /* XXX: temporarily disable this for further debugging as it appears that some costs are set to infinity
+    if (!std::isfinite(expected_cost) {
+        VTR_LOG_ERROR("infinite cost for segment %d at (%d, %d)\n", from_seg_index, (int)dx, (int)dy);
+        VTR_ASSERT(0);
+    } */
+
+    if (expected_cost < 0.f) {
+        VTR_LOG_ERROR("negative cost for segment %d at (%d, %d)\n", from_seg_index, (int)dx, (int)dy);
+        VTR_ASSERT(0);
     }
 
-    if (false) print_wire_cost_map(segment_inf);
+    return expected_cost;
 }
 
-static void compute_router_src_opin_lookahead() {
-    vtr::ScopedStartFinishTimer timer("Computing src/opin lookahead");
+// add a best cost routing path from start_node_ind to node_ind to routing costs
+template<typename Entry>
+static bool add_paths(int start_node_ind,
+                      Entry current,
+                      const std::vector<util::Search_Path>& paths,
+                      RoutingCosts* routing_costs) {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    f_src_opin_reachable_wires.clear();
+    int node_ind = current.rr_node_ind;
+    RRNodeId node(node_ind);
 
-    f_src_opin_reachable_wires.resize(device_ctx.physical_tile_types.size());
+    bool new_sample_found = false;
 
-    std::vector<int> rr_nodes_at_loc;
+    auto to_tile_type = device_ctx.grid[rr_graph.node_xlow(node)][rr_graph.node_ylow(node)].type;
+    auto to_tile_index = to_tile_type->index;
 
-    //We assume that the routing connectivity of each instance of a physical tile is the same,
-    //and so only measure one instance of each type
-    for (size_t itile = 0; itile < device_ctx.physical_tile_types.size(); ++itile) {
-        for (e_rr_type rr_type : {SOURCE, OPIN}) {
-            vtr::Point<int> sample_loc(-1, -1);
+    auto to_ptc = rr_graph.node_ptc_num(node);
 
-            size_t num_sampled_locs = 0;
-            bool ptcs_with_no_reachable_wires = true;
-            while (ptcs_with_no_reachable_wires) { //Haven't found wire connected to ptc
-                ptcs_with_no_reachable_wires = false;
+    float site_pin_delay = std::numeric_limits<float>::infinity();
+    if (f_chan_ipins_delays[to_tile_index].size() != 0) {
+        for (const auto& kv : f_chan_ipins_delays[to_tile_index][to_ptc]) {
+            const t_reachable_wire_inf& reachable_wire_inf = kv.second;
 
-                sample_loc = pick_sample_tile(&device_ctx.physical_tile_types[itile], sample_loc);
+            float this_delay = reachable_wire_inf.delay;
+            site_pin_delay = std::min(this_delay, site_pin_delay);
+        }
+    }
 
-                if (sample_loc.x() == -1 && sample_loc.y() == -1) {
-                    //No untried instances of the current tile type left
-                    VTR_LOG_WARN("Found no %ssample locations for %s in %s\n",
-                                 (num_sampled_locs == 0) ? "" : "more ",
-                                 rr_node_typename[rr_type],
-                                 device_ctx.physical_tile_types[itile].name);
-                    break;
-                }
+    // reconstruct the path
+    std::vector<int> path;
+    for (int i = paths[node_ind].parent; i != start_node_ind; i = paths[i].parent) {
+        VTR_ASSERT(i != -1);
+        path.push_back(i);
+    }
+    path.push_back(start_node_ind);
 
-                //VTR_LOG("Sampling %s at (%d,%d)\n", device_ctx.physical_tile_types[itile].name, sample_loc.x(), sample_loc.y());
+    current.adjust_Tsw(-site_pin_delay);
 
-                rr_nodes_at_loc.clear();
+    // add each node along the path subtracting the incremental costs from the current costs
+    Entry start_to_here(start_node_ind, UNDEFINED, nullptr);
+    int parent = start_node_ind;
+    for (auto it = path.rbegin(); it != path.rend(); it++) {
+        RRNodeId this_node(*it);
+        auto& here = device_ctx.rr_nodes[*it];
+        int seg_index = device_ctx.rr_indexed_data[here.cost_index()].seg_index;
 
-                get_rr_node_indices(device_ctx.rr_node_indices, sample_loc.x(), sample_loc.y(), rr_type, &rr_nodes_at_loc);
-                for (int inode : rr_nodes_at_loc) {
-                    if (inode < 0) continue;
+        auto chan_type = rr_graph.node_type(node);
 
-                    RRNodeId node_id(inode);
+        int ichan = 0;
+        if (chan_type == CHANY) {
+            ichan = 1;
+        }
 
-                    int ptc = rr_graph.node_ptc_num(node_id);
+        int delta_x, delta_y;
+        get_xy_deltas(this_node, node, &delta_x, &delta_y);
 
-                    if (ptc >= int(f_src_opin_reachable_wires[itile].size())) {
-                        f_src_opin_reachable_wires[itile].resize(ptc + 1); //Inefficient but functional...
-                    }
+        vtr::Point<int> delta(delta_x, delta_y);
 
-                    //Find the wire types which are reachable from inode and record them and
-                    //the cost to reach them
-                    dijkstra_flood_to_wires(itile, node_id, f_src_opin_reachable_wires);
+        RoutingCostKey key = {
+            ichan,
+            seg_index,
+            delta};
 
-                    if (f_src_opin_reachable_wires[itile][ptc].empty()) {
-                        VTR_LOG_WARN("Found no reachable wires from %s (%s) at (%d,%d)\n",
-                                     rr_node_typename[rr_type],
-                                     rr_node_arch_name(inode).c_str(),
-                                     sample_loc.x(),
-                                     sample_loc.y());
+        if (*it != start_node_ind) {
+            auto& parent_node = device_ctx.rr_nodes[parent];
+            start_to_here = Entry(*it, parent_node.edge_switch(paths[*it].edge), &start_to_here);
+            parent = *it;
+        }
 
-                        ptcs_with_no_reachable_wires = true;
-                    }
-                }
+        float cost = current.cost() - start_to_here.cost();
+        if (cost < 0.f && cost > -10e-15 /* 10 femtosecond */) {
+            cost = 0.f;
+        }
 
-                ++num_sampled_locs;
-            }
-            if (ptcs_with_no_reachable_wires) {
-                VPR_ERROR(VPR_ERROR_ROUTE, "Some SOURCE/OPINs have no reachable wires\n");
+        VTR_ASSERT(cost >= 0.f);
+
+        // NOTE: implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST
+        auto result = routing_costs->insert(std::make_pair(key, cost));
+        if (!result.second) {
+            if (cost < result.first->second) {
+                result.first->second = cost;
+                new_sample_found = true;
+            } else if (BREAK_ON_MISS) {
+                break;
             }
+        } else {
+            new_sample_found = true;
         }
     }
+    return new_sample_found;
 }
 
-static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point<int> prev) {
-    //Very simple for now, just pick the fist matching tile found
-    vtr::Point<int> loc(OPEN, OPEN);
-
-    //VTR_LOG("Prev: %d,%d\n", prev.x(), prev.y());
-
+/* returns the absolute delta_x and delta_y offset required to reach to_node from from_node */
+static void get_xy_deltas(const RRNodeId from_node, const RRNodeId to_node, int* delta_x, int* delta_y) {
     auto& device_ctx = g_vpr_ctx.device();
-    auto& grid = device_ctx.grid;
-
-    int y_init = prev.y() + 1; //Start searching next element above prev
-
-    for (int x = prev.x(); x < int(grid.width()); ++x) {
-        if (x < 0) continue;
+    auto& rr_graph = device_ctx.rr_nodes;
 
-        //VTR_LOG("  x: %d\n", x);
+    e_rr_type from_type = rr_graph.node_type(from_node);
+    e_rr_type to_type = rr_graph.node_type(to_node);
 
-        for (int y = y_init; y < int(grid.height()); ++y) {
-            if (y < 0) continue;
+    if (!is_chan(from_type) && !is_chan(to_type)) {
+        //Alternate formulation for non-channel types
+        int from_x = 0;
+        int from_y = 0;
+        adjust_rr_position(from_node, from_x, from_y);
 
-            //VTR_LOG("   y: %d\n", y);
-            if (grid[x][y].type == tile_type) {
-                loc.set_x(x);
-                loc.set_y(y);
-                break;
-            }
+        int to_x = 0;
+        int to_y = 0;
+        adjust_rr_position(to_node, to_x, to_y);
+
+        *delta_x = to_x - from_x;
+        *delta_y = to_y - from_y;
+    } else {
+        //Traditional formulation
+
+        /* get chan/seg coordinates of the from/to nodes. seg coordinate is along the wire,
+         * chan coordinate is orthogonal to the wire */
+        int from_seg_low;
+        int from_seg_high;
+        int from_chan;
+        int to_seg;
+        int to_chan;
+        if (from_type == CHANY) {
+            from_seg_low = rr_graph.node_ylow(from_node);
+            from_seg_high = rr_graph.node_yhigh(from_node);
+            from_chan = rr_graph.node_xlow(from_node);
+            to_seg = rr_graph.node_ylow(to_node);
+            to_chan = rr_graph.node_xlow(to_node);
+        } else {
+            from_seg_low = rr_graph.node_xlow(from_node);
+            from_seg_high = rr_graph.node_xhigh(from_node);
+            from_chan = rr_graph.node_ylow(from_node);
+            to_seg = rr_graph.node_xlow(to_node);
+            to_chan = rr_graph.node_ylow(to_node);
         }
 
-        if (loc.x() != OPEN && loc.y() != OPEN) {
-            break;
+        /* now we want to count the minimum number of *channel segments* between the from and to nodes */
+        int delta_seg, delta_chan;
+
+        /* orthogonal to wire */
+        int no_need_to_pass_by_clb = 0; //if we need orthogonal wires then we don't need to pass by the target CLB along the current wire direction
+        if (to_chan > from_chan + 1) {
+            /* above */
+            delta_chan = to_chan - from_chan;
+            no_need_to_pass_by_clb = 1;
+        } else if (to_chan < from_chan) {
+            /* below */
+            delta_chan = from_chan - to_chan + 1;
+            no_need_to_pass_by_clb = 1;
         } else {
-            y_init = 0; //Prepare to search next column
+            /* adjacent to current channel */
+            delta_chan = 0;
+            no_need_to_pass_by_clb = 0;
+        }
+
+        /* along same direction as wire. */
+        if (to_seg > from_seg_high) {
+            /* ahead */
+            delta_seg = to_seg - from_seg_high - no_need_to_pass_by_clb;
+        } else if (to_seg < from_seg_low) {
+            /* behind */
+            delta_seg = from_seg_low - to_seg - no_need_to_pass_by_clb;
+        } else {
+            /* along the span of the wire */
+            delta_seg = 0;
+        }
+
+        /* account for wire direction. lookahead map was computed by looking up and to the right starting at INC wires. for targets
+         * that are opposite of the wire direction, let's add 1 to delta_seg */
+        e_direction from_dir = rr_graph.node_direction(from_node);
+        if (is_chan(from_type)
+            && ((to_seg < from_seg_low && from_dir == INC_DIRECTION) || (to_seg > from_seg_high && from_dir == DEC_DIRECTION))) {
+            delta_seg++;
+        }
+
+        if (from_type == CHANY) {
+            *delta_x = delta_chan;
+            *delta_y = delta_seg;
+        } else {
+            *delta_x = delta_seg;
+            *delta_y = delta_chan;
         }
     }
-    //VTR_LOG("Next: %d,%d\n", loc.x(), loc.y());
 
-    return loc;
+    VTR_ASSERT_SAFE(std::abs(*delta_x) < (int)device_ctx.grid.width());
+    VTR_ASSERT_SAFE(std::abs(*delta_y) < (int)device_ctx.grid.height());
 }
 
-void dijkstra_flood_to_wires(int itile, RRNodeId node, t_src_opin_reachable_wires& src_opin_reachable_wires) {
+static void adjust_rr_position(const RRNodeId rr, int& x, int& y) {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    struct t_pq_entry {
-        float delay;
-        float congestion;
-        RRNodeId node;
-
-        bool operator<(const t_pq_entry& rhs) const {
-            return this->delay < rhs.delay;
-        }
-    };
-
-    std::priority_queue<t_pq_entry> pq;
-
-    t_pq_entry root;
-    root.congestion = 0.;
-    root.delay = 0.;
-    root.node = node;
+    e_rr_type rr_type = rr_graph.node_type(rr);
 
-    int ptc = rr_graph.node_ptc_num(node);
+    if (is_chan(rr_type)) {
+        adjust_rr_wire_position(rr, x, y);
+    } else if (is_pin(rr_type)) {
+        adjust_rr_pin_position(rr, x, y);
+    } else {
+        VTR_ASSERT_SAFE(is_src_sink(rr_type));
+        adjust_rr_src_sink_position(rr, x, y);
+    }
+}
 
+static void adjust_rr_pin_position(const RRNodeId rr, int& x, int& y) {
     /*
-     * Perform Djikstra from the SOURCE/OPIN of interest, stopping at the the first
-     * reachable wires (i.e until we hit the inter-block routing network), or a SINK
-     * (via a direct-connect).
+     * VPR uses a co-ordinate system where wires above and to the right of a block
+     * are at the same location as the block:
      *
-     * Note that typical RR graphs are structured :
      *
-     *      SOURCE ---> OPIN --> CHANX/CHANY
-     *              |
-     *              --> OPIN --> CHANX/CHANY
-     *              |
-     *             ...
+     *       <-----------C
+     *    D
+     *    |  +---------+  ^
+     *    |  |         |  |
+     *    |  |  (1,1)  |  |
+     *    |  |         |  |
+     *    V  +---------+  |
+     *                    A
+     *     B----------->
      *
-     *   possibly with direct-connects of the form:
+     * So wires are located as follows:
      *
-     *      SOURCE --> OPIN --> IPIN --> SINK
+     *      A: (1, 1) CHANY
+     *      B: (1, 0) CHANX
+     *      C: (1, 1) CHANX
+     *      D: (0, 1) CHANY
      *
-     * and there is a small number of fixed hops from SOURCE/OPIN to CHANX/CHANY or
-     * to a SINK (via a direct-connect), so this runs very fast (i.e. O(1))
+     * But all pins incident on the surrounding channels
+     * would be at (1,1) along with a relevant side.
+     *
+     * In the following, we adjust the positions of pins to
+     * account for the channel they are incident too.
+     *
+     * Note that blocks at (0,*) such as IOs, may have (unconnected)
+     * pins on the left side, so we also clip the minimum x to zero.
+     * Similarly for blocks at (*,0) we clip the minimum y to zero.
      */
-    pq.push(root);
-    while (!pq.empty()) {
-        t_pq_entry curr = pq.top();
-        pq.pop();
-
-        e_rr_type curr_rr_type = rr_graph.node_type(curr.node);
-        if (curr_rr_type == CHANX || curr_rr_type == CHANY || curr_rr_type == SINK) {
-            //We stop expansion at any CHANX/CHANY/SINK
-            int seg_index;
-            if (curr_rr_type != SINK) {
-                //It's a wire, figure out its type
-                int cost_index = rr_graph.node_cost_index(curr.node);
-                seg_index = device_ctx.rr_indexed_data[cost_index].seg_index;
-            } else {
-                //This is a direct-connect path between an IPIN and OPIN,
-                //which terminated at a SINK.
-                //
-                //We treat this as a 'special' wire type
-                seg_index = DIRECT_CONNECT_SPECIAL_SEG_TYPE;
-            }
-
-            //Keep costs of the best path to reach each wire type
-            if (!src_opin_reachable_wires[itile][ptc].count(seg_index)
-                || curr.delay < src_opin_reachable_wires[itile][ptc][seg_index].delay) {
-                src_opin_reachable_wires[itile][ptc][seg_index].wire_rr_type = curr_rr_type;
-                src_opin_reachable_wires[itile][ptc][seg_index].wire_seg_index = seg_index;
-                src_opin_reachable_wires[itile][ptc][seg_index].delay = curr.delay;
-                src_opin_reachable_wires[itile][ptc][seg_index].congestion = curr.congestion;
-            }
-
-        } else if (curr_rr_type == SOURCE || curr_rr_type == OPIN || curr_rr_type == IPIN) {
-            //We allow expansion through SOURCE/OPIN/IPIN types
-            int cost_index = rr_graph.node_cost_index(curr.node);
-            float incr_cong = device_ctx.rr_indexed_data[cost_index].base_cost; //Current nodes congestion cost
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
 
-            for (RREdgeId edge : rr_graph.edge_range(curr.node)) {
-                int iswitch = rr_graph.edge_switch(edge);
-                float incr_delay = device_ctx.rr_switch_inf[iswitch].Tdel;
+    VTR_ASSERT_SAFE(is_pin(rr_graph.node_type(rr)));
+    VTR_ASSERT_SAFE(rr_graph.node_xlow(rr) == rr_graph.node_xhigh(rr));
+    VTR_ASSERT_SAFE(rr_graph.node_ylow(rr) == rr_graph.node_yhigh(rr));
 
-                RRNodeId next_node = rr_graph.edge_sink_node(edge);
+    x = rr_graph.node_xlow(rr);
+    y = rr_graph.node_ylow(rr);
 
-                t_pq_entry next;
-                next.congestion = curr.congestion + incr_cong; //Of current node
-                next.delay = curr.delay + incr_delay;          //To reach next node
-                next.node = next_node;
+    e_side rr_side = rr_graph.node_side(rr);
 
-                pq.push(next);
-            }
-        } else {
-            VPR_ERROR(VPR_ERROR_ROUTE, "Unrecognized RR type");
-        }
+    if (rr_side == LEFT) {
+        x -= 1;
+        x = std::max(x, 0);
+    } else if (rr_side == BOTTOM && y > 0) {
+        y -= 1;
+        y = std::max(y, 0);
     }
 }
 
-/* returns index of a node from which to start routing */
-static RRNodeId get_start_node(int start_x, int start_y, int target_x, int target_y, t_rr_type rr_type, int seg_index, int track_offset) {
+static void adjust_rr_wire_position(const RRNodeId rr, int& x, int& y) {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    int result = UNDEFINED;
-
-    if (rr_type != CHANX && rr_type != CHANY) {
-        VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Must start lookahead routing from CHANX or CHANY node\n");
-    }
-
-    /* determine which direction the wire should go in based on the start & target coordinates */
-    e_direction direction = INC_DIRECTION;
-    if ((rr_type == CHANX && target_x < start_x) || (rr_type == CHANY && target_y < start_y)) {
-        direction = DEC_DIRECTION;
-    }
-
-    int start_lookup_x = start_x;
-    int start_lookup_y = start_y;
-    if (rr_type == CHANX) {
-        //Bizarely, rr_node_indices stores CHANX with swapped x/y...
-        std::swap(start_lookup_x, start_lookup_y);
-    }
-
-    const std::vector<int>& channel_node_list = device_ctx.rr_node_indices[rr_type][start_lookup_x][start_lookup_y][0];
-
-    /* find first node in channel that has specified segment index and goes in the desired direction */
-    for (unsigned itrack = 0; itrack < channel_node_list.size(); itrack++) {
-        int node_ind = channel_node_list[itrack];
-        if (node_ind < 0) continue;
-
-        RRNodeId node_id(node_ind);
-
-        VTR_ASSERT(rr_graph.node_type(node_id) == rr_type);
+    VTR_ASSERT_SAFE(is_chan(rr_graph.node_type(rr)));
 
-        e_direction node_direction = rr_graph.node_direction(node_id);
-        int node_cost_ind = rr_graph.node_cost_index(node_id);
-        int node_seg_ind = device_ctx.rr_indexed_data[node_cost_ind].seg_index;
+    e_direction rr_dir = rr_graph.node_direction(rr);
 
-        if ((node_direction == direction || node_direction == BI_DIRECTION) && node_seg_ind == seg_index) {
-            /* found first track that has the specified segment index and goes in the desired direction */
-            result = node_ind;
-            if (track_offset == 0) {
-                break;
-            }
-            track_offset -= 2;
-        }
+    if (rr_dir == DEC_DIRECTION) {
+        x = rr_graph.node_xhigh(rr);
+        y = rr_graph.node_yhigh(rr);
+    } else if (rr_dir == INC_DIRECTION) {
+        x = rr_graph.node_xlow(rr);
+        y = rr_graph.node_ylow(rr);
+    } else {
+        VTR_ASSERT_SAFE(rr_dir == BI_DIRECTION);
+        //Not sure what to do here...
+        //Try average for now.
+        x = vtr::nint((rr_graph.node_xlow(rr) + rr_graph.node_xhigh(rr)) / 2.);
+        y = vtr::nint((rr_graph.node_ylow(rr) + rr_graph.node_yhigh(rr)) / 2.);
     }
-
-    return RRNodeId(result);
 }
 
-/* runs Dijkstra's algorithm from specified node until all nodes have been visited. Each time a pin is visited, the delay/congestion information
- * to that pin is stored is added to an entry in the routing_cost_map */
-static void run_dijkstra(RRNodeId start_node, int start_x, int start_y, t_routing_cost_map& routing_cost_map, t_dijkstra_data* data) {
+static void adjust_rr_src_sink_position(const RRNodeId rr, int& x, int& y) {
+    //SOURCE/SINK nodes assume the full dimensions of their
+    //associated block
+    //
+    //Use the average position.
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    auto& node_expanded = data->node_expanded;
-    node_expanded.resize(device_ctx.rr_nodes.size());
-    std::fill(node_expanded.begin(), node_expanded.end(), false);
+    VTR_ASSERT_SAFE(is_src_sink(rr_graph.node_type(rr)));
 
-    auto& node_visited_costs = data->node_visited_costs;
-    node_visited_costs.resize(device_ctx.rr_nodes.size());
-    std::fill(node_visited_costs.begin(), node_visited_costs.end(), -1.0);
+    x = vtr::nint((rr_graph.node_xlow(rr) + rr_graph.node_xhigh(rr)) / 2.);
+    y = vtr::nint((rr_graph.node_ylow(rr) + rr_graph.node_yhigh(rr)) / 2.);
+}
 
+/* Runs Dijkstra's algorithm from specified node until all nodes have been
+ * visited. Each time a pin is visited, the delay/congestion information
+ * to that pin is stored to an entry in the routing_cost_map.
+ *
+ * Returns the maximum (last) minimum cost path stored, and
+ * the number of paths from start_node_ind stored. */
+template<typename Entry>
+static std::pair<float, int> run_dijkstra(int start_node_ind,
+                                          std::vector<bool>* node_expanded,
+                                          std::vector<util::Search_Path>* paths,
+                                          RoutingCosts* routing_costs) {
+    auto& device_ctx = g_vpr_ctx.device();
+    int path_count = 0;
+
+    /* a list of boolean flags (one for each rr node) to figure out if a
+     * certain node has already been expanded */
+    std::fill(node_expanded->begin(), node_expanded->end(), false);
+    /* For each node keep a list of the cost with which that node has been
+     * visited (used to determine whether to push a candidate node onto the
+     * expansion queue.
+     * Also store the parent node so we can reconstruct a specific path. */
+    std::fill(paths->begin(), paths->end(), util::Search_Path{std::numeric_limits<float>::infinity(), -1, -1});
     /* a priority queue for expansion */
-    std::priority_queue<PQ_Entry>& pq = data->pq;
-
-    //Clear priority queue if non-empty
-    while (!pq.empty()) {
-        pq.pop();
-    }
+    std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> pq;
 
     /* first entry has no upstream delay or congestion */
-    PQ_Entry first_entry(start_node, UNDEFINED, 0, 0, 0, true);
+    Entry first_entry(start_node_ind, UNDEFINED, nullptr);
+    float max_cost = first_entry.cost();
 
     pq.push(first_entry);
 
     /* now do routing */
     while (!pq.empty()) {
-        PQ_Entry current = pq.top();
+        auto current = pq.top();
         pq.pop();
 
-        RRNodeId curr_node = current.rr_node;
+        int node_ind = current.rr_node_ind;
 
         /* check that we haven't already expanded from this node */
-        if (node_expanded[curr_node]) {
+        if ((*node_expanded)[node_ind]) {
             continue;
         }
 
-        //VTR_LOG("Expanding with delay=%10.3g cong=%10.3g (%s)\n", current.delay, current.congestion_upstream, describe_rr_node(curr_node).c_str());
-
         /* if this node is an ipin record its congestion/delay in the routing_cost_map */
-        if (rr_graph.node_type(curr_node) == IPIN) {
-            int ipin_x = rr_graph.node_xlow(curr_node);
-            int ipin_y = rr_graph.node_ylow(curr_node);
-
-            if (ipin_x >= start_x && ipin_y >= start_y) {
-                int delta_x, delta_y;
-                get_xy_deltas(start_node, curr_node, &delta_x, &delta_y);
-                delta_x = std::abs(delta_x);
-                delta_y = std::abs(delta_y);
+        if (device_ctx.rr_nodes[node_ind].type() == IPIN) {
+            // the last cost should be the highest
+            max_cost = current.cost();
 
-                routing_cost_map[delta_x][delta_y].add_cost_entry(current.delay, current.congestion_upstream);
-            }
+            path_count++;
+            add_paths<Entry>(start_node_ind, current, *paths, routing_costs);
+        } else {
+            util::expand_dijkstra_neighbours(device_ctx.rr_nodes,
+                                             current, paths, node_expanded, &pq);
+            (*node_expanded)[node_ind] = true;
         }
-
-        expand_dijkstra_neighbours(current, node_visited_costs, node_expanded, pq);
-        node_expanded[curr_node] = true;
     }
+    return std::make_pair(max_cost, path_count);
 }
 
-/* iterates over the children of the specified node and selectively pushes them onto the priority queue */
-static void expand_dijkstra_neighbours(PQ_Entry parent_entry, vtr::vector<RRNodeId, float>& node_visited_costs, vtr::vector<RRNodeId, bool>& node_expanded, std::priority_queue<PQ_Entry>& pq) {
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& rr_graph = device_ctx.rr_nodes;
+// compute the cost maps for lookahead
+void MapLookahead::compute(const std::vector<t_segment_inf>& segment_inf) {
+    compute_router_src_opin_lookahead();
+    compute_router_chan_ipin_lookahead();
 
-    RRNodeId parent = parent_entry.rr_node;
+    vtr::ScopedStartFinishTimer timer("Computing connection box lookahead map");
 
-    for (RREdgeId edge : rr_graph.edge_range(parent)) {
-        RRNodeId child_node = rr_graph.edge_sink_node(edge);
-        int switch_ind = rr_graph.edge_switch(edge);
+    // Initialize rr_node_route_inf if not already
+    alloc_and_load_rr_node_route_structs();
 
-        if (rr_graph.node_type(child_node) == SINK) return;
+    size_t num_segments = segment_inf.size();
+    std::vector<SampleRegion> sample_regions = find_sample_regions(num_segments);
 
-        /* skip this child if it has already been expanded from */
-        if (node_expanded[child_node]) {
-            continue;
-        }
+    /* free previous delay map and allocate new one */
+    auto& device_ctx = g_vpr_ctx.device();
+    cost_map_.set_counts(segment_inf.size());
+    cost_map_.build_segment_map();
+
+    VTR_ASSERT(REPRESENTATIVE_ENTRY_METHOD == util::SMALLEST);
+    RoutingCosts all_delay_costs;
+    RoutingCosts all_base_costs;
+
+    /* run Dijkstra's algorithm for each segment type & channel type combination */
+#if defined(VPR_USE_TBB)
+    tbb::mutex all_costs_mutex;
+    tbb::parallel_for_each(sample_regions, [&](const SampleRegion& region) {
+#else
+    for (const auto& region : sample_regions) {
+#endif
+        // holds the cost entries for a run
+        RoutingCosts delay_costs;
+        RoutingCosts base_costs;
+        int total_path_count = 0;
+        std::vector<bool> node_expanded(device_ctx.rr_nodes.size());
+        std::vector<util::Search_Path> paths(device_ctx.rr_nodes.size());
+
+        for (auto& point : region.points) {
+            // statistics
+            vtr::Timer run_timer;
+            float max_delay_cost = 0.f;
+            float max_base_cost = 0.f;
+            int path_count = 0;
+            for (auto node_ind : point.nodes) {
+                {
+                    auto result = run_dijkstra<util::PQ_Entry_Delay>(node_ind, &node_expanded, &paths, &delay_costs);
+                    max_delay_cost = std::max(max_delay_cost, result.first);
+                    path_count += result.second;
+                }
+                {
+                    auto result = run_dijkstra<util::PQ_Entry_Base_Cost>(node_ind, &node_expanded, &paths, &base_costs);
+                    max_base_cost = std::max(max_base_cost, result.first);
+                    path_count += result.second;
+                }
+            }
 
-        PQ_Entry child_entry(child_node, switch_ind, parent_entry.delay,
-                             parent_entry.R_upstream, parent_entry.congestion_upstream, false);
+            if (path_count > 0) {
+                VTR_LOG("Expanded %d paths of segment type %s(%d) starting at (%d, %d) from %d segments, max_cost %e %e (%g paths/sec)\n",
+                        path_count, segment_inf[region.segment_type].name.c_str(), region.segment_type,
+                        point.location.x(), point.location.y(),
+                        (int)point.nodes.size(),
+                        max_delay_cost, max_base_cost,
+                        path_count / run_timer.elapsed_sec());
+            }
 
-        //VTR_ASSERT(child_entry.cost >= 0); //Asertion fails in practise. TODO: debug
+            /*
+             * if (path_count == 0) {
+             * for (auto node_ind : point.nodes) {
+             * VTR_LOG("Expanded node %s\n", describe_rr_node(node_ind).c_str());
+             * }
+             * }
+             */
 
-        /* skip this child if it has been visited with smaller cost */
-        if (node_visited_costs[child_node] >= 0 && node_visited_costs[child_node] < child_entry.cost) {
-            continue;
+            total_path_count += path_count;
+            if (total_path_count > MIN_PATH_COUNT) {
+                break;
+            }
         }
 
-        /* finally, record the cost with which the child was visited and put the child entry on the queue */
-        node_visited_costs[child_node] = child_entry.cost;
-        pq.push(child_entry);
-    }
-}
-
-/* sets the lookahead cost map entries based on representative cost entries from routing_cost_map */
-static void set_lookahead_map_costs(int segment_index, e_rr_type chan_type, t_routing_cost_map& routing_cost_map) {
-    int chan_index = 0;
-    if (chan_type == CHANY) {
-        chan_index = 1;
-    }
+#if defined(VPR_USE_TBB)
+        all_costs_mutex.lock();
+#endif
 
-    /* set the lookahead cost map entries with a representative cost entry from routing_cost_map */
-    for (unsigned ix = 0; ix < routing_cost_map.dim_size(0); ix++) {
-        for (unsigned iy = 0; iy < routing_cost_map.dim_size(1); iy++) {
-            Expansion_Cost_Entry& expansion_cost_entry = routing_cost_map[ix][iy];
+        if (total_path_count == 0) {
+            VTR_LOG_WARN("No paths found for sample region %s(%d, %d)\n",
+                         segment_inf[region.segment_type].name.c_str(), region.grid_location.x(), region.grid_location.y());
+        }
 
-            f_wire_cost_map[chan_index][segment_index][ix][iy] = expansion_cost_entry.get_representative_cost_entry(REPRESENTATIVE_ENTRY_METHOD);
+        // combine the cost map from this run with the final cost maps for each segment
+        for (const auto& cost : delay_costs) {
+            const auto& val = cost.second;
+            auto result = all_delay_costs.insert(std::make_pair(cost.first, val));
+            if (!result.second) {
+                // implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST
+                result.first->second = std::min(result.first->second, val);
+            }
+        }
+        for (const auto& cost : base_costs) {
+            const auto& val = cost.second;
+            auto result = all_base_costs.insert(std::make_pair(cost.first, val));
+            if (!result.second) {
+                // implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST
+                result.first->second = std::min(result.first->second, val);
+            }
         }
-    }
-}
 
-/* fills in missing lookahead map entries by copying the cost of the closest valid entry */
-static void fill_in_missing_lookahead_entries(int segment_index, e_rr_type chan_type) {
-    int chan_index = 0;
-    if (chan_type == CHANY) {
-        chan_index = 1;
+#if defined(VPR_USE_TBB)
+        all_costs_mutex.unlock();
+    });
+#else
     }
+#endif
 
-    auto& device_ctx = g_vpr_ctx.device();
+    VTR_LOG("Combining results\n");
+    /* boil down the cost list in routing_cost_map at each coordinate to a
+     * representative cost entry and store it in the lookahead cost map */
+    cost_map_.set_cost_map(all_delay_costs, all_base_costs);
+
+// diagnostics
+#if defined(CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_ENTRIES)
+    for (auto& cost : all_costs) {
+        const auto& key = cost.first;
+        const auto& val = cost.second;
+        VTR_LOG("%d -> %d (%d, %d): %g, %g\n",
+                val.from_node, val.to_node,
+                key.delta.x(), key.delta.y(),
+                val.cost_entry.delay, val.cost_entry.congestion);
+    }
+#endif
 
-    /* find missing cost entries and fill them in by copying a nearby cost entry */
-    for (unsigned ix = 0; ix < device_ctx.grid.width(); ix++) {
-        for (unsigned iy = 0; iy < device_ctx.grid.height(); iy++) {
-            Cost_Entry cost_entry = f_wire_cost_map[chan_index][segment_index][ix][iy];
+#if defined(CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_MAPS)
+    for (int iseg = 0; iseg < (ssize_t)num_segments; iseg++) {
+        VTR_LOG("cost map for %s(%d)\n",
+                segment_inf[iseg].name.c_str(), iseg);
+        cost_map_.print(iseg);
+    }
+#endif
 
-            if (std::isnan(cost_entry.delay) && std::isnan(cost_entry.congestion)) {
-                Cost_Entry copied_entry = get_nearby_cost_entry(ix, iy, segment_index, chan_index);
-                f_wire_cost_map[chan_index][segment_index][ix][iy] = copied_entry;
-            }
-        }
+#if defined(CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_EMPTY_MAPS)
+    for (std::pair<int, int> p : cost_map_.list_empty()) {
+        int ichan, iseg;
+        std::tie(ichan, iseg) = p;
+        VTR_LOG("cost map for %s(%d), chan %d EMPTY\n",
+                segment_inf[iseg].name.c_str(), iseg, box_id);
     }
+#endif
 }
 
-/* returns a cost entry in the f_wire_cost_map that is near the specified coordinates (and preferably towards (0,0)) */
-static Cost_Entry get_nearby_cost_entry(int x, int y, int segment_index, int chan_index) {
-    /* compute the slope from x,y to 0,0 and then move towards 0,0 by one unit to get the coordinates
-     * of the cost entry to be copied */
+static void compute_router_src_opin_lookahead() {
+    vtr::ScopedStartFinishTimer timer("Computing src/opin lookahead");
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
 
-    //VTR_ASSERT(x > 0 || y > 0); //Asertion fails in practise. TODO: debug
+    f_src_opin_delays.clear();
 
-    float slope;
-    if (x == 0) {
-        slope = 1e12; //just a really large number
-    } else if (y == 0) {
-        slope = 1e-12; //just a really small number
-    } else {
-        slope = (float)y / (float)x;
-    }
+    f_src_opin_delays.resize(device_ctx.physical_tile_types.size());
 
-    int copy_x, copy_y;
-    if (slope >= 1.0) {
-        copy_y = y - 1;
-        copy_x = vtr::nint((float)y / slope);
-    } else {
-        copy_x = x - 1;
-        copy_y = vtr::nint((float)x * slope);
-    }
+    std::vector<int> rr_nodes_at_loc;
 
-    copy_y = std::max(copy_y, 0); //Clip to zero
-    copy_x = std::max(copy_x, 0); //Clip to zero
+    //We assume that the routing connectivity of each instance of a physical tile is the same,
+    //and so only measure one instance of each type
+    for (size_t itile = 0; itile < device_ctx.physical_tile_types.size(); ++itile) {
+        for (e_rr_type rr_type : {SOURCE, OPIN}) {
+            vtr::Point<int> sample_loc(-1, -1);
 
-    Cost_Entry copy_entry = f_wire_cost_map[chan_index][segment_index][copy_x][copy_y];
+            size_t num_sampled_locs = 0;
+            bool ptcs_with_no_delays = true;
+            while (ptcs_with_no_delays) { //Haven't found wire connected to ptc
+                ptcs_with_no_delays = false;
 
-    /* if the entry to be copied is also empty, recurse */
-    if (std::isnan(copy_entry.delay) && std::isnan(copy_entry.congestion)) {
-        if (copy_x == 0 && copy_y == 0) {
-            copy_entry = Cost_Entry(0., 0.); //(0, 0) entry is invalid so set zero to terminate recursion
-        } else {
-            copy_entry = get_nearby_cost_entry(copy_x, copy_y, segment_index, chan_index);
-        }
-    }
+                sample_loc = pick_sample_tile(&device_ctx.physical_tile_types[itile], sample_loc);
 
-    return copy_entry;
-}
+                if (sample_loc.x() == -1 && sample_loc.y() == -1) {
+                    //No untried instances of the current tile type left
+                    VTR_LOG_WARN("Found no %ssample locations for %s in %s\n",
+                                 (num_sampled_locs == 0) ? "" : "more ",
+                                 rr_node_typename[rr_type],
+                                 device_ctx.physical_tile_types[itile].name);
+                    break;
+                }
 
-/* returns cost entry with the smallest delay */
-Cost_Entry Expansion_Cost_Entry::get_smallest_entry() {
-    Cost_Entry smallest_entry;
+                //VTR_LOG("Sampling %s at (%d,%d)\n", device_ctx.physical_tile_types[itile].name, sample_loc.x(), sample_loc.y());
 
-    for (auto entry : this->cost_vector) {
-        if (std::isnan(smallest_entry.delay) || entry.delay < smallest_entry.delay) {
-            smallest_entry = entry;
-        }
-    }
+                rr_nodes_at_loc.clear();
 
-    return smallest_entry;
-}
+                get_rr_node_indices(device_ctx.rr_node_indices, sample_loc.x(), sample_loc.y(), rr_type, &rr_nodes_at_loc);
+                for (int inode : rr_nodes_at_loc) {
+                    if (inode < 0) continue;
 
-/* returns a cost entry that represents the average of all the recorded entries */
-Cost_Entry Expansion_Cost_Entry::get_average_entry() {
-    float avg_delay = 0;
-    float avg_congestion = 0;
+                    RRNodeId node_id(inode);
 
-    for (auto cost_entry : this->cost_vector) {
-        avg_delay += cost_entry.delay;
-        avg_congestion += cost_entry.congestion;
-    }
+                    int ptc = rr_graph.node_ptc_num(node_id);
 
-    avg_delay /= (float)this->cost_vector.size();
-    avg_congestion /= (float)this->cost_vector.size();
+                    if (ptc >= int(f_src_opin_delays[itile].size())) {
+                        f_src_opin_delays[itile].resize(ptc + 1); //Inefficient but functional...
+                    }
 
-    return Cost_Entry(avg_delay, avg_congestion);
-}
+                    //Find the wire types which are reachable from inode and record them and
+                    //the cost to reach them
+                    dijkstra_flood_to_wires(itile, node_id, f_src_opin_delays);
 
-/* returns a cost entry that represents the geomean of all the recorded entries */
-Cost_Entry Expansion_Cost_Entry::get_geomean_entry() {
-    float geomean_delay = 0;
-    float geomean_cong = 0;
-    for (auto cost_entry : this->cost_vector) {
-        geomean_delay += log(cost_entry.delay);
-        geomean_cong += log(cost_entry.congestion);
-    }
+                    if (f_src_opin_delays[itile][ptc].empty()) {
+                        VTR_LOGV_DEBUG(f_router_debug, "Found no reachable wires from %s (%s) at (%d,%d)\n",
+                                       rr_node_typename[rr_type],
+                                       rr_node_arch_name(inode).c_str(),
+                                       sample_loc.x(),
+                                       sample_loc.y());
 
-    geomean_delay = exp(geomean_delay / (float)this->cost_vector.size());
-    geomean_cong = exp(geomean_cong / (float)this->cost_vector.size());
+                        ptcs_with_no_delays = true;
+                    }
+                }
 
-    return Cost_Entry(geomean_delay, geomean_cong);
+                ++num_sampled_locs;
+            }
+            if (ptcs_with_no_delays) {
+                VPR_ERROR(VPR_ERROR_ROUTE, "Some SOURCE/OPINs have no reachable wires\n");
+            }
+        }
+    }
 }
 
-/* returns a cost entry that represents the medial of all recorded entries */
-Cost_Entry Expansion_Cost_Entry::get_median_entry() {
-    /* find median by binning the delays of all entries and then chosing the bin
-     * with the largest number of entries */
+static void compute_router_chan_ipin_lookahead() {
+    vtr::ScopedStartFinishTimer timer("Computing chan/ipin lookahead");
+    auto& device_ctx = g_vpr_ctx.device();
 
-    int num_bins = 10;
+    f_chan_ipins_delays.clear();
 
-    /* find entries with smallest and largest delays */
-    Cost_Entry min_del_entry;
-    Cost_Entry max_del_entry;
-    for (auto entry : this->cost_vector) {
-        if (std::isnan(min_del_entry.delay) || entry.delay < min_del_entry.delay) {
-            min_del_entry = entry;
-        }
-        if (std::isnan(max_del_entry.delay) || entry.delay > max_del_entry.delay) {
-            max_del_entry = entry;
-        }
-    }
+    f_chan_ipins_delays.resize(device_ctx.physical_tile_types.size());
 
-    /* get the bin size */
-    float delay_diff = max_del_entry.delay - min_del_entry.delay;
-    float bin_size = delay_diff / (float)num_bins;
+    std::vector<int> rr_nodes_at_loc;
 
-    /* sort the cost entries into bins */
-    std::vector<std::vector<Cost_Entry>> entry_bins(num_bins, std::vector<Cost_Entry>());
-    for (auto entry : this->cost_vector) {
-        float bin_num = floor((entry.delay - min_del_entry.delay) / bin_size);
+    //We assume that the routing connectivity of each instance of a physical tile is the same,
+    //and so only measure one instance of each type
+    for (auto tile_type : device_ctx.physical_tile_types) {
+        vtr::Point<int> sample_loc(-1, -1);
 
-        VTR_ASSERT(vtr::nint(bin_num) >= 0 && vtr::nint(bin_num) <= num_bins);
-        if (vtr::nint(bin_num) == num_bins) {
-            /* largest entry will otherwise have an out-of-bounds bin number */
-            bin_num -= 1;
-        }
-        entry_bins[vtr::nint(bin_num)].push_back(entry);
-    }
+        sample_loc = pick_sample_tile(&tile_type, sample_loc);
 
-    /* find the bin with the largest number of elements */
-    int largest_bin = 0;
-    int largest_size = 0;
-    for (int ibin = 0; ibin < num_bins; ibin++) {
-        if (entry_bins[ibin].size() > (unsigned)largest_size) {
-            largest_bin = ibin;
-            largest_size = (unsigned)entry_bins[ibin].size();
+        if (sample_loc.x() == -1 && sample_loc.y() == -1) {
+            //No untried instances of the current tile type left
+            VTR_LOG_WARN("Found no sample locations for %s\n",
+                         tile_type.name);
+            continue;
         }
-    }
 
-    /* get the representative delay of the largest bin */
-    Cost_Entry representative_entry = entry_bins[largest_bin][0];
+        int min_x = std::max(0, sample_loc.x() - X_OFFSET);
+        int min_y = std::max(0, sample_loc.y() - Y_OFFSET);
+        int max_x = std::min(int(device_ctx.grid.width()), sample_loc.x() + X_OFFSET);
+        int max_y = std::min(int(device_ctx.grid.height()), sample_loc.y() + Y_OFFSET);
 
-    return representative_entry;
-}
+        for (int ix = min_x; ix < max_x; ix++) {
+            for (int iy = min_y; iy < max_y; iy++) {
+                for (auto rr_type : {CHANX, CHANY}) {
+                    rr_nodes_at_loc.clear();
 
-/* returns the absolute delta_x and delta_y offset required to reach to_node from from_node */
-static void get_xy_deltas(const RRNodeId from_node, const RRNodeId to_node, int* delta_x, int* delta_y) {
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& rr_graph = device_ctx.rr_nodes;
+                    get_rr_node_indices(device_ctx.rr_node_indices, ix, iy, rr_type, &rr_nodes_at_loc);
+                    for (int inode : rr_nodes_at_loc) {
+                        if (inode < 0) continue;
 
-    e_rr_type from_type = rr_graph.node_type(from_node);
-    e_rr_type to_type = rr_graph.node_type(to_node);
+                        RRNodeId node_id(inode);
 
-    if (!is_chan(from_type) && !is_chan(to_type)) {
-        //Alternate formulation for non-channel types
-        int from_x = 0;
-        int from_y = 0;
-        adjust_rr_position(from_node, from_x, from_y);
+                        //Find the IPINs which are reachable from the wires within the bounding box
+                        //around the selected tile location
+                        dijkstra_flood_to_ipins(node_id, f_chan_ipins_delays);
+                    }
+                }
+            }
+        }
+    }
+}
 
-        int to_x = 0;
-        int to_y = 0;
-        adjust_rr_position(to_node, to_x, to_y);
+static vtr::Point<int> pick_sample_tile(t_physical_tile_type_ptr tile_type, vtr::Point<int> prev) {
+    //Very simple for now, just pick the fist matching tile found
+    vtr::Point<int> loc(OPEN, OPEN);
 
-        *delta_x = to_x - from_x;
-        *delta_y = to_y - from_y;
-    } else {
-        //Traditional formulation
+    //VTR_LOG("Prev: %d,%d\n", prev.x(), prev.y());
 
-        /* get chan/seg coordinates of the from/to nodes. seg coordinate is along the wire,
-         * chan coordinate is orthogonal to the wire */
-        int from_seg_low;
-        int from_seg_high;
-        int from_chan;
-        int to_seg;
-        int to_chan;
-        if (from_type == CHANY) {
-            from_seg_low = rr_graph.node_ylow(from_node);
-            from_seg_high = rr_graph.node_yhigh(from_node);
-            from_chan = rr_graph.node_xlow(from_node);
-            to_seg = rr_graph.node_ylow(to_node);
-            to_chan = rr_graph.node_xlow(to_node);
-        } else {
-            from_seg_low = rr_graph.node_xlow(from_node);
-            from_seg_high = rr_graph.node_xhigh(from_node);
-            from_chan = rr_graph.node_ylow(from_node);
-            to_seg = rr_graph.node_xlow(to_node);
-            to_chan = rr_graph.node_ylow(to_node);
-        }
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& grid = device_ctx.grid;
 
-        /* now we want to count the minimum number of *channel segments* between the from and to nodes */
-        int delta_seg, delta_chan;
+    int y_init = prev.y() + 1; //Start searching next element above prev
 
-        /* orthogonal to wire */
-        int no_need_to_pass_by_clb = 0; //if we need orthogonal wires then we don't need to pass by the target CLB along the current wire direction
-        if (to_chan > from_chan + 1) {
-            /* above */
-            delta_chan = to_chan - from_chan;
-            no_need_to_pass_by_clb = 1;
-        } else if (to_chan < from_chan) {
-            /* below */
-            delta_chan = from_chan - to_chan + 1;
-            no_need_to_pass_by_clb = 1;
-        } else {
-            /* adjacent to current channel */
-            delta_chan = 0;
-            no_need_to_pass_by_clb = 0;
-        }
+    for (int x = prev.x(); x < int(grid.width()); ++x) {
+        if (x < 0) continue;
 
-        /* along same direction as wire. */
-        if (to_seg > from_seg_high) {
-            /* ahead */
-            delta_seg = to_seg - from_seg_high - no_need_to_pass_by_clb;
-        } else if (to_seg < from_seg_low) {
-            /* behind */
-            delta_seg = from_seg_low - to_seg - no_need_to_pass_by_clb;
-        } else {
-            /* along the span of the wire */
-            delta_seg = 0;
-        }
+        //VTR_LOG("  x: %d\n", x);
 
-        /* account for wire direction. lookahead map was computed by looking up and to the right starting at INC wires. for targets
-         * that are opposite of the wire direction, let's add 1 to delta_seg */
-        e_direction from_dir = rr_graph.node_direction(from_node);
-        if (is_chan(from_type)
-            && ((to_seg < from_seg_low && from_dir == INC_DIRECTION) || (to_seg > from_seg_high && from_dir == DEC_DIRECTION))) {
-            delta_seg++;
+        for (int y = y_init; y < int(grid.height()); ++y) {
+            if (y < 0) continue;
+
+            //VTR_LOG("   y: %d\n", y);
+            if (grid[x][y].type->index == tile_type->index) {
+                loc.set_x(x);
+                loc.set_y(y);
+                return loc;
+            }
         }
 
-        if (from_type == CHANY) {
-            *delta_x = delta_chan;
-            *delta_y = delta_seg;
+        if (loc.x() != OPEN && loc.y() != OPEN) {
+            break;
         } else {
-            *delta_x = delta_seg;
-            *delta_y = delta_chan;
+            y_init = 0; //Prepare to search next column
         }
     }
+    //VTR_LOG("Next: %d,%d\n", loc.x(), loc.y());
 
-    VTR_ASSERT_SAFE(std::abs(*delta_x) < (int)device_ctx.grid.width());
-    VTR_ASSERT_SAFE(std::abs(*delta_y) < (int)device_ctx.grid.height());
+    return loc;
 }
 
-static void adjust_rr_position(const RRNodeId rr, int& x, int& y) {
+static void dijkstra_flood_to_wires(int itile, RRNodeId node, t_src_opin_delays& src_opin_delays) {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    e_rr_type rr_type = rr_graph.node_type(rr);
+    struct t_pq_entry {
+        float delay;
+        float congestion;
+        RRNodeId node;
 
-    if (is_chan(rr_type)) {
-        adjust_rr_wire_position(rr, x, y);
-    } else if (is_pin(rr_type)) {
-        adjust_rr_pin_position(rr, x, y);
-    } else {
-        VTR_ASSERT_SAFE(is_src_sink(rr_type));
-        adjust_rr_src_sink_position(rr, x, y);
-    }
-}
+        bool operator<(const t_pq_entry& rhs) const {
+            return this->delay < rhs.delay;
+        }
+    };
+
+    std::priority_queue<t_pq_entry> pq;
+
+    t_pq_entry root;
+    root.congestion = 0.;
+    root.delay = 0.;
+    root.node = node;
+
+    int ptc = rr_graph.node_ptc_num(node);
 
-static void adjust_rr_pin_position(const RRNodeId rr, int& x, int& y) {
     /*
-     * VPR uses a co-ordinate system where wires above and to the right of a block
-     * are at the same location as the block:
-     *
-     *
-     *       <-----------C
-     *    D
-     *    |  +---------+  ^
-     *    |  |         |  |
-     *    |  |  (1,1)  |  |
-     *    |  |         |  |
-     *    V  +---------+  |
-     *                    A
-     *     B----------->                  
+     * Perform Djikstra from the SOURCE/OPIN of interest, stopping at the the first
+     * reachable wires (i.e until we hit the inter-block routing network), or a SINK
+     * (via a direct-connect).
      *
-     * So wires are located as follows:
+     * Note that typical RR graphs are structured :
      *
-     *      A: (1, 1) CHANY
-     *      B: (1, 0) CHANX
-     *      C: (1, 1) CHANX
-     *      D: (0, 1) CHANY
+     *      SOURCE ---> OPIN --> CHANX/CHANY
+     *              |
+     *              --> OPIN --> CHANX/CHANY
+     *              |
+     *             ...
      *
-     * But all pins incident on the surrounding channels
-     * would be at (1,1) along with a relevant side.
+     *   possibly with direct-connects of the form:
      *
-     * In the following, we adjust the positions of pins to
-     * account for the channel they are incident too.
+     *      SOURCE --> OPIN --> IPIN --> SINK
      *
-     * Note that blocks at (0,*) such as IOs, may have (unconnected)
-     * pins on the left side, so we also clip the minimum x to zero.
-     * Similarly for blocks at (*,0) we clip the minimum y to zero.
+     * and there is a small number of fixed hops from SOURCE/OPIN to CHANX/CHANY or
+     * to a SINK (via a direct-connect), so this runs very fast (i.e. O(1))
      */
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& rr_graph = device_ctx.rr_nodes;
+    pq.push(root);
+    while (!pq.empty()) {
+        t_pq_entry curr = pq.top();
+        pq.pop();
 
-    VTR_ASSERT_SAFE(is_pin(rr_graph.node_type(rr)));
-    VTR_ASSERT_SAFE(rr_graph.node_xlow(rr) == rr_graph.node_xhigh(rr));
-    VTR_ASSERT_SAFE(rr_graph.node_ylow(rr) == rr_graph.node_yhigh(rr));
+        e_rr_type curr_rr_type = rr_graph.node_type(curr.node);
+        if (curr_rr_type == CHANX || curr_rr_type == CHANY || curr_rr_type == SINK) {
+            //We stop expansion at any CHANX/CHANY/SINK
+            int seg_index;
+            if (curr_rr_type != SINK) {
+                //It's a wire, figure out its type
+                int cost_index = rr_graph.node_cost_index(curr.node);
+                seg_index = device_ctx.rr_indexed_data[cost_index].seg_index;
+            } else {
+                //This is a direct-connect path between an IPIN and OPIN,
+                //which terminated at a SINK.
+                //
+                //We treat this as a 'special' wire type
+                seg_index = DIRECT_CONNECT_SPECIAL_SEG_TYPE;
+            }
 
-    x = rr_graph.node_xlow(rr);
-    y = rr_graph.node_ylow(rr);
+            //Keep costs of the best path to reach each wire type
+            if (!src_opin_delays[itile][ptc].count(seg_index)
+                || curr.delay < src_opin_delays[itile][ptc][seg_index].delay) {
+                src_opin_delays[itile][ptc][seg_index].wire_rr_type = curr_rr_type;
+                src_opin_delays[itile][ptc][seg_index].wire_seg_index = seg_index;
+                src_opin_delays[itile][ptc][seg_index].delay = curr.delay;
+                src_opin_delays[itile][ptc][seg_index].congestion = curr.congestion;
+            }
 
-    e_side rr_side = rr_graph.node_side(rr);
+        } else if (curr_rr_type == SOURCE || curr_rr_type == OPIN || curr_rr_type == IPIN) {
+            //We allow expansion through SOURCE/OPIN/IPIN types
+            int cost_index = rr_graph.node_cost_index(curr.node);
+            float incr_cong = device_ctx.rr_indexed_data[cost_index].base_cost; //Current nodes congestion cost
 
-    if (rr_side == LEFT) {
-        x -= 1;
-        x = std::max(x, 0);
-    } else if (rr_side == BOTTOM && y > 0) {
-        y -= 1;
-        y = std::max(y, 0);
+            for (RREdgeId edge : rr_graph.edge_range(curr.node)) {
+                int iswitch = rr_graph.edge_switch(edge);
+                float incr_delay = device_ctx.rr_switch_inf[iswitch].Tdel;
+
+                RRNodeId next_node = rr_graph.edge_sink_node(edge);
+
+                t_pq_entry next;
+                next.congestion = curr.congestion + incr_cong; //Of current node
+                next.delay = curr.delay + incr_delay;          //To reach next node
+                next.node = next_node;
+
+                pq.push(next);
+            }
+        } else {
+            VPR_ERROR(VPR_ERROR_ROUTE, "Unrecognized RR type");
+        }
     }
 }
 
-static void adjust_rr_wire_position(const RRNodeId rr, int& x, int& y) {
+static void dijkstra_flood_to_ipins(RRNodeId node, t_chan_ipins_delays& chan_ipins_delays) {
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_nodes;
 
-    VTR_ASSERT_SAFE(is_chan(rr_graph.node_type(rr)));
+    struct t_pq_entry {
+        float delay;
+        float congestion;
+        RRNodeId node;
+        int level;
+        int prev_seg_index;
 
-    e_direction rr_dir = rr_graph.node_direction(rr);
+        bool operator<(const t_pq_entry& rhs) const {
+            return this->delay < rhs.delay;
+        }
+    };
 
-    if (rr_dir == DEC_DIRECTION) {
-        x = rr_graph.node_xhigh(rr);
-        y = rr_graph.node_yhigh(rr);
-    } else if (rr_dir == INC_DIRECTION) {
-        x = rr_graph.node_xlow(rr);
-        y = rr_graph.node_ylow(rr);
-    } else {
-        VTR_ASSERT_SAFE(rr_dir == BI_DIRECTION);
-        //Not sure what to do here...
-        //Try average for now.
-        x = vtr::nint((rr_graph.node_xlow(rr) + rr_graph.node_xhigh(rr)) / 2.);
-        y = vtr::nint((rr_graph.node_ylow(rr) + rr_graph.node_yhigh(rr)) / 2.);
-    }
-}
+    std::priority_queue<t_pq_entry> pq;
 
-static void adjust_rr_src_sink_position(const RRNodeId rr, int& x, int& y) {
-    //SOURCE/SINK nodes assume the full dimensions of their
-    //associated block
-    //
-    //Use the average position.
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& rr_graph = device_ctx.rr_nodes;
+    t_pq_entry root;
+    root.congestion = 0.;
+    root.delay = 0.;
+    root.node = node;
+    root.level = 0;
+    root.prev_seg_index = OPEN;
 
-    VTR_ASSERT_SAFE(is_src_sink(rr_graph.node_type(rr)));
+    /*
+     * Perform Djikstra from the CHAN of interest, stopping at the the first
+     * reachable IPIN
+     *
+     * Note that typical RR graphs are structured :
+     *
+     *      CHANX/CHANY --> CHANX/CHANY --> ... --> CHANX/CHANY --> IPIN --> SINK
+     *                  |
+     *                  --> CHANX/CHANY --> ... --> CHANX/CHANY --> IPIN --> SINK
+     *                  |
+     *                  ...
+     *
+     * and there is a variable number of hops from a given CHANX/CHANY  to IPIN.
+     * To avoid impacting on run-time, a fixed number of hops is performed. This
+     * should be enough to find the delay from the last CAHN to IPIN connection.
+     */
+    pq.push(root);
+    while (!pq.empty()) {
+        t_pq_entry curr = pq.top();
+        pq.pop();
 
-    x = vtr::nint((rr_graph.node_xlow(rr) + rr_graph.node_xhigh(rr)) / 2.);
-    y = vtr::nint((rr_graph.node_ylow(rr) + rr_graph.node_yhigh(rr)) / 2.);
-}
+        e_rr_type curr_rr_type = rr_graph.node_type(curr.node);
+        if (curr_rr_type == IPIN) {
+            int seg_index = curr.prev_seg_index;
 
-static void print_wire_cost_map(const std::vector<t_segment_inf>& segment_inf) {
-    auto& device_ctx = g_vpr_ctx.device();
+            int node_x = rr_graph.node_xlow(curr.node);
+            int node_y = rr_graph.node_ylow(curr.node);
 
-    for (size_t chan_index = 0; chan_index < f_wire_cost_map.dim_size(0); chan_index++) {
-        for (size_t iseg = 0; iseg < f_wire_cost_map.dim_size(1); iseg++) {
-            vtr::printf("Seg %d (%s, length %d) %d\n",
-                        iseg,
-                        segment_inf[iseg].name.c_str(),
-                        segment_inf[iseg].length,
-                        chan_index);
-            for (size_t iy = 0; iy < device_ctx.grid.height(); iy++) {
-                for (size_t ix = 0; ix < device_ctx.grid.width(); ix++) {
-                    vtr::printf("%2d,%2d: %10.3g\t", ix, iy, f_wire_cost_map[chan_index][iseg][ix][iy].delay);
-                }
-                vtr::printf("\n");
+            auto tile_type = device_ctx.grid[node_x][node_y].type;
+            int itile = tile_type->index;
+
+            int ptc = rr_graph.node_ptc_num(curr.node);
+
+            if (ptc >= int(chan_ipins_delays[itile].size())) {
+                chan_ipins_delays[itile].resize(ptc + 1); //Inefficient but functional...
             }
-            vtr::printf("\n\n");
-        }
-    }
-}
 
-static void print_router_cost_map(const t_routing_cost_map& router_cost_map) {
-    VTR_LOG("Djikstra Flood Costs:\n");
-    for (size_t x = 0; x < router_cost_map.dim_size(0); x++) {
-        for (size_t y = 0; y < router_cost_map.dim_size(1); y++) {
-            VTR_LOG("(%zu,%zu):\n", x, y);
+            //Keep costs of the best path to reach each wire type
+            chan_ipins_delays[itile][ptc][seg_index].wire_rr_type = curr_rr_type;
+            chan_ipins_delays[itile][ptc][seg_index].wire_seg_index = seg_index;
+            chan_ipins_delays[itile][ptc][seg_index].delay = curr.delay;
+            chan_ipins_delays[itile][ptc][seg_index].congestion = curr.congestion;
+        } else if (curr_rr_type == CHANX || curr_rr_type == CHANY) {
+            if (curr.level >= MAX_EXPANSION_LEVEL) {
+                continue;
+            }
+
+            //We allow expansion through SOURCE/OPIN/IPIN types
+            int cost_index = rr_graph.node_cost_index(curr.node);
+            float new_cong = device_ctx.rr_indexed_data[cost_index].base_cost; //Current nodes congestion cost
+            int seg_index = device_ctx.rr_indexed_data[cost_index].seg_index;
+
+            for (RREdgeId edge : rr_graph.edge_range(curr.node)) {
+                int iswitch = rr_graph.edge_switch(edge);
+                float new_delay = device_ctx.rr_switch_inf[iswitch].Tdel;
+
+                RRNodeId next_node = rr_graph.edge_sink_node(edge);
+
+                t_pq_entry next;
+                next.congestion = new_cong; //Of current node
+                next.delay = new_delay;     //To reach next node
+                next.node = next_node;
+                next.level = curr.level + 1;
+                next.prev_seg_index = seg_index;
 
-            for (size_t i = 0; i < router_cost_map[x][y].cost_vector.size(); ++i) {
-                Cost_Entry entry = router_cost_map[x][y].cost_vector[i];
-                VTR_LOG("  %d: delay=%10.3g cong=%10.3g\n", i, entry.delay, entry.congestion);
+                pq.push(next);
             }
+        } else {
+            VPR_ERROR(VPR_ERROR_ROUTE, "Unrecognized RR type");
         }
     }
 }
 
-//
-// When writing capnp targetted serialization, always allow compilation when
-// VTR_ENABLE_CAPNPROTO=OFF.  Generally this means throwing an exception
-// instead.
-//
-#ifndef VTR_ENABLE_CAPNPROTO
+// get an expected minimum cost for routing from the current node to the target node
+float MapLookahead::get_expected_cost(
+    int current_node,
+    int target_node,
+    const t_conn_cost_params& params,
+    float /*R_upstream*/) const {
+    auto& device_ctx = g_vpr_ctx.device();
 
-#    define DISABLE_ERROR                               \
-        "is disabled because VTR_ENABLE_CAPNPROTO=OFF." \
-        "Re-compile with CMake option VTR_ENABLE_CAPNPROTO=ON to enable."
+    t_rr_type rr_type = device_ctx.rr_nodes[current_node].type();
 
-void read_router_lookahead(const std::string& /*file*/) {
-    VPR_THROW(VPR_ERROR_PLACE, "MapLookahead::read " DISABLE_ERROR);
+    if (rr_type == CHANX || rr_type == CHANY) {
+        return get_map_cost(
+            current_node, target_node, params.criticality);
+    } else if (rr_type == IPIN) { /* Change if you're allowing route-throughs */
+        return (device_ctx.rr_indexed_data[SINK_COST_INDEX].base_cost);
+    } else { /* Change this if you want to investigate route-throughs */
+        return (0.);
+    }
 }
 
-void DeltaDelayModel::write(const std::string& /*file*/) const {
-    VPR_THROW(VPR_ERROR_PLACE, "MapLookahead::write " DISABLE_ERROR);
+#ifndef VTR_ENABLE_CAPNPROTO
+
+void MapLookahead::read(const std::string& file) {
+    VPR_THROW(VPR_ERROR_ROUTE, "MapLookahead::read not implemented");
+}
+void MapLookahead::write(const std::string& file) const {
+    VPR_THROW(VPR_ERROR_ROUTE, "MapLookahead::write not implemented");
 }
 
-#else /* VTR_ENABLE_CAPNPROTO */
+#else
+
+void MapLookahead::read(const std::string& file) {
+    cost_map_.read(file);
+
+    compute_router_src_opin_lookahead();
+    compute_router_chan_ipin_lookahead();
+}
+void MapLookahead::write(const std::string& file) const {
+    cost_map_.write(file);
+}
 
-static void ToCostEntry(Cost_Entry* out, const VprMapCostEntry::Reader& in) {
+static void ToCostEntry(util::Cost_Entry* out, const VprCostEntry::Reader& in) {
     out->delay = in.getDelay();
     out->congestion = in.getCongestion();
+    out->fill = in.getFill();
 }
 
-static void FromCostEntry(VprMapCostEntry::Builder* out, const Cost_Entry& in) {
+static void FromCostEntry(VprCostEntry::Builder* out, const util::Cost_Entry& in) {
     out->setDelay(in.delay);
     out->setCongestion(in.congestion);
+    out->setFill(in.fill);
+}
+
+static void ToVprVector2D(std::pair<int, int>* out, const VprVector2D::Reader& in) {
+    *out = std::make_pair(in.getX(), in.getY());
+}
+
+static void FromVprVector2D(VprVector2D::Builder* out, const std::pair<int, int>& in) {
+    out->setX(in.first);
+    out->setY(in.second);
+}
+
+static void ToMatrixCostEntry(vtr::NdMatrix<util::Cost_Entry, 2>* out,
+                              const Matrix<VprCostEntry>::Reader& in) {
+    ToNdMatrix<2, VprCostEntry, util::Cost_Entry>(out, in, ToCostEntry);
 }
 
-void read_router_lookahead(const std::string& file) {
-    vtr::ScopedStartFinishTimer timer("Loading router wire lookahead map");
+static void FromMatrixCostEntry(
+    Matrix<VprCostEntry>::Builder* out,
+    const vtr::NdMatrix<util::Cost_Entry, 2>& in) {
+    FromNdMatrix<2, VprCostEntry, util::Cost_Entry>(
+        out, in, FromCostEntry);
+}
+
+static void ToFloat(float* out, const VprFloatEntry::Reader& in) {
+    // Getting a scalar field is always "get<field name>()".
+    *out = in.getValue();
+}
+
+static void FromFloat(VprFloatEntry::Builder* out, const float& in) {
+    // Setting a scalar field is always "set<field name>(value)".
+    out->setValue(in);
+}
+
+void CostMap::read(const std::string& file) {
+    build_segment_map();
     MmapFile f(file);
 
-    /* Increase reader limit to 1G words to allow for large files. */
     ::capnp::ReaderOptions opts = default_large_capnp_opts();
     ::capnp::FlatArrayMessageReader reader(f.getData(), opts);
 
-    auto map = reader.getRoot<VprMapLookahead>();
+    auto cost_map = reader.getRoot<VprCostMap>();
+    {
+        const auto& offset = cost_map.getOffset();
+        ToNdMatrix<2, VprVector2D, std::pair<int, int>>(
+            &offset_, offset, ToVprVector2D);
+    }
+
+    {
+        const auto& cost_maps = cost_map.getCostMap();
+        ToNdMatrix<2, Matrix<VprCostEntry>, vtr::NdMatrix<util::Cost_Entry, 2>>(
+            &cost_map_, cost_maps, ToMatrixCostEntry);
+    }
 
-    ToNdMatrix<4, VprMapCostEntry, Cost_Entry>(&f_wire_cost_map, map.getCostMap(), ToCostEntry);
+    {
+        const auto& penalty = cost_map.getPenalty();
+        ToNdMatrix<2, VprFloatEntry, float>(
+            &penalty_, penalty, ToFloat);
+    }
 }
 
-void write_router_lookahead(const std::string& file) {
+void CostMap::write(const std::string& file) const {
     ::capnp::MallocMessageBuilder builder;
 
-    auto map = builder.initRoot<VprMapLookahead>();
+    auto cost_map = builder.initRoot<VprCostMap>();
+
+    {
+        auto offset = cost_map.initOffset();
+        FromNdMatrix<2, VprVector2D, std::pair<int, int>>(
+            &offset, offset_, FromVprVector2D);
+    }
+
+    {
+        auto cost_maps = cost_map.initCostMap();
+        FromNdMatrix<2, Matrix<VprCostEntry>, vtr::NdMatrix<util::Cost_Entry, 2>>(
+            &cost_maps, cost_map_, FromMatrixCostEntry);
+    }
 
-    auto cost_map = map.initCostMap();
-    FromNdMatrix<4, VprMapCostEntry, Cost_Entry>(&cost_map, f_wire_cost_map, FromCostEntry);
+    {
+        auto penalty = cost_map.initPenalty();
+        FromNdMatrix<2, VprFloatEntry, float>(
+            &penalty, penalty_, FromFloat);
+    }
 
     writeMessageToFile(file, &builder);
 }
-
 #endif
diff --git a/vpr/src/route/router_lookahead_map.h b/vpr/src/route/router_lookahead_map.h
index e37c9967a00..4d8b043f0e6 100644
--- a/vpr/src/route/router_lookahead_map.h
+++ b/vpr/src/route/router_lookahead_map.h
@@ -1,48 +1,84 @@
-#pragma once
+#ifndef CONNECTION_BOX_LOOKAHEAD_H_
+#define CONNECTION_BOX_LOOKAHEAD_H_
 
-#include <string>
-#include <limits>
-#include "vtr_ndmatrix.h"
+#include <vector>
+#include "physical_types.h"
 #include "router_lookahead.h"
+#include "router_lookahead_map_utils.h"
+#include "vtr_geometry.h"
 
-class MapLookahead : public RouterLookahead {
-  protected:
-    float get_expected_cost(int node, int target_node, const t_conn_cost_params& params, float R_upstream) const override;
-    void compute(const std::vector<t_segment_inf>& segment_inf) override;
-    void read(const std::string& file) override;
-    void write(const std::string& file) const override;
-};
+// Keys in the RoutingCosts map
+struct RoutingCostKey {
+    // index of the channel (CHANX or CHANY)
+    int chan_index;
 
-/* f_cost_map is an array of these cost entries that specifies delay/congestion estimates
- * to travel relative x/y distances */
-class Cost_Entry {
-  public:
-    float delay;
-    float congestion;
+    // segment type index
+    int seg_index;
+
+    // offset of the destination connection box from the starting segment
+    vtr::Point<int> delta;
+
+    RoutingCostKey()
+        : seg_index(-1)
+        , delta(0, 0) {}
+    RoutingCostKey(int chan_index_arg, int seg_index_arg, vtr::Point<int> delta_arg)
+        : chan_index(chan_index_arg)
+        , seg_index(seg_index_arg)
+        , delta(delta_arg) {}
 
-    Cost_Entry()
-        : Cost_Entry(std::numeric_limits<float>::quiet_NaN(),
-                     std::numeric_limits<float>::quiet_NaN()) {
+    bool operator==(const RoutingCostKey& other) const {
+        return seg_index == other.seg_index && chan_index == other.chan_index && delta == other.delta;
     }
+};
 
-    Cost_Entry(float set_delay, float set_congestion) {
-        delay = set_delay;
-        congestion = set_congestion;
+// hash implementation for RoutingCostKey
+struct HashRoutingCostKey {
+    std::size_t operator()(RoutingCostKey const& key) const noexcept {
+        std::size_t hash = std::hash<int>{}(key.chan_index);
+        vtr::hash_combine(hash, key.seg_index);
+        vtr::hash_combine(hash, key.delta.x());
+        vtr::hash_combine(hash, key.delta.y());
+        return hash;
     }
 };
 
-/* provides delay/congestion estimates to travel specified distances
- * in the x/y direction */
-typedef vtr::NdMatrix<Cost_Entry, 4> t_wire_cost_map; //[0..1][[0..num_seg_types-1]0..device_ctx.grid.width()-1][0..device_ctx.grid.height()-1]
-                                                      //[0..1] entry distinguish between CHANX/CHANY start nodes respectively
+// Map used to store intermediate routing costs
+typedef std::unordered_map<RoutingCostKey, float, HashRoutingCostKey> RoutingCosts;
+
+// Dense cost maps per source segment and destination connection box types
+class CostMap {
+  public:
+    void set_counts(size_t seg_count);
+    void build_segment_map();
+    int node_to_segment(int from_node_ind) const;
+    util::Cost_Entry find_cost(int from_seg_index, e_rr_type rr_type, int delta_x, int delta_y) const;
+    void set_cost_map(const RoutingCosts& delay_costs, const RoutingCosts& base_costs);
+    std::pair<util::Cost_Entry, int> get_nearby_cost_entry(const vtr::NdMatrix<util::Cost_Entry, 2>& matrix, int cx, int cy, const vtr::Rect<int>& bounds);
+    void read(const std::string& file);
+    void write(const std::string& file) const;
+    void print(int iseg) const;
+    std::vector<std::pair<int, int>> list_empty() const;
+
+  private:
+    vtr::Matrix<vtr::Matrix<util::Cost_Entry>> cost_map_;
+    vtr::Matrix<std::pair<int, int>> offset_;
+    vtr::Matrix<float> penalty_;
+    std::vector<int> segment_map_;
+    size_t seg_count_;
+};
+
+// Implementation of RouterLookahead based on source segment and destination connection box types
+class MapLookahead : public RouterLookahead {
+  public:
+    std::pair<float, float> get_src_opin_delays(RRNodeId from_node, int delta_x, int delta_y, float criticality_fac) const;
+    float get_expected_cost(int node, int target_node, const t_conn_cost_params& params, float R_upstream) const override;
+    float get_map_cost(int from_node_ind, int to_node_ind, float criticality_fac) const;
+    void compute(const std::vector<t_segment_inf>& segment_inf) override;
 
-void read_router_lookahead(const std::string& file);
-void write_router_lookahead(const std::string& file);
+    void read(const std::string& file) override;
+    void write(const std::string& file) const override;
 
-/* Computes the lookahead map to be used by the router. If a map was computed prior to this, a new one will not be computed again.
- * The rr graph must have been built before calling this function. */
-void compute_router_lookahead(const std::vector<t_segment_inf>& segment_inf);
+    CostMap cost_map_;
+};
 
-/* queries the lookahead_map (should have been computed prior to routing) to get the expected cost
- * from the specified source to the specified target */
-float get_lookahead_map_cost(RRNodeId from_node, RRNodeId to_node, float criticality_fac);
+#endif
diff --git a/vpr/src/route/router_lookahead_map_utils.cpp b/vpr/src/route/router_lookahead_map_utils.cpp
new file mode 100644
index 00000000000..57c12ab84fd
--- /dev/null
+++ b/vpr/src/route/router_lookahead_map_utils.cpp
@@ -0,0 +1,246 @@
+#include "router_lookahead_map_utils.h"
+
+#include "globals.h"
+#include "vpr_context.h"
+#include "vtr_math.h"
+#include "route_common.h"
+
+namespace util {
+
+PQ_Entry::PQ_Entry(
+    int set_rr_node_ind,
+    int switch_ind,
+    float parent_delay,
+    float parent_R_upstream,
+    float parent_congestion_upstream,
+    bool starting_node,
+    float Tsw_adjust) {
+    this->rr_node_ind = set_rr_node_ind;
+
+    auto& device_ctx = g_vpr_ctx.device();
+    this->delay = parent_delay;
+    this->congestion_upstream = parent_congestion_upstream;
+    this->R_upstream = parent_R_upstream;
+    if (!starting_node) {
+        float Tsw = device_ctx.rr_switch_inf[switch_ind].Tdel;
+        Tsw += Tsw_adjust;
+        VTR_ASSERT(Tsw >= 0.f);
+        float Rsw = device_ctx.rr_switch_inf[switch_ind].R;
+        float Cnode = device_ctx.rr_nodes[set_rr_node_ind].C();
+        float Rnode = device_ctx.rr_nodes[set_rr_node_ind].R();
+
+        float T_linear = 0.f;
+        if (device_ctx.rr_switch_inf[switch_ind].buffered()) {
+            T_linear = Tsw + Rsw * Cnode + 0.5 * Rnode * Cnode;
+        } else { /* Pass transistor */
+            T_linear = Tsw + 0.5 * Rsw * Cnode;
+        }
+
+        float base_cost = 0.f;
+        if (device_ctx.rr_switch_inf[switch_ind].configurable()) {
+            base_cost = get_rr_cong_cost(set_rr_node_ind);
+        }
+
+        VTR_ASSERT(T_linear >= 0.);
+        VTR_ASSERT(base_cost >= 0.);
+        this->delay += T_linear;
+
+        this->congestion_upstream += base_cost;
+    }
+
+    /* set the cost of this node */
+    this->cost = this->delay;
+}
+
+util::PQ_Entry_Delay::PQ_Entry_Delay(
+    int set_rr_node_ind,
+    int switch_ind,
+    const util::PQ_Entry_Delay* parent) {
+    this->rr_node_ind = set_rr_node_ind;
+
+    if (parent != nullptr) {
+        auto& device_ctx = g_vpr_ctx.device();
+        float Tsw = device_ctx.rr_switch_inf[switch_ind].Tdel;
+        float Rsw = device_ctx.rr_switch_inf[switch_ind].R;
+        float Cnode = device_ctx.rr_nodes[set_rr_node_ind].C();
+        float Rnode = device_ctx.rr_nodes[set_rr_node_ind].R();
+
+        float T_linear = 0.f;
+        if (device_ctx.rr_switch_inf[switch_ind].buffered()) {
+            T_linear = Tsw + Rsw * Cnode + 0.5 * Rnode * Cnode;
+        } else { /* Pass transistor */
+            T_linear = Tsw + 0.5 * Rsw * Cnode;
+        }
+
+        VTR_ASSERT(T_linear >= 0.);
+        this->delay_cost = parent->delay_cost + T_linear;
+    } else {
+        this->delay_cost = 0.f;
+    }
+}
+
+util::PQ_Entry_Base_Cost::PQ_Entry_Base_Cost(
+    int set_rr_node_ind,
+    int switch_ind,
+    const util::PQ_Entry_Base_Cost* parent) {
+    this->rr_node_ind = set_rr_node_ind;
+
+    if (parent != nullptr) {
+        auto& device_ctx = g_vpr_ctx.device();
+        if (device_ctx.rr_switch_inf[switch_ind].configurable()) {
+            this->base_cost = parent->base_cost + get_rr_cong_cost(set_rr_node_ind);
+        } else {
+            this->base_cost = parent->base_cost;
+        }
+    } else {
+        this->base_cost = 0.f;
+    }
+}
+
+/* returns cost entry with the smallest delay */
+util::Cost_Entry util::Expansion_Cost_Entry::get_smallest_entry() const {
+    util::Cost_Entry smallest_entry;
+
+    for (auto entry : this->cost_vector) {
+        if (!smallest_entry.valid() || entry.delay < smallest_entry.delay) {
+            smallest_entry = entry;
+        }
+    }
+
+    return smallest_entry;
+}
+
+/* returns a cost entry that represents the average of all the recorded entries */
+util::Cost_Entry util::Expansion_Cost_Entry::get_average_entry() const {
+    float avg_delay = 0;
+    float avg_congestion = 0;
+
+    for (auto cost_entry : this->cost_vector) {
+        avg_delay += cost_entry.delay;
+        avg_congestion += cost_entry.congestion;
+    }
+
+    avg_delay /= (float)this->cost_vector.size();
+    avg_congestion /= (float)this->cost_vector.size();
+
+    return util::Cost_Entry(avg_delay, avg_congestion);
+}
+
+/* returns a cost entry that represents the geomean of all the recorded entries */
+util::Cost_Entry util::Expansion_Cost_Entry::get_geomean_entry() const {
+    float geomean_delay = 0;
+    float geomean_cong = 0;
+    for (auto cost_entry : this->cost_vector) {
+        geomean_delay += log(cost_entry.delay);
+        geomean_cong += log(cost_entry.congestion);
+    }
+
+    geomean_delay = exp(geomean_delay / (float)this->cost_vector.size());
+    geomean_cong = exp(geomean_cong / (float)this->cost_vector.size());
+
+    return util::Cost_Entry(geomean_delay, geomean_cong);
+}
+
+/* returns a cost entry that represents the medial of all recorded entries */
+util::Cost_Entry util::Expansion_Cost_Entry::get_median_entry() const {
+    /* find median by binning the delays of all entries and then chosing the bin
+     * with the largest number of entries */
+
+    int num_bins = 10;
+
+    /* find entries with smallest and largest delays */
+    util::Cost_Entry min_del_entry;
+    util::Cost_Entry max_del_entry;
+    for (auto entry : this->cost_vector) {
+        if (!min_del_entry.valid() || entry.delay < min_del_entry.delay) {
+            min_del_entry = entry;
+        }
+        if (!max_del_entry.valid() || entry.delay > max_del_entry.delay) {
+            max_del_entry = entry;
+        }
+    }
+
+    /* get the bin size */
+    float delay_diff = max_del_entry.delay - min_del_entry.delay;
+    float bin_size = delay_diff / (float)num_bins;
+
+    /* sort the cost entries into bins */
+    std::vector<std::vector<util::Cost_Entry>> entry_bins(num_bins, std::vector<util::Cost_Entry>());
+    for (auto entry : this->cost_vector) {
+        float bin_num = floor((entry.delay - min_del_entry.delay) / bin_size);
+
+        VTR_ASSERT(vtr::nint(bin_num) >= 0 && vtr::nint(bin_num) <= num_bins);
+        if (vtr::nint(bin_num) == num_bins) {
+            /* largest entry will otherwise have an out-of-bounds bin number */
+            bin_num -= 1;
+        }
+        entry_bins[vtr::nint(bin_num)].push_back(entry);
+    }
+
+    /* find the bin with the largest number of elements */
+    int largest_bin = 0;
+    int largest_size = 0;
+    for (int ibin = 0; ibin < num_bins; ibin++) {
+        if (entry_bins[ibin].size() > (unsigned)largest_size) {
+            largest_bin = ibin;
+            largest_size = (unsigned)entry_bins[ibin].size();
+        }
+    }
+
+    /* get the representative delay of the largest bin */
+    util::Cost_Entry representative_entry = entry_bins[largest_bin][0];
+
+    return representative_entry;
+}
+
+template<typename Entry>
+void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes,
+                                const Entry& parent_entry,
+                                std::vector<util::Search_Path>* paths,
+                                std::vector<bool>* node_expanded,
+                                std::priority_queue<Entry,
+                                                    std::vector<Entry>,
+                                                    std::greater<Entry>>* pq) {
+    int parent_ind = size_t(parent_entry.rr_node_ind);
+
+    auto& parent_node = rr_nodes[parent_ind];
+
+    for (int iedge = 0; iedge < parent_node.num_edges(); iedge++) {
+        int child_node_ind = parent_node.edge_sink_node(iedge);
+        int switch_ind = parent_node.edge_switch(iedge);
+
+        /* skip this child if it has already been expanded from */
+        if ((*node_expanded)[child_node_ind]) {
+            continue;
+        }
+
+        Entry child_entry(child_node_ind, switch_ind, &parent_entry);
+        VTR_ASSERT(child_entry.cost() >= 0);
+
+        /* Create (if it doesn't exist) or update (if the new cost is lower)
+         * to specified node */
+        Search_Path path_entry = {child_entry.cost(), parent_ind, iedge};
+        auto& path = (*paths)[child_node_ind];
+        if (path_entry.cost < path.cost) {
+            pq->push(child_entry);
+            path = path_entry;
+        }
+    }
+}
+
+template void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes,
+                                         const PQ_Entry_Delay& parent_entry,
+                                         std::vector<Search_Path>* paths,
+                                         std::vector<bool>* node_expanded,
+                                         std::priority_queue<PQ_Entry_Delay,
+                                                             std::vector<PQ_Entry_Delay>,
+                                                             std::greater<PQ_Entry_Delay>>* pq);
+template void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes,
+                                         const PQ_Entry_Base_Cost& parent_entry,
+                                         std::vector<Search_Path>* paths,
+                                         std::vector<bool>* node_expanded,
+                                         std::priority_queue<PQ_Entry_Base_Cost,
+                                                             std::vector<PQ_Entry_Base_Cost>,
+                                                             std::greater<PQ_Entry_Base_Cost>>* pq);
+
+} // namespace util
diff --git a/vpr/src/route/router_lookahead_map_utils.h b/vpr/src/route/router_lookahead_map_utils.h
new file mode 100644
index 00000000000..28b60e3072b
--- /dev/null
+++ b/vpr/src/route/router_lookahead_map_utils.h
@@ -0,0 +1,206 @@
+#ifndef ROUTER_LOOKAHEAD_MAP_UTILS_H_
+#define ROUTER_LOOKAHEAD_MAP_UTILS_H_
+/*
+ * The router lookahead provides an estimate of the cost from an intermediate node to the target node
+ * during directed (A*-like) routing.
+ *
+ * The VPR 7.0 lookahead (route/route_timing.c ==> get_timing_driven_expected_cost) lower-bounds the remaining delay and
+ * congestion by assuming that a minimum number of wires, of the same type as the current node being expanded, can be used
+ * to complete the route. While this method is efficient, it can run into trouble with architectures that use
+ * multiple interconnected wire types.
+ *
+ * The lookahead in this file pre-computes delay/congestion costs up and to the right of a starting tile. This generates
+ * delay/congestion tables for {CHANX, CHANY} channel types, over all wire types defined in the architecture file.
+ * See Section 3.2.4 in Oleg Petelin's MASc thesis (2016) for more discussion.
+ *
+ */
+
+#include <cmath>
+#include <limits>
+#include <vector>
+#include <queue>
+#include <unordered_map>
+#include "vpr_types.h"
+#include "rr_node.h"
+
+namespace util {
+
+/* when a list of delay/congestion entries at a coordinate in Cost_Entry is boiled down to a single
+ * representative entry, this enum is passed-in to specify how that representative entry should be
+ * calculated */
+enum e_representative_entry_method {
+    FIRST = 0, //the first cost that was recorded
+    SMALLEST,  //the smallest-delay cost recorded
+    AVERAGE,
+    GEOMEAN,
+    MEDIAN
+};
+
+/* f_cost_map is an array of these cost entries that specifies delay/congestion estimates
+ * to travel relative x/y distances */
+class Cost_Entry {
+  public:
+    float delay;
+    float congestion;
+    bool fill;
+
+    Cost_Entry() {
+        delay = std::numeric_limits<float>::infinity();
+        congestion = std::numeric_limits<float>::infinity();
+        fill = false;
+    }
+    Cost_Entry(float set_delay, float set_congestion)
+        : delay(set_delay)
+        , congestion(set_congestion)
+        , fill(false) {}
+    Cost_Entry(float set_delay, float set_congestion, bool set_fill)
+        : delay(set_delay)
+        , congestion(set_congestion)
+        , fill(set_fill) {}
+    bool valid() const {
+        return std::isfinite(delay) && std::isfinite(congestion);
+    }
+};
+
+/* a class that stores delay/congestion information for a given relative coordinate during the Dijkstra expansion.
+ * since it stores multiple cost entries, it is later boiled down to a single representative cost entry to be stored
+ * in the final lookahead cost map */
+class Expansion_Cost_Entry {
+  private:
+    std::vector<Cost_Entry> cost_vector;
+
+    Cost_Entry get_smallest_entry() const;
+    Cost_Entry get_average_entry() const;
+    Cost_Entry get_geomean_entry() const;
+    Cost_Entry get_median_entry() const;
+
+  public:
+    void add_cost_entry(e_representative_entry_method method,
+                        float add_delay,
+                        float add_congestion) {
+        Cost_Entry cost_entry(add_delay, add_congestion);
+        if (method == SMALLEST) {
+            /* taking the smallest-delay entry anyway, so no need to push back multple entries */
+            if (this->cost_vector.empty()) {
+                this->cost_vector.push_back(cost_entry);
+            } else {
+                if (add_delay < this->cost_vector[0].delay) {
+                    this->cost_vector[0] = cost_entry;
+                }
+            }
+        } else {
+            this->cost_vector.push_back(cost_entry);
+        }
+    }
+    void clear_cost_entries() {
+        this->cost_vector.clear();
+    }
+
+    Cost_Entry get_representative_cost_entry(e_representative_entry_method method) const {
+        Cost_Entry entry;
+
+        if (!cost_vector.empty()) {
+            switch (method) {
+                case FIRST:
+                    entry = cost_vector[0];
+                    break;
+                case SMALLEST:
+                    entry = this->get_smallest_entry();
+                    break;
+                case AVERAGE:
+                    entry = this->get_average_entry();
+                    break;
+                case GEOMEAN:
+                    entry = this->get_geomean_entry();
+                    break;
+                case MEDIAN:
+                    entry = this->get_median_entry();
+                    break;
+                default:
+                    break;
+            }
+        }
+        return entry;
+    }
+};
+
+/* a class that represents an entry in the Dijkstra expansion priority queue */
+class PQ_Entry {
+  public:
+    int rr_node_ind; //index in device_ctx.rr_nodes that this entry represents
+    float cost;      //the cost of the path to get to this node
+
+    /* store backward delay, R and congestion info */
+    float delay;
+    float R_upstream;
+    float congestion_upstream;
+
+    PQ_Entry(int set_rr_node_ind, int /*switch_ind*/, float parent_delay, float parent_R_upstream, float parent_congestion_upstream, bool starting_node, float Tsw_adjust);
+
+    bool operator<(const PQ_Entry& obj) const {
+        /* inserted into max priority queue so want queue entries with a lower cost to be greater */
+        return (this->cost > obj.cost);
+    }
+};
+
+// A version of PQ_Entry that only calculates and stores the delay.
+class PQ_Entry_Delay {
+  public:
+    int rr_node_ind;  //index in device_ctx.rr_nodes that this entry represents
+    float delay_cost; //the cost of the path to get to this node
+
+    PQ_Entry_Delay(int set_rr_node_ind, int /*switch_ind*/, const PQ_Entry_Delay* parent);
+
+    float cost() const {
+        return delay_cost;
+    }
+
+    void adjust_Tsw(float amount) {
+        delay_cost += amount;
+    }
+
+    bool operator>(const PQ_Entry_Delay& obj) const {
+        return (this->delay_cost > obj.delay_cost);
+    }
+};
+
+// A version of PQ_Entry that only calculates and stores the base cost.
+class PQ_Entry_Base_Cost {
+  public:
+    int rr_node_ind; //index in device_ctx.rr_nodes that this entry represents
+    float base_cost;
+
+    PQ_Entry_Base_Cost(int set_rr_node_ind, int /*switch_ind*/, const PQ_Entry_Base_Cost* parent);
+
+    float cost() const {
+        return base_cost;
+    }
+
+    void adjust_Tsw(float /* amount */) {
+        // do nothing
+    }
+
+    bool operator>(const PQ_Entry_Base_Cost& obj) const {
+        return (this->base_cost > obj.base_cost);
+    }
+};
+
+struct Search_Path {
+    float cost;
+    int parent;
+    int edge;
+};
+
+/* iterates over the children of the specified node and selectively pushes them onto the priority queue */
+template<typename Entry>
+void expand_dijkstra_neighbours(const t_rr_graph_storage& rr_nodes,
+                                const Entry& parent_entry,
+                                std::vector<Search_Path>* paths,
+                                std::vector<bool>* node_expanded,
+                                std::priority_queue<Entry,
+                                                    std::vector<Entry>,
+                                                    std::greater<Entry>>* pq);
+
+} // namespace util
+
+#endif
diff --git a/vpr/src/route/router_lookahead_sampling.cpp b/vpr/src/route/router_lookahead_sampling.cpp
new file mode 100644
index 00000000000..568d5134602
--- /dev/null
+++ b/vpr/src/route/router_lookahead_sampling.cpp
@@ -0,0 +1,229 @@
+#include "router_lookahead_sampling.h"
+
+#include <vector>
+
+#include "globals.h"
+#include "vtr_math.h"
+#include "vtr_geometry.h"
+#include "vtr_time.h"
+
+// Sample based an NxN grid of starting segments, where N = SAMPLE_GRID_SIZE
+static constexpr int SAMPLE_GRID_SIZE = 2;
+
+// quantiles (like percentiles but 0-1) of segment count to use as a selection criteria
+// choose locations with higher, but not extreme, counts of each segment type
+static constexpr double kSamplingCountLowerQuantile = 0.5;
+static constexpr double kSamplingCountUpperQuantile = 0.7;
+
+// also known as the L1 norm
+static int manhattan_distance(const vtr::Point<int>& a, const vtr::Point<int>& b) {
+    return abs(b.x() - a.x()) + abs(b.y() - a.y());
+}
+
+// the smallest bounding box containing a node
+static vtr::Rect<int> bounding_box_for_node(int node_ind) {
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_graph = device_ctx.rr_nodes;
+    int x = rr_graph.node_xlow(RRNodeId(node_ind));
+    int y = rr_graph.node_ylow(RRNodeId(node_ind));
+
+    return vtr::Rect<int>(vtr::Point<int>(x, y));
+}
+
+static vtr::Rect<int> sample_window(const vtr::Rect<int>& bounding_box, int sx, int sy, int n) {
+    return vtr::Rect<int>(sample(bounding_box, sx, sy, n),
+                          sample(bounding_box, sx + 1, sy + 1, n));
+}
+
+static std::vector<SamplePoint> choose_points(const vtr::Matrix<int>& counts,
+                                              const vtr::Rect<int>& window,
+                                              int min_count,
+                                              int max_count) {
+    VTR_ASSERT(min_count <= max_count);
+    std::vector<SamplePoint> points;
+    for (int y = window.ymin(); y < window.ymax(); y++) {
+        for (int x = window.xmin(); x < window.xmax(); x++) {
+            if (counts[x][y] >= min_count && counts[x][y] <= max_count) {
+                points.push_back(SamplePoint{/* .location = */ vtr::Point<int>(x, y),
+                                             /* .nodes = */ {}});
+            }
+        }
+    }
+
+    vtr::Point<int> center = sample(window, 1, 1, 2);
+
+    // sort by distance from center
+    std::sort(points.begin(), points.end(),
+              [&](const SamplePoint& a, const SamplePoint& b) {
+                  return manhattan_distance(a.location, center) < manhattan_distance(b.location, center);
+              });
+
+    return points;
+}
+
+// histogram is a map from segment count to number of locations having that count
+static int quantile(const std::map<int, int>& histogram, float ratio) {
+    if (histogram.empty()) {
+        return 0;
+    }
+    int sum = 0;
+    for (const auto& entry : histogram) {
+        sum += entry.second;
+    }
+    int limit = std::ceil(sum * ratio);
+    for (const auto& entry : histogram) {
+        limit -= entry.second;
+        if (limit <= 0) {
+            return entry.first;
+        }
+    }
+    return 0;
+}
+
+// select a good number of segments to find
+static std::map<int, int> count_histogram(const vtr::Rect<int>& box, const vtr::Matrix<int>& counts) {
+    std::map<int, int> histogram;
+    for (int y = box.ymin(); y < box.ymax(); y++) {
+        for (int x = box.xmin(); x < box.xmax(); x++) {
+            int count = counts[x][y];
+            if (count > 0) {
+                ++histogram[count];
+            }
+        }
+    }
+    return histogram;
+}
+
+// Used to calculate each region's `order.'
+// A space-filling curve will order the regions so that
+// nearby points stay close in order. A Hilbert curve might
+// be better, but a Morton (Z)-order curve is easy to compute,
+// because it's just interleaving binary bits, so this
+// function interleaves with 0's so that the X and Y
+// dimensions can then be OR'ed together.
+static uint64_t interleave(uint32_t x) {
+    uint64_t i = x;
+    i = (i ^ (i << 16)) & 0x0000ffff0000ffff;
+    i = (i ^ (i << 8)) & 0x00ff00ff00ff00ff;
+    i = (i ^ (i << 4)) & 0x0f0f0f0f0f0f0f0f;
+    i = (i ^ (i << 2)) & 0x3333333333333333;
+    i = (i ^ (i << 1)) & 0x5555555555555555;
+    return i;
+}
+
+// for each segment type, find the nearest nodes to an equally spaced grid of points
+// within the bounding box for that segment type
+std::vector<SampleRegion> find_sample_regions(int num_segments) {
+    vtr::ScopedStartFinishTimer timer("finding sample regions");
+    std::vector<SampleRegion> sample_regions;
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& rr_nodes = device_ctx.rr_nodes;
+    std::vector<vtr::Matrix<int>> segment_counts(num_segments);
+
+    // compute bounding boxes for each segment type
+    std::vector<vtr::Rect<int>> bounding_box_for_segment(num_segments, vtr::Rect<int>());
+    for (size_t i = 0; i < rr_nodes.size(); i++) {
+        auto& node = rr_nodes[i];
+        if (node.type() != CHANX && node.type() != CHANY) continue;
+        if (node.capacity() == 0 || node.num_edges() == 0) continue;
+        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
+
+        VTR_ASSERT(seg_index != OPEN);
+        VTR_ASSERT(seg_index < num_segments);
+
+        bounding_box_for_segment[seg_index].expand_bounding_box(bounding_box_for_node(i));
+    }
+
+    // initialize counts
+    for (int seg = 0; seg < num_segments; seg++) {
+        const auto& box = bounding_box_for_segment[seg];
+        segment_counts[seg] = vtr::Matrix<int>({size_t(box.width()), size_t(box.height())}, 0);
+    }
+
+    // count sample points
+    for (size_t i = 0; i < rr_nodes.size(); i++) {
+        auto& node = rr_nodes[i];
+        if (node.type() != CHANX && node.type() != CHANY) continue;
+        if (node.capacity() == 0 || node.num_edges() == 0) continue;
+        int x = rr_nodes.node_xlow(RRNodeId(i));
+        int y = rr_nodes.node_ylow(RRNodeId(i));
+
+        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
+        segment_counts[seg_index][x][y] += 1;
+
+        VTR_ASSERT(seg_index != OPEN);
+        VTR_ASSERT(seg_index < num_segments);
+    }
+
+    // select sample points
+    for (int i = 0; i < num_segments; i++) {
+        const auto& counts = segment_counts[i];
+        const auto& bounding_box = bounding_box_for_segment[i];
+        if (bounding_box.empty()) continue;
+        for (int y = 0; y < SAMPLE_GRID_SIZE; y++) {
+            for (int x = 0; x < SAMPLE_GRID_SIZE; x++) {
+                vtr::Rect<int> window = sample_window(bounding_box, x, y, SAMPLE_GRID_SIZE);
+                if (window.empty()) continue;
+
+                auto histogram = count_histogram(window, segment_counts[i]);
+                SampleRegion region = {
+                    /* .segment_type = */ i,
+                    /* .grid_location = */ vtr::Point<int>(x, y),
+                    /* .points = */ choose_points(counts, window, quantile(histogram, kSamplingCountLowerQuantile), quantile(histogram, kSamplingCountUpperQuantile)),
+                    /* .order = */ 0};
+                if (!region.points.empty()) {
+                    /* In order to improve caching, the list of sample points are
+                     * sorted to keep points that are nearby on the Euclidean plane also
+                     * nearby in the vector of sample points.
+                     *
+                     * This means subsequent expansions on the same thread are likely
+                     * to cover a similar set of nodes, so they are more likely to be
+                     * cached. This improves performance by about 7%, which isn't a lot,
+                     * but not a bad improvement for a few lines of code. */
+                    vtr::Point<int> location = region.points[0].location;
+
+                    // interleave bits of X and Y for a Z-curve ordering.
+                    region.order = interleave(location.x()) | (interleave(location.y()) << 1);
+
+                    sample_regions.push_back(region);
+                }
+            }
+        }
+    }
+
+    // sort regions
+    std::sort(sample_regions.begin(), sample_regions.end(),
+              [](const SampleRegion& a, const SampleRegion& b) {
+                  return a.order < b.order;
+              });
+
+    // build an index of sample points on segment type and location
+    std::map<std::tuple<int, int, int>, SamplePoint*> sample_point_index;
+    for (auto& region : sample_regions) {
+        for (auto& point : region.points) {
+            sample_point_index[std::make_tuple(region.segment_type, point.location.x(), point.location.y())] = &point;
+        }
+    }
+
+    // collect the node indices for each segment type at the selected sample points
+    for (size_t i = 0; i < rr_nodes.size(); i++) {
+        auto& node = rr_nodes[i];
+        if (node.type() != CHANX && node.type() != CHANY) continue;
+        if (node.capacity() == 0 || node.num_edges() == 0) continue;
+
+        int x = rr_nodes.node_xlow(RRNodeId(i));
+        int y = rr_nodes.node_ylow(RRNodeId(i));
+
+        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
+
+        VTR_ASSERT(seg_index != OPEN);
+        VTR_ASSERT(seg_index < num_segments);
+
+        auto point = sample_point_index.find(std::make_tuple(seg_index, x, y));
+        if (point != sample_point_index.end()) {
+            point->second->nodes.push_back(i);
+        }
+    }
+
+    return sample_regions;
+}
diff --git a/vpr/src/route/router_lookahead_sampling.h b/vpr/src/route/router_lookahead_sampling.h
new file mode 100644
index 00000000000..6668792e9c6
--- /dev/null
+++ b/vpr/src/route/router_lookahead_sampling.h
@@ -0,0 +1,35 @@
+#ifndef ROUTER_LOOKAHEAD_SAMPLING_H
+#define ROUTER_LOOKAHEAD_SAMPLING_H
+
+#include <vector>
+#include "vtr_geometry.h"
+#include "globals.h"
+
+// a sample point for a segment type, contains all segments at the VPR location
+struct SamplePoint {
+    // canonical location
+    vtr::Point<int> location;
+
+    // nodes to expand
+    std::vector<ssize_t> nodes;
+};
+
+struct SampleRegion {
+    // all nodes in `points' have this segment type
+    int segment_type;
+
+    // location on the sample grid
+    vtr::Point<int> grid_location;
+
+    // locations to try
+    // The computation will keep expanding each of the points
+    // until a number of paths (segment -> connection box) are found.
+    std::vector<SamplePoint> points;
+
+    // used to sort the regions to improve caching
+    uint64_t order;
+};
+
+std::vector<SampleRegion> find_sample_regions(int num_segments);
+
+#endif
diff --git a/vpr/src/route/rr_graph_storage.h b/vpr/src/route/rr_graph_storage.h
index cc0207ee123..66837ef5f78 100644
--- a/vpr/src/route/rr_graph_storage.h
+++ b/vpr/src/route/rr_graph_storage.h
@@ -48,7 +48,7 @@
  *       This field is valid only for IPINs and OPINs and should be ignored  *
  *       otherwise.                                                          */
 struct alignas(16) t_rr_node_data {
-    int8_t cost_index_ = -1;
+    int16_t cost_index_ = -1;
     int16_t rc_index_ = -1;
 
     int16_t xlow_ = -1;
diff --git a/vpr/test/test_map_lookahead_serdes.cpp b/vpr/test/test_map_lookahead_serdes.cpp
deleted file mode 100644
index 270bc689a20..00000000000
--- a/vpr/test/test_map_lookahead_serdes.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "catch.hpp"
-
-#include "router_lookahead_map.h"
-
-extern t_wire_cost_map f_wire_cost_map;
-
-namespace {
-
-#ifdef VTR_ENABLE_CAPNPROTO
-static constexpr const char kMapLookaheadBin[] = "test_map_lookahead.bin";
-
-TEST_CASE("round_trip_map_lookahead", "[vpr]") {
-    constexpr std::array<size_t, 4> kDim({10, 12, 15, 16});
-
-    f_wire_cost_map.resize(kDim);
-    for (size_t x = 0; x < kDim[0]; ++x) {
-        for (size_t y = 0; y < kDim[1]; ++y) {
-            for (size_t z = 0; z < kDim[2]; ++z) {
-                for (size_t w = 0; w < kDim[3]; ++w) {
-                    f_wire_cost_map[x][y][z][w].delay = (x + 1) * (y + 1) * (z + 1) * (w + 1);
-                    f_wire_cost_map[x][y][z][w].congestion = 2 * (x + 1) * (y + 1) * (z + 1) * (w + 1);
-                }
-            }
-        }
-    }
-
-    write_router_lookahead(kMapLookaheadBin);
-
-    for (size_t x = 0; x < kDim[0]; ++x) {
-        for (size_t y = 0; y < kDim[1]; ++y) {
-            for (size_t z = 0; z < kDim[2]; ++z) {
-                for (size_t w = 0; w < kDim[3]; ++w) {
-                    f_wire_cost_map[x][y][z][w].delay = 0.f;
-                    f_wire_cost_map[x][y][z][w].congestion = 0.f;
-                }
-            }
-        }
-    }
-
-    f_wire_cost_map.resize({0, 0, 0, 0});
-
-    read_router_lookahead(kMapLookaheadBin);
-
-    for (size_t i = 0; i < kDim.size(); ++i) {
-        REQUIRE(f_wire_cost_map.dim_size(i) == kDim[i]);
-    }
-
-    for (size_t x = 0; x < kDim[0]; ++x) {
-        for (size_t y = 0; y < kDim[1]; ++y) {
-            for (size_t z = 0; z < kDim[2]; ++z) {
-                for (size_t w = 0; w < kDim[3]; ++w) {
-                    REQUIRE(f_wire_cost_map[x][y][z][w].delay == (x + 1) * (y + 1) * (z + 1) * (w + 1));
-                    REQUIRE(f_wire_cost_map[x][y][z][w].congestion == 2 * (x + 1) * (y + 1) * (z + 1) * (w + 1));
-                }
-            }
-        }
-    }
-}
-
-#endif
-
-} // namespace