diff --git a/doc/src/vpr/command_line_usage.rst b/doc/src/vpr/command_line_usage.rst index 9e1ab94ae72..fc575e08a13 100644 --- a/doc/src/vpr/command_line_usage.rst +++ b/doc/src/vpr/command_line_usage.rst @@ -381,6 +381,9 @@ Use the options below to override this default naming behaviour. .. option:: --write_placement_delay_lookup Writes the placement delay lookup to the specified file. +.. option:: --write_initial_place_file + + Writes out the placement chosen by the initial placement algorithm to the specified file. .. option:: --outfile_prefix @@ -769,6 +772,19 @@ If any of init_t, exit_t or alpha_t is specified, the user schedule, with a fixe **Default:** ``criticality_timing`` +.. option:: --place_bounding_box_mode {auto_bb | cube_bb | per_layer_bb} + + Specifies the type of wirelength estimator used during placement. For single-layer architectures, cube_bb (a 3D bounding box) is always used (and is the same as per_layer_bb). + For 3D architectures, cube_bb is appropriate if signals can cross between layers at switch blocks; if signals can only cross between layers at output pins, per_layer_bb (one bounding box per layer) is more accurate and appropriate. + + ``auto_bb``: The bounding box type is determined automatically based on the cross-layer connections. + + ``cube_bb``: The ``cube_bb`` bounding box is used to estimate the wirelength. + + ``per_layer_bb``: The ``per_layer_bb`` bounding box is used to estimate the wirelength. + + **Default:** ``auto_bb`` + .. option:: --place_chan_width Tells VPR how many tracks a channel of relative width 1 is expected to need to complete routing of this circuit. diff --git a/libs/librrgraph/src/base/rr_graph_utils.cpp b/libs/librrgraph/src/base/rr_graph_utils.cpp index 6552c8c214b..11b6a569c9e 100644 --- a/libs/librrgraph/src/base/rr_graph_utils.cpp +++ b/libs/librrgraph/src/base/rr_graph_utils.cpp @@ -5,13 +5,9 @@ ***************************************************************************/ #include #include -#include #include "rr_graph_utils.h" -#include "vtr_memory.h" -#include "vtr_time.h" - #include "vpr_error.h" #include "rr_graph_obj.h" @@ -119,4 +115,27 @@ vtr::vector> get_fan_in_list(const RRGraphView& }); return node_fan_in_list; +} + +bool inter_layer_connections_limited_to_opin(const RRGraphView& rr_graph) { + bool limited_to_opin = true; + for (const auto& from_node : rr_graph.nodes()) { + for (t_edge_size edge : rr_graph.edges(from_node)) { + RRNodeId to_node = rr_graph.edge_sink_node(from_node, edge); + int from_layer = rr_graph.node_layer(from_node); + int to_layer = rr_graph.node_layer(to_node); + + if (from_layer != to_layer) { + if (rr_graph.node_type(from_node) != e_rr_type::OPIN) { + limited_to_opin = false; + break; + } + } + } + if (!limited_to_opin) { + break; + } + } + + return limited_to_opin; } \ No newline at end of file diff --git a/libs/librrgraph/src/base/rr_graph_utils.h b/libs/librrgraph/src/base/rr_graph_utils.h index 0725bcd0cf9..6ef6148c1fa 100644 --- a/libs/librrgraph/src/base/rr_graph_utils.h +++ b/libs/librrgraph/src/base/rr_graph_utils.h @@ -48,4 +48,12 @@ vtr::vector> get_fan_in_list(const RRGraphView& int seg_index_of_cblock(const RRGraphView& rr_graph, t_rr_type from_rr_type, int to_node); int seg_index_of_sblock(const RRGraphView& rr_graph, int from_node, int to_node); +/** + * @brief This function checks whether all inter-die connections are from OPINs. Returns "true" + * if that is the case. Can be used for multiple purposes. For example, to determine which type of bounding + * box should be used to estimate the wirelength of a net. + * @param rr_graph The routing resource graph to check. + * @return True if every inter-layer connection starts from an OPIN. + */ +bool inter_layer_connections_limited_to_opin(const RRGraphView& rr_graph); #endif \ No newline at end of file diff --git a/libs/librrgraph/src/base/rr_graph_view.h b/libs/librrgraph/src/base/rr_graph_view.h index 1ff9a33115e..9940ef85ee3 100644 --- a/libs/librrgraph/src/base/rr_graph_view.h +++ b/libs/librrgraph/src/base/rr_graph_view.h @@ -234,7 +234,7 @@ class RRGraphView { } /** @brief Get string of information about routing resource node. The string will contain the following information. - * type, side, x_low, x_high, y_low, y_high, length, direction, segment_name + * type, side, x_low, x_high, y_low, y_high, length, direction, segment_name, layer num * This function is inlined for runtime optimization. */ inline const std::string node_coordinate_to_string(RRNodeId node) const { @@ -242,6 +242,7 @@ class RRGraphView { std::string start_y; //start y-coordinate std::string end_x; //end x-coordinate std::string end_y; //end y-coordinate + std::string layer_num_str; //layer number std::string arrow; //direction arrow std::string coordinate_string = node_type_string(node); //write the component's type as a routing resource node coordinate_string += ":" + std::to_string(size_t(node)) + " "; //add the index of the routing resource node @@ -256,12 +257,14 @@ class RRGraphView { coordinate_string += ")"; //add the side of the routing resource node // For OPINs and IPINs the starting and ending coordinate are identical, so we can just arbitrarily assign the start to larger values // and the end to the lower coordinate - start_x = " (" + std::to_string(node_xhigh(node)) + ","; //start and end coordinates are the same for OPINs and IPINs - start_y = std::to_string(node_yhigh(node)) + ")"; + start_x = " (" + std::to_string(node_xhigh(node)) + ","; //start and end coordinates are the same for OPINs and IPINs + start_y = std::to_string(node_yhigh(node)) + ","; + layer_num_str = std::to_string(node_layer(node)) + ")"; } else if (node_type(node) == SOURCE || node_type(node) == SINK) { // For SOURCE and SINK the starting and ending coordinate are identical, so just use start - start_x = "(" + std::to_string(node_xhigh(node)) + ","; - start_y = std::to_string(node_yhigh(node)) + ")"; + start_x = " (" + std::to_string(node_xhigh(node)) + ","; + start_y = std::to_string(node_yhigh(node)) + ","; + layer_num_str = std::to_string(node_layer(node)) + ")"; } else if (node_type(node) == CHANX || node_type(node) == CHANY) { //for channels, we would like to describe the component with segment specific information RRIndexedDataId cost_index = node_cost_index(node); int seg_index = rr_indexed_data_[cost_index].seg_index; @@ -272,26 +275,29 @@ class RRGraphView { arrow = "->"; //we will point the coordinates from start to finish, left to right if (node_direction(node) == Direction::DEC) { //signal travels along decreasing direction + start_x = " (" + std::to_string(node_xhigh(node)) + ","; //start coordinates have large value - start_y = std::to_string(node_yhigh(node)) + ")"; - end_x = "(" + std::to_string(node_xlow(node)) + ","; //end coordinates have smaller value - end_y = std::to_string(node_ylow(node)) + ")"; + start_y = std::to_string(node_yhigh(node)) + ","; + end_x = " (" + std::to_string(node_xlow(node)) + ","; //end coordinates have smaller value + end_y = std::to_string(node_ylow(node)) + ","; + layer_num_str = std::to_string(node_layer(node)) + ")"; } else { // 
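How this helper feeds the new auto_bb mode is not shown in this diff; below is a minimal sketch of the likely glue logic, under the assumption that the decision follows the documentation above (use_cube_bb is a hypothetical name, not part of the patch):

enum e_place_bounding_box_mode { AUTO_BB, CUBE_BB, PER_LAYER_BB };

// 'limited_to_opin' would be the result of calling
// inter_layer_connections_limited_to_opin(rr_graph).
bool use_cube_bb(e_place_bounding_box_mode mode, int num_layers, bool limited_to_opin) {
    if (mode == CUBE_BB) return true;
    if (mode == PER_LAYER_BB) return false;
    // AUTO_BB: a single-die device always uses the cube (3D) bounding box; a
    // multi-die device uses it only when layers can be crossed at switch
    // blocks, i.e. when inter-layer connections are NOT limited to OPINs.
    return num_layers == 1 || !limited_to_opin;
}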
signal travels in increasing direction, stays at same point, or can travel both directions start_x = " (" + std::to_string(node_xlow(node)) + ","; //start coordinates have smaller value - start_y = std::to_string(node_ylow(node)) + ")"; - end_x = "(" + std::to_string(node_xhigh(node)) + ","; //end coordinates have larger value - end_y = std::to_string(node_yhigh(node)) + ")"; + start_y = std::to_string(node_ylow(node)) + ","; + end_x = " (" + std::to_string(node_xhigh(node)) + ","; //end coordinates have larger value + end_y = std::to_string(node_yhigh(node)) + ","; + layer_num_str = std::to_string(node_layer(node)) + ")"; //layer number if (node_direction(node) == Direction::BIDIR) { arrow = "<->"; //indicate that signal can travel both direction } } } - coordinate_string += start_x + start_y; //Write the starting coordinates + coordinate_string += start_x + start_y + layer_num_str; //Write the starting coordinates coordinate_string += arrow; //Indicate the direction - coordinate_string += end_x + end_y; //Write the end coordinates + coordinate_string += end_x + end_y + layer_num_str; //Write the end coordinates return coordinate_string; } diff --git a/utils/route_diag/src/main.cpp b/utils/route_diag/src/main.cpp index 571c17c30e6..0cf1c901d23 100644 --- a/utils/route_diag/src/main.cpp +++ b/utils/route_diag/src/main.cpp @@ -84,6 +84,8 @@ static void do_one_route(const Netlist<>& net_list, bounding_box.xmax = device_ctx.grid.width() + 1; bounding_box.ymin = 0; bounding_box.ymax = device_ctx.grid.height() + 1; + bounding_box.layer_min = 0; + bounding_box.layer_max = device_ctx.grid.get_num_layers() - 1; t_conn_cost_params cost_params; cost_params.criticality = router_opts.max_criticality; @@ -203,9 +205,12 @@ static void profile_source(const Netlist<>& net_list, vtr::ScopedStartFinishTimer delay_timer(vtr::string_fmt( "Routing Src: %d Sink: %d", source_rr_node, sink_rr_node)); - successfully_routed = profiler.calculate_delay(RRNodeId(source_rr_node), RRNodeId(sink_rr_node), - router_opts, - &delays[sink_x][sink_y]); + + successfully_routed = profiler.calculate_delay(RRNodeId(source_rr_node), + RRNodeId(sink_rr_node), + router_opts, + &delays[sink_x][sink_y], + layer_num); } if (successfully_routed) { diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index b5492a1f8ec..e596bd51c43 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -622,6 +622,8 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts) PlacerOpts->constraints_file = Options.constraints_file; + PlacerOpts->write_initial_place_file = Options.write_initial_place_file; + PlacerOpts->pad_loc_type = Options.pad_loc_type; PlacerOpts->place_chan_width = Options.PlaceChanWidth; @@ -661,6 +663,7 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts) PlacerOpts->place_static_move_prob = Options.place_static_move_prob; PlacerOpts->place_static_notiming_move_prob = Options.place_static_notiming_move_prob; PlacerOpts->place_high_fanout_net = Options.place_high_fanout_net; + PlacerOpts->place_bounding_box_mode = Options.place_bounding_box_mode; PlacerOpts->RL_agent_placement = Options.RL_agent_placement; PlacerOpts->place_agent_multistate = Options.place_agent_multistate; PlacerOpts->place_checkpointing = Options.place_checkpointing; diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index 72800d8b1c0..32929c4fc9a 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -430,6 +430,41 @@ struct 
ParsePlaceAlgorithm { } }; +struct ParsePlaceBoundingBox { + ConvertedValue from_str(std::string str) { + ConvertedValue conv_value; + if (str == "auto_bb") { + conv_value.set_value(AUTO_BB); + } else if (str == "cube_bb") { + conv_value.set_value(CUBE_BB); + } else if (str == "per_layer_bb") { + conv_value.set_value(PER_LAYER_BB); + } else { + std::stringstream msg; + msg << "Invalid conversion from '" << str << "' to e_place_bounding_box_mode (expected one of: " << argparse::join(default_choices(), ", ") << ")"; + conv_value.set_error(msg.str()); + } + return conv_value; + } + + ConvertedValue to_str(e_place_bounding_box_mode val) { + ConvertedValue conv_value; + if (val == AUTO_BB) { + conv_value.set_value("auto_bb"); + } else if (val == CUBE_BB) { + conv_value.set_value("cube_bb"); + } else { + VTR_ASSERT(val == PER_LAYER_BB); + conv_value.set_value("per_layer_bb"); + } + return conv_value; + } + + std::vector default_choices() { + return {"auto_bb", "cube_bb", "per_layer_bb"}; + } +}; + struct ParsePlaceAgentAlgorithm { ConvertedValue from_str(std::string str) { ConvertedValue conv_value; @@ -1569,6 +1604,11 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .metavar("RR_GRAPH_FILE") .show_in(argparse::ShowIn::HELP_ONLY); + file_grp.add_argument(args.write_initial_place_file, "--write_initial_place_file") + .help("Writes out the placement chosen by the initial placement algorithm to the specified file.") + .metavar("INITIAL_PLACE_FILE") + .show_in(argparse::ShowIn::HELP_ONLY); + file_grp.add_argument(args.read_vpr_constraints_file, "--read_vpr_constraints") .help("Reads the floorplanning constraints that packing and placement must respect from the specified XML file.") .show_in(argparse::ShowIn::HELP_ONLY); @@ -2007,6 +2047,20 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .default_value("10") .show_in(argparse::ShowIn::HELP_ONLY); + place_grp.add_argument(args.place_bounding_box_mode, "--place_bounding_box_mode") + .help( + "Specifies the type of bounding box to be used in 3D architectures.\n" + "\n" + "MODE options:\n" + " auto_bb : Automatically determine the appropriate bounding box based on the connections between layers.\n" + " cube_bb : Use 3D bounding boxes.\n" + " per_layer_bb : Use per-layer bounding boxes.\n" + "\n" + "Choose one of the available modes to define the behavior of bounding boxes in your 3D architecture. The default mode is 'auto_bb'.") + .default_value("auto_bb") + .choices({"auto_bb", "cube_bb", "per_layer_bb"}) + .show_in(argparse::ShowIn::HELP_ONLY); + place_grp.add_argument(args.RL_agent_placement, "--RL_agent_placement") .help( "Uses a Reinforcement Learning (RL) agent in choosing the appropiate move type in placement." 
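As a quick check of the converter added above, its string<->enum mapping round-trips losslessly; a self-contained sketch (plain std::string and an exception stand in for VTR's ConvertedValue wrapper):

#include <cassert>
#include <stdexcept>
#include <string>

enum e_place_bounding_box_mode { AUTO_BB, CUBE_BB, PER_LAYER_BB };

// Mirrors ParsePlaceBoundingBox::from_str().
e_place_bounding_box_mode bb_mode_from_str(const std::string& s) {
    if (s == "auto_bb") return AUTO_BB;
    if (s == "cube_bb") return CUBE_BB;
    if (s == "per_layer_bb") return PER_LAYER_BB;
    throw std::runtime_error("Invalid conversion from '" + s + "' to e_place_bounding_box_mode");
}

// Mirrors ParsePlaceBoundingBox::to_str().
std::string bb_mode_to_str(e_place_bounding_box_mode m) {
    if (m == AUTO_BB) return "auto_bb";
    if (m == CUBE_BB) return "cube_bb";
    return "per_layer_bb";
}

int main() {
    for (const char* s : {"auto_bb", "cube_bb", "per_layer_bb"}) {
        assert(bb_mode_to_str(bb_mode_from_str(s)) == s); // lossless round-trip
    }
}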
diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h index 06da9282f94..d1edc5ef2b2 100644 --- a/vpr/src/base/read_options.h +++ b/vpr/src/base/read_options.h @@ -27,6 +27,7 @@ struct t_options { argparse::ArgValue constraints_file; argparse::ArgValue write_rr_graph_file; argparse::ArgValue read_rr_graph_file; + argparse::ArgValue write_initial_place_file; argparse::ArgValue read_vpr_constraints_file; argparse::ArgValue write_vpr_constraints_file; @@ -127,6 +128,7 @@ struct t_options { argparse::ArgValue> place_static_move_prob; argparse::ArgValue> place_static_notiming_move_prob; argparse::ArgValue place_high_fanout_net; + argparse::ArgValue place_bounding_box_mode; argparse::ArgValue RL_agent_placement; argparse::ArgValue place_agent_multistate; diff --git a/vpr/src/base/vpr_api.cpp b/vpr/src/base/vpr_api.cpp index 9f379f84e42..65519d5775f 100644 --- a/vpr/src/base/vpr_api.cpp +++ b/vpr/src/base/vpr_api.cpp @@ -452,6 +452,8 @@ void vpr_create_device_grid(const t_vpr_setup& vpr_setup, const t_arch& Arch) { float target_device_utilization = vpr_setup.PackerOpts.target_device_utilization; device_ctx.grid = create_device_grid(vpr_setup.device_layout, Arch.grid_layouts, num_type_instances, target_device_utilization); + VTR_ASSERT_MSG(device_ctx.grid.get_num_layers() <= MAX_NUM_LAYERS, "Number of layers should be less than or equal to MAX_NUM_LAYERS. If you need more layers, please increase the value of MAX_NUM_LAYERS in vpr_types.h"); + /* *Report on the device */ diff --git a/vpr/src/base/vpr_context.h b/vpr/src/base/vpr_context.h index 82e7be31249..a07a73e2827 100644 --- a/vpr/src/base/vpr_context.h +++ b/vpr/src/base/vpr_context.h @@ -402,6 +402,12 @@ struct PlacementContext : public Context { * placer_debug_net or placer_debug_block parameters in the command line. */ bool f_placer_debug = false; + + /** + * Set this variable to true if a cube (3D) bounding box is used in placement. If it is false, + * a per-layer bounding box is used. For 2D architectures, the cube bounding box is always used. + */ + bool cube_bb = false; }; /** diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 93ef759bb88..2784c5e63da 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -65,6 +65,14 @@ //#define VERBOSE //Prints additional intermediate data +/* + * Maximum number of device layers supported. + * For certain data structures, such as `num_sink_pin_layer` in the placer context, dynamically allocating + * memory based on the number of layers can lead to a performance hit due to additional pointer chasing and + * cache locality concerns. A compile-time constant lets such structures use fixed-size arrays instead. + */ +constexpr int MAX_NUM_LAYERS = 2; + /** * @brief For update_screen. Denotes importance of update. 
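To make the MAX_NUM_LAYERS rationale concrete, a small standalone comparison (hypothetical structs, not VPR's actual data structures): a compile-time layer bound lets per-layer counters live inline in the owning object rather than behind a heap pointer.

#include <array>
#include <vector>

constexpr int MAX_NUM_LAYERS = 2;

// One heap allocation per net; every access pays an extra pointer chase.
struct NetSinkLayerCountsDynamic {
    std::vector<int> sink_count_per_layer; // sized at runtime to num_layers
};

// Counters stored inline; consecutive nets stay contiguous in memory.
struct NetSinkLayerCountsFixed {
    std::array<int, MAX_NUM_LAYERS> sink_count_per_layer{};
};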
* @@ -570,48 +578,79 @@ struct t_net_power { }; /** - * @brief Stores the bounding box of a net in terms of the minimum and - * maximum coordinates of the blocks forming the net, clipped to - * the region: (1..device_ctx.grid.width()-2, 1..device_ctx.grid.height()-1) + * @brief Stores a 3D bounding box in terms of the minimum and + * maximum coordinates: x, y, layer */ struct t_bb { t_bb() = default; - t_bb(int xmin_, int xmax_, int ymin_, int ymax_) + t_bb(int xmin_, int xmax_, int ymin_, int ymax_, int layer_min_, int layer_max_) + : xmin(xmin_) + , xmax(xmax_) + , ymin(ymin_) + , ymax(ymax_) + , layer_min(layer_min_) + , layer_max(layer_max_) { + VTR_ASSERT(xmax_ >= xmin_); + VTR_ASSERT(ymax_ >= ymin_); + VTR_ASSERT(layer_max_ >= layer_min_); + } + int xmin = OPEN; + int xmax = OPEN; + int ymin = OPEN; + int ymax = OPEN; + int layer_min = OPEN; + int layer_max = OPEN; +}; + +/** + * @brief Stores a 2D bounding box in terms of the minimum and maximum x and y + * @note layer_num indicates the layer that the bounding box is on. + */ +struct t_2D_bb { + t_2D_bb() = default; + t_2D_bb(int xmin_, int xmax_, int ymin_, int ymax_, int layer_num_) : xmin(xmin_) , xmax(xmax_) , ymin(ymin_) - , ymax(ymax_) { + , ymax(ymax_) + , layer_num(layer_num_) { VTR_ASSERT(xmax_ >= xmin_); VTR_ASSERT(ymax_ >= ymin_); + VTR_ASSERT(layer_num_ >= 0); } int xmin = OPEN; int xmax = OPEN; int ymin = OPEN; int ymax = OPEN; + int layer_num = OPEN; }; /** * @brief An offset between placement locations (t_pl_loc) - * + * @note When two offsets are compared with operator<, their layer offsets must be equal. * x: x-offset * y: y-offset - * z: z-offset + * sub_tile: sub_tile-offset + * layer: layer-offset */ struct t_pl_offset { t_pl_offset() = default; - t_pl_offset(int xoffset, int yoffset, int sub_tile_offset) + t_pl_offset(int xoffset, int yoffset, int sub_tile_offset, int layer_offset) : x(xoffset) , y(yoffset) - , sub_tile(sub_tile_offset) {} + , sub_tile(sub_tile_offset) + , layer(layer_offset) {} int x = 0; int y = 0; int sub_tile = 0; + int layer = 0; t_pl_offset& operator+=(const t_pl_offset& rhs) { x += rhs.x; y += rhs.y; sub_tile += rhs.sub_tile; + layer += rhs.layer; return *this; } @@ -619,6 +658,7 @@ struct t_pl_offset { x -= rhs.x; y -= rhs.y; sub_tile -= rhs.sub_tile; + layer -= rhs.layer; return *this; } @@ -633,18 +673,19 @@ struct t_pl_offset { } friend t_pl_offset operator-(const t_pl_offset& other) { - return t_pl_offset(-other.x, -other.y, -other.sub_tile); + return t_pl_offset(-other.x, -other.y, -other.sub_tile, -other.layer); } friend t_pl_offset operator+(const t_pl_offset& other) { - return t_pl_offset(+other.x, +other.y, +other.sub_tile); + return t_pl_offset(+other.x, +other.y, +other.sub_tile, +other.layer); } friend bool operator<(const t_pl_offset& lhs, const t_pl_offset& rhs) { + VTR_ASSERT(lhs.layer == rhs.layer); return std::tie(lhs.x, lhs.y, lhs.sub_tile) < std::tie(rhs.x, rhs.y, rhs.sub_tile); } friend bool operator==(const t_pl_offset& lhs, const t_pl_offset& rhs) { - return std::tie(lhs.x, lhs.y, lhs.sub_tile) == std::tie(rhs.x, rhs.y, rhs.sub_tile); + return std::tie(lhs.x, lhs.y, lhs.sub_tile, lhs.layer) == std::tie(rhs.x, rhs.y, rhs.sub_tile, rhs.layer); } friend bool operator!=(const t_pl_offset& lhs, const t_pl_offset& rhs) { @@ -659,6 +700,7 @@ struct hash { std::size_t seed = std::hash{}(v.x); vtr::hash_combine(seed, v.y); vtr::hash_combine(seed, v.sub_tile); + vtr::hash_combine(seed, v.layer); return seed; } }; @@ -688,7 +730,7 @@ struct t_pl_loc { int layer = OPEN; t_pl_loc& 
operator+=(const t_pl_offset& rhs) { - VTR_ASSERT(this->layer != OPEN); + layer += rhs.layer; x += rhs.x; y += rhs.y; sub_tile += rhs.sub_tile; @@ -696,7 +738,7 @@ struct t_pl_loc { t_pl_loc& operator-=(const t_pl_offset& rhs) { - VTR_ASSERT(this->layer != OPEN); + layer -= rhs.layer; x -= rhs.x; y -= rhs.y; sub_tile -= rhs.sub_tile; @@ -720,8 +762,10 @@ struct t_pl_loc { } friend t_pl_offset operator-(const t_pl_loc& lhs, const t_pl_loc& rhs) { - VTR_ASSERT(lhs.layer == rhs.layer); - return {lhs.x - rhs.x, lhs.y - rhs.y, lhs.sub_tile - rhs.sub_tile}; + return {lhs.x - rhs.x, + lhs.y - rhs.y, + lhs.sub_tile - rhs.sub_tile, + lhs.layer - rhs.layer}; } friend bool operator<(const t_pl_loc& lhs, const t_pl_loc& rhs) { @@ -745,6 +789,7 @@ struct hash { std::size_t seed = std::hash{}(v.x); vtr::hash_combine(seed, v.y); vtr::hash_combine(seed, v.sub_tile); + vtr::hash_combine(seed, v.layer); return seed; } }; @@ -974,6 +1019,12 @@ enum e_place_algorithm { SLACK_TIMING_PLACE }; +enum e_place_bounding_box_mode { + AUTO_BB, + CUBE_BB, + PER_LAYER_BB +}; + /** * @brief Provides a wrapper around enum e_place_algorithm. * @@ -1109,6 +1160,9 @@ enum class e_place_delta_delay_algorithm { * @param constraints_file * File that specifies locations of locked down (constrained) * blocks for placement. Empty string means no constraints file. + * @param write_initial_place_file * Write the initial placement into this file. Empty string means + * the initial placement is not written. * @param pad_loc_file * File to read pad locations from if pad_loc_type is USER. * @param place_freq @@ -1151,6 +1205,7 @@ struct t_placer_opts { int place_chan_width; enum e_pad_loc_type pad_loc_type; std::string constraints_file; + std::string write_initial_place_file; enum pfreq place_freq; int recompute_crit_iter; int inner_loop_recompute_divider; @@ -1186,6 +1241,7 @@ struct t_placer_opts { bool place_agent_multistate; bool place_checkpointing; int place_high_fanout_net; + e_place_bounding_box_mode place_bounding_box_mode; e_agent_algorithm place_agent_algorithm; float place_agent_epsilon; float place_agent_gamma; diff --git a/vpr/src/noc/noc_storage.cpp b/vpr/src/noc/noc_storage.cpp index 6104226a605..70c92878f82 100644 --- a/vpr/src/noc/noc_storage.cpp +++ b/vpr/src/noc/noc_storage.cpp @@ -131,7 +131,7 @@ void NocStorage::set_device_grid_width(int grid_width) { void NocStorage::set_device_grid_spec(int grid_width, int grid_height) { device_grid_width = grid_width; - num_layer_blocks = grid_width * grid_height; + layer_num_grid_locs = grid_width * grid_height; return; } @@ -235,7 +235,7 @@ NocLinkId NocStorage::get_parallel_link(NocLinkId current_link) const { int NocStorage::generate_router_key_from_grid_location(int grid_position_x, int grid_position_y, int layer_position) const { // calculate the key value - return (num_layer_blocks * layer_position + device_grid_width * grid_position_y + grid_position_x); + return (layer_num_grid_locs * layer_position + device_grid_width * grid_position_y + grid_position_x); } void NocStorage::echo_noc(char* file_name) const { diff --git a/vpr/src/noc/noc_storage.h b/vpr/src/noc/noc_storage.h index d490b529324..f35f0121eb2 100644 --- a/vpr/src/noc/noc_storage.h +++ b/vpr/src/noc/noc_storage.h @@ -141,7 +141,13 @@ class NocStorage { * */ int device_grid_width; - int num_layer_blocks; + /** + * @brief Internal reference to the number of grid locations on each layer (width * height). 
This is necessary + * to compute a unique key for a given grid location which we can then use + * to get the corresponding physical (hard) router at the given grid + * location using 'grid_location_to_router_id'. + */ + int layer_num_grid_locs; // prevent "copying" of this object NocStorage(const NocStorage&) = delete; diff --git a/vpr/src/place/centroid_move_generator.cpp b/vpr/src/place/centroid_move_generator.cpp index cab42663a7b..f1316701998 100644 --- a/vpr/src/place/centroid_move_generator.cpp +++ b/vpr/src/place/centroid_move_generator.cpp @@ -38,6 +38,9 @@ e_create_move CentroidMoveGenerator::propose_move(t_pl_blocks_to_be_moved& block /* Calculate the centroid location*/ calculate_centroid_loc(b_from, false, centroid, nullptr); + // The centroid location is not necessarily a valid location, and the downstream routines expect a valid + // layer for the "to" location. So if the layer is not valid, we set it to the same layer as the "from" location. + to.layer = (centroid.layer < 0) ? from.layer : centroid.layer; /* Find a location near the weighted centroid_loc */ if (!find_to_loc_centroid(cluster_from_type, from, centroid, range_limiters, to, b_from)) { return e_create_move::ABORT; } diff --git a/vpr/src/place/critical_uniform_move_generator.cpp b/vpr/src/place/critical_uniform_move_generator.cpp index 264656d8329..9fbc93a7645 100644 --- a/vpr/src/place/critical_uniform_move_generator.cpp +++ b/vpr/src/place/critical_uniform_move_generator.cpp @@ -28,7 +28,7 @@ e_create_move CriticalUniformMoveGenerator::propose_move(t_pl_blocks_to_be_moved VTR_ASSERT(is_tile_compatible(grid_from_type, cluster_from_type)); t_pl_loc to; - + to.layer = from.layer; if (!find_to_loc_uniform(cluster_from_type, rlim, from, to, b_from)) { return e_create_move::ABORT; } diff --git a/vpr/src/place/directed_moves_util.cpp b/vpr/src/place/directed_moves_util.cpp index db49fc88486..330f1904368 100644 --- a/vpr/src/place/directed_moves_util.cpp +++ b/vpr/src/place/directed_moves_util.cpp @@ -25,6 +25,7 @@ void calculate_centroid_loc(ClusterBlockId b_from, bool timing_weights, t_pl_loc float acc_weight = 0; float acc_x = 0; float acc_y = 0; + float acc_layer = 0; float weight = 1; int from_block_layer_num = g_vpr_ctx.placement().block_locs[b_from].loc.layer; @@ -65,6 +66,7 @@ void calculate_centroid_loc(ClusterBlockId b_from, bool timing_weights, t_pl_loc acc_x += tile_loc.x * weight; acc_y += tile_loc.y * weight; + acc_layer += tile_loc.layer_num * weight; acc_weight += weight; } } @@ -84,6 +86,7 @@ void calculate_centroid_loc(ClusterBlockId b_from, bool timing_weights, t_pl_loc acc_x += tile_loc.x * weight; acc_y += tile_loc.y * weight; + acc_layer += tile_loc.layer_num * weight; acc_weight += weight; } } @@ -91,8 +94,7 @@ void calculate_centroid_loc(ClusterBlockId b_from, bool timing_weights, t_pl_loc //Calculate the centroid location centroid.x = acc_x / acc_weight; centroid.y = acc_y / acc_weight; - // TODO: For now, we don't move the centroid to a different layer - centroid.layer = from_block_layer_num; + centroid.layer = acc_layer / acc_weight; } static std::map available_reward_function = { diff --git a/vpr/src/place/feasible_region_move_generator.cpp b/vpr/src/place/feasible_region_move_generator.cpp index 8baf52f8a46..995c2a37836 100644 --- a/vpr/src/place/feasible_region_move_generator.cpp +++ b/vpr/src/place/feasible_region_move_generator.cpp @@ -33,6 +33,8 @@ e_create_move FeasibleRegionMoveGenerator::propose_move(t_pl_blocks_to_be_moved& /* Calculate the feasible region */ t_pl_loc to; + // Currently, we don't change 
the layer for this move + to.layer = from.layer; int ipin; ClusterBlockId bnum; int max_x, min_x, max_y, min_y; @@ -101,6 +103,9 @@ e_create_move FeasibleRegionMoveGenerator::propose_move(t_pl_blocks_to_be_moved& FR_coords.ymin = std::min(from.y, max_y); FR_coords.ymax = std::max(from.y, yt); } + + FR_coords.layer_min = from.layer; + FR_coords.layer_max = from.layer; VTR_ASSERT(FR_coords.ymin <= FR_coords.ymax); t_range_limiters range_limiters{rlim, diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index 66f47358ff5..c80d5ff245b 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -307,11 +307,10 @@ static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ int first_rlim = 15; auto search_range = get_compressed_grid_target_search_range(compressed_block_grid, - compressed_centroid_loc, - first_rlim, - num_layers); + compressed_centroid_loc[centroid_loc_layer_num], + first_rlim); - int delta_cx = search_range[centroid_loc_layer_num].xmax - search_range[centroid_loc_layer_num].xmin; + int delta_cx = search_range.xmax - search_range.xmin; //Block has not been placed yet, so the "from" coords will be (-1, -1) int cx_from = OPEN; @@ -323,7 +322,7 @@ static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ bool legal = find_compatible_compressed_loc_in_range(block_type, delta_cx, {cx_from, cy_from, layer_from}, - search_range[centroid_loc_layer_num], + search_range, to_compressed_loc, false, centroid_loc_layer_num); @@ -426,9 +425,8 @@ static std::vector find_centroid_loc(t_pl_macro pl_macro, t_pl_l centroid.y = acc_y / acc_weight; if (find_layer) { auto max_element = std::max_element(layer_count.begin(), layer_count.end()); - VTR_ASSERT(*max_element != 0); - auto index = std::distance(layer_count.begin(), max_element); - centroid.layer = static_cast(index); + VTR_ASSERT((*max_element) != 0); + centroid.layer = (int)std::distance(layer_count.begin(), max_element); } else { centroid.layer = head_layer_num; } @@ -634,11 +632,13 @@ static bool try_random_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_log t_physical_tile_loc to_compressed_loc; bool legal; + legal = find_compatible_compressed_loc_in_range(block_type, delta_cx, {cx_from, cy_from, reg_coord.layer_num}, {min_compressed_loc.x, max_compressed_loc.x, - min_compressed_loc.y, max_compressed_loc.y}, + min_compressed_loc.y, max_compressed_loc.y, + reg_coord.layer_num, reg_coord.layer_num}, to_compressed_loc, false, reg_coord.layer_num); @@ -1097,7 +1097,7 @@ bool place_one_block(const ClusterBlockId& blk_id, //If it does not belong to a macro, create a macro with the one block and then pass to the placement routines //This is done so that the initial placement flow can be the same whether the block belongs to a macro or not t_pl_macro_member macro_member; - t_pl_offset block_offset(0, 0, 0); + t_pl_offset block_offset(0, 0, 0, 0); macro_member.blk_index = blk_id; macro_member.offset = block_offset; diff --git a/vpr/src/place/median_move_generator.cpp b/vpr/src/place/median_move_generator.cpp index a0853978f2b..324d0cd3e44 100644 --- a/vpr/src/place/median_move_generator.cpp +++ b/vpr/src/place/median_move_generator.cpp @@ -5,9 +5,9 @@ #include "placer_globals.h" #include "move_utils.h" -static bool get_bb_incrementally(ClusterNetId net_id, t_bb* bb_coord_new, int xold, int yold, int xnew, int ynew); +static bool get_bb_incrementally(ClusterNetId net_id, t_bb& bb_coord_new, int xold, int yold, int xnew, int ynew); 
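The union_2d_bb / union_2d_bb_incr helpers relied on in the hunks below are not part of this diff; their intent is sketched here under the assumption that they simply merge one 2D box per layer into a single 3D box (field layout taken from t_bb / t_2D_bb above):

#include <algorithm>
#include <climits>
#include <vector>

struct t_2D_bb { int xmin, xmax, ymin, ymax, layer_num; };
struct t_bb { int xmin, xmax, ymin, ymax, layer_min, layer_max; };

t_bb union_2d_bb_sketch(const std::vector<t_2D_bb>& layer_bbs) {
    t_bb u{INT_MAX, INT_MIN, INT_MAX, INT_MIN, INT_MAX, INT_MIN};
    // Assumes every entry is valid; the real helper would also have to skip
    // layers that contain none of the net's pins.
    for (const t_2D_bb& bb : layer_bbs) {
        u.xmin = std::min(u.xmin, bb.xmin);
        u.xmax = std::max(u.xmax, bb.xmax);
        u.ymin = std::min(u.ymin, bb.ymin);
        u.ymax = std::max(u.ymax, bb.ymax);
        u.layer_min = std::min(u.layer_min, bb.layer_num);
        u.layer_max = std::max(u.layer_max, bb.layer_num);
    }
    return u;
}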
-static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb* bb_coord_new, ClusterBlockId block_id, bool& skip_net); +static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb& bb_coord_new, ClusterBlockId block_id, bool& skip_net); e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_affected, t_propose_action& proposed_action, float rlim, const t_placer_opts& placer_opts, const PlacerCriticalities* /*criticalities*/) { //Find a movable block based on blk_type @@ -28,15 +28,20 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_ auto& device_ctx = g_vpr_ctx.device(); auto& place_move_ctx = g_placer_ctx.mutable_move(); + const int num_layers = device_ctx.grid.get_num_layers(); + bool is_multi_layer = (num_layers > 1); + t_pl_loc from = place_ctx.block_locs[b_from].loc; + int from_layer = from.layer; auto cluster_from_type = cluster_ctx.clb_nlist.block_type(b_from); - auto grid_from_type = g_vpr_ctx.device().grid.get_physical_type({from.x, from.y, from.layer}); + auto grid_from_type = g_vpr_ctx.device().grid.get_physical_type({from.x, from.y, from_layer}); VTR_ASSERT(is_tile_compatible(grid_from_type, cluster_from_type)); /* Calculate the median region */ t_pl_loc to; - t_bb coords, limit_coords; + t_bb coords(OPEN, OPEN, OPEN, OPEN, OPEN, OPEN); + t_bb limit_coords; ClusterBlockId bnum; int pnum, xnew, xold, ynew, yold; @@ -44,6 +49,7 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_ //reused to save allocation time place_move_ctx.X_coord.clear(); place_move_ctx.Y_coord.clear(); + std::vector layer_blk_cnt(num_layers, 0); //true if the net is a feedback from the block to itself bool skip_net; @@ -61,10 +67,17 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_ continue; if (cluster_ctx.clb_nlist.net_sinks(net_id).size() < SMALL_NET) { //calculate the bb from scratch - get_bb_from_scratch_excluding_block(net_id, &coords, b_from, skip_net); + get_bb_from_scratch_excluding_block(net_id, coords, b_from, skip_net); if (skip_net) continue; } else { + t_bb union_bb; + const bool& cube_bb = g_vpr_ctx.placement().cube_bb; + if (!cube_bb) { + union_bb = union_2d_bb(place_move_ctx.layer_bb_coords[net_id]); + } + + const auto& net_bb_coords = cube_bb ? 
place_move_ctx.bb_coords[net_id] : union_bb; //use the incremental update of the bb bnum = cluster_ctx.clb_nlist.pin_block(pin_id); pnum = tile_pin_index(pin_id); @@ -76,20 +89,20 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_ //To calulate the bb incrementally while excluding the moving block //assume that the moving block is moved to a non-critical coord of the bb - if (place_move_ctx.bb_coords[net_id].xmin == xold) { - xnew = place_move_ctx.bb_coords[net_id].xmax; + if (net_bb_coords.xmin == xold) { + xnew = net_bb_coords.xmax; } else { - xnew = place_move_ctx.bb_coords[net_id].xmin; + xnew = net_bb_coords.xmin; } - if (place_move_ctx.bb_coords[net_id].ymin == yold) { - ynew = place_move_ctx.bb_coords[net_id].ymax; + if (net_bb_coords.ymin == yold) { + ynew = net_bb_coords.ymax; } else { - ynew = place_move_ctx.bb_coords[net_id].ymin; + ynew = net_bb_coords.ymin; } - if (!get_bb_incrementally(net_id, &coords, xold, yold, xnew, ynew)) { - get_bb_from_scratch_excluding_block(net_id, &coords, b_from, skip_net); + if (!get_bb_incrementally(net_id, coords, xold, yold, xnew, ynew)) { + get_bb_from_scratch_excluding_block(net_id, coords, b_from, skip_net); if (skip_net) continue; } @@ -99,6 +112,17 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_ place_move_ctx.X_coord.push_back(coords.xmax); place_move_ctx.Y_coord.push_back(coords.ymin); place_move_ctx.Y_coord.push_back(coords.ymax); + if (is_multi_layer) { + for (int layer_num = 0; layer_num < num_layers; layer_num++) { + layer_blk_cnt[layer_num] += place_move_ctx.num_sink_pin_layer[size_t(net_id)][layer_num]; + } + // If the pin under consideration is a sink, the moving block itself was counted in layer_blk_cnt + // above; since the block is moving away, remove its contribution + if (cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::SINK) { + VTR_ASSERT_SAFE(layer_blk_cnt[from_layer] > 0); + layer_blk_cnt[from_layer]--; + } + } } if ((place_move_ctx.X_coord.empty()) || (place_move_ctx.Y_coord.empty())) { @@ -125,10 +149,20 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_ t_pl_loc median_point; median_point.x = (limit_coords.xmin + limit_coords.xmax) / 2; median_point.y = (limit_coords.ymin + limit_coords.ymax) / 2; - // TODO: When placer is updated to support moving blocks between dice, this needs to be changed. Currently, we only move blocks within a die. - median_point.layer = from.layer; - if (!find_to_loc_centroid(cluster_from_type, from, median_point, range_limiters, to, b_from)) + + // Before calling find_to_loc_centroid, a valid layer should be assigned to the "to" location. If there are multiple layers, the layer + // with the highest number of sinks is used. Otherwise, the same layer as the "from" location is assigned. + if (is_multi_layer) { + int layer_num = std::distance(layer_blk_cnt.begin(), std::max_element(layer_blk_cnt.begin(), layer_blk_cnt.end())); + median_point.layer = layer_num; + to.layer = layer_num; + } else { + median_point.layer = from.layer; + to.layer = from.layer; + } + if (!find_to_loc_centroid(cluster_from_type, from, median_point, range_limiters, to, b_from)) { return e_create_move::ABORT; + } e_create_move create_move = ::create_move(blocks_affected, b_from, to); @@ -150,17 +184,16 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_ * Currently assumes channels on both sides of the CLBs forming the * * edges of the bounding box can be used. Essentially, I am assuming * * the pins always lie on the outside of the bounding box. 
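The target-layer selection above is an argmax over per-layer sink counts; in isolation (hypothetical helper name):

#include <algorithm>
#include <iterator>
#include <vector>

int pick_layer_with_most_sinks(const std::vector<int>& layer_blk_cnt) {
    return (int)std::distance(layer_blk_cnt.begin(),
                              std::max_element(layer_blk_cnt.begin(), layer_blk_cnt.end()));
}
// e.g. pick_layer_with_most_sinks({3, 7}) == 1: prefer moving toward layer 1.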
*/ -static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb* bb_coord_new, ClusterBlockId block_id, bool& skip_net) { +static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb& bb_coord_new, ClusterBlockId block_id, bool& skip_net) { //TODO: account for multiple physical pin instances per logical pin skip_net = true; - int xmin = 0; - int xmax = 0; - int ymin = 0; - int ymax = 0; + int xmin = OPEN; + int xmax = OPEN; + int ymin = OPEN; + int ymax = OPEN; - int x, y; int pnum; auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -173,14 +206,13 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb* bb_co if (bnum != block_id) { skip_net = false; pnum = net_pin_to_tile_pin_index(net_id, 0); - x = place_ctx.block_locs[bnum].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum]; - y = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum]; - - xmin = x; - ymin = y; - xmax = x; - ymax = y; + int src_x = place_ctx.block_locs[bnum].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum]; + int src_y = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum]; + xmin = src_x; + ymin = src_y; + xmax = src_x; + ymax = src_y; first_block = true; } @@ -190,8 +222,9 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb* bb_co if (bnum == block_id) continue; skip_net = false; - x = place_ctx.block_locs[bnum].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum]; - y = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum]; + const auto& block_loc = place_ctx.block_locs[bnum].loc; + int x = block_loc.x + physical_tile_type(bnum)->pin_width_offset[pnum]; + int y = block_loc.y + physical_tile_type(bnum)->pin_height_offset[pnum]; if (!first_block) { xmin = x; @@ -199,6 +232,7 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb* bb_co xmax = x; ymax = y; first_block = true; + continue; } if (x < xmin) { xmin = x; @@ -220,11 +254,10 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb* bb_co * channel immediately to the left of the bounding box, I want to * * clip to 1 in both directions as well (since minimum channel index * * is 0). See route_common.cpp for a channel diagram. */ - - bb_coord_new->xmin = std::max(std::min(xmin, device_ctx.grid.width() - 2), 1); //-2 for no perim channels - bb_coord_new->ymin = std::max(std::min(ymin, device_ctx.grid.height() - 2), 1); //-2 for no perim channels - bb_coord_new->xmax = std::max(std::min(xmax, device_ctx.grid.width() - 2), 1); //-2 for no perim channels - bb_coord_new->ymax = std::max(std::min(ymax, device_ctx.grid.height() - 2), 1); //-2 for no perim channels + bb_coord_new.xmin = std::max(std::min(xmin, device_ctx.grid.width() - 2), 1); //-2 for no perim channels + bb_coord_new.ymin = std::max(std::min(ymin, device_ctx.grid.height() - 2), 1); //-2 for no perim channels + bb_coord_new.xmax = std::max(std::min(xmax, device_ctx.grid.width() - 2), 1); //-2 for no perim channels + bb_coord_new.ymax = std::max(std::min(ymax, device_ctx.grid.height() - 2), 1); //-2 for no perim channels } /* @@ -240,11 +273,9 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb* bb_co * the pins always lie on the outside of the bounding box. * * The x and y coordinates are the pin's x and y coordinates. */ /* IO blocks are considered to be one cell in for simplicity. 
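The rewritten get_bb_incrementally below applies one rule per axis: if the moved pin sat on a bounding-box edge and was the only pin on that edge, the new edge position cannot be known locally and the caller must rebuild the box from scratch; otherwise the edge stays put or is pushed outward. A 1D sketch of that rule (edge-pin counts assumed maintained elsewhere, as bb_num_on_edges is in VPR):

#include <algorithm>

struct Edge1D { int min, max, cnt_min, cnt_max; }; // extremes and pin counts on them

// Returns false when a from-scratch recomputation is required.
bool update_1d(Edge1D& e, int old_pos, int new_pos) {
    if (new_pos < old_pos) {                                  // moving toward min
        if (old_pos == e.max && e.cnt_max == 1) return false; // sole pin on max edge
        e.min = std::min(e.min, new_pos);                     // max edge unaffected
    } else if (new_pos > old_pos) {                           // moving toward max
        if (old_pos == e.min && e.cnt_min == 1) return false; // sole pin on min edge
        e.max = std::max(e.max, new_pos);                     // min edge unaffected
    }
    return true;                                              // no motion: box unchanged
}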
*/ -static bool get_bb_incrementally(ClusterNetId net_id, t_bb* bb_coord_new, int xold, int yold, int xnew, int ynew) { +static bool get_bb_incrementally(ClusterNetId net_id, t_bb& bb_coord_new, int xold, int yold, int xnew, int ynew) { //TODO: account for multiple physical pin instances per logical pin - const t_bb *curr_bb_edge, *curr_bb_coord; - auto& device_ctx = g_vpr_ctx.device(); auto& place_move_ctx = g_placer_ctx.move(); @@ -253,9 +284,19 @@ static bool get_bb_incrementally(ClusterNetId net_id, t_bb* bb_coord_new, int xo xold = std::max(std::min(xold, device_ctx.grid.width() - 2), 1); //-2 for no perim channels yold = std::max(std::min(yold, device_ctx.grid.height() - 2), 1); //-2 for no perim channels - /* The net had NOT been updated before, could use the old values */ - curr_bb_coord = &(place_move_ctx.bb_coords[net_id]); - curr_bb_edge = &(place_move_ctx.bb_num_on_edges[net_id]); + t_bb union_bb_edge; + t_bb union_bb; + const bool& cube_bb = g_vpr_ctx.placement().cube_bb; + if (!cube_bb) { + std::tie(union_bb_edge, union_bb) = union_2d_bb_incr(place_move_ctx.layer_bb_num_on_edges[net_id], + place_move_ctx.layer_bb_coords[net_id]); + } + + /* In this move, we use a 3D bounding box. Thus, if the per-layer BB is used by the placer, we need to take the union of the per-layer BBs + * and use it for the rest of the operations in this move. + */ + const t_bb& curr_bb_edge = cube_bb ? place_move_ctx.bb_num_on_edges[net_id] : union_bb_edge; + const t_bb& curr_bb_coord = cube_bb ? place_move_ctx.bb_coords[net_id] : union_bb; /* Check if I can update the bounding box incrementally. */ @@ -263,24 +304,24 @@ static bool get_bb_incrementally(ClusterNetId net_id, t_bb* bb_coord_new, int xo /* Update the xmax fields for coordinates and number of edges first. */ - if (xold == curr_bb_coord->xmax) { /* Old position at xmax. */ - if (curr_bb_edge->xmax == 1) { + if (xold == curr_bb_coord.xmax) { /* Old position at xmax. */ + if (curr_bb_edge.xmax == 1) { return false; } else { - bb_coord_new->xmax = curr_bb_coord->xmax; + bb_coord_new.xmax = curr_bb_coord.xmax; } } else { /* Move to left, old postion was not at xmax. */ - bb_coord_new->xmax = curr_bb_coord->xmax; + bb_coord_new.xmax = curr_bb_coord.xmax; } /* Now do the xmin fields for coordinates and number of edges. */ - if (xnew < curr_bb_coord->xmin) { /* Moved past xmin */ - bb_coord_new->xmin = xnew; - } else if (xnew == curr_bb_coord->xmin) { /* Moved to xmin */ - bb_coord_new->xmin = xnew; + if (xnew < curr_bb_coord.xmin) { /* Moved past xmin */ + bb_coord_new.xmin = xnew; + } else if (xnew == curr_bb_coord.xmin) { /* Moved to xmin */ + bb_coord_new.xmin = xnew; } else { /* Xmin unchanged. */ - bb_coord_new->xmin = curr_bb_coord->xmin; + bb_coord_new.xmin = curr_bb_coord.xmin; } /* End of move to left case. */ @@ -288,29 +329,29 @@ static bool get_bb_incrementally(ClusterNetId net_id, t_bb* bb_coord_new, int xo /* Update the xmin fields for coordinates and number of edges first. */ - if (xold == curr_bb_coord->xmin) { /* Old position at xmin. */ - if (curr_bb_edge->xmin == 1) { + if (xold == curr_bb_coord.xmin) { /* Old position at xmin. */ + if (curr_bb_edge.xmin == 1) { return false; } else { - bb_coord_new->xmin = curr_bb_coord->xmin; + bb_coord_new.xmin = curr_bb_coord.xmin; } } else { /* Move to right, old position was not at xmin. */ - bb_coord_new->xmin = curr_bb_coord->xmin; + bb_coord_new.xmin = curr_bb_coord.xmin; } /* Now do the xmax fields for coordinates and number of edges. */ - if (xnew > curr_bb_coord->xmax) { /* Moved past xmax. 
*/ - bb_coord_new->xmax = xnew; - } else if (xnew == curr_bb_coord->xmax) { /* Moved to xmax */ - bb_coord_new->xmax = xnew; + if (xnew > curr_bb_coord.xmax) { /* Moved past xmax. */ + bb_coord_new.xmax = xnew; + } else if (xnew == curr_bb_coord.xmax) { /* Moved to xmax */ + bb_coord_new.xmax = xnew; } else { /* Xmax unchanged. */ - bb_coord_new->xmax = curr_bb_coord->xmax; + bb_coord_new.xmax = curr_bb_coord.xmax; } /* End of move to right case. */ } else { /* xnew == xold -- no x motion. */ - bb_coord_new->xmin = curr_bb_coord->xmin; - bb_coord_new->xmax = curr_bb_coord->xmax; + bb_coord_new.xmin = curr_bb_coord.xmin; + bb_coord_new.xmax = curr_bb_coord.xmax; } /* Now account for the y-direction motion. */ @@ -319,24 +360,24 @@ static bool get_bb_incrementally(ClusterNetId net_id, t_bb* bb_coord_new, int xo /* Update the ymax fields for coordinates and number of edges first. */ - if (yold == curr_bb_coord->ymax) { /* Old position at ymax. */ - if (curr_bb_edge->ymax == 1) { + if (yold == curr_bb_coord.ymax) { /* Old position at ymax. */ + if (curr_bb_edge.ymax == 1) { return false; } else { - bb_coord_new->ymax = curr_bb_coord->ymax; + bb_coord_new.ymax = curr_bb_coord.ymax; } } else { /* Move down, old postion was not at ymax. */ - bb_coord_new->ymax = curr_bb_coord->ymax; + bb_coord_new.ymax = curr_bb_coord.ymax; } /* Now do the ymin fields for coordinates and number of edges. */ - if (ynew < curr_bb_coord->ymin) { /* Moved past ymin */ - bb_coord_new->ymin = ynew; - } else if (ynew == curr_bb_coord->ymin) { /* Moved to ymin */ - bb_coord_new->ymin = ynew; + if (ynew < curr_bb_coord.ymin) { /* Moved past ymin */ + bb_coord_new.ymin = ynew; + } else if (ynew == curr_bb_coord.ymin) { /* Moved to ymin */ + bb_coord_new.ymin = ynew; } else { /* ymin unchanged. */ - bb_coord_new->ymin = curr_bb_coord->ymin; + bb_coord_new.ymin = curr_bb_coord.ymin; } /* End of move down case. */ @@ -344,30 +385,30 @@ static bool get_bb_incrementally(ClusterNetId net_id, t_bb* bb_coord_new, int xo /* Update the ymin fields for coordinates and number of edges first. */ - if (yold == curr_bb_coord->ymin) { /* Old position at ymin. */ - if (curr_bb_edge->ymin == 1) { + if (yold == curr_bb_coord.ymin) { /* Old position at ymin. */ + if (curr_bb_edge.ymin == 1) { return false; } else { - bb_coord_new->ymin = curr_bb_coord->ymin; + bb_coord_new.ymin = curr_bb_coord.ymin; } } else { /* Moved up, old position was not at ymin. */ - bb_coord_new->ymin = curr_bb_coord->ymin; + bb_coord_new.ymin = curr_bb_coord.ymin; } /* Now do the ymax fields for coordinates and number of edges. */ - if (ynew > curr_bb_coord->ymax) { /* Moved past ymax. */ - bb_coord_new->ymax = ynew; - } else if (ynew == curr_bb_coord->ymax) { /* Moved to ymax */ - bb_coord_new->ymax = ynew; + if (ynew > curr_bb_coord.ymax) { /* Moved past ymax. */ + bb_coord_new.ymax = ynew; + } else if (ynew == curr_bb_coord.ymax) { /* Moved to ymax */ + bb_coord_new.ymax = ynew; } else { /* ymax unchanged. */ - bb_coord_new->ymax = curr_bb_coord->ymax; + bb_coord_new.ymax = curr_bb_coord.ymax; } /* End of move up case. */ } else { /* ynew == yold -- no y motion. 
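Further down, move_utils.cpp extends is_legal_swap_to_location with a layer clause; stripped of VPR context, the whole test is a pure bounds check (hypothetical free function, written here only to show the shape of the condition):

struct Loc { int x, y, layer; };

// A target is rejected when any coordinate, including the new layer field,
// falls outside the device.
bool in_device_bounds(const Loc& to, int width, int height, int num_layers) {
    return to.x >= 0 && to.x < width
        && to.y >= 0 && to.y < height
        && to.layer >= 0 && to.layer < num_layers;
}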
*/ - bb_coord_new->ymin = curr_bb_coord->ymin; - bb_coord_new->ymax = curr_bb_coord->ymax; + bb_coord_new.ymin = curr_bb_coord.ymin; + bb_coord_new.ymax = curr_bb_coord.ymax; } return true; } diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp index 57419340317..ca10cfc500b 100644 --- a/vpr/src/place/move_utils.cpp +++ b/vpr/src/place/move_utils.cpp @@ -454,7 +454,9 @@ bool is_legal_swap_to_location(ClusterBlockId blk, t_pl_loc to) { auto& place_ctx = g_vpr_ctx.placement(); if (to.x < 0 || to.x >= int(device_ctx.grid.width()) - || to.y < 0 || to.y >= int(device_ctx.grid.height())) { + || to.y < 0 || to.y >= int(device_ctx.grid.height()) + || to.layer < 0 + || to.layer >= int(device_ctx.grid.get_num_layers())) { return false; } @@ -753,7 +755,8 @@ bool find_to_loc_uniform(t_logical_block_type_ptr type, //Retrieve the compressed block grid for this block type const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[type->index]; const int num_layers = g_vpr_ctx.device().grid.get_num_layers(); - const int from_layer_num = from.layer; + const int to_layer_num = get_random_layer(type); + VTR_ASSERT(to_layer_num != OPEN); //Determine the coordinates in the compressed grid space of the current block std::vector compressed_locs = get_compressed_loc(compressed_block_grid, @@ -761,11 +764,10 @@ bool find_to_loc_uniform(t_logical_block_type_ptr type, num_layers); //Determine the valid compressed grid location ranges - std::vector search_range = get_compressed_grid_target_search_range(compressed_block_grid, - compressed_locs, - rlim, - num_layers); - int delta_cx = search_range[from_layer_num].xmax - search_range[from_layer_num].xmin; + t_bb search_range = get_compressed_grid_target_search_range(compressed_block_grid, + compressed_locs[to_layer_num], + rlim); + int delta_cx = search_range.xmax - search_range.xmin; t_physical_tile_loc to_compressed_loc; bool legal = false; @@ -774,9 +776,9 @@ bool find_to_loc_uniform(t_logical_block_type_ptr type, if (is_cluster_constrained(b_from)) { bool intersect = intersect_range_limit_with_floorplan_constraints(type, b_from, - search_range[from_layer_num], + search_range, delta_cx, - from_layer_num); + to_layer_num); if (!intersect) { return false; } @@ -784,11 +786,11 @@ bool find_to_loc_uniform(t_logical_block_type_ptr type, //TODO: For now, we only move the blocks on the same tile legal = find_compatible_compressed_loc_in_range(type, delta_cx, - compressed_locs[from_layer_num], - search_range[from_layer_num], + compressed_locs[to_layer_num], + search_range, to_compressed_loc, false, - from_layer_num); + to_layer_num); if (!legal) { //No valid position found @@ -808,8 +810,8 @@ bool find_to_loc_uniform(t_logical_block_type_ptr type, VTR_ASSERT_MSG(grid.get_height_offset({to.x, to.y, to.layer}) == 0, "Should be at block base location"); VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\tSearch range %dx%dx%d x %dx%dx%d - Legal position at %d,%d,%d is found\n", - search_range[from_layer_num].xmin, search_range[from_layer_num].ymin, from_layer_num, - search_range[from_layer_num].xmax, search_range[from_layer_num].ymax, from_layer_num, + search_range.xmin, search_range.ymin, search_range.layer_min, + search_range.xmax, search_range.ymax, search_range.layer_max, to.x, to.y, to.layer); return true; } @@ -829,7 +831,8 @@ bool find_to_loc_median(t_logical_block_type_ptr blk_type, t_pl_loc& to_loc, ClusterBlockId b_from) { int num_layers = g_vpr_ctx.device().grid.get_num_layers(); - int from_layer_num = from_loc.layer; + 
const int to_layer_num = to_loc.layer; + VTR_ASSERT(to_layer_num != OPEN); const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[blk_type->index]; //Determine the coordinates in the compressed grid space of the current block @@ -842,25 +845,27 @@ bool find_to_loc_median(t_logical_block_type_ptr blk_type, //Determine the valid compressed grid location ranges std::vector min_compressed_loc = get_compressed_loc_approx(compressed_block_grid, - {limit_coords->xmin, limit_coords->ymin, 0, from_layer_num}, + {limit_coords->xmin, limit_coords->ymin, 0, to_layer_num}, num_layers); std::vector max_compressed_loc = get_compressed_loc_approx(compressed_block_grid, - {limit_coords->xmax, limit_coords->ymax, 0, from_layer_num}, + {limit_coords->xmax, limit_coords->ymax, 0, to_layer_num}, num_layers); - VTR_ASSERT(min_compressed_loc[from_layer_num].x >= 0); - VTR_ASSERT(static_cast(compressed_block_grid.get_num_columns(from_layer_num)) - 1 - max_compressed_loc[from_layer_num].x >= 0); - VTR_ASSERT(max_compressed_loc[from_layer_num].x >= min_compressed_loc[from_layer_num].x); - int delta_cx = max_compressed_loc[from_layer_num].x - min_compressed_loc[from_layer_num].x; + VTR_ASSERT(min_compressed_loc[to_layer_num].x >= 0); + VTR_ASSERT(static_cast(compressed_block_grid.get_num_columns(to_layer_num)) - 1 - max_compressed_loc[to_layer_num].x >= 0); + VTR_ASSERT(max_compressed_loc[to_layer_num].x >= min_compressed_loc[to_layer_num].x); + int delta_cx = max_compressed_loc[to_layer_num].x - min_compressed_loc[to_layer_num].x; - VTR_ASSERT(min_compressed_loc[from_layer_num].y >= 0); - VTR_ASSERT(static_cast(compressed_block_grid.get_num_rows(from_layer_num)) - 1 - max_compressed_loc[from_layer_num].y >= 0); - VTR_ASSERT(max_compressed_loc[from_layer_num].y >= min_compressed_loc[from_layer_num].y); + VTR_ASSERT(min_compressed_loc[to_layer_num].y >= 0); + VTR_ASSERT(static_cast(compressed_block_grid.get_num_rows(to_layer_num)) - 1 - max_compressed_loc[to_layer_num].y >= 0); + VTR_ASSERT(max_compressed_loc[to_layer_num].y >= min_compressed_loc[to_layer_num].y); - t_bb search_range(min_compressed_loc[from_layer_num].x, - max_compressed_loc[from_layer_num].x, - min_compressed_loc[from_layer_num].y, - max_compressed_loc[from_layer_num].y); + t_bb search_range(min_compressed_loc[to_layer_num].x, + max_compressed_loc[to_layer_num].x, + min_compressed_loc[to_layer_num].y, + max_compressed_loc[to_layer_num].y, + to_layer_num, + to_layer_num); t_physical_tile_loc to_compressed_loc; bool legal = false; @@ -870,7 +875,7 @@ bool find_to_loc_median(t_logical_block_type_ptr blk_type, b_from, search_range, delta_cx, - from_layer_num); + to_layer_num); if (!intersect) { return false; } @@ -878,11 +883,11 @@ bool find_to_loc_median(t_logical_block_type_ptr blk_type, legal = find_compatible_compressed_loc_in_range(blk_type, delta_cx, - from_compressed_locs[from_layer_num], + from_compressed_locs[to_layer_num], search_range, to_compressed_loc, true, - from_layer_num); + to_layer_num); if (!legal) { //No valid position found @@ -902,8 +907,8 @@ bool find_to_loc_median(t_logical_block_type_ptr blk_type, VTR_ASSERT_MSG(grid.get_height_offset({to_loc.x, to_loc.y, to_loc.layer}) == 0, "Should be at block base location"); VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\tSearch range %dx%dx%d x %dx%dx%d - Legal position at %d,%d,%d is found\n", - search_range.xmin, search_range.ymin, from_layer_num, - search_range.xmax, search_range.ymax, from_layer_num, + search_range.xmin, search_range.ymin, 
search_range.layer_min, + search_range.xmax, search_range.ymax, search_range.layer_max, to_loc.x, to_loc.y, to_loc.layer); return true; } @@ -916,7 +921,8 @@ bool find_to_loc_centroid(t_logical_block_type_ptr blk_type, ClusterBlockId b_from) { //Retrieve the compressed block grid for this block type const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[blk_type->index]; - const int from_layer_num = from_loc.layer; + const int to_layer_num = to_loc.layer; + VTR_ASSERT(to_layer_num >= 0); const int num_layers = g_vpr_ctx.device().grid.get_num_layers(); std::vector from_compressed_loc = get_compressed_loc(compressed_block_grid, @@ -930,23 +936,21 @@ bool find_to_loc_centroid(t_logical_block_type_ptr blk_type, //Determine the valid compressed grid location ranges int delta_cx; - std::vector search_range; + t_bb search_range; // If we are early in the anneal and the range limit still big enough --> search around the center location that the move proposed // If not --> search around the current location of the block but in the direction of the center location that the move proposed if (range_limiters.original_rlim > 0.15 * range_limiters.first_rlim) { search_range = get_compressed_grid_target_search_range(compressed_block_grid, - centroid_compressed_loc, - std::min(range_limiters.original_rlim, range_limiters.dm_rlim), - num_layers); + centroid_compressed_loc[to_layer_num], + std::min(range_limiters.original_rlim, range_limiters.dm_rlim)); } else { search_range = get_compressed_grid_bounded_search_range(compressed_block_grid, - from_compressed_loc, - centroid_compressed_loc, - std::min(range_limiters.original_rlim, range_limiters.dm_rlim), - num_layers); + from_compressed_loc[to_layer_num], + centroid_compressed_loc[to_layer_num], + std::min(range_limiters.original_rlim, range_limiters.dm_rlim)); } - delta_cx = search_range[from_layer_num].xmax - search_range[from_layer_num].xmin; + delta_cx = search_range.xmax - search_range.xmin; t_physical_tile_loc to_compressed_loc; bool legal = false; @@ -954,9 +958,9 @@ bool find_to_loc_centroid(t_logical_block_type_ptr blk_type, if (is_cluster_constrained(b_from)) { bool intersect = intersect_range_limit_with_floorplan_constraints(blk_type, b_from, - search_range[from_layer_num], + search_range, delta_cx, - from_layer_num); + to_layer_num); if (!intersect) { return false; } @@ -965,11 +969,11 @@ bool find_to_loc_centroid(t_logical_block_type_ptr blk_type, //TODO: For now, we only move the blocks on the same tile legal = find_compatible_compressed_loc_in_range(blk_type, delta_cx, - from_compressed_loc[from_layer_num], - search_range[from_layer_num], + from_compressed_loc[to_layer_num], + search_range, to_compressed_loc, false, - from_layer_num); + to_layer_num); if (!legal) { //No valid position found @@ -989,8 +993,8 @@ bool find_to_loc_centroid(t_logical_block_type_ptr blk_type, VTR_ASSERT_MSG(grid.get_height_offset({to_loc.x, to_loc.y, to_loc.layer}) == 0, "Should be at block base location"); VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\tSearch range %dx%dx%d x %dx%dx%d - Legal position at %d,%d,%d is found\n", - search_range[from_layer_num].xmin, search_range[from_layer_num].ymin, from_layer_num, - search_range[from_layer_num].xmax, search_range[from_layer_num].ymax, from_layer_num, + search_range.xmin, search_range.ymin, search_range.layer_min, + search_range.xmax, search_range.ymax, search_range.layer_max, to_loc.x, to_loc.y, to_loc.layer); return true; } @@ -999,8 +1003,8 @@ bool 
@@ -999,8 +1003,8 @@ bool find_to_loc_centroid(t_logical_block_type_ptr blk_type,
 static const std::array move_type_strings = {
     "Uniform",
     "Median",
-    "W. Centroid",
     "Centroid",
+    "W. Centroid",
     "W. Median",
     "Crit. Uniform",
     "Feasible Region",
@@ -1133,10 +1137,10 @@ std::vector get_compressed_loc(const t_compressed_block_gri
     //TODO: This function currently only determines the compressed location for the same layer as grid_loc - it should be updated to cover all layers
     std::vector<t_physical_tile_loc> compressed_locs(num_layers);

-    for (int layer_num = 0; layer_num < num_layers; ++layer_num) {
-        if (layer_num != grid_loc.layer) {
-            continue;
-        }
+    const auto& compatible_layers = compressed_block_grid.get_layer_nums();
+
+    for (const auto& layer_num : compatible_layers) {
+        // This would cause a problem if two blocks of the same type are on different x/y locations of different layers
         compressed_locs[layer_num] = compressed_block_grid.grid_loc_to_compressed_loc({grid_loc.x, grid_loc.y, layer_num});
     }
@@ -1146,91 +1150,78 @@ std::vector get_compressed_loc(const t_compressed_block_gri
 std::vector<t_physical_tile_loc> get_compressed_loc_approx(const t_compressed_block_grid& compressed_block_grid,
                                                            t_pl_loc grid_loc,
                                                            int num_layers) {
-    //TODO: This function currently only determine the compressed location for the same layer as grid_loc - it should be updated to cover all layers
     std::vector<t_physical_tile_loc> compressed_locs(num_layers);

-    for (int layer_num = 0; layer_num < num_layers; ++layer_num) {
-        if (layer_num != grid_loc.layer) {
-            continue;
-        }
+    const auto& compatible_layers = compressed_block_grid.get_layer_nums();
+
+    for (const auto& layer_num : compatible_layers) {
         compressed_locs[layer_num] = compressed_block_grid.grid_loc_to_compressed_loc_approx({grid_loc.x, grid_loc.y, layer_num});
     }

     return compressed_locs;
 }

-std::vector<t_bb> get_compressed_grid_target_search_range(const t_compressed_block_grid& compressed_block_grid,
-                                                          const std::vector<t_physical_tile_loc>& compressed_locs,
-                                                          float rlim,
-                                                          int num_layers) {
-    std::vector<t_bb> search_ranges(num_layers, t_bb());
-    for (int layer_num = 0; layer_num < num_layers; ++layer_num) {
-        const auto& layer_loc = compressed_locs[layer_num];
-        //TODO: This if condition is added because blocks are only moved in the same layer. After the update, this condition should be replaced with an assertion
-        if (layer_loc.x == OPEN || layer_loc.y == OPEN || layer_loc.layer_num == OPEN) {
-            //No valid compressed location for this layer
-            continue;
-        }
-        int rlim_x_max_range = std::min((int)compressed_block_grid.get_num_columns(layer_num), rlim);
-        int rlim_y_max_range = std::min((int)compressed_block_grid.get_num_rows(layer_num), rlim); /* for aspect_ratio != 1 case. */
+t_bb get_compressed_grid_target_search_range(const t_compressed_block_grid& compressed_block_grid,
+                                             const t_physical_tile_loc& compressed_loc,
+                                             float rlim) {
+    t_bb search_ranges;
+    int layer_num = compressed_loc.layer_num;
+    VTR_ASSERT(compressed_loc.x != OPEN && compressed_loc.y != OPEN && compressed_loc.layer_num != OPEN);

-    search_ranges[layer_num].xmin = std::max(0, layer_loc.x - rlim_x_max_range);
-    search_ranges[layer_num].xmax = std::min(compressed_block_grid.get_num_columns(layer_num) - 1, layer_loc.x + rlim_x_max_range);
+    int rlim_x_max_range = std::min((int)compressed_block_grid.get_num_columns(layer_num), rlim);
+    int rlim_y_max_range = std::min((int)compressed_block_grid.get_num_rows(layer_num), rlim); /* for aspect_ratio != 1 case.
*/ - search_ranges[layer_num].ymin = std::max(0, layer_loc.y - rlim_y_max_range); - search_ranges[layer_num].ymax = std::min(compressed_block_grid.get_num_rows(layer_num) - 1, layer_loc.y + rlim_y_max_range); - } + search_ranges.xmin = std::max(0, compressed_loc.x - rlim_x_max_range); + search_ranges.xmax = std::min(compressed_block_grid.get_num_columns(layer_num) - 1, compressed_loc.x + rlim_x_max_range); + + search_ranges.ymin = std::max(0, compressed_loc.y - rlim_y_max_range); + search_ranges.ymax = std::min(compressed_block_grid.get_num_rows(layer_num) - 1, compressed_loc.y + rlim_y_max_range); + + search_ranges.layer_min = compressed_loc.layer_num; + search_ranges.layer_max = compressed_loc.layer_num; return search_ranges; } -std::vector get_compressed_grid_bounded_search_range(const t_compressed_block_grid& compressed_block_grid, - const std::vector& from_compressed_loc, - const std::vector& target_compressed_loc, - float rlim, - int num_layers) { - std::vector search_range(num_layers, t_bb()); +t_bb get_compressed_grid_bounded_search_range(const t_compressed_block_grid& compressed_block_grid, + const t_physical_tile_loc& from_compressed_loc, + const t_physical_tile_loc& target_compressed_loc, + float rlim) { + t_bb search_range; int min_cx, max_cx, min_cy, max_cy; - for (int layer_num = 0; layer_num < num_layers; layer_num++) { - //TODO: This if condition is added because blocks are only moved in the same layer. After the update, this condition should be replaced with an assertion - if (from_compressed_loc[layer_num].x == OPEN || from_compressed_loc[layer_num].y == OPEN || from_compressed_loc[layer_num].layer_num == OPEN) { - continue; - } - VTR_ASSERT(from_compressed_loc[layer_num].layer_num == layer_num); - VTR_ASSERT(target_compressed_loc[layer_num].layer_num == layer_num); + //TODO: This if condition is added because blocks are only moved in the same layer. After the update, this condition should be replaced with an assertion + VTR_ASSERT(from_compressed_loc.x != OPEN && from_compressed_loc.y != OPEN && from_compressed_loc.layer_num != OPEN); + VTR_ASSERT(target_compressed_loc.x != OPEN && target_compressed_loc.y != OPEN && target_compressed_loc.layer_num != OPEN); - int rlim_x_max_range = std::min(compressed_block_grid.get_num_columns(layer_num), rlim); - int rlim_y_max_range = std::min(compressed_block_grid.get_num_rows(layer_num), rlim); /* for aspect_ratio != 1 case. */ + int layer_num = target_compressed_loc.layer_num; + int rlim_x_max_range = std::min(compressed_block_grid.get_num_columns(layer_num), rlim); + int rlim_y_max_range = std::min(compressed_block_grid.get_num_rows(layer_num), rlim); /* for aspect_ratio != 1 case. 
*/
-        int cx_from = from_compressed_loc[layer_num].x;
-        int cy_from = from_compressed_loc[layer_num].y;
-        if (cx_from == OPEN || cy_from == OPEN) {
-            continue;
-        }
-
-        int cx_centroid = target_compressed_loc[layer_num].x;
-        int cy_centroid = target_compressed_loc[layer_num].y;
+    int cx_from = from_compressed_loc.x;
+    int cy_from = from_compressed_loc.y;
-        if (cx_centroid < cx_from) {
-            min_cx = std::max(0, cx_from - rlim_x_max_range);
-            max_cx = cx_from;
-        } else {
-            min_cx = cx_from;
-            max_cx = std::min(compressed_block_grid.get_num_columns(layer_num) - 1, cx_from + rlim_x_max_range);
-        }
-        if (cy_centroid < cy_from) {
-            min_cy = std::max(0, cy_from - rlim_y_max_range);
-            max_cy = cy_from;
-        } else {
-            min_cy = cy_from;
-            max_cy = std::min(compressed_block_grid.get_num_rows(layer_num) - 1, cy_from + rlim_y_max_range);
-        }
+    int cx_centroid = target_compressed_loc.x;
+    int cy_centroid = target_compressed_loc.y;
-        search_range[layer_num] = t_bb(min_cx, max_cx, min_cy, max_cy);
+    if (cx_centroid < cx_from) {
+        min_cx = std::max(0, cx_from - rlim_x_max_range);
+        max_cx = cx_from;
+    } else {
+        min_cx = cx_from;
+        max_cx = std::min(compressed_block_grid.get_num_columns(layer_num) - 1, cx_from + rlim_x_max_range);
+    }
+    if (cy_centroid < cy_from) {
+        min_cy = std::max(0, cy_from - rlim_y_max_range);
+        max_cy = cy_from;
+    } else {
+        min_cy = cy_from;
+        max_cy = std::min(compressed_block_grid.get_num_rows(layer_num) - 1, cy_from + rlim_y_max_range);
     }
+    search_range = t_bb(min_cx, max_cx, min_cy, max_cy, layer_num, layer_num);
+
     return search_range;
 }

@@ -1299,3 +1290,150 @@ std::string e_move_result_to_string(e_move_result move_outcome) {
     std::string move_result_to_string[] = {"Rejected", "Accepted", "Aborted"};
     return move_result_to_string[move_outcome];
 }
+
+int find_free_layer(t_logical_block_type_ptr logical_block, const t_pl_loc& loc) {
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& place_ctx = g_vpr_ctx.placement();
+
+    // TODO: Compatible layer vector should be shuffled first, and then iterated through
+    int free_layer = loc.layer;
+    VTR_ASSERT(loc.layer != OPEN);
+    if (device_ctx.grid.get_num_layers() > 1) {
+        const auto& compatible_layers = place_ctx.compressed_block_grids[logical_block->index].get_layer_nums();
+        if (compatible_layers.size() > 1) {
+            if (place_ctx.grid_blocks.block_at_location(loc) != EMPTY_BLOCK_ID) {
+                for (const auto& layer : compatible_layers) {
+                    if (layer != free_layer) {
+                        // Check the same x-y location on the candidate layer, not on loc's own layer
+                        t_pl_loc layer_loc = loc;
+                        layer_loc.layer = layer;
+                        if (place_ctx.grid_blocks.block_at_location(layer_loc) == EMPTY_BLOCK_ID) {
+                            free_layer = layer;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return free_layer;
+}
+
+int get_random_layer(t_logical_block_type_ptr logical_block) {
+    const auto& compatible_layers = g_vpr_ctx.placement().compressed_block_grids[logical_block->index].get_layer_nums();
+    VTR_ASSERT(!compatible_layers.empty());
+    int layer_num = OPEN;
+    if (compatible_layers.size() == 1) {
+        layer_num = compatible_layers[0];
+    } else {
+        layer_num = compatible_layers[vtr::irand(compatible_layers.size() - 1)];
+    }
+
+    return layer_num;
+}
+
+t_bb union_2d_bb(const std::vector<t_2D_bb>& bb_vec) {
+    t_bb merged_bb;
+
+    // Not all 2d_bbs are valid. Thus, if one of the coordinates in the 2D_bb is not valid (equal to OPEN),
+    // we need to skip it.
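The union_2d_bb loop that follows merges the valid per-layer 2D boxes of a net into one 3D box, skipping layers whose box is unset. A self-contained sketch of the same merge under simplified stand-in types (OPEN modeled as -1; the field names mirror the diff, everything else is hypothetical):

```cpp
#include <vector>

constexpr int kOpen = -1; // stand-in for VPR's OPEN sentinel

struct Bb2d { int xmin = kOpen, xmax = kOpen, ymin = kOpen, ymax = kOpen, layer_num = kOpen; };
struct Bb3d { int xmin = kOpen, xmax = kOpen, ymin = kOpen, ymax = kOpen, layer_min = kOpen, layer_max = kOpen; };

// Union the valid per-layer 2D boxes into one 3D box, skipping invalid entries.
Bb3d union_2d_bb_sketch(const std::vector<Bb2d>& bb_vec) {
    Bb3d merged;
    for (const Bb2d& bb : bb_vec) {
        if (bb.xmin == kOpen) continue; // this layer holds no pins of the net
        if (merged.xmin == kOpen || bb.xmin < merged.xmin) merged.xmin = bb.xmin;
        if (merged.xmax == kOpen || bb.xmax > merged.xmax) merged.xmax = bb.xmax;
        if (merged.ymin == kOpen || bb.ymin < merged.ymin) merged.ymin = bb.ymin;
        if (merged.ymax == kOpen || bb.ymax > merged.ymax) merged.ymax = bb.ymax;
        if (merged.layer_min == kOpen || bb.layer_num < merged.layer_min) merged.layer_min = bb.layer_num;
        if (merged.layer_max == kOpen || bb.layer_num > merged.layer_max) merged.layer_max = bb.layer_num;
    }
    return merged;
}
```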
+ for (const auto& layer_bb : bb_vec) { + if (layer_bb.xmin == OPEN) { + VTR_ASSERT_SAFE(layer_bb.xmax == OPEN); + VTR_ASSERT_SAFE(layer_bb.ymin == OPEN); + VTR_ASSERT_SAFE(layer_bb.ymax == OPEN); + VTR_ASSERT_SAFE(layer_bb.layer_num == OPEN); + continue; + } + if (merged_bb.xmin == OPEN || layer_bb.xmin < merged_bb.xmin) { + merged_bb.xmin = layer_bb.xmin; + } + if (merged_bb.xmax == OPEN || layer_bb.xmax > merged_bb.xmax) { + merged_bb.xmax = layer_bb.xmax; + } + if (merged_bb.ymin == OPEN || layer_bb.ymin < merged_bb.ymin) { + merged_bb.ymin = layer_bb.ymin; + } + if (merged_bb.ymax == OPEN || layer_bb.ymax > merged_bb.ymax) { + merged_bb.ymax = layer_bb.ymax; + } + if (merged_bb.layer_min == OPEN || layer_bb.layer_num < merged_bb.layer_min) { + merged_bb.layer_min = layer_bb.layer_num; + } + if (merged_bb.layer_max == OPEN || layer_bb.layer_num > merged_bb.layer_max) { + merged_bb.layer_max = layer_bb.layer_num; + } + } + + return merged_bb; +} + +std::pair union_2d_bb_incr(const std::vector& num_edge_vec, + const std::vector& bb_vec) { + t_bb merged_num_edge; + t_bb merged_bb; + + for (const auto& layer_bb : bb_vec) { + if (layer_bb.xmin == OPEN) { + VTR_ASSERT_SAFE(layer_bb.xmax == OPEN); + VTR_ASSERT_SAFE(layer_bb.ymin == OPEN); + VTR_ASSERT_SAFE(layer_bb.ymax == OPEN); + VTR_ASSERT_SAFE(layer_bb.layer_num == OPEN); + continue; + } + if (merged_bb.xmin == OPEN || layer_bb.xmin <= merged_bb.xmin) { + if (layer_bb.xmin == merged_bb.xmin) { + VTR_ASSERT_SAFE(merged_num_edge.xmin != OPEN); + merged_num_edge.xmin += num_edge_vec[layer_bb.layer_num].xmin; + } else { + merged_num_edge.xmin = num_edge_vec[layer_bb.layer_num].xmin; + } + merged_bb.xmin = layer_bb.xmin; + } + if (merged_bb.xmax == OPEN || layer_bb.xmax >= merged_bb.xmax) { + if (layer_bb.xmax == merged_bb.xmax) { + VTR_ASSERT_SAFE(merged_num_edge.xmax != OPEN); + merged_num_edge.xmax += num_edge_vec[layer_bb.layer_num].xmax; + } else { + merged_num_edge.xmax = num_edge_vec[layer_bb.layer_num].xmax; + } + merged_bb.xmax = layer_bb.xmax; + } + if (merged_bb.ymin == OPEN || layer_bb.ymin <= merged_bb.ymin) { + if (layer_bb.ymin == merged_bb.ymin) { + VTR_ASSERT_SAFE(merged_num_edge.ymin != OPEN); + merged_num_edge.ymin += num_edge_vec[layer_bb.layer_num].ymin; + } else { + merged_num_edge.ymin = num_edge_vec[layer_bb.layer_num].ymin; + } + merged_bb.ymin = layer_bb.ymin; + } + if (merged_bb.ymax == OPEN || layer_bb.ymax >= merged_bb.ymax) { + if (layer_bb.ymax == merged_bb.ymax) { + VTR_ASSERT_SAFE(merged_num_edge.ymax != OPEN); + merged_num_edge.ymax += num_edge_vec[layer_bb.layer_num].ymax; + } else { + merged_num_edge.ymax = num_edge_vec[layer_bb.layer_num].ymax; + } + merged_bb.ymax = layer_bb.ymax; + } + if (merged_bb.layer_min == OPEN || layer_bb.layer_num <= merged_bb.layer_min) { + if (layer_bb.layer_num == merged_bb.layer_min) { + VTR_ASSERT_SAFE(merged_num_edge.layer_min != OPEN); + merged_num_edge.layer_min += num_edge_vec[layer_bb.layer_num].layer_num; + } else { + merged_num_edge.layer_min = num_edge_vec[layer_bb.layer_num].layer_num; + } + merged_bb.layer_min = layer_bb.layer_num; + } + if (merged_bb.layer_max == OPEN || layer_bb.layer_num >= merged_bb.layer_max) { + if (layer_bb.layer_num == merged_bb.layer_max) { + VTR_ASSERT_SAFE(merged_num_edge.layer_max != OPEN); + merged_num_edge.layer_max += num_edge_vec[layer_bb.layer_num].layer_num; + } else { + merged_num_edge.layer_max = num_edge_vec[layer_bb.layer_num].layer_num; + } + merged_bb.layer_max = layer_bb.layer_num; + } + } + + return 
std::make_pair(merged_num_edge, merged_bb);
+}
diff --git a/vpr/src/place/move_utils.h b/vpr/src/place/move_utils.h
index 72b53b6b02a..f9369acd4f7 100644
--- a/vpr/src/place/move_utils.h
+++ b/vpr/src/place/move_utils.h
@@ -275,13 +275,11 @@ std::vector get_compressed_loc_approx(const t_compressed_bl
  * @param compressed_block_grid
  * @param compressed_locs
  * @param rlim
- * @param num_layers
 * @return A compressed search range for the layer of the given compressed location
 */
-std::vector<t_bb> get_compressed_grid_target_search_range(const t_compressed_block_grid& compressed_block_grid,
-                                                          const std::vector<t_physical_tile_loc>& compressed_locs,
-                                                          float rlim,
-                                                          int num_layers);
+t_bb get_compressed_grid_target_search_range(const t_compressed_block_grid& compressed_block_grid,
+                                             const t_physical_tile_loc& compressed_locs,
+                                             float rlim);

 /**
  * @brief This function calculates the search range based on the given rlim value and the number of columns/rows *
@@ -294,14 +292,12 @@ std::vector get_compressed_grid_target_search_range(const t_compressed_blo
  * @param from_compressed_loc
  * @param target_compressed_loc
  * @param rlim
- * @param num_layers
 * @return
 */
-std::vector<t_bb> get_compressed_grid_bounded_search_range(const t_compressed_block_grid& compressed_block_grid,
-                                                           const std::vector<t_physical_tile_loc>& from_compressed_loc,
-                                                           const std::vector<t_physical_tile_loc>& target_compressed_loc,
-                                                           float rlim,
-                                                           int num_layers);
+t_bb get_compressed_grid_bounded_search_range(const t_compressed_block_grid& compressed_block_grid,
+                                              const t_physical_tile_loc& from_compressed_loc,
+                                              const t_physical_tile_loc& target_compressed_loc,
+                                              float rlim);

 /*
  * If the block to be moved (b_from) has a floorplan constraint, this routine changes the max and min coords *
@@ -328,6 +324,37 @@ bool intersect_range_limit_with_floorplan_constraints(t_logical_block_type_ptr t

 std::string e_move_result_to_string(e_move_result move_outcome);

+/**
+ * @brief Iterate over all layers that have a physical tile at the x-y location specified by "loc" that can accommodate "logical_block".
+ * If the location on the layer of "loc" is empty, return that layer. Otherwise,
+ * return a layer that is not occupied at that location. If there isn't any, return the layer of "loc".
+ *
+ * @param logical_block
+ * @param loc
+ * @return
+ */
+int find_free_layer(t_logical_block_type_ptr logical_block, const t_pl_loc& loc);
+
+int get_random_layer(t_logical_block_type_ptr logical_block);
+
+/**
+ * @brief Iterate over all layers and get the minimum and maximum x and y over the layers that have a valid bounding box. Set the layer min and max
+ * based on the layers that have a valid BB.
+ * @param tbb_vec
+ * @return 3D bounding box
+ */
+t_bb union_2d_bb(const std::vector<t_2D_bb>& tbb_vec);
+
+/**
+ * @brief Iterate over all layers and get the minimum and maximum x and y over the layers that have a valid bounding box. Create the "num_edge" in a similar way. This data structure
+ * stores how many blocks are on each edge of the BB. Set the layer min and max based on the layers that have a valid BB.
+ * @param num_edge_vec + * @param bb_vec + * @return num_edge, 3D bb + */ +std::pair union_2d_bb_incr(const std::vector& num_edge_vec, + const std::vector& bb_vec); + #ifdef VTR_ENABLE_DEBUG_LOGGING /** * @brief If the block ID passed to the placer_debug_net parameter of the command line is equal to blk_id, or if any of the nets diff --git a/vpr/src/place/noc_place_utils.cpp b/vpr/src/place/noc_place_utils.cpp index 5e68e7d1462..17d96dd3677 100644 --- a/vpr/src/place/noc_place_utils.cpp +++ b/vpr/src/place/noc_place_utils.cpp @@ -495,7 +495,7 @@ e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, floa // now choose a compatible block to swap with t_pl_loc to; - + to.layer = from.layer; if (!find_to_loc_uniform(cluster_from_type, rlim, from, to, b_from)) { return e_create_move::ABORT; } diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 52e4e5e6ff1..51dfce9ee32 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -148,7 +148,9 @@ static vtr::NdMatrix chany_place_cost_fac({0, 0}); //[0...device_ctx.g /* The following arrays are used by the try_swap function for speed. */ /* [0...cluster_ctx.clb_nlist.nets().size()-1] */ -static vtr::vector ts_bb_coord_new, ts_bb_edge_new; +static vtr::vector ts_bb_edge_new, ts_bb_coord_new; +static vtr::vector> layer_ts_bb_edge_new, layer_ts_bb_coord_new; +static vtr::Matrix ts_layer_sink_pin_count; static std::vector ts_nets_to_update; /* These file-scoped variables keep track of the number of swaps * @@ -252,13 +254,22 @@ std::unique_ptr f_move_stats_file(nullptr, void print_clb_placement(const char* fname); #endif +/** + * @brief determine the type of the bounding box used by the placer to predict the wirelength + * + * @param place_bb_mode The bounding box mode passed by the CLI + * @param rr_graph The routing resource graph + */ +static bool is_cube_bb(const e_place_bounding_box_mode place_bb_mode, + const RRGraphView& rr_graph); + static void alloc_and_load_placement_structs(float place_cost_exp, const t_placer_opts& placer_opts, const t_noc_opts& noc_opts, t_direct_inf* directs, int num_directs); -static void alloc_and_load_try_swap_structs(); +static void alloc_and_load_try_swap_structs(const bool cube_bb); static void free_try_swap_structs(); static void free_placement_structs(const t_placer_opts& placer_opts, const t_noc_opts& noc_opts); @@ -269,7 +280,11 @@ static void free_fast_cost_update(); static double comp_bb_cost(e_cost_methods method); -static void update_move_nets(int num_nets_affected); +static double comp_layer_bb_cost(e_cost_methods method); + +static void update_move_nets(int num_nets_affected, + const bool cube_bb); + static void reset_move_nets(int num_nets_affected); static e_move_result try_swap(const t_annealing_state* state, @@ -304,7 +319,20 @@ static int check_placement_consistency(); static int check_block_placement_consistency(); static int check_macro_placement_consistency(); -static float starting_t(const t_annealing_state* state, t_placer_costs* costs, t_annealing_sched annealing_sched, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, SetupTimingInfo* timing_info, MoveGenerator& move_generator, ManualMoveGenerator& manual_move_generator, NetPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts, const t_noc_opts& noc_opts, MoveTypeStat& move_type_stat); +static float starting_t(const t_annealing_state* state, + t_placer_costs* costs, + 
t_annealing_sched annealing_sched, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + SetupTimingInfo* timing_info, + MoveGenerator& move_generator, + ManualMoveGenerator& manual_move_generator, + NetPinTimingInvalidator* pin_timing_invalidator, + t_pl_blocks_to_be_moved& blocks_affected, + const t_placer_opts& placer_opts, + const t_noc_opts& noc_opts, + MoveTypeStat& move_type_stat); static int count_connections(); @@ -326,9 +354,69 @@ static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks); static e_move_result assess_swap(double delta_c, double t); -static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new); - -static void update_bb(ClusterNetId net_id, t_bb* bb_coord_new, t_bb* bb_edge_new, int xold, int yold, int xnew, int ynew); +static void get_non_updateable_bb(ClusterNetId net_id, + t_bb& bb_coord_new, + vtr::NdMatrixProxy num_sink_pin_layer); + +static void get_non_updateable_layer_bb(ClusterNetId net_id, + std::vector& bb_coord_new, + vtr::NdMatrixProxy num_sink_layer); + +static void update_bb(ClusterNetId net_id, + t_bb& bb_edge_new, + t_bb& bb_coord_new, + vtr::NdMatrixProxy num_sink_pin_layer_new, + t_physical_tile_loc pin_old_loc, + t_physical_tile_loc pin_new_loc, + bool src_pin); + +static void update_layer_bb(ClusterNetId net_id, + std::vector& bb_edge_new, + std::vector& bb_coord_new, + vtr::NdMatrixProxy bb_pin_sink_count_new, + t_physical_tile_loc pin_old_loc, + t_physical_tile_loc pin_new_loc, + bool is_output_pin); + +static inline void update_bb_same_layer(ClusterNetId net_id, + const t_physical_tile_loc& pin_old_loc, + const t_physical_tile_loc& pin_new_loc, + const std::vector& curr_bb_edge, + const std::vector& curr_bb_coord, + vtr::NdMatrixProxy bb_pin_sink_count_new, + std::vector& bb_edge_new, + std::vector& bb_coord_new); + +static inline void update_bb_layer_changed(ClusterNetId net_id, + const t_physical_tile_loc& pin_old_loc, + const t_physical_tile_loc& pin_new_loc, + const std::vector& curr_bb_edge, + const std::vector& curr_bb_coord, + vtr::NdMatrixProxy bb_pin_sink_count_new, + std::vector& bb_edge_new, + std::vector& bb_coord_new); + +static void update_bb_pin_sink_count(ClusterNetId net_id, + const t_physical_tile_loc& pin_old_loc, + const t_physical_tile_loc& pin_new_loc, + const vtr::NdMatrixProxy curr_layer_pin_sink_count, + vtr::NdMatrixProxy bb_pin_sink_count_new, + bool is_output_pin); + +static inline void update_bb_edge(ClusterNetId net_id, + std::vector& bb_edge_new, + std::vector& bb_coord_new, + vtr::NdMatrixProxy bb_layer_pin_sink_count, + const int& old_num_block_on_edge, + const int& old_edge_coord, + int& new_num_block_on_edge, + int& new_edge_coord); + +static void add_block_to_bb(const t_physical_tile_loc& new_pin_loc, + const t_2D_bb& bb_edge_old, + const t_2D_bb& bb_coord_old, + t_2D_bb& bb_edge_new, + t_2D_bb& bb_coord_new); static int find_affected_nets_and_update_costs( const t_place_algorithm& place_algorithm, @@ -345,6 +433,13 @@ static void update_net_bb(const ClusterNetId net, int iblk, const ClusterBlockId blk, const ClusterPinId blk_pin); + +static void update_net_layer_bb(const ClusterNetId net, + const t_pl_blocks_to_be_moved& blocks_affected, + int iblk, + const ClusterBlockId blk, + const ClusterPinId blk_pin); + static void update_td_delta_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& criticalities, const ClusterNetId net, @@ -356,11 +451,27 @@ static void 
update_placement_cost_normalization_factors(t_placer_costs* costs, c static double get_total_cost(t_placer_costs* costs, const t_placer_opts& placer_opts, const t_noc_opts& noc_opts); -static double get_net_cost(ClusterNetId net_id, t_bb* bb_ptr); +static double get_net_cost(ClusterNetId net_id, const t_bb& bbptr); + +static double get_net_layer_cost(ClusterNetId /* net_id */, + const std::vector& bbptr, + const vtr::NdMatrixProxy layer_pin_sink_count); -static void get_bb_from_scratch(ClusterNetId net_id, t_bb* coords, t_bb* num_on_edges); +static void get_bb_from_scratch(ClusterNetId net_id, + t_bb& coords, + t_bb& num_on_edges, + vtr::NdMatrixProxy num_sink_pin_layer); -static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr); +static void get_layer_bb_from_scratch(ClusterNetId net_id, + std::vector& num_on_edges, + std::vector& coords, + vtr::NdMatrixProxy layer_pin_sink_count); + +static double get_net_wirelength_estimate(ClusterNetId net_id, const t_bb& bbptr); + +static double get_net_layer_wirelength_estimate(ClusterNetId /* net_id */, + const std::vector& bbptr, + const vtr::NdMatrixProxy layer_pin_sink_count); static void free_try_swap_arrays(); @@ -508,6 +619,7 @@ void try_place(const Netlist<>& net_list, if (placer_opts.place_algorithm.is_timing_driven()) { /*do this before the initial placement to avoid messing up the initial placement */ place_delay_model = alloc_lookups_and_delay_model(net_list, + device_ctx.arch_switch_inf, chan_width_dist, placer_opts, router_opts, @@ -523,6 +635,14 @@ void try_place(const Netlist<>& net_list, } } + g_vpr_ctx.mutable_placement().cube_bb = is_cube_bb(placer_opts.place_bounding_box_mode, + device_ctx.rr_graph); + const auto& cube_bb = g_vpr_ctx.placement().cube_bb; + + VTR_LOG("\n"); + VTR_LOG("Bounding box mode is %s\n", (cube_bb ? "Cube" : "Per-layer")); + VTR_LOG("\n"); + int move_lim = 1; move_lim = (int)(annealing_sched.inner_num * pow(net_list.blocks().size(), 1.3333)); @@ -549,6 +669,12 @@ void try_place(const Netlist<>& net_list, placer_opts.constraints_file.c_str(), noc_opts.noc); + if (!placer_opts.write_initial_place_file.empty()) { + print_place(nullptr, + nullptr, + (placer_opts.write_initial_place_file + ".init.place").c_str()); + } + #ifdef ENABLE_ANALYTIC_PLACE /* * Analytic Placer: @@ -578,7 +704,12 @@ void try_place(const Netlist<>& net_list, /* Gets initial cost and loads bounding boxes. 
*/
     if (placer_opts.place_algorithm.is_timing_driven()) {
-        costs.bb_cost = comp_bb_cost(NORMAL);
+        if (cube_bb) {
+            costs.bb_cost = comp_bb_cost(NORMAL);
+        } else {
+            VTR_ASSERT_SAFE(!cube_bb);
+            costs.bb_cost = comp_layer_bb_cost(NORMAL);
+        }

         first_crit_exponent = placer_opts.td_place_exp_first; /*this will be modified when rlim starts to change */
@@ -658,7 +789,12 @@ void try_place(const Netlist<>& net_list,
         VTR_ASSERT(placer_opts.place_algorithm == BOUNDING_BOX_PLACE);

         /* Total cost is the same as wirelength cost normalized*/
-        costs.bb_cost = comp_bb_cost(NORMAL);
+        if (cube_bb) {
+            costs.bb_cost = comp_bb_cost(NORMAL);
+        } else {
+            VTR_ASSERT_SAFE(!cube_bb);
+            costs.bb_cost = comp_layer_bb_cost(NORMAL);
+        }
         costs.bb_cost_norm = 1 / costs.bb_cost;

         /* Timing cost and normalization factors are not used */
@@ -684,8 +820,11 @@ void try_place(const Netlist<>& net_list,
     costs.cost = get_total_cost(&costs, placer_opts, noc_opts);

     //Sanity check that initial placement is legal
-    check_place(costs, place_delay_model.get(), placer_criticalities.get(),
-                placer_opts.place_algorithm, noc_opts);
+    check_place(costs,
+                place_delay_model.get(),
+                placer_criticalities.get(),
+                placer_opts.place_algorithm,
+                noc_opts);

     //Initial placement statistics
     VTR_LOG("Initial placement cost: %g bb_cost: %g td_cost: %g\n", costs.cost,
@@ -985,8 +1124,11 @@ void try_place(const Netlist<>& net_list,
         place_sync_external_block_connections(block_id);
     }

-    check_place(costs, place_delay_model.get(), placer_criticalities.get(),
-                placer_opts.place_algorithm, noc_opts);
+    check_place(costs,
+                place_delay_model.get(),
+                placer_criticalities.get(),
+                placer_opts.place_algorithm,
+                noc_opts);

     //Some stats
     VTR_LOG("\n");
@@ -1134,7 +1276,8 @@ static void placement_inner_loop(const t_annealing_state* state,
         e_move_result swap_result = try_swap(state, costs, move_generator, manual_move_generator,
                                              timing_info, pin_timing_invalidator,
                                              blocks_affected, delay_model, criticalities, setup_slacks,
-                                             placer_opts, noc_opts, move_type_stat, place_algorithm, timing_bb_factor, manual_move_enabled);
+                                             placer_opts, noc_opts, move_type_stat, place_algorithm,
+                                             timing_bb_factor, manual_move_enabled);

         if (swap_result == ACCEPTED) {
             /* Move was accepted. Update statistics that are useful for the annealing schedule. */
@@ -1292,7 +1435,20 @@ static int count_connections() {
 }
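The cube/per-layer split seen above is the core of this change: comp_bb_cost works on one 3D box per net, while comp_layer_bb_cost works on one 2D box per layer. Stripped of VPR's crossing-count and channel-capacity weighting (applied in get_net_cost and get_net_layer_cost later in this file), the two wirelength estimates differ roughly as in this hypothetical sketch (stand-in Pin type, not the project's data structures):

```cpp
#include <algorithm>
#include <map>
#include <vector>

struct Pin { int x, y, layer; };

// Semi-perimeter of one box over all pins (the "cube" flavour; the layer span
// is ignored in this simplified sketch). Assumes at least one pin.
int cube_bb_hpwl(const std::vector<Pin>& pins) {
    int xmin = pins[0].x, xmax = pins[0].x, ymin = pins[0].y, ymax = pins[0].y;
    for (const Pin& p : pins) {
        xmin = std::min(xmin, p.x); xmax = std::max(xmax, p.x);
        ymin = std::min(ymin, p.y); ymax = std::max(ymax, p.y);
    }
    return (xmax - xmin + 1) + (ymax - ymin + 1);
}

// Sum of semi-perimeters of one box per layer (the "per-layer" flavour).
int per_layer_bb_hpwl(const std::vector<Pin>& pins) {
    std::map<int, std::vector<Pin>> by_layer;
    for (const Pin& p : pins) by_layer[p.layer].push_back(p);
    int total = 0;
    for (const auto& entry : by_layer) total += cube_bb_hpwl(entry.second);
    return total;
}
```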
 ///@brief Find the starting temperature for the annealing loop.
-static float starting_t(const t_annealing_state* state, t_placer_costs* costs, t_annealing_sched annealing_sched, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, SetupTimingInfo* timing_info, MoveGenerator& move_generator, ManualMoveGenerator& manual_move_generator, NetPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts, const t_noc_opts& noc_opts, MoveTypeStat& move_type_stat) {
+static float starting_t(const t_annealing_state* state,
+                        t_placer_costs* costs,
+                        t_annealing_sched annealing_sched,
+                        const PlaceDelayModel* delay_model,
+                        PlacerCriticalities* criticalities,
+                        PlacerSetupSlacks* setup_slacks,
+                        SetupTimingInfo* timing_info,
+                        MoveGenerator& move_generator,
+                        ManualMoveGenerator& manual_move_generator,
+                        NetPinTimingInvalidator* pin_timing_invalidator,
+                        t_pl_blocks_to_be_moved& blocks_affected,
+                        const t_placer_opts& placer_opts,
+                        const t_noc_opts& noc_opts,
+                        MoveTypeStat& move_type_stat) {
     if (annealing_sched.type == USER_SCHED) {
         return (annealing_sched.init_t);
     }
@@ -1370,7 +1526,8 @@ static float starting_t(const t_annealing_state* state, t_placer_costs* costs, t
     return init_temp;
 }

-static void update_move_nets(int num_nets_affected) {
+static void update_move_nets(int num_nets_affected,
+                             const bool cube_bb) {
     /* update net cost functions and reset flags. */
     auto& cluster_ctx = g_vpr_ctx.clustering();
     auto& place_move_ctx = g_placer_ctx.mutable_move();
@@ -1379,9 +1536,23 @@ static void update_move_nets(int num_nets_affected) {
          inet_affected++) {
         ClusterNetId net_id = ts_nets_to_update[inet_affected];

-        place_move_ctx.bb_coords[net_id] = ts_bb_coord_new[net_id];
-        if (cluster_ctx.clb_nlist.net_sinks(net_id).size() >= SMALL_NET)
-            place_move_ctx.bb_num_on_edges[net_id] = ts_bb_edge_new[net_id];
+        if (cube_bb) {
+            place_move_ctx.bb_coords[net_id] = ts_bb_coord_new[net_id];
+        } else {
+            place_move_ctx.layer_bb_coords[net_id] = layer_ts_bb_coord_new[net_id];
+        }
+
+        for (int layer_num = 0; layer_num < g_vpr_ctx.device().grid.get_num_layers(); layer_num++) {
+            place_move_ctx.num_sink_pin_layer[size_t(net_id)][layer_num] = ts_layer_sink_pin_count[size_t(net_id)][layer_num];
+        }
+
+        if (cluster_ctx.clb_nlist.net_sinks(net_id).size() >= SMALL_NET) {
+            if (cube_bb) {
+                place_move_ctx.bb_num_on_edges[net_id] = ts_bb_edge_new[net_id];
+            } else {
+                place_move_ctx.layer_bb_num_on_edges[net_id] = layer_ts_bb_edge_new[net_id];
+            }
+        }

         net_cost[net_id] = proposed_net_cost[net_id];
@@ -1637,7 +1808,8 @@ static e_move_result try_swap(const t_annealing_state* state,
         }

         /* Update net cost functions and reset flags. */
-        update_move_nets(num_nets_affected);
+        update_move_nets(num_nets_affected,
+                         g_vpr_ctx.placement().cube_bb);

         /* Update clb data structures since we kept the move. */
         commit_move_blocks(blocks_affected);
@@ -1746,6 +1918,37 @@ static e_move_result try_swap(const t_annealing_state* state,
     return move_outcome;
 }

+static bool is_cube_bb(const e_place_bounding_box_mode place_bb_mode,
+                       const RRGraphView& rr_graph) {
+    bool cube_bb;
+    const int number_layers = g_vpr_ctx.device().grid.get_num_layers();
+
+    // If the FPGA has only one layer, then we can only use the cube bounding box
+    if (number_layers == 1) {
+        cube_bb = true;
+    } else {
+        VTR_ASSERT(number_layers > 1);
+        if (place_bb_mode == AUTO_BB) {
+            // If auto_bb is used, we analyze the RR graph to see whether there is any inter-layer connection that does not
+            // originate from an OPIN.
If there is any, cube BB is chosen, otherwise, per-layer bb is chosen. + if (inter_layer_connections_limited_to_opin(rr_graph)) { + cube_bb = false; + } else { + cube_bb = true; + } + } else if (place_bb_mode == CUBE_BB) { + // The user has specifically asked for CUBE_BB + cube_bb = true; + } else { + // The user has specifically asked for PER_LAYER_BB + VTR_ASSERT_SAFE(place_bb_mode == PER_LAYER_BB); + cube_bb = false; + } + } + + return cube_bb; +} + /** * @brief Find all the nets and pins affected by this swap and update costs. * @@ -1781,6 +1984,8 @@ static int find_affected_nets_and_update_costs( int num_affected_nets = 0; + const auto& cube_bb = g_vpr_ctx.placement().cube_bb; + /* Go through all the blocks moved. */ for (int iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) { ClusterBlockId blk = blocks_affected.moved_blocks[iblk].block_num; @@ -1800,7 +2005,11 @@ static int find_affected_nets_and_update_costs( record_affected_net(net_id, num_affected_nets); /* Update the net bounding boxes. */ - update_net_bb(net_id, blocks_affected, iblk, blk, blk_pin); + if (cube_bb) { + update_net_bb(net_id, blocks_affected, iblk, blk, blk_pin); + } else { + update_net_layer_bb(net_id, blocks_affected, iblk, blk, blk_pin); + } if (place_algorithm.is_timing_driven()) { /* Determine the change in connection delay and timing cost. */ @@ -1816,8 +2025,15 @@ static int find_affected_nets_and_update_costs( inet_affected++) { ClusterNetId net_id = ts_nets_to_update[inet_affected]; - proposed_net_cost[net_id] = get_net_cost(net_id, - &ts_bb_coord_new[net_id]); + if (cube_bb) { + proposed_net_cost[net_id] = get_net_cost(net_id, + ts_bb_coord_new[net_id]); + } else { + proposed_net_cost[net_id] = get_net_layer_cost(net_id, + layer_ts_bb_coord_new[net_id], + ts_layer_sink_pin_count[size_t(net_id)]); + } + bb_delta_c += proposed_net_cost[net_id] - net_cost[net_id]; } @@ -1855,24 +2071,78 @@ static void update_net_bb(const ClusterNetId net, //For small nets brute-force bounding box update is faster if (bb_updated_before[net] == NOT_UPDATED_YET) { //Only once per-net - get_non_updateable_bb(net, &ts_bb_coord_new[net]); + get_non_updateable_bb(net, + ts_bb_coord_new[net], + ts_layer_sink_pin_count[size_t(net)]); } } else { //For large nets, update bounding box incrementally int iblk_pin = tile_pin_index(blk_pin); + bool src_pin = cluster_ctx.clb_nlist.pin_type(blk_pin) == PinType::DRIVER; t_physical_tile_type_ptr blk_type = physical_tile_type(blk); int pin_width_offset = blk_type->pin_width_offset[iblk_pin]; int pin_height_offset = blk_type->pin_height_offset[iblk_pin]; //Incremental bounding box update - update_bb(net, &ts_bb_coord_new[net], &ts_bb_edge_new[net], - blocks_affected.moved_blocks[iblk].old_loc.x + pin_width_offset, - blocks_affected.moved_blocks[iblk].old_loc.y - + pin_height_offset, - blocks_affected.moved_blocks[iblk].new_loc.x + pin_width_offset, - blocks_affected.moved_blocks[iblk].new_loc.y - + pin_height_offset); + t_physical_tile_loc pin_old_loc( + blocks_affected.moved_blocks[iblk].old_loc.x + pin_width_offset, + blocks_affected.moved_blocks[iblk].old_loc.y + pin_height_offset, + blocks_affected.moved_blocks[iblk].old_loc.layer); + t_physical_tile_loc pin_new_loc( + blocks_affected.moved_blocks[iblk].new_loc.x + pin_width_offset, + blocks_affected.moved_blocks[iblk].new_loc.y + pin_height_offset, + blocks_affected.moved_blocks[iblk].new_loc.layer); + update_bb(net, + ts_bb_edge_new[net], + ts_bb_coord_new[net], + ts_layer_sink_pin_count[size_t(net)], + pin_old_loc, + pin_new_loc, 
+ src_pin); + } +} + +static void update_net_layer_bb(const ClusterNetId net, + const t_pl_blocks_to_be_moved& blocks_affected, + int iblk, + const ClusterBlockId blk, + const ClusterPinId blk_pin) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + + if (cluster_ctx.clb_nlist.net_sinks(net).size() < SMALL_NET) { + //For small nets brute-force bounding box update is faster + + if (bb_updated_before[net] == NOT_UPDATED_YET) { //Only once per-net + get_non_updateable_layer_bb(net, + layer_ts_bb_coord_new[net], + ts_layer_sink_pin_count[size_t(net)]); + } + } else { + //For large nets, update bounding box incrementally + int iblk_pin = tile_pin_index(blk_pin); + + t_physical_tile_type_ptr blk_type = physical_tile_type(blk); + int pin_width_offset = blk_type->pin_width_offset[iblk_pin]; + int pin_height_offset = blk_type->pin_height_offset[iblk_pin]; + + //Incremental bounding box update + t_physical_tile_loc pin_old_loc( + blocks_affected.moved_blocks[iblk].old_loc.x + pin_width_offset, + blocks_affected.moved_blocks[iblk].old_loc.y + pin_height_offset, + blocks_affected.moved_blocks[iblk].old_loc.layer); + t_physical_tile_loc pin_new_loc( + blocks_affected.moved_blocks[iblk].new_loc.x + pin_width_offset, + blocks_affected.moved_blocks[iblk].new_loc.y + pin_height_offset, + blocks_affected.moved_blocks[iblk].new_loc.layer); + auto pin_dir = get_pin_type_from_pin_physical_num(blk_type, iblk_pin); + update_layer_bb(net, + layer_ts_bb_edge_new[net], + layer_ts_bb_coord_new[net], + ts_layer_sink_pin_count[size_t(net)], + pin_old_loc, + pin_new_loc, + pin_dir == e_pin_type::DRIVER); } } @@ -2237,19 +2507,61 @@ static double comp_bb_cost(e_cost_methods method) { * so they can use a fast bounding box calculator. */ if (cluster_ctx.clb_nlist.net_sinks(net_id).size() >= SMALL_NET && method == NORMAL) { - get_bb_from_scratch(net_id, &place_move_ctx.bb_coords[net_id], - &place_move_ctx.bb_num_on_edges[net_id]); + get_bb_from_scratch(net_id, + place_move_ctx.bb_coords[net_id], + place_move_ctx.bb_num_on_edges[net_id], + place_move_ctx.num_sink_pin_layer[size_t(net_id)]); } else { get_non_updateable_bb(net_id, - &place_move_ctx.bb_coords[net_id]); + place_move_ctx.bb_coords[net_id], + place_move_ctx.num_sink_pin_layer[size_t(net_id)]); } - net_cost[net_id] = get_net_cost(net_id, - &place_move_ctx.bb_coords[net_id]); + net_cost[net_id] = get_net_cost(net_id, place_move_ctx.bb_coords[net_id]); cost += net_cost[net_id]; if (method == CHECK) - expected_wirelength += get_net_wirelength_estimate(net_id, - &place_move_ctx.bb_coords[net_id]); + expected_wirelength += get_net_wirelength_estimate(net_id, place_move_ctx.bb_coords[net_id]); + } + } + + if (method == CHECK) { + VTR_LOG("\n"); + VTR_LOG("BB estimate of min-dist (placement) wire length: %.0f\n", + expected_wirelength); + } + return cost; +} + +static double comp_layer_bb_cost(e_cost_methods method) { + double cost = 0; + double expected_wirelength = 0.0; + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& place_move_ctx = g_placer_ctx.mutable_move(); + + for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* for each net ... */ + if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { /* Do only if not ignored. */ + /* Small nets don't use incremental updating on their bounding boxes, * + * so they can use a fast bounding box calculator. 
*/ + if (cluster_ctx.clb_nlist.net_sinks(net_id).size() >= SMALL_NET + && method == NORMAL) { + get_layer_bb_from_scratch(net_id, + place_move_ctx.layer_bb_num_on_edges[net_id], + place_move_ctx.layer_bb_coords[net_id], + place_move_ctx.num_sink_pin_layer[size_t(net_id)]); + } else { + get_non_updateable_layer_bb(net_id, + place_move_ctx.layer_bb_coords[net_id], + place_move_ctx.num_sink_pin_layer[size_t(net_id)]); + } + + net_cost[net_id] = get_net_layer_cost(net_id, + place_move_ctx.layer_bb_coords[net_id], + place_move_ctx.num_sink_pin_layer[size_t(net_id)]); + cost += net_cost[net_id]; + if (method == CHECK) + expected_wirelength += get_net_layer_wirelength_estimate(net_id, + place_move_ctx.layer_bb_coords[net_id], + place_move_ctx.num_sink_pin_layer[size_t(net_id)]); } } @@ -2275,11 +2587,15 @@ static void alloc_and_load_placement_structs(float place_cost_exp, const auto& cluster_ctx = g_vpr_ctx.clustering(); auto& place_ctx = g_vpr_ctx.mutable_placement(); + const auto& cube_bb = place_ctx.cube_bb; + auto& p_timing_ctx = g_placer_ctx.mutable_timing(); auto& place_move_ctx = g_placer_ctx.mutable_move(); size_t num_nets = cluster_ctx.clb_nlist.nets().size(); + const int num_layers = device_ctx.grid.get_num_layers(); + init_placement_context(); max_pins_per_clb = 0; @@ -2323,8 +2639,21 @@ static void alloc_and_load_placement_structs(float place_cost_exp, net_cost.resize(num_nets, -1.); proposed_net_cost.resize(num_nets, -1.); - place_move_ctx.bb_coords.resize(num_nets, t_bb()); - place_move_ctx.bb_num_on_edges.resize(num_nets, t_bb()); + + if (cube_bb) { + place_move_ctx.bb_coords.resize(num_nets, t_bb()); + place_move_ctx.bb_num_on_edges.resize(num_nets, t_bb()); + } else { + VTR_ASSERT_SAFE(!cube_bb); + place_move_ctx.layer_bb_num_on_edges.resize(num_nets, std::vector(num_layers, t_2D_bb())); + place_move_ctx.layer_bb_coords.resize(num_nets, std::vector(num_layers, t_2D_bb())); + } + + place_move_ctx.num_sink_pin_layer.resize({num_nets, size_t(num_layers)}); + for (size_t flat_idx = 0; flat_idx < ts_layer_sink_pin_count.size(); flat_idx++) { + auto& elem = ts_layer_sink_pin_count.get(flat_idx); + elem = OPEN; + } /* Used to store costs for moves not yet made and to indicate when a net's * * cost has been recomputed. proposed_net_cost[inet] < 0 means net's cost hasn't * @@ -2333,7 +2662,7 @@ static void alloc_and_load_placement_structs(float place_cost_exp, alloc_and_load_for_fast_cost_update(place_cost_exp); - alloc_and_load_try_swap_structs(); + alloc_and_load_try_swap_structs(cube_bb); place_ctx.pl_macros = alloc_and_load_placement_macros(directs, num_directs); @@ -2362,8 +2691,13 @@ static void free_placement_structs(const t_placer_opts& placer_opts, const t_noc vtr::release_memory(net_cost); vtr::release_memory(proposed_net_cost); - vtr::release_memory(place_move_ctx.bb_coords); vtr::release_memory(place_move_ctx.bb_num_on_edges); + vtr::release_memory(place_move_ctx.bb_coords); + + vtr::release_memory(place_move_ctx.layer_bb_num_on_edges); + vtr::release_memory(place_move_ctx.layer_bb_coords); + + place_move_ctx.num_sink_pin_layer.clear(); vtr::release_memory(bb_updated_before); @@ -2376,15 +2710,30 @@ static void free_placement_structs(const t_placer_opts& placer_opts, const t_noc } } -static void alloc_and_load_try_swap_structs() { +static void alloc_and_load_try_swap_structs(const bool cube_bb) { /* Allocate the local bb_coordinate storage, etc. only once. */ /* Allocate with size cluster_ctx.clb_nlist.nets().size() for any number of nets affected. 
*/ auto& cluster_ctx = g_vpr_ctx.clustering(); size_t num_nets = cluster_ctx.clb_nlist.nets().size(); - ts_bb_coord_new.resize(num_nets, t_bb()); - ts_bb_edge_new.resize(num_nets, t_bb()); + const int num_layers = g_vpr_ctx.device().grid.get_num_layers(); + + if (cube_bb) { + ts_bb_edge_new.resize(num_nets, t_bb()); + ts_bb_coord_new.resize(num_nets, t_bb()); + } else { + VTR_ASSERT_SAFE(!cube_bb); + layer_ts_bb_edge_new.resize(num_nets, std::vector(num_layers, t_2D_bb())); + layer_ts_bb_coord_new.resize(num_nets, std::vector(num_layers, t_2D_bb())); + } + + ts_layer_sink_pin_count.resize({num_nets, size_t(num_layers)}); + for (size_t flat_idx = 0; flat_idx < ts_layer_sink_pin_count.size(); flat_idx++) { + auto& elem = ts_layer_sink_pin_count.get(flat_idx); + elem = OPEN; + } + ts_nets_to_update.resize(num_nets, ClusterNetId::INVALID()); auto& place_ctx = g_vpr_ctx.mutable_placement(); @@ -2392,8 +2741,11 @@ static void alloc_and_load_try_swap_structs() { } static void free_try_swap_structs() { - vtr::release_memory(ts_bb_coord_new); vtr::release_memory(ts_bb_edge_new); + vtr::release_memory(ts_bb_coord_new); + vtr::release_memory(layer_ts_bb_edge_new); + vtr::release_memory(layer_ts_bb_coord_new); + ts_layer_sink_pin_count.clear(); vtr::release_memory(ts_nets_to_update); auto& place_ctx = g_vpr_ctx.mutable_placement(); @@ -2404,8 +2756,11 @@ static void free_try_swap_structs() { * from only the block location information). It updates both the * * coordinate and number of pins on each edge information. It * * should only be called when the bounding box information is not valid. */ -static void get_bb_from_scratch(ClusterNetId net_id, t_bb* coords, t_bb* num_on_edges) { - int pnum, x, y, xmin, xmax, ymin, ymax; +static void get_bb_from_scratch(ClusterNetId net_id, + t_bb& coords, + t_bb& num_on_edges, + vtr::NdMatrixProxy num_sink_pin_layer) { + int pnum, x, y, pin_layer, xmin, xmax, ymin, ymax; int xmin_edge, xmax_edge, ymin_edge, ymax_edge; auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -2433,6 +2788,10 @@ static void get_bb_from_scratch(ClusterNetId net_id, t_bb* coords, t_bb* num_on_ xmax_edge = 1; ymax_edge = 1; + for (int layer_num = 0; layer_num < grid.get_num_layers(); layer_num++) { + num_sink_pin_layer[layer_num] = 0; + } + for (auto pin_id : cluster_ctx.clb_nlist.net_sinks(net_id)) { bnum = cluster_ctx.clb_nlist.pin_block(pin_id); pnum = tile_pin_index(pin_id); @@ -2440,6 +2799,7 @@ static void get_bb_from_scratch(ClusterNetId net_id, t_bb* coords, t_bb* num_on_ + physical_tile_type(bnum)->pin_width_offset[pnum]; y = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum]; + pin_layer = place_ctx.block_locs[bnum].loc.layer; /* Code below counts IO blocks as being within the 1..grid.width()-2, 1..grid.height()-2 clb array. * * This is because channels do not go out of the 0..grid.width()-2, 0..grid.height()-2 range, and * @@ -2476,19 +2836,134 @@ static void get_bb_from_scratch(ClusterNetId net_id, t_bb* coords, t_bb* num_on_ ymax = y; ymax_edge = 1; } + + num_sink_pin_layer[pin_layer]++; } /* Copy the coordinates and number on edges information into the proper * * structures. 
*/
-    coords->xmin = xmin;
-    coords->xmax = xmax;
-    coords->ymin = ymin;
-    coords->ymax = ymax;
+    coords.xmin = xmin;
+    coords.xmax = xmax;
+    coords.ymin = ymin;
+    coords.ymax = ymax;
+
+    num_on_edges.xmin = xmin_edge;
+    num_on_edges.xmax = xmax_edge;
+    num_on_edges.ymin = ymin_edge;
+    num_on_edges.ymax = ymax_edge;
+}
+
+/* This routine finds the bounding box of each net from scratch when the bounding box is of type per-layer (i.e. *
+ * from only the block location information). It updates the                                                      *
+ * coordinate, number of pins on each edge information, and the number of sinks on each layer. It                 *
+ * should only be called when the bounding box information is not valid.                                          */
+static void get_layer_bb_from_scratch(ClusterNetId net_id,
+                                      std::vector<t_2D_bb>& num_on_edges,
+                                      std::vector<t_2D_bb>& coords,
+                                      vtr::NdMatrixProxy<int, 1> layer_pin_sink_count) {
+    auto& device_ctx = g_vpr_ctx.device();
+    const int num_layers = device_ctx.grid.get_num_layers();
+    std::vector<int> xmin(num_layers, OPEN);
+    std::vector<int> xmax(num_layers, OPEN);
+    std::vector<int> ymin(num_layers, OPEN);
+    std::vector<int> ymax(num_layers, OPEN);
+    std::vector<int> xmin_edge(num_layers, OPEN);
+    std::vector<int> xmax_edge(num_layers, OPEN);
+    std::vector<int> ymin_edge(num_layers, OPEN);
+    std::vector<int> ymax_edge(num_layers, OPEN);
+
+    std::vector<int> num_sink_pin_layer(num_layers, 0);
+
+    auto& cluster_ctx = g_vpr_ctx.clustering();
+    auto& place_ctx = g_vpr_ctx.placement();
+    auto& grid = device_ctx.grid;

-    num_on_edges->xmin = xmin_edge;
-    num_on_edges->xmax = xmax_edge;
-    num_on_edges->ymin = ymin_edge;
-    num_on_edges->ymax = ymax_edge;
+    ClusterBlockId bnum = cluster_ctx.clb_nlist.net_driver_block(net_id);
+    int pnum_src = net_pin_to_tile_pin_index(net_id, 0);
+    VTR_ASSERT(pnum_src >= 0);
+    int x_src = place_ctx.block_locs[bnum].loc.x
+                + physical_tile_type(bnum)->pin_width_offset[pnum_src];
+    int y_src = place_ctx.block_locs[bnum].loc.y
+                + physical_tile_type(bnum)->pin_height_offset[pnum_src];
+
+    x_src = max(min(x_src, grid.width() - 2), 1);
+    y_src = max(min(y_src, grid.height() - 2), 1);
+
+    for (int layer_num = 0; layer_num < num_layers; layer_num++) {
+        xmin[layer_num] = x_src;
+        ymin[layer_num] = y_src;
+        xmax[layer_num] = x_src;
+        ymax[layer_num] = y_src;
+        xmin_edge[layer_num] = 1;
+        ymin_edge[layer_num] = 1;
+        xmax_edge[layer_num] = 1;
+        ymax_edge[layer_num] = 1;
+    }
+
+    for (auto pin_id : cluster_ctx.clb_nlist.net_sinks(net_id)) {
+        bnum = cluster_ctx.clb_nlist.pin_block(pin_id);
+        int pnum = tile_pin_index(pin_id);
+        int layer = place_ctx.block_locs[bnum].loc.layer;
+        VTR_ASSERT(layer >= 0 && layer < num_layers);
+        num_sink_pin_layer[layer]++;
+        int x = place_ctx.block_locs[bnum].loc.x
+                + physical_tile_type(bnum)->pin_width_offset[pnum];
+        int y = place_ctx.block_locs[bnum].loc.y
+                + physical_tile_type(bnum)->pin_height_offset[pnum];
+
+        /* Code below counts IO blocks as being within the 1..grid.width()-2, 1..grid.height()-2 clb array. *
+         * This is because channels do not go out of the 0..grid.width()-2, 0..grid.height()-2 range, and   *
+         * I always take all channels impinging on the bounding box to be within                            *
+         * that bounding box. Hence, this "movement" of IO blocks does not affect                           *
+         * which channels are included within the bounding box, and it                                      *
+         * simplifies the code a lot.
*/ + + x = max(min(x, grid.width() - 2), 1); //-2 for no perim channels + y = max(min(y, grid.height() - 2), 1); //-2 for no perim channels + + if (x == xmin[layer]) { + xmin_edge[layer]++; + } + if (x == xmax[layer]) { /* Recall that xmin could equal xmax -- don't use else */ + xmax_edge[layer]++; + } else if (x < xmin[layer]) { + xmin[layer] = x; + xmin_edge[layer] = 1; + } else if (x > xmax[layer]) { + xmax[layer] = x; + xmax_edge[layer] = 1; + } + + if (y == ymin[layer]) { + ymin_edge[layer]++; + } + if (y == ymax[layer]) { + ymax_edge[layer]++; + } else if (y < ymin[layer]) { + ymin[layer] = y; + ymin_edge[layer] = 1; + } else if (y > ymax[layer]) { + ymax[layer] = y; + ymax_edge[layer] = 1; + } + } + + /* Copy the coordinates and number on edges information into the proper * + * structures. */ + for (int layer_num = 0; layer_num < num_layers; layer_num++) { + layer_pin_sink_count[layer_num] = num_sink_pin_layer[layer_num]; + coords[layer_num].xmin = xmin[layer_num]; + coords[layer_num].xmax = xmax[layer_num]; + coords[layer_num].ymin = ymin[layer_num]; + coords[layer_num].ymax = ymax[layer_num]; + coords[layer_num].layer_num = layer_num; + + num_on_edges[layer_num].xmin = xmin_edge[layer_num]; + num_on_edges[layer_num].xmax = xmax_edge[layer_num]; + num_on_edges[layer_num].ymin = ymin_edge[layer_num]; + num_on_edges[layer_num].ymax = ymax_edge[layer_num]; + num_on_edges[layer_num].layer_num = layer_num; + } } static double wirelength_crossing_count(size_t fanout) { @@ -2502,7 +2977,7 @@ static double wirelength_crossing_count(size_t fanout) { } } -static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr) { +static double get_net_wirelength_estimate(ClusterNetId net_id, const t_bb& bbptr) { /* WMF: Finds the estimate of wirelength due to one net by looking at * * its coordinate bounding box. */ @@ -2519,14 +2994,46 @@ static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr) { /* Cost = wire length along channel * cross_count / average * * channel capacity. Do this for x, then y direction and add. */ - ncost = (bbptr->xmax - bbptr->xmin + 1) * crossing; + ncost = (bbptr.xmax - bbptr.xmin + 1) * crossing; + + ncost += (bbptr.ymax - bbptr.ymin + 1) * crossing; + + return (ncost); +} + +static double get_net_layer_wirelength_estimate(ClusterNetId /* net_id */, + const std::vector& bbptr, + const vtr::NdMatrixProxy layer_pin_sink_count) { + /* WMF: Finds the estimate of wirelength due to one net by looking at * + * its coordinate bounding box. */ + + double ncost = 0.; + double crossing = 0.; + int num_layers = g_vpr_ctx.device().grid.get_num_layers(); + + for (int layer_num = 0; layer_num < num_layers; layer_num++) { + VTR_ASSERT(layer_pin_sink_count[layer_num] != OPEN); + if (layer_pin_sink_count[layer_num] == 0) { + continue; + } + crossing = wirelength_crossing_count(layer_pin_sink_count[layer_num] + 1); + + /* Could insert a check for xmin == xmax. In that case, assume * + * connection will be made with no bends and hence no x-cost. * + * Same thing for y-cost. */ + + /* Cost = wire length along channel * cross_count / average * + * channel capacity. Do this for x, then y direction and add. 
*/ - ncost += (bbptr->ymax - bbptr->ymin + 1) * crossing; + ncost += (bbptr[layer_num].xmax - bbptr[layer_num].xmin + 1) * crossing; + + ncost += (bbptr[layer_num].ymax - bbptr[layer_num].ymin + 1) * crossing; + } return (ncost); } -static double get_net_cost(ClusterNetId net_id, t_bb* bbptr) { +static double get_net_cost(ClusterNetId net_id, const t_bb& bbptr) { /* Finds the cost due to one net by looking at its coordinate bounding * * box. */ @@ -2543,11 +3050,45 @@ static double get_net_cost(ClusterNetId net_id, t_bb* bbptr) { /* Cost = wire length along channel * cross_count / average * * channel capacity. Do this for x, then y direction and add. */ - ncost = (bbptr->xmax - bbptr->xmin + 1) * crossing - * chanx_place_cost_fac[bbptr->ymax][bbptr->ymin - 1]; + ncost = (bbptr.xmax - bbptr.xmin + 1) * crossing + * chanx_place_cost_fac[bbptr.ymax][bbptr.ymin - 1]; - ncost += (bbptr->ymax - bbptr->ymin + 1) * crossing - * chany_place_cost_fac[bbptr->xmax][bbptr->xmin - 1]; + ncost += (bbptr.ymax - bbptr.ymin + 1) * crossing + * chany_place_cost_fac[bbptr.xmax][bbptr.xmin - 1]; + + return (ncost); +} + +static double get_net_layer_cost(ClusterNetId /* net_id */, + const std::vector& bbptr, + const vtr::NdMatrixProxy layer_pin_sink_count) { + /* Finds the cost due to one net by looking at its coordinate bounding * + * box. */ + + double ncost = 0.; + double crossing = 0.; + int num_layers = g_vpr_ctx.device().grid.get_num_layers(); + + for (int layer_num = 0; layer_num < num_layers; layer_num++) { + VTR_ASSERT(layer_pin_sink_count[layer_num] != OPEN); + if (layer_pin_sink_count[layer_num] == 0) { + continue; + } + crossing = wirelength_crossing_count(layer_pin_sink_count[layer_num] + 1); + + /* Could insert a check for xmin == xmax. In that case, assume * + * connection will be made with no bends and hence no x-cost. * + * Same thing for y-cost. */ + + /* Cost = wire length along channel * cross_count / average * + * channel capacity. Do this for x, then y direction and add. */ + + ncost += (bbptr[layer_num].xmax - bbptr[layer_num].xmin + 1) * crossing + * chanx_place_cost_fac[bbptr[layer_num].ymax][bbptr[layer_num].ymin - 1]; + + ncost += (bbptr[layer_num].ymax - bbptr[layer_num].ymin + 1) * crossing + * chany_place_cost_fac[bbptr[layer_num].xmax][bbptr[layer_num].xmin - 1]; + } return (ncost); } @@ -2559,10 +3100,12 @@ static double get_net_cost(ClusterNetId net_id, t_bb* bbptr) { * Currently assumes channels on both sides of the CLBs forming the * * edges of the bounding box can be used. Essentially, I am assuming * * the pins always lie on the outside of the bounding box. 
*/ -static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new) { +static void get_non_updateable_bb(ClusterNetId net_id, + t_bb& bb_coord_new, + vtr::NdMatrixProxy num_sink_pin_layer) { //TODO: account for multiple physical pin instances per logical pin - int xmax, ymax, xmin, ymin, x, y; + int xmax, ymax, xmin, ymin, x, y, layer; int pnum; auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -2582,6 +3125,10 @@ static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new) { xmax = x; ymax = y; + for (int layer_num = 0; layer_num < device_ctx.grid.get_num_layers(); layer_num++) { + num_sink_pin_layer[layer_num] = 0; + } + for (auto pin_id : cluster_ctx.clb_nlist.net_sinks(net_id)) { bnum = cluster_ctx.clb_nlist.pin_block(pin_id); pnum = tile_pin_index(pin_id); @@ -2589,6 +3136,7 @@ static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new) { + physical_tile_type(bnum)->pin_width_offset[pnum]; y = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum]; + layer = place_ctx.block_locs[bnum].loc.layer; if (x < xmin) { xmin = x; @@ -2601,6 +3149,8 @@ static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new) { } else if (y > ymax) { ymax = y; } + + num_sink_pin_layer[layer]++; } /* Now I've found the coordinates of the bounding box. There are no * @@ -2611,13 +3161,87 @@ static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new) { * clip to 1 in both directions as well (since minimum channel index * * is 0). See route_common.cpp for a channel diagram. */ - bb_coord_new->xmin = max(min(xmin, device_ctx.grid.width() - 2), 1); //-2 for no perim channels - bb_coord_new->ymin = max(min(ymin, device_ctx.grid.height() - 2), 1); //-2 for no perim channels - bb_coord_new->xmax = max(min(xmax, device_ctx.grid.width() - 2), 1); //-2 for no perim channels - bb_coord_new->ymax = max(min(ymax, device_ctx.grid.height() - 2), 1); //-2 for no perim channels + bb_coord_new.xmin = max(min(xmin, device_ctx.grid.width() - 2), 1); //-2 for no perim channels + bb_coord_new.ymin = max(min(ymin, device_ctx.grid.height() - 2), 1); //-2 for no perim channels + bb_coord_new.xmax = max(min(xmax, device_ctx.grid.width() - 2), 1); //-2 for no perim channels + bb_coord_new.ymax = max(min(ymax, device_ctx.grid.height() - 2), 1); //-2 for no perim channels +} + +static void get_non_updateable_layer_bb(ClusterNetId net_id, + std::vector& bb_coord_new, + vtr::NdMatrixProxy num_sink_layer) { + //TODO: account for multiple physical pin instances per logical pin + + auto& device_ctx = g_vpr_ctx.device(); + int num_layers = device_ctx.grid.get_num_layers(); + for (int layer_num = 0; layer_num < device_ctx.grid.get_num_layers(); layer_num++) { + num_sink_layer[layer_num] = 0; + } + + int pnum; + + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& place_ctx = g_vpr_ctx.placement(); + + ClusterBlockId bnum = cluster_ctx.clb_nlist.net_driver_block(net_id); + pnum = net_pin_to_tile_pin_index(net_id, 0); + + int src_x = place_ctx.block_locs[bnum].loc.x + + physical_tile_type(bnum)->pin_width_offset[pnum]; + int src_y = place_ctx.block_locs[bnum].loc.y + + physical_tile_type(bnum)->pin_height_offset[pnum]; + + std::vector xmin(num_layers, src_x); + std::vector ymin(num_layers, src_y); + std::vector xmax(num_layers, src_x); + std::vector ymax(num_layers, src_y); + + for (auto pin_id : cluster_ctx.clb_nlist.net_sinks(net_id)) { + bnum = cluster_ctx.clb_nlist.pin_block(pin_id); + pnum = tile_pin_index(pin_id); + int x = 
place_ctx.block_locs[bnum].loc.x + + physical_tile_type(bnum)->pin_width_offset[pnum]; + int y = place_ctx.block_locs[bnum].loc.y + + physical_tile_type(bnum)->pin_height_offset[pnum]; + + int layer_num = place_ctx.block_locs[bnum].loc.layer; + num_sink_layer[layer_num]++; + if (x < xmin[layer_num]) { + xmin[layer_num] = x; + } else if (x > xmax[layer_num]) { + xmax[layer_num] = x; + } + + if (y < ymin[layer_num]) { + ymin[layer_num] = y; + } else if (y > ymax[layer_num]) { + ymax[layer_num] = y; + } + } + + /* Now I've found the coordinates of the bounding box. There are no * + * channels beyond device_ctx.grid.width()-2 and * + * device_ctx.grid.height() - 2, so I want to clip to that. As well,* + * since I'll always include the channel immediately below and the * + * channel immediately to the left of the bounding box, I want to * + * clip to 1 in both directions as well (since minimum channel index * + * is 0). See route_common.cpp for a channel diagram. */ + for (int layer_num = 0; layer_num < num_layers; layer_num++) { + bb_coord_new[layer_num].layer_num = layer_num; + bb_coord_new[layer_num].xmin = max(min(xmin[layer_num], device_ctx.grid.width() - 2), 1); //-2 for no perim channels + bb_coord_new[layer_num].ymin = max(min(ymin[layer_num], device_ctx.grid.height() - 2), 1); //-2 for no perim channels + bb_coord_new[layer_num].xmax = max(min(xmax[layer_num], device_ctx.grid.width() - 2), 1); //-2 for no perim channels + bb_coord_new[layer_num].ymax = max(min(ymax[layer_num], device_ctx.grid.height() - 2), 1); //-2 for no perim channels + } } -static void update_bb(ClusterNetId net_id, t_bb* bb_coord_new, t_bb* bb_edge_new, int xold, int yold, int xnew, int ynew) { +static void update_bb(ClusterNetId net_id, + t_bb& bb_edge_new, + t_bb& bb_coord_new, + vtr::NdMatrixProxy num_sink_pin_layer_new, + t_physical_tile_loc pin_old_loc, + t_physical_tile_loc pin_new_loc, + bool src_pin) { /* Updates the bounding box of a net by storing its coordinates in * * the bb_coord_new data structure and the number of blocks on each * * edge in the bb_edge_new data structure. This routine should only * @@ -2636,174 +3260,546 @@ static void update_bb(ClusterNetId net_id, t_bb* bb_coord_new, t_bb* bb_edge_new auto& device_ctx = g_vpr_ctx.device(); auto& place_move_ctx = g_placer_ctx.move(); - xnew = max(min(xnew, device_ctx.grid.width() - 2), 1); //-2 for no perim channels - ynew = max(min(ynew, device_ctx.grid.height() - 2), 1); //-2 for no perim channels - xold = max(min(xold, device_ctx.grid.width() - 2), 1); //-2 for no perim channels - yold = max(min(yold, device_ctx.grid.height() - 2), 1); //-2 for no perim channels + const int num_layers = device_ctx.grid.get_num_layers(); + + pin_new_loc.x = max(min(pin_new_loc.x, device_ctx.grid.width() - 2), 1); //-2 for no perim channels + pin_new_loc.y = max(min(pin_new_loc.y, device_ctx.grid.height() - 2), 1); //-2 for no perim channels + pin_old_loc.x = max(min(pin_old_loc.x, device_ctx.grid.width() - 2), 1); //-2 for no perim channels + pin_old_loc.y = max(min(pin_old_loc.y, device_ctx.grid.height() - 2), 1); //-2 for no perim channels /* Check if the net had been updated before. */ if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) { /* The net had been updated from scratch, DO NOT update again! */ return; - } else if (bb_updated_before[net_id] == NOT_UPDATED_YET) { + } + + vtr::NdMatrixProxy curr_num_sink_pin_layer = (bb_updated_before[net_id] == NOT_UPDATED_YET) ? 
place_move_ctx.num_sink_pin_layer[size_t(net_id)] : num_sink_pin_layer_new; + + if (bb_updated_before[net_id] == NOT_UPDATED_YET) { /* The net had NOT been updated before, could use the old values */ - curr_bb_coord = &place_move_ctx.bb_coords[net_id]; curr_bb_edge = &place_move_ctx.bb_num_on_edges[net_id]; + curr_bb_coord = &place_move_ctx.bb_coords[net_id]; bb_updated_before[net_id] = UPDATED_ONCE; } else { /* The net had been updated before, must use the new values */ - curr_bb_coord = bb_coord_new; - curr_bb_edge = bb_edge_new; + curr_bb_coord = &bb_coord_new; + curr_bb_edge = &bb_edge_new; } /* Check if I can update the bounding box incrementally. */ - if (xnew < xold) { /* Move to left. */ + if (pin_new_loc.x < pin_old_loc.x) { /* Move to left. */ /* Update the xmax fields for coordinates and number of edges first. */ - if (xold == curr_bb_coord->xmax) { /* Old position at xmax. */ + if (pin_old_loc.x == curr_bb_coord->xmax) { /* Old position at xmax. */ if (curr_bb_edge->xmax == 1) { - get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new); + get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new, num_sink_pin_layer_new); bb_updated_before[net_id] = GOT_FROM_SCRATCH; return; } else { - bb_edge_new->xmax = curr_bb_edge->xmax - 1; - bb_coord_new->xmax = curr_bb_coord->xmax; + bb_edge_new.xmax = curr_bb_edge->xmax - 1; + bb_coord_new.xmax = curr_bb_coord->xmax; } } else { /* Move to left, old postion was not at xmax. */ - bb_coord_new->xmax = curr_bb_coord->xmax; - bb_edge_new->xmax = curr_bb_edge->xmax; + bb_coord_new.xmax = curr_bb_coord->xmax; + bb_edge_new.xmax = curr_bb_edge->xmax; } /* Now do the xmin fields for coordinates and number of edges. */ - if (xnew < curr_bb_coord->xmin) { /* Moved past xmin */ - bb_coord_new->xmin = xnew; - bb_edge_new->xmin = 1; - } else if (xnew == curr_bb_coord->xmin) { /* Moved to xmin */ - bb_coord_new->xmin = xnew; - bb_edge_new->xmin = curr_bb_edge->xmin + 1; + if (pin_new_loc.x < curr_bb_coord->xmin) { /* Moved past xmin */ + bb_coord_new.xmin = pin_new_loc.x; + bb_edge_new.xmin = 1; + } else if (pin_new_loc.x == curr_bb_coord->xmin) { /* Moved to xmin */ + bb_coord_new.xmin = pin_new_loc.x; + bb_edge_new.xmin = curr_bb_edge->xmin + 1; } else { /* Xmin unchanged. */ - bb_coord_new->xmin = curr_bb_coord->xmin; - bb_edge_new->xmin = curr_bb_edge->xmin; + bb_coord_new.xmin = curr_bb_coord->xmin; + bb_edge_new.xmin = curr_bb_edge->xmin; } /* End of move to left case. */ - } else if (xnew > xold) { /* Move to right. */ + } else if (pin_new_loc.x > pin_old_loc.x) { /* Move to right. */ /* Update the xmin fields for coordinates and number of edges first. */ - if (xold == curr_bb_coord->xmin) { /* Old position at xmin. */ + if (pin_old_loc.x == curr_bb_coord->xmin) { /* Old position at xmin. */ if (curr_bb_edge->xmin == 1) { - get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new); + get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new, num_sink_pin_layer_new); bb_updated_before[net_id] = GOT_FROM_SCRATCH; return; } else { - bb_edge_new->xmin = curr_bb_edge->xmin - 1; - bb_coord_new->xmin = curr_bb_coord->xmin; + bb_edge_new.xmin = curr_bb_edge->xmin - 1; + bb_coord_new.xmin = curr_bb_coord->xmin; } } else { /* Move to right, old position was not at xmin. */ - bb_coord_new->xmin = curr_bb_coord->xmin; - bb_edge_new->xmin = curr_bb_edge->xmin; + bb_coord_new.xmin = curr_bb_coord->xmin; + bb_edge_new.xmin = curr_bb_edge->xmin; } /* Now do the xmax fields for coordinates and number of edges. */ - if (xnew > curr_bb_coord->xmax) { /* Moved past xmax. 
*/ - bb_coord_new->xmax = xnew; - bb_edge_new->xmax = 1; - } else if (xnew == curr_bb_coord->xmax) { /* Moved to xmax */ - bb_coord_new->xmax = xnew; - bb_edge_new->xmax = curr_bb_edge->xmax + 1; + if (pin_new_loc.x > curr_bb_coord->xmax) { /* Moved past xmax. */ + bb_coord_new.xmax = pin_new_loc.x; + bb_edge_new.xmax = 1; + } else if (pin_new_loc.x == curr_bb_coord->xmax) { /* Moved to xmax */ + bb_coord_new.xmax = pin_new_loc.x; + bb_edge_new.xmax = curr_bb_edge->xmax + 1; } else { /* Xmax unchanged. */ - bb_coord_new->xmax = curr_bb_coord->xmax; - bb_edge_new->xmax = curr_bb_edge->xmax; + bb_coord_new.xmax = curr_bb_coord->xmax; + bb_edge_new.xmax = curr_bb_edge->xmax; } /* End of move to right case. */ - } else { /* xnew == xold -- no x motion. */ - bb_coord_new->xmin = curr_bb_coord->xmin; - bb_coord_new->xmax = curr_bb_coord->xmax; - bb_edge_new->xmin = curr_bb_edge->xmin; - bb_edge_new->xmax = curr_bb_edge->xmax; + } else { /* pin_new_loc.x == pin_old_loc.x -- no x motion. */ + bb_coord_new.xmin = curr_bb_coord->xmin; + bb_coord_new.xmax = curr_bb_coord->xmax; + bb_edge_new.xmin = curr_bb_edge->xmin; + bb_edge_new.xmax = curr_bb_edge->xmax; } /* Now account for the y-direction motion. */ - if (ynew < yold) { /* Move down. */ + if (pin_new_loc.y < pin_old_loc.y) { /* Move down. */ /* Update the ymax fields for coordinates and number of edges first. */ - if (yold == curr_bb_coord->ymax) { /* Old position at ymax. */ + if (pin_old_loc.y == curr_bb_coord->ymax) { /* Old position at ymax. */ if (curr_bb_edge->ymax == 1) { - get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new); + get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new, num_sink_pin_layer_new); bb_updated_before[net_id] = GOT_FROM_SCRATCH; return; } else { - bb_edge_new->ymax = curr_bb_edge->ymax - 1; - bb_coord_new->ymax = curr_bb_coord->ymax; + bb_edge_new.ymax = curr_bb_edge->ymax - 1; + bb_coord_new.ymax = curr_bb_coord->ymax; } } else { /* Move down, old postion was not at ymax. */ - bb_coord_new->ymax = curr_bb_coord->ymax; - bb_edge_new->ymax = curr_bb_edge->ymax; + bb_coord_new.ymax = curr_bb_coord->ymax; + bb_edge_new.ymax = curr_bb_edge->ymax; } /* Now do the ymin fields for coordinates and number of edges. */ - if (ynew < curr_bb_coord->ymin) { /* Moved past ymin */ - bb_coord_new->ymin = ynew; - bb_edge_new->ymin = 1; - } else if (ynew == curr_bb_coord->ymin) { /* Moved to ymin */ - bb_coord_new->ymin = ynew; - bb_edge_new->ymin = curr_bb_edge->ymin + 1; + if (pin_new_loc.y < curr_bb_coord->ymin) { /* Moved past ymin */ + bb_coord_new.ymin = pin_new_loc.y; + bb_edge_new.ymin = 1; + } else if (pin_new_loc.y == curr_bb_coord->ymin) { /* Moved to ymin */ + bb_coord_new.ymin = pin_new_loc.y; + bb_edge_new.ymin = curr_bb_edge->ymin + 1; } else { /* ymin unchanged. */ - bb_coord_new->ymin = curr_bb_coord->ymin; - bb_edge_new->ymin = curr_bb_edge->ymin; + bb_coord_new.ymin = curr_bb_coord->ymin; + bb_edge_new.ymin = curr_bb_edge->ymin; } /* End of move down case. */ - } else if (ynew > yold) { /* Moved up. */ + } else if (pin_new_loc.y > pin_old_loc.y) { /* Moved up. */ /* Update the ymin fields for coordinates and number of edges first. */ - if (yold == curr_bb_coord->ymin) { /* Old position at ymin. */ + if (pin_old_loc.y == curr_bb_coord->ymin) { /* Old position at ymin. 
 */
+        if (pin_old_loc.y == curr_bb_coord->ymin) { /* Old position at ymin. */
             if (curr_bb_edge->ymin == 1) {
-                get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new);
+                get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new, num_sink_pin_layer_new);
                 bb_updated_before[net_id] = GOT_FROM_SCRATCH;
                 return;
             } else {
-                bb_edge_new->ymin = curr_bb_edge->ymin - 1;
-                bb_coord_new->ymin = curr_bb_coord->ymin;
+                bb_edge_new.ymin = curr_bb_edge->ymin - 1;
+                bb_coord_new.ymin = curr_bb_coord->ymin;
             }
         } else { /* Moved up, old position was not at ymin. */
-            bb_coord_new->ymin = curr_bb_coord->ymin;
-            bb_edge_new->ymin = curr_bb_edge->ymin;
+            bb_coord_new.ymin = curr_bb_coord->ymin;
+            bb_edge_new.ymin = curr_bb_edge->ymin;
         }
 
         /* Now do the ymax fields for coordinates and number of edges. */
-        if (ynew > curr_bb_coord->ymax) { /* Moved past ymax. */
-            bb_coord_new->ymax = ynew;
-            bb_edge_new->ymax = 1;
-        } else if (ynew == curr_bb_coord->ymax) { /* Moved to ymax */
-            bb_coord_new->ymax = ynew;
-            bb_edge_new->ymax = curr_bb_edge->ymax + 1;
+        if (pin_new_loc.y > curr_bb_coord->ymax) { /* Moved past ymax. */
+            bb_coord_new.ymax = pin_new_loc.y;
+            bb_edge_new.ymax = 1;
+        } else if (pin_new_loc.y == curr_bb_coord->ymax) { /* Moved to ymax */
+            bb_coord_new.ymax = pin_new_loc.y;
+            bb_edge_new.ymax = curr_bb_edge->ymax + 1;
         } else { /* ymax unchanged. */
-            bb_coord_new->ymax = curr_bb_coord->ymax;
-            bb_edge_new->ymax = curr_bb_edge->ymax;
+            bb_coord_new.ymax = curr_bb_coord->ymax;
+            bb_edge_new.ymax = curr_bb_edge->ymax;
         } /* End of move up case. */
 
-    } else { /* ynew == yold -- no y motion. */
-        bb_coord_new->ymin = curr_bb_coord->ymin;
-        bb_coord_new->ymax = curr_bb_coord->ymax;
-        bb_edge_new->ymin = curr_bb_edge->ymin;
-        bb_edge_new->ymax = curr_bb_edge->ymax;
+    } else { /* pin_new_loc.y == pin_old_loc.y -- no y motion. */
+        bb_coord_new.ymin = curr_bb_coord->ymin;
+        bb_coord_new.ymax = curr_bb_coord->ymax;
+        bb_edge_new.ymin = curr_bb_edge->ymin;
+        bb_edge_new.ymax = curr_bb_edge->ymax;
+    }
+
+    /* Now account for the layer motion. */
+    if (num_layers > 1) {
+        /* We need to update the per-layer sink counts only if multiple layers are available */
+        for (int layer_num = 0; layer_num < num_layers; layer_num++) {
+            num_sink_pin_layer_new[layer_num] = curr_num_sink_pin_layer[layer_num];
+        }
+        if (!src_pin) {
+            /* if the src pin is being moved, we don't need to update this data structure */
+            if (pin_old_loc.layer_num != pin_new_loc.layer_num) {
+                num_sink_pin_layer_new[pin_old_loc.layer_num] = (curr_num_sink_pin_layer)[pin_old_loc.layer_num] - 1;
+                num_sink_pin_layer_new[pin_new_loc.layer_num] = (curr_num_sink_pin_layer)[pin_new_loc.layer_num] + 1;
+            }
+        }
+    }
+
+    if (bb_updated_before[net_id] == NOT_UPDATED_YET) {
+        bb_updated_before[net_id] = UPDATED_ONCE;
+    }
+}
+
+static void update_layer_bb(ClusterNetId net_id,
+                            std::vector<t_2D_bb>& bb_edge_new,
+                            std::vector<t_2D_bb>& bb_coord_new,
+                            vtr::NdMatrixProxy<int, 1> bb_pin_sink_count_new,
+                            t_physical_tile_loc pin_old_loc,
+                            t_physical_tile_loc pin_new_loc,
+                            bool is_output_pin) {
+    /* Updates the bounding box of a net by storing its coordinates in    *
+     * the bb_coord_new data structure and the number of blocks on each   *
+     * edge in the bb_edge_new data structure.  This routine should only  *
+     * be called for large nets, since it has some overhead relative to   *
+     * just doing a brute force bounding box calculation.  The bounding   *
+     * box coordinate and edge information for inet must be valid before  *
+     * this routine is called.                                            *
+     * Currently assumes channels on both sides of the CLBs forming the   *
+     * edges of the bounding box can be used. 
Essentially, I am assuming * + * the pins always lie on the outside of the bounding box. * + * The x and y coordinates are the pin's x and y coordinates. */ + /* IO blocks are considered to be one cell in for simplicity. */ + //TODO: account for multiple physical pin instances per logical pin + const std::vector*curr_bb_edge, *curr_bb_coord; + + auto& device_ctx = g_vpr_ctx.device(); + auto& place_move_ctx = g_placer_ctx.move(); + + pin_new_loc.x = max(min(pin_new_loc.x, device_ctx.grid.width() - 2), 1); //-2 for no perim channels + pin_new_loc.y = max(min(pin_new_loc.y, device_ctx.grid.height() - 2), 1); //-2 for no perim channels + pin_old_loc.x = max(min(pin_old_loc.x, device_ctx.grid.width() - 2), 1); //-2 for no perim channels + pin_old_loc.y = max(min(pin_old_loc.y, device_ctx.grid.height() - 2), 1); //-2 for no perim channels + + /* Check if the net had been updated before. */ + if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) { + /* The net had been updated from scratch, DO NOT update again! */ + return; } + const vtr::NdMatrixProxy curr_layer_pin_sink_count = (bb_updated_before[net_id] == NOT_UPDATED_YET) ? place_move_ctx.num_sink_pin_layer[size_t(net_id)] : bb_pin_sink_count_new; + if (bb_updated_before[net_id] == NOT_UPDATED_YET) { + /* The net had NOT been updated before, could use the old values */ + curr_bb_edge = &place_move_ctx.layer_bb_num_on_edges[net_id]; + curr_bb_coord = &place_move_ctx.layer_bb_coords[net_id]; bb_updated_before[net_id] = UPDATED_ONCE; + } else { + /* The net had been updated before, must use the new values */ + curr_bb_edge = &bb_edge_new; + curr_bb_coord = &bb_coord_new; + } + + /* Check if I can update the bounding box incrementally. */ + + update_bb_pin_sink_count(net_id, + pin_old_loc, + pin_new_loc, + curr_layer_pin_sink_count, + bb_pin_sink_count_new, + is_output_pin); + + int layer_old = pin_old_loc.layer_num; + int layer_new = pin_new_loc.layer_num; + bool layer_changed = (layer_old != layer_new); + + bb_edge_new = *curr_bb_edge; + bb_coord_new = *curr_bb_coord; + + if (layer_changed) { + update_bb_layer_changed(net_id, + pin_old_loc, + pin_new_loc, + *curr_bb_edge, + *curr_bb_coord, + bb_pin_sink_count_new, + bb_edge_new, + bb_coord_new); + } else { + update_bb_same_layer(net_id, + pin_old_loc, + pin_new_loc, + *curr_bb_edge, + *curr_bb_coord, + bb_pin_sink_count_new, + bb_edge_new, + bb_coord_new); + } + + if (bb_updated_before[net_id] == NOT_UPDATED_YET) { + bb_updated_before[net_id] = UPDATED_ONCE; + } +} + +static inline void update_bb_same_layer(ClusterNetId net_id, + const t_physical_tile_loc& pin_old_loc, + const t_physical_tile_loc& pin_new_loc, + const std::vector& curr_bb_edge, + const std::vector& curr_bb_coord, + vtr::NdMatrixProxy bb_pin_sink_count_new, + std::vector& bb_edge_new, + std::vector& bb_coord_new) { + int x_old = pin_old_loc.x; + int x_new = pin_new_loc.x; + + int y_old = pin_old_loc.y; + int y_new = pin_new_loc.y; + + int layer_num = pin_old_loc.layer_num; + VTR_ASSERT_SAFE(layer_num == pin_new_loc.layer_num); + + if (x_new < x_old) { + if (x_old == curr_bb_coord[layer_num].xmax) { + update_bb_edge(net_id, + bb_edge_new, + bb_coord_new, + bb_pin_sink_count_new, + curr_bb_edge[layer_num].xmax, + curr_bb_coord[layer_num].xmax, + bb_edge_new[layer_num].xmax, + bb_coord_new[layer_num].xmax); + if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) { + return; + } + } + + if (x_new < curr_bb_coord[layer_num].xmin) { + bb_edge_new[layer_num].xmin = 1; + bb_coord_new[layer_num].xmin = x_new; + } else if (x_new == 
curr_bb_coord[layer_num].xmin) { + bb_edge_new[layer_num].xmin = curr_bb_edge[layer_num].xmin + 1; + bb_coord_new[layer_num].xmin = curr_bb_coord[layer_num].xmin; + } + + } else if (x_new > x_old) { + if (x_old == curr_bb_coord[layer_num].xmin) { + update_bb_edge(net_id, + bb_edge_new, + bb_coord_new, + bb_pin_sink_count_new, + curr_bb_edge[layer_num].xmin, + curr_bb_coord[layer_num].xmin, + bb_edge_new[layer_num].xmin, + bb_coord_new[layer_num].xmin); + if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) { + return; + } + } + + if (x_new > curr_bb_coord[layer_num].xmax) { + bb_edge_new[layer_num].xmax = 1; + bb_coord_new[layer_num].xmax = x_new; + } else if (x_new == curr_bb_coord[layer_num].xmax) { + bb_edge_new[layer_num].xmax = curr_bb_edge[layer_num].xmax + 1; + bb_coord_new[layer_num].xmax = curr_bb_coord[layer_num].xmax; + } + } + + if (y_new < y_old) { + if (y_old == curr_bb_coord[layer_num].ymax) { + update_bb_edge(net_id, + bb_edge_new, + bb_coord_new, + bb_pin_sink_count_new, + curr_bb_edge[layer_num].ymax, + curr_bb_coord[layer_num].ymax, + bb_edge_new[layer_num].ymax, + bb_coord_new[layer_num].ymax); + if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) { + return; + } + } + + if (y_new < curr_bb_coord[layer_num].ymin) { + bb_edge_new[layer_num].ymin = 1; + bb_coord_new[layer_num].ymin = y_new; + } else if (y_new == curr_bb_coord[layer_num].ymin) { + bb_edge_new[layer_num].ymin = curr_bb_edge[layer_num].ymin + 1; + bb_coord_new[layer_num].ymin = curr_bb_coord[layer_num].ymin; + } + + } else if (y_new > y_old) { + if (y_old == curr_bb_coord[layer_num].ymin) { + update_bb_edge(net_id, + bb_edge_new, + bb_coord_new, + bb_pin_sink_count_new, + curr_bb_edge[layer_num].ymin, + curr_bb_coord[layer_num].ymin, + bb_edge_new[layer_num].ymin, + bb_coord_new[layer_num].ymin); + if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) { + return; + } + } + + if (y_new > curr_bb_coord[layer_num].ymax) { + bb_edge_new[layer_num].ymax = 1; + bb_coord_new[layer_num].ymax = y_new; + } else if (y_new == curr_bb_coord[layer_num].ymax) { + bb_edge_new[layer_num].ymax = curr_bb_edge[layer_num].ymax + 1; + bb_coord_new[layer_num].ymax = curr_bb_coord[layer_num].ymax; + } + } +} + +static inline void update_bb_layer_changed(ClusterNetId net_id, + const t_physical_tile_loc& pin_old_loc, + const t_physical_tile_loc& pin_new_loc, + const std::vector& curr_bb_edge, + const std::vector& curr_bb_coord, + vtr::NdMatrixProxy bb_pin_sink_count_new, + std::vector& bb_edge_new, + std::vector& bb_coord_new) { + int x_old = pin_old_loc.x; + + int y_old = pin_old_loc.y; + + int old_layer_num = pin_old_loc.layer_num; + int new_layer_num = pin_new_loc.layer_num; + VTR_ASSERT_SAFE(old_layer_num != new_layer_num); + + if (x_old == curr_bb_coord[old_layer_num].xmax) { + update_bb_edge(net_id, + bb_edge_new, + bb_coord_new, + bb_pin_sink_count_new, + curr_bb_edge[old_layer_num].xmax, + curr_bb_coord[old_layer_num].xmax, + bb_edge_new[old_layer_num].xmax, + bb_coord_new[old_layer_num].xmax); + if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) { + return; + } + } else if (x_old == curr_bb_coord[old_layer_num].xmin) { + update_bb_edge(net_id, + bb_edge_new, + bb_coord_new, + bb_pin_sink_count_new, + curr_bb_edge[old_layer_num].xmin, + curr_bb_coord[old_layer_num].xmin, + bb_edge_new[old_layer_num].xmin, + bb_coord_new[old_layer_num].xmin); + if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) { + return; + } + } + + if (y_old == curr_bb_coord[old_layer_num].ymax) { + update_bb_edge(net_id, + bb_edge_new, + bb_coord_new, + 
bb_pin_sink_count_new, + curr_bb_edge[old_layer_num].ymax, + curr_bb_coord[old_layer_num].ymax, + bb_edge_new[old_layer_num].ymax, + bb_coord_new[old_layer_num].ymax); + if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) { + return; + } + } else if (y_old == curr_bb_coord[old_layer_num].ymin) { + update_bb_edge(net_id, + bb_edge_new, + bb_coord_new, + bb_pin_sink_count_new, + curr_bb_edge[old_layer_num].ymin, + curr_bb_coord[old_layer_num].ymin, + bb_edge_new[old_layer_num].ymin, + bb_coord_new[old_layer_num].ymin); + if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) { + return; + } + } + + add_block_to_bb(pin_new_loc, + curr_bb_edge[new_layer_num], + curr_bb_coord[new_layer_num], + bb_edge_new[new_layer_num], + bb_coord_new[new_layer_num]); +} + +static void update_bb_pin_sink_count(ClusterNetId /* net_id */, + const t_physical_tile_loc& pin_old_loc, + const t_physical_tile_loc& pin_new_loc, + const vtr::NdMatrixProxy curr_layer_pin_sink_count, + vtr::NdMatrixProxy bb_pin_sink_count_new, + bool is_output_pin) { + VTR_ASSERT(curr_layer_pin_sink_count[pin_old_loc.layer_num] > 0 || is_output_pin == 1); + for (int layer_num = 0; layer_num < g_vpr_ctx.device().grid.get_num_layers(); layer_num++) { + bb_pin_sink_count_new[layer_num] = curr_layer_pin_sink_count[layer_num]; + } + if (!is_output_pin) { + bb_pin_sink_count_new[pin_old_loc.layer_num] -= 1; + bb_pin_sink_count_new[pin_new_loc.layer_num] += 1; + } +} + +static inline void update_bb_edge(ClusterNetId net_id, + std::vector& bb_edge_new, + std::vector& bb_coord_new, + vtr::NdMatrixProxy bb_layer_pin_sink_count, + const int& old_num_block_on_edge, + const int& old_edge_coord, + int& new_num_block_on_edge, + int& new_edge_coord) { + if (old_num_block_on_edge == 1) { + get_layer_bb_from_scratch(net_id, + bb_edge_new, + bb_coord_new, + bb_layer_pin_sink_count); + bb_updated_before[net_id] = GOT_FROM_SCRATCH; + return; + } else { + new_num_block_on_edge = old_num_block_on_edge - 1; + new_edge_coord = old_edge_coord; + } +} + +static void add_block_to_bb(const t_physical_tile_loc& new_pin_loc, + const t_2D_bb& bb_edge_old, + const t_2D_bb& bb_coord_old, + t_2D_bb& bb_edge_new, + t_2D_bb& bb_coord_new) { + int x_new = new_pin_loc.x; + int y_new = new_pin_loc.y; + + if (x_new > bb_coord_old.xmax) { + bb_edge_new.xmax = 1; + bb_coord_new.xmax = x_new; + } else if (x_new == bb_coord_old.xmax) { + bb_edge_new.xmax = bb_edge_old.xmax + 1; + } + + if (x_new < bb_coord_old.xmin) { + bb_edge_new.xmin = 1; + bb_coord_new.xmin = x_new; + } else if (x_new == bb_coord_old.xmin) { + bb_edge_new.xmin = bb_edge_old.xmin + 1; + } + + if (y_new > bb_coord_old.ymax) { + bb_edge_new.ymax = 1; + bb_coord_new.ymax = y_new; + } else if (y_new == bb_coord_old.ymax) { + bb_edge_new.ymax = bb_edge_old.ymax + 1; + } + + if (y_new < bb_coord_old.ymin) { + bb_edge_new.ymin = 1; + bb_coord_new.ymin = y_new; + } else if (y_new == bb_coord_old.ymin) { + bb_edge_new.ymin = bb_edge_old.ymin + 1; } } @@ -2961,7 +3957,15 @@ static int check_placement_costs(const t_placer_costs& costs, double bb_cost_check; double timing_cost_check; - bb_cost_check = comp_bb_cost(CHECK); + const auto& cube_bb = g_vpr_ctx.placement().cube_bb; + + if (cube_bb) { + bb_cost_check = comp_bb_cost(CHECK); + } else { + VTR_ASSERT_SAFE(!cube_bb); + bb_cost_check = comp_layer_bb_cost(CHECK); + } + if (fabs(bb_cost_check - costs.bb_cost) > costs.bb_cost * ERROR_TOL) { VTR_LOG_ERROR( "bb_cost_check: %g and bb_cost: %g differ in check_place.\n", @@ -3259,7 +4263,7 @@ static void 
print_placement_swaps_stats(const t_annealing_state& state) {
                 num_swap_accepted, 100 * accept_rate);
     VTR_LOG("\tSwaps rejected: %*d (%4.1f %%)\n", num_swap_print_digits,
             num_swap_rejected, 100 * reject_rate);
-    VTR_LOG("\tSwaps aborted : %*d (%4.1f %%)\n", num_swap_print_digits,
+    VTR_LOG("\tSwaps aborted: %*d (%4.1f %%)\n", num_swap_print_digits,
             num_swap_aborted, 100 * abort_rate);
 }
 
diff --git a/vpr/src/place/place_delay_model.cpp b/vpr/src/place/place_delay_model.cpp
index 44d8c4a0b49..bfdd15707f4 100644
--- a/vpr/src/place/place_delay_model.cpp
+++ b/vpr/src/place/place_delay_model.cpp
@@ -27,11 +27,20 @@
 #endif /* VTR_ENABLE_CAPNPROTO */
 
 ///@brief DeltaDelayModel methods.
-float DeltaDelayModel::delay(int from_x, int from_y, int /*from_pin*/, int to_x, int to_y, int /*to_pin*/, int layer_num) const {
-    int delta_x = std::abs(from_x - to_x);
-    int delta_y = std::abs(from_y - to_y);
-
-    return delays_[layer_num][delta_x][delta_y];
+float DeltaDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
+    int delta_x = std::abs(from_loc.x - to_loc.x);
+    int delta_y = std::abs(from_loc.y - to_loc.y);
+
+    // TODO: This is compatible with the case that only OPINs are connected to other layers.
+    // Ideally, I should check whether OPINs or IPINs are connected and use the correct layer.
+    // If both are connected, the minimum should be taken. In the case that channels are also connected,
+    // I haven't thought about what to do.
+    float cross_layer_td = 0;
+    if (from_loc.layer_num != to_loc.layer_num) {
+        VTR_ASSERT(std::isfinite(cross_layer_delay_));
+        cross_layer_td = cross_layer_delay_;
+    }
+    return delays_[to_loc.layer_num][delta_x][delta_y] + cross_layer_td;
 }
 
 void DeltaDelayModel::dump_echo(std::string filepath) const {
@@ -60,13 +69,13 @@ const DeltaDelayModel* OverrideDelayModel::base_delay_model() const {
 }
 
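As a worked example of the delta lookup above (the numbers are illustrative, not from this patch): for a source at (3, 4) on layer 0 and a sink at (6, 8) on layer 1, delta_x = 3 and delta_y = 4; if delays_[1][3][4] is 1.2e-9 s and cross_layer_delay_ is 0.5e-9 s, the returned delay is 1.2e-9 + 0.5e-9 = 1.7e-9 s. The fixed cross-layer penalty is added exactly once whenever the two endpoints sit on different layers.
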
///@brief OverrideDelayModel methods.
-float OverrideDelayModel::delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin, int layer_num) const {
+float OverrideDelayModel::delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const {
     //First check to if there is an override delay value
     auto& device_ctx = g_vpr_ctx.device();
     auto& grid = device_ctx.grid;
 
-    t_physical_tile_type_ptr from_type_ptr = grid.get_physical_type({from_x, from_y, layer_num});
-    t_physical_tile_type_ptr to_type_ptr = grid.get_physical_type({to_x, to_y, layer_num});
+    t_physical_tile_type_ptr from_type_ptr = grid.get_physical_type(from_loc);
+    t_physical_tile_type_ptr to_type_ptr = grid.get_physical_type(to_loc);
 
     t_override override_key;
     override_key.from_type = from_type_ptr->index;
@@ -76,8 +85,8 @@ float OverrideDelayModel::delay(int from_x, int from_y, int from_pin, int to_x,
     //Delay overrides may be different for +/- delta so do not use
     //an absolute delta for the look-up
-    override_key.delta_x = to_x - from_x;
-    override_key.delta_y = to_y - from_y;
+    override_key.delta_x = to_loc.x - from_loc.x;
+    override_key.delta_y = to_loc.y - from_loc.y;
 
     float delay_val = std::numeric_limits<float>::quiet_NaN();
     auto override_iter = delay_overrides_.find(override_key);
@@ -86,7 +95,7 @@ float OverrideDelayModel::delay(int from_x, int from_y, int from_pin, int to_x,
         delay_val = override_iter->second;
     } else {
         //Fall back to the base delay model if no override was found
-        delay_val = base_delay_model_->delay(from_x, from_y, from_pin, to_x, to_y, to_pin, layer_num);
+        delay_val = base_delay_model_->delay(from_loc, from_pin, to_loc, to_pin);
     }
 
     return delay_val;
@@ -258,7 +267,7 @@ void OverrideDelayModel::read(const std::string& file) {
     auto model = reader.getRoot();
 
     ToNdMatrix<3, VprFloatEntry, float>(&delays, model.getDelays(), ToFloat);
-    base_delay_model_ = std::make_unique<DeltaDelayModel>(delays, is_flat_);
+    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, delays, is_flat_);
 
     // Reading non-scalar capnproto fields is roughly equivilant to using
     // a std::vector of the field type.  Actual type is capnp::List::Reader.
@@ -310,6 +319,7 @@ void OverrideDelayModel::write(const std::string& file) const {
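
A minimal sketch of the override-then-fallback pattern used in OverrideDelayModel::delay above; the key type here is simplified (the real t_override also carries the from/to tile types and pin classes), and the helper name is illustrative, not part of the patch:

    #include <map>
    #include <tuple>

    struct OverrideKey {
        int delta_x, delta_y;
        bool operator<(const OverrideKey& other) const {
            return std::tie(delta_x, delta_y) < std::tie(other.delta_x, other.delta_y);
        }
    };

    // Use the registered override when present; otherwise fall back to the
    // value produced by the base (delta) delay model.
    float lookup_delay(const std::map<OverrideKey, float>& overrides,
                       const OverrideKey& key,
                       float base_model_delay) {
        auto it = overrides.find(key);
        return (it != overrides.end()) ? it->second : base_model_delay;
    }
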
 
 ///@brief Initialize the placer delay model.
 std::unique_ptr<PlaceDelayModel> alloc_lookups_and_delay_model(const Netlist<>& net_list,
+                                                               const std::vector<t_arch_switch_inf>& arch_switch_inf,
                                                                t_chan_width_dist chan_width_dist,
                                                                const t_placer_opts& placer_opts,
                                                                const t_router_opts& router_opts,
@@ -318,8 +328,16 @@ std::unique_ptr<PlaceDelayModel> alloc_lookups_and_delay_model(const Netlist<>&
                                                                const t_direct_inf* directs,
                                                                const int num_directs,
                                                                bool is_flat) {
-    return compute_place_delay_model(placer_opts, router_opts, net_list, det_routing_arch, segment_inf,
-                                     chan_width_dist, directs, num_directs, is_flat);
+    return compute_place_delay_model(placer_opts,
+                                     router_opts,
+                                     net_list,
+                                     arch_switch_inf,
+                                     det_routing_arch,
+                                     segment_inf,
+                                     chan_width_dist,
+                                     directs,
+                                     num_directs,
+                                     is_flat);
 }
 
 /**
@@ -346,9 +364,10 @@ float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, Cluste
 
     int source_x = place_ctx.block_locs[source_block].loc.x;
     int source_y = place_ctx.block_locs[source_block].loc.y;
+    int source_layer = place_ctx.block_locs[source_block].loc.layer;
 
     int sink_x = place_ctx.block_locs[sink_block].loc.x;
     int sink_y = place_ctx.block_locs[sink_block].loc.y;
-    int sink_layer_num = place_ctx.block_locs[sink_block].loc.layer;
+    int sink_layer = place_ctx.block_locs[sink_block].loc.layer;
 
     /**
      * This heuristic only considers delta_x and delta_y, a much better
      *
@@ -357,13 +376,10 @@ float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, Cluste
      * In particular this approach does not accurately capture the effect
      * of fast carry-chain connections.
      */
-    delay_source_to_sink = delay_model->delay(source_x,
-                                              source_y,
+    delay_source_to_sink = delay_model->delay({source_x, source_y, source_layer},
                                               source_block_ipin,
-                                              sink_x,
-                                              sink_y,
-                                              sink_block_ipin,
-                                              sink_layer_num);
+                                              {sink_x, sink_y, sink_layer},
+                                              sink_block_ipin);
     if (delay_source_to_sink < 0) {
         VPR_ERROR(VPR_ERROR_PLACE,
                   "in comp_td_single_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d) to %s (at %d,%d)\n"
diff --git a/vpr/src/place/place_delay_model.h b/vpr/src/place/place_delay_model.h
index 09b6969c011..b10eae12204 100644
--- a/vpr/src/place/place_delay_model.h
+++ b/vpr/src/place/place_delay_model.h
@@ -29,6 +29,7 @@ class PlaceDelayModel;
 
 ///@brief Initialize the placer delay model.
 std::unique_ptr<PlaceDelayModel> alloc_lookups_and_delay_model(const Netlist<>& net_list,
+                                                               const std::vector<t_arch_switch_inf>& arch_switch_inf,
                                                                t_chan_width_dist chan_width_dist,
                                                                const t_placer_opts& place_opts,
                                                                const t_router_opts& router_opts,
@@ -62,7 +63,7 @@ class PlaceDelayModel {
      *
      * Either compute or read methods must be invoked before invoking delay.
      */
-    virtual float delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin, int layer_num) const = 0;
+    virtual float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const = 0;
 
     ///@brief Dumps the delay model to an echo file.
     virtual void dump_echo(std::string filename) const = 0;
@@ -85,10 +86,15 @@ class PlaceDelayModel {
 
 ///@brief A simple delay model based on the distance (delta) between block locations.
 class DeltaDelayModel : public PlaceDelayModel {
   public:
-    DeltaDelayModel(bool is_flat)
-        : is_flat_(is_flat) {}
-    DeltaDelayModel(vtr::NdMatrix<float, 3> delta_delays, bool is_flat)
+    DeltaDelayModel(float min_cross_layer_delay,
+                    bool is_flat)
+        : cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
+    DeltaDelayModel(float min_cross_layer_delay,
+                    vtr::NdMatrix<float, 3> delta_delays,
+                    bool is_flat)
         : delays_(std::move(delta_delays))
+        , cross_layer_delay_(min_cross_layer_delay)
         , is_flat_(is_flat) {}
 
     void compute(
@@ -96,7 +102,7 @@ class DeltaDelayModel : public PlaceDelayModel {
         const t_placer_opts& placer_opts,
         const t_router_opts& router_opts,
         int longest_length) override;
-    float delay(int from_x, int from_y, int /*from_pin*/, int to_x, int to_y, int /*to_pin*/, int layer_num) const override;
+    float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override;
     void dump_echo(std::string filepath) const override;
 
     void read(const std::string& file) override;
@@ -107,13 +113,16 @@ class DeltaDelayModel : public PlaceDelayModel {
 
   private:
     vtr::NdMatrix<float, 3> delays_; // [0..num_layers-1][0..max_dx][0..max_dy]
+    float cross_layer_delay_;
     bool is_flat_;
 };
 
 class OverrideDelayModel : public PlaceDelayModel {
   public:
-    OverrideDelayModel(bool is_flat)
-        : is_flat_(is_flat) {}
+    OverrideDelayModel(float min_cross_layer_delay,
+                       bool is_flat)
+        : cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
     void compute(
         RouterDelayProfiler& route_profiler,
         const t_placer_opts& placer_opts,
@@ -121,7 +130,7 @@ class OverrideDelayModel : public PlaceDelayModel {
         int longest_length) override;
-    // returns delay from the specified (x,y) to the specified (x,y) with both endpoints on layer_num and the
-    // specified from and to pins
-    float delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin, int layer_num) const override;
+    // returns delay from the specified from_loc to the specified to_loc with the
+    // specified from and to pins
+    float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const override;
     void dump_echo(std::string filepath) const override;
 
     void read(const std::string& file) override;
@@ -135,6 +144,8 @@ class OverrideDelayModel : public PlaceDelayModel {
 
   private:
     std::unique_ptr<DeltaDelayModel> base_delay_model_;
+    /* Minimum delay of cross-layer connections */
+    float cross_layer_delay_;
     bool is_flat_;
 
     void compute_override_delay_model(RouterDelayProfiler& router,
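
The placer's move context below gains per-layer bookkeeping for each net. As a rough indexing sketch of how the new num_sink_pin_layer table is consumed elsewhere in this patch (the helper function is illustrative and assumes VPR's vtr::Matrix and ClusterNetId types):

    // num_sink_pin_layer is conceptually a [net][layer] -> sink-count table,
    // kept up to date incrementally as blocks move between layers.
    int sinks_on_layer(const vtr::Matrix<int>& num_sink_pin_layer,
                       ClusterNetId net_id, int layer_num) {
        return num_sink_pin_layer[size_t(net_id)][layer_num];
    }
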
diff --git a/vpr/src/place/placer_context.h b/vpr/src/place/placer_context.h
index 23df961b144..f5e56bbf37f 100644
--- a/vpr/src/place/placer_context.h
+++ b/vpr/src/place/placer_context.h
@@ -91,11 +91,20 @@ struct PlacerRuntimeContext : public Context {
  */
 struct PlacerMoveContext : public Context {
   public:
+    // [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the number of blocks on each edge of a net's bounding box (to allow efficient updates)
+    vtr::vector<ClusterNetId, t_bb> bb_num_on_edges;
+
     // [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the bounding box coordinates of a net's bounding box
     vtr::vector<ClusterNetId, t_bb> bb_coords;
 
-    // [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the number of blocks on each of a net's bounding box (to allow efficient updates)
-    vtr::vector<ClusterNetId, t_bb> bb_num_on_edges;
+    // [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the number of blocks on each edge of a net's per-layer bounding boxes (to allow efficient updates)
+    vtr::vector<ClusterNetId, std::vector<t_2D_bb>> layer_bb_num_on_edges;
+
+    // [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the coordinates of a net's per-layer bounding boxes
+    vtr::vector<ClusterNetId, std::vector<t_2D_bb>> layer_bb_coords;
+
+    // [0..cluster_ctx.clb_nlist.nets().size()-1][0..num_layers-1]. Store the number of sink pins of a net on each layer
+    vtr::Matrix<int> num_sink_pin_layer;
 
     // The first range limit calculated by the anneal
     float first_rlim;
diff --git a/vpr/src/place/simpleRL_move_generator.h b/vpr/src/place/simpleRL_move_generator.h
index 9ded69055d5..de108313023 100644
--- a/vpr/src/place/simpleRL_move_generator.h
+++ b/vpr/src/place/simpleRL_move_generator.h
@@ -211,6 +211,8 @@ class SimpleRLMoveGenerator : public MoveGenerator {
      *
      * @param agent std::unique_ptr to the agent. Only EpsilonGreedyAgent and SoftmaxAgent types are accepted
      * by the constructor. If other types are passed, a compile error would be thrown.
+     *
+     * @param is_multi_layer A boolean value to indicate whether the placement is multi-layer or not
      */
     template<class T, class = typename std::enable_if<std::is_same<T, EpsilonGreedyAgent>::value || std::is_same<T, SoftmaxAgent>::value>::type>
diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp
index 5476e24cafe..fba8b1e9c46 100644
--- a/vpr/src/place/timing_place_lookup.cpp
+++ b/vpr/src/place/timing_place_lookup.cpp
@@ -167,6 +167,7 @@ static float find_neightboring_average(vtr::NdMatrix<float, 2>& matrix, t_physic
 std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts& placer_opts,
                                                            const t_router_opts& router_opts,
                                                            const Netlist<>& net_list,
+                                                           const std::vector<t_arch_switch_inf>& arch_switch_inf,
                                                            t_det_routing_arch* det_routing_arch,
                                                            std::vector<t_segment_inf>& segment_inf,
                                                            t_chan_width_dist chan_width_dist,
@@ -194,10 +195,13 @@ std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts&
     /*now setup and compute the actual arrays */
     std::unique_ptr<PlaceDelayModel> place_delay_model;
+    float min_cross_layer_delay = get_min_cross_layer_delay(arch_switch_inf,
+                                                            segment_inf,
+                                                            det_routing_arch->wire_to_arch_ipin_switch_between_dice);
 
     if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA) {
-        place_delay_model = std::make_unique<DeltaDelayModel>(is_flat);
+        place_delay_model = std::make_unique<DeltaDelayModel>(min_cross_layer_delay, is_flat);
     } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA_OVERRIDE) {
-        place_delay_model = std::make_unique<OverrideDelayModel>(is_flat);
+        place_delay_model = std::make_unique<OverrideDelayModel>(min_cross_layer_delay, is_flat);
     } else {
         VTR_ASSERT_MSG(false, "Invalid placer delay model");
     }
@@ -241,7 +245,7 @@ void OverrideDelayModel::compute(
                                      longest_length,
                                      is_flat_);
 
-    base_delay_model_ = std::make_unique<DeltaDelayModel>(delays, false);
+    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, delays, false);
 
     compute_override_delay_model(route_profiler, router_opts);
 }
@@ -389,7 +393,8 @@ static float route_connection_delay(
             successfully_routed = route_profiler.calculate_delay(
                 source_rr_node, sink_rr_node,
                 router_opts,
-                &net_delay_value);
+                &net_delay_value,
+                layer_num);
         }
 
         if (successfully_routed) break;
@@ -1192,7 +1197,7 @@ void OverrideDelayModel::compute_override_delay_model(
             if (sampled_rr_pairs.count({src_rr, sink_rr})) continue;
 
             float direct_connect_delay = std::numeric_limits<float>::quiet_NaN();
-            bool found_routing_path = route_profiler.calculate_delay(src_rr, sink_rr, router_opts2, &direct_connect_delay);
+            bool found_routing_path = route_profiler.calculate_delay(src_rr, sink_rr, router_opts2, &direct_connect_delay, OPEN);
 
             if (found_routing_path) {
                 set_delay_override(from_type->index, from_pin_class, to_type->index, to_pin_class, direct->x_offset, direct->y_offset, direct_connect_delay);
diff --git a/vpr/src/place/timing_place_lookup.h b/vpr/src/place/timing_place_lookup.h
index 30e1a8ae01a..f9efe7fc933 100644
--- a/vpr/src/place/timing_place_lookup.h
+++ b/vpr/src/place/timing_place_lookup.h
@@ -5,6 +5,7 @@ std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts& placer_opts,
                                                            const t_router_opts& router_opts,
                                                            const Netlist<>& net_list,
+                                                           const std::vector<t_arch_switch_inf>& arch_switch_inf,
                                                            t_det_routing_arch* det_routing_arch,
                                                            std::vector<t_segment_inf>& segment_inf,
                                                            t_chan_width_dist chan_width_dist,
diff --git a/vpr/src/place/uniform_move_generator.cpp b/vpr/src/place/uniform_move_generator.cpp
index 320ce987c39..6560c32af24 100644
--- a/vpr/src/place/uniform_move_generator.cpp
+++ b/vpr/src/place/uniform_move_generator.cpp
@@ -26,7 +26,6 @@ e_create_move UniformMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks
     VTR_ASSERT(is_tile_compatible(grid_from_type, cluster_from_type));
 
     t_pl_loc to;
-
     if (!find_to_loc_uniform(cluster_from_type, rlim, from, to, b_from)) {
         return e_create_move::ABORT;
     }
diff --git a/vpr/src/place/weighted_centroid_move_generator.cpp b/vpr/src/place/weighted_centroid_move_generator.cpp
index d5f62ae9d20..d33b6fa2ebe 100644
--- a/vpr/src/place/weighted_centroid_move_generator.cpp
+++ b/vpr/src/place/weighted_centroid_move_generator.cpp
@@ -38,7 +38,9 @@ e_create_move WeightedCentroidMoveGenerator::propose_move(t_pl_blocks_to_be_move
     /* Calculate the weighted centroid */
     calculate_centroid_loc(b_from, true, centroid, criticalities);
 
-    /* Find a */
+    // The centroid location is not necessarily a valid location, and the downstream code expects a valid
+    // layer for the "to" location. So if the layer is not valid, we set it to the same layer as the from location.
+    to.layer = (centroid.layer < 0) ? from.layer : centroid.layer;
     if (!find_to_loc_centroid(cluster_from_type, from, centroid, range_limiters, to, b_from)) {
         return e_create_move::ABORT;
     }
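
The weighted-median generator in the next hunk accumulates, per layer, the sink counts of the moved block's nets and then approximates the median layer by the layer holding the most sinks (the std::max_element call in that hunk). A minimal sketch of that selection, with an illustrative helper name that is not part of the patch:

    #include <algorithm>
    #include <iterator>
    #include <vector>

    // Pick the layer holding the largest sink count (argmax over layer_blk_cnt).
    int pick_max_sink_layer(const std::vector<int>& layer_blk_cnt) {
        auto it = std::max_element(layer_blk_cnt.begin(), layer_blk_cnt.end());
        return static_cast<int>(std::distance(layer_blk_cnt.begin(), it));
    }
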
diff --git a/vpr/src/place/weighted_median_move_generator.cpp b/vpr/src/place/weighted_median_move_generator.cpp
index 17f064d4c9c..2d343cd3347 100644
--- a/vpr/src/place/weighted_median_move_generator.cpp
+++ b/vpr/src/place/weighted_median_move_generator.cpp
@@ -27,6 +27,9 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
     auto& cluster_ctx = g_vpr_ctx.clustering();
     auto& place_move_ctx = g_placer_ctx.mutable_move();
 
+    int num_layers = g_vpr_ctx.device().grid.get_num_layers();
+    bool is_multi_layer = (num_layers > 1);
+
     t_pl_loc from = place_ctx.block_locs[b_from].loc;
     auto cluster_from_type = cluster_ctx.clb_nlist.block_type(b_from);
     auto grid_from_type = g_vpr_ctx.device().grid.get_physical_type({from.x, from.y, from.layer});
@@ -42,6 +45,7 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
     //reused to save allocation time
     place_move_ctx.X_coord.clear();
     place_move_ctx.Y_coord.clear();
+    std::vector<int> layer_blk_cnt(num_layers, 0);
 
     //true if the net is a feedback from the block to itself (all the net terminals are connected to the same block)
     bool skip_net;
@@ -72,6 +76,17 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
         place_move_ctx.X_coord.insert(place_move_ctx.X_coord.end(), ceil(coords.xmin.criticality * CRIT_MULT_FOR_W_MEDIAN), coords.xmin.edge);
         place_move_ctx.X_coord.insert(place_move_ctx.X_coord.end(), ceil(coords.xmax.criticality * CRIT_MULT_FOR_W_MEDIAN), coords.xmax.edge);
         place_move_ctx.Y_coord.insert(place_move_ctx.Y_coord.end(), ceil(coords.ymin.criticality * CRIT_MULT_FOR_W_MEDIAN), coords.ymin.edge);
         place_move_ctx.Y_coord.insert(place_move_ctx.Y_coord.end(), ceil(coords.ymax.criticality * CRIT_MULT_FOR_W_MEDIAN), coords.ymax.edge);
+        // If multiple layers are available, we need to keep track of how many sinks are in each layer.
+        if (is_multi_layer) {
+            for (int layer_num = 0; layer_num < num_layers; layer_num++) {
+                layer_blk_cnt[layer_num] += place_move_ctx.num_sink_pin_layer[size_t(net_id)][layer_num];
+            }
+            // If the pin under consideration is of type sink, it is counted in place_move_ctx.num_sink_pin_layer, and we don't want to count the pin that is being moved
+            if (cluster_ctx.clb_nlist.pin_type(pin_id) != PinType::DRIVER) {
+                VTR_ASSERT(layer_blk_cnt[from.layer] > 0);
+                layer_blk_cnt[from.layer]--;
+            }
+        }
     }
 
     if ((place_move_ctx.X_coord.empty()) || (place_move_ctx.Y_coord.empty())) {
@@ -106,8 +121,17 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
     t_pl_loc w_median_point;
     w_median_point.x = (limit_coords.xmin + limit_coords.xmax) / 2;
     w_median_point.y = (limit_coords.ymin + limit_coords.ymax) / 2;
-    // TODO: Currently, we don't move blocks between different types of layers
-    w_median_point.layer = from.layer;
+
+    // If multiple layers are available, choose the median layer; otherwise the block stays on its current layer (layer #0).
+    //#TODO: Since we are now only considering 2 layers, the layer with the maximum number of sinks is chosen. We need to update this to get the true median.
+    if (is_multi_layer) {
+        int layer_num = std::distance(layer_blk_cnt.begin(), std::max_element(layer_blk_cnt.begin(), layer_blk_cnt.end()));
+        w_median_point.layer = layer_num;
+        to.layer = layer_num;
+    } else {
+        w_median_point.layer = from.layer;
+        to.layer = from.layer;
+    }
     if (!find_to_loc_centroid(cluster_from_type, from, w_median_point, range_limiters, to, b_from)) {
         return e_create_move::ABORT;
     }
diff --git a/vpr/src/route/connection_router.cpp b/vpr/src/route/connection_router.cpp
index 62db70ed31f..b99fb60b650 100644
--- a/vpr/src/route/connection_router.cpp
+++ b/vpr/src/route/connection_router.cpp
@@ -5,58 +5,31 @@
 #include "bucket.h"
 #include "rr_graph_fwd.h"
 
-static inline bool relevant_node_to_target(const RRGraphView* rr_graph,
-                                           RRNodeId node_to_add,
-                                           RRNodeId target_node) {
-    VTR_ASSERT(rr_graph->node_type(target_node) == t_rr_type::SINK);
-    auto node_to_add_type = rr_graph->node_type(node_to_add);
-    if (node_to_add_type == t_rr_type::OPIN || node_to_add_type == t_rr_type::SOURCE || node_to_add_type == t_rr_type::CHANX || node_to_add_type == t_rr_type::CHANY || node_to_add_type == SINK) {
-        return true;
-    } else if (node_in_same_physical_tile(node_to_add, target_node)) {
-        VTR_ASSERT(node_to_add_type == IPIN);
-        return true;
-    }
-    return false;
-}
-
-inline void update_router_stats(const DeviceContext& device_ctx,
+/**
+ * @brief This function is relevant when the architecture is 3D. If inter-layer connections are only from OPINs (determined by is_inter_layer_opin_connection),
+ * then nodes (other than OPINs) which are on a different layer than the sink's layer don't need to be pushed back to the heap.
+ * @param rr_nodes
+ * @param rr_graph
+ * @param from_node
+ * @param sink_node
+ * @param is_inter_layer_opin_connection It is true if the architecture is 3D and inter-layer connections are only from OPINs. 
+ * @return + */ +static bool has_path_to_sink(const t_rr_graph_view& rr_nodes, + const RRGraphView* rr_graph, + RRNodeId from_node, + RRNodeId sink_node, + bool is_inter_layer_opin_connection); + +static bool relevant_node_to_target(const RRGraphView* rr_graph, + RRNodeId node_to_add, + RRNodeId target_node); + +static void update_router_stats(const DeviceContext& device_ctx, const RRGraphView* rr_graph, RouterStats* router_stats, RRNodeId rr_node_id, - bool is_push) { - if (is_push) { - router_stats->heap_pushes++; - } else { - router_stats->heap_pops++; - } - - auto node_type = rr_graph->node_type(rr_node_id); - VTR_ASSERT(node_type != NUM_RR_TYPES); - t_physical_tile_type_ptr physical_type = device_ctx.grid.get_physical_type({rr_graph->node_xlow(rr_node_id), - rr_graph->node_ylow(rr_node_id), - rr_graph->node_layer(rr_node_id)}); - - if (is_inter_cluster_node(physical_type, - node_type, - rr_graph->node_ptc_num(rr_node_id))) { - if (is_push) { - router_stats->inter_cluster_node_pushes++; - router_stats->inter_cluster_node_type_cnt_pushes[node_type]++; - } else { - router_stats->inter_cluster_node_pops++; - router_stats->inter_cluster_node_type_cnt_pops[node_type]++; - } - - } else { - if (is_push) { - router_stats->intra_cluster_node_pushes++; - router_stats->intra_cluster_node_type_cnt_pushes[node_type]++; - } else { - router_stats->intra_cluster_node_pops++; - router_stats->intra_cluster_node_type_cnt_pops[node_type]++; - } - } -} + bool is_push); /** return tuple */ template @@ -112,9 +85,9 @@ std::tuple ConnectionRouter::timing_driven_route_connection return std::make_tuple(false, nullptr); } - VTR_LOGV_DEBUG(router_debug_, " Routing to %d as normal net (BB: %d,%d x %d,%d)\n", sink_node, - bounding_box.xmin, bounding_box.ymin, - bounding_box.xmax, bounding_box.ymax); + VTR_LOGV_DEBUG(router_debug_, " Routing to %d as normal net (BB: %d,%d,%d x %d,%d,%d)\n", sink_node, + bounding_box.layer_min, bounding_box.xmin, bounding_box.ymin, + bounding_box.layer_max, bounding_box.xmax, bounding_box.ymax); t_heap* cheapest = timing_driven_route_connection_from_heap(sink_node, cost_params, @@ -152,6 +125,8 @@ std::tuple ConnectionRouter::timing_driven_route_connection full_device_bounding_box.ymin = 0; full_device_bounding_box.xmax = grid_.width() - 1; full_device_bounding_box.ymax = grid_.height() - 1; + full_device_bounding_box.layer_min = 0; + full_device_bounding_box.layer_max = grid_.get_num_layers() - 1; // //TODO: potential future optimization @@ -220,9 +195,9 @@ std::tuple ConnectionRouter::timing_driven_route_conne return std::make_tuple(false, false, t_heap()); } - VTR_LOGV_DEBUG(router_debug_, " Routing to %d as high fanout net (BB: %d,%d x %d,%d)\n", sink_node, - high_fanout_bb.xmin, high_fanout_bb.ymin, - high_fanout_bb.xmax, high_fanout_bb.ymax); + VTR_LOGV_DEBUG(router_debug_, " Routing to %d as high fanout net (BB: %d,%d,%d x %d,%d,%d)\n", sink_node, + high_fanout_bb.layer_min, high_fanout_bb.xmin, high_fanout_bb.ymin, + high_fanout_bb.layer_max, high_fanout_bb.xmax, high_fanout_bb.ymax); bool retry_with_full_bb = false; t_heap* cheapest; @@ -473,10 +448,12 @@ void ConnectionRouter::timing_driven_expand_neighbours(t_heap* current, t_bb target_bb; if (target_node != RRNodeId::INVALID()) { - target_bb.xmin = rr_graph_->node_xlow(target_node); - target_bb.ymin = rr_graph_->node_ylow(target_node); - target_bb.xmax = rr_graph_->node_xhigh(target_node); - target_bb.ymax = rr_graph_->node_yhigh(target_node); + target_bb.xmin = rr_graph_->node_xlow(RRNodeId(target_node)); + 
target_bb.ymin = rr_graph_->node_ylow(RRNodeId(target_node)); + target_bb.xmax = rr_graph_->node_xhigh(RRNodeId(target_node)); + target_bb.ymax = rr_graph_->node_yhigh(RRNodeId(target_node)); + target_bb.layer_min = rr_graph_->node_layer(RRNodeId(target_node)); + target_bb.layer_max = rr_graph_->node_layer(RRNodeId(target_node)); } // For each node associated with the current heap element, expand all of it's neighbors @@ -537,6 +514,9 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, int to_ylow = rr_graph_->node_ylow(to_node); int to_xhigh = rr_graph_->node_xhigh(to_node); int to_yhigh = rr_graph_->node_yhigh(to_node); + int to_layer = rr_graph_->node_layer(to_node); + + VTR_ASSERT(bounding_box.layer_max < g_vpr_ctx.device().grid.get_num_layers()); // BB-pruning // Disable BB-pruning if RCV is enabled, as this can make it harder for circuits with high negative hold slack to resolve this @@ -544,15 +524,19 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, if ((to_xhigh < bounding_box.xmin // Strictly left of BB left-edge || to_xlow > bounding_box.xmax // Strictly right of BB right-edge || to_yhigh < bounding_box.ymin // Strictly below BB bottom-edge - || to_ylow > bounding_box.ymax) // Strictly above BB top-edge + || to_ylow > bounding_box.ymax + || to_layer < bounding_box.layer_min + || to_layer > bounding_box.layer_max) // Strictly above BB top-edge && !rcv_path_manager.is_enabled()) { VTR_LOGV_DEBUG(router_debug_, " Pruned expansion of node %d edge %zu -> %d" - " (to node location %d,%dx%d,%d outside of expanded" - " net bounding box %d,%dx%d,%d)\n", + " (to node location %d,%d,%d x %d,%d,%d outside of expanded" + " net bounding box %d,%d,%d x %d,%d,%d)\n", from_node, size_t(from_edge), size_t(to_node), - to_xlow, to_ylow, to_xhigh, to_yhigh, - bounding_box.xmin, bounding_box.ymin, bounding_box.xmax, bounding_box.ymax); + to_xlow, to_ylow, to_layer, + to_xhigh, to_yhigh, to_layer, + bounding_box.xmin, bounding_box.ymin, bounding_box.layer_min, + bounding_box.xmax, bounding_box.ymax, bounding_box.layer_max); return; /* Node is outside (expanded) bounding box. 
*/ } @@ -568,14 +552,18 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, if (to_xlow < target_bb.xmin || to_ylow < target_bb.ymin || to_xhigh > target_bb.xmax - || to_yhigh > target_bb.ymax) { + || to_yhigh > target_bb.ymax + || to_layer < target_bb.layer_min + || to_layer > target_bb.layer_max) { VTR_LOGV_DEBUG(router_debug_, " Pruned expansion of node %d edge %zu -> %d" - " (to node is IPIN at %d,%dx%d,%d which does not" - " lead to target block %d,%dx%d,%d)\n", + " (to node is IPIN at %d,%d,%d x %d,%d,%d which does not" + " lead to target block %d,%d,%d x %d,%d,%d)\n", from_node, size_t(from_edge), size_t(to_node), - to_xlow, to_ylow, to_xhigh, to_yhigh, - target_bb.xmin, target_bb.ymin, target_bb.xmax, target_bb.ymax); + to_xlow, to_ylow, to_layer, + to_xhigh, to_yhigh, to_layer, + target_bb.xmin, target_bb.ymin, target_bb.layer_min, + target_bb.xmax, target_bb.ymax, target_bb.layer_max); return; } } @@ -929,6 +917,9 @@ void ConnectionRouter::add_route_tree_to_heap( /* Pre-order depth-first traversal */ // IPINs and SINKS are not re_expanded if (rt_node.re_expand) { + if (target_node.is_valid() && !has_path_to_sink(rr_nodes_, rr_graph_, RRNodeId(rt_node.inode), RRNodeId(target_node), only_opin_inter_layer)) { + return; + } add_route_tree_node_to_heap(rt_node, target_node, cost_params, @@ -1021,6 +1012,9 @@ static t_bb adjust_highfanout_bounding_box(t_bb highfanout_bb) { bb.xmax += HIGH_FANOUT_BB_FAC; bb.ymax += HIGH_FANOUT_BB_FAC; + bb.layer_min = highfanout_bb.layer_min; + bb.layer_max = highfanout_bb.layer_max; + return bb; } @@ -1054,6 +1048,8 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( highfanout_bb.xmax = rr_graph_->node_xhigh(target_node); highfanout_bb.ymin = rr_graph_->node_ylow(target_node); highfanout_bb.ymax = rr_graph_->node_yhigh(target_node); + highfanout_bb.layer_min = rr_graph_->node_layer(target_node); + highfanout_bb.layer_max = rr_graph_->node_layer(target_node); //Add existing routing starting from the target bin. 
//If the target's bin has insufficient existing routing add from the surrounding bins @@ -1077,6 +1073,9 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( continue; } + if (!has_path_to_sink(rr_nodes_, rr_graph_, RRNodeId(rt_node.inode), target_node, only_opin_inter_layer)) { + continue; + } // Put the node onto the heap add_route_tree_node_to_heap(rt_node, target_node, cost_params, true); @@ -1085,6 +1084,8 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( highfanout_bb.ymin = std::min(highfanout_bb.ymin, rr_graph_->node_ylow(rr_node_to_add)); highfanout_bb.xmax = std::max(highfanout_bb.xmax, rr_graph_->node_xhigh(rr_node_to_add)); highfanout_bb.ymax = std::max(highfanout_bb.ymax, rr_graph_->node_yhigh(rr_node_to_add)); + highfanout_bb.layer_min = std::min(highfanout_bb.layer_min, rr_graph_->node_layer(rr_node_to_add)); + highfanout_bb.layer_max = std::max(highfanout_bb.layer_max, rr_graph_->node_layer(rr_node_to_add)); if (is_flat_) { if (rr_graph_->node_type(rr_node_to_add) == CHANY || rr_graph_->node_type(rr_node_to_add) == CHANX) { chan_nodes_added++; @@ -1121,6 +1122,84 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( return bounding_box; } +static inline bool has_path_to_sink(const t_rr_graph_view& rr_nodes, + const RRGraphView* rr_graph, + RRNodeId from_node, + RRNodeId sink_node, + bool is_inter_layer_opin_connection) { + int sink_layer = rr_graph->node_layer(sink_node); + + if (rr_graph->node_layer(from_node) == sink_layer || rr_graph->node_type(from_node) == SOURCE || !is_inter_layer_opin_connection) { + return true; + } else if (rr_graph->node_type(from_node) == CHANX || rr_graph->node_type(from_node) == CHANY || rr_graph->node_type(from_node) == IPIN) { + return false; + } else { + VTR_ASSERT(rr_graph->node_type(from_node) == OPIN && is_inter_layer_opin_connection); + auto edges = rr_nodes.edge_range(from_node); + + for (RREdgeId from_edge : edges) { + RRNodeId to_node = rr_nodes.edge_sink_node(from_edge); + if (rr_graph->node_layer(to_node) == sink_layer) { + return true; + } + } + return false; + } +} + +static inline bool relevant_node_to_target(const RRGraphView* rr_graph, + RRNodeId node_to_add, + RRNodeId target_node) { + VTR_ASSERT(rr_graph->node_type(target_node) == t_rr_type::SINK); + auto node_to_add_type = rr_graph->node_type(node_to_add); + if (node_to_add_type == t_rr_type::OPIN || node_to_add_type == t_rr_type::SOURCE || node_to_add_type == t_rr_type::CHANX || node_to_add_type == t_rr_type::CHANY || node_to_add_type == SINK) { + return true; + } else if (node_in_same_physical_tile(node_to_add, target_node)) { + VTR_ASSERT(node_to_add_type == IPIN); + return true; + } + return false; +} + +static inline void update_router_stats(const DeviceContext& device_ctx, + const RRGraphView* rr_graph, + RouterStats* router_stats, + RRNodeId rr_node_id, + bool is_push) { + if (is_push) { + router_stats->heap_pushes++; + } else { + router_stats->heap_pops++; + } + + auto node_type = rr_graph->node_type(rr_node_id); + VTR_ASSERT(node_type != NUM_RR_TYPES); + t_physical_tile_type_ptr physical_type = device_ctx.grid.get_physical_type({rr_graph->node_xlow(rr_node_id), + rr_graph->node_ylow(rr_node_id), + rr_graph->node_layer(rr_node_id)}); + + if (is_inter_cluster_node(physical_type, + node_type, + rr_graph->node_ptc_num(rr_node_id))) { + if (is_push) { + router_stats->inter_cluster_node_pushes++; + router_stats->inter_cluster_node_type_cnt_pushes[node_type]++; + } else { + router_stats->inter_cluster_node_pops++; + 
router_stats->inter_cluster_node_type_cnt_pops[node_type]++; + } + + } else { + if (is_push) { + router_stats->intra_cluster_node_pushes++; + router_stats->intra_cluster_node_type_cnt_pushes[node_type]++; + } else { + router_stats->intra_cluster_node_pops++; + router_stats->intra_cluster_node_type_cnt_pops[node_type]++; + } + } +} + std::unique_ptr make_connection_router(e_heap_type heap_type, const DeviceGrid& grid, const RouterLookahead& router_lookahead, diff --git a/vpr/src/route/connection_router.h b/vpr/src/route/connection_router.h index 5834e852409..093ab8fed83 100644 --- a/vpr/src/route/connection_router.h +++ b/vpr/src/route/connection_router.h @@ -47,6 +47,7 @@ class ConnectionRouter : public ConnectionRouterInterface { , router_debug_(false) { heap_.init_heap(grid); heap_.set_prune_limit(rr_nodes_.size(), kHeapPruneFactor * rr_nodes_.size()); + only_opin_inter_layer = (grid.get_num_layers() > 1) && inter_layer_connections_limited_to_opin(*rr_graph); } // Clear's the modified list. Should be called after reset_path_costs @@ -286,6 +287,8 @@ class ConnectionRouter : public ConnectionRouterInterface { HeapImplementation heap_; bool router_debug_; + bool only_opin_inter_layer; + // The path manager for RCV, keeps track of the route tree as a set, also manages the allocation of the heap types PathManager rcv_path_manager; }; diff --git a/vpr/src/route/route_common.cpp b/vpr/src/route/route_common.cpp index 99d116b0de6..2a2f9cb787e 100644 --- a/vpr/src/route/route_common.cpp +++ b/vpr/src/route/route_common.cpp @@ -835,6 +835,8 @@ vtr::vector load_route_bb(const Netlist<>& net_list, full_device_bounding_box.ymin = 0; full_device_bounding_box.xmax = device_ctx.grid.width() - 1; full_device_bounding_box.ymax = device_ctx.grid.height() - 1; + full_device_bounding_box.layer_min = 0; + full_device_bounding_box.layer_max = device_ctx.grid.get_num_layers() - 1; } auto nets = net_list.nets(); @@ -905,6 +907,8 @@ t_bb load_net_route_bb(const Netlist<>& net_list, int ymin = rr_graph.node_ylow(driver_rr); int xmax = rr_graph.node_xhigh(driver_rr); int ymax = rr_graph.node_yhigh(driver_rr); + int layer_min = rr_graph.node_layer(driver_rr); + int layer_max = rr_graph.node_layer(driver_rr); auto net_sinks = net_list.net_sinks(net_id); for (size_t ipin = 1; ipin < net_sinks.size() + 1; ++ipin) { //Start at 1 since looping through sinks @@ -914,10 +918,15 @@ t_bb load_net_route_bb(const Netlist<>& net_list, VTR_ASSERT(rr_graph.node_xlow(sink_rr) <= rr_graph.node_xhigh(sink_rr)); VTR_ASSERT(rr_graph.node_ylow(sink_rr) <= rr_graph.node_yhigh(sink_rr)); + VTR_ASSERT(rr_graph.node_layer(sink_rr) >= 0); + VTR_ASSERT(rr_graph.node_layer(sink_rr) <= device_ctx.grid.get_num_layers() - 1); + xmin = std::min(xmin, rr_graph.node_xlow(sink_rr)); xmax = std::max(xmax, rr_graph.node_xhigh(sink_rr)); ymin = std::min(ymin, rr_graph.node_ylow(sink_rr)); ymax = std::max(ymax, rr_graph.node_yhigh(sink_rr)); + layer_min = std::min(layer_min, rr_graph.node_layer(sink_rr)); + layer_max = std::max(layer_max, rr_graph.node_layer(sink_rr)); } /* Want the channels on all 4 sides to be usuable, even if bb_factor = 0. 
@@ -905,6 +907,8 @@ t_bb load_net_route_bb(const Netlist<>& net_list,
     int ymin = rr_graph.node_ylow(driver_rr);
     int xmax = rr_graph.node_xhigh(driver_rr);
     int ymax = rr_graph.node_yhigh(driver_rr);
+    int layer_min = rr_graph.node_layer(driver_rr);
+    int layer_max = rr_graph.node_layer(driver_rr);

     auto net_sinks = net_list.net_sinks(net_id);
     for (size_t ipin = 1; ipin < net_sinks.size() + 1; ++ipin) { //Start at 1 since looping through sinks
@@ -914,10 +918,15 @@ t_bb load_net_route_bb(const Netlist<>& net_list,
         VTR_ASSERT(rr_graph.node_xlow(sink_rr) <= rr_graph.node_xhigh(sink_rr));
         VTR_ASSERT(rr_graph.node_ylow(sink_rr) <= rr_graph.node_yhigh(sink_rr));

+        VTR_ASSERT(rr_graph.node_layer(sink_rr) >= 0);
+        VTR_ASSERT(rr_graph.node_layer(sink_rr) <= device_ctx.grid.get_num_layers() - 1);
+
         xmin = std::min(xmin, rr_graph.node_xlow(sink_rr));
         xmax = std::max(xmax, rr_graph.node_xhigh(sink_rr));
         ymin = std::min(ymin, rr_graph.node_ylow(sink_rr));
         ymax = std::max(ymax, rr_graph.node_yhigh(sink_rr));
+        layer_min = std::min(layer_min, rr_graph.node_layer(sink_rr));
+        layer_max = std::max(layer_max, rr_graph.node_layer(sink_rr));
     }

     /* Want the channels on all 4 sides to be usable, even if bb_factor = 0.
      */
@@ -933,6 +942,8 @@ t_bb load_net_route_bb(const Netlist<>& net_list,
     bb.xmax = std::min(xmax + bb_factor, device_ctx.grid.width() - 1);
     bb.ymin = std::max(ymin - bb_factor, 0);
     bb.ymax = std::min(ymax + bb_factor, device_ctx.grid.height() - 1);
+    bb.layer_min = layer_min;
+    bb.layer_max = layer_max;

     return bb;
 }
diff --git a/vpr/src/route/route_parallel.cpp b/vpr/src/route/route_parallel.cpp
index 96e6464f62b..b3a6dda3b72 100644
--- a/vpr/src/route/route_parallel.cpp
+++ b/vpr/src/route/route_parallel.cpp
@@ -985,7 +985,9 @@ RouteIterResults route_partition_tree(tbb::task_group& g,
                                0,
                                (int)(device_ctx.grid.width() - 1),
                                0,
-                               (int)(device_ctx.grid.height() - 1)};
+                               (int)(device_ctx.grid.height() - 1),
+                               0,
+                               (int)(device_ctx.grid.get_num_layers() - 1)};
             tree.root().nets.push_back(net_id);
         }
     }
diff --git a/vpr/src/route/route_timing.cpp b/vpr/src/route/route_timing.cpp
index 62930ad2555..2b497066e32 100644
--- a/vpr/src/route/route_timing.cpp
+++ b/vpr/src/route/route_timing.cpp
@@ -1908,8 +1908,10 @@ t_bb calc_current_bb(const RouteTree& tree) {
     t_bb bb;
     bb.xmin = grid.width() - 1;
     bb.ymin = grid.height() - 1;
+    bb.layer_min = grid.get_num_layers() - 1;
     bb.xmax = 0;
     bb.ymax = 0;
+    bb.layer_max = 0;

     for (auto& rt_node : tree.all_nodes()) {
         //The router interprets RR nodes which cross the boundary as being
@@ -1918,8 +1920,10 @@ t_bb calc_current_bb(const RouteTree& tree) {
         //and xlow/ylow for xmax/ymax calculations
         bb.xmin = std::min(bb.xmin, rr_graph.node_xhigh(rt_node.inode));
         bb.ymin = std::min(bb.ymin, rr_graph.node_yhigh(rt_node.inode));
+        bb.layer_min = std::min(bb.layer_min, rr_graph.node_layer(rt_node.inode));
         bb.xmax = std::max(bb.xmax, rr_graph.node_xlow(rt_node.inode));
         bb.ymax = std::max(bb.ymax, rr_graph.node_ylow(rt_node.inode));
+        bb.layer_max = std::max(bb.layer_max, rr_graph.node_layer(rt_node.inode));
     }

     VTR_ASSERT(bb.xmin <= bb.xmax);
diff --git a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp
index 4e2274c406f..eac8fdf28c4 100644
--- a/vpr/src/route/router_delay_profiling.cpp
+++ b/vpr/src/route/router_delay_profiling.cpp
@@ -23,7 +23,11 @@ RouterDelayProfiler::RouterDelayProfiler(const Netlist<>& net_list,
                       is_flat)
     , is_flat_(is_flat) {}

-bool RouterDelayProfiler::calculate_delay(RRNodeId source_node, RRNodeId sink_node, const t_router_opts& router_opts, float* net_delay) {
+bool RouterDelayProfiler::calculate_delay(RRNodeId source_node,
+                                          RRNodeId sink_node,
+                                          const t_router_opts& router_opts,
+                                          float* net_delay,
+                                          int layer_num) {
     /* Returns true as long as it found some way to hook up this net, even if that *
      * way resulted in overuse of resources (congestion). If there is no way       *
      * to route this net, even ignoring congestion, it returns false. In this      *
@@ -54,6 +58,14 @@ bool RouterDelayProfiler::calculate_delay(RRNodeId source_node, RRNodeId sink_no
     bounding_box.xmax = device_ctx.grid.width() + 1;
     bounding_box.ymin = 0;
     bounding_box.ymax = device_ctx.grid.height() + 1;
+    // If layer_num is not specified, it means the BB should cover all layers
+    if (layer_num == OPEN) {
+        bounding_box.layer_min = 0;
+        bounding_box.layer_max = device_ctx.grid.get_num_layers() - 1;
+    } else {
+        bounding_box.layer_min = layer_num;
+        bounding_box.layer_max = layer_num;
+    }

     t_conn_cost_params cost_params;
     cost_params.criticality = 1.;
@@ -81,7 +93,7 @@ bool RouterDelayProfiler::calculate_delay(RRNodeId source_node, RRNodeId sink_no
                                                                   true);

     if (found_path) {
-        VTR_ASSERT(RRNodeId(cheapest.index) == sink_node);
+        VTR_ASSERT(cheapest.index == sink_node);
         vtr::optional rt_node_of_sink;
         std::tie(std::ignore, rt_node_of_sink) = tree.update_from_heap(&cheapest, OPEN, nullptr, is_flat_);

@@ -111,13 +123,15 @@ vtr::vector calculate_all_path_delays_from_rr_node(RRNodeId src
     vtr::vector path_delays_to(device_ctx.rr_graph.num_nodes(),
                                std::numeric_limits<float>::quiet_NaN());

-    RouteTree tree((RRNodeId(src_rr_node)));
+    RouteTree tree(src_rr_node);

     t_bb bounding_box;
     bounding_box.xmin = 0;
     bounding_box.xmax = device_ctx.grid.width() + 1;
     bounding_box.ymin = 0;
     bounding_box.ymax = device_ctx.grid.height() + 1;
+    bounding_box.layer_min = 0;
+    bounding_box.layer_max = device_ctx.grid.get_num_layers() - 1;

     t_conn_cost_params cost_params;
     cost_params.criticality = 1.;
diff --git a/vpr/src/route/router_delay_profiling.h b/vpr/src/route/router_delay_profiling.h
index d4dcbb5dac7..11d8eb25f1d 100644
--- a/vpr/src/route/router_delay_profiling.h
+++ b/vpr/src/route/router_delay_profiling.h
@@ -13,7 +13,24 @@ class RouterDelayProfiler {
     RouterDelayProfiler(const Netlist<>& net_list,
                         const RouterLookahead* lookahead,
                         bool is_flat);
-    bool calculate_delay(RRNodeId source_node, RRNodeId sink_node, const t_router_opts& router_opts, float* net_delay);
+
+    /**
+     * @brief Returns true as long as it found some way to hook up this net, even if that
+     * way resulted in overuse of resources (congestion). If there is no way
+     * to route this net, even ignoring congestion, it returns false. In this
+     * case the rr_graph is disconnected and you can give up.
+     * @param source_node
+     * @param sink_node
+     * @param router_opts
+     * @param net_delay
+     * @param layer_num
+     * @return
+     */
+    bool calculate_delay(RRNodeId source_node,
+                         RRNodeId sink_node,
+                         const t_router_opts& router_opts,
+                         float* net_delay,
+                         int layer_num);

   private:
     const Netlist<>& net_list_;
diff --git a/vpr/src/route/router_lookahead_cost_map.cpp b/vpr/src/route/router_lookahead_cost_map.cpp
index 9e3de711d9d..c00257a1ce4 100644
--- a/vpr/src/route/router_lookahead_cost_map.cpp
+++ b/vpr/src/route/router_lookahead_cost_map.cpp
@@ -397,6 +397,8 @@ std::pair CostMap::get_nearby_cost_entry(const vtr::NdMat
  * the cost map data structures, exploiting the capnp serialization.
  */
+#ifdef VTR_ENABLE_CAPNPROTO
+
 static void ToCostEntry(util::Cost_Entry* out, const VprCostEntry::Reader& in) {
     out->delay = in.getDelay();
     out->congestion = in.getCongestion();
@@ -491,3 +493,15 @@ void CostMap::write(const std::string& file) const {

     writeMessageToFile(file, &builder);
 }
+
+#else
+
+void CostMap::read(const std::string& /*file*/) {
+    VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Read CostMap requires the support of capnp");
+}
+
+void CostMap::write(const std::string& /*file*/) const {
+    VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Write CostMap requires the support of capnp");
+}
+
+#endif
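The read/write stubs above follow a compile-time fallback pattern: when capnp support is disabled the functions still compile and link, but fail loudly if called. A generic sketch of the same pattern; save_costs is an illustrative name, not part of this patch:

    #include <stdexcept>
    #include <string>

    #ifdef VTR_ENABLE_CAPNPROTO
    void save_costs(const std::string& file); // real capnp-backed writer defined elsewhere
    #else
    void save_costs(const std::string& /*file*/) {
        // Compiles without capnp; any attempt to use serialization fails at runtime.
        throw std::runtime_error("save_costs requires capnp support");
    }
    #endif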
diff --git a/vpr/src/route/router_lookahead_extended_map.cpp b/vpr/src/route/router_lookahead_extended_map.cpp
index fa5a9a9849e..b176372e686 100644
--- a/vpr/src/route/router_lookahead_extended_map.cpp
+++ b/vpr/src/route/router_lookahead_extended_map.cpp
@@ -419,7 +419,8 @@ std::pair ExtendedMapLookahead::run_dijkstra(RRNodeId start_node,

 // compute the cost maps for lookahead
 void ExtendedMapLookahead::compute(const std::vector& segment_inf) {
-    this->src_opin_delays = util::compute_router_src_opin_lookahead(is_flat_);
+    std::tie(this->src_opin_delays, this->src_opin_inter_layer_delays) = util::compute_router_src_opin_lookahead(is_flat_);
+
     this->chan_ipins_delays = util::compute_router_chan_ipin_lookahead();

     vtr::ScopedStartFinishTimer timer("Computing connection box lookahead map");
@@ -614,7 +615,8 @@ void ExtendedMapLookahead::write(const std::string& file) const {
 void ExtendedMapLookahead::read(const std::string& file) {
     cost_map_.read(file);

-    this->src_opin_delays = util::compute_router_src_opin_lookahead(is_flat_);
+    std::tie(this->src_opin_delays, this->src_opin_inter_layer_delays) = util::compute_router_src_opin_lookahead(is_flat_);
+
     this->chan_ipins_delays = util::compute_router_chan_ipin_lookahead();
 }

 void ExtendedMapLookahead::write(const std::string& file) const {
diff --git a/vpr/src/route/router_lookahead_extended_map.h b/vpr/src/route/router_lookahead_extended_map.h
index 424a1dfa23d..ccd3faaa2ad 100644
--- a/vpr/src/route/router_lookahead_extended_map.h
+++ b/vpr/src/route/router_lookahead_extended_map.h
@@ -19,6 +19,9 @@ class ExtendedMapLookahead : public RouterLookahead {
[hunk body not recoverable from this capture]
diff --git a/vpr/src/route/router_lookahead_map.cpp b/vpr/src/route/router_lookahead_map.cpp
[index line and hunk position not recoverable from this capture]
 static void min_global_cost_map(vtr::NdMatrix& internal_opin_global_cost_map);
+/**
+ * @brief Iterate over all of the wire segments accessible from the SOURCE/OPIN (stored in src_opin_delay_map) and return the minimum cost (congestion and delay) across them to the sink
+ * @param src_opin_delay_map
+ * @param layer_num
+ * @param delta_x
+ * @param delta_y
+ * @return (delay, congestion)
+ */
+static std::pair<float, float> get_cost_from_src_opin(const std::map<int, util::t_reachable_wire_inf>& src_opin_delay_map,
+                                                      int layer_num,
+                                                      int delta_x,
+                                                      int delta_y);
+
 // Read the file and fill inter_tile_pin_primitive_pin_delay and tile_min_cost
 static void read_intra_cluster_router_lookahead(std::unordered_map& inter_tile_pin_primitive_pin_delay,
                                                 const std::string& file);
@@ -295,6 +308,10 @@ static void print_wire_cost_map(int layer_num, const std::vector&
 static void print_router_cost_map(const t_routing_cost_map& router_cost_map);

 /******** Interface class member function definitions ********/
+MapLookahead::MapLookahead(const t_det_routing_arch& det_routing_arch, bool is_flat)
+    : det_routing_arch_(det_routing_arch)
+    , is_flat_(is_flat) {}
+
 float MapLookahead::get_expected_cost(RRNodeId current_node, RRNodeId target_node, const t_conn_cost_params& params, float R_upstream) const {
     auto& device_ctx = g_vpr_ctx.device();
     const auto& rr_graph = device_ctx.rr_graph;
@@ -430,6 +447,7 @@ std::pair MapLookahead::get_expected_delay_and_cong(RRNodeId from_
     int delta_x, delta_y;
     int from_layer_num = rr_graph.node_layer(from_node);
+    int to_layer_num = rr_graph.node_layer(to_node);
     get_xy_deltas(from_node, to_node, &delta_x, &delta_y);
     delta_x = abs(delta_x);
     delta_y = abs(delta_y);
@@ -444,66 +462,31 @@ std::pair MapLookahead::get_expected_delay_and_cong(RRNodeId from_
         //reachable, we query the f_wire_cost_map (i.e. the wire lookahead) to get the final
         //delay to reach the sink.

-        t_physical_tile_type_ptr tile_type = device_ctx.grid.get_physical_type({rr_graph.node_xlow(from_node),
-                                                                                rr_graph.node_ylow(from_node),
-                                                                                from_layer_num});
+        t_physical_tile_type_ptr from_tile_type = device_ctx.grid.get_physical_type({rr_graph.node_xlow(from_node),
+                                                                                     rr_graph.node_ylow(from_node),
+                                                                                     from_layer_num});

-        auto tile_index = std::distance(&device_ctx.physical_tile_types[0], tile_type);
+        auto from_tile_index = std::distance(&device_ctx.physical_tile_types[0], from_tile_type);

         auto from_ptc = rr_graph.node_ptc_num(from_node);

-        if (this->src_opin_delays[from_layer_num][tile_index][from_ptc].empty()) {
-            //During lookahead profiling we were unable to find any wires which connected
-            //to this PTC.
-            //
-            //This can sometimes occur at very low channel widths (e.g. during min W search on
-            //small designs) where W discretization combined with fraction Fc may cause some
-            //pins/sources to be left disconnected.
-            //
-            //Such RR graphs are of course unroutable, but that should be determined by the
-            //router. So just return an arbitrary value here rather than error.
-
-            //We choose to return the largest (non-infinite) value possible, but scaled
-            //down by a large factor to maintain some dynaimc range in case this value ends
-            //up being processed (e.g. by the timing analyzer).
-            //
-            //The cost estimate should still be *extremely* large compared to a typical delay, and
-            //so should ensure that the router de-prioritizes exploring this path, but does not
-            //forbid the router from trying.
-            expected_delay_cost = std::numeric_limits<float>::max() / 1e12;
-            expected_cong_cost = std::numeric_limits<float>::max() / 1e12;
-        } else {
-            //From the current SOURCE/OPIN we look-up the wiretypes which are reachable
-            //and then add the estimates from those wire types for the distance of interest.
-            //If there are multiple options we use the minimum value.
-            for (const auto& kv : this->src_opin_delays[from_layer_num][tile_index][from_ptc]) {
-                const util::t_reachable_wire_inf& reachable_wire_inf = kv.second;
-
-                Cost_Entry wire_cost_entry;
-                if (reachable_wire_inf.wire_rr_type == SINK) {
-                    //Some pins maybe reachable via a direct (OPIN -> IPIN) connection.
-                    //In the lookahead, we treat such connections as 'special' wire types
-                    //with no delay/congestion cost
-                    wire_cost_entry.delay = 0;
-                    wire_cost_entry.congestion = 0;
-                } else {
-                    //For an actual accessible wire, we query the wire look-up to get it's
-                    //delay and congestion cost estimates
-                    wire_cost_entry = get_wire_cost_entry(reachable_wire_inf.wire_rr_type,
-                                                          reachable_wire_inf.wire_seg_index,
-                                                          from_layer_num,
-                                                          delta_x,
-                                                          delta_y);
-                }
-
-                float this_delay_cost = (params.criticality) * (reachable_wire_inf.delay + wire_cost_entry.delay);
-                float this_cong_cost = (1. - params.criticality) * (reachable_wire_inf.congestion + wire_cost_entry.congestion);
-
-                expected_delay_cost = std::min(expected_delay_cost, this_delay_cost);
-                expected_cong_cost = std::min(expected_cong_cost, this_cong_cost);
-            }
+        // Currently, we assume inter-layer connections are only from a block output pin to another layer. Thus, if the from and to layers are different,
+        // we use src_opin_inter_layer_delays.
+        if (from_layer_num == to_layer_num) {
+            std::tie(expected_delay_cost, expected_cong_cost) = get_cost_from_src_opin(src_opin_delays[from_layer_num][from_tile_index][from_ptc],
+                                                                                       from_layer_num,
+                                                                                       delta_x,
+                                                                                       delta_y);
+        } else if (from_layer_num != to_layer_num) {
+            std::tie(expected_delay_cost, expected_cong_cost) = get_cost_from_src_opin(src_opin_inter_layer_delays[from_layer_num][from_tile_index][from_ptc][to_layer_num],
+                                                                                       to_layer_num,
+                                                                                       delta_x,
+                                                                                       delta_y);
         }
+
+        expected_delay_cost *= params.criticality;
+        expected_cong_cost *= (1 - params.criticality);
+
         VTR_ASSERT_SAFE_MSG(std::isfinite(expected_delay_cost),
                             vtr::string_fmt("Lookahead failed to estimate cost from %s: %s",
                                             rr_node_arch_name(from_node, is_flat_).c_str(),
@@ -516,7 +499,6 @@ std::pair MapLookahead::get_expected_delay_and_cong(RRNodeId from_
                                 .c_str());

     } else if (from_type == CHANX || from_type == CHANY) {
-        VTR_ASSERT_SAFE(from_type == CHANX || from_type == CHANY);
         //When estimating costs from a wire, we directly look-up the result in the wire lookahead (f_wire_cost_map)

         auto from_cost_index = rr_graph.node_cost_index(from_node);
@@ -524,29 +506,34 @@ std::pair MapLookahead::get_expected_delay_and_cong(RRNodeId from_

         VTR_ASSERT(from_seg_index >= 0);

-        /* now get the expected cost from our lookahead map */
-        Cost_Entry cost_entry = get_wire_cost_entry(from_type,
-                                                    from_seg_index,
-                                                    from_layer_num,
-                                                    delta_x,
-                                                    delta_y);
-
-        float expected_delay = cost_entry.delay;
-        float expected_cong = cost_entry.congestion;
-
-        expected_delay_cost = params.criticality * expected_delay;
-        expected_cong_cost = (1.0 - params.criticality) * expected_cong;
-
-        VTR_ASSERT_SAFE_MSG(std::isfinite(expected_delay_cost),
-                            vtr::string_fmt("Lookahead failed to estimate cost from %s: %s",
-                                            rr_node_arch_name(from_node, is_flat_).c_str(),
-                                            describe_rr_node(rr_graph,
-                                                             device_ctx.grid,
-                                                             device_ctx.rr_indexed_data,
-                                                             from_node,
-                                                             is_flat_)
-                                                .c_str())
-                                .c_str());
+        // Since we assume that inter-layer connections are only from a block output pin to another layer, if the from node
+        // is of type CHANX/CHANY, and the sink node is on the other layer, there will be no path from that node to the sink
+        if (from_layer_num != to_layer_num) {
+            expected_delay_cost = std::numeric_limits<float>::max() / 1e12;
+            expected_cong_cost = std::numeric_limits<float>::max() / 1e12;
+        } else {
+            /* now get the expected cost from our lookahead map */
+            Cost_Entry cost_entry = get_wire_cost_entry(from_type,
+                                                        from_seg_index,
+                                                        from_layer_num,
+                                                        delta_x,
+                                                        delta_y);
+            expected_delay_cost = cost_entry.delay;
+            expected_cong_cost = cost_entry.congestion;
+
+            VTR_ASSERT_SAFE_MSG(std::isfinite(expected_delay_cost),
+                                vtr::string_fmt("Lookahead failed to estimate cost from %s: %s",
+                                                rr_node_arch_name(from_node, is_flat_).c_str(),
+                                                describe_rr_node(rr_graph,
+                                                                 device_ctx.grid,
+                                                                 device_ctx.rr_indexed_data,
+                                                                 from_node,
+                                                                 is_flat_)
+                                                    .c_str())
+                                    .c_str());
+        }
+        expected_delay_cost *= params.criticality;
+        expected_cong_cost *= (1 - params.criticality);
     } else if (from_type == IPIN) { /* Change if you're allowing route-throughs */
         return std::make_pair(0., device_ctx.rr_indexed_data[RRIndexedDataId(SINK_COST_INDEX)].base_cost);

     } else { /* Change this if you want to investigate route-throughs */
@@ -565,7 +552,7 @@ void MapLookahead::compute(const std::vector& segment_inf) {

     //Next, compute which wire types are accessible (and the cost to reach them)
     //from the different physical tile type's SOURCEs & OPINs
-    this->src_opin_delays = util::compute_router_src_opin_lookahead(is_flat_);
+    std::tie(this->src_opin_delays, this->src_opin_inter_layer_delays) = util::compute_router_src_opin_lookahead(is_flat_);
 }

 void MapLookahead::compute_intra_tile() {
@@ -588,7 +575,7 @@ void MapLookahead::read(const std::string& file) {

     //Next, compute which wire types are accessible (and the cost to reach them)
     //from the different physical tile type's SOURCEs & OPINs
-    this->src_opin_delays = util::compute_router_src_opin_lookahead(is_flat_);
+    std::tie(this->src_opin_delays, this->src_opin_inter_layer_delays) = util::compute_router_src_opin_lookahead(is_flat_);
 }

 void MapLookahead::read_intra_cluster(const std::string& file) {
@@ -1503,6 +1490,67 @@ static void min_global_cost_map(vtr::NdMatrix& internal_opi
     }
 }

+static std::pair<float, float> get_cost_from_src_opin(const std::map<int, util::t_reachable_wire_inf>& src_opin_delay_map,
+                                                      int layer_num,
+                                                      int delta_x,
+                                                      int delta_y) {
+    float expected_delay_cost = std::numeric_limits<float>::infinity();
+    float expected_cong_cost = std::numeric_limits<float>::infinity();
+    if (src_opin_delay_map.empty()) {
+        //During lookahead profiling we were unable to find any wires which connected
+        //to this PTC.
+        //
+        //This can sometimes occur at very low channel widths (e.g. during min W search on
+        //small designs) where W discretization combined with fraction Fc may cause some
+        //pins/sources to be left disconnected.
+        //
+        //Such RR graphs are of course unroutable, but that should be determined by the
+        //router. So just return an arbitrary value here rather than error.
+
+        //We choose to return the largest (non-infinite) value possible, but scaled
+        //down by a large factor to maintain some dynamic range in case this value ends
+        //up being processed (e.g. by the timing analyzer).
+        //
+        //The cost estimate should still be *extremely* large compared to a typical delay, and
+        //so should ensure that the router de-prioritizes exploring this path, but does not
+        //forbid the router from trying.
+        expected_delay_cost = std::numeric_limits<float>::max() / 1e12;
+        expected_cong_cost = std::numeric_limits<float>::max() / 1e12;
+    } else {
+        //From the current SOURCE/OPIN we look-up the wire types which are reachable
+        //and then add the estimates from those wire types for the distance of interest.
+        //If there are multiple options we use the minimum value.
+        for (const auto& kv : src_opin_delay_map) {
+            const util::t_reachable_wire_inf& reachable_wire_inf = kv.second;
+
+            Cost_Entry wire_cost_entry;
+            if (reachable_wire_inf.wire_rr_type == SINK) {
+                //Some pins may be reachable via a direct (OPIN -> IPIN) connection.
+                //In the lookahead, we treat such connections as 'special' wire types
+                //with no delay/congestion cost
+                wire_cost_entry.delay = 0;
+                wire_cost_entry.congestion = 0;
+            } else {
+                //For an actual accessible wire, we query the wire look-up to get its
+                //delay and congestion cost estimates
+                wire_cost_entry = get_wire_cost_entry(reachable_wire_inf.wire_rr_type,
+                                                      reachable_wire_inf.wire_seg_index,
+                                                      layer_num,
+                                                      delta_x,
+                                                      delta_y);
+            }
+
+            float this_delay_cost = reachable_wire_inf.delay + wire_cost_entry.delay;
+            float this_cong_cost = reachable_wire_inf.congestion + wire_cost_entry.congestion;
+
+            expected_delay_cost = std::min(expected_delay_cost, this_delay_cost);
+            expected_cong_cost = std::min(expected_cong_cost, this_cong_cost);
+        }
+    }
+
+    return std::make_pair(expected_delay_cost, expected_cong_cost);
+}
+
 //
 // When writing capnp targeted serialization, always allow compilation when
 // VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception
@@ -1518,7 +1566,7 @@ void read_router_lookahead(const std::string& /*file*/) {
     VPR_THROW(VPR_ERROR_PLACE, "MapLookahead::read " DISABLE_ERROR);
 }

-void DeltaDelayModel::write(const std::string& /*file*/) const {
+void write_router_lookahead(const std::string& file) {
     VPR_THROW(VPR_ERROR_PLACE, "MapLookahead::write " DISABLE_ERROR);
 }
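A worked example of how the (delay, congestion) pair produced by get_cost_from_src_opin() is weighted before use, per the criticality scaling above (all values illustrative):

    float delay_est = 3.0e-9f; // seconds, from the wire cost map
    float cong_est = 12.0f;    // congestion units, from the wire cost map
    float criticality = 0.8f;  // a timing-critical connection

    float expected_delay_cost = delay_est * criticality;        // 2.4e-9
    float expected_cong_cost = cong_est * (1.0f - criticality); // 2.4

With criticality near 1 the delay term dominates; near 0, congestion does.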
diff --git a/vpr/src/route/router_lookahead_map.h b/vpr/src/route/router_lookahead_map.h
index f3e82531510..d6340acac85 100644
--- a/vpr/src/route/router_lookahead_map.h
+++ b/vpr/src/route/router_lookahead_map.h
@@ -8,9 +8,7 @@
 class MapLookahead : public RouterLookahead {
   public:
-    explicit MapLookahead(const t_det_routing_arch& det_routing_arch, bool is_flat)
-        : det_routing_arch_(det_routing_arch)
-        , is_flat_(is_flat) {}
+    explicit MapLookahead(const t_det_routing_arch& det_routing_arch, bool is_flat);

   private:
     //Look-up table from SOURCE/OPIN to CHANX/CHANY of various types
@@ -21,6 +19,9 @@ class MapLookahead : public RouterLookahead {
     std::unordered_map> tile_min_cost; // [physical_tile_type][sink_physical_num] -> cost
     // Lookup table to store the minimum cost for each dx and dy
     vtr::NdMatrix distance_based_min_cost; // [layer_num][dx][dy] -> cost
+    // [from_layer_num][tile_index][ptc_num][to_layer_num] -> (delay, congestion)
+    util::t_src_opin_inter_layer_delays src_opin_inter_layer_delays;
+
     const t_det_routing_arch& det_routing_arch_;
     bool is_flat_;
diff --git a/vpr/src/route/router_lookahead_map_utils.cpp b/vpr/src/route/router_lookahead_map_utils.cpp
index e3141e947c3..01c0e79d16c 100644
--- a/vpr/src/route/router_lookahead_map_utils.cpp
+++ b/vpr/src/route/router_lookahead_map_utils.cpp
@@ -19,7 +19,8 @@
 #include "route_common.h"
 #include "route_timing.h"

-static void dijkstra_flood_to_wires(int itile, RRNodeId inode, util::t_src_opin_delays& src_opin_delays);
+static void dijkstra_flood_to_wires(int itile, RRNodeId inode, util::t_src_opin_delays& src_opin_delays, util::t_src_opin_inter_layer_delays& src_opin_inter_layer_delays, bool is_multi_layer);
+
 static void dijkstra_flood_to_ipins(RRNodeId node, util::t_chan_ipins_delays& chan_ipins_delays);

 static t_physical_tile_loc pick_sample_tile(int layer_num, t_physical_tile_type_ptr tile_type, t_physical_tile_loc prev);
@@ -305,21 +306,32 @@ template void expand_dijkstra_neighbours(const RRGraphView& rr_graph,
                                          std::vector, std::greater>* pq);

-t_src_opin_delays compute_router_src_opin_lookahead(bool is_flat) {
+std::pair<t_src_opin_delays, t_src_opin_inter_layer_delays> compute_router_src_opin_lookahead(bool is_flat) {
     vtr::ScopedStartFinishTimer timer("Computing src/opin lookahead");
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_graph = device_ctx.rr_graph;

-    t_src_opin_delays src_opin_delays;
+    int num_layers = device_ctx.grid.get_num_layers();
+    bool is_multi_layer = (num_layers > 1);

-    src_opin_delays.resize(device_ctx.grid.get_num_layers());
-    for (int layer_num = 0; layer_num < device_ctx.grid.get_num_layers(); layer_num++) {
+    t_src_opin_delays src_opin_delays;
+    src_opin_delays.resize(num_layers);
+    for (int layer_num = 0; layer_num < num_layers; layer_num++) {
         src_opin_delays[layer_num].resize(device_ctx.physical_tile_types.size());
     }

+    t_src_opin_inter_layer_delays src_opin_inter_layer_delays;
+    if (is_multi_layer) {
+        src_opin_inter_layer_delays.resize(num_layers);
+        for (int layer_num = 0; layer_num < num_layers; layer_num++) {
+            int num_physical_tiles = (int)device_ctx.physical_tile_types.size();
+            src_opin_inter_layer_delays[layer_num].resize(num_physical_tiles);
+        }
+    }
+
     //We assume that the routing connectivity of each instance of a physical tile is the same,
     //and so only measure one instance of each type
-    for (int layer_num = 0; layer_num < device_ctx.grid.get_num_layers(); layer_num++) {
+    for (int layer_num = 0; layer_num < num_layers; layer_num++) {
         for (size_t itile = 0; itile < device_ctx.physical_tile_types.size(); ++itile) {
             if (device_ctx.grid.num_instances(&device_ctx.physical_tile_types[itile], layer_num) == 0) {
                 continue;
@@ -356,11 +368,22 @@ t_src_opin_delays compute_router_src_opin_lookahead(bool is_flat) {
                     if (ptc >= int(src_opin_delays[layer_num][itile].size())) {
                         src_opin_delays[layer_num][itile].resize(ptc + 1); //Inefficient but functional...
+                        if (is_multi_layer) {
+                            size_t old_size = src_opin_inter_layer_delays[layer_num][itile].size();
+                            src_opin_inter_layer_delays[layer_num][itile].resize(ptc + 1);
+                            for (size_t i = old_size; i < src_opin_inter_layer_delays[layer_num][itile].size(); ++i) {
+                                src_opin_inter_layer_delays[layer_num][itile][i].resize(num_layers);
+                            }
+                        }
                     }

                     //Find the wire types which are reachable from inode and record them and
                     //the cost to reach them
-                    dijkstra_flood_to_wires(itile, node_id, src_opin_delays);
+                    dijkstra_flood_to_wires(itile,
+                                            node_id,
+                                            src_opin_delays,
+                                            src_opin_inter_layer_delays,
+                                            is_multi_layer);

                     if (src_opin_delays[layer_num][itile][ptc].empty()) {
                         VTR_LOGV_DEBUG(f_router_debug, "Found no reachable wires from %s (%s) at (%d,%d)\n",
@@ -383,7 +406,7 @@ t_src_opin_delays compute_router_src_opin_lookahead(bool is_flat) {
         }
     }

-    return src_opin_delays;
+    return std::make_pair(src_opin_delays, src_opin_inter_layer_delays);
 }

 t_chan_ipins_delays compute_router_chan_ipin_lookahead() {
@@ -466,7 +489,11 @@ t_ipin_primitive_sink_delays compute_intra_tile_dijkstra(const RRGraphView& rr_g

 } // namespace util

-static void dijkstra_flood_to_wires(int itile, RRNodeId node, util::t_src_opin_delays& src_opin_delays) {
+static void dijkstra_flood_to_wires(int itile,
+                                    RRNodeId node,
+                                    util::t_src_opin_delays& src_opin_delays,
+                                    util::t_src_opin_inter_layer_delays& src_opin_inter_layer_delays,
+                                    bool is_multi_layer) {
     auto& device_ctx = g_vpr_ctx.device();
     const auto& rr_graph = device_ctx.rr_graph;

@@ -516,6 +543,7 @@ static void dijkstra_flood_to_wires(int itile, RRNodeId node, util::t_src_opin_d
         pq.pop();

         e_rr_type curr_rr_type = rr_graph.node_type(curr.node);
+        int curr_layer_num = rr_graph.node_layer(curr.node);
         if (curr_rr_type == CHANX || curr_rr_type == CHANY || curr_rr_type == SINK) {
             //We stop expansion at any CHANX/CHANY/SINK
             int seg_index;
@@ -535,12 +563,20 @@ static void dijkstra_flood_to_wires(int itile, RRNodeId node, util::t_src_opin_d
             }

             //Keep costs of the best path to reach each wire type
-            if (!src_opin_delays[node_layer_num][itile][ptc].count(seg_index)
-                || curr.delay < src_opin_delays[node_layer_num][itile][ptc][seg_index].delay) {
+            if ((!src_opin_delays[node_layer_num][itile][ptc].count(seg_index)
+                 || curr.delay < src_opin_delays[node_layer_num][itile][ptc][seg_index].delay)
+                && curr_layer_num == node_layer_num) {
                 src_opin_delays[node_layer_num][itile][ptc][seg_index].wire_rr_type = curr_rr_type;
                 src_opin_delays[node_layer_num][itile][ptc][seg_index].wire_seg_index = seg_index;
                 src_opin_delays[node_layer_num][itile][ptc][seg_index].delay = curr.delay;
                 src_opin_delays[node_layer_num][itile][ptc][seg_index].congestion = curr.congestion;
+            } else if (is_multi_layer && (!src_opin_inter_layer_delays[node_layer_num][itile][ptc][curr_layer_num].count(seg_index) || curr.delay < src_opin_inter_layer_delays[node_layer_num][itile][ptc][curr_layer_num][seg_index].delay)
+                       && curr_layer_num != node_layer_num) {
+                // Store a CHANX/Y node or a SINK node on another layer that is reachable by the current node.
+                src_opin_inter_layer_delays[node_layer_num][itile][ptc][curr_layer_num][seg_index].wire_rr_type = curr_rr_type;
+                src_opin_inter_layer_delays[node_layer_num][itile][ptc][curr_layer_num][seg_index].wire_seg_index = seg_index;
+                src_opin_inter_layer_delays[node_layer_num][itile][ptc][curr_layer_num][seg_index].delay = curr.delay;
+                src_opin_inter_layer_delays[node_layer_num][itile][ptc][curr_layer_num][seg_index].congestion = curr.congestion;
             }

         } else if (curr_rr_type == SOURCE || curr_rr_type == OPIN || curr_rr_type == IPIN) {
@@ -564,11 +600,6 @@ static void dijkstra_flood_to_wires(int itile, RRNodeId node, util::t_src_opin_d
             continue;
         }

-        if (rr_graph.node_layer(curr.node) != node_layer_num) {
-            //Don't change the layer
-            continue;
-        }
-
         t_pq_entry next;
         next.congestion = curr.congestion + incr_cong; //Of current node
         next.delay = curr.delay + incr_delay;          //To reach next node
diff --git a/vpr/src/route/router_lookahead_map_utils.h b/vpr/src/route/router_lookahead_map_utils.h
index 24f02df20f8..0245208fdf7 100644
--- a/vpr/src/route/router_lookahead_map_utils.h
+++ b/vpr/src/route/router_lookahead_map_utils.h
@@ -271,8 +271,11 @@ struct t_reachable_wire_inf {
 // SOURCE/OPIN of a given tile type.
 //
 // When querying this data structure, the minimum cost is computed for each delay/congestion pair, and returned
-// as the lookahead expected cost.
+// as the lookahead expected cost. [opin/src layer_num][tile_index][opin/src ptc_number] -> (delay, congestion) pair
 typedef std::vector>>> t_src_opin_delays;

+// Stores the wire segments on to_layer_num reachable from a given SOURCE/OPIN
+// [from_layer_num][tile_index][from opin/src ptc num][to_layer_num] -> (delay, congestion) pair
+typedef std::vector>>>> t_src_opin_inter_layer_delays;
+
 //[from pin ptc num][target sink ptc num]->cost
 typedef std::vector> t_ipin_primitive_sink_delays;

@@ -288,7 +291,13 @@ typedef std::vector> t_ipin_primitive_sink_d
 // and the tile's IPIN. If there are many connections to the same IPIN, the one with the minimum delay is selected.
 typedef std::vector>> t_chan_ipins_delays;

-t_src_opin_delays compute_router_src_opin_lookahead(bool is_flat);
+/**
+ * @brief For each tile, iterate over its OPINs and store which segment types are accessible from each OPIN
+ * @param is_flat
+ * @return (segments accessible on the same layer, segments accessible on other layers)
+ */
+std::pair<t_src_opin_delays, t_src_opin_inter_layer_delays> compute_router_src_opin_lookahead(bool is_flat);
+
 t_chan_ipins_delays compute_router_chan_ipin_lookahead();

 t_ipin_primitive_sink_delays compute_intra_tile_dijkstra(const RRGraphView& rr_graph,
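An indexing sketch for the two tables above; the inter-layer table simply adds the destination layer as one more dimension (the index values here are illustrative):

    int from_layer = 0, to_layer = 1; // assumed two-die device
    int tile_index = 3, ptc = 7;      // assumed valid tile type / pin indices

    const auto& same_layer_wires = src_opin_delays[from_layer][tile_index][ptc];
    const auto& cross_layer_wires = src_opin_inter_layer_delays[from_layer][tile_index][ptc][to_layer];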
diff --git a/vpr/src/route/rr_graph.cpp b/vpr/src/route/rr_graph.cpp
index 0b0bb8f90ab..ee879fea226 100644
--- a/vpr/src/route/rr_graph.cpp
+++ b/vpr/src/route/rr_graph.cpp
@@ -673,46 +673,47 @@ void create_rr_graph(const t_graph_type graph_type,
     bool echo_enabled = getEchoEnabled() && isEchoFileEnabled(E_ECHO_RR_GRAPH_INDEXED_DATA);
     const char* echo_file_name = getEchoFileName(E_ECHO_RR_GRAPH_INDEXED_DATA);
     bool load_rr_graph = !det_routing_arch->read_rr_graph_filename.empty();
-    if (load_rr_graph) {
-        if (device_ctx.read_rr_graph_filename != det_routing_arch->read_rr_graph_filename) {
-            free_rr_graph();
-            load_rr_file(&mutable_device_ctx.rr_graph_builder,
-                         &mutable_device_ctx.rr_graph,
-                         device_ctx.physical_tile_types,
-                         segment_inf,
-                         &mutable_device_ctx.rr_indexed_data,
-                         &mutable_device_ctx.rr_rc_data,
-                         grid,
-                         device_ctx.arch_switch_inf,
-                         graph_type,
-                         device_ctx.arch,
-                         &mutable_device_ctx.chan_width,
-                         router_opts.base_cost_type,
-                         device_ctx.virtual_clock_network_root_idx,
-                         &det_routing_arch->wire_to_rr_ipin_switch,
-                         &det_routing_arch->wire_to_arch_ipin_switch_between_dice,
-                         det_routing_arch->read_rr_graph_filename.c_str(),
-                         &det_routing_arch->read_rr_graph_filename,
-                         router_opts.read_rr_edge_metadata,
-                         router_opts.do_check_rr_graph,
-                         echo_enabled,
-                         echo_file_name,
-                         is_flat);
-            if (router_opts.reorder_rr_graph_nodes_algorithm != DONT_REORDER) {
-                mutable_device_ctx.rr_graph_builder.reorder_nodes(router_opts.reorder_rr_graph_nodes_algorithm,
-                                                                  router_opts.reorder_rr_graph_nodes_threshold,
-                                                                  router_opts.reorder_rr_graph_nodes_seed);
-            }
+    if (channel_widths_unchanged(device_ctx.chan_width, nodes_per_chan) && !device_ctx.rr_graph.empty()) {
+        //No change in channel width, so skip re-building RR graph
+        if (is_flat && !device_ctx.rr_graph_is_flat) {
+            VTR_LOG("RR graph channel widths unchanged, intra-cluster resources should be added...\n");
+        } else {
+            VTR_LOG("RR graph channel widths unchanged, skipping RR graph rebuild\n");
+            return;
         }
     } else {
-        if (channel_widths_unchanged(device_ctx.chan_width, nodes_per_chan) && !device_ctx.rr_graph.empty()) {
-            //No change in channel width, so skip re-building RR graph
-            if (is_flat && !device_ctx.rr_graph_is_flat) {
-                VTR_LOG("RR graph channel widths unchanged, intra-cluster resources should be added...\n");
-            } else {
-                VTR_LOG("RR graph channel widths unchanged, skipping RR graph rebuild\n");
-                return;
+        if (load_rr_graph) {
+            if (device_ctx.read_rr_graph_filename != det_routing_arch->read_rr_graph_filename) {
+                free_rr_graph();
+
+                load_rr_file(&mutable_device_ctx.rr_graph_builder,
+                             &mutable_device_ctx.rr_graph,
+                             device_ctx.physical_tile_types,
+                             segment_inf,
+                             &mutable_device_ctx.rr_indexed_data,
+                             &mutable_device_ctx.rr_rc_data,
+                             grid,
+                             device_ctx.arch_switch_inf,
+                             graph_type,
+                             device_ctx.arch,
+                             &mutable_device_ctx.chan_width,
+                             router_opts.base_cost_type,
+                             device_ctx.virtual_clock_network_root_idx,
+                             &det_routing_arch->wire_to_rr_ipin_switch,
+                             &det_routing_arch->wire_to_arch_ipin_switch_between_dice,
+                             det_routing_arch->read_rr_graph_filename.c_str(),
+                             &det_routing_arch->read_rr_graph_filename,
+                             router_opts.read_rr_edge_metadata,
+                             router_opts.do_check_rr_graph,
+                             echo_enabled,
+                             echo_file_name,
+                             is_flat);
+                if (router_opts.reorder_rr_graph_nodes_algorithm != DONT_REORDER) {
+                    mutable_device_ctx.rr_graph_builder.reorder_nodes(router_opts.reorder_rr_graph_nodes_algorithm,
+                                                                      router_opts.reorder_rr_graph_nodes_threshold,
+                                                                      router_opts.reorder_rr_graph_nodes_seed);
+                }
             }
         } else {
             free_rr_graph();
diff --git a/vpr/src/route/rr_graph_area.cpp b/vpr/src/route/rr_graph_area.cpp
index 996723ad11e..3ac736eebd8 100644
--- a/vpr/src/route/rr_graph_area.cpp
+++ b/vpr/src/route/rr_graph_area.cpp
@@ -492,7 +492,7 @@ void count_unidir_routing_transistors(std::vector& /*segment_inf*

     VTR_LOG("\n");
     VTR_LOG("Routing area (in minimum width transistor areas)...\n");
-    VTR_LOG("\tTotal routing area: %#g, per logic tile: %#g\n", ntrans, ntrans / (float)(device_ctx.grid.width() * device_ctx.grid.height()));
+    VTR_LOG("\tTotal routing area: %#g, per logic tile: %#g\n", ntrans, ntrans / (float)(device_ctx.grid.get_num_layers() * device_ctx.grid.width() * device_ctx.grid.height()));
 }

 static float get_cblock_trans(int* num_inputs_to_cblock, int wire_to_ipin_switch, int max_inputs_to_cblock, float trans_sram_bit) {
diff --git a/vpr/src/util/vpr_utils.cpp b/vpr/src/util/vpr_utils.cpp
index 6157c9b980d..b200a06ba7f 100644
--- a/vpr/src/util/vpr_utils.cpp
+++ b/vpr/src/util/vpr_utils.cpp
@@ -2508,3 +2508,25 @@ void add_pb_child_to_list(std::list& pb_list, const t_pb* parent_pb
         }
     }
 }
+
+float get_min_cross_layer_delay(const std::vector& arch_switch_inf,
+                                const std::vector& segment_inf,
+                                const int wire_to_ipin_arch_sw_id) {
+    float min_delay = std::numeric_limits<float>::max();
+
+    // Check whether the inter-layer switch type for the connection block is defined. If it is,
+    // get its delay.
+    if (wire_to_ipin_arch_sw_id != OPEN) {
+        min_delay = arch_switch_inf[wire_to_ipin_arch_sw_id].Tdel();
+    }
+
+    // Iterate over the inter-layer switch types of segments to find the minimum delay
+    for (const auto& seg_inf : segment_inf) {
+        int cross_layer_sw_arch_id = seg_inf.arch_opin_between_dice_switch;
+        if (cross_layer_sw_arch_id != OPEN) {
+            min_delay = std::min(min_delay, arch_switch_inf[cross_layer_sw_arch_id].Tdel());
+        }
+    }
+
+    return min_delay;
+}
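A usage sketch for get_min_cross_layer_delay(); the argument names mirror the signature above, and the surrounding context (loaded architecture data) is assumed:

    float min_cross_die_delay = get_min_cross_layer_delay(device_ctx.arch_switch_inf,
                                                          segment_inf,
                                                          wire_to_ipin_arch_sw_id);
    // If the architecture defines no inter-layer switch at all, the
    // std::numeric_limits<float>::max() initial value comes back unchanged.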
diff --git a/vpr/src/util/vpr_utils.h b/vpr/src/util/vpr_utils.h
index 1ba3dcb35b7..75842967cd1 100644
--- a/vpr/src/util/vpr_utils.h
+++ b/vpr/src/util/vpr_utils.h
@@ -311,4 +311,16 @@ t_arch_switch_inf create_internal_arch_sw(float delay);

 void add_pb_child_to_list(std::list& pb_list, const t_pb* parent_pb);

+/**
+ * @brief Iterates over all inter-layer switch types and returns the minimum delay among them.
+ * Useful for the router lookahead to have some estimate of the cost of crossing a layer.
+ * @param arch_switch_inf
+ * @param segment_inf
+ * @param wire_to_ipin_arch_sw_id
+ * @return
+ */
+float get_min_cross_layer_delay(const std::vector& arch_switch_inf,
+                                const std::vector& segment_inf,
+                                const int wire_to_ipin_arch_sw_id);
+
 #endif
diff --git a/vpr/test/test_connection_router.cpp b/vpr/test/test_connection_router.cpp
index 6c1b54734e3..0fef4f22a84 100644
--- a/vpr/test/test_connection_router.cpp
+++ b/vpr/test/test_connection_router.cpp
@@ -34,6 +34,8 @@ static float do_one_route(RRNodeId source_node,
     bounding_box.xmax = device_ctx.grid.width() + 1;
     bounding_box.ymin = 0;
     bounding_box.ymax = device_ctx.grid.height() + 1;
+    bounding_box.layer_min = 0;
+    bounding_box.layer_max = device_ctx.grid.get_num_layers() - 1;

     t_conn_cost_params cost_params;
     cost_params.criticality = router_opts.max_criticality;
diff --git a/vpr/test/test_place_delay_model_serdes.cpp b/vpr/test/test_place_delay_model_serdes.cpp
index 818b5cc3dfe..988b3e255b4 100644
--- a/vpr/test/test_place_delay_model_serdes.cpp
+++ b/vpr/test/test_place_delay_model_serdes.cpp
@@ -23,12 +23,17 @@ TEST_CASE("round_trip_delta_delay_model", "[vpr]") {
         }
     }

-    DeltaDelayModel model(std::move(delays), false);
+    float min_cross_layer_delay = 0.;
+
+    DeltaDelayModel model(min_cross_layer_delay,
+                          std::move(delays),
+                          false);
     const auto& delays1 = model.delays();

     model.write(kDeltaDelayBin);

-    DeltaDelayModel model2(false);
+    DeltaDelayModel model2(min_cross_layer_delay,
+                           false);
     model2.read(kDeltaDelayBin);
     const auto& delays2 = model2.delays();

@@ -61,15 +66,19 @@ TEST_CASE("round_trip_override_delay_model", "[vpr]") {
             }
         }
     }
-    OverrideDelayModel model(false);
-    auto base_model = std::make_unique(delays, false);
+    float min_cross_layer_delay = 0.;
+    OverrideDelayModel model(min_cross_layer_delay, false);
+    auto base_model = std::make_unique<DeltaDelayModel>(min_cross_layer_delay,
+                                                        delays,
+                                                        false);
     model.set_base_delay_model(std::move(base_model));

     model.set_delay_override(1, 2, 3, 4, 5, 6, -1);
     model.set_delay_override(2, 2, 3, 4, 5, 6, -2);

     model.write(kOverrideDelayBin);

-    OverrideDelayModel model2(false);
+    OverrideDelayModel model2(min_cross_layer_delay,
+                              false);
     model2.read(kOverrideDelayBin);

     const auto& delays1 = model.base_delay_model()->delays();
diff --git a/vpr/test/test_vpr_constraints.cpp b/vpr/test/test_vpr_constraints.cpp
index 30772950e19..f9a5d7e5bd4 100644
--- a/vpr/test/test_vpr_constraints.cpp
+++ b/vpr/test/test_vpr_constraints.cpp
@@ -441,7 +441,7 @@ TEST_CASE("PartRegionIntersect6", "[vpr]") {
 TEST_CASE("MacroConstraints", "[vpr]") {
     t_pl_macro pl_macro;
     PartitionRegion head_pr;
-    t_pl_offset offset(2, 1, 0);
+    t_pl_offset offset(2, 1, 0, 0);

     Region reg;
     reg.set_region_rect({5, 2, 9, 6, 0});
diff --git a/vtr_flow/arch/multi_die/aman_3d_coffe.xml b/vtr_flow/arch/multi_die/aman_3d_coffe.xml
new file mode 100644
index 00000000000..1825d967a18
--- /dev/null
+++ b/vtr_flow/arch/multi_die/aman_3d_coffe.xml
@@ -0,0 +1,1597 @@
[1597-line 3D (multi-die) architecture file; the XML markup did not survive this capture. The recoverable text shows io, clb, dsp_top, and memory block pin listings plus switch-block connectivity patterns.]
diff --git a/vtr_flow/arch/multi_die/aman_3d_limited.xml b/vtr_flow/arch/multi_die/aman_3d_limited.xml
new file mode 100644
index 00000000000..7d67b2c996c
--- /dev/null
+++ b/vtr_flow/arch/multi_die/aman_3d_limited.xml
@@ -0,0 +1,1579 @@
[1579-line 3D (multi-die) architecture file variant; the XML markup did not survive this capture. The recoverable text shows io, clb, dsp_top, and memory block pin listings plus switch-block connectivity patterns.]
diff --git a/vtr_flow/parse/parse_config/common/vpr.place.txt b/vtr_flow/parse/parse_config/common/vpr.place.txt
index 8713e8fe51f..64da113842b 100644
--- a/vtr_flow/parse/parse_config/common/vpr.place.txt
+++ b/vtr_flow/parse/parse_config/common/vpr.place.txt
@@ -1,6 +1,13 @@
 #VPR Place Metrics
 placed_wirelength_est;vpr.out;BB estimate of min-dist \(placement\) wire length: (\d+)

+#VPR placement swap metrics
+total_swap;vpr.out;Swaps called\s*:\s*(\d+)
+accepted_swap;vpr.out;\s*Swaps accepted\s*:\s*(\d+).*
+rejected_swap;vpr.out;\s*Swaps rejected\s*:\s*(\d+).*
+aborted_swap;vpr.out;\s*Swaps aborted\s*:\s*(\d+).*
+
+
 #VPR Run-time Metrics
 place_mem;vpr.out;.*Placement took.*\(max_rss (.*), .*\)
 place_time;vpr.out;\s*Placement took (.*) seconds
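The new parse entries are plain regular expressions matched against vpr.out. A self-contained sketch of how the total_swap pattern captures its metric; the exact log line text is inferred from the regex itself, not quoted from VPR output:

    #include <cassert>
    #include <regex>
    #include <string>

    int main() {
        std::string line = "Swaps called                 : 12345";
        std::regex total_swap(R"(Swaps called\s*:\s*(\d+))");
        std::smatch m;
        assert(std::regex_search(line, m, total_swap));
        // m[1].str() == "12345" becomes the total_swap metric.
        return 0;
    }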