diff --git a/vpr/src/place/centroid_move_generator.cpp b/vpr/src/place/centroid_move_generator.cpp
index b10d93a661f..a1b79b92f7a 100644
--- a/vpr/src/place/centroid_move_generator.cpp
+++ b/vpr/src/place/centroid_move_generator.cpp
@@ -76,9 +76,9 @@ e_create_move CentroidMoveGenerator::propose_move(t_pl_blocks_to_be_moved& block
     /* Calculate the centroid location*/
     calculate_centroid_loc(b_from, false, centroid, nullptr, noc_attraction_enabled_, noc_attraction_w_);
 
-    // Centroid location is not necessarily a valid location, and the downstream location expect a valid
-    // layer for "to" location. So if the layer is not valid, we set it to the same layer as from loc.
-    to.layer = (centroid.layer < 0) ? from.layer : centroid.layer;
+    // The centroid location is not necessarily a valid location, but find_to_loc_centroid() below expects
+    // a valid layer for the centroid location. So if the layer is not valid, we set it to the same layer as from loc.
+    centroid.layer = (centroid.layer < 0) ? from.layer : centroid.layer;
     /* Find a location near the weighted centroid_loc */
     if (!find_to_loc_centroid(cluster_from_type, from, centroid, range_limiters, to, b_from)) {
         return e_create_move::ABORT;
diff --git a/vpr/src/place/median_move_generator.cpp b/vpr/src/place/median_move_generator.cpp
index 7e31e237bb7..20ef207221d 100644
--- a/vpr/src/place/median_move_generator.cpp
+++ b/vpr/src/place/median_move_generator.cpp
@@ -5,7 +5,14 @@
 #include "placer_globals.h"
 #include "move_utils.h"
 
-static bool get_bb_incrementally(ClusterNetId net_id, t_bb& bb_coord_new, int xold, int yold, int xnew, int ynew);
+static bool get_bb_incrementally(ClusterNetId net_id,
+                                 t_bb& bb_coord_new,
+                                 int xold,
+                                 int yold,
+                                 int layer_old,
+                                 int xnew,
+                                 int ynew,
+                                 int layer_new);
 
 static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb& bb_coord_new, ClusterBlockId block_id, bool& skip_net);
 
@@ -29,7 +36,7 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_
     auto& place_move_ctx = g_placer_ctx.mutable_move();
 
     const int num_layers = device_ctx.grid.get_num_layers();
-    bool is_multi_layer = (num_layers > 1);
+
 
     t_pl_loc from = place_ctx.block_locs[b_from].loc;
     int from_layer = from.layer;
@@ -43,12 +50,13 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_
     t_bb coords(OPEN, OPEN, OPEN, OPEN, OPEN, OPEN);
     t_bb limit_coords;
     ClusterBlockId bnum;
-    int pnum, xnew, xold, ynew, yold;
+    int pnum, xnew, xold, ynew, yold, layer_new, layer_old;
 
     //clear the vectors that saves X & Y coords
     //reused to save allocation time
     place_move_ctx.X_coord.clear();
     place_move_ctx.Y_coord.clear();
+    place_move_ctx.layer_coord.clear();
     std::vector<int> layer_blk_cnt(num_layers, 0);
 
     //true if the net is a feedback from the block to itself
@@ -84,8 +92,11 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_
             VTR_ASSERT(pnum >= 0);
             xold = place_ctx.block_locs[bnum].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum];
             yold = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum];
+            layer_old = place_ctx.block_locs[bnum].loc.layer;
+
             xold = std::max(std::min(xold, (int)device_ctx.grid.width() - 2), 1);  //-2 for no perim channels
             yold = std::max(std::min(yold, (int)device_ctx.grid.height() - 2), 1); //-2 for no perim channels
+            layer_old = std::max(std::min(layer_old, (int)device_ctx.grid.get_num_layers() - 1), 0);
 
             //To calulate the bb incrementally while excluding the moving block
             //assume that the moving block is moved to a non-critical coord of the bb
@@ -101,7 +112,23 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_
                 ynew = net_bb_coords.ymin;
             }
 
-            if (!get_bb_incrementally(net_id, coords, xold, yold, xnew, ynew)) {
+            if (net_bb_coords.layer_min == layer_old) {
+                layer_new = net_bb_coords.layer_max;
+            } else {
+                layer_new = net_bb_coords.layer_min;
+            }
+            
+            // If the moving block is on the border of the bounding box, we cannot get
+            // the bounding box incrementally. In that case, the bounding box should be
+            // calculated from scratch.
+            if (!get_bb_incrementally(net_id,
+                                      coords,
+                                      xold,
+                                      yold,
+                                      layer_old,
+                                      xnew,
+                                      ynew,
+                                      layer_new)) {
                 get_bb_from_scratch_excluding_block(net_id, coords, b_from, skip_net);
                 if (skip_net)
                     continue;
@@ -112,27 +139,19 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_
         place_move_ctx.X_coord.push_back(coords.xmax);
         place_move_ctx.Y_coord.push_back(coords.ymin);
         place_move_ctx.Y_coord.push_back(coords.ymax);
-        if (is_multi_layer) {
-            for (int layer_num = 0; layer_num < num_layers; layer_num++) {
-                layer_blk_cnt[layer_num] += place_move_ctx.num_sink_pin_layer[size_t(net_id)][layer_num];
-            }
-            // If the pin under consideration is of type sink, it shouldn't be added to layer_blk_cnt since the block
-            // is moving
-            if (cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::SINK) {
-                VTR_ASSERT_SAFE(layer_blk_cnt[from_layer] > 0);
-                layer_blk_cnt[from_layer]--;
-            }
-        }
+        place_move_ctx.layer_coord.push_back(coords.layer_min);
+        place_move_ctx.layer_coord.push_back(coords.layer_max);
     }
 
-    if ((place_move_ctx.X_coord.empty()) || (place_move_ctx.Y_coord.empty())) {
-        VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\tMove aborted - X_coord and y_coord are empty\n");
+    if ((place_move_ctx.X_coord.empty()) || (place_move_ctx.Y_coord.empty()) || (place_move_ctx.layer_coord.empty())) {
+        VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\tMove aborted - X_coord or y_coord or layer_coord are empty\n");
         return e_create_move::ABORT;
     }
 
     //calculate the median region
     std::stable_sort(place_move_ctx.X_coord.begin(), place_move_ctx.X_coord.end());
     std::stable_sort(place_move_ctx.Y_coord.begin(), place_move_ctx.Y_coord.end());
+    std::stable_sort(place_move_ctx.layer_coord.begin(), place_move_ctx.layer_coord.end());
 
     limit_coords.xmin = place_move_ctx.X_coord[floor((place_move_ctx.X_coord.size() - 1) / 2)];
     limit_coords.xmax = place_move_ctx.X_coord[floor((place_move_ctx.X_coord.size() - 1) / 2) + 1];
@@ -140,6 +159,9 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_
     limit_coords.ymin = place_move_ctx.Y_coord[floor((place_move_ctx.Y_coord.size() - 1) / 2)];
     limit_coords.ymax = place_move_ctx.Y_coord[floor((place_move_ctx.Y_coord.size() - 1) / 2) + 1];
 
+    limit_coords.layer_min = place_move_ctx.layer_coord[floor((place_move_ctx.layer_coord.size() - 1) / 2)];
+    limit_coords.layer_max = place_move_ctx.layer_coord[floor((place_move_ctx.layer_coord.size() - 1) / 2) + 1];
+
     //arrange the different range limiters
     t_range_limiters range_limiters{rlim,
                                     place_move_ctx.first_rlim,
@@ -149,17 +171,8 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_
     t_pl_loc median_point;
     median_point.x = (limit_coords.xmin + limit_coords.xmax) / 2;
     median_point.y = (limit_coords.ymin + limit_coords.ymax) / 2;
+    median_point.layer = (limit_coords.layer_min + limit_coords.layer_max) / 2;
 
-    // Before calling find_to_loc_centroid a valid layer should be assigned to "to" location. If there are multiple layers, the layer
-    // with highest number of sinks will be used. Otherwise, the same layer as "from" loc is assigned.
-    if (is_multi_layer) {
-        int layer_num = std::distance(layer_blk_cnt.begin(), std::max_element(layer_blk_cnt.begin(), layer_blk_cnt.end()));
-        median_point.layer = layer_num;
-        to.layer = layer_num;
-    } else {
-        median_point.layer = from.layer;
-        to.layer = from.layer;
-    }
     if (!find_to_loc_centroid(cluster_from_type, from, median_point, range_limiters, to, b_from)) {
         return e_create_move::ABORT;
     }
@@ -194,6 +207,9 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb& bb_co
     int ymin = OPEN;
     int ymax = OPEN;
 
+    int layer_min = OPEN;
+    int layer_max = OPEN;
+
     int pnum;
 
     auto& cluster_ctx = g_vpr_ctx.clustering();
@@ -208,11 +224,14 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb& bb_co
         pnum = net_pin_to_tile_pin_index(net_id, 0);
         int src_x = place_ctx.block_locs[bnum].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum];
         int src_y = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum];
+        int src_layer = place_ctx.block_locs[bnum].loc.layer;
 
         xmin = src_x;
         ymin = src_y;
         xmax = src_x;
         ymax = src_y;
+        layer_min = src_layer;
+        layer_max = src_layer;
         first_block = true;
     }
 
@@ -225,12 +244,15 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb& bb_co
         const auto& block_loc = place_ctx.block_locs[bnum].loc;
         int x = block_loc.x + physical_tile_type(bnum)->pin_width_offset[pnum];
         int y = block_loc.y + physical_tile_type(bnum)->pin_height_offset[pnum];
+        int layer = block_loc.layer;
 
         if (!first_block) {
             xmin = x;
             ymin = y;
             xmax = x;
             ymax = y;
+            layer_max = layer;
+            layer_min = layer;
             first_block = true;
             continue;
         }
@@ -245,6 +267,12 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb& bb_co
         } else if (y > ymax) {
             ymax = y;
         }
+
+        if (layer < layer_min) {
+            layer_min = layer;
+        } else if (layer > layer_max) {
+            layer_max = layer;
+        }
     }
 
     /* Now I've found the coordinates of the bounding box.  There are no *
@@ -256,8 +284,10 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb& bb_co
      * is 0).  See route_common.cpp for a channel diagram.               */
     bb_coord_new.xmin = std::max(std::min<int>(xmin, device_ctx.grid.width() - 2), 1);  //-2 for no perim channels
     bb_coord_new.ymin = std::max(std::min<int>(ymin, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
+    bb_coord_new.layer_min = std::max(std::min<int>(layer_min, device_ctx.grid.get_num_layers() - 1), 0);
     bb_coord_new.xmax = std::max(std::min<int>(xmax, device_ctx.grid.width() - 2), 1);  //-2 for no perim channels
     bb_coord_new.ymax = std::max(std::min<int>(ymax, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
+    bb_coord_new.layer_max = std::max(std::min<int>(layer_max, device_ctx.grid.get_num_layers() - 1), 0);
 }
 
 /*
@@ -273,7 +303,14 @@ static void get_bb_from_scratch_excluding_block(ClusterNetId net_id, t_bb& bb_co
  * the pins always lie on the outside of the bounding box.            *
  * The x and y coordinates are the pin's x and y coordinates.         */
 /* IO blocks are considered to be one cell in for simplicity.         */
-static bool get_bb_incrementally(ClusterNetId net_id, t_bb& bb_coord_new, int xold, int yold, int xnew, int ynew) {
+static bool get_bb_incrementally(ClusterNetId net_id,
+                                 t_bb& bb_coord_new,
+                                 int xold,
+                                 int yold,
+                                 int layer_old,
+                                 int xnew,
+                                 int ynew,
+                                 int layer_new) {
     //TODO: account for multiple physical pin instances per logical pin
 
     auto& device_ctx = g_vpr_ctx.device();
@@ -281,12 +318,21 @@ static bool get_bb_incrementally(ClusterNetId net_id, t_bb& bb_coord_new, int xo
 
     xnew = std::max(std::min<int>(xnew, device_ctx.grid.width() - 2), 1);  //-2 for no perim channels
     ynew = std::max(std::min<int>(ynew, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
+    layer_new = std::max(std::min<int>(layer_new, device_ctx.grid.get_num_layers() - 1), 0);
+
     xold = std::max(std::min<int>(xold, device_ctx.grid.width() - 2), 1);  //-2 for no perim channels
     yold = std::max(std::min<int>(yold, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
+    layer_old = std::max(std::min<int>(layer_old, device_ctx.grid.get_num_layers() - 1), 0);
 
     t_bb union_bb_edge;
     t_bb union_bb;
     const bool& cube_bb = g_vpr_ctx.placement().cube_bb;
+    /* Calculating per-layer bounding box is more time consuming compared to cube bounding box. To speed up
+     * this move, the bounding box used for this move is of the type cube bounding box even if the per-layer
+     * bounding box is used by placement SA engine.
+     * If per-layer bounding box is used, we take a union of bounding boxes on each layer to make a cube bounding box.
+     * For example, the xmax of this cube bounding box is determined by the maximum x coordinate across all blocks on all layers.
+     */
     if (!cube_bb) {
         std::tie(union_bb_edge, union_bb) = union_2d_bb_incr(place_move_ctx.layer_bb_num_on_edges[net_id],
                                                              place_move_ctx.layer_bb_coords[net_id]);
@@ -410,5 +456,47 @@ static bool get_bb_incrementally(ClusterNetId net_id, t_bb& bb_coord_new, int xo
         bb_coord_new.ymin = curr_bb_coord.ymin;
         bb_coord_new.ymax = curr_bb_coord.ymax;
     }
+
+    if (layer_new < layer_old) {
+        if (layer_old == curr_bb_coord.layer_max) {
+            if (curr_bb_edge.layer_max == 1) {
+                return false;
+            } else {
+                bb_coord_new.layer_max = curr_bb_coord.layer_max;
+            }
+        } else {
+            bb_coord_new.layer_max = curr_bb_coord.layer_max;
+        }
+
+        if (layer_new < curr_bb_coord.layer_min) {
+            bb_coord_new.layer_min = layer_new;
+        } else if (layer_new == curr_bb_coord.layer_min) {
+            bb_coord_new.layer_min = layer_new;
+        } else {
+            bb_coord_new.layer_min = curr_bb_coord.layer_min;
+        }
+
+    } else if (layer_new > layer_old) {
+        if (layer_old == curr_bb_coord.layer_min) {
+            if (curr_bb_edge.layer_min == 1) {
+                return false;
+            } else {
+                bb_coord_new.layer_min = curr_bb_coord.layer_min;
+            }
+        } else {
+            bb_coord_new.layer_min = curr_bb_coord.layer_min;
+        }
+
+        if (layer_new > curr_bb_coord.layer_max) {
+            bb_coord_new.layer_max = layer_new;
+        } else if (layer_new == curr_bb_coord.layer_max) {
+            bb_coord_new.layer_max = layer_new;
+        } else {
+            bb_coord_new.layer_max = curr_bb_coord.layer_max;
+        }
+    } else {
+        bb_coord_new.layer_min = curr_bb_coord.layer_min;
+        bb_coord_new.layer_max = curr_bb_coord.layer_max;
+    }
     return true;
 }
diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp
index 431f05014dc..3ec00c26970 100644
--- a/vpr/src/place/move_utils.cpp
+++ b/vpr/src/place/move_utils.cpp
@@ -928,7 +928,7 @@ bool find_to_loc_centroid(t_logical_block_type_ptr blk_type,
                           ClusterBlockId b_from) {
     //Retrieve the compressed block grid for this block type
     const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[blk_type->index];
-    const int to_layer_num = to_loc.layer;
+    const int to_layer_num = centroid.layer;
     VTR_ASSERT(to_layer_num >= 0);
     const int num_layers = g_vpr_ctx.device().grid.get_num_layers();
 
diff --git a/vpr/src/place/move_utils.h b/vpr/src/place/move_utils.h
index 965dc55f53d..24bdfcb7ebc 100644
--- a/vpr/src/place/move_utils.h
+++ b/vpr/src/place/move_utils.h
@@ -70,6 +70,8 @@ struct t_bb_cost {
     t_edge_cost xmax = {0, 0.0};
     t_edge_cost ymin = {0, 0.0};
     t_edge_cost ymax = {0, 0.0};
+    t_edge_cost layer_min = {0, 0.};
+    t_edge_cost layer_max = {0, 0.};
 };
 
 /**
diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp
index 9fc11b22ee5..281fd174820 100644
--- a/vpr/src/place/place.cpp
+++ b/vpr/src/place/place.cpp
@@ -930,6 +930,7 @@ void try_place(const Netlist<>& net_list,
     //allocate helper vectors that are used by many move generators
     place_move_ctx.X_coord.resize(10, 0);
     place_move_ctx.Y_coord.resize(10, 0);
+    place_move_ctx.layer_coord.resize(10, 0);
 
     //allocate move type statistics vectors
     MoveTypeStat move_type_stat;
@@ -2817,8 +2818,8 @@ static void get_bb_from_scratch(ClusterNetId net_id,
                                 t_bb& coords,
                                 t_bb& num_on_edges,
                                 vtr::NdMatrixProxy<int, 1> num_sink_pin_layer) {
-    int pnum, x, y, pin_layer, xmin, xmax, ymin, ymax;
-    int xmin_edge, xmax_edge, ymin_edge, ymax_edge;
+    int pnum, x, y, pin_layer, xmin, xmax, ymin, ymax, layer_min, layer_max;
+    int xmin_edge, xmax_edge, ymin_edge, ymax_edge, layer_min_edge, layer_max_edge;
 
     auto& cluster_ctx = g_vpr_ctx.clustering();
     auto& place_ctx = g_vpr_ctx.placement();
@@ -2832,18 +2833,25 @@ static void get_bb_from_scratch(ClusterNetId net_id,
         + physical_tile_type(bnum)->pin_width_offset[pnum];
     y = place_ctx.block_locs[bnum].loc.y
         + physical_tile_type(bnum)->pin_height_offset[pnum];
+    pin_layer = place_ctx.block_locs[bnum].loc.layer;
 
     x = max(min<int>(x, grid.width() - 2), 1);
     y = max(min<int>(y, grid.height() - 2), 1);
+    pin_layer = max(min<int>(pin_layer, grid.get_num_layers() - 1), 0);
 
     xmin = x;
     ymin = y;
+    layer_min = pin_layer;
     xmax = x;
     ymax = y;
+    layer_max = pin_layer;
+
     xmin_edge = 1;
     ymin_edge = 1;
+    layer_min_edge = 1;
     xmax_edge = 1;
     ymax_edge = 1;
+    layer_max_edge = 1;
 
     for (int layer_num = 0; layer_num < grid.get_num_layers(); layer_num++) {
         num_sink_pin_layer[layer_num] = 0;
@@ -2867,6 +2875,7 @@ static void get_bb_from_scratch(ClusterNetId net_id,
 
         x = max(min<int>(x, grid.width() - 2), 1);  //-2 for no perim channels
         y = max(min<int>(y, grid.height() - 2), 1); //-2 for no perim channels
+        pin_layer = max(min<int>(pin_layer, grid.get_num_layers() - 1), 0);
 
         if (x == xmin) {
             xmin_edge++;
@@ -2894,6 +2903,19 @@ static void get_bb_from_scratch(ClusterNetId net_id,
             ymax_edge = 1;
         }
 
+        if (pin_layer == layer_min) {
+            layer_min_edge++;
+        }
+        if (pin_layer == layer_max) {
+            layer_max_edge++;
+        } else if (pin_layer < layer_min) {
+            layer_min = pin_layer;
+            layer_min_edge = 1;
+        } else if (pin_layer > layer_max) {
+            layer_max = pin_layer;
+            layer_max_edge = 1;
+        }
+
         num_sink_pin_layer[pin_layer]++;
     }
 
@@ -2903,11 +2925,18 @@ static void get_bb_from_scratch(ClusterNetId net_id,
     coords.xmax = xmax;
     coords.ymin = ymin;
     coords.ymax = ymax;
+    coords.layer_min = layer_min;
+    coords.layer_max = layer_max;
+    VTR_ASSERT_DEBUG(layer_min >= 0 && layer_min < device_ctx.grid.get_num_layers());
+    VTR_ASSERT_DEBUG(layer_max >= 0 && layer_max < device_ctx.grid.get_num_layers());
+
 
     num_on_edges.xmin = xmin_edge;
     num_on_edges.xmax = xmax_edge;
     num_on_edges.ymin = ymin_edge;
     num_on_edges.ymax = ymax_edge;
+    num_on_edges.layer_min = layer_min_edge;
+    num_on_edges.layer_max = layer_max_edge;
 }
 
 /* This routine finds the bounding box of each net from scratch when the bounding box is of type per-layer (i.e.   *
@@ -3162,7 +3191,7 @@ static void get_non_updateable_bb(ClusterNetId net_id,
                                   vtr::NdMatrixProxy<int, 1> num_sink_pin_layer) {
     //TODO: account for multiple physical pin instances per logical pin
 
-    int xmax, ymax, xmin, ymin, x, y, layer;
+    int xmax, ymax, layer_max, xmin, ymin, layer_min, x, y, layer;
     int pnum;
 
     auto& cluster_ctx = g_vpr_ctx.clustering();
@@ -3176,11 +3205,14 @@ static void get_non_updateable_bb(ClusterNetId net_id,
         + physical_tile_type(bnum)->pin_width_offset[pnum];
     y = place_ctx.block_locs[bnum].loc.y
         + physical_tile_type(bnum)->pin_height_offset[pnum];
+    layer = place_ctx.block_locs[bnum].loc.layer;
 
     xmin = x;
     ymin = y;
+    layer_min = layer;
     xmax = x;
     ymax = y;
+    layer_max = layer;
 
     for (int layer_num = 0; layer_num < device_ctx.grid.get_num_layers(); layer_num++) {
         num_sink_pin_layer[layer_num] = 0;
@@ -3207,6 +3239,12 @@ static void get_non_updateable_bb(ClusterNetId net_id,
             ymax = y;
         }
 
+        if (layer < layer_min) {
+            layer_min = layer;
+        } else if (layer > layer_max) {
+            layer_max = layer;
+        }
+
         num_sink_pin_layer[layer]++;
     }
 
@@ -3220,8 +3258,10 @@ static void get_non_updateable_bb(ClusterNetId net_id,
 
     bb_coord_new.xmin = max(min<int>(xmin, device_ctx.grid.width() - 2), 1);  //-2 for no perim channels
     bb_coord_new.ymin = max(min<int>(ymin, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
+    bb_coord_new.layer_min = max(min<int>(layer_min, device_ctx.grid.get_num_layers() - 1), 0);
     bb_coord_new.xmax = max(min<int>(xmax, device_ctx.grid.width() - 2), 1);  //-2 for no perim channels
     bb_coord_new.ymax = max(min<int>(ymax, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
+    bb_coord_new.layer_max = max(min<int>(layer_max, device_ctx.grid.get_num_layers() - 1), 0);
 }
 
 static void get_non_updateable_layer_bb(ClusterNetId net_id,
@@ -3321,8 +3361,10 @@ static void update_bb(ClusterNetId net_id,
 
     pin_new_loc.x = max(min<int>(pin_new_loc.x, device_ctx.grid.width() - 2), 1);  //-2 for no perim channels
     pin_new_loc.y = max(min<int>(pin_new_loc.y, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
+    pin_new_loc.layer_num = max(min<int>(pin_new_loc.layer_num, device_ctx.grid.get_num_layers() - 1), 0);
     pin_old_loc.x = max(min<int>(pin_old_loc.x, device_ctx.grid.width() - 2), 1);  //-2 for no perim channels
     pin_old_loc.y = max(min<int>(pin_old_loc.y, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
+    pin_old_loc.layer_num = max(min<int>(pin_old_loc.layer_num, device_ctx.grid.get_num_layers() - 1), 0);
 
     /* Check if the net had been updated before. */
     if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) {
@@ -3502,6 +3544,75 @@ static void update_bb(ClusterNetId net_id,
                 num_sink_pin_layer_new[pin_new_loc.layer_num] = (curr_num_sink_pin_layer)[pin_new_loc.layer_num] + 1;
             }
         }
+
+        if (pin_new_loc.layer_num < pin_old_loc.layer_num) {
+            if (pin_old_loc.layer_num == curr_bb_coord->layer_max) {
+                if (curr_bb_edge->layer_max == 1) {
+                    get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new, num_sink_pin_layer_new);
+                    bb_updated_before[net_id] = GOT_FROM_SCRATCH;
+                    return;
+                } else {
+                    bb_edge_new.layer_max = curr_bb_edge->layer_max - 1;
+                    bb_coord_new.layer_max = curr_bb_coord->layer_max;
+                }
+            } else {
+                bb_coord_new.layer_max = curr_bb_coord->layer_max;
+                bb_edge_new.layer_max = curr_bb_edge->layer_max;
+            }
+
+
+            if (pin_new_loc.layer_num < curr_bb_coord->layer_min) {
+                bb_coord_new.layer_min = pin_new_loc.layer_num;
+                bb_edge_new.layer_min = 1;
+            } else if (pin_new_loc.layer_num == curr_bb_coord->layer_min) {
+                bb_coord_new.layer_min = pin_new_loc.layer_num;
+                bb_edge_new.layer_min = curr_bb_edge->layer_min + 1;
+            } else {
+                bb_coord_new.layer_min = curr_bb_coord->layer_min;
+                bb_edge_new.layer_min = curr_bb_edge->layer_min;
+            }
+
+        } else if (pin_new_loc.layer_num > pin_old_loc.layer_num) {
+
+
+            if (pin_old_loc.layer_num == curr_bb_coord->layer_min) {
+                if (curr_bb_edge->layer_min == 1) {
+                    get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new, num_sink_pin_layer_new);
+                    bb_updated_before[net_id] = GOT_FROM_SCRATCH;
+                    return;
+                } else {
+                    bb_edge_new.layer_min = curr_bb_edge->layer_min - 1;
+                    bb_coord_new.layer_min = curr_bb_coord->layer_min;
+                }
+            } else {
+                bb_coord_new.layer_min = curr_bb_coord->layer_min;
+                bb_edge_new.layer_min = curr_bb_edge->layer_min;
+            }
+
+            if (pin_new_loc.layer_num > curr_bb_coord->layer_max) {
+                bb_coord_new.layer_max = pin_new_loc.layer_num;
+                bb_edge_new.layer_max = 1;
+            } else if (pin_new_loc.layer_num == curr_bb_coord->layer_max) {
+                bb_coord_new.layer_max = pin_new_loc.layer_num;
+                bb_edge_new.layer_max = curr_bb_edge->layer_max + 1;
+            } else {
+                bb_coord_new.layer_max = curr_bb_coord->layer_max;
+                bb_edge_new.layer_max = curr_bb_edge->layer_max;
+            }
+
+
+        } else {
+            bb_coord_new.layer_min = curr_bb_coord->layer_min;
+            bb_coord_new.layer_max = curr_bb_coord->layer_max;
+            bb_edge_new.layer_min = curr_bb_edge->layer_min;
+            bb_edge_new.layer_max = curr_bb_edge->layer_max;
+        }
+
+    } else {
+        bb_coord_new.layer_min = curr_bb_coord->layer_min;
+        bb_coord_new.layer_max = curr_bb_coord->layer_max;
+        bb_edge_new.layer_min = curr_bb_edge->layer_min;
+        bb_edge_new.layer_max = curr_bb_edge->layer_max;
     }
 
     if (bb_updated_before[net_id] == NOT_UPDATED_YET) {
diff --git a/vpr/src/place/placer_context.h b/vpr/src/place/placer_context.h
index f5e56bbf37f..5a7e4c6860f 100644
--- a/vpr/src/place/placer_context.h
+++ b/vpr/src/place/placer_context.h
@@ -111,8 +111,11 @@ struct PlacerMoveContext : public Context {
 
     // Scratch vectors that are used by different directed moves for temporary calculations (allocated here to save runtime)
     // These vectors will grow up with the net size as it is mostly used to save coords of the net pins or net bb edges
+    // Given that placement moves involve operations on each coordinate independently, we chose to
+    // utilize a Struct of Arrays (SoA) rather than an Array of Structs (AoS).
     std::vector<int> X_coord;
     std::vector<int> Y_coord;
+    std::vector<int> layer_coord;
 
     // Container to save the highly critical pins (higher than a timing criticality limit setted by commandline option)
     std::vector<std::pair<ClusterNetId, int>> highly_crit_pins;
diff --git a/vpr/src/place/weighted_centroid_move_generator.cpp b/vpr/src/place/weighted_centroid_move_generator.cpp
index d33b6fa2ebe..93dd5c796f8 100644
--- a/vpr/src/place/weighted_centroid_move_generator.cpp
+++ b/vpr/src/place/weighted_centroid_move_generator.cpp
@@ -40,7 +40,7 @@ e_create_move WeightedCentroidMoveGenerator::propose_move(t_pl_blocks_to_be_move
 
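-    // Centroid location is not necessarily a valid location, and the downstream location expect a valid
-    // layer for "to" location. So if the layer is not valid, we set it to the same layer as from loc.
-    to.layer = (centroid.layer < 0) ? from.layer : centroid.layer;
+    // The centroid location is not necessarily a valid location, but find_to_loc_centroid() below expects
+    // a valid layer for the centroid location. So if the layer is not valid, we set it to the same layer as from loc.
+    centroid.layer = (centroid.layer < 0) ? from.layer : centroid.layer;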
     if (!find_to_loc_centroid(cluster_from_type, from, centroid, range_limiters, to, b_from)) {
         return e_create_move::ABORT;
     }
diff --git a/vpr/src/place/weighted_median_move_generator.cpp b/vpr/src/place/weighted_median_move_generator.cpp
index e25c98f08bd..a9e2aaac526 100644
--- a/vpr/src/place/weighted_median_move_generator.cpp
+++ b/vpr/src/place/weighted_median_move_generator.cpp
@@ -28,7 +28,7 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
     auto& place_move_ctx = g_placer_ctx.mutable_move();
 
     int num_layers = g_vpr_ctx.device().grid.get_num_layers();
-    bool is_multi_layer = (num_layers > 1);
+
 
     t_pl_loc from = place_ctx.block_locs[b_from].loc;
     auto cluster_from_type = cluster_ctx.clb_nlist.block_type(b_from);
@@ -45,6 +45,7 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
     //reused to save allocation time
     place_move_ctx.X_coord.clear();
     place_move_ctx.Y_coord.clear();
+    place_move_ctx.layer_coord.clear();
     std::vector<int> layer_blk_cnt(num_layers, 0);
 
     //true if the net is a feedback from the block to itself (all the net terminals are connected to the same block)
@@ -76,27 +77,19 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
         place_move_ctx.X_coord.insert(place_move_ctx.X_coord.end(), ceil(coords.xmax.criticality * CRIT_MULT_FOR_W_MEDIAN), coords.xmax.edge);
         place_move_ctx.Y_coord.insert(place_move_ctx.Y_coord.end(), ceil(coords.ymin.criticality * CRIT_MULT_FOR_W_MEDIAN), coords.ymin.edge);
         place_move_ctx.Y_coord.insert(place_move_ctx.Y_coord.end(), ceil(coords.ymax.criticality * CRIT_MULT_FOR_W_MEDIAN), coords.ymax.edge);
-        // If multile layers are available, I need to keep track of how many sinks are in each layer.
-        if (is_multi_layer) {
-            for (int layer_num = 0; layer_num < num_layers; layer_num++) {
-                layer_blk_cnt[layer_num] += place_move_ctx.num_sink_pin_layer[size_t(net_id)][layer_num];
-            }
-            // If the pin under consideration if of type sink, it is counted in place_move_ctx.num_sink_pin_layer, and we don't want to consider the moving pins
-            if (cluster_ctx.clb_nlist.pin_type(pin_id) != PinType::DRIVER) {
-                VTR_ASSERT(layer_blk_cnt[from.layer] > 0);
-                layer_blk_cnt[from.layer]--;
-            }
-        }
+        place_move_ctx.layer_coord.insert(place_move_ctx.layer_coord.end(), ceil(coords.layer_min.criticality * CRIT_MULT_FOR_W_MEDIAN), coords.layer_min.edge);
+        place_move_ctx.layer_coord.insert(place_move_ctx.layer_coord.end(), ceil(coords.layer_max.criticality * CRIT_MULT_FOR_W_MEDIAN), coords.layer_max.edge);
     }
 
-    if ((place_move_ctx.X_coord.empty()) || (place_move_ctx.Y_coord.empty())) {
-        VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\tMove aborted - X_coord and y_coord are empty\n");
+    if ((place_move_ctx.X_coord.empty()) || (place_move_ctx.Y_coord.empty()) || (place_move_ctx.layer_coord.empty())) {
+        VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\tMove aborted - X_coord or y_coord or layer_coord are empty\n");
         return e_create_move::ABORT;
     }
 
     //calculate the weighted median region
     std::stable_sort(place_move_ctx.X_coord.begin(), place_move_ctx.X_coord.end());
     std::stable_sort(place_move_ctx.Y_coord.begin(), place_move_ctx.Y_coord.end());
+    std::stable_sort(place_move_ctx.layer_coord.begin(), place_move_ctx.layer_coord.end());
 
     if (place_move_ctx.X_coord.size() == 1) {
         limit_coords.xmin = place_move_ctx.X_coord[0];
@@ -114,6 +107,14 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
         limit_coords.ymax = place_move_ctx.Y_coord[floor((place_move_ctx.Y_coord.size() - 1) / 2) + 1];
     }
 
+    if (place_move_ctx.layer_coord.size() == 1) {
+        limit_coords.layer_min = place_move_ctx.layer_coord[0];
+        limit_coords.layer_max = limit_coords.layer_min;
+    } else {
+        limit_coords.layer_min = place_move_ctx.layer_coord[floor((place_move_ctx.layer_coord.size() - 1) / 2)];
+        limit_coords.layer_max = place_move_ctx.layer_coord[floor((place_move_ctx.layer_coord.size() - 1) / 2) + 1];
+    }
+
     t_range_limiters range_limiters{rlim,
                                     place_move_ctx.first_rlim,
                                     placer_opts.place_dm_rlim};
@@ -121,17 +122,8 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
     t_pl_loc w_median_point;
     w_median_point.x = (limit_coords.xmin + limit_coords.xmax) / 2;
     w_median_point.y = (limit_coords.ymin + limit_coords.ymax) / 2;
+    w_median_point.layer = ((limit_coords.layer_min + limit_coords.layer_max) / 2);
 
-    // If multiple layers are available, we would choose the median layer, otherwise the same layer (layer #0) as the from_loc would be chosen
-    //#TODO: Since we are now only considering 2 layers, the layer with maximum number of sinks should be chosen. we need to update it to get the true median
-    if (is_multi_layer) {
-        int layer_num = std::distance(layer_blk_cnt.begin(), std::max_element(layer_blk_cnt.begin(), layer_blk_cnt.end()));
-        w_median_point.layer = layer_num;
-        to.layer = layer_num;
-    } else {
-        w_median_point.layer = from.layer;
-        to.layer = from.layer;
-    }
     if (!find_to_loc_centroid(cluster_from_type, from, w_median_point, range_limiters, to, b_from)) {
         return e_create_move::ABORT;
     }
@@ -162,8 +154,8 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved&
  *      - criticalities: the timing criticalities of all connections
  */
 static void get_bb_cost_for_net_excluding_block(ClusterNetId net_id, ClusterBlockId, ClusterPinId moving_pin_id, const PlacerCriticalities* criticalities, t_bb_cost* coords, bool& skip_net) {
-    int pnum, x, y, xmin, xmax, ymin, ymax;
-    float xmin_cost, xmax_cost, ymin_cost, ymax_cost, cost;
+    int pnum, x, y, layer, xmin, xmax, ymin, ymax, layer_min, layer_max;
+    float xmin_cost, xmax_cost, ymin_cost, ymax_cost, layer_min_cost, layer_max_cost, cost;
 
     skip_net = true;
 
@@ -171,11 +163,16 @@ static void get_bb_cost_for_net_excluding_block(ClusterNetId net_id, ClusterBloc
     xmax = 0;
     ymin = 0;
     ymax = 0;
+    layer_min = 0;
+    layer_max = 0;
+
     cost = 0.0;
     xmin_cost = 0.0;
     xmax_cost = 0.0;
     ymin_cost = 0.0;
     ymax_cost = 0.0;
+    layer_min_cost = 0.;
+    layer_max_cost = 0.;
 
     auto& cluster_ctx = g_vpr_ctx.clustering();
     auto& place_ctx = g_vpr_ctx.placement();
@@ -187,6 +184,7 @@ static void get_bb_cost_for_net_excluding_block(ClusterNetId net_id, ClusterBloc
     int ipin;
     for (auto pin_id : cluster_ctx.clb_nlist.net_pins(net_id)) {
         bnum = cluster_ctx.clb_nlist.pin_block(pin_id);
+        layer = place_ctx.block_locs[bnum].loc.layer;
 
         if (pin_id != moving_pin_id) {
             skip_net = false;
@@ -220,6 +218,10 @@ static void get_bb_cost_for_net_excluding_block(ClusterNetId net_id, ClusterBloc
                 xmax_cost = cost;
                 ymax = y;
                 ymax_cost = cost;
+                layer_min = layer;
+                layer_min_cost = cost;
+                layer_max = layer;
+                layer_max_cost = cost;
                 is_first_block = false;
             } else {
                 if (x < xmin) {
@@ -237,6 +239,20 @@ static void get_bb_cost_for_net_excluding_block(ClusterNetId net_id, ClusterBloc
                     ymax = y;
                     ymax_cost = cost;
                 }
+
+                // Track the min and max layer edges independently: when layer_min == layer_max a tie must be able to raise both costs.
+                if (layer < layer_min) {
+                    layer_min = layer;
+                    layer_min_cost = cost;
+                } else if (layer == layer_min && cost > layer_min_cost) {
+                    layer_min_cost = cost;
+                }
+                if (layer > layer_max) {
+                    layer_max = layer;
+                    layer_max_cost = cost;
+                } else if (layer == layer_max && cost > layer_max_cost) {
+                    layer_max_cost = cost;
+                }
             }
         }
     }
@@ -246,4 +262,6 @@ static void get_bb_cost_for_net_excluding_block(ClusterNetId net_id, ClusterBloc
     coords->xmax = {xmax, xmax_cost};
     coords->ymin = {ymin, ymin_cost};
     coords->ymax = {ymax, ymax_cost};
+    coords->layer_min = {layer_min, layer_min_cost};
+    coords->layer_max = {layer_max, layer_max_cost};
 }
diff --git a/vtr_flow/arch/multi_die/README.md b/vtr_flow/arch/multi_die/README.md
index 23aa7bcff79..d8e511368eb 100644
--- a/vtr_flow/arch/multi_die/README.md
+++ b/vtr_flow/arch/multi_die/README.md
@@ -28,6 +28,7 @@ This directory contains architecture files for 3D FPGAs. The architectures are d
        - The architecture has two dice.
        - Both dice are SIV-like FPGA fabric.
        - All pins can cross die.
+       - This is a completely hypothetical architecture, as the area required to place drivers on every channel segment to drive an IPIN on the other die would be too large. For the purpose of this scenario, we assume an inter-die connection delay of 0.
      - `3d_full_OPIN_inter_die_stratixiv_arch.timing.xml`
        - The architecture has two dice.
        - Both dice are SIV-like FPGA fabric.
diff --git a/vtr_flow/arch/multi_die/stratixiv_3d/3d_full_OPIN_inter_die_stratixiv_arch.timing.xml b/vtr_flow/arch/multi_die/stratixiv_3d/3d_full_OPIN_inter_die_stratixiv_arch.timing.xml
index fdf81e678b4..206a64dd7eb 100644
--- a/vtr_flow/arch/multi_die/stratixiv_3d/3d_full_OPIN_inter_die_stratixiv_arch.timing.xml
+++ b/vtr_flow/arch/multi_die/stratixiv_3d/3d_full_OPIN_inter_die_stratixiv_arch.timing.xml
@@ -5115,9 +5115,11 @@
               while keeping the switch delay a reasonable (see comment in <segmentlist>) portion of the overall delay.
        -->
     <switch type="mux" name="seg4_driver" R="450" Cin="0.60e-15" Cout="4.82e-15" Tdel="59e-12" mux_trans_size="2.630740" buf_size="27.645901"/>
+    <switch type="mux" name="seg4_inter_layer_driver" R="0.0" Cin="0.0" Cout="0.0" Tdel="138.82e-12" mux_trans_size="2.630740" buf_size="27.645901"/>
     <!-- KEM: Since the L16 wires are 4x as long as the L4s, it is not unreasonable to have the L16 drivers be at least 3x as
          powerful. -->
     <switch type="mux" name="seg16_driver" R="150" Cin="1.80e-15" Cout="14.5e-15" Tdel="87e-12" mux_trans_size="2.630740" buf_size="27.645901"/>
+    <switch type="mux" name="seg16_inter_layer_driver" R="0.0" Cin="0.0" Cout="0.0" Tdel="166.82e-12" mux_trans_size="2.630740" buf_size="27.645901"/>
     <!--switch ipin_cblock resistance set to yeild for 4x minimum drive strength buffer-->
     <switch type="mux" name="ipin_cblock" R="2231.5" Cout="0." Cin="1.47e-15" Tdel="0e0" mux_trans_size="1.222260" buf_size="auto"/>
     <switch type="mux" name="die_connection" R="2231.5" Cout="0" Cin="1.47e-15" Tdel="0e0" mux_trans_size="1.222260" buf_size="auto"/>
@@ -5215,14 +5217,14 @@
           -->
     <segment name="L4" freq="260" length="4" type="unidir" Rmetal="201.7" Cmetal="18.0e-15">
       <mux name="seg4_driver"/>
-      <mux_inter_die name="seg4_driver"/>
+      <mux_inter_die name="seg4_inter_layer_driver"/>
       <!-- L4 connect to connection block L4 and L4prime -->
       <sb type="pattern">1 1 1 1 1</sb>
       <cb type="pattern">1 1 1 1</cb>
     </segment>
     <segment name="L16" freq="40" length="16" type="unidir" Rmetal="50.42" Cmetal="20.7e-15">
       <mux name="seg16_driver"/>
-      <mux_inter_die name="seg16_driver"/>
+      <mux_inter_die name="seg16_inter_layer_driver"/>
       <!-- L16 connect to L16 and L4 -->
       <!-- Vias from the top of the metal stack (global layers, where the long wires are 
            implemented) down to the middle/bottom of the metal stack (semi-global layers,