diff --git a/libs/libvtrutil/src/vtr_vec_id_set.h b/libs/libvtrutil/src/vtr_vec_id_set.h index 9e0a1f0802e..ed6620b1cdd 100644 --- a/libs/libvtrutil/src/vtr_vec_id_set.h +++ b/libs/libvtrutil/src/vtr_vec_id_set.h @@ -2,6 +2,7 @@ #define VTR_SET_H #include +#include namespace vtr { diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index f49d08bf6c9..e3948d6f88a 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -572,6 +572,8 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts) PlacerOpts->effort_scaling = Options.place_effort_scaling; PlacerOpts->timing_update_type = Options.timing_update_type; + + PlacerOpts->place_quench_metric = Options.place_quench_metric; } static void SetupAnalysisOpts(const t_options& Options, t_analysis_opts& analysis_opts) { diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index 64d607f7a24..53760cc9b68 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -1025,6 +1025,41 @@ struct ParseTimingUpdateType { } }; +struct ParsePlaceQuenchMetric { + ConvertedValue from_str(std::string str) { + ConvertedValue conv_value; + if (str == "auto") + conv_value.set_value(e_place_quench_metric::AUTO); + else if (str == "timing_cost") + conv_value.set_value(e_place_quench_metric::TIMING_COST); + else if (str == "setup_slack") + conv_value.set_value(e_place_quench_metric::SETUP_SLACK); + else { + std::stringstream msg; + msg << "Invalid conversion from '" << str << "' to e_place_quench_metric (expected one of: " << argparse::join(default_choices(), ", ") << ")"; + conv_value.set_error(msg.str()); + } + return conv_value; + } + + ConvertedValue to_str(e_place_quench_metric val) { + ConvertedValue conv_value; + if (val == e_place_quench_metric::AUTO) + conv_value.set_value("auto"); + else if (val == e_place_quench_metric::TIMING_COST) + conv_value.set_value("timing_cost"); + else { + VTR_ASSERT(val == e_place_quench_metric::SETUP_SLACK); + conv_value.set_value("setup_slack"); + } + return conv_value; + } + + std::vector default_choices() { + return {"auto", "timing_cost", "setup_slack"}; + } +}; + argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& args) { std::string description = "Implements the specified circuit onto the target FPGA architecture" @@ -1814,6 +1849,17 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .default_value("") .show_in(argparse::ShowIn::HELP_ONLY); + place_timing_grp.add_argument(args.place_quench_metric, "--place_quench_metric") + .help( + "Controls which cost function the placer uses during the quench stage:\n" + " * auto: VPR decides\n" + " * timing_cost: The same cost formulation as the one used during\n" + " the annealing stage (more stable)\n" + " * setup_slack: Directly uses setup slacks (in combination with wiring)\n" + " to check if the block moves should be accepted\n") + .default_value("auto") + .show_in(argparse::ShowIn::HELP_ONLY); + auto& route_grp = parser.add_argument_group("routing options"); route_grp.add_argument(args.max_router_iterations, "--max_router_iterations") diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h index 55d6b46b532..9e25e81f528 100644 --- a/vpr/src/base/read_options.h +++ b/vpr/src/base/read_options.h @@ -128,6 +128,7 @@ struct t_options { argparse::ArgValue place_delay_model; argparse::ArgValue place_delay_model_reducer; argparse::ArgValue allowed_tiles_for_delay_model; + argparse::ArgValue place_quench_metric; /* Router 
Options */ argparse::ArgValue check_rr_graph; diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 0dffc51af41..97df4413bcf 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -851,7 +851,8 @@ struct t_annealing_sched { * doPlacement: true if placement is supposed to be done in the CAD flow, false otherwise */ enum e_place_algorithm { BOUNDING_BOX_PLACE, - PATH_TIMING_DRIVEN_PLACE + PATH_TIMING_DRIVEN_PLACE, + SETUP_SLACK_ANALYSIS_PLACE }; enum e_pad_loc_type { @@ -889,6 +890,12 @@ enum class e_place_delta_delay_algorithm { DIJKSTRA_EXPANSION, }; +enum class e_place_quench_metric { + TIMING_COST, + SETUP_SLACK, + AUTO +}; + struct t_placer_opts { enum e_place_algorithm place_algorithm; float timing_tradeoff; @@ -935,6 +942,7 @@ struct t_placer_opts { std::string allowed_tiles_for_delay_model; e_place_delta_delay_algorithm place_delta_delay_matrix_calculation_method; + e_place_quench_metric place_quench_metric; }; /* All the parameters controlling the router's operation are in this * diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 07d29f914fd..321a9a2556c 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -1,3 +1,8 @@ +/** + * @file place.cpp + * @brief This is a core file that defines the major placer routines used by VPR. + */ + #include #include #include @@ -44,6 +49,9 @@ #include "tatum/echo_writer.hpp" #include "tatum/TimingReporter.hpp" +#include "place_global.h" +#include "place_timing_update.h" + using std::max; using std::min; @@ -58,10 +66,6 @@ using std::min; * cost computation. 0.01 means that there is a 1% error tolerance. */ #define ERROR_TOL .01 -/* The final rlim (range limit) is 1, which is the smallest value that can * - * still make progress, since an rlim of 0 wouldn't allow any swaps. */ -#define FINAL_RLIM 1 - /* This defines the maximum number of swap attempts before invoking the * * once-in-a-while placement legality check as well as floating point * * variables round-offs check. */ @@ -91,110 +95,130 @@ struct t_placer_statistics { sum_of_squares; int success_sum; }; +constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); +constexpr double INVALID_COST = std::numeric_limits::quiet_NaN(); + +/******************************************************************************* + * Below is a list of definitions of data structures declared as `extern` in * + * place_global.h. These variables were originally local to the current file. * + * However, they were moved so as to facilitate moving some of the routines * + * in the current file into other source files. * + * TODO: Create a single extern variable that allows access to all these data * + * structures so that these structures don't have to be declared as extern. * + *******************************************************************************/ + +/** + * @brief Cost of a net, and a temporary cost of a net used during move assessment. + * + * Index range: [0...cluster_ctx.clb_nlist.nets().size()-1] + */ +vtr::vector net_cost, proposed_net_cost; -struct t_placer_costs { - //Although we do nost cost calculations with float's we - //use doubles for the accumulated costs to avoid round-off, - //particularly on large designs where the magnitude of a single - //move's delta cost is small compared to the overall cost. 
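The removed comment above explains why the accumulated placement costs are kept as doubles even though each individual move's delta cost is computed in single precision. A tiny stand-alone illustration of the round-off this guards against (the magnitudes are invented for the example and are not taken from VPR):

#include <cstdio>

int main() {
    float  cost_f = 1.0e8f;   // large running cost, accumulated in float
    double cost_d = 1.0e8;    // same cost, accumulated in double
    const float delta = 0.5f; // one move's delta cost, tiny by comparison

    for (int i = 0; i < 1000000; ++i) {
        cost_f += delta; // 0.5 is below float's resolution at 1e8, so it is lost
        cost_d += delta; // double retains enough precision for the running sum
    }

    std::printf("float  total: %.1f\n", cost_f); // stays at 100000000.0
    std::printf("double total: %.1f\n", cost_d); // 100500000.0, as expected
    return 0;
}

This is the kind of drift that the periodic recompute-from-scratch check (with its ERROR_TOL tolerance) exists to catch.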
- double cost; - double bb_cost; - double timing_cost; -}; - -struct t_placer_prev_inverse_costs { - double bb_cost; - double timing_cost; -}; - -// Used by update_annealing_state() -struct t_annealing_state { - float t; // Temperature - float rlim; // Range limit for swaps - float inverse_delta_rlim; // used to calculate crit_exponent - float alpha; // Temperature decays by this factor each outer iteration - float restart_t; // Temperature used after restart due to minimum success ratio - float crit_exponent; // Used by timing-driven placement to "sharpen" timing criticality - int move_lim_max; // Maximum move limit - int move_lim; // Current move limit -}; +/** + * @brief A flag array to indicate whether the specific bounding box has + * been updated in this particular swap or not. + * + * If it has been updated before, the code must use the updated data, instead of + * the out-of-date data passed into the subroutine, particularly used in try_swap(). + * + * NOT_UPDATED_YET Indicates that the net has not been updated before. + * UPDATED_ONCE Indicates that the net has been updated once, if it is going to be + * updated again, the values from the previous update must be used. + * GOT_FROM_SCRATCH Only applicable for nets larger than SMALL_NETS. It indicates that + * the particular bounding box cannot be updated incrementally before, + * hence the bounding box is got from scratch, so the bounding box + * would definitely be right, DO NOT update again. + * + * Index range: [0...cluster_ctx.clb_nlist.nets().size()-1] + */ +vtr::vector bb_updated_before; -constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); +/** + * @brief Net connection delays. + * + * @param connection_delay + * Delays based on the committed block positions. + * @param proposed_connection_delay + * Delays based on the proposed block positions. Only for connections + * affected by the proposed move. Otherwise, INVALID_DELAY. + * + * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ +ClbNetPinsMatrix connection_delay, proposed_connection_delay; -constexpr double MAX_INV_TIMING_COST = 1.e9; -/* Stops inverse timing cost from going to infinity with very lax timing constraints, - * which avoids multiplying by a gigantic prev_inverse.timing_cost when auto-normalizing. - * The exact value of this cost has relatively little impact, but should not be - * large enough to be on the order of timing costs for normal constraints. */ - -/********************** Variables local to place.c ***************************/ - -/* Cost of a net, and a temporary cost of a net used during move assessment. */ -static vtr::vector net_cost, proposed_net_cost; - -/* [0...cluster_ctx.clb_nlist.nets().size()-1] * - * A flag array to indicate whether the specific bounding box has been updated * - * in this particular swap or not. If it has been updated before, the code * - * must use the updated data, instead of the out-of-date data passed into the * - * subroutine, particularly used in try_swap(). The value NOT_UPDATED_YET * - * indicates that the net has not been updated before, UPDATED_ONCE indicated * - * that the net has been updated once, if it is going to be updated again, the * - * values from the previous update must be used. 
GOT_FROM_SCRATCH is only * - * applicable for nets larger than SMALL_NETS and it indicates that the * - * particular bounding box cannot be updated incrementally before, hence the * - * bounding box is got from scratch, so the bounding box would definitely be * - * right, DO NOT update again. */ -static vtr::vector bb_updated_before; - -/* - * Net connection delays based on the placement. +/** + * @brief Net connection setup slacks based on most recently updated timing graph. + * + * Updated with commit_setup_slacks() routine. + * * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ -static ClbNetPinsMatrix connection_delay; //Delays based on commited block positions -static ClbNetPinsMatrix proposed_connection_delay; //Delays for proposed block positions (only - // for connections effected by move, otherwise - // INVALID_DELAY) +ClbNetPinsMatrix connection_setup_slack; -/* - * Timing cost of connections (i.e. criticality * delay). +/** + * @brief Net connection timing costs (i.e. criticality * delay). + * + * @param connection_timing_cost + * Costs of committed block positions. See PlacerTimingCosts. + * @param proposed_connection_timing_cost + * Costs for proposed block positions. Only for connection + * affected by the proposed move. Otherwise, INVALID_DELAY + * * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ -static PlacerTimingCosts connection_timing_cost; //Costs of commited block positions -static ClbNetPinsMatrix proposed_connection_timing_cost; //Costs for proposed block positions - // (only for connectsion effected by - // move, otherwise INVALID_DELAY) - -/* - * Timing cost of nets (i.e. sum of criticality * delay for each net sink/connection). - * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1] +PlacerTimingCosts connection_timing_cost; +ClbNetPinsMatrix proposed_connection_timing_cost; + +/** + * @brief Timing cost of nets (i.e. sum of criticality * delay for each net sink/connection). + * + * Like connection_timing_cost, but summed across net pins. Used to allow more + * efficient recalculation of timing cost if only a sub-set of nets are changed + * while maintaining numeric stability. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1] */ -static vtr::vector net_timing_cost; //Like connection_timing_cost, but summed - // accross net pins. Used to allow more - // efficient recalculation of timing cost - // if only a sub-set of nets are changed - // while maintaining numeric stability. - -/* [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the bounding box coordinates and the number of * - * blocks on each of a net's bounding box (to allow efficient updates), * - * respectively. */ - -static vtr::vector bb_coords, bb_num_on_edges; - -/* The arrays below are used to precompute the inverse of the average * - * number of tracks per channel between [subhigh] and [sublow]. Access * - * them as chan?_place_cost_fac[subhigh][sublow]. They are used to * - * speed up the computation of the cost function that takes the length * - * of the net bounding box in each dimension, divided by the average * - * number of tracks in that direction; for other cost functions they * - * will never be used. * +vtr::vector net_timing_cost; + +/** + * @brief Store the bounding box coordinates and the number of blocks on each + * of a net's bounding box (to allow efficient updates) respectively. 
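bb_num_on_edges is what makes mostly-constant-time bounding box maintenance possible: a net's box only needs to be recomputed from scratch when the block that moved was the last one sitting on a shrinking edge. A simplified, single-axis sketch of that bookkeeping (illustrative only; the placer's real t_bb handling covers more cases):

#include <algorithm>
#include <vector>

// One axis of a net's bounding box, plus how many blocks sit on each edge.
struct AxisBB {
    int lo = 0, hi = 0;     // bounding interval of the net's pins on this axis
    int n_lo = 0, n_hi = 0; // number of blocks exactly on the lo / hi edge
};

// O(#pins) rebuild: the fallback when an edge count drops to zero.
AxisBB compute_from_scratch(const std::vector<int>& coords) {
    AxisBB bb;
    bb.lo = *std::min_element(coords.begin(), coords.end());
    bb.hi = *std::max_element(coords.begin(), coords.end());
    for (int x : coords) {
        if (x == bb.lo) ++bb.n_lo;
        if (x == bb.hi) ++bb.n_hi;
    }
    return bb;
}

// O(1) update when a single block on the net moves from x_old to x_new.
// Returns false when the box must be rebuilt from scratch, i.e. the mover
// was the only block on an edge that is now shrinking.
bool update_incrementally(AxisBB& bb, int x_old, int x_new) {
    if (x_old == x_new) return true;

    if (x_old == bb.lo && x_new > bb.lo && --bb.n_lo == 0) return false;
    if (x_old == bb.hi && x_new < bb.hi && --bb.n_hi == 0) return false;

    if (x_new < bb.lo) {
        bb.lo = x_new; // box grows: the mover defines the new edge
        bb.n_lo = 1;
    } else if (x_new == bb.lo) {
        ++bb.n_lo;
    }
    if (x_new > bb.hi) {
        bb.hi = x_new;
        bb.n_hi = 1;
    } else if (x_new == bb.hi) {
        ++bb.n_hi;
    }
    return true;
}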
+ * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1] */ -static float** chanx_place_cost_fac; //[0...device_ctx.grid.width()-2] -static float** chany_place_cost_fac; //[0...device_ctx.grid.height()-2] +vtr::vector bb_coords, bb_num_on_edges; -/* The following arrays are used by the try_swap function for speed. */ -/* [0...cluster_ctx.clb_nlist.nets().size()-1] */ -static vtr::vector ts_bb_coord_new, ts_bb_edge_new; -static std::vector ts_nets_to_update; +/** + * @brief 2D arrays used to precompute the inverse of the average + * number of tracks per channel between [subhigh] and [sublow]. + * + * Access them as chan?_place_cost_fac[subhigh][sublow]. + * They are used to speed up the computation of the cost function that + * takes the length of the net bounding box in each dimension, divided + * by the average number of tracks in that direction. + * + * For other cost functions they will never be used. + * + * @param chanx_place_cost_fac + * 1st dimension index range: [0...device_ctx.grid.width()-2] + * @param chany_place_cost_fac + * 1st dimension index range: [0...device_ctx.grid.height()-2] + * + * For more detailed structure allocation process and index ranges, see + * alloc_and_load_for_fast_cost_update(). + */ +float** chanx_place_cost_fac; +float** chany_place_cost_fac; + +/** + * @brief The following arrays are used by the try_swap function for speed. + * + * Index range: [0...cluster_ctx.clb_nlist.nets().size()-1] + */ +vtr::vector ts_bb_coord_new, ts_bb_edge_new; +std::vector ts_nets_to_update; + +/********** End of definitions of variables in place_global.h **********/ /* These file-scoped variables keep track of the number of swaps * * rejected, accepted or aborted. The total number of swap attempts * @@ -216,11 +240,6 @@ static const float cross_count[50] = {/* [0..49] */ 1.0, 1.0, 1.0, 1.0828, 1.153 2.5064, 2.5356, 2.5610, 2.5864, 2.6117, 2.6371, 2.6625, 2.6887, 2.7148, 2.7410, 2.7671, 2.7933}; -static float f_update_td_costs_connections_elapsed_sec = 0.; -static float f_update_td_costs_nets_elapsed_sec = 0.; -static float f_update_td_costs_sum_nets_elapsed_sec = 0.; -static float f_update_td_costs_total_elapsed_sec = 0.; - std::unique_ptr f_move_stats_file(nullptr, vtr::fclose); #ifdef VTR_ENABLE_DEBUG_LOGGING @@ -320,16 +339,16 @@ static double comp_bb_cost(e_cost_methods method); static void update_move_nets(int num_nets_affected); static void reset_move_nets(int num_nets_affected); -static e_move_result try_swap(float t, +static e_move_result try_swap(const t_annealing_state* state, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - float rlim, MoveGenerator& move_generator, - TimingInfo* timing_info, + SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, float rlim_escape_fraction, enum e_place_algorithm place_algorithm, float timing_tradeoff); @@ -347,54 +366,39 @@ static int check_placement_consistency(); static int check_block_placement_consistency(); static int check_macro_placement_consistency(); -static float starting_t(t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, +static float starting_t(const t_annealing_state* state, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs, t_annealing_sched annealing_sched, - 
int max_moves, - float rlim, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, - TimingInfo* timing_info, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + SetupTimingInfo* timing_info, MoveGenerator& move_generator, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts); -static bool update_annealing_state(t_annealing_state* state, - float success_rat, - const t_placer_costs& costs, - const t_placer_opts& placer_opts, - const t_annealing_sched& annealing_sched); - -static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid); - static int count_connections(); static double get_std_dev(int n, double sum_x_squared, double av_x); static double recompute_bb_cost(); -static float comp_td_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin); - -static void comp_td_connection_delays(const PlaceDelayModel* delay_model); - static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); -static void invalidate_affected_connection_delays(const t_pl_blocks_to_be_moved& blocks_affected, +static void invalidate_affected_connection_delays(const std::vector& sink_pins_affected, ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info); static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected); -static void update_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost); +static void find_affected_sink_pins(const t_pl_blocks_to_be_moved& blocks_affected, + std::vector& sink_pins_affected); -static void comp_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost); - -static double comp_td_connection_cost(const PlaceDelayModel* delay_mode, const PlacerCriticalities& place_crit, ClusterNetId net, int ipin); -static double sum_td_net_cost(ClusterNetId net); -static double sum_td_costs(); +static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks); static e_move_result assess_swap(double delta_c, double t); @@ -431,41 +435,34 @@ static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr); static void free_try_swap_arrays(); -static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, - t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - int num_connections, - float crit_exponent, - int* outer_crit_iter_count, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info); - -static void recompute_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_costs* costs); - -static void placement_inner_loop(float t, +static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs, + int num_connections, + float crit_exponent, + int* outer_crit_iter_count, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info); + +static void 
placement_inner_loop(const t_annealing_state* state, int temp_num, - float rlim, const t_placer_opts& placer_opts, - int move_lim, - float crit_exponent, int inner_recompute_limit, t_placer_statistics* stats, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, int* moves_since_cost_recompute, ClusteredPinTimingInvalidator* pin_timing_invalidator, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, - SetupTimingInfo* timing_info); + SetupTimingInfo* timing_info, + enum e_place_algorithm place_algorithm); static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, const PlaceDelayModel* delay_model, @@ -495,7 +492,7 @@ static void print_place_status(const size_t num_temps, size_t tot_moves); static void print_resources_utilization(); -static void init_annealing_state(t_annealing_state* state, const t_annealing_sched& annealing_sched, float t, float rlim, int move_lim_max, float crit_exponent); +static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& placer_opts); /*****************************************************************************/ void try_place(const t_placer_opts& placer_opts, @@ -518,12 +515,10 @@ void try_place(const t_placer_opts& placer_opts, auto& timing_ctx = g_vpr_ctx.timing(); auto pre_place_timing_stats = timing_ctx.stats; - int tot_iter, moves_since_cost_recompute, width_fac, num_connections, - outer_crit_iter_count, inner_recompute_limit; - float success_rat, first_crit_exponent, first_rlim; + int tot_iter, moves_since_cost_recompute, width_fac, num_connections, outer_crit_iter_count; + float success_rat, first_crit_exponent; - t_placer_costs costs; - t_placer_prev_inverse_costs prev_inverse_costs; + t_placer_costs costs(placer_opts.place_algorithm); tatum::TimingPathInfo critical_path; float sTNS = NAN; @@ -537,10 +532,12 @@ void try_place(const t_placer_opts& placer_opts, std::shared_ptr placement_delay_calc; std::unique_ptr place_delay_model; std::unique_ptr move_generator; + std::unique_ptr placer_setup_slacks; std::unique_ptr placer_criticalities; std::unique_ptr pin_timing_invalidator; t_pl_blocks_to_be_moved blocks_affected(cluster_ctx.clb_nlist.blocks().size()); + t_placer_timing_update_mode timing_update_mode; /* Allocated here because it goes into timing critical code where each memory allocation is expensive */ IntraLbPbPinLookup pb_gpin_lookup(device_ctx.logical_block_types); @@ -552,8 +549,8 @@ void try_place(const t_placer_opts& placer_opts, num_ts_called = 0; if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - /*do this before the initial placement to avoid messing up the initial placement */ - place_delay_model = alloc_lookups_and_criticalities(chan_width_dist, placer_opts, router_opts, det_routing_arch, segment_inf, directs, num_directs); + /* Do this before the initial placement to avoid messing up the initial placement */ + place_delay_model = alloc_lookups_and_delay_model(chan_width_dist, placer_opts, router_opts, det_routing_arch, segment_inf, directs, num_directs); if (isEchoFileEnabled(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL)) { place_delay_model->dump_echo(getEchoFileName(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL)); @@ -604,6 +601,8 @@ void try_place(const t_placer_opts& placer_opts, timing_info = make_setup_timing_info(placement_delay_calc, placer_opts.timing_update_type); + placer_setup_slacks = 
std::make_unique(cluster_ctx.clb_nlist, netlist_pin_lookup); + placer_criticalities = std::make_unique(cluster_ctx.clb_nlist, netlist_pin_lookup); pin_timing_invalidator = std::make_unique(cluster_ctx.clb_nlist, @@ -611,15 +610,15 @@ void try_place(const t_placer_opts& placer_opts, atom_ctx.nlist, atom_ctx.lookup, *timing_info->timing_graph()); - //Update timing and costs - recompute_criticalities(first_crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); - - timing_info->set_warn_unconstrained(false); //Don't warn again about unconstrained nodes again during placement + //First time compute timing and costs, compute from scratch + initialize_timing_info(first_crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &timing_update_mode, + &costs); critical_path = timing_info->least_slack_critical_path(); @@ -635,26 +634,34 @@ void try_place(const t_placer_opts& placer_opts, outer_crit_iter_count = 1; - prev_inverse_costs.timing_cost = 1 / costs.timing_cost; - prev_inverse_costs.bb_cost = 1 / costs.bb_cost; - costs.cost = 1; /*our new cost function uses normalized values of */ - /*bb_cost and timing_cost, the value of cost will be reset */ - /*to 1 at each temperature when *_TIMING_DRIVEN_PLACE is true */ - } else { /*BOUNDING_BOX_PLACE */ - costs.cost = costs.bb_cost = comp_bb_cost(NORMAL); - costs.timing_cost = 0; + /** + * Initialize the normalization factors. Calling costs.update_norm_factors() here + * would fail the golden results of strong_multiclock benchmark + */ + costs.timing_cost_norm = 1 / costs.timing_cost; + costs.bb_cost_norm = 1 / costs.bb_cost; + costs.cost = 1; + + } else { //placer_opts.place_algorithm == BOUNDING_BOX_PLACE + + //cost is the same as wirelength cost + costs.bb_cost = comp_bb_cost(NORMAL); + costs.cost = costs.bb_cost; + + //Timing cost and normalization factors are not used + costs.timing_cost = INVALID_COST; + costs.timing_cost_norm = INVALID_COST; + costs.bb_cost_norm = INVALID_COST; + outer_crit_iter_count = 0; num_connections = 0; first_crit_exponent = 0; - - prev_inverse_costs.timing_cost = 0; /*inverses not used */ - prev_inverse_costs.bb_cost = 0; } //Sanity check that initial placement is legal check_place(costs, place_delay_model.get(), placer_criticalities.get(), placer_opts.place_algorithm); - //Initial pacement statistics + //Initial placement statistics VTR_LOG("Initial placement cost: %g bb_cost: %g td_cost: %g\n", costs.cost, costs.bb_cost, costs.timing_cost); if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { @@ -687,63 +694,47 @@ void try_place(const t_placer_opts& placer_opts, print_place(nullptr, nullptr, filename.c_str()); } - int move_lim = 1; - if (placer_opts.effort_scaling == e_place_effort_scaling::CIRCUIT) { - //This scales the move limit proportional to num_blocks ^ (4/3) - move_lim = (int)(annealing_sched.inner_num * pow(cluster_ctx.clb_nlist.blocks().size(), 1.3333)); - } else if (placer_opts.effort_scaling == e_place_effort_scaling::DEVICE_CIRCUIT) { - //This scales the move limit proportional to device_size ^ (2/3) * num_blocks ^ (2/3) - // - //For highly utilized devices (device_size ~ num_blocks) this is the same as - //num_blocks ^ (4/3). - // - //For low utilization devices (device_size >> num_blocks) this performs more - //moves (device_size ^ (2/3)) to ensure better optimization. 
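The removed block above is replaced by a call to get_initial_move_lim(); its definition is not part of this hunk, but the scaling rules it has to implement are the ones this old comment describes. A stand-alone sketch of the two effort-scaling formulas, assuming the same inputs (annealing_sched.inner_num, the clustered block count, and the device grid dimensions):

#include <algorithm>
#include <cmath>
#include <cstddef>

enum class EffortScaling { CIRCUIT, DEVICE_CIRCUIT };

// Moves per temperature, following the two scaling rules described above.
// Illustrative sketch only, not the body of get_initial_move_lim().
int initial_move_lim_sketch(EffortScaling scaling,
                            float inner_num,        // annealing_sched.inner_num
                            std::size_t num_blocks, // clustered netlist blocks
                            std::size_t grid_width,
                            std::size_t grid_height) {
    double move_lim;
    if (scaling == EffortScaling::CIRCUIT) {
        // Proportional to num_blocks ^ (4/3).
        move_lim = inner_num * std::pow((double)num_blocks, 4.0 / 3.0);
    } else {
        // Proportional to device_size ^ (2/3) * num_blocks ^ (2/3); for highly
        // utilized devices this degenerates to num_blocks ^ (4/3) as well.
        double device_size = (double)(grid_width * grid_height);
        move_lim = inner_num * std::pow(device_size, 2.0 / 3.0)
                   * std::pow((double)num_blocks, 2.0 / 3.0);
    }
    // Never return 0: a run with a random placement still needs at least one
    // move per temperature to avoid divide-by-zero and empty-vector problems.
    return std::max(1, (int)move_lim);
}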
In this case, - //more moves than num_blocks ^ (4/3) may be required, since the search space - //is larger. - float device_size = device_ctx.grid.width() * device_ctx.grid.height(); - move_lim = (int)(annealing_sched.inner_num * pow(device_size, 2. / 3.) * pow(cluster_ctx.clb_nlist.blocks().size(), 2. / 3.)); - } else { - VPR_ERROR(VPR_ERROR_PLACE, "Unrecognized placer effort scaling"); - } - VTR_LOG("Moves per temperature: %d\n", move_lim); - - /* Sometimes I want to run the router with a random placement. Avoid * - * using 0 moves to stop division by 0 and 0 length vector problems, * - * by setting move_lim to 1 (which is still too small to do any * - * significant optimization). */ - if (move_lim <= 0) - move_lim = 1; + int first_move_lim = get_initial_move_lim(placer_opts, annealing_sched); + int inner_recompute_limit; if (placer_opts.inner_loop_recompute_divider != 0) { - inner_recompute_limit = (int)(0.5 + (float)move_lim / (float)placer_opts.inner_loop_recompute_divider); + inner_recompute_limit = (int)(0.5 + (float)first_move_lim / (float)placer_opts.inner_loop_recompute_divider); } else { /*don't do an inner recompute */ - inner_recompute_limit = move_lim + 1; + inner_recompute_limit = first_move_lim + 1; } int quench_recompute_limit; if (placer_opts.quench_recompute_divider != 0) { - quench_recompute_limit = (int)(0.5 + (float)move_lim / (float)placer_opts.quench_recompute_divider); + quench_recompute_limit = (int)(0.5 + (float)first_move_lim / (float)placer_opts.quench_recompute_divider); } else { /*don't do an quench recompute */ - quench_recompute_limit = move_lim + 1; + quench_recompute_limit = first_move_lim + 1; } - first_rlim = (float)max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); + /* Get the first range limiter */ + float first_rlim = float(std::max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1)); - float first_t = starting_t(&costs, &prev_inverse_costs, - annealing_sched, move_lim, first_rlim, - place_delay_model.get(), - placer_criticalities.get(), - timing_info.get(), - *move_generator, - pin_timing_invalidator.get(), - blocks_affected, - placer_opts); + /* Set the temperature high so essentially all swaps will be accepted */ + /* when trying to determine the starting temp for placement inner loop. 
*/ + float first_t = HUGE_POSITIVE_FLOAT; + + /* Initialize annealing state variables */ + t_annealing_state state(annealing_sched, first_t, first_rlim, first_move_lim, first_crit_exponent); - t_annealing_state state; - init_annealing_state(&state, annealing_sched, first_t, first_rlim, move_lim, first_crit_exponent); + /* Update the starting temperature for placement annealing to a more appropriate value */ + state.t = starting_t(&state, + &timing_update_mode, + &costs, + annealing_sched, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + timing_info.get(), + *move_generator, + pin_timing_invalidator.get(), + blocks_affected, + placer_opts); if (!placer_opts.move_stats_file.empty()) { f_move_stats_file = std::unique_ptr(vtr::fopen(placer_opts.move_stats_file.c_str(), "w"), vtr::fclose); @@ -761,30 +752,30 @@ void try_place(const t_placer_opts& placer_opts, /* Outer loop of the simulated annealing begins */ do { vtr::Timer temperature_timer; - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - costs.cost = 1; - } - outer_loop_recompute_criticalities(placer_opts, &costs, &prev_inverse_costs, - num_connections, - state.crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get()); - - placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, - state.move_lim, state.crit_exponent, inner_recompute_limit, &stats, - &costs, - &prev_inverse_costs, + outer_loop_update_timing_info(placer_opts, &timing_update_mode, + &costs, + num_connections, + state.crit_exponent, + &outer_crit_iter_count, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get()); + + placement_inner_loop(&state, num_temps, placer_opts, + inner_recompute_limit, &stats, + &timing_update_mode, &costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), *move_generator, blocks_affected, - timing_info.get()); + timing_info.get(), + placer_opts.place_algorithm); tot_iter += state.move_lim; @@ -818,39 +809,49 @@ void try_place(const t_placer_opts& placer_opts, /* Outer loop of the simmulated annealing ends */ auto pre_quench_timing_stats = timing_ctx.stats; + + /* Start quench */ + state.t = 0; //Freeze out: only accept solutions that improve placement + state.move_lim = first_move_lim; //Revert the move limit to initial value + { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); - outer_loop_recompute_criticalities(placer_opts, &costs, - &prev_inverse_costs, - num_connections, - state.crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get()); - - state.t = 0; /* freeze out */ + outer_loop_update_timing_info(placer_opts, &timing_update_mode, + &costs, + num_connections, + state.crit_exponent, + &outer_crit_iter_count, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get()); + + //Use setup slack analysis if the placer is timing driven + //and the quench metric is SETUP_SLACK. 
Otherwise, use the + //same cost formulation as the annealing stage + auto quench_algorithm = get_placement_quench_algorithm(placer_opts); /* Run inner loop again with temperature = 0 so as to accept only swaps * which reduce the cost of the placement */ - placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, - move_lim, state.crit_exponent, quench_recompute_limit, &stats, - &costs, - &prev_inverse_costs, + placement_inner_loop(&state, num_temps, placer_opts, + quench_recompute_limit, &stats, + &timing_update_mode, &costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), *move_generator, blocks_affected, - timing_info.get()); + timing_info.get(), + quench_algorithm); - tot_iter += move_lim; + tot_iter += state.move_lim; ++num_temps; - calc_placer_stats(stats, success_rat, std_dev, costs, move_lim); + calc_placer_stats(stats, success_rat, std_dev, costs, state.move_lim); if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { critical_path = timing_info->least_slack_critical_path(); @@ -894,12 +895,18 @@ void try_place(const t_placer_opts& placer_opts, VTR_ASSERT(timing_info); //Update timing and costs - recompute_criticalities(state.crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); + timing_update_mode.update_criticalities = true; + timing_update_mode.update_setup_slacks = true; + update_setup_slacks_and_criticalities(state.crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &timing_update_mode, + &costs); + + commit_setup_slacks(placer_setup_slacks.get()); critical_path = timing_info->least_slack_critical_path(); @@ -949,22 +956,30 @@ void try_place(const t_placer_opts& placer_opts, print_timing_stats("Placement Quench", post_quench_timing_stats, pre_quench_timing_stats); print_timing_stats("Placement Total ", timing_ctx.stats, pre_place_timing_stats); - VTR_LOG("update_td_costs: connections %g nets %g sum_nets %g total %g\n", f_update_td_costs_connections_elapsed_sec, f_update_td_costs_nets_elapsed_sec, f_update_td_costs_sum_nets_elapsed_sec, f_update_td_costs_total_elapsed_sec); + auto update_td_costs_runtime_stats = get_update_td_costs_runtime_stats(); + + VTR_LOG("update_td_costs: connections %g nets %g sum_nets %g total %g\n", + update_td_costs_runtime_stats.connections_elapsed_sec, + update_td_costs_runtime_stats.nets_elapsed_sec, + update_td_costs_runtime_stats.sum_nets_elapsed_sec, + update_td_costs_runtime_stats.total_elapsed_sec); } -/* Function to recompute the criticalities before the inner loop of the annealing */ -static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, - t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - int num_connections, - float crit_exponent, - int* outer_crit_iter_count, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info) { - if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE) +/* Function to update the setup slacks and criticalities before the inner loop of the annealing/quench */ +static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs, + int num_connections, + float crit_exponent, + int* 
outer_crit_iter_count, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info) { + if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE) { return; + } /*at each temperature change we update these values to be used */ /*for normalizing the tradeoff between timing and wirelength (bb) */ @@ -976,67 +991,45 @@ static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, num_connections = std::max(num_connections, 1); //Avoid division by zero VTR_ASSERT(num_connections > 0); - //Update timing information - recompute_criticalities(crit_exponent, - delay_model, - criticalities, - pin_timing_invalidator, - timing_info, - costs); + //Update all timing information + timing_update_mode->update_criticalities = true; + timing_update_mode->update_setup_slacks = true; + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + //Always commit the setup slacks when they are updated + commit_setup_slacks(setup_slacks); + *outer_crit_iter_count = 0; } (*outer_crit_iter_count)++; - /*at each temperature change we update these values to be used */ - /*for normalizing the tradeoff between timing and wirelength (bb) */ - prev_inverse_costs->bb_cost = 1 / costs->bb_cost; - /*Prevent inverse timing cost from going to infinity */ - prev_inverse_costs->timing_cost = min(1 / costs->timing_cost, MAX_INV_TIMING_COST); -} - -//Update timing information based on current placement by running STA to get new slacks, -//and calculate updated criticalities and timing costs -static void recompute_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_costs* costs) { - //Run STA to update slacks and adjusted/relaxed criticalities - timing_info->update(); - - //Update placer'criticalities (e.g. sharpen with crit_exponent) - criticalities->update_criticalities(timing_info, crit_exponent); - - //Update connection, net and total timing costs based on new criticalities -#ifdef INCR_COMP_TD_COSTS - update_td_costs(delay_model, *criticalities, &costs->timing_cost); -#else - comp_td_costs(delay_model, *criticalities, &costs->timing_cost); -#endif - - //Clear invalidation state - pin_timing_invalidator->reset(); + costs->update_norm_factors(); ///move_lim; inner_iter++) { + e_move_result swap_result = try_swap(state, + timing_update_mode, + costs, move_generator, timing_info, pin_timing_invalidator, blocks_affected, delay_model, criticalities, + setup_slacks, placer_opts.rlim_escape_fraction, - placer_opts.place_algorithm, + place_algorithm, placer_opts.timing_tradeoff); if (swap_result == ACCEPTED) { @@ -1072,7 +1068,7 @@ static void placement_inner_loop(float t, num_swap_accepted++; } else if (swap_result == ABORTED) { num_swap_aborted++; - } else { // swap_result == REJECTED + } else { //swap_result == REJECTED num_swap_rejected++; } @@ -1081,21 +1077,28 @@ static void placement_inner_loop(float t, * We do this only once in a while, since it is expensive. 
*/ if (inner_crit_iter_count >= inner_recompute_limit - && inner_iter != move_lim - 1) { /*on last iteration don't recompute */ + && inner_iter != state->move_lim - 1) { /*on last iteration don't recompute */ inner_crit_iter_count = 0; #ifdef VERBOSE VTR_LOG("Inner loop recompute criticalities\n"); #endif /* Using the delays in connection_delay, do a timing analysis to update slacks and - * criticalities and update the timing cost since it will change. + * criticalities and update the timing cost since they will change. */ - recompute_criticalities(crit_exponent, - delay_model, - criticalities, - pin_timing_invalidator, - timing_info, - costs); + timing_update_mode->update_criticalities = true; + timing_update_mode->update_setup_slacks = true; + update_setup_slacks_and_criticalities(state->crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + //Always commit the setup slacks when they are updated + commit_setup_slacks(setup_slacks); } inner_crit_iter_count++; } @@ -1109,7 +1112,7 @@ static void placement_inner_loop(float t, /* Lines below prevent too much round-off error from accumulating * in the cost over many iterations (due to incremental updates). - * This round-off can lead to error checks failing because the cost + * This round-off can lead to error checks failing because the cost * is different from what you get when you recompute from scratch. */ ++(*moves_since_cost_recompute); @@ -1120,9 +1123,9 @@ static void placement_inner_loop(float t, if (placer_opts.placement_saves_per_temperature >= 1 && inner_iter > 0 - && (inner_iter + 1) % (move_lim / placer_opts.placement_saves_per_temperature) == 0) { + && (inner_iter + 1) % (state->move_lim / placer_opts.placement_saves_per_temperature) == 0) { std::string filename = vtr::string_fmt("placement_%03d_%03d.place", temp_num + 1, inner_placement_save_count); - VTR_LOG("Saving placement to file at temperature move %d / %d: %s\n", inner_iter, move_lim, filename.c_str()); + VTR_LOG("Saving placement to file at temperature move %d / %d: %s\n", inner_iter, state->move_lim, filename.c_str()); print_place(nullptr, nullptr, filename.c_str()); ++inner_placement_save_count; } @@ -1194,122 +1197,42 @@ static double get_std_dev(int n, double sum_x_squared, double av_x) { return (std_dev); } -static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) { - /* Update the range limited to keep acceptance prob. near 0.44. Use * - * a floating point rlim to allow gradual transitions at low temps. */ - - float upper_lim; - - *rlim = (*rlim) * (1. - 0.44 + success_rat); - upper_lim = max(grid.width() - 1, grid.height() - 1); - *rlim = min(*rlim, upper_lim); - *rlim = max(*rlim, (float)1.); -} - -/* Update the annealing state according to the annealing schedule selected. - * USER_SCHED: A manual fixed schedule with fixed alpha and exit criteria. - * AUTO_SCHED: A more sophisticated schedule where alpha varies based on success ratio. - * DUSTY_SCHED: This schedule jumps backward and slows down in response to success ratio. - * See doc/src/vpr/dusty_sa.rst for more details. - * - * Returns true until the schedule is finished. */ -static bool update_annealing_state(t_annealing_state* state, - float success_rat, - const t_placer_costs& costs, - const t_placer_opts& placer_opts, - const t_annealing_sched& annealing_sched) { - /* Return `false` when the exit criterion is met. 
*/ - if (annealing_sched.type == USER_SCHED) { - state->t *= annealing_sched.alpha_t; - return state->t >= annealing_sched.exit_t; - } - - auto& device_ctx = g_vpr_ctx.device(); - auto& cluster_ctx = g_vpr_ctx.clustering(); - - /* Automatic annealing schedule */ - float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size(); - - if (annealing_sched.type == DUSTY_SCHED) { - bool restart_temp = state->t < t_exit || std::isnan(t_exit); //May get nan if there are no nets - if (success_rat < annealing_sched.success_min || restart_temp) { - if (state->alpha > annealing_sched.alpha_max) return false; - state->t = state->restart_t / sqrt(state->alpha); // Take a half step from the restart temperature. - state->alpha = 1.0 - ((1.0 - state->alpha) * annealing_sched.alpha_decay); - } else { - if (success_rat > annealing_sched.success_target) { - state->restart_t = state->t; - } - state->t *= state->alpha; - } - state->move_lim = std::max(1, std::min(state->move_lim_max, (int)(state->move_lim_max * (annealing_sched.success_target / success_rat)))); - } else { /* annealing_sched.type == AUTO_SCHED */ - if (success_rat > 0.96) { - state->alpha = 0.5; - } else if (success_rat > 0.8) { - state->alpha = 0.9; - } else if (success_rat > 0.15 || state->rlim > 1.) { - state->alpha = 0.95; - } else { - state->alpha = 0.8; - } - state->t *= state->alpha; - - // Must be duplicated to retain previous behavior - if (state->t < t_exit || std::isnan(t_exit)) return false; - } - - // Gradually changes from the initial crit_exponent to the final crit_exponent based on how much the range limit has shrunk. - // The idea is that as the range limit shrinks (indicating we are fine-tuning a more optimized placement) we can focus more on a smaller number of critical connections, which a higher crit_exponent achieves. - update_rlim(&state->rlim, success_rat, device_ctx.grid); - - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - state->crit_exponent = (1 - (state->rlim - FINAL_RLIM) * state->inverse_delta_rlim) - * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) - + placer_opts.td_place_exp_first; - } - - return true; -} - -static float starting_t(t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, +static float starting_t(const t_annealing_state* state, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs, t_annealing_sched annealing_sched, - int max_moves, - float rlim, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, - TimingInfo* timing_info, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + SetupTimingInfo* timing_info, MoveGenerator& move_generator, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts) { /* Finds the starting temperature (hot condition). */ - - int i, num_accepted, move_lim; - double std_dev, av, sum_of_squares; /* Double important to avoid round off */ + int num_accepted = 0; + double std_dev, av = 0, sum_of_squares = 0; /* Double important to avoid round off */ if (annealing_sched.type == USER_SCHED) return (annealing_sched.init_t); auto& cluster_ctx = g_vpr_ctx.clustering(); - move_lim = min(max_moves, (int)cluster_ctx.clb_nlist.blocks().size()); - - num_accepted = 0; - av = 0.; - sum_of_squares = 0.; - - /* Try one move per block. Set t high so essentially all accepted. */ + /* Determines the block swap loop count. 
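The starting_t() routine that begins above estimates the initial annealing temperature by attempting roughly one swap per block while essentially every move is accepted, then looking at the spread of the costs it saw. A condensed sketch of that idea follows; the 20x multiplier is the classic annealing rule of thumb and is stated here as an assumption, not a quote of the VPR code:

#include <cmath>
#include <vector>

// Estimate a "hot" starting temperature from the costs observed while almost
// every move is accepted.
double estimate_starting_temperature(const std::vector<double>& accepted_costs) {
    const double n = (double)accepted_costs.size();
    if (n < 2.0) return 0.0;

    double av = 0.0, sum_of_squares = 0.0; // doubles to avoid round-off, as above
    for (double c : accepted_costs) {
        av += c;
        sum_of_squares += c * c;
    }
    av /= n;

    // Sample standard deviation, in the same spirit as get_std_dev().
    double variance = (sum_of_squares - n * av * av) / (n - 1.0);
    double std_dev = variance > 0.0 ? std::sqrt(variance) : 0.0;

    // Start hot enough that cost swings of several standard deviations are
    // still routinely accepted.
    return 20.0 * std_dev;
}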
*/ + int move_lim = std::min(state->move_lim_max, int(cluster_ctx.clb_nlist.blocks().size())); - for (i = 0; i < move_lim; i++) { - e_move_result swap_result = try_swap(HUGE_POSITIVE_FLOAT, costs, prev_inverse_costs, rlim, + for (int i = 0; i < move_lim; i++) { + //Setup slack analysis is not deployed here, so crit_exponent and setup_slacks are unused + e_move_result swap_result = try_swap(state, + timing_update_mode, + costs, move_generator, timing_info, pin_timing_invalidator, blocks_affected, delay_model, criticalities, + setup_slacks, placer_opts.rlim_escape_fraction, placer_opts.place_algorithm, placer_opts.timing_tradeoff); @@ -1373,43 +1296,56 @@ static void reset_move_nets(int num_nets_affected) { } } -static e_move_result try_swap(float t, +/** + * @brief Picks some block and moves it to another spot. + * + * If the new location is empty, the block is moved there directly. If the new location + * is occupied, the two blocks are switched. Due to the different sizes of the blocks, + * this block switching may occur multiple times. It might also cause the + * current swap attempt to abort due to an inability to find suitable locations + * for the moved blocks. + * + * The move generator will record all the switched blocks in the variable + * `blocks_affected`. Afterwards, the move will be assessed by the chosen + * cost formulation. Currently, there are three ways to assess the move cost, + * corresponding to the values of the enum type `e_place_algorithm`. + * + * @return Whether the block swap is accepted, rejected or aborted. + */ +static e_move_result try_swap(const t_annealing_state* state, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - float rlim, MoveGenerator& move_generator, - TimingInfo* timing_info, + SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, float rlim_escape_fraction, enum e_place_algorithm place_algorithm, float timing_tradeoff) { - /* Picks some block and moves it to another spot. If this spot is * - * occupied, switch the blocks. Assess the change in cost function. * - * rlim is the range limiter. * - * Returns whether the swap is accepted, rejected or aborted. * - * Passes back the new value of the cost functions. */ - num_ts_called++; MoveOutcomeStats move_outcome_stats; - /* I'm using negative values of proposed_net_cost as a flag, so DO NOT * - * use cost functions that can go negative. */ + /* I'm using negative values of proposed_net_cost as a flag, */ + /* so DO NOT use cost functions that can go negative. */ - double delta_c = 0; /* Change in cost due to this swap. */ - double bb_delta_c = 0; - double timing_delta_c = 0; + double delta_c = 0; //Change in cost due to this swap. + double bb_delta_c = 0; //Change in the bounding box (wiring) cost. + double timing_delta_c = 0; //Change in the timing cost (delay * criticality). - //Allow some fraction of moves to not be restricted by rlim, - //in the hopes of better escaping local minima + /* Allow some fraction of moves to not be restricted by rlim, */ + /* in the hopes of better escaping local minima. */ + float rlim; if (rlim_escape_fraction > 0. 
&& vtr::frand() < rlim_escape_fraction) { rlim = std::numeric_limits::infinity(); + } else { + rlim = state->rlim; } - //Generate a new move (perturbation) used to explore the space of possible placements + /* Generate a new move (perturbation) used to explore the space of possible placements */ e_create_move create_move_outcome = move_generator.propose_move(blocks_affected, rlim); LOG_MOVE_STATS_PROPOSED(t, blocks_affected); @@ -1417,7 +1353,7 @@ static e_move_result try_swap(float t, e_move_result move_outcome = ABORTED; if (create_move_outcome == e_create_move::ABORT) { - //Proposed move is not legal -- give up on this move + /* Proposed move is not legal -- give up on this move */ clear_move_blocks(blocks_affected); LOG_MOVE_STATS_OUTCOME(std::numeric_limits::quiet_NaN(), @@ -1426,98 +1362,191 @@ static e_move_result try_swap(float t, "ABORTED", "illegal move"); move_outcome = ABORTED; - } else { - VTR_ASSERT(create_move_outcome == e_create_move::VALID); - /* - * To make evaluating the move simpler (e.g. calculating changed bounding box), - * we first move the blocks to thier new locations (apply the move to - * place_ctx.block_locs) and then computed the change in cost. If the move is - * accepted, the inverse look-up in place_ctx.grid_blocks is updated (committing - * the move). If the move is rejected the blocks are returned to their original - * positions (reverting place_ctx.block_locs to its original state). - * - * Note that the inverse look-up place_ctx.grid_blocks is only updated - * after move acceptance is determined, and so should not be used when - * evaluating a move. - */ + return move_outcome; + } - //Update the block positions - apply_move_blocks(blocks_affected); + /* Move is valid. Proceed to analyze cost. */ + VTR_ASSERT(create_move_outcome == e_create_move::VALID); - // Find all the nets affected by this swap and update their costs - int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, - delay_model, - criticalities, - blocks_affected, - bb_delta_c, - timing_delta_c); - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - /*in this case we redefine delta_c as a combination of timing and bb. * - *additionally, we normalize all values, therefore delta_c is in * - *relation to 1*/ + /* + * To make evaluating the move simpler (e.g. calculating changed bounding box), + * we first move the blocks to their new locations (apply the move to + * place_ctx.block_locs) and then compute the change in cost. If the move is + * accepted, the inverse look-up in place_ctx.grid_blocks is updated (committing + * the move). If the move is rejected the blocks are returned to their original + * positions (reverting place_ctx.block_locs to its original state). + * + * Note that the inverse look-up place_ctx.grid_blocks is only updated + * after move acceptance is determined, and so should not be used when + * evaluating a move. + */ - delta_c = (1 - timing_tradeoff) * bb_delta_c * prev_inverse_costs->bb_cost - + timing_tradeoff * timing_delta_c * prev_inverse_costs->timing_cost; - } else { - delta_c = bb_delta_c; + //Update the block positions + apply_move_blocks(blocks_affected); + + //Find all the nets affected by this swap and update the wiring costs. + //This cost value doesn't depend on the timing info. + //Also find all the pins affected by the swap, and calculate new connection + //delays and timing costs and store them in the proposed_* data structures. 
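The big comment above describes the overall shape of try_swap(): tentatively apply the move, evaluate the delta cost with whichever formulation is active, then either commit or revert. A schematic of that flow using the standard Metropolis acceptance rule (the callables are placeholders standing in for apply_move_blocks(), the cost evaluation, commit_move_blocks()/update_move_nets() and revert_move_blocks()):

#include <cmath>

enum class MoveResult { Accepted, Rejected };

// Evaluate-then-commit-or-revert skeleton for a single proposed move.
template <typename ApplyFn, typename DeltaFn, typename CommitFn, typename RevertFn>
MoveResult evaluate_move(ApplyFn apply_blocks,
                         DeltaFn compute_delta_cost,
                         CommitFn commit,
                         RevertFn revert,
                         double temperature,
                         double rand01) {
    apply_blocks();                        // tentatively move the blocks
    double delta_c = compute_delta_cost(); // assess with the chosen cost formulation

    // Always take improvements; take degradations with probability
    // exp(-delta/T) while T > 0 (so the quench at T = 0 only accepts gains).
    bool accept = (delta_c <= 0.0)
                  || (temperature > 0.0 && rand01 < std::exp(-delta_c / temperature));

    if (accept) {
        commit(); // update the inverse lookup and the committed cost structures
        return MoveResult::Accepted;
    }
    revert();     // restore block positions and any tentative cost values
    return MoveResult::Rejected;
}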
+ int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, + delay_model, + criticalities, + blocks_affected, + bb_delta_c, + timing_delta_c); + + //Find all the sink pins with changed connection delays from the affected blocks. + //These sink pins will be passed into the pin_timing_invalidator for timing update. + //They will also be added to the pin invalidator when we wish to revert a timing update. + std::vector sink_pins_affected; + find_affected_sink_pins(blocks_affected, sink_pins_affected); + + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + //Invalidates timing of modified connections for incremental timing updates. + invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); + + //Update the connection_timing_cost and connection_delay + //values from the temporary values. + //This step is necessary for performing timing update. + commit_td_cost(blocks_affected); + + //Update timing information. Since we are analyzing setup slacks, + //we only update those values and keep the criticalities stale + //so as not to interfere with the original timing cost algorithm. + // + //Note: the timing info must be called after applying block moves + //and committing the timing driven delays and costs. + //If we wish to revert this timing update due to move rejection, + //we need to first revert block moves and restore timing values. + timing_update_mode->update_criticalities = false; + timing_update_mode->update_setup_slacks = true; + update_setup_slacks_and_criticalities(state->crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + /* Get the setup slack analysis cost */ + //TODO: calculate a weighted average of the slack cost and wiring cost + delta_c = analyze_setup_slack_cost(setup_slacks); + + } else if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + /*in this case we redefine delta_c as a combination of timing and bb. * + *additionally, we normalize all values, therefore delta_c is in * + *relation to 1*/ + + delta_c = (1 - timing_tradeoff) * bb_delta_c * costs->bb_cost_norm + + timing_tradeoff * timing_delta_c * costs->timing_cost_norm; + + } else { //place_algorithm == BOUNDING_BOX_PLACE (wiring cost) + delta_c = bb_delta_c; + } + + /* 1 -> move accepted, 0 -> rejected. */ + move_outcome = assess_swap(delta_c, state->t); + + if (move_outcome == ACCEPTED) { + costs->cost += delta_c; + costs->bb_cost += bb_delta_c; + + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + /* Update the timing driven cost as usual */ + costs->timing_cost += timing_delta_c; + + //Commit the setup slack information + //The timing delay and cost values should be committed already + commit_setup_slacks(setup_slacks); } - /* 1 -> move accepted, 0 -> rejected. */ - move_outcome = assess_swap(delta_c, t); - - if (move_outcome == ACCEPTED) { - costs->cost += delta_c; - costs->bb_cost += bb_delta_c; - - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - costs->timing_cost += timing_delta_c; + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + costs->timing_cost += timing_delta_c; + + //Invalidates timing of modified connections for incremental timing + //updates. This routine relies on comparing proposed_connection_delay + //and connection_delay. If the setup slack analysis was not performed, + //the sink pins are yet to be invalidated. 
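In the SETUP_SLACK_ANALYSIS_PLACE branch above, the move is judged by analyze_setup_slack_cost(), whose body lies outside this hunk. One plausible shape for such a slack-based metric, shown purely as an illustration rather than as the actual implementation, is to compare the most critical of the changed setup slacks before and after the move:

#include <algorithm>
#include <cstddef>
#include <vector>

// Compare the most critical setup slacks before and after a move. Returns a
// negative value when the worst slacks improve, so it plugs into the usual
// "delta_c < 0 is good" convention. Hypothetical helper for illustration.
double slack_based_delta(std::vector<float> slacks_before,
                         std::vector<float> slacks_after) {
    // Most critical (smallest) slacks first.
    std::sort(slacks_before.begin(), slacks_before.end());
    std::sort(slacks_after.begin(), slacks_after.end());

    const std::size_t n = std::min(slacks_before.size(), slacks_after.size());
    for (std::size_t i = 0; i < n; ++i) {
        if (slacks_after[i] != slacks_before[i]) {
            // Positive delta (bad) when the worst affected slack got worse,
            // negative delta (good) when it improved.
            return (double)slacks_before[i] - (double)slacks_after[i];
        }
    }
    return 0.0; // No change among the compared slacks.
}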
+ invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); + + //Update the connection_timing_cost and connection_delay + //values from the temporary values + commit_td_cost(blocks_affected); + } - //Invalidates timing of modified connections for incremental timing updates - //Must be called before commit_td_cost since it relies on comparing - //proposed_connection_delay and connection_delay - invalidate_affected_connection_delays(blocks_affected, - pin_timing_invalidator, - timing_info); + /* Update net cost functions and reset flags. */ + update_move_nets(num_nets_affected); - /*update the connection_timing_cost and connection_delay - * values from the temporary values */ - commit_td_cost(blocks_affected); - } + /* Update clb data structures since we kept the move. */ + commit_move_blocks(blocks_affected); - /* update net cost functions and reset flags. */ - update_move_nets(num_nets_affected); + } else { //move_outcome == REJECTED - /* Update clb data structures since we kept the move. */ - commit_move_blocks(blocks_affected); + /* Reset the net cost function flags first. */ + reset_move_nets(num_nets_affected); - } else { /* Move was rejected. */ - /* Reset the net cost function flags first. */ - reset_move_nets(num_nets_affected); + /* Restore the place_ctx.block_locs data structures to their state before the move. */ + revert_move_blocks(blocks_affected); - /* Restore the place_ctx.block_locs data structures to their state before the move. */ - revert_move_blocks(blocks_affected); + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + //Revert the timing delays and costs to pre-update values. + //These routines must be called after reverting the block moves + //if we wish to perform a reversion of the previous timing update. + // + //TODO: make this process incremental. Currently, all the delays + //are recomputed before all the timing costs are recomputed. 
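// A stand-alone sketch (illustrative only) of the comparison rule that
// analyze_setup_slack_cost() applies: sort both slack vectors ascending (worst
// first) and return the difference of the first pair that differs, so a negative
// result means the worst changed slack improved and the quench accepts the move.
#include <algorithm>
#include <cstdio>
#include <vector>

static float slack_cost(std::vector<float> original, std::vector<float> proposed) {
    std::sort(original.begin(), original.end());
    std::sort(proposed.begin(), proposed.end());
    for (size_t i = 0; i < original.size(); ++i) {
        float diff = original[i] - proposed[i]; // proposed > original => improvement => negative
        if (diff != 0) return diff;
    }
    return 1; // nothing changed: return a positive cost so the move is rejected
}

int main() {
    // Hypothetical slacks in seconds: the worst connection improves from -2 ns to -1 ns.
    float cost = slack_cost({-2e-9f, 0.5e-9f}, {-1e-9f, 0.4e-9f});
    std::printf("setup slack cost = %g (negative => accept)\n", cost);
}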
+ comp_td_connection_delays(delay_model); + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + + /* Re-invalidate the affected sink pins */ + invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); + + /* Revert the timing update */ + update_setup_slacks_and_criticalities(state->crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + /* Check the consistency of the setup slack values */ + VTR_ASSERT_SAFE_MSG( + verify_connection_setup_slacks(setup_slacks), + "The current setup slacks should be identical to the values before the try swap timing info update."); + } - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - revert_td_cost(blocks_affected); - } + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + /* Discard the values stored in proposed_* data structures */ + revert_td_cost(blocks_affected); } + } - move_outcome_stats.delta_cost_norm = delta_c; - move_outcome_stats.delta_bb_cost_norm = bb_delta_c * prev_inverse_costs->bb_cost; - move_outcome_stats.delta_timing_cost_norm = timing_delta_c * prev_inverse_costs->timing_cost; + /* Record the costs in the move outcome stats */ + move_outcome_stats.delta_cost_norm = delta_c; + move_outcome_stats.delta_bb_cost_norm = bb_delta_c * costs->bb_cost_norm; + move_outcome_stats.delta_timing_cost_norm = timing_delta_c * costs->timing_cost_norm; - move_outcome_stats.delta_bb_cost_abs = bb_delta_c; - move_outcome_stats.delta_timing_cost_abs = timing_delta_c; + move_outcome_stats.delta_bb_cost_abs = bb_delta_c; + move_outcome_stats.delta_timing_cost_abs = timing_delta_c; - LOG_MOVE_STATS_OUTCOME(delta_c, bb_delta_c, timing_delta_c, - (move_outcome ? "ACCEPTED" : "REJECTED"), ""); - } + LOG_MOVE_STATS_OUTCOME(delta_c, bb_delta_c, timing_delta_c, + (move_outcome ? "ACCEPTED" : "REJECTED"), ""); move_outcome_stats.outcome = move_outcome; move_generator.process_outcome(move_outcome_stats); + /* Clear the data structure containing block move info */ clear_move_blocks(blocks_affected); //VTR_ASSERT(check_macro_placement_consistency() == 0); @@ -1526,13 +1555,25 @@ static e_move_result try_swap(float t, check_place(*costs, delay_model, place_algorithm); #endif - return (move_outcome); + return move_outcome; } -//Puts all the nets changed by the current swap into nets_to_update, -//and updates their bounding box. -// -//Returns the number of affected nets. +/** + * @brief Find all the nets and pins affected by this swap and update costs. + * + * Find all the nets affected by this swap and update the bouding box (wiring) + * costs. This cost function doesn't depend on the timing info. + * + * Find all the pins affected by this swap and update the timing cost. + * The timing costs are calculated by getting the new connection delays, multiplied + * by the connection criticalities returned by the timing analyzer. + * These timing costs are stored in the proposed_* data structures. + * + * The change in the bounding box cost is stored in `bb_delta_c`. + * The change in the timing cost is stored in `timing_delta_c`. + * + * @return The number of affected nets. + */ static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, @@ -1566,7 +1607,7 @@ static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm //once per net, not once per pin. 
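// A stand-alone sketch (illustrative only) of the annealing acceptance test that
// assess_swap() above implements: improving moves are always kept, while worsening
// moves are kept with probability exp(-delta_c / t), which shrinks as the
// temperature drops.
#include <cmath>
#include <cstdio>
#include <random>

static bool accept_move(double delta_c, double t, std::mt19937& rng) {
    if (delta_c <= 0) return true;
    if (t == 0.) return false;
    std::uniform_real_distribution<double> uniform(0.0, 1.0);
    return std::exp(-delta_c / t) > uniform(rng);
}

int main() {
    std::mt19937 rng(42);
    int kept = 0;
    for (int i = 0; i < 1000; ++i)
        kept += accept_move(/*delta_c=*/0.1, /*t=*/0.2, rng);
    std::printf("kept %d of 1000 worsening moves (expected ~ exp(-0.5) ~ 607)\n", kept);
}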
update_net_bb(net_id, blocks_affected, iblk, blk, blk_pin); - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE || place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { //Determine the change in timing costs if required update_td_delta_costs(delay_model, *criticalities, net_id, blk_pin, blocks_affected, timing_delta_c); } @@ -1586,6 +1627,7 @@ static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm return num_affected_nets; } +///@brief Stores all the nets affected by the block moves (avoid duplicates). static void record_affected_net(const ClusterNetId net, int& num_affected_nets) { //Record effected nets if (proposed_net_cost[net] < 0.) { @@ -1598,6 +1640,7 @@ static void record_affected_net(const ClusterNetId net, int& num_affected_nets) } } +///@brief Update the net bounding box. static void update_net_bb(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected, int iblk, @@ -1629,6 +1672,16 @@ static void update_net_bb(const ClusterNetId net, } } +/** + * @brief Get the proposed timing delay and cost based on the current block moves. + * + * Only considers the sink pins on the moved blocks, and the sink pins of the nets + * driven by the driver pins on the moved blocks. + * Add all these pins into blocks_affected.affected_pins so that we don't have to + * go through the moved blocks and gather them again in other routines. + * + * Also calculates the change in the timing cost by the proposed block moves. + */ static void update_td_delta_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& criticalities, const ClusterNetId net, @@ -1641,7 +1694,7 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, //This pin is a net driver on a moved block. //Re-compute all point to point connections for this net. for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) { - float temp_delay = comp_td_connection_delay(delay_model, net, ipin); + float temp_delay = comp_td_single_connection_delay(delay_model, net, ipin); proposed_connection_delay[net][ipin] = temp_delay; proposed_connection_timing_cost[net][ipin] = criticalities.criticality(net, ipin) * temp_delay; @@ -1663,7 +1716,7 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, if (!driven_by_moved_block(net, blocks_affected)) { int net_pin = cluster_ctx.clb_nlist.pin_net_index(pin); - float temp_delay = comp_td_connection_delay(delay_model, net, net_pin); + float temp_delay = comp_td_single_connection_delay(delay_model, net, net_pin); proposed_connection_delay[net][net_pin] = temp_delay; proposed_connection_timing_cost[net][net_pin] = criticalities.criticality(net, net_pin) * temp_delay; @@ -1674,16 +1727,104 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, } } +/** + * @brief Find all the sink pins with changed connection delays from the affected blocks. + * + * These sink pins will be passed into the pin_timing_invalidator for timing update. + * They will also be added to the pin invalidator when we wish to revert a timing update. + * + * It is possible that some connections may not have changed delay. For instance, if + * using a dx/dy delay model, this could occur if a sink moved to a new position with + * the same dx/dy from it's driver. To minimize work during the incremental STA update + * we do not invalidate such unchanged connections. 
+ */
+static void find_affected_sink_pins(const t_pl_blocks_to_be_moved& blocks_affected,
+                                    std::vector<ClusterPinId>& sink_pins_affected) {
+    auto& cluster_ctx = g_vpr_ctx.clustering();
+    auto& clb_nlist = cluster_ctx.clb_nlist;
+
+    for (ClusterPinId clb_pin : blocks_affected.affected_pins) {
+        ClusterNetId net = clb_nlist.pin_net(clb_pin);
+        int ipin = clb_nlist.pin_net_index(clb_pin);
+
+        if (proposed_connection_delay[net][ipin] != connection_delay[net][ipin]) {
+            //Delay has changed. Must invalidate this sink pin.
+            sink_pins_affected.push_back(clb_pin);
+        }
+    }
+}
+
+/**
+ * @brief Check if the setup slack has gotten better or worse due to the block swap.
+ *
+ * Get all the modified slack values via the PlacerSetupSlacks class, and compare
+ * them with the original values at these connections. Sort them and compare them
+ * one by one, and return the difference of the first different pair.
+ *
+ * If the new slack value is larger (better), then return a negative value so that
+ * the move will be accepted. If the new slack value is smaller (worse), return a
+ * positive value so that the move will be rejected.
+ *
+ * If no slack values have changed, then return an arbitrary positive number. A
+ * move resulting in no change in the slack values is probably unnecessary.
+ *
+ * The sorting is needed to guard against the unlikely circumstance where a bad slack
+ * value suddenly becomes very good due to the block move, while a good slack value
+ * becomes very bad, perhaps even worse than the original worst slack value.
+ */
+static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks) {
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
+    const auto& clb_nlist = cluster_ctx.clb_nlist;
+
+    //Find the original/proposed setup slacks of pins with modified values
+    std::vector<float> original_setup_slacks, proposed_setup_slacks;
+
+    auto clb_pins_modified = setup_slacks->pins_with_modified_setup_slack();
+    for (ClusterPinId clb_pin : clb_pins_modified) {
+        ClusterNetId net_id = clb_nlist.pin_net(clb_pin);
+        size_t ipin = clb_nlist.pin_net_index(clb_pin);
+
+        original_setup_slacks.push_back(connection_setup_slack[net_id][ipin]);
+        proposed_setup_slacks.push_back(setup_slacks->setup_slack(net_id, ipin));
+    }
+
+    //Sort in ascending order, from the worst slack value to the best
+    std::sort(original_setup_slacks.begin(), original_setup_slacks.end());
+    std::sort(proposed_setup_slacks.begin(), proposed_setup_slacks.end());
+
+    //Check the first pair of slack values that are different
+    //If found, return their difference
+    for (size_t idiff = 0; idiff < original_setup_slacks.size(); ++idiff) {
+        float slack_diff = original_setup_slacks[idiff] - proposed_setup_slacks[idiff];
+
+        if (slack_diff != 0) {
+            return slack_diff;
+        }
+    }
+
+    //If all slack values are identical (or there are no modified slack values),
+    //reject this move by returning an arbitrary positive number as cost
+    return 1;
+}
+
+/**
+ * @brief Decide whether to accept a move based on the probability
+ *        calculated from the current annealing temperature.
+ *
+ * Returns: 1 -> move accepted, 0 -> rejected.
+ */
 static e_move_result assess_swap(double delta_c, double t) {
-    /* Returns: 1 -> move accepted, 0 -> rejected. */
+    /* A non-positive cost will always be accepted */
     if (delta_c <= 0) {
         return ACCEPTED;
     }

+    /* If temperature is 0 and the cost is positive, guaranteed rejection */
     if (t == 0.)
{ return REJECTED; } + /* Calculated the probability using temp and decide */ float fnum = vtr::frand(); float prob_fac = std::exp(-delta_c / t); if (prob_fac > fnum) { @@ -1693,131 +1834,52 @@ static e_move_result assess_swap(double delta_c, double t) { return REJECTED; } +/** + * @brief Recomputes the wiring cost to eliminate round-off that may have accrued. + * + * This process assumes that all the net costs have been updated. + */ static double recompute_bb_cost() { - /* Recomputes the cost to eliminate roundoff that may have accrued. * - * This routine does as little work as possible to compute this new * - * cost. */ - - double cost = 0; - auto& cluster_ctx = g_vpr_ctx.clustering(); + double cost = 0; for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* for each net ... */ if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { /* Do only if not ignored. */ /* Bounding boxes don't have to be recomputed; they're correct. */ cost += net_cost[net_id]; } } - - return (cost); -} - -/*returns the delay of one point to point connection */ -static float comp_td_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin) { - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& place_ctx = g_vpr_ctx.placement(); - - float delay_source_to_sink = 0.; - - if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { - //Only estimate delay for signals routed through the inter-block - //routing network. TODO: Do how should we compute the delay for globals. "Global signals are assumed to have zero delay." - - ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id); - ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin); - - ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin); - ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin); - - int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin); - int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin); - - int source_x = place_ctx.block_locs[source_block].loc.x; - int source_y = place_ctx.block_locs[source_block].loc.y; - int sink_x = place_ctx.block_locs[sink_block].loc.x; - int sink_y = place_ctx.block_locs[sink_block].loc.y; - - /* Note: This heuristic only considers delta_x and delta_y, a much better heuristic - * would be to to create a more comprehensive lookup table. - * - * In particular this aproach does not accurately capture the effect of fast - * carry-chain connections. 
- */ - delay_source_to_sink = delay_model->delay(source_x, - source_y, - source_block_ipin, - sink_x, - sink_y, - sink_block_ipin); - if (delay_source_to_sink < 0) { - VPR_ERROR(VPR_ERROR_PLACE, - "in comp_td_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d) to %s (at %d,%d)\n" - "in comp_td_connection_delay: Delay is less than 0\n", - block_type_pin_index_to_name(physical_tile_type(source_block), source_block_ipin).c_str(), - source_x, source_y, - block_type_pin_index_to_name(physical_tile_type(sink_block), sink_block_ipin).c_str(), - sink_x, sink_y, - delay_source_to_sink); - } - } - - return (delay_source_to_sink); -} - -//Recompute all point to point delays, updating connection_delay -static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { - auto& cluster_ctx = g_vpr_ctx.clustering(); - - for (auto net_id : cluster_ctx.clb_nlist.nets()) { - for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) { - connection_delay[net_id][ipin] = comp_td_connection_delay(delay_model, net_id, ipin); - } - } + return cost; } -/* Update the connection_timing_cost values from the temporary * - * values for all connections that have changed. */ +/** + * @brief Update the connection_timing_cost values from the temporary + * values for all connections that have/haven't changed. + * + * All the connections have already been gathered by blocks_affected.affected_pins + * after running the routine find_affected_nets_and_update_costs(). + */ static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& clb_nlist = cluster_ctx.clb_nlist; - /* Go through all the blocks moved. */ - for (int iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) { - ClusterBlockId bnum = blocks_affected.moved_blocks[iblk].block_num; - for (ClusterPinId pin_id : cluster_ctx.clb_nlist.block_pins(bnum)) { - ClusterNetId net_id = cluster_ctx.clb_nlist.pin_net(pin_id); - - if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) - continue; - - if (cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::DRIVER) { - //This net is being driven by a moved block, recompute - //all point to point connections on this net. - for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) { - connection_delay[net_id][ipin] = proposed_connection_delay[net_id][ipin]; - proposed_connection_delay[net_id][ipin] = INVALID_DELAY; - connection_timing_cost[net_id][ipin] = proposed_connection_timing_cost[net_id][ipin]; - proposed_connection_timing_cost[net_id][ipin] = INVALID_DELAY; - } - } else { - //This pin is a net sink on a moved block - VTR_ASSERT_SAFE(cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::SINK); - - /* The following "if" prevents the value from being updated twice. 
*/ - if (!driven_by_moved_block(net_id, blocks_affected)) { - int net_pin = cluster_ctx.clb_nlist.pin_net_index(pin_id); + //Go through all the sink pins affected + for (ClusterPinId pin_id : blocks_affected.affected_pins) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + int ipin = clb_nlist.pin_net_index(pin_id); - connection_delay[net_id][net_pin] = proposed_connection_delay[net_id][net_pin]; - proposed_connection_delay[net_id][net_pin] = INVALID_DELAY; - connection_timing_cost[net_id][net_pin] = proposed_connection_timing_cost[net_id][net_pin]; - proposed_connection_timing_cost[net_id][net_pin] = INVALID_DELAY; - } - } - } /* Finished going through all the pins in the moved block */ - } /* Finished going through all the blocks moved */ + //Commit the timing delay and cost values + connection_delay[net_id][ipin] = proposed_connection_delay[net_id][ipin]; + proposed_connection_delay[net_id][ipin] = INVALID_DELAY; + connection_timing_cost[net_id][ipin] = proposed_connection_timing_cost[net_id][ipin]; + proposed_connection_timing_cost[net_id][ipin] = INVALID_DELAY; + } } -//Reverts modifications to proposed_connection_delay and proposed_connection_timing_cost based on -//the move proposed in blocks_affected +/** + * @brief Reverts modifications to proposed_connection_delay and proposed_connection_timing_cost + * based on the move proposed in blocks_affected. + */ static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { #ifndef VTR_ASSERT_SAFE_ENABLED static_cast(blocks_affected); @@ -1836,39 +1898,28 @@ static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { #endif } -//Invalidates the delays of connections effected by the specified move -// -//Relies on proposed_connection_delay and connection_delay to detect -//which connections have actually had their delay changed. -static void invalidate_affected_connection_delays(const t_pl_blocks_to_be_moved& blocks_affected, +/** + * @brief Invalidates the delays of connections effected by the specified move. + * + * Relies on find_affected_sink_pins() to find all the connections with different + * `proposed_connection_delay` and `connection_delay`. + * + * Invalidate all the timing graph edges associated with these sink pins via the + * ClusteredPinTimingInvalidator class. + */ +static void invalidate_affected_connection_delays(const std::vector& sink_pins_affected, ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info) { VTR_ASSERT_SAFE(timing_info); VTR_ASSERT_SAFE(pin_tedges_invalidator); - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& clb_nlist = cluster_ctx.clb_nlist; - - //Inalidate timing graph edges affected by the move - for (ClusterPinId pin : blocks_affected.affected_pins) { - //It is possible that some connections may not have changed delay.(e.g. - //For instance, if using a dx/dy delay model, this could occur if a sink - //moved to a new position with the same dx/dy from it's driver. - // - //To minimze work during the incremental STA update we do not invalidate - //such unchanged connections. 
- - ClusterNetId net = clb_nlist.pin_net(pin); - int ipin = clb_nlist.pin_net_index(pin); - - if (proposed_connection_delay[net][ipin] != connection_delay[net][ipin]) { - //Delay changed, must invalidate - pin_tedges_invalidator->invalidate_connection(pin, timing_info); - } + //Invalidate timing graph edges affected by the move + for (ClusterPinId clb_pin : sink_pins_affected) { + pin_tedges_invalidator->invalidate_connection(clb_pin, timing_info); } } -//Returns true if 'net' is driven by one of the blocks in 'blocks_affected' +///@brief Returns true if 'net' is driven by one of the blocks in 'blocks_affected'. static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected) { auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -1881,163 +1932,23 @@ static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_b return false; } -//Incrementally updates timing cost based on the current delays and criticality estimates -// Unlike comp_td_costs() this only updates connections who's criticality has changed; -// this is a superset of those connections who's delay has changed. -// -// For a from-scratch recalculation see comp_td_cost() -static void update_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { - /* NB: We must be careful calculating the total timing cost incrementally, - * due to limitd floating point precision, so that we get a - * bit-identical result matching that calculated by comp_td_costs(). - * - * In particular, we can not simply calculate the incremental - * delta's caused by changed connection timing costs and adjust - * the timing cost. Due to limited precision, the results of - * floating point math operations are order dependant and we - * would get a different result. - * - * To get around this, we calculate the timing costs hierarchically - * to ensures we calculate the sum with the same order of operations - * as comp_td_costs(). - * - * See PlacerTimingCosts object used to represent connection_timing_costs - * for details. 
- */ - vtr::Timer t; - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& clb_nlist = cluster_ctx.clb_nlist; - - //Update the modified pin timing costs - { - vtr::Timer timer; - auto clb_pins_modified = place_crit.pins_with_modified_criticality(); - for (ClusterPinId clb_pin : clb_pins_modified) { - if (clb_nlist.pin_type(clb_pin) == PinType::DRIVER) continue; - - ClusterNetId clb_net = clb_nlist.pin_net(clb_pin); - VTR_ASSERT_SAFE(clb_net); - - if (cluster_ctx.clb_nlist.net_is_ignored(clb_net)) continue; - - int ipin = clb_nlist.pin_net_index(clb_pin); - VTR_ASSERT_SAFE(ipin >= 0 && ipin < int(clb_nlist.net_pins(clb_net).size())); - - double new_timing_cost = comp_td_connection_cost(delay_model, place_crit, clb_net, ipin); - - //Record new value - connection_timing_cost[clb_net][ipin] = new_timing_cost; - } - - f_update_td_costs_connections_elapsed_sec += timer.elapsed_sec(); - } - - //Re-total timing costs of all nets - { - vtr::Timer timer; - *timing_cost = connection_timing_cost.total_cost(); - f_update_td_costs_sum_nets_elapsed_sec += timer.elapsed_sec(); - } - -#ifdef VTR_ASSERT_DEBUG_ENABLED - double check_timing_cost = 0.; - comp_td_costs(delay_model, place_crit, &check_timing_cost); - VTR_ASSERT_DEBUG_MSG(check_timing_cost == *timing_cost, - "Total timing cost calculated incrementally in update_td_costs() is " - "not consistent with value calculated from scratch in comp_td_costs()"); -#endif - f_update_td_costs_total_elapsed_sec += t.elapsed_sec(); -} - -//Recomputes timing cost from scratch based on the current delays and criticality estimates -// -// For a more efficient incremental update see update_td_costs() -static void comp_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { - /* Computes the cost (from scratch) from the delays and criticalities * - * of all point to point connections, we define the timing cost of * - * each connection as criticality*delay. */ - - /* NB: We calculate the timing cost in a hierarchicl manner (first connectsion, - * then nets, then sum of nets) in order to allow it to be incrementally - * while avoiding round-off effects. See update_td_costs() for details. - */ - - auto& cluster_ctx = g_vpr_ctx.clustering(); - - for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* For each net ... */ - - if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; - - for (unsigned ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) { - float conn_timing_cost = comp_td_connection_cost(delay_model, place_crit, net_id, ipin); - - //Record new value - connection_timing_cost[net_id][ipin] = conn_timing_cost; - } - - //Store net timing cost for more efficient incremental updating - net_timing_cost[net_id] = sum_td_net_cost(net_id); - } - - /* Make sure timing cost does not go above MIN_TIMING_COST. */ - *timing_cost = sum_td_costs(); -} - -//Calculates the timing cost of the specified connection. 
-// Updates the value in connection_timing_cost -// Assumes only be called from compt_td_cost() or update_td_costs() -static double comp_td_connection_cost(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, ClusterNetId net, int ipin) { - VTR_ASSERT_SAFE_MSG(ipin > 0, "Shouldn't be calculating connection timing cost for driver pins"); - - VTR_ASSERT_SAFE_MSG(connection_delay[net][ipin] == comp_td_connection_delay(delay_model, net, ipin), - "Connection delays should already be updated"); - - double conn_timing_cost = place_crit.criticality(net, ipin) * connection_delay[net][ipin]; - - VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_delay[net][ipin]), - "Propsoed connection delay should already be invalidated"); - - VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_timing_cost[net][ipin]), - "Proposed connection timing cost should already be invalidated"); - - return conn_timing_cost; -} - -//Returns the timing cost of the specified 'net' based on the values in connection_timing_cost -static double sum_td_net_cost(ClusterNetId net) { - auto& cluster_ctx = g_vpr_ctx.clustering(); - - double net_td_cost = 0; - for (unsigned ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) { - net_td_cost += connection_timing_cost[net][ipin]; - } - - return net_td_cost; -} - -//Returns the total timing cost accross all nets based on the values in net_timing_cost -static double sum_td_costs() { - auto& cluster_ctx = g_vpr_ctx.clustering(); - - double td_cost = 0; - for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* For each net ... */ - - if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; - - td_cost += net_timing_cost[net_id]; - } - - return td_cost; -} - -/* Finds the cost from scratch. Done only when the placement * - * has been radically changed (i.e. after initial placement). * - * Otherwise find the cost change incrementally. If method * - * check is NORMAL, we find bounding boxes that are updateable * - * for the larger nets. If method is CHECK, all bounding boxes * - * are found via the non_updateable_bb routine, to provide a * - * cost which can be used to check the correctness of the * - * other routine. */ +/** + * @brief Find the wiring cost. + * + * Find the wiring cost from scratch only when the placement has + * been radically changed (i.e. after the initial placement). + * Otherwise, find the cost change incrementally. + * + * @param method + * + * NORMAL If the method check is NORMAL, we find boudning + * boxes that are updateable for the larger nets. + * + * CHECK If the method check is CHECK, all bounding boxes + * are found via the non_updateable_bb routine to + * provide a cost which can be used to check the + * correctness of the other routine. 
+ */ static double comp_bb_cost(e_cost_methods method) { double cost = 0; double expected_wirelength = 0.0; @@ -2096,6 +2007,8 @@ static void alloc_and_load_placement_structs(float place_cost_exp, connection_delay = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.f); proposed_connection_delay = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.f); + connection_setup_slack = make_net_pins_matrix(cluster_ctx.clb_nlist, std::numeric_limits::infinity()); + connection_timing_cost = PlacerTimingCosts(cluster_ctx.clb_nlist); proposed_connection_timing_cost = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.); net_timing_cost.resize(num_nets, 0.); @@ -2137,6 +2050,7 @@ static void free_placement_structs(const t_placer_opts& placer_opts) { if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { vtr::release_memory(connection_timing_cost); vtr::release_memory(connection_delay); + vtr::release_memory(connection_setup_slack); vtr::release_memory(proposed_connection_timing_cost); vtr::release_memory(proposed_connection_delay); @@ -2981,24 +2895,21 @@ static void print_resources_utilization() { VTR_LOG("\n"); } -static void init_annealing_state(t_annealing_state* state, - const t_annealing_sched& annealing_sched, - float t, - float rlim, - int move_lim_max, - float crit_exponent) { - state->alpha = annealing_sched.alpha_min; - state->t = t; - state->restart_t = t; - state->rlim = rlim; - state->inverse_delta_rlim = 1 / (rlim - FINAL_RLIM); - state->move_lim_max = std::max(1, move_lim_max); - if (annealing_sched.type == DUSTY_SCHED) { - state->move_lim = std::max(1, (int)(state->move_lim_max * annealing_sched.success_target)); +static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& placer_opts) { + e_place_algorithm place_algo = placer_opts.place_algorithm; + e_place_quench_metric quench_metric = placer_opts.place_quench_metric; + + if (place_algo == e_place_algorithm::PATH_TIMING_DRIVEN_PLACE) { + if (quench_metric == e_place_quench_metric::AUTO || quench_metric == e_place_quench_metric::TIMING_COST) { + return PATH_TIMING_DRIVEN_PLACE; + } else { + VTR_ASSERT(quench_metric == e_place_quench_metric::SETUP_SLACK); + return SETUP_SLACK_ANALYSIS_PLACE; + } } else { - state->move_lim = state->move_lim_max; + VTR_ASSERT(place_algo == e_place_algorithm::BOUNDING_BOX_PLACE); + return BOUNDING_BOX_PLACE; } - state->crit_exponent = crit_exponent; } bool placer_needs_lookahead(const t_vpr_setup& vpr_setup) { diff --git a/vpr/src/place/place_delay_model.cpp b/vpr/src/place/place_delay_model.cpp index c30f32b3e7d..e8a58db6704 100644 --- a/vpr/src/place/place_delay_model.cpp +++ b/vpr/src/place/place_delay_model.cpp @@ -10,6 +10,8 @@ #include "vtr_math.h" #include "vpr_error.h" +#include "place_global.h" + #ifdef VTR_ENABLE_CAPNPROTO # include "capnp/serialize.h" # include "place_delay_model.capnp.h" @@ -18,10 +20,7 @@ # include "serdes_utils.h" #endif /* VTR_ENABLE_CAPNPROTO */ -/* - * DeltaDelayModel - */ - +///@brief DeltaDelayModel methods. float DeltaDelayModel::delay(int from_x, int from_y, int /*from_pin*/, int to_x, int to_y, int /*to_pin*/) const { int delta_x = std::abs(from_x - to_x); int delta_y = std::abs(from_y - to_y); @@ -46,9 +45,11 @@ void DeltaDelayModel::dump_echo(std::string filepath) const { vtr::fclose(f); } -/* - * OverrideDelayModel - */ +const DeltaDelayModel* OverrideDelayModel::base_delay_model() const { + return base_delay_model_.get(); +} + +///@brief OverrideDelayModel methods. 
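// A stand-alone sketch (illustrative only, with stand-in enums) of the quench
// algorithm selection performed by get_placement_quench_algorithm() above: the new
// --place_quench_metric option only takes effect when the anneal itself is timing
// driven; a bounding-box anneal keeps using the wiring cost during the quench.
#include <cassert>
#include <cstdio>

enum class PlaceAlgo { BOUNDING_BOX, PATH_TIMING_DRIVEN, SETUP_SLACK_ANALYSIS };
enum class QuenchMetric { AUTO, TIMING_COST, SETUP_SLACK };

static PlaceAlgo quench_algorithm(PlaceAlgo anneal_algo, QuenchMetric metric) {
    if (anneal_algo == PlaceAlgo::PATH_TIMING_DRIVEN) {
        if (metric == QuenchMetric::SETUP_SLACK) return PlaceAlgo::SETUP_SLACK_ANALYSIS;
        return PlaceAlgo::PATH_TIMING_DRIVEN; // AUTO or TIMING_COST
    }
    assert(anneal_algo == PlaceAlgo::BOUNDING_BOX);
    return PlaceAlgo::BOUNDING_BOX;
}

int main() {
    PlaceAlgo algo = quench_algorithm(PlaceAlgo::PATH_TIMING_DRIVEN, QuenchMetric::SETUP_SLACK);
    std::printf("quench uses setup slack analysis: %s\n",
                algo == PlaceAlgo::SETUP_SLACK_ANALYSIS ? "yes" : "no");
}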
float OverrideDelayModel::delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin) const { //First check to if there is an override delay value auto& device_ctx = g_vpr_ctx.device(); @@ -136,18 +137,14 @@ float OverrideDelayModel::get_delay_override(int from_type, int from_class, int return iter->second; } -const DeltaDelayModel* OverrideDelayModel::base_delay_model() const { - return base_delay_model_.get(); -} - void OverrideDelayModel::set_base_delay_model(std::unique_ptr base_delay_model_obj) { base_delay_model_ = std::move(base_delay_model_obj); } -// When writing capnp targetted serialization, always allow compilation when -// VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception -// instead. -// +/** + * When writing capnp targetted serialization, always allow compilation when + * VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception instead. + */ #ifndef VTR_ENABLE_CAPNPROTO # define DISABLE_ERROR \ @@ -300,3 +297,81 @@ void OverrideDelayModel::write(const std::string& file) const { } #endif + +///@brief Initialize the placer delay model. +std::unique_ptr alloc_lookups_and_delay_model(t_chan_width_dist chan_width_dist, + const t_placer_opts& placer_opts, + const t_router_opts& router_opts, + t_det_routing_arch* det_routing_arch, + std::vector& segment_inf, + const t_direct_inf* directs, + const int num_directs) { + return compute_place_delay_model(placer_opts, router_opts, det_routing_arch, segment_inf, + chan_width_dist, directs, num_directs); +} + +/** + * @brief Returns the delay of one point to point connection. + * + * Only estimate delay for signals routed through the inter-block routing network. + * TODO: Do how should we compute the delay for globals. "Global signals are assumed to have zero delay." + */ +float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& place_ctx = g_vpr_ctx.placement(); + + float delay_source_to_sink = 0.; + + if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { + ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id); + ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin); + + ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin); + ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin); + + int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin); + int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin); + + int source_x = place_ctx.block_locs[source_block].loc.x; + int source_y = place_ctx.block_locs[source_block].loc.y; + int sink_x = place_ctx.block_locs[sink_block].loc.x; + int sink_y = place_ctx.block_locs[sink_block].loc.y; + + /** + * This heuristic only considers delta_x and delta_y, a much better + * heuristic would be to to create a more comprehensive lookup table. + * + * In particular this approach does not accurately capture the effect + * of fast carry-chain connections. 
+ */ + delay_source_to_sink = delay_model->delay(source_x, + source_y, + source_block_ipin, + sink_x, + sink_y, + sink_block_ipin); + if (delay_source_to_sink < 0) { + VPR_ERROR(VPR_ERROR_PLACE, + "in comp_td_single_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d) to %s (at %d,%d)\n" + "in comp_td_single_connection_delay: Delay is less than 0\n", + block_type_pin_index_to_name(physical_tile_type(source_block), source_block_ipin).c_str(), + source_x, source_y, + block_type_pin_index_to_name(physical_tile_type(sink_block), sink_block_ipin).c_str(), + sink_x, sink_y, + delay_source_to_sink); + } + } + + return (delay_source_to_sink); +} + +///@brief Recompute all point to point delays, updating `connection_delay` matrix. +void comp_td_connection_delays(const PlaceDelayModel* delay_model) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + for (auto net_id : cluster_ctx.clb_nlist.nets()) { + for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) { + connection_delay[net_id][ipin] = comp_td_single_connection_delay(delay_model, net_id, ipin); + } + } +} diff --git a/vpr/src/place/place_delay_model.h b/vpr/src/place/place_delay_model.h index db22db238ec..55b0558cb49 100644 --- a/vpr/src/place/place_delay_model.h +++ b/vpr/src/place/place_delay_model.h @@ -1,3 +1,9 @@ +/** + * @file + * @brief This file contains all the class and function declarations related to + * the placer delay model. For implementations, see place_delay_model.cpp. + */ + #ifndef PLACE_DELAY_MODEL_H #define PLACE_DELAY_MODEL_H @@ -20,12 +26,30 @@ # define ALWAYS_INLINE inline #endif -//Abstract interface to a placement delay model +///@brief Forward declarations. +class PlaceDelayModel; + +///@brief Initialize the placer delay model. +std::unique_ptr alloc_lookups_and_delay_model(t_chan_width_dist chan_width_dist, + const t_placer_opts& place_opts, + const t_router_opts& router_opts, + t_det_routing_arch* det_routing_arch, + std::vector& segment_inf, + const t_direct_inf* directs, + const int num_directs); + +///@brief Returns the delay of one point to point connection. +float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin); + +///@brief Recompute all point to point delays, updating `connection_delay` matrix. +void comp_td_connection_delays(const PlaceDelayModel* delay_model); + +///@brief Abstract interface to a placement delay model. class PlaceDelayModel { public: virtual ~PlaceDelayModel() = default; - // Computes place delay model. + ///@brief Computes place delay model. virtual void compute( RouterDelayProfiler& route_profiler, const t_placer_opts& placer_opts, @@ -33,25 +57,32 @@ class PlaceDelayModel { int longest_length) = 0; - //Returns the delay estimate between the specified block pins - // - // Either compute or read methods must be invoked before invoking - // delay. + /** + * @brief Returns the delay estimate between the specified block pins. + * + * Either compute or read methods must be invoked before invoking delay. + */ virtual float delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin) const = 0; - //Dumps the delay model to an echo file + ///@brief Dumps the delay model to an echo file. virtual void dump_echo(std::string filename) const = 0; - // Write place delay model to specified file. - // May be unimplemented, in which case method should throw an exception. + /** + * @brief Write place delay model to specified file. 
+ * + * May be unimplemented, in which case method should throw an exception. + */ virtual void write(const std::string& file) const = 0; - // Read place delay model from specified file. - // May be unimplemented, in which case method should throw an exception. + /** + * @brief Read place delay model from specified file. + * + * May be unimplemented, in which case method should throw an exception. + */ virtual void read(const std::string& file) = 0; }; -//A simple delay model based on the distance (delta) between block locations +///@brief A simple delay model based on the distance (delta) between block locations. class DeltaDelayModel : public PlaceDelayModel { public: DeltaDelayModel() {} @@ -109,10 +140,13 @@ class OverrideDelayModel : public PlaceDelayModel { short delta_x; short delta_y; - //A combination of ALWAYS_INLINE attribute and std::lexicographical_compare - //is required for operator< to be inlined by compiler. - //Proper inlining of the function reduces place time by around 5%. - //For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225 + /** + * A combination of ALWAYS_INLINE attribute and std::lexicographical_compare + * is required for operator< to be inlined by compiler. Proper inlining of the + * function reduces place time by around 5%. + * + * For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225 + */ friend ALWAYS_INLINE bool operator<(const t_override& lhs, const t_override& rhs) { const short* left = reinterpret_cast(&lhs); const short* right = reinterpret_cast(&rhs); @@ -123,8 +157,11 @@ class OverrideDelayModel : public PlaceDelayModel { vtr::flat_map2 delay_overrides_; - //operator< treats memory layout of t_override as an array of short - //this requires all members of t_override are shorts and there is no padding between members of t_override + /** + * operator< treats memory layout of t_override as an array of short. + * This requires all members of t_override are shorts and there is no + * padding between members of t_override. + */ static_assert(sizeof(t_override) == sizeof(t_override::from_type) + sizeof(t_override::to_type) + sizeof(t_override::from_class) + sizeof(t_override::to_class) + sizeof(t_override::delta_x) + sizeof(t_override::delta_y), "Expect t_override to have a memory layout equivalent to an array of short (no padding)"); static_assert(sizeof(t_override::from_type) == sizeof(short), "Expect all t_override data members to be shorts"); static_assert(sizeof(t_override::to_type) == sizeof(short), "Expect all t_override data members to be shorts"); diff --git a/vpr/src/place/place_global.h b/vpr/src/place/place_global.h new file mode 100644 index 00000000000..fd1cc2d9a6b --- /dev/null +++ b/vpr/src/place/place_global.h @@ -0,0 +1,39 @@ +/** + * @file + * @brief This file contains all the global data structures referenced across + * multiple files in ./vpr/src/place. + * + * These global data structures were originally local to place.cpp, and they + * were referenced by a lot of routines local to place.cpp. However, to shorten + * the file size of place.cpp, these routines are moved to other files. + * + * Instead of elongating the argument list of the moved routines, I moved the + * data structures to here so that they can be easily shared across different + * files. + * + * For detailed descriptions on what each data structure stores, please see + * place.cpp, where these variables are defined. 
+ * + * TODO: Create a single extern variable that allows access to all these data + * structures so that these structures don't have to be declared as extern. + */ + +#pragma once +#include +#include "vtr_vector.h" +#include "vpr_net_pins_matrix.h" +#include "timing_place.h" + +extern vtr::vector net_cost, proposed_net_cost; +extern vtr::vector bb_updated_before; +extern ClbNetPinsMatrix connection_delay; +extern ClbNetPinsMatrix proposed_connection_delay; +extern ClbNetPinsMatrix connection_setup_slack; +extern PlacerTimingCosts connection_timing_cost; +extern ClbNetPinsMatrix proposed_connection_timing_cost; +extern vtr::vector net_timing_cost; +extern vtr::vector bb_coords, bb_num_on_edges; +extern vtr::vector ts_bb_coord_new, ts_bb_edge_new; +extern float** chanx_place_cost_fac; +extern float** chany_place_cost_fac; +extern std::vector ts_nets_to_update; diff --git a/vpr/src/place/place_timing_update.cpp b/vpr/src/place/place_timing_update.cpp new file mode 100644 index 00000000000..fa74f97dfb5 --- /dev/null +++ b/vpr/src/place/place_timing_update.cpp @@ -0,0 +1,362 @@ +/** + * @file place_timing_update.cpp + * @brief Defines the routines declared in place_timing_update.h. + */ + +#include "vtr_time.h" + +#include "place_timing_update.h" +#include "place_global.h" + +///@brief Use an incremental approach to updating timing costs after re-computing criticalities +static constexpr bool INCR_COMP_TD_COSTS = true; + +///@brief File-scope variable that can be accessed via the routine get_udpate_td_costs_runtime_stats(). +static t_update_td_costs_stats update_td_costs_stats; + +///@brief Routines local to place_timing_update.cpp +static double comp_td_connection_cost(const PlaceDelayModel* delay_model, + const PlacerCriticalities& place_crit, + ClusterNetId net, + int ipin); +static double sum_td_net_cost(ClusterNetId net); +static double sum_td_costs(); + +/** + * @brief Initialize the timing information and structures in the placer. + * + * Perform first time update on the timing graph, and initialize the values within + * PlacerCriticalities, PlacerSetupSlacks, and connection_timing_cost. + */ +void initialize_timing_info(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& clb_nlist = cluster_ctx.clb_nlist; + + //Initialize the timing update mode. 
Update both + //setup slacks and criticalities from scratch + timing_update_mode->update_criticalities = true; + timing_update_mode->update_setup_slacks = true; + timing_update_mode->recompute_criticalities = true; + timing_update_mode->recompute_setup_slacks = true; + + //As a safety measure, for the first time update, + //invalidate all timing edges via the pin invalidator + //by passing in all the clb sink pins + for (ClusterNetId net_id : clb_nlist.nets()) { + for (ClusterPinId pin_id : clb_nlist.net_sinks(net_id)) { + pin_timing_invalidator->invalidate_connection(pin_id, timing_info); + } + } + + //Perform timing info update + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + //Compute timing cost from scratch + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + + //Initialize the data structure that stores committed placer setup slacks + commit_setup_slacks(setup_slacks); + + //Don't warn again about unconstrained nodes again during placement + timing_info->set_warn_unconstrained(false); +} + +/** + * @brief Update timing info based on the current block positions. + * + * Update the values stored in PlacerCriticalities and PlacerSetupSlacks. + * This routine tries its best to be incremental when it comes to updating + * these values, and branching variables are stored in `timing_update_mode`. + * For a detailed description of how these variables work, please refer to + * the declaration documentation on t_placer_timing_update_mode. + * + * If criticalities are updated, the timing costs are updated as well. + * Calling this routine to update timing_cost will produce round-off error + * in the long run, so this value will be recomputed once in a while, via + * other timing driven routines. + * + * All the pins with changed connection delays have already been added into + * the ClusteredPinTimingInvalidator to allow incremental STA update. These + * changed connection delays are a direct result of moved blocks in try_swap(). + * + * @param crit_exponent Used to calculate `sharpened` criticalities. + * + * @param delay_model Used to calculate the delay between two locations. + * + * @param criticalities Mapping interface between atom pin criticalities + * and clb pin criticalities. + * + * @param setup_slacks Mapping interface between atom pin raw setup slacks + * and clb pin raw setup slacks. + * + * @param pin_timing_invalidator Stores all the pins that have their delay value changed + * and needs to be updated in the timing graph. + * + * @param timing_info Stores the timing graph and other important timing info. + * + * @param timing_update_mode Determines what should be updated when this routine is + * called, and using incremental techniques is appropriate. + * + * @param costs Stores the updated timing cost for the whole placement. 
+ */ +void update_setup_slacks_and_criticalities(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs) { + //Run STA to update slacks and adjusted/relaxed criticalities + timing_info->update(); + + if (timing_update_mode->update_setup_slacks) { + //Update placer's setup slacks + setup_slacks->update_setup_slacks(timing_info, timing_update_mode->recompute_setup_slacks); + } + + if (timing_update_mode->update_criticalities) { + //Update placer's criticalities (e.g. sharpen with crit_exponent) + criticalities->update_criticalities(timing_info, crit_exponent, timing_update_mode->recompute_criticalities); + + //Update connection, net and total timing costs based on new criticalities + if (INCR_COMP_TD_COSTS) { + update_td_costs(delay_model, *criticalities, &costs->timing_cost); + } else { + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + } + } + + //Setup slacks and criticalities need to be in sync with the timing_info. + //if they are to be incrementally updated on the next iteration. + //Otherwise, a re-computation for all clb sink pins is required. + timing_update_mode->recompute_setup_slacks = !timing_update_mode->update_setup_slacks; + timing_update_mode->recompute_criticalities = !timing_update_mode->update_criticalities; + + //Clear invalidation state + pin_timing_invalidator->reset(); +} + +/** + * @brief Incrementally updates timing cost based on the current delays and criticality estimates. + * + * Unlike comp_td_costs(), this only updates connections who's criticality has changed. + * This is a superset of those connections whose connection delay has changed. For a + * from-scratch recalculation, refer to comp_td_cost(). + * + * We must be careful calculating the total timing cost incrementally, due to limited + * floating point precision, so that we get a bit-identical result matching the one + * calculated by comp_td_costs(). + * + * In particular, we can not simply calculate the incremental delta's caused by changed + * connection timing costs and adjust the timing cost. Due to limited precision, the results + * of floating point math operations are order dependant and we would get a different result. + * + * To get around this, we calculate the timing costs hierarchically, to ensure that we + * calculate the sum with the same order of operations as comp_td_costs(). + * + * See PlacerTimingCosts object used to represent connection_timing_costs for details. 
+ */ +void update_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { + vtr::Timer t; + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& clb_nlist = cluster_ctx.clb_nlist; + + //Update the modified pin timing costs + { + vtr::Timer timer; + auto clb_pins_modified = place_crit.pins_with_modified_criticality(); + for (ClusterPinId clb_pin : clb_pins_modified) { + if (clb_nlist.pin_type(clb_pin) == PinType::DRIVER) continue; + + ClusterNetId clb_net = clb_nlist.pin_net(clb_pin); + VTR_ASSERT_SAFE(clb_net); + + if (cluster_ctx.clb_nlist.net_is_ignored(clb_net)) continue; + + int ipin = clb_nlist.pin_net_index(clb_pin); + VTR_ASSERT_SAFE(ipin >= 1 && ipin < int(clb_nlist.net_pins(clb_net).size())); + + double new_timing_cost = comp_td_connection_cost(delay_model, place_crit, clb_net, ipin); + + //Record new value + connection_timing_cost[clb_net][ipin] = new_timing_cost; + } + + update_td_costs_stats.connections_elapsed_sec += timer.elapsed_sec(); + } + + //Re-total timing costs of all nets + { + vtr::Timer timer; + *timing_cost = connection_timing_cost.total_cost(); + update_td_costs_stats.sum_nets_elapsed_sec += timer.elapsed_sec(); + } + +#ifdef VTR_ASSERT_DEBUG_ENABLED + double check_timing_cost = 0.; + comp_td_costs(delay_model, place_crit, &check_timing_cost); + VTR_ASSERT_DEBUG_MSG(check_timing_cost == *timing_cost, + "Total timing cost calculated incrementally in update_td_costs() is " + "not consistent with value calculated from scratch in comp_td_costs()"); +#endif + update_td_costs_stats.total_elapsed_sec += t.elapsed_sec(); +} + +/** + * @brief Recomputes timing cost from scratch based on the current delays and criticality estimates. + * + * Computes the cost (from scratch) from the delays and criticalities of all point to point + * connections, we define the timing cost of each connection as criticality * delay. + * + * We calculate the timing cost in a hierarchical manner (first connection, then nets, then + * sum of nets) in order to allow it to be incremental while avoiding round-off effects. + * + * For a more efficient incremental update, see update_td_costs(). + */ +void comp_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + + for (auto net_id : cluster_ctx.clb_nlist.nets()) { + if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; + + for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) { + float conn_timing_cost = comp_td_connection_cost(delay_model, place_crit, net_id, ipin); + + /* Record new value */ + connection_timing_cost[net_id][ipin] = conn_timing_cost; + } + /* Store net timing cost for more efficient incremental updating */ + net_timing_cost[net_id] = sum_td_net_cost(net_id); + } + /* Make sure timing cost does not go above MIN_TIMING_COST. */ + *timing_cost = sum_td_costs(); +} + +/** + * @brief Calculates the timing cost of the specified connection. + * + * This routine assumes that it is only called either compt_td_cost() or + * update_td_costs(). Otherwise, various assertions below would fail. 
+ */ +static double comp_td_connection_cost(const PlaceDelayModel* delay_model, + const PlacerCriticalities& place_crit, + ClusterNetId net, + int ipin) { + VTR_ASSERT_SAFE_MSG(ipin > 0, "Shouldn't be calculating connection timing cost for driver pins"); + + VTR_ASSERT_SAFE_MSG(connection_delay[net][ipin] == comp_td_single_connection_delay(delay_model, net, ipin), + "Connection delays should already be updated"); + + double conn_timing_cost = place_crit.criticality(net, ipin) * connection_delay[net][ipin]; + + VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_delay[net][ipin]), + "Propsoed connection delay should already be invalidated"); + + VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_timing_cost[net][ipin]), + "Proposed connection timing cost should already be invalidated"); + + return conn_timing_cost; +} + +///@brief Returns the timing cost of the specified 'net' based on the values in connection_timing_cost. +static double sum_td_net_cost(ClusterNetId net) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + + double net_td_cost = 0; + for (unsigned ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) { + net_td_cost += connection_timing_cost[net][ipin]; + } + + return net_td_cost; +} + +///@brief Returns the total timing cost accross all nets based on the values in net_timing_cost. +static double sum_td_costs() { + auto& cluster_ctx = g_vpr_ctx.clustering(); + + double td_cost = 0; + for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* For each net ... */ + + if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; + + td_cost += net_timing_cost[net_id]; + } + + return td_cost; +} + +/** + * @brief Commit all the setup slack values from the PlacerSetupSlacks + * class to a vtr matrix. + * + * This routine is incremental since it relies on the pins_with_modified_setup_slack() + * to detect which pins need to be updated and which pins do not. + * + * Therefore, it is assumed that this routine is always called immediately after + * each time update_setup_slacks_and_criticalities() updates the setup slacks + * (i.e. t_placer_timing_update_mode::update_setup_slacks = true). Otherwise, + * pins_with_modified_setup_slack() cannot accurately account for all the pins + * that have their setup slacks changed, making this routine incorrect. + * + * Currently, the only exception to the rule above is when setup slack analysis is used + * during the placement quench. The new setup slacks might be either accepted or + * rejected, so for efficiency reasons, this routine is not called if the slacks are + * rejected in the end. For more detailed info, see the try_swap() routine. + */ +void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; + + //Incremental: only go through sink pins with modified setup slack + auto clb_pins_modified = setup_slacks->pins_with_modified_setup_slack(); + for (ClusterPinId pin_id : clb_pins_modified) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); + + connection_setup_slack[net_id][pin_index_in_net] = setup_slacks->setup_slack(net_id, pin_index_in_net); + } +} + +/** + * @brief Verify that the values in the vtr matrix matches the PlacerSetupSlacks class. + * + * Return true if all values are identical. Otherwise, return false. + * Used to check if the timing update has been succesfully revereted if a proposed move + * is rejected when applying setup slack analysis during the placement quench. 
+ * If successful, the setup slacks in the timing analyzer should be the same as + * the setup slacks in connection_setup_slack matrix without running commit_setup_slacks(). + * + * For more detailed info, see the try_swap() routine. + */ +bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; + + //Go through every single sink pin to check that the slack values are the same + for (ClusterNetId net_id : clb_nlist.nets()) { + for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { + if (connection_setup_slack[net_id][ipin] != setup_slacks->setup_slack(net_id, ipin)) { + return false; + } + } + } + return true; +} + +///@brief Fetch the file-scope variable update_td_costs_stats in timing_place.cpp. +t_update_td_costs_stats get_update_td_costs_runtime_stats() { + return update_td_costs_stats; +} diff --git a/vpr/src/place/place_timing_update.h b/vpr/src/place/place_timing_update.h new file mode 100644 index 00000000000..fa5a47e8727 --- /dev/null +++ b/vpr/src/place/place_timing_update.h @@ -0,0 +1,94 @@ +/** + * @file place_timing_update.h + * @brief Stores timing update routines declarations used by the VPR placer. + */ +#pragma once +#include "timing_place.h" +#include "place_util.h" + +/// init_grid_blocks(); +static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid); +///@brief Initialize the placement context. void init_placement_context() { auto& place_ctx = g_vpr_ctx.mutable_placement(); auto& cluster_ctx = g_vpr_ctx.clustering(); + /* Intialize the lookup of CLB block positions */ place_ctx.block_locs.clear(); place_ctx.block_locs.resize(cluster_ctx.clb_nlist.blocks().size()); + /* Initialize the reverse lookup of CLB block positions */ place_ctx.grid_blocks = init_grid_blocks(); } +///@brief Initialize `grid_blocks`, the inverse structure of `block_locs`. static vtr::Matrix init_grid_blocks() { auto& device_ctx = g_vpr_ctx.device(); @@ -29,3 +40,174 @@ static vtr::Matrix init_grid_blocks() { return grid_blocks; } + +///@brief Constructor: stores current placer algorithm. +t_placer_costs::t_placer_costs(enum e_place_algorithm algo) + : place_algorithm(algo) { + if (place_algorithm != PATH_TIMING_DRIVEN_PLACE) { + VTR_ASSERT_MSG( + place_algorithm == BOUNDING_BOX_PLACE, + "Must pass a valid placer algorithm into the placer cost structure."); + } +} + +/** + * @brief Mutator: updates the norm factors in the outer loop iteration. + * + * At each temperature change we update these values to be used + * for normalizing the trade-off between timing and wirelength (bb) + */ +void t_placer_costs::update_norm_factors() { + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + bb_cost_norm = 1 / bb_cost; + //Prevent the norm factor from going to infinity + timing_cost_norm = std::min(1 / timing_cost, MAX_INV_TIMING_COST); + cost = 1; //The value of cost will be reset to 1 if timing driven + } else { //place_algorithm == BOUNDING_BOX_PLACE + cost = bb_cost; //The cost value should be identical to the wirelength cost + } +} + +///@brief Constructor: Initialize all annealing state variables. 
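// A stand-alone sketch (illustrative only) of t_placer_costs::update_norm_factors()
// above: the inverse of the current totals becomes the norm factor, so per-move
// deltas in try_swap() are expressed relative to the whole placement. The
// MAX_INV_TIMING_COST cap and the numbers in main() are made-up illustrative values.
#include <algorithm>
#include <cstdio>

static constexpr double MAX_INV_TIMING_COST = 1.e9; // illustrative cap

struct Costs {
    double bb_cost = 0, timing_cost = 0;
    double bb_cost_norm = 0, timing_cost_norm = 0, cost = 0;

    void update_norm_factors() {
        bb_cost_norm = 1 / bb_cost;
        timing_cost_norm = std::min(1 / timing_cost, MAX_INV_TIMING_COST); // avoid infinity
        cost = 1; // timing-driven placement renormalizes the total cost to 1
    }
};

int main() {
    Costs costs;
    costs.bb_cost = 1500.0;
    costs.timing_cost = 2.0e-8;
    costs.update_norm_factors();
    std::printf("bb_cost_norm = %g, timing_cost_norm = %g\n",
                costs.bb_cost_norm, costs.timing_cost_norm);
}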
+t_annealing_state::t_annealing_state(const t_annealing_sched& annealing_sched, + float first_t, + float first_rlim, + int first_move_lim, + float first_crit_exponent) { + alpha = annealing_sched.alpha_min; + t = first_t; + restart_t = first_t; + rlim = first_rlim; + inverse_delta_rlim = 1 / (first_rlim - FINAL_RLIM); + move_lim_max = first_move_lim; + crit_exponent = first_crit_exponent; + + //Determine the current move_lim based on the schedule type + if (annealing_sched.type == DUSTY_SCHED) { + move_lim = std::max(1, (int)(move_lim_max * annealing_sched.success_target)); + } else { + move_lim = move_lim_max; + } +} + +/** + * @brief Get the initial limit for inner loop block move attempt limit. + * + * There are two ways to scale the move limit. + * e_place_effort_scaling::CIRCUIT + * scales the move limit proportional to num_blocks ^ (4/3) + * e_place_effort_scaling::DEVICE_CIRCUIT + * scales the move limit proportional to device_size ^ (2/3) * num_blocks ^ (2/3) + * + * The second method is almost identical to the first one when the device + * is highly utilized (device_size ~ num_blocks). For low utilization devices + * (device_size >> num_blocks), the search space is larger, so the second method + * performs more moves to ensure better optimization. + */ + +int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched) { + const auto& device_ctx = g_vpr_ctx.device(); + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + auto device_size = device_ctx.grid.width() * device_ctx.grid.height(); + auto num_blocks = cluster_ctx.clb_nlist.blocks().size(); + + int move_lim; + if (placer_opts.effort_scaling == e_place_effort_scaling::CIRCUIT) { + move_lim = int(annealing_sched.inner_num * pow(num_blocks, 4. / 3.)); + } else { + VTR_ASSERT_MSG( + placer_opts.effort_scaling == e_place_effort_scaling::DEVICE_CIRCUIT, + "Unrecognized placer effort scaling"); + + move_lim = int(annealing_sched.inner_num * pow(device_size, 2. / 3.) * pow(num_blocks, 2. / 3.)); + } + + /* Avoid having a non-positive move_lim */ + move_lim = std::max(move_lim, 1); + + VTR_LOG("Moves per temperature: %d\n", move_lim); + + return move_lim; +} + +/** + * @brief Update the annealing state according to the annealing schedule selected. + * + * USER_SCHED: A manual fixed schedule with fixed alpha and exit criteria. + * AUTO_SCHED: A more sophisticated schedule where alpha varies based on success ratio. + * DUSTY_SCHED: This schedule jumps backward and slows down in response to success ratio. + * See doc/src/vpr/dusty_sa.rst for more details. + * + * Returns true until the schedule is finished. + */ +bool update_annealing_state(t_annealing_state* state, + float success_rat, + const t_placer_costs& costs, + const t_placer_opts& placer_opts, + const t_annealing_sched& annealing_sched) { + /* Return `false` when the exit criterion is met. 
*/ + if (annealing_sched.type == USER_SCHED) { + state->t *= annealing_sched.alpha_t; + return state->t >= annealing_sched.exit_t; + } + + auto& device_ctx = g_vpr_ctx.device(); + auto& cluster_ctx = g_vpr_ctx.clustering(); + + /* Automatic annealing schedule */ + float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size(); + + if (annealing_sched.type == DUSTY_SCHED) { + bool restart_temp = state->t < t_exit || std::isnan(t_exit); //May get nan if there are no nets + if (success_rat < annealing_sched.success_min || restart_temp) { + if (state->alpha > annealing_sched.alpha_max) return false; + state->t = state->restart_t / sqrt(state->alpha); // Take a half step from the restart temperature. + state->alpha = 1.0 - ((1.0 - state->alpha) * annealing_sched.alpha_decay); + } else { + if (success_rat > annealing_sched.success_target) { + state->restart_t = state->t; + } + state->t *= state->alpha; + } + state->move_lim = std::max(1, std::min(state->move_lim_max, (int)(state->move_lim_max * (annealing_sched.success_target / success_rat)))); + } else { /* annealing_sched.type == AUTO_SCHED */ + if (success_rat > 0.96) { + state->alpha = 0.5; + } else if (success_rat > 0.8) { + state->alpha = 0.9; + } else if (success_rat > 0.15 || state->rlim > 1.) { + state->alpha = 0.95; + } else { + state->alpha = 0.8; + } + state->t *= state->alpha; + + // Must be duplicated to retain previous behavior + if (state->t < t_exit || std::isnan(t_exit)) return false; + } + + // Gradually changes from the initial crit_exponent to the final crit_exponent based on how much the range limit has shrunk. + // The idea is that as the range limit shrinks (indicating we are fine-tuning a more optimized placement) we can focus more on a smaller number of critical connections, which a higher crit_exponent achieves. + update_rlim(&state->rlim, success_rat, device_ctx.grid); + + if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + state->crit_exponent = (1 - (state->rlim - state->final_rlim()) * state->inverse_delta_rlim) + * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) + + placer_opts.td_place_exp_first; + } + + return true; +} + +/** + * @brief Update the range limited to keep acceptance prob. near 0.44. + * + * Use a floating point rlim to allow gradual transitions at low temps. + */ +static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) { + float upper_lim = std::max(grid.width() - 1, grid.height() - 1); + + *rlim *= (1. - 0.44 + success_rat); + *rlim = std::max(std::min(*rlim, upper_lim), 1.f); +} diff --git a/vpr/src/place/place_util.h b/vpr/src/place/place_util.h index f35ec854ac9..399684ae03a 100644 --- a/vpr/src/place/place_util.h +++ b/vpr/src/place/place_util.h @@ -1,8 +1,118 @@ -#ifndef PLACE_UTIL_H -#define PLACE_UTIL_H -#include +/** + * @file place_util.h + * @brief Utility structures representing various states of the + * placement. Also contains declarations of related routines. + */ -//Initialize the placement context +#pragma once +#include "vpr_types.h" + +///@brief Forward declarations. +class t_placer_costs; +class t_annealing_state; + +///@brief Initialize the placement context. void init_placement_context(); -#endif +///@brief Get the initial limit for inner loop block move attempt limit. +int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched); + +///@brief Update the annealing state according to the annealing schedule selected. 
+bool update_annealing_state(t_annealing_state* state, + float success_rat, + const t_placer_costs& costs, + const t_placer_opts& placer_opts, + const t_annealing_sched& annealing_sched); + +/** + * @brief Data structure that stores different cost values in the placer. + * + * Although we do cost calculations with float values, we use doubles + * for the accumulated costs to avoid round-off, particularly on large + * designs where the magnitude of a single move's delta cost is small + * compared to the overall cost. + * + * The cost normalization factors are updated upon every temperature change + * in the outer_loop_update_timing_info routine. They are the multiplicative + * inverses of their respective cost values when the routine is called. They + * serve to normalize the trade-off between timing and wirelength (bb). + * + * @param cost The weighted average of the wiring cost and the timing cost. + * @param bb_cost The bounding box cost, aka the wiring cost. + * @param timing_cost The timing cost, which is connection delay * criticality. + * + * @param bb_cost_norm The normalization factor for the wiring cost. + * @param timing_cost_norm The normalization factor for the timing cost, which + * is upper-bounded by the value of MAX_INV_TIMING_COST. + * + * @param MAX_INV_TIMING_COST Stops inverse timing cost from going to infinity + * with very lax timing constraints, which avoids multiplying by a + * gigantic timing_cost_norm when auto-normalizing. The exact value + * of this cost has relatively little impact, but should not be large + * enough to be on the order of timing costs for normal constraints. + * + * @param place_algorithm Determines how the member values are updated upon + * each temperature change during the placer annealing process. + */ +class t_placer_costs { + public: + double cost; + double bb_cost; + double timing_cost; + double bb_cost_norm; + double timing_cost_norm; + + private: + double MAX_INV_TIMING_COST = 1.e9; + enum e_place_algorithm place_algorithm; + + public: //Constructor + t_placer_costs(enum e_place_algorithm algo); + + public: //Mutator + void update_norm_factors(); +}; + +/** + * @brief Stores variables that are used by the annealing process. + * + * This structure is updated by update_annealing_state() on each outer + * loop iteration. It stores various important variables that need to + * be accessed during the placement inner loop. + * + * @param t Temperature for simulated annealing. + * @param rlim Range limit for block swaps. + * @param inverse_delta_rlim Used to update crit_exponent. + * @param alpha Temperature decays factor (multiplied each outer loop iteration). + * @param restart_t Temperature used after restart due to minimum success ratio. + * @param crit_exponent Used by timing-driven placement to "sharpen" the timing criticality. + * @param move_lim_max Maximum block move limit. + * @param move_lim Current block move limit. + * + * @param FINAL_RLIM The final rlim (range limit) is 1, which is the smallest value that + * can still make progress, since an rlim of 0 wouldn't allow any swaps. 
+ */ +class t_annealing_state { + public: + float t; + float rlim; + float inverse_delta_rlim; + float alpha; + float restart_t; + float crit_exponent; + int move_lim_max; + int move_lim; + + private: + float FINAL_RLIM = 1.; + + public: //Constructor + t_annealing_state(const t_annealing_sched& annealing_sched, + float first_t, + float first_rlim, + int first_move_lim, + float first_crit_exponent); + + public: //Accessor + float final_rlim() const { return FINAL_RLIM; } +}; diff --git a/vpr/src/place/timing_place.cpp b/vpr/src/place/timing_place.cpp index e62eab6c894..ae8e1b1e27c 100644 --- a/vpr/src/place/timing_place.cpp +++ b/vpr/src/place/timing_place.cpp @@ -1,3 +1,7 @@ +/** + * @file timing_place.cpp + * @brief Stores the method definitions of classes defined in timing_place.h. + */ #include #include @@ -14,71 +18,42 @@ #include "timing_info.h" -//Use an incremental approach to updaing criticalities? -constexpr bool INCR_UPDATE_CRITICALITIES = true; +///@brief Use an incremental approach to updating criticalities and setup slacks? +static constexpr bool INCR_UPDATE_CRITICALITIES = true, INCR_UPDATE_SETUP_SLACKS = true; -/**************************************/ - -/* Allocates space for the timing_place_crit_ data structure * - * I chunk the data to save space on large problems. */ +///@brief Allocates space for the timing_place_crit_ data structure. PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup) : clb_nlist_(clb_nlist) , pin_lookup_(netlist_pin_lookup) , timing_place_crit_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { } -/**************************************/ -void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_info, float crit_exponent) { +/** + * @brief Updated the criticalities in the timing_place_crit_ data structure. + * + * If the criticalities are not updated immediately after each time we call + * timing_info->update(), then timing_info->pins_with_modified_setup_criticality() + * cannot accurately account for all the pins that need to be updated. In this case, + * we pass in recompute=true to update all criticalities from scratch. + * + * If the criticality exponent has changed, we also need to update from scratch. + */ +void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_info, float crit_exponent, bool recompute) { + /* Determine what pins need updating */ + if (!recompute && crit_exponent == last_crit_exponent_ && INCR_UPDATE_CRITICALITIES) { + incr_update_criticalities(timing_info); + } else { + recompute_criticalities(); + + /* Record new criticality exponent */ + last_crit_exponent_ = crit_exponent; + } + /* Performs a 1-to-1 mapping from criticality to timing_place_crit_. 
* For every pin on every net (or, equivalently, for every tedge ending * in that pin), timing_place_crit_ = criticality^(criticality exponent) */ - //Determine what pins need updating - if (INCR_UPDATE_CRITICALITIES) { - cluster_pins_with_modified_criticality_.clear(); - if (crit_exponent != last_crit_exponent_) { - //Criticality exponent changed, must re-calculate criticalities for *all* sink pins - for (ClusterNetId net_id : clb_nlist_.nets()) { - for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { - cluster_pins_with_modified_criticality_.insert(pin_id); - } - } - - //Record new criticality exponent - last_crit_exponent_ = crit_exponent; - } else { - //Criticality exponent unchanged - // - //Collect the cluster pins which need to be updated based on the latest timing - //analysis - // - //Note we use the set of pins reported by the *timing_info* as having modified - //criticality, rather than those marked as modified by the timing analyzer. - //Since timing_info uses shifted/relaxed criticality (which depends on max - //required time and worst case slacks), additional nodes may be modified - //when updating the atom pin criticalities. - - for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_criticality()) { - ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); - - //Some atom pins correspond to connections which are completely - //contained within a cluster, and hence have no corresponding - //clustered pin. - if (!clb_pin) continue; - - cluster_pins_with_modified_criticality_.insert(clb_pin); - } - } - } else { - //Non-incremental: all pins and nets need updating - for (ClusterNetId net_id : clb_nlist_.nets()) { - for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { - cluster_pins_with_modified_criticality_.insert(pin_id); - } - } - } - - //Update the effected pins + /* Update the effected pins */ for (ClusterPinId clb_pin : cluster_pins_with_modified_criticality_) { ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); @@ -92,21 +67,144 @@ void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_inf } } +/** + * @brief Collect the cluster pins which need to be updated based on the latest timing + * analysis so that incremental updates to criticalities can be performed. + * + * Note we use the set of pins reported by the *timing_info* as having modified + * criticality, rather than those marked as modified by the timing analyzer. + * + * Since timing_info uses shifted/relaxed criticality (which depends on max required + * time and worst case slacks), additional nodes may be modified when updating the + * atom pin criticalities. + */ + +void PlacerCriticalities::incr_update_criticalities(const SetupTimingInfo* timing_info) { + cluster_pins_with_modified_criticality_.clear(); + + for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_criticality()) { + ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); + + //Some atom pins correspond to connections which are completely + //contained within a cluster, and hence have no corresponding + //clustered pin. + if (!clb_pin) continue; + + cluster_pins_with_modified_criticality_.insert(clb_pin); + } +} + +/** + * @brief Collect all the sink pins in the netlist and prepare them update. + * + * For the incremental version, see PlacerCriticalities::incr_update_criticalities(). 
+ */ +void PlacerCriticalities::recompute_criticalities() { + cluster_pins_with_modified_criticality_.clear(); + + /* Non-incremental: all sink pins need updating */ + for (ClusterNetId net_id : clb_nlist_.nets()) { + for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { + cluster_pins_with_modified_criticality_.insert(pin_id); + } + } +} + +///@brief Override the criticality of a particular connection. void PlacerCriticalities::set_criticality(ClusterNetId net_id, int ipin, float val) { timing_place_crit_[net_id][ipin] = val; } +/** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which + * were modified by the last call to PlacerCriticalities::update_criticalities(). + */ PlacerCriticalities::pin_range PlacerCriticalities::pins_with_modified_criticality() const { return vtr::make_range(cluster_pins_with_modified_criticality_); } -std::unique_ptr alloc_lookups_and_criticalities(t_chan_width_dist chan_width_dist, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - t_det_routing_arch* det_routing_arch, - std::vector& segment_inf, - const t_direct_inf* directs, - const int num_directs) { - return compute_place_delay_model(placer_opts, router_opts, det_routing_arch, segment_inf, - chan_width_dist, directs, num_directs); +/**************************************/ + +///@brief Allocates space for the timing_place_setup_slacks_ data structure. +PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup) + : clb_nlist_(clb_nlist) + , pin_lookup_(netlist_pin_lookup) + , timing_place_setup_slacks_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { +} + +/** + * @brief Updated the setup slacks in the timing_place_setup_slacks_ data structure. + * + * If the setup slacks are not updated immediately after each time we call + * timing_info->update(), then timing_info->pins_with_modified_setup_slack() + * cannot accurately account for all the pins that need to be updated. + * In this case, we pass in recompute=true to update all setup slacks from scratch. + */ +void PlacerSetupSlacks::update_setup_slacks(const SetupTimingInfo* timing_info, bool recompute) { + if (!recompute && INCR_UPDATE_SETUP_SLACKS) { + incr_update_setup_slacks(timing_info); + } else { + recompute_setup_slacks(); + } + + /* Update the effected pins */ + for (ClusterPinId clb_pin : cluster_pins_with_modified_setup_slack_) { + ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); + int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); + + float clb_pin_setup_slack = calculate_clb_net_pin_setup_slack(*timing_info, pin_lookup_, clb_pin); + + timing_place_setup_slacks_[clb_net][pin_index_in_net] = clb_pin_setup_slack; + } +} + +/** + * @brief Collect the cluster pins which need to be updated based on the latest timing + * analysis so that incremental updates to setup slacks can be performed. + * + * Note we use the set of pins reported by the *timing_info* as having modified + * setup slacks, rather than those marked as modified by the timing analyzer. + */ +void PlacerSetupSlacks::incr_update_setup_slacks(const SetupTimingInfo* timing_info) { + cluster_pins_with_modified_setup_slack_.clear(); + + for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_slack()) { + ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); + + //Some atom pins correspond to connections which are completely + //contained within a cluster, and hence have no corresponding + //clustered pin. 
+ if (!clb_pin) continue; + + cluster_pins_with_modified_setup_slack_.insert(clb_pin); + } +} + +/** + * @brief Collect all the sink pins in the netlist and prepare them update. + * + * For the incremental version, see PlacerSetupSlacks::incr_update_setup_slacks(). + */ +void PlacerSetupSlacks::recompute_setup_slacks() { + cluster_pins_with_modified_setup_slack_.clear(); + + /* Non-incremental: all sink pins need updating */ + for (ClusterNetId net_id : clb_nlist_.nets()) { + for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { + cluster_pins_with_modified_setup_slack_.insert(pin_id); + } + } +} + +///@brief Override the setup slack of a particular connection. +void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float val) { + timing_place_setup_slacks_[net_id][ipin] = val; +} + +/** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) + * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). + */ +PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const { + return vtr::make_range(cluster_pins_with_modified_setup_slack_); } diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing_place.h index c3d8a41c3a1..50042b50ea4 100644 --- a/vpr/src/place/timing_place.h +++ b/vpr/src/place/timing_place.h @@ -1,3 +1,32 @@ +/** + * @file timing_place.h + * @brief Interface used by the VPR placer to query information + * from the Tatum timing analyzer. + * + * @class PlacerSetupSlacks + * Queries connection **RAW** setup slacks, which can + * range from negative to positive values. Also maps + * atom pin setup slacks to clb pin setup slacks. + * @class PlacerCriticalities + * Query connection criticalities, which are calculuated + * based on the raw setup slacks and ranges from 0 to 1. + * Also maps atom pin crit. to clb pin crit. + * @class PlacerTimingCosts + * Hierarchical structure used by update_td_costs() to + * maintain the order of addition operation of float values + * (to avoid round-offs) while doing incremental updates. + * + * Calculating criticalities: + * All the raw setup slack values across a single clock domain are gathered, shifted, + * and rated from best to worst. The best shifted slack value (the most positive one) + * will have a criticality of 0, while the worse shifted slack value (always 0) + * will have a criticality of 1. Criticalities are used to calculated timing costs + * for each connection (delay * criticality). + * + * For a more detailed description on how criticalities are calculated, see + * calc_relaxed_criticality() in `timing_util.cpp`. + */ + #ifndef TIMING_PLACE #define TIMING_PLACE @@ -7,39 +36,42 @@ #include "place_delay_model.h" #include "vpr_net_pins_matrix.h" -std::unique_ptr alloc_lookups_and_criticalities(t_chan_width_dist chan_width_dist, - const t_placer_opts& place_opts, - const t_router_opts& router_opts, - t_det_routing_arch* det_routing_arch, - std::vector& segment_inf, - const t_direct_inf* directs, - const int num_directs); -/* Usage +/** + * @brief PlacerCriticalities returns the clustered netlist connection criticalities + * used by the placer ('sharpened' by a criticality exponent). + * + * Usage * ===== - * PlacerCriticalities returns the clustered netlist connection criticalities used by - * the placer ('sharpened' by a criticality exponent). This also serves to map atom - * netlist level criticalites (i.e. on AtomPinIds) to the clustered netlist (i.e. - * ClusterPinIds) used during placement. 
+ * This class also serves to map atom netlist level criticalites (i.e. on AtomPinIds) + * to the clustered netlist (i.e. ClusterPinIds) used during placement. * - * Criticalities are calculated by calling update_criticalities(), which will - * update criticalities based on the atom netlist connection criticalities provided by - * the passed in SetupTimingInfo. This is done incrementally, based on the modified - * connections/AtomPinIds returned by SetupTimingInfo. + * Criticalities are calculated by calling update_setup_slacks_and_criticalities() and + * setting t_placer_timing_update_mode::update_criticalities to true. It will update + * criticalities based on the atom netlist connection criticalities provided by the + * passed in SetupTimingInfo. * - * The criticalities of individual connections can then be queried by calling the - * criticality() member function. + * This process can be done incrementally, based on the modified connections/AtomPinIds + * returned by SetupTimingInfo. But sometimes a recomputation is required. For detailed + * information please see the description of `t_placer_timing_update_mode` structure. * - * It also supports iterating via pins_with_modified_criticalities() through the - * clustered netlist pins/connections which have had their criticality modified by - * the last call to update_criticalities(), which is useful for incrementally + * It also supports iterating via pins_with_modified_criticalities() through the + * clustered netlist pins/connections which have had their criticality modified by + * the last call to update_criticalities(), which is useful for incrementally * re-calculating timing costs. * + * The criticalities of individual connections can then be queried by calling the + * criticality() member function. + * * Implementation * ============== - * To support incremental re-calculation the class saves the last criticality exponent - * passed to update_criticalites(). If the next update uses the same exponent criticalities - * can be incrementally updated. Otherwise they must be re-calculated from scratch, since - * a change in exponent changes *all* criticalities. + * To support incremental re-calculation, the class saves the last criticality exponent + * passed to PlacerCriticalities::update_criticalites(). If the next update uses the same + * exponent, criticalities can be incrementally updated. Otherwise, they must be re-calculated + * from scratch, since a change in exponent changes *all* criticalities. + * + * If the timing graph is updated while t_placer_timing_update_mode::update_criticalities is + * set to false, a re-calculation of *all* criticalities is required as well (since we don't + * know exactly which pins have changed after multiple timing updates have been performed). */ class PlacerCriticalities { public: //Types @@ -55,40 +87,134 @@ class PlacerCriticalities { PlacerCriticalities& operator=(const PlacerCriticalities& clb_nlist) = delete; public: //Accessors - //Returns the criticality of the specified connection + ///@brief Returns the criticality of the specified connection. float criticality(ClusterNetId net, int ipin) const { return timing_place_crit_[net][ipin]; } - //Returns the range of clustered netlist pins (i.e. ClusterPinIds) which were modified - //by the last call to update_criticalities() + /** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which + * were modified by the last call to PlacerCriticalities::update_criticalities(). 
+ */ pin_range pins_with_modified_criticality() const; public: //Modifiers - //Incrementally updates criticalities based on the atom netlist criticalitites provied by - //timing_info and the provided criticality_exponent. - void update_criticalities(const SetupTimingInfo* timing_info, float criticality_exponent); + /** + * @brief Updates criticalities based on the atom netlist criticalitites + * provided by timing_info and the provided criticality_exponent. + */ + void update_criticalities(const SetupTimingInfo* timing_info, float criticality_exponent, bool recompute); - //Override the criticality of a particular connection + ///@brief Override the criticality of a particular connection. void set_criticality(ClusterNetId net, int ipin, float val); private: //Data + ///@brief The clb netlist in the placement context. const ClusteredNetlist& clb_nlist_; - const ClusteredPinAtomPinsLookup& pin_lookup_; - ClbNetPinsMatrix timing_place_crit_; /* [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ + ///@brief The lookup table that maps atom pins to clb pins. + const ClusteredPinAtomPinsLookup& pin_lookup_; - //The criticality exponent when update_criticalites() was last called (used to detect if incremental update can be used) + /** + * @brief The matrix that stores criticality value for each connection. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ + ClbNetPinsMatrix timing_place_crit_; + + /** + * The criticality exponent when update_criticalites() was last called + * (used to detect if incremental update can be used). + */ float last_crit_exponent_ = std::numeric_limits::quiet_NaN(); - //Set of pins with criticaltites modified by last call to update_criticalities() + ///@brief Set of pins with criticaltites modified by last call to update_criticalities(). vtr::vec_id_set cluster_pins_with_modified_criticality_; + + ///@brief Updates criticalities: incremental V.S. from scratch + void incr_update_criticalities(const SetupTimingInfo* timing_info); + void recompute_criticalities(); }; -/* Usage +/** + * @brief PlacerSetupSlacks returns the RAW setup slacks of clustered netlist connection. + * + * Usage * ===== - * PlacerTimingCosts mimics a 2D array of connection timing costs running from: - * [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + * This also serves to map atom netlist level setup slacks (i.e. on AtomPinIds) to the + * clustered netlist (i.e. ClusterPinIds) used during placement. + * + * Setup slacks are calculated by calling update_setup_slacks_and_criticalities(), + * with t_placer_timing_update_mode::update_setup_slacks to true. It will update setup + * slacks based on the atom netlist connection setup slacks provided by the passed in + * SetupTimingInfo. + * + * This process can be done incrementally, based on the modified connections/AtomPinIds + * returned by SetupTimingInfo. But sometimes a recomputation is required. For detailed + * information please see the description of `t_placer_timing_update_mode` structure. + * + * It also supports iterating via pins_with_modified_setup_slack() through the clustered + * netlist pins/connections which have had their setup slacks modified by the last call + * to update_setup_slacks(). + * + * The RAW setup slacks of individual connections can then be queried by calling the + * setup_slack() member function. + * + * Note: RAW setup slacks are unlike criticalities. Their values are not confined between + * 0 and 1. Their values can be either positive or negative. 
+ */ +class PlacerSetupSlacks { + public: //Types + typedef vtr::vec_id_set::iterator pin_iterator; + typedef vtr::vec_id_set::iterator net_iterator; + + typedef vtr::Range pin_range; + typedef vtr::Range net_range; + + public: //Lifetime + PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup); + PlacerSetupSlacks(const PlacerSetupSlacks& clb_nlist) = delete; + PlacerSetupSlacks& operator=(const PlacerSetupSlacks& clb_nlist) = delete; + + public: //Accessors + ///@brief Returns the setup slack of the specified connection. + float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slacks_[net][ipin]; } + + /** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) + * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). + */ + pin_range pins_with_modified_setup_slack() const; + + public: //Modifiers + ///@brief Updates setup slacks based on the atom netlist setup slacks provided by timing_info. + void update_setup_slacks(const SetupTimingInfo* timing_info, bool recompute); + + ///@brief Override the setup slack of a particular connection. + void set_setup_slack(ClusterNetId net, int ipin, float val); + + private: //Data + const ClusteredNetlist& clb_nlist_; + const ClusteredPinAtomPinsLookup& pin_lookup_; + + /** + * @brief The matrix that stores raw setup slack values for each connection. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ + ClbNetPinsMatrix timing_place_setup_slacks_; + + ///@brief Set of pins with raw setup slacks modified by last call to update_criticalities() + vtr::vec_id_set cluster_pins_with_modified_setup_slack_; + + ///@brief Updates setup slacks: incremental V.S. from scratch. + void incr_update_setup_slacks(const SetupTimingInfo* timing_info); + void recompute_setup_slacks(); +}; + +/** + * @brief PlacerTimingCosts mimics a 2D array of connection timing costs running from: + * [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]. * - * So it can be used similar to: + * It can be used similar to: * * PlacerTimingCosts connection_timing_costs(cluster_ctx.clb_nlist); //Construct * @@ -99,53 +225,53 @@ class PlacerCriticalities { * * //Potentially other modifications... * - * //Calculate the updated timing cost, of all connections, incrementally based - * //on modifications + * //Calculate the updated timing cost, of all connections, + * //incrementally based on modifications * float total_timing_cost = connection_timing_costs.total_cost(); - * + * * However behind the scenes PlacerTimingCosts tracks when connection costs are modified, * and efficiently re-calculates the total timing cost incrementally based on the connections * which have had their cost modified. * - * Implementaion - * ============= - * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part + * Implementation + * ============== + * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part * of connection_costs_. To mimic 2d-array like access PlacerTimingCosts also uses two proxy * classes which allow indexing in the net and pin dimensions (NetProxy and ConnectionProxy * respectively). * * The first part of connection_costs_ stores intermediate sums of the connection costs for - * efficient incremental re-calculation. More concretely, connection_costs_ stores a binary + * efficient incremental re-calculation. 
More concretely, connection_costs_ stores a binary * tree, where leaves correspond to individual connection costs and intermediate nodes the - * partial sums of the connection costs. (The binary tree is stored implicitly in the - * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary + * partial sums of the connection costs. (The binary tree is stored implicitly in the + * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary * tree we calculate the total timing cost over all connections. * * Using a binary tree allows us to efficiently re-calculate the timing costs when only a subset * of connections are changed. This is done by 'invalidating' intermediate nodes (from leaves up - * to the root) which have ancestors (leaves) with modified connection costs. When the + * to the root) which have ancestors (leaves) with modified connection costs. When the * total_cost() method is called, it recursively walks the binary tree to re-calculate the cost. - * Only invalidated nodes are traversed, with valid nodes just returning their previously + * Only invalidated nodes are traversed, with valid nodes just returning their previously * calculated (and unchanged) value. * - * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can + * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can * be done in O(k log K) time. * - * It is important to note that due to limited floating point precision, floating point + * It is important to note that due to limited floating point precision, floating point * arithmetic has an order dependence (due to round-off). Using a binary tree to total * the timing connection costs allows us to incrementally update the total timign cost while - * maintianing the *same order of operations* as if it was re-computed from scratch. This + * maintianing the *same order of operations* as if it was re-computed from scratch. This * ensures we *always* get consistent results regardless of what/when connections are changed. * * Proxy Classes - * ------------- + * ============= * NetProxy is returned by PlacerTimingCost's operator[], and stores a pointer to the start of * internal storage of that net's connection costs. * - * ConnectionProxy is returnd by NetProxy's operator[], and holds a reference to a particular - * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy - * supports assignment, allowing clients to modify the connection cost. It also detects if the - * assigned value differs from the previous value and if so, calls PlacerTimingCosts's + * ConnectionProxy is returnd by NetProxy's operator[], and holds a reference to a particular + * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy + * supports assignment, allowing clients to modify the connection cost. It also detects if the + * assigned value differs from the previous value and if so, calls PlacerTimingCosts's * invalidate() method on that connection cost. 
* * PlacerTimingCosts's invalidate() method marks the cost element's ancestors as invalid (NaN) @@ -193,7 +319,9 @@ class PlacerTimingCosts { size_t num_level_before_leaves = num_nodes_in_level(ilevel - 1); VTR_ASSERT_MSG(num_leaves >= num_connections, "Need at least as many leaves as connections"); - VTR_ASSERT_MSG(num_connections == 0 || num_level_before_leaves < num_connections, "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); + VTR_ASSERT_MSG( + num_connections == 0 || num_level_before_leaves < num_connections, + "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); //We don't need to store all possible leaves if we have fewer connections //(i.e. bottom-right of tree is empty) @@ -213,16 +341,19 @@ class PlacerTimingCosts { } } - //Proxy class representing a connection cost - // Supports modification of connection cost while detecting changes and - // reporting them up to PlacerTimingCosts + /** + * @brief Proxy class representing a connection cost. + * + * Supports modification of connection cost while detecting + * changes and reporting them up to PlacerTimingCosts. + */ class ConnectionProxy { public: ConnectionProxy(PlacerTimingCosts* timing_costs, double& connection_cost) : timing_costs_(timing_costs) , connection_cost_(connection_cost) {} - //Allow clients to modify the connection cost via assignment + ///@brief Allow clients to modify the connection cost via assignment. ConnectionProxy& operator=(double new_cost) { if (new_cost != connection_cost_) { //If connection cost changed, update it, and mark it @@ -233,9 +364,11 @@ class PlacerTimingCosts { return *this; } - //Support getting the current connection cost as a double - // Useful for client code operating on the cost values (e.g. - // difference between costs) + /** + * @brief Support getting the current connection cost as a double. + * + * Useful for client code operating on the cost values (e.g. difference between costs). + */ operator double() { return connection_cost_; } @@ -245,15 +378,18 @@ class PlacerTimingCosts { double& connection_cost_; }; - //Proxy class representing the connection costs of a net - // Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection + /** + * @brief Proxy class representing the connection costs of a net. + * + * Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection. + */ class NetProxy { public: NetProxy(PlacerTimingCosts* timing_costs, double* net_sink_costs) : timing_costs_(timing_costs) , net_sink_costs_(net_sink_costs) {} - //Indexes into the specific net pin/connection + ///@brief Indexes into the specific net pin/connection. ConnectionProxy operator[](size_t ipin) { return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]); } @@ -263,7 +399,7 @@ class PlacerTimingCosts { double* net_sink_costs_; }; - //Indexes into the specific net + ///@brief Indexes into the specific net. NetProxy operator[](ClusterNetId net_id) { VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0); @@ -282,8 +418,10 @@ class PlacerTimingCosts { std::swap(num_levels_, other.num_levels_); } - //Calculates the total cost of all connections efficiently - //in the face of modified connection costs + /** + * @brief Calculates the total cost of all connections efficiently + * in the face of modified connection costs. 
+ */ double total_cost() { float cost = total_cost_recurr(0); //Root @@ -294,7 +432,7 @@ class PlacerTimingCosts { } private: - //Recursively calculate and update the timing cost rooted at inode + ///@brief Recursively calculate and update the timing cost rooted at inode. double total_cost_recurr(size_t inode) { //Prune out-of-tree if (inode > connection_costs_.size() - 1) { @@ -329,12 +467,18 @@ class PlacerTimingCosts { return node_cost; } - friend ConnectionProxy; //So it can call invalidate() + ///@brief Friend-ed so it can call invalidate(). + friend ConnectionProxy; void invalidate(double* invalidated_cost) { //Check pointer within range of internal storage - VTR_ASSERT_SAFE_MSG(invalidated_cost >= &connection_costs_[0], "Connection cost pointer should be after start of internal storage"); - VTR_ASSERT_SAFE_MSG(invalidated_cost <= &connection_costs_[connection_costs_.size() - 1], "Connection cost pointer should be before end of internal storage"); + VTR_ASSERT_SAFE_MSG( + invalidated_cost >= &connection_costs_[0], + "Connection cost pointer should be after start of internal storage"); + + VTR_ASSERT_SAFE_MSG( + invalidated_cost <= &connection_costs_[connection_costs_.size() - 1], + "Connection cost pointer should be before end of internal storage"); size_t icost = invalidated_cost - &connection_costs_[0]; @@ -343,7 +487,7 @@ class PlacerTimingCosts { //Invalidate parent intermediate costs up to root or first //already-invalidated parent size_t iparent = parent(icost); - ; + while (!std::isnan(connection_costs_[iparent])) { //Invalidate connection_costs_[iparent] = std::numeric_limits::quiet_NaN(); @@ -371,33 +515,41 @@ class PlacerTimingCosts { return (i - 1) / 2; } - //Returns the number of nodes in ilevel'th level - //If ilevel is negative, return 0, since the root shouldn't be counted - //as a leaf node candidate + /** + * @brief Returns the number of nodes in ilevel'th level. + * + * If ilevel is negative, return 0, since the root shouldn't + * be counted as a leaf node candidate. + */ size_t num_nodes_in_level(int ilevel) const { return ilevel < 0 ? 0 : (2 << (ilevel)); } - //Returns the total number of nodes in levels [0..ilevel] (inclusive) + ///@brief Returns the total number of nodes in levels [0..ilevel] (inclusive). size_t num_nodes_up_to_level(int ilevel) const { return (2 << (ilevel + 1)) - 1; } private: - //Vector storing the implicit binary tree of connection costs - // The actual connections are stored at the end of the vector - // (last level of the binary tree). The earlier portions of - // the tree are the intermediate nodes. - // - // The methods left_child()/right_child()/parent() can be used - // to traverse the tree by indicies into this vector + /** + * @brief Vector storing the implicit binary tree of connection costs. + * + * The actual connections are stored at the end of the vector + * (last level of the binary tree). The earlier portions of + * the tree are the intermediate nodes. + * + * The methods left_child()/right_child()/parent() can be used + * to traverse the tree by indicies into this vector. + */ std::vector connection_costs_; - //Vector storing the indicies of the first connection for - //each net in the netlist, used for indexing by net. + /** + * @brief Vector storing the indicies of the first connection + * for each net in the netlist, used for indexing by net. + */ vtr::vector net_start_indicies_; - //Number of levels in the binary tree + ///@brief Number of levels in the binary tree. 
size_t num_levels_ = 0; }; diff --git a/vpr/src/timing/timing_util.cpp b/vpr/src/timing/timing_util.cpp index 6dd2c06d249..5bff2ac8324 100644 --- a/vpr/src/timing/timing_util.cpp +++ b/vpr/src/timing/timing_util.cpp @@ -579,6 +579,23 @@ float calculate_clb_net_pin_criticality(const SetupTimingInfo& timing_info, cons return clb_pin_crit; } +//Return the setup slack of a net's pin in the CLB netlist +float calculate_clb_net_pin_setup_slack(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin) { + //There may be multiple atom netlist pins connected to this CLB pin + float clb_pin_setup_slack = std::numeric_limits::quiet_NaN(); + + for (const auto atom_pin : pin_lookup.connected_atom_pins(clb_pin)) { + //Take the worst of the atom pin slacks as the CLB pin slack + if (std::isnan(clb_pin_setup_slack)) { + clb_pin_setup_slack = timing_info.setup_pin_slack(atom_pin); + } else { + clb_pin_setup_slack = std::min(clb_pin_setup_slack, timing_info.setup_pin_slack(atom_pin)); + } + } + + return clb_pin_setup_slack; +} + //Returns the worst (maximum) criticality of the set of slack tags specified. Requires the maximum //required time and worst slack for all domain pairs represent by the slack tags // diff --git a/vpr/src/timing/timing_util.h b/vpr/src/timing/timing_util.h index 87f6b86787b..682771e9763 100644 --- a/vpr/src/timing/timing_util.h +++ b/vpr/src/timing/timing_util.h @@ -183,6 +183,9 @@ class ClusteredPinTimingInvalidator { //Return the criticality of a net's pin in the CLB netlist float calculate_clb_net_pin_criticality(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin); +//Return the setup slack of a net's pin in the CLB netlist +float calculate_clb_net_pin_setup_slack(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin); + //Returns the worst (maximum) criticality of the set of slack tags specified. Requires the maximum //required time and worst slack for all domain pairs represent by the slack tags //
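
The sketches below illustrate, in standalone C++, the mechanisms this patch documents; they are not part of the patch, and every identifier other than the quoted VPR routine names is illustrative. First, the quantity accumulated by comp_td_connection_cost(), sum_td_net_cost() and sum_td_costs(): each sink connection costs criticality * delay, a net costs the sum over its sink pins (pin 0 is the driver and is never costed), and the total skips ignored nets.

    #include <cstddef>
    #include <vector>

    // One entry per pin of a net; index 0 is the driver and is never costed.
    struct NetTiming {
        bool ignored = false;            // mirrors clb_nlist.net_is_ignored()
        std::vector<double> delay;       // connection delay per pin
        std::vector<double> criticality; // sharpened criticality per pin, in [0, 1]
    };

    // Timing cost of one net: sum over sink pins of criticality * delay.
    double net_timing_cost(const NetTiming& net) {
        double cost = 0.0;
        for (std::size_t ipin = 1; ipin < net.delay.size(); ++ipin) {
            cost += net.criticality[ipin] * net.delay[ipin];
        }
        return cost;
    }

    // Total timing cost: sum over all non-ignored nets.
    double total_timing_cost(const std::vector<NetTiming>& nets) {
        double total = 0.0;
        for (const NetTiming& net : nets) {
            if (net.ignored) continue;
            total += net_timing_cost(net);
        }
        return total;
    }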
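
get_initial_move_lim() picks one of the two effort-scaling formulas described earlier in the patch. A small sketch that evaluates both for a hypothetical 10,000-block design with inner_num = 1.0: on a fully used 100x100 grid the two modes agree up to rounding, while on a 200x200 grid the DEVICE_CIRCUIT mode performs roughly 2.5x more moves per temperature.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // CIRCUIT:        inner_num * num_blocks^(4/3)
    int circuit_scaled(float inner_num, double num_blocks) {
        return std::max(1, int(inner_num * std::pow(num_blocks, 4. / 3.)));
    }

    // DEVICE_CIRCUIT: inner_num * device_size^(2/3) * num_blocks^(2/3)
    int device_circuit_scaled(float inner_num, double device_size, double num_blocks) {
        return std::max(1, int(inner_num * std::pow(device_size, 2. / 3.) * std::pow(num_blocks, 2. / 3.)));
    }

    int main() {
        // Hypothetical 10,000-block design on two grid sizes.
        std::printf("circuit only:             %d\n", circuit_scaled(1.0f, 10000));
        std::printf("device*circuit (100x100): %d\n", device_circuit_scaled(1.0f, 100 * 100, 10000));
        std::printf("device*circuit (200x200): %d\n", device_circuit_scaled(1.0f, 200 * 200, 10000));
        return 0;
    }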
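
update_rlim() and the crit_exponent update in update_annealing_state() reduce to two small pure functions: nudge the range limit so the move acceptance rate stays near 0.44 (clamped between 1 and the largest grid dimension), then interpolate the criticality exponent between its first and last values according to how far rlim has shrunk toward its final value of 1. A sketch under those assumptions:

    #include <algorithm>

    // Grow rlim when more than 44% of moves are accepted, shrink it otherwise,
    // and keep it within [1, upper_lim] (upper_lim is the largest grid dimension minus 1).
    float updated_rlim(float rlim, float success_rate, float upper_lim) {
        rlim *= (1.f - 0.44f + success_rate);
        return std::clamp(rlim, 1.f, upper_lim);
    }

    // Interpolate the criticality exponent between exp_first and exp_last based on
    // how far rlim has shrunk from first_rlim down to final_rlim (1 in the patch).
    float updated_crit_exponent(float rlim, float first_rlim, float final_rlim,
                                float exp_first, float exp_last) {
        float shrink_fraction = 1.f - (rlim - final_rlim) / (first_rlim - final_rlim);
        return exp_first + shrink_fraction * (exp_last - exp_first);
    }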
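
The PlacerTimingCosts comment describes an implicit binary tree of partial sums in which a modified leaf NaN-invalidates its ancestors and total_cost() lazily re-sums only the invalidated subtrees, giving O(k log K) updates for k modified connections out of K. The toy class below demonstrates the same technique over a plain BFS-layout array; it is an illustration of the idea, not the class's actual storage scheme (for instance, unset leaves simply count as zero here):

    #include <cmath>
    #include <cstddef>
    #include <limits>
    #include <vector>

    // Implicit binary tree in BFS layout: nodes_[0] is the root, the children of
    // node i are 2i+1 and 2i+2, and the last num_leaves_ entries are the leaves.
    class IncrementalSum {
      public:
        explicit IncrementalSum(std::size_t num_leaves)
            : num_leaves_(num_leaves)
            , nodes_(2 * num_leaves, std::numeric_limits<double>::quiet_NaN()) {}

        // Write a leaf cost; if it changed, invalidate (NaN) its ancestors up to
        // the first already-invalid node, so total() only re-sums what changed.
        void set(std::size_t leaf, double value) {
            std::size_t i = nodes_.size() - num_leaves_ + leaf;
            if (nodes_[i] == value) return; //unchanged, nothing to invalidate
            nodes_[i] = value;
            for (i = (i - 1) / 2; !std::isnan(nodes_[i]); i = (i - 1) / 2) {
                nodes_[i] = std::numeric_limits<double>::quiet_NaN();
                if (i == 0) break;
            }
        }

        double total() { return sum(0); }

      private:
        double sum(std::size_t i) {
            if (i >= nodes_.size()) return 0.;                    //out of tree
            if (i >= nodes_.size() - num_leaves_)                 //leaf: unset counts as 0
                return std::isnan(nodes_[i]) ? 0. : nodes_[i];
            if (!std::isnan(nodes_[i])) return nodes_[i];         //cached subtree sum still valid
            nodes_[i] = sum(2 * i + 1) + sum(2 * i + 2);          //re-sum only invalidated subtrees
            return nodes_[i];
        }

        std::size_t num_leaves_;
        std::vector<double> nodes_;
    };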
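
Finally, calculate_clb_net_pin_setup_slack() maps a CLB pin to the worst (minimum) setup slack of the atom pins behind it, starting from NaN so the first real slack always wins. The same reduction over a plain vector of floats:

    #include <algorithm>
    #include <cmath>
    #include <limits>
    #include <vector>

    // Worst (most negative) setup slack over the atom pins feeding one CLB pin.
    // Returns NaN if the pin maps to no atom pins.
    float worst_setup_slack(const std::vector<float>& atom_pin_slacks) {
        float worst = std::numeric_limits<float>::quiet_NaN();
        for (float slack : atom_pin_slacks) {
            worst = std::isnan(worst) ? slack : std::min(worst, slack);
        }
        return worst;
    }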