From f4ea4a15c6701b6f4b4ace9f0ee3ff83d0c36d75 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 23 Jul 2020 18:24:46 -0400 Subject: [PATCH 01/21] Added interface for mapping between CLB pins and setup slacks. Refactored PlacerCriticalities, and created PlacerSetupSlacks, so that they can choose between doing incremental V.S. from scratch updates. --- vpr/src/place/place.cpp | 8 +- vpr/src/place/timing_place.cpp | 174 ++++++++++++++++++++++++--------- vpr/src/place/timing_place.h | 68 ++++++++++++- vpr/src/timing/timing_util.cpp | 17 ++++ 4 files changed, 215 insertions(+), 52 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index ef6f0ba8c74..b9ceb07e04f 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -164,6 +164,10 @@ static vtr::vector net_timing_cost; //Like connection_timi static vtr::vector bb_coords, bb_num_on_edges; +/* Determines if slacks/criticalities need to be recomputed from scratch */ +static bool do_recompute_criticalities = false; +static bool do_recompute_slacks = false; + /* The arrays below are used to precompute the inverse of the average * * number of tracks per channel between [subhigh] and [sublow]. Access * * them as chan?_place_cost_fac[subhigh][sublow]. They are used to * @@ -1103,7 +1107,7 @@ static void placement_inner_loop(float t, /* Lines below prevent too much round-off error from accumulating * in the cost over many iterations (due to incremental updates). - * This round-off can lead to error checks failing because the cost + * This round-off can lead to error checks failing because the cost * is different from what you get when you recompute from scratch. 
*/ ++(*moves_since_cost_recompute); @@ -1894,7 +1898,7 @@ static void update_td_costs(const PlaceDelayModel* delay_model, const PlacerCrit if (cluster_ctx.clb_nlist.net_is_ignored(clb_net)) continue; int ipin = clb_nlist.pin_net_index(clb_pin); - VTR_ASSERT_SAFE(ipin >= 0 && ipin < int(clb_nlist.net_pins(clb_net).size())); + VTR_ASSERT_SAFE(ipin >= 1 && ipin < int(clb_nlist.net_pins(clb_net).size())); double new_timing_cost = comp_td_connection_cost(delay_model, place_crit, clb_net, ipin); diff --git a/vpr/src/place/timing_place.cpp b/vpr/src/place/timing_place.cpp index e62eab6c894..3043c0e1089 100644 --- a/vpr/src/place/timing_place.cpp +++ b/vpr/src/place/timing_place.cpp @@ -14,8 +14,9 @@ #include "timing_info.h" -//Use an incremental approach to updaing criticalities? +//Use an incremental approach to updating criticalities and setup slacks? constexpr bool INCR_UPDATE_CRITICALITIES = true; +constexpr bool INCR_UPDATE_SETUP_SLACKS = true; /**************************************/ @@ -27,58 +28,29 @@ PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist, cons , timing_place_crit_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { } -/**************************************/ -void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_info, float crit_exponent) { - /* Performs a 1-to-1 mapping from criticality to timing_place_crit_. - * For every pin on every net (or, equivalently, for every tedge ending - * in that pin), timing_place_crit_ = criticality^(criticality exponent) */ +void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_info, float crit_exponent, bool recompute) { + //If the criticalities are not updated immediately after each time we call + //timing_info->update(), then timing_info->pins_with_modified_setup_criticality() + //cannot accurately account for all the pins that need to be updated. 
+ //In this case, we pass in recompute=true to update all criticalities from scratch. + // + //If the criticality exponent has changed, we also need to update from scratch. //Determine what pins need updating - if (INCR_UPDATE_CRITICALITIES) { - cluster_pins_with_modified_criticality_.clear(); - if (crit_exponent != last_crit_exponent_) { - //Criticality exponent changed, must re-calculate criticalities for *all* sink pins - for (ClusterNetId net_id : clb_nlist_.nets()) { - for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { - cluster_pins_with_modified_criticality_.insert(pin_id); - } - } - - //Record new criticality exponent - last_crit_exponent_ = crit_exponent; - } else { - //Criticality exponent unchanged - // - //Collect the cluster pins which need to be updated based on the latest timing - //analysis - // - //Note we use the set of pins reported by the *timing_info* as having modified - //criticality, rather than those marked as modified by the timing analyzer. - //Since timing_info uses shifted/relaxed criticality (which depends on max - //required time and worst case slacks), additional nodes may be modified - //when updating the atom pin criticalities. - - for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_criticality()) { - ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); - - //Some atom pins correspond to connections which are completely - //contained within a cluster, and hence have no corresponding - //clustered pin. 
- if (!clb_pin) continue; - - cluster_pins_with_modified_criticality_.insert(clb_pin); - } - } + if (!recompute && crit_exponent == last_crit_exponent_ && INCR_UPDATE_CRITICALITIES) { + incr_update_criticalities(timing_info); } else { - //Non-incremental: all pins and nets need updating - for (ClusterNetId net_id : clb_nlist_.nets()) { - for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { - cluster_pins_with_modified_criticality_.insert(pin_id); - } - } + recompute_criticalities(timing_info); + + //Record new criticality exponent + last_crit_exponent_ = crit_exponent; } - //Update the effected pins + /* Performs a 1-to-1 mapping from criticality to timing_place_crit_. + * For every pin on every net (or, equivalently, for every tedge ending + * in that pin), timing_place_crit_ = criticality^(criticality exponent) */ + + // Update the effected pins for (ClusterPinId clb_pin : cluster_pins_with_modified_criticality_) { ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); @@ -92,6 +64,41 @@ void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_inf } } +void PlacerCriticalities::incr_update_criticalities(const SetupTimingInfo* timing_info) { + cluster_pins_with_modified_criticality_.clear(); + + //Collect the cluster pins which need to be updated based on the latest timing + //analysis + // + //Note we use the set of pins reported by the *timing_info* as having modified + //criticality, rather than those marked as modified by the timing analyzer. + //Since timing_info uses shifted/relaxed criticality (which depends on max + //required time and worst case slacks), additional nodes may be modified + //when updating the atom pin criticalities. 
+ + for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_criticality()) { + ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); + + //Some atom pins correspond to connections which are completely + //contained within a cluster, and hence have no corresponding + //clustered pin. + if (!clb_pin) continue; + + cluster_pins_with_modified_criticality_.insert(clb_pin); + } +} + +void PlacerCriticalities::recompute_criticalities(const SetupTimingInfo* timing_info) { + cluster_pins_with_modified_criticality_.clear(); + + //Non-incremental: all sink pins need updating + for (ClusterNetId net_id : clb_nlist_.nets()) { + for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { + cluster_pins_with_modified_criticality_.insert(pin_id); + } + } +} + void PlacerCriticalities::set_criticality(ClusterNetId net_id, int ipin, float val) { timing_place_crit_[net_id][ipin] = val; } @@ -100,6 +107,77 @@ PlacerCriticalities::pin_range PlacerCriticalities::pins_with_modified_criticali return vtr::make_range(cluster_pins_with_modified_criticality_); } +/**************************************/ + +/* Allocates space for the timing_place_setup_slacks_ data structure */ +PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup) + : clb_nlist_(clb_nlist) + , pin_lookup_(netlist_pin_lookup) + , timing_place_setup_slacks_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { +} + +void PlacerSetupSlacks::update_setup_slacks(const SetupTimingInfo* timing_info, bool recompute) { + //If the setup slacks are not updated immediately after each time we call + //timing_info->update(), then timing_info->pins_with_modified_setup_slack() + //cannot accurately account for all the pins that need to be updated. + //In this case, we pass in recompute=true to update all setup slacks from scratch. 
+ if (!recompute && INCR_UPDATE_SETUP_SLACKS) { + incr_update_setup_slacks(timing_info); + } else { + recompute_setup_slacks(timing_info); + } + + //Update the effected pins + for (ClusterPinId clb_pin : cluster_pins_with_modified_setup_slack_) { + ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); + int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); + + float clb_pin_setup_slack = calculate_clb_net_pin_setup_slack(*timing_info, pin_lookup_, clb_pin); + + timing_place_setup_slacks_[clb_net][pin_index_in_net] = clb_pin_setup_slack; + } +} + +void PlacerSetupSlacks::incr_update_setup_slacks(const SetupTimingInfo* timing_info) { + cluster_pins_with_modified_setup_slack_.clear(); + + //Collect the cluster pins which need to be updated based on the latest timing analysis + // + //Note we use the set of pins reported by the *timing_info* as having modified + //setup slacks, rather than those marked as modified by the timing analyzer. + for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_slack()) { + ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); + + //Some atom pins correspond to connections which are completely + //contained within a cluster, and hence have no corresponding + //clustered pin. 
+ if (!clb_pin) continue; + + cluster_pins_with_modified_setup_slack_.insert(clb_pin); + } +} + +void PlacerSetupSlacks::recompute_setup_slacks(const SetupTimingInfo* timing_info) { + cluster_pins_with_modified_setup_slack_.clear(); + + //Non-incremental: all sink pins need updating + for (ClusterNetId net_id : clb_nlist_.nets()) { + for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { + cluster_pins_with_modified_setup_slack_.insert(pin_id); + } + } +} + +void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float val) { + timing_place_setup_slacks_[net_id][ipin] = val; +} + +PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const { + return vtr::make_range(cluster_pins_with_modified_setup_slack_); +} + +/**************************************/ + std::unique_ptr alloc_lookups_and_criticalities(t_chan_width_dist chan_width_dist, const t_placer_opts& placer_opts, const t_router_opts& router_opts, diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing_place.h index c3d8a41c3a1..fff1c6ab5f1 100644 --- a/vpr/src/place/timing_place.h +++ b/vpr/src/place/timing_place.h @@ -63,9 +63,9 @@ class PlacerCriticalities { pin_range pins_with_modified_criticality() const; public: //Modifiers - //Incrementally updates criticalities based on the atom netlist criticalitites provied by + //Updates criticalities based on the atom netlist criticalitites provided by //timing_info and the provided criticality_exponent. 
- void update_criticalities(const SetupTimingInfo* timing_info, float criticality_exponent); + void update_criticalities(const SetupTimingInfo* timing_info, float criticality_exponent, bool recompute); //Override the criticality of a particular connection void set_criticality(ClusterNetId net, int ipin, float val); @@ -81,6 +81,70 @@ class PlacerCriticalities { //Set of pins with criticaltites modified by last call to update_criticalities() vtr::vec_id_set cluster_pins_with_modified_criticality_; + + //Updates criticalities: incremental V.S. from scratch + void incr_update_criticalities(const SetupTimingInfo* timing_info); + void recompute_criticalities(const SetupTimingInfo* timing_info); +}; + +/* Usage + * ===== + * PlacerSetupSlacks returns the clustered netlist connection setup slack used by + * the placer. This also serves to map atom netlist level slack (i.e. on AtomPinIds) + * to the clustered netlist (i.e. ClusterPinIds) used during placement. + * + * Setup slacks are calculated by calling update_setup_slacks(), which will + * update setup slacks based on the atom netlist connection setup slacks provided by + * the passed in SetupTimingInfo. This is done incrementally, based on the modified + * connections/AtomPinIds returned by SetupTimingInfo. + * + * The setup slacks of individual connections can then be queried by calling the + * setup_slack() member function. + * + * It also supports iterating via pins_with_modified_setup_slack() through the + * clustered netlist pins/connections which have had their setup slacks modified by + * the last call to update_setup_slacks(). 
+ */ +class PlacerSetupSlacks { + public: //Types + typedef vtr::vec_id_set::iterator pin_iterator; + typedef vtr::vec_id_set::iterator net_iterator; + + typedef vtr::Range pin_range; + typedef vtr::Range net_range; + + public: //Lifetime + PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup); + PlacerSetupSlacks(const PlacerSetupSlacks& clb_nlist) = delete; + PlacerSetupSlacks& operator=(const PlacerSetupSlacks& clb_nlist) = delete; + + public: //Accessors + //Returns the setup slack of the specified connection + float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slack_[net][ipin]; } + + //Returns the range of clustered netlist pins (i.e. ClusterPinIds) which were modified + //by the last call to update_setup_slacks() + pin_range pins_with_modified_setup_slack() const; + + public: //Modifiers + //Updates setup slacks based on the atom netlist setup slacks provided by timing_info + void update_setup_slacks(const SetupTimingInfo* timing_info, bool recompute); + + //Override the setup slack of a particular connection + void set_setup_slack(ClusterNetId net, int ipin, float val); + + private: //Data + const ClusteredNetlist& clb_nlist_; + const ClusteredPinAtomPinsLookup& pin_lookup_; + + ClbNetPinsMatrix timing_place_setup_slacks_; /* [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ + + //Set of pins with criticaltites modified by last call to update_criticalities() + vtr::vec_id_set cluster_pins_with_modified_setup_slack_; + + //Updates setup slacks: incremental V.S. 
from scratch + void incr_update_setup_slacks(const SetupTimingInfo* timing_info); + void recompute_setup_slacks(const SetupTimingInfo* timing_info); }; /* Usage diff --git a/vpr/src/timing/timing_util.cpp b/vpr/src/timing/timing_util.cpp index 6dd2c06d249..6ad86f4ca43 100644 --- a/vpr/src/timing/timing_util.cpp +++ b/vpr/src/timing/timing_util.cpp @@ -579,6 +579,23 @@ float calculate_clb_net_pin_criticality(const SetupTimingInfo& timing_info, cons return clb_pin_crit; } +//Return the slack of a net's pin in the CLB netlist +float calculate_clb_net_pin_setup_slack(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin) { + //There may be multiple atom netlist pins connected to this CLB pin + float clb_pin_setup_slack = std::numeric_limits::quiet_NaN(); + + for (const auto atom_pin : pin_lookup.connected_atom_pins(clb_pin)) { + //Take the worst of the atom pin slacks as the CLB pin slack + if (std::isnan(clb_pin_setup_slack)) { + clb_pin_setup_slack = timing_info.setup_pin_slack(atom_pin); + } else { + clb_pin_setup_slack = std::min(clb_pin_setup_slack, timing_info.setup_pin_slack(atom_pin)); + } + } + + return clb_pin_setup_slack; +} + //Returns the worst (maximum) criticality of the set of slack tags specified. Requires the maximum //required time and worst slack for all domain pairs represent by the slack tags // From c024603536cc9fad343065975bb802561051b4bc Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 23 Jul 2020 19:12:25 -0400 Subject: [PATCH 02/21] Refactored criticalities update in place.cpp and added setup slacks update. 
Added checks to see if the updates need to be done from scratch or can be done incrementally --- vpr/src/place/place.cpp | 246 ++++++++++++++++++++++++++++------------ 1 file changed, 172 insertions(+), 74 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index b9ceb07e04f..d52d4d4d6e7 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -165,8 +165,8 @@ static vtr::vector net_timing_cost; //Like connection_timi static vtr::vector bb_coords, bb_num_on_edges; /* Determines if slacks/criticalities need to be recomputed from scratch */ -static bool do_recompute_criticalities = false; -static bool do_recompute_slacks = false; +static bool do_recompute_criticalities = true; +static bool do_recompute_setup_slacks = true; /* The arrays below are used to precompute the inverse of the average * * number of tracks per channel between [subhigh] and [sublow]. Access * @@ -417,16 +417,16 @@ static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr); static void free_try_swap_arrays(); -static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, - t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - int num_connections, - float crit_exponent, - int* outer_crit_iter_count, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info); +static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, + t_placer_costs* costs, + t_placer_prev_inverse_costs* prev_inverse_costs, + int num_connections, + float crit_exponent, + int* outer_crit_iter_count, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info); static void recompute_criticalities(float crit_exponent, const PlaceDelayModel* delay_model, @@ -435,6 +435,25 @@ static void recompute_criticalities(float 
crit_exponent, SetupTimingInfo* timing_info, t_placer_costs* costs); +static void update_setup_slacks(PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info); + +static void update_criticalities(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_costs* costs); + +static void update_setup_slacks_and_criticalities(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_costs* costs); + static void placement_inner_loop(float t, int temp_num, float rlim, @@ -590,6 +609,8 @@ void try_place(const t_placer_opts& placer_opts, timing_info = make_setup_timing_info(placement_delay_calc, placer_opts.timing_update_type); + placer_setup_slacks = std::make_unique(cluster_ctx.clb_nlist, netlist_pin_lookup); + placer_criticalities = std::make_unique(cluster_ctx.clb_nlist, netlist_pin_lookup); pin_timing_invalidator = std::make_unique(cluster_ctx.clb_nlist, @@ -598,12 +619,12 @@ void try_place(const t_placer_opts& placer_opts, atom_ctx.lookup, *timing_info->timing_graph()); //Update timing and costs - recompute_criticalities(crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); + update_criticalities(crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &costs); timing_info->set_warn_unconstrained(false); //Don't warn again about unconstrained nodes again during placement @@ -754,14 +775,14 @@ void try_place(const t_placer_opts& placer_opts, costs.cost = 1; } - outer_loop_recompute_criticalities(placer_opts, &costs, &prev_inverse_costs, - 
num_connections, - crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get()); + outer_loop_update_criticalities(placer_opts, &costs, &prev_inverse_costs, + num_connections, + crit_exponent, + &outer_crit_iter_count, + place_delay_model.get(), + placer_criticalities.get(), + pin_timing_invalidator.get(), + timing_info.get()); placement_inner_loop(t, num_temps, rlim, placer_opts, move_lim, crit_exponent, inner_recompute_limit, &stats, @@ -818,15 +839,15 @@ void try_place(const t_placer_opts& placer_opts, { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); - outer_loop_recompute_criticalities(placer_opts, &costs, - &prev_inverse_costs, - num_connections, - crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get()); + outer_loop_update_criticalities(placer_opts, &costs, + &prev_inverse_costs, + num_connections, + crit_exponent, + &outer_crit_iter_count, + place_delay_model.get(), + placer_criticalities.get(), + pin_timing_invalidator.get(), + timing_info.get()); t = 0; /* freeze out */ @@ -892,12 +913,12 @@ void try_place(const t_placer_opts& placer_opts, VTR_ASSERT(timing_info); //Update timing and costs - recompute_criticalities(crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); + update_criticalities(crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &costs); critical_path = timing_info->least_slack_critical_path(); @@ -950,17 +971,17 @@ void try_place(const t_placer_opts& placer_opts, VTR_LOG("update_td_costs: connections %g nets %g sum_nets %g total %g\n", f_update_td_costs_connections_elapsed_sec, f_update_td_costs_nets_elapsed_sec, f_update_td_costs_sum_nets_elapsed_sec, 
f_update_td_costs_total_elapsed_sec); } -/* Function to recompute the criticalities before the inner loop of the annealing */ -static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, - t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - int num_connections, - float crit_exponent, - int* outer_crit_iter_count, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info) { +/* Function to update the criticalities before the inner loop of the annealing */ +static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, + t_placer_costs* costs, + t_placer_prev_inverse_costs* prev_inverse_costs, + int num_connections, + float crit_exponent, + int* outer_crit_iter_count, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info) { if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE) return; @@ -975,12 +996,13 @@ static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, VTR_ASSERT(num_connections > 0); //Update timing information - recompute_criticalities(crit_exponent, - delay_model, - criticalities, - pin_timing_invalidator, - timing_info, - costs); + update_criticalities(crit_exponent, + delay_model, + criticalities, + pin_timing_invalidator, + timing_info, + &costs); + *outer_crit_iter_count = 0; } (*outer_crit_iter_count)++; @@ -992,19 +1014,45 @@ static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, prev_inverse_costs->timing_cost = min(1 / costs->timing_cost, MAX_INV_TIMING_COST); } -//Update timing information based on current placement by running STA to get new slacks, -//and calculate updated criticalities and timing costs -static void recompute_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - 
PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_costs* costs) { +//Update timing information based on current placement by running STA +//and record the new setup slack information +static void update_setup_slacks(PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info) { + + //Run STA to update slacks and adjusted/relaxed criticalities + timing_info->update(); + + //Update placer's setup slacks + setup_slacks->update_setup_slacks(timing_info, do_recompute_setup_slacks); + + //Setup slacks are now in sync with the timing_info + //Can perform incremental updates next time + do_recompute_setup_slacks = false; + + //Criticalities are now out of sync with the timing_info + //Must do from scratch recompute next time + do_recompute_criticalities = true; + + //Clear invalidation state + pin_timing_invalidator->reset(); +} + +//Update timing information based on current placement by running STA +//and calculate the updated criticalities and timing costs +//(based on the new setup slacks) +static void update_criticalities(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_costs* costs) { + //Run STA to update slacks and adjusted/relaxed criticalities timing_info->update(); - //Update placer'criticalities (e.g. sharpen with crit_exponent) - criticalities->update_criticalities(timing_info, crit_exponent); + //Update placer's criticalities (e.g. 
sharpen with crit_exponent) + criticalities->update_criticalities(timing_info, crit_exponent, do_recompute_criticalities); //Update connection, net and total timing costs based on new criticalities #ifdef INCR_COMP_TD_COSTS @@ -1013,10 +1061,59 @@ static void recompute_criticalities(float crit_exponent, comp_td_costs(delay_model, *criticalities, &costs->timing_cost); #endif + //Criticalities are now in sync with the timing_info + //Can perform incremental updates next time + do_recompute_criticalities = false; + + //Setup slacks are now out of sync with the timing_info + //Must do from scratch recompute next time + do_recompute_setup_slacks = true; + //Clear invalidation state pin_timing_invalidator->reset(); } +//Update timing information based on current placement by running STA. +//Record the new slack information as well as calculate the updated +//criticalities and timing costs (based on the new setup slacks) +// +//If both setup slacks and criticalities need to be updated, +//this routine should be called, instead of individual update routine. +//This is to prevent unnecessary from scratch updates +static void update_setup_slacks_and_criticalities(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_costs* costs) { + + //Run STA to update slacks and adjusted/relaxed criticalities + timing_info->update(); + + //Update placer's setup slacks + setup_slacks->update_setup_slacks(timing_info, do_recompute_setup_slacks); + + //Update placer's criticalities (e.g. 
sharpen with crit_exponent) + criticalities->update_criticalities(timing_info, crit_exponent, do_recompute_criticalities); + + //Update connection, net and total timing costs based on new criticalities +#ifdef INCR_COMP_TD_COSTS + update_td_costs(delay_model, *criticalities, &costs->timing_cost); +#else + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); +#endif + + //Both Setup slacks and Criticalities are now in sync with the timing_info + //They can be both incrementally updated next time + do_recompute_setup_slacks = false; + do_recompute_criticalities = false; + + //Clear invalidation state + pin_timing_invalidator->reset(); +} + + /* Function which contains the inner loop of the simulated annealing */ static void placement_inner_loop(float t, int temp_num, @@ -1088,12 +1185,13 @@ static void placement_inner_loop(float t, /* Using the delays in connection_delay, do a timing analysis to update slacks and * criticalities and update the timing cost since it will change. 
*/ - recompute_criticalities(crit_exponent, - delay_model, - criticalities, - pin_timing_invalidator, - timing_info, - costs); + //Update timing information + update_criticalities(crit_exponent, + delay_model, + criticalities, + pin_timing_invalidator, + timing_info, + costs); } inner_crit_iter_count++; } From cb6e9a6bf90f3cf2619d939a184593ca2b488f47 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 23 Jul 2020 19:41:56 -0400 Subject: [PATCH 03/21] Fix up format and compilation errors --- vpr/src/place/place.cpp | 22 ++++++++-------------- vpr/src/place/timing_place.cpp | 12 ++++++------ vpr/src/place/timing_place.h | 10 +++++----- vpr/src/timing/timing_util.cpp | 2 +- vpr/src/timing/timing_util.h | 3 +++ 5 files changed, 23 insertions(+), 26 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index d52d4d4d6e7..81a32b4acee 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -164,6 +164,10 @@ static vtr::vector net_timing_cost; //Like connection_timi static vtr::vector bb_coords, bb_num_on_edges; +/* Determines if slacks/criticalities need to be updated */ +static bool do_update_criticalities = true; +static bool do_update_setup_slacks = true; + /* Determines if slacks/criticalities need to be recomputed from scratch */ static bool do_recompute_criticalities = true; static bool do_recompute_setup_slacks = true; @@ -428,13 +432,6 @@ static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info); -static void recompute_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_costs* costs); - static void update_setup_slacks(PlacerSetupSlacks* setup_slacks, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info); @@ -542,6 +539,7 @@ void
try_place(const t_placer_opts& placer_opts, std::shared_ptr placement_delay_calc; std::unique_ptr place_delay_model; std::unique_ptr move_generator; + std::unique_ptr placer_setup_slacks; std::unique_ptr placer_criticalities; std::unique_ptr pin_timing_invalidator; @@ -1001,7 +999,7 @@ static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, criticalities, pin_timing_invalidator, timing_info, - &costs); + costs); *outer_crit_iter_count = 0; } @@ -1019,7 +1017,6 @@ static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, static void update_setup_slacks(PlacerSetupSlacks* setup_slacks, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info) { - //Run STA to update slacks and adjusted/relaxed criticalities timing_info->update(); @@ -1039,7 +1036,7 @@ static void update_setup_slacks(PlacerSetupSlacks* setup_slacks, } //Update timing information based on current placement by running STA -//and calculate the updated criticalities and timing costs +//and calculate the updated criticalities and timing costs //(based on the new setup slacks) static void update_criticalities(float crit_exponent, const PlaceDelayModel* delay_model, @@ -1047,7 +1044,6 @@ static void update_criticalities(float crit_exponent, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info, t_placer_costs* costs) { - //Run STA to update slacks and adjusted/relaxed criticalities timing_info->update(); @@ -1074,7 +1070,7 @@ static void update_criticalities(float crit_exponent, } //Update timing information based on current placement by running STA. 
-//Record the new slack information as well as calculate the updated +//Record the new slack information as well as calculate the updated //criticalities and timing costs (based on the new setup slacks) // //If both setup slacks and criticalities need to be updated, @@ -1087,7 +1083,6 @@ static void update_setup_slacks_and_criticalities(float crit_exponent, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info, t_placer_costs* costs) { - //Run STA to update slacks and adjusted/relaxed criticalities timing_info->update(); @@ -1113,7 +1108,6 @@ static void update_setup_slacks_and_criticalities(float crit_exponent, pin_timing_invalidator->reset(); } - /* Function which contains the inner loop of the simulated annealing */ static void placement_inner_loop(float t, int temp_num, diff --git a/vpr/src/place/timing_place.cpp b/vpr/src/place/timing_place.cpp index 3043c0e1089..f7d940dfd5f 100644 --- a/vpr/src/place/timing_place.cpp +++ b/vpr/src/place/timing_place.cpp @@ -29,7 +29,7 @@ PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist, cons } void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_info, float crit_exponent, bool recompute) { - //If the criticalities are not updated immediately after each time we call + //If the criticalities are not updated immediately after each time we call //timing_info->update(), then timing_info->pins_with_modified_setup_criticality() //cannot accurately account for all the pins that need to be updated. //In this case, we pass in recompute=true to update all criticalities from scratch. 
@@ -40,7 +40,7 @@ void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_inf if (!recompute && crit_exponent == last_crit_exponent_ && INCR_UPDATE_CRITICALITIES) { incr_update_criticalities(timing_info); } else { - recompute_criticalities(timing_info); + recompute_criticalities(); //Record new criticality exponent last_crit_exponent_ = crit_exponent; @@ -88,7 +88,7 @@ void PlacerCriticalities::incr_update_criticalities(const SetupTimingInfo* timin } } -void PlacerCriticalities::recompute_criticalities(const SetupTimingInfo* timing_info) { +void PlacerCriticalities::recompute_criticalities() { cluster_pins_with_modified_criticality_.clear(); //Non-incremental: all sink pins need updating @@ -117,14 +117,14 @@ PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, const Cl } void PlacerSetupSlacks::update_setup_slacks(const SetupTimingInfo* timing_info, bool recompute) { - //If the setup slacks are not updated immediately after each time we call + //If the setup slacks are not updated immediately after each time we call //timing_info->update(), then timing_info->pins_with_modified_setup_slack() //cannot accurately account for all the pins that need to be updated. //In this case, we pass in recompute=true to update all setup slacks from scratch. 
if (!recompute && INCR_UPDATE_SETUP_SLACKS) { incr_update_setup_slacks(timing_info); } else { - recompute_setup_slacks(timing_info); + recompute_setup_slacks(); } //Update the effected pins @@ -157,7 +157,7 @@ void PlacerSetupSlacks::incr_update_setup_slacks(const SetupTimingInfo* timing_i } } -void PlacerSetupSlacks::recompute_setup_slacks(const SetupTimingInfo* timing_info) { +void PlacerSetupSlacks::recompute_setup_slacks() { cluster_pins_with_modified_setup_slack_.clear(); //Non-incremental: all sink pins need updating diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing_place.h index fff1c6ab5f1..d37983730f5 100644 --- a/vpr/src/place/timing_place.h +++ b/vpr/src/place/timing_place.h @@ -82,9 +82,9 @@ class PlacerCriticalities { //Set of pins with criticaltites modified by last call to update_criticalities() vtr::vec_id_set cluster_pins_with_modified_criticality_; - //Updates criticalities: incremental V.S. from scratch + //Updates criticalities: incremental V.S. from scratch void incr_update_criticalities(const SetupTimingInfo* timing_info); - void recompute_criticalities(const SetupTimingInfo* timing_info); + void recompute_criticalities(); }; /* Usage @@ -120,7 +120,7 @@ class PlacerSetupSlacks { public: //Accessors //Returns the setup slack of the specified connection - float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slack_[net][ipin]; } + float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slacks_[net][ipin]; } //Returns the range of clustered netlist pins (i.e. ClusterPinIds) which were modified //by the last call to update_setup_slacks() @@ -142,9 +142,9 @@ class PlacerSetupSlacks { //Set of pins with criticaltites modified by last call to update_criticalities() vtr::vec_id_set cluster_pins_with_modified_setup_slack_; - //Updates setup slacks: incremental V.S. from scratch + //Updates setup slacks: incremental V.S. 
from scratch void incr_update_setup_slacks(const SetupTimingInfo* timing_info); - void recompute_setup_slacks(const SetupTimingInfo* timing_info); + void recompute_setup_slacks(); }; /* Usage diff --git a/vpr/src/timing/timing_util.cpp b/vpr/src/timing/timing_util.cpp index 6ad86f4ca43..5bff2ac8324 100644 --- a/vpr/src/timing/timing_util.cpp +++ b/vpr/src/timing/timing_util.cpp @@ -579,7 +579,7 @@ float calculate_clb_net_pin_criticality(const SetupTimingInfo& timing_info, cons return clb_pin_crit; } -//Return the slack of a net's pin in the CLB netlist +//Return the setup slack of a net's pin in the CLB netlist float calculate_clb_net_pin_setup_slack(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin) { //There may be multiple atom netlist pins connected to this CLB pin float clb_pin_setup_slack = std::numeric_limits::quiet_NaN(); diff --git a/vpr/src/timing/timing_util.h b/vpr/src/timing/timing_util.h index 87f6b86787b..682771e9763 100644 --- a/vpr/src/timing/timing_util.h +++ b/vpr/src/timing/timing_util.h @@ -183,6 +183,9 @@ class ClusteredPinTimingInvalidator { //Return the criticality of a net's pin in the CLB netlist float calculate_clb_net_pin_criticality(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin); +//Return the setup slack of a net's pin in the CLB netlist +float calculate_clb_net_pin_setup_slack(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin); + //Returns the worst (maximum) criticality of the set of slack tags specified. 
Requires the maximum //required time and worst slack for all domain pairs represent by the slack tags // From 63db2e165d8bf822c7c3b94d26a71597870bab1b Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 31 Jul 2020 00:41:10 -0400 Subject: [PATCH 04/21] Merged 3 update routines into 1 single routine --- vpr/src/place/place.cpp | 166 ++++++++++++++-------------------------- 1 file changed, 59 insertions(+), 107 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 81a32b4acee..2dfff1fd311 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -429,20 +429,10 @@ static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, int* outer_crit_iter_count, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info); -static void update_setup_slacks(PlacerSetupSlacks* setup_slacks, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info); - -static void update_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_costs* costs); - static void update_setup_slacks_and_criticalities(float crit_exponent, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, @@ -465,6 +455,7 @@ static void placement_inner_loop(float t, ClusteredPinTimingInvalidator* pin_timing_invalidator, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, SetupTimingInfo* timing_info); @@ -617,12 +608,15 @@ void try_place(const t_placer_opts& placer_opts, atom_ctx.lookup, *timing_info->timing_graph()); //Update timing and costs - update_criticalities(crit_exponent, - place_delay_model.get(), - 
placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); + do_update_criticalities = true; + do_update_setup_slacks = false; + update_setup_slacks_and_criticalities(crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &costs); timing_info->set_warn_unconstrained(false); //Don't warn again about unconstrained nodes again during placement @@ -779,6 +773,7 @@ void try_place(const t_placer_opts& placer_opts, &outer_crit_iter_count, place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), pin_timing_invalidator.get(), timing_info.get()); @@ -790,6 +785,7 @@ void try_place(const t_placer_opts& placer_opts, pin_timing_invalidator.get(), place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), *move_generator, blocks_affected, timing_info.get()); @@ -844,6 +840,7 @@ void try_place(const t_placer_opts& placer_opts, &outer_crit_iter_count, place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), pin_timing_invalidator.get(), timing_info.get()); @@ -859,6 +856,7 @@ void try_place(const t_placer_opts& placer_opts, pin_timing_invalidator.get(), place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), *move_generator, blocks_affected, timing_info.get()); @@ -911,12 +909,15 @@ void try_place(const t_placer_opts& placer_opts, VTR_ASSERT(timing_info); //Update timing and costs - update_criticalities(crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); + do_update_criticalities = true; + do_update_setup_slacks = false; + update_setup_slacks_and_criticalities(crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &costs); critical_path = 
timing_info->least_slack_critical_path(); @@ -978,6 +979,7 @@ static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, int* outer_crit_iter_count, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info) { if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE) @@ -993,13 +995,16 @@ static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, num_connections = std::max(num_connections, 1); //Avoid division by zero VTR_ASSERT(num_connections > 0); - //Update timing information - update_criticalities(crit_exponent, - delay_model, - criticalities, - pin_timing_invalidator, - timing_info, - costs); + //Update timing information and criticalities + do_update_criticalities = true; + do_update_setup_slacks = false; + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + costs); *outer_crit_iter_count = 0; } @@ -1012,70 +1017,9 @@ static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, prev_inverse_costs->timing_cost = min(1 / costs->timing_cost, MAX_INV_TIMING_COST); } -//Update timing information based on current placement by running STA -//and record the new setup slack information -static void update_setup_slacks(PlacerSetupSlacks* setup_slacks, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info) { - //Run STA to update slacks and adjusted/relaxed criticalities - timing_info->update(); - - //Update placer's setup slacks - setup_slacks->update_setup_slacks(timing_info, do_recompute_setup_slacks); - - //Setup slacks are now in sync with the timing_info - //Can perform incremental updates next time - do_recompute_setup_slacks = false; - - //Criticalities are now out of sync with the timing_info - //Must do from scratch recompute next time - 
do_recompute_criticalities = true; - - //Clear invalidation state - pin_timing_invalidator->reset(); -} - -//Update timing information based on current placement by running STA -//and calculate the updated criticalities and timing costs -//(based on the new setup slacks) -static void update_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_costs* costs) { - //Run STA to update slacks and adjusted/relaxed criticalities - timing_info->update(); - - //Update placer's criticalities (e.g. sharpen with crit_exponent) - criticalities->update_criticalities(timing_info, crit_exponent, do_recompute_criticalities); - - //Update connection, net and total timing costs based on new criticalities -#ifdef INCR_COMP_TD_COSTS - update_td_costs(delay_model, *criticalities, &costs->timing_cost); -#else - comp_td_costs(delay_model, *criticalities, &costs->timing_cost); -#endif - - //Criticalities are now in sync with the timing_info - //Can perform incremental updates next time - do_recompute_criticalities = false; - - //Setup slacks are now out of sync with the timing_info - //Must do from scratch recompute next time - do_recompute_setup_slacks = true; - - //Clear invalidation state - pin_timing_invalidator->reset(); -} - //Update timing information based on current placement by running STA. //Record the new slack information as well as calculate the updated //criticalities and timing costs (based on the new setup slacks) -// -//If both setup slacks and criticalities need to be updated, -//this routine should be called, instead of individual update routine. 
-//This is to prevent unnecessary from scratch updates static void update_setup_slacks_and_criticalities(float crit_exponent, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, @@ -1087,22 +1031,26 @@ static void update_setup_slacks_and_criticalities(float crit_exponent, timing_info->update(); //Update placer's setup slacks - setup_slacks->update_setup_slacks(timing_info, do_recompute_setup_slacks); + if (do_update_setup_slacks) { + setup_slacks->update_setup_slacks(timing_info, do_recompute_setup_slacks); + } - //Update placer's criticalities (e.g. sharpen with crit_exponent) - criticalities->update_criticalities(timing_info, crit_exponent, do_recompute_criticalities); + if (do_update_criticalities) { + //Update placer's criticalities (e.g. sharpen with crit_exponent) + criticalities->update_criticalities(timing_info, crit_exponent, do_recompute_criticalities); - //Update connection, net and total timing costs based on new criticalities + //Update connection, net and total timing costs based on new criticalities #ifdef INCR_COMP_TD_COSTS - update_td_costs(delay_model, *criticalities, &costs->timing_cost); + update_td_costs(delay_model, *criticalities, &costs->timing_cost); #else - comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); #endif + } - //Both Setup slacks and Criticalities are now in sync with the timing_info - //They can be both incrementally updated next time - do_recompute_setup_slacks = false; - do_recompute_criticalities = false; + //Setup slacks and Criticalities need to be in sync with the timing_info + //Otherwise, they cannot be incrementally updated on the next iteration + do_recompute_setup_slacks = !do_update_setup_slacks; + do_recompute_criticalities = !do_update_criticalities; //Clear invalidation state pin_timing_invalidator->reset(); @@ -1123,6 +1071,7 @@ static void placement_inner_loop(float t, ClusteredPinTimingInvalidator* 
pin_timing_invalidator, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, SetupTimingInfo* timing_info) { @@ -1180,12 +1129,15 @@ static void placement_inner_loop(float t, * criticalities and update the timing cost since it will change. */ //Update timing information - update_criticalities(crit_exponent, - delay_model, - criticalities, - pin_timing_invalidator, - timing_info, - costs); + do_update_criticalities = true; + do_update_setup_slacks = false; + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + costs); } inner_crit_iter_count++; } From e8f73c63494374356ada911f7a947670ea1cb16a Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 31 Jul 2020 00:50:38 -0400 Subject: [PATCH 05/21] Resolve more merge conflicts --- vpr/src/place/place.cpp | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 55eff6bf35d..a3ddf6dfd3f 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -782,7 +782,6 @@ void try_place(const t_placer_opts& placer_opts, costs.cost = 1; } -<<<<<<< HEAD outer_loop_update_criticalities(placer_opts, &costs, &prev_inverse_costs, num_connections, crit_exponent, @@ -792,16 +791,6 @@ void try_place(const t_placer_opts& placer_opts, placer_setup_slacks.get(), pin_timing_invalidator.get(), timing_info.get()); -======= - outer_loop_recompute_criticalities(placer_opts, &costs, &prev_inverse_costs, - num_connections, - state.crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get()); ->>>>>>> sync placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, state.move_lim, state.crit_exponent, inner_recompute_limit, &stats, @@ 
-850,8 +839,7 @@ void try_place(const t_placer_opts& placer_opts, auto pre_quench_timing_stats = timing_ctx.stats; { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); - -<<<<<<< HEAD + outer_loop_update_criticalities(placer_opts, &costs, &prev_inverse_costs, num_connections, @@ -862,17 +850,6 @@ void try_place(const t_placer_opts& placer_opts, placer_setup_slacks.get(), pin_timing_invalidator.get(), timing_info.get()); -======= - outer_loop_recompute_criticalities(placer_opts, &costs, - &prev_inverse_costs, - num_connections, - state.crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get()); ->>>>>>> sync state.t = 0; /* freeze out */ @@ -938,7 +915,6 @@ void try_place(const t_placer_opts& placer_opts, VTR_ASSERT(timing_info); //Update timing and costs -<<<<<<< HEAD do_update_criticalities = true; do_update_setup_slacks = false; update_setup_slacks_and_criticalities(crit_exponent, @@ -948,14 +924,6 @@ void try_place(const t_placer_opts& placer_opts, pin_timing_invalidator.get(), timing_info.get(), &costs); -======= - recompute_criticalities(state.crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); ->>>>>>> sync critical_path = timing_info->least_slack_critical_path(); From d80de58cb846a39ea5f8217ccd0af143e834f2cd Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 31 Jul 2020 01:54:22 -0400 Subject: [PATCH 06/21] Changed crit_exponent to first_crit_exponent/state.crit_exponent --- vpr/src/place/place.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index a3ddf6dfd3f..22debfa44a2 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -628,7 +628,7 @@ void try_place(const t_placer_opts& placer_opts, //Update timing and costs do_update_criticalities = true; 
do_update_setup_slacks = false; - update_setup_slacks_and_criticalities(crit_exponent, + update_setup_slacks_and_criticalities(first_crit_exponent, place_delay_model.get(), placer_criticalities.get(), placer_setup_slacks.get(), @@ -784,7 +784,7 @@ void try_place(const t_placer_opts& placer_opts, outer_loop_update_criticalities(placer_opts, &costs, &prev_inverse_costs, num_connections, - crit_exponent, + state.crit_exponent, &outer_crit_iter_count, place_delay_model.get(), placer_criticalities.get(), @@ -843,7 +843,7 @@ void try_place(const t_placer_opts& placer_opts, outer_loop_update_criticalities(placer_opts, &costs, &prev_inverse_costs, num_connections, - crit_exponent, + state.crit_exponent, &outer_crit_iter_count, place_delay_model.get(), placer_criticalities.get(), @@ -917,7 +917,7 @@ void try_place(const t_placer_opts& placer_opts, //Update timing and costs do_update_criticalities = true; do_update_setup_slacks = false; - update_setup_slacks_and_criticalities(crit_exponent, + update_setup_slacks_and_criticalities(state.crit_exponent, place_delay_model.get(), placer_criticalities.get(), placer_setup_slacks.get(), From 831df449f4931e0cd7a1202b4105fe505421d156 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 6 Aug 2020 02:18:19 -0400 Subject: [PATCH 07/21] Created a setup slack matrix that copies data from the PlacerSetupSlacks. The matrix update is incremental according to the pins with modified setup slacks returned from PlacerSetupSlacks. Outer loop routine now updates both setup slacks and criticalities, while the inner loop routine passes in variables that determine the strategies/cost functions used to evaluate the effectiveness of try_swap moves. 
--- vpr/src/place/place.cpp | 155 ++++++++++++++++++++++++++-------------- 1 file changed, 102 insertions(+), 53 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 22debfa44a2..c63599705a3 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -150,18 +150,20 @@ static vtr::vector bb_updated_before; * Net connection delays based on the placement. * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ -static ClbNetPinsMatrix connection_delay; //Delays based on commited block positions +static ClbNetPinsMatrix connection_delay; //Delays based on committed block positions static ClbNetPinsMatrix proposed_connection_delay; //Delays for proposed block positions (only // for connections effected by move, otherwise // INVALID_DELAY) +static ClbNetPinsMatrix connection_setup_slack; //Setup slacks based on most recently updated timing graph + /* * Timing cost of connections (i.e. criticality * delay). * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ -static PlacerTimingCosts connection_timing_cost; //Costs of commited block positions +static PlacerTimingCosts connection_timing_cost; //Costs of committed block positions static ClbNetPinsMatrix proposed_connection_timing_cost; //Costs for proposed block positions - // (only for connectsion effected by + // (only for connection effected by // move, otherwise INVALID_DELAY) /* @@ -386,6 +388,8 @@ static float comp_td_connection_delay(const PlaceDelayModel* delay_model, Cluste static void comp_td_connection_delays(const PlaceDelayModel* delay_model); +static void record_setup_slacks(const PlacerSetupSlacks* setup_slacks); + static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); @@ -439,17 +443,17 @@ static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr); static void free_try_swap_arrays(); -static void 
outer_loop_update_criticalities(const t_placer_opts& placer_opts, - t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - int num_connections, - float crit_exponent, - int* outer_crit_iter_count, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - PlacerSetupSlacks* setup_slacks, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info); +static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, + t_placer_costs* costs, + t_placer_prev_inverse_costs* prev_inverse_costs, + int num_connections, + float crit_exponent, + int* outer_crit_iter_count, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info); static void update_setup_slacks_and_criticalities(float crit_exponent, const PlaceDelayModel* delay_model, @@ -474,6 +478,8 @@ static void placement_inner_loop(float t, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, + bool inner_loop_update_crit, + bool inner_loop_update_setup_slack, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, SetupTimingInfo* timing_info); @@ -627,7 +633,7 @@ void try_place(const t_placer_opts& placer_opts, *timing_info->timing_graph()); //Update timing and costs do_update_criticalities = true; - do_update_setup_slacks = false; + do_update_setup_slacks = true; update_setup_slacks_and_criticalities(first_crit_exponent, place_delay_model.get(), placer_criticalities.get(), @@ -636,6 +642,9 @@ void try_place(const t_placer_opts& placer_opts, timing_info.get(), &costs); + //Initialize the setup slacks matrix + record_setup_slacks(placer_setup_slacks.get()); + timing_info->set_warn_unconstrained(false); //Don't warn again about unconstrained nodes again during placement critical_path = timing_info->least_slack_critical_path(); @@ 
-782,15 +791,17 @@ void try_place(const t_placer_opts& placer_opts, costs.cost = 1; } - outer_loop_update_criticalities(placer_opts, &costs, &prev_inverse_costs, - num_connections, - state.crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - placer_setup_slacks.get(), - pin_timing_invalidator.get(), - timing_info.get()); + outer_loop_update_timing_info(placer_opts, &costs, &prev_inverse_costs, + num_connections, + state.crit_exponent, + &outer_crit_iter_count, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get()); + + bool anneal_update_crit = true, anneal_update_setup_slack = false; placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, state.move_lim, state.crit_exponent, inner_recompute_limit, &stats, @@ -801,6 +812,8 @@ void try_place(const t_placer_opts& placer_opts, place_delay_model.get(), placer_criticalities.get(), placer_setup_slacks.get(), + anneal_update_crit, + anneal_update_setup_slack, *move_generator, blocks_affected, timing_info.get()); @@ -839,20 +852,23 @@ void try_place(const t_placer_opts& placer_opts, auto pre_quench_timing_stats = timing_ctx.stats; { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); - - outer_loop_update_criticalities(placer_opts, &costs, - &prev_inverse_costs, - num_connections, - state.crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - placer_setup_slacks.get(), - pin_timing_invalidator.get(), - timing_info.get()); + + outer_loop_update_timing_info(placer_opts, &costs, + &prev_inverse_costs, + num_connections, + state.crit_exponent, + &outer_crit_iter_count, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get()); state.t = 0; /* freeze out */ + //Analyze setup slacks for quench + bool quench_update_crit = true, 
quench_update_setup_slack = true; + /* Run inner loop again with temperature = 0 so as to accept only swaps * which reduce the cost of the placement */ placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, @@ -864,6 +880,8 @@ void try_place(const t_placer_opts& placer_opts, place_delay_model.get(), placer_criticalities.get(), placer_setup_slacks.get(), + quench_update_crit, + quench_update_setup_slack, *move_generator, blocks_affected, timing_info.get()); @@ -976,18 +994,18 @@ void try_place(const t_placer_opts& placer_opts, VTR_LOG("update_td_costs: connections %g nets %g sum_nets %g total %g\n", f_update_td_costs_connections_elapsed_sec, f_update_td_costs_nets_elapsed_sec, f_update_td_costs_sum_nets_elapsed_sec, f_update_td_costs_total_elapsed_sec); } -/* Function to update the criticalities before the inner loop of the annealing */ -static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, - t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - int num_connections, - float crit_exponent, - int* outer_crit_iter_count, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - PlacerSetupSlacks* setup_slacks, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info) { +/* Function to update the setup slacks and criticalities before the inner loop of the annealing/quench */ +static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, + t_placer_costs* costs, + t_placer_prev_inverse_costs* prev_inverse_costs, + int num_connections, + float crit_exponent, + int* outer_crit_iter_count, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info) { if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE) return; @@ -1003,7 +1021,7 @@ static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, 
//Update timing information and criticalities do_update_criticalities = true; - do_update_setup_slacks = false; + do_update_setup_slacks = true; update_setup_slacks_and_criticalities(crit_exponent, delay_model, criticalities, @@ -1011,6 +1029,8 @@ static void outer_loop_update_criticalities(const t_placer_opts& placer_opts, pin_timing_invalidator, timing_info, costs); + //Always record the setup slacks + record_setup_slacks(setup_slacks); *outer_crit_iter_count = 0; } @@ -1036,8 +1056,8 @@ static void update_setup_slacks_and_criticalities(float crit_exponent, //Run STA to update slacks and adjusted/relaxed criticalities timing_info->update(); - //Update placer's setup slacks if (do_update_setup_slacks) { + //Update placer's setup slacks setup_slacks->update_setup_slacks(timing_info, do_recompute_setup_slacks); } @@ -1053,7 +1073,7 @@ static void update_setup_slacks_and_criticalities(float crit_exponent, #endif } - //Setup slacks and Criticalities need to be in sync with the timing_info + //Setup slacks and criticalities need to be in sync with the timing_info //Otherwise, they cannot be incrementally updated on the next iteration do_recompute_setup_slacks = !do_update_setup_slacks; do_recompute_criticalities = !do_update_criticalities; @@ -1078,6 +1098,8 @@ static void placement_inner_loop(float t, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, + bool inner_loop_update_crit, + bool inner_loop_update_setup_slack, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, SetupTimingInfo* timing_info) { @@ -1135,8 +1157,8 @@ static void placement_inner_loop(float t, * criticalities and update the timing cost since it will change. 
*/ //Update timing information - do_update_criticalities = true; - do_update_setup_slacks = false; + do_update_criticalities = inner_loop_update_crit; + do_update_setup_slacks = inner_loop_update_setup_slack; update_setup_slacks_and_criticalities(crit_exponent, delay_model, criticalities, @@ -1144,6 +1166,17 @@ static void placement_inner_loop(float t, pin_timing_invalidator, timing_info, costs); + + //Currently, if we update the setup slacks within the inner loop + //We aim to evaluate moves based upon the cost functions + //related to these setup slacks + bool do_setup_slack_analysis = inner_loop_update_setup_slack; + if (do_setup_slack_analysis) { + //Currently, we accept these new setup slacks right away + //TODO: Consider situations where we reject the series of moves + //that lead to the current slack values. + record_setup_slacks(setup_slacks); + } } inner_crit_iter_count++; } @@ -1814,7 +1847,7 @@ static float comp_td_connection_delay(const PlaceDelayModel* delay_model, Cluste //Recompute all point to point delays, updating connection_delay static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { - auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& cluster_ctx = g_vpr_ctx.clustering(); for (auto net_id : cluster_ctx.clb_nlist.nets()) { for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) { @@ -1823,6 +1856,19 @@ static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { } } +//Copy all the current setup slacks from the PlacerSetupSlacks class +static void record_setup_slacks(const PlacerSetupSlacks* setup_slacks) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; + + //Only go through pins with modified setup slack + for (ClusterPinId pin_id : setup_slacks->pins_with_modified_setup_slack()) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); + + connection_setup_slack[net_id][pin_index_in_net] = 
setup_slacks->setup_slack(net_id, pin_index_in_net); + } +} + /* Update the connection_timing_cost values from the temporary * * values for all connections that have changed. */ static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { @@ -2144,6 +2190,8 @@ static void alloc_and_load_placement_structs(float place_cost_exp, connection_delay = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.f); proposed_connection_delay = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.f); + connection_setup_slack = make_net_pins_matrix(cluster_ctx.clb_nlist, std::numeric_limits::infinity()); + connection_timing_cost = PlacerTimingCosts(cluster_ctx.clb_nlist); proposed_connection_timing_cost = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.); net_timing_cost.resize(num_nets, 0.); @@ -2185,6 +2233,7 @@ static void free_placement_structs(const t_placer_opts& placer_opts) { if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { vtr::release_memory(connection_timing_cost); vtr::release_memory(connection_delay); + vtr::release_memory(connection_setup_slack); vtr::release_memory(proposed_connection_timing_cost); vtr::release_memory(proposed_connection_delay); From d329911beed081646bc13ea7acb0f5a23fdac330 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 6 Aug 2020 03:45:27 -0400 Subject: [PATCH 08/21] Provided more complete explanation for the record_setup_slacks routine. 
--- vpr/src/place/place.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index c63599705a3..5f83493b082 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -1029,7 +1029,7 @@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, pin_timing_invalidator, timing_info, costs); - //Always record the setup slacks + //Always record the setup slacks when they are updated record_setup_slacks(setup_slacks); *outer_crit_iter_count = 0; @@ -1167,14 +1167,15 @@ static void placement_inner_loop(float t, timing_info, costs); - //Currently, if we update the setup slacks within the inner loop - //We aim to evaluate moves based upon the cost functions + //Currently, if we update the setup slacks within the inner loop, + //we do so to evaluate moves based upon the cost functions //related to these setup slacks - bool do_setup_slack_analysis = inner_loop_update_setup_slack; - if (do_setup_slack_analysis) { - //Currently, we accept these new setup slacks right away - //TODO: Consider situations where we reject the series of moves - //that lead to the current slack values. + // + //If we do not update the setup slacks, we do not alter the values + //in the setup slacks matrix. Otherwise, the incremental update + //method of the routine record_setup_slacks will become dysfunctional. 
+ if (inner_loop_update_setup_slack) { + //TODO: add slack cost evaluation functions record_setup_slacks(setup_slacks); } } @@ -1857,6 +1858,8 @@ static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { } //Copy all the current setup slacks from the PlacerSetupSlacks class +//This routine will always be incremental and correct, as it is called +//if and only if the PlacerSetupSlacks class is updated with new slack values static void record_setup_slacks(const PlacerSetupSlacks* setup_slacks) { const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; From 225870e19d9fe2ef12462fa0a8927353693d008c Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 6 Aug 2020 06:01:29 -0400 Subject: [PATCH 09/21] Added placement snapshot functions that facilitate the reversion of a series of successful moves done by try_swap. Right now the data structures representing the state variables are directly being copied; however, the process can possibly be optimized with incremental techniques. The snapshot routines are called in the placement's inner loop, and should be used together with VPR options quench_recompute_divider and less optimally inner_recompute_divider. The latter would be too time consuming in practice.
--- vpr/src/place/place.cpp | 136 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 133 insertions(+), 3 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 5f83493b082..d8076ca083e 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -390,6 +390,8 @@ static void comp_td_connection_delays(const PlaceDelayModel* delay_model); static void record_setup_slacks(const PlacerSetupSlacks* setup_slacks); +static bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks); + static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); @@ -514,6 +516,20 @@ static void print_resources_utilization(); static void init_annealing_state(t_annealing_state* state, const t_annealing_sched& annealing_sched, float t, float rlim, int move_lim_max, float crit_exponent); +//Placement snapshot data structures. To be optimized. +static ClbNetPinsMatrix connection_delay_snapshot; +static PlacerTimingCosts connection_timing_cost_snapshot; +static vtr::vector bb_coords_snapshot, bb_num_on_edges_snapshot; +static vtr::vector net_cost_snapshot; +static vtr::vector bb_updated_before_snapshot; +static vtr::vector_map block_locs_snapshot; +static vtr::Matrix grid_blocks_snapshot; + +static void take_placement_snapshot(); +static void revert_placement_snapshot(ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info); + +static e_move_result do_setup_slack_cost_analysis(const PlacerSetupSlacks* setup_slacks); + /*****************************************************************************/ void try_place(const t_placer_opts& placer_opts, t_annealing_sched annealing_sched, @@ -849,6 +865,11 @@ void try_place(const t_placer_opts& placer_opts, } while (update_annealing_state(&state, success_rat, costs, placer_opts, annealing_sched)); /* Outer loop of the simmulated annealing ends */ + if (placer_opts.place_algorithm == 
PATH_TIMING_DRIVEN_PLACE) { + //Take a snapshot of the current placer before doing placement quench + take_placement_snapshot(); + } + auto pre_quench_timing_stats = timing_ctx.stats; { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); @@ -1175,8 +1196,39 @@ static void placement_inner_loop(float t, //in the setup slacks matrix. Otherwise, the incremental update //method of the routine record_setup_slacks will become dysfunctional. if (inner_loop_update_setup_slack) { - //TODO: add slack cost evaluation functions - record_setup_slacks(setup_slacks); + e_move_result slack_result = do_setup_slack_cost_analysis(setup_slacks); + + if (slack_result == ACCEPTED) { + //If accepted, update the setup slack matrix + //and take a snapshot of the current placement + record_setup_slacks(setup_slacks); + take_placement_snapshot(); + } else { + VTR_ASSERT(slack_result == REJECTED); + + //If rejected, undo all the moves since the last timing info update + //i.e., revert to the last placement snapshot + // + //Invalidate all the timing edges and do a new timing_info->update() + // + //Leave the setup slack matrix unchanged + revert_placement_snapshot(pin_timing_invalidator, timing_info); + + //Update timing information + do_update_criticalities = true; + do_update_setup_slacks = true; + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + costs); + + VTR_ASSERT_MSG( + verify_connection_setup_slacks(setup_slacks), + "The setup slacks should not change after reverting to the last placement snapshot and updating the timing info."); + } } } inner_crit_iter_count++; @@ -1212,6 +1264,13 @@ static void placement_inner_loop(float t, /* Inner loop ends */ } +//Evaluate if the new slack values are acceptable using weighted average cost functions +static e_move_result do_setup_slack_cost_analysis(const PlacerSetupSlacks* setup_slacks) { + //TODO: implement the cost functions + 
int num = rand() % 2; + return num ? ACCEPTED : REJECTED; +} + static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, @@ -1863,7 +1922,7 @@ static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { static void record_setup_slacks(const PlacerSetupSlacks* setup_slacks) { const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - //Only go through pins with modified setup slack + //Only go through sink pins with modified setup slack for (ClusterPinId pin_id : setup_slacks->pins_with_modified_setup_slack()) { ClusterNetId net_id = clb_nlist.pin_net(pin_id); size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); @@ -1872,6 +1931,21 @@ static void record_setup_slacks(const PlacerSetupSlacks* setup_slacks) { } } +static bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; + + //Go through every single sink pin + for (ClusterNetId net_id : clb_nlist.nets()) { + for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { + if (connection_setup_slack[net_id][ipin] != setup_slacks->setup_slack(net_id, ipin)) { + return false; + } + } + } + + return true; +} + /* Update the connection_timing_cost values from the temporary * * values for all connections that have changed. 
*/ static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { @@ -3104,3 +3178,59 @@ static void init_annealing_state(t_annealing_state* state, bool placer_needs_lookahead(const t_vpr_setup& vpr_setup) { return (vpr_setup.PlacerOpts.place_algorithm == PATH_TIMING_DRIVEN_PLACE); } + +//Recording down all the info about the placer's current state +static void take_placement_snapshot() { + const auto& place_ctx = g_vpr_ctx.placement(); + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + const auto& clb_nlist = cluster_ctx.clb_nlist; + + connection_delay_snapshot = connection_delay; + //Go through every single sink pin to check if delay has been updated + for (ClusterNetId net_id : clb_nlist.nets()) { + for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { + VTR_ASSERT_MSG(connection_delay[net_id][ipin] == connection_delay_snapshot[net_id][ipin], + "Direct assignment of the delay has failed"); + } + } + + connection_timing_cost_snapshot = connection_timing_cost; + bb_coords_snapshot = bb_coords; + bb_num_on_edges_snapshot = bb_num_on_edges; + net_cost_snapshot = net_cost; + bb_updated_before_snapshot = bb_updated_before; + + block_locs_snapshot = place_ctx.block_locs; + grid_blocks_snapshot = place_ctx.grid_blocks; +} + +//Revert back to the recorded placer state, which is the state +//of the placer when the last timing info update took place +static void revert_placement_snapshot(ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info) { + auto& place_ctx = g_vpr_ctx.mutable_placement(); + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + const auto& clb_nlist = cluster_ctx.clb_nlist; + + //Go through every single sink pin to check if delay has changed + for (ClusterNetId net_id : clb_nlist.nets()) { + for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { + if (connection_delay[net_id][ipin] != connection_delay_snapshot[net_id][ipin]) { + //Delay changed, must invalidate + 
ClusterPinId pin_id = clb_nlist.net_pin(net_id, ipin); + pin_tedges_invalidator->invalidate_connection(pin_id, timing_info); + connection_delay[net_id][ipin] = connection_delay_snapshot[net_id][ipin]; + } + } + } + + connection_timing_cost = connection_timing_cost_snapshot; + bb_coords = bb_coords_snapshot; + bb_num_on_edges = bb_num_on_edges_snapshot; + net_cost = net_cost_snapshot; + bb_updated_before = bb_updated_before_snapshot; + + place_ctx.block_locs = block_locs_snapshot; + place_ctx.grid_blocks = grid_blocks_snapshot; +} \ No newline at end of file From 0e01ed7e68b2c0589771a918d2e54fbc0a28e9a5 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 6 Aug 2020 07:33:09 -0400 Subject: [PATCH 10/21] Implemented do_setup_slack_cost_analysis: softmax of negative slacks --- vpr/src/place/place.cpp | 63 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index d8076ca083e..1972c79cc80 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -1266,9 +1266,66 @@ static void placement_inner_loop(float t, //Evaluate if the new slack values are acceptable using weighted average cost functions static e_move_result do_setup_slack_cost_analysis(const PlacerSetupSlacks* setup_slacks) { - //TODO: implement the cost functions - int num = rand() % 2; - return num ? ACCEPTED : REJECTED; + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + const auto& clb_nlist = cluster_ctx.clb_nlist; + + //Aggregating the total negative slack. 
Skip pins with positive slacks + float total_negative_slack = 0.f; + std::vector pins_with_negative_slack; + size_t num_pins_with_negative_slack; + + for (ClusterPinId pin_id : setup_slacks->pins_with_modified_setup_slack()) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); + + if (connection_setup_slack[net_id][pin_index_in_net] < 0) { + pins_with_negative_slack.push_back(pin_id); + ++num_pins_with_negative_slack; + total_negative_slack += connection_setup_slack[net_id][pin_index_in_net]; + } + } + + //Variables for storing weights and values + float weight, frac_changed; + float total_cost = 0.f; + + std::ofstream osa("a.out", std::ofstream::app); + std::ofstream osb("b.out", std::ofstream::app); + + for (ClusterPinId pin_id : pins_with_negative_slack) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); + + //The slack values in PlacerSetupSlacks have not been updated to connection_setup_slack + //These values are in the proposed state: they might be accepted or rejected + float proposed_setup_slack = setup_slacks->setup_slack(net_id, pin_index_in_net); + float original_setup_slack = connection_setup_slack[net_id][pin_index_in_net]; + + //The worse the slack of a pin, the more weight it is given + //Currently, first normalize, then apply the Softmax function, + //which takes the exponential of the opposite value of the + //normalized slack value and then normalize again. More negative + //slacks should take on a much larger weight. + weight = std::exp(original_setup_slack / total_negative_slack); + osa << weight << ' '; + + //The fraction by which the slack value has changed. + //Positive->good, negative->bad. 
+ frac_changed = (proposed_setup_slack - original_setup_slack) / original_setup_slack; + osb << frac_changed << ' '; + + //Using minus due to the definition of cost: lower cost is better + total_cost -= frac_changed * weight; + } + osa << '\n'; + osb << '\n'; + osa.close(); + osb.close(); + + //Currently, as long as the total cost is negative, the moves + //by the try_swap routine are accepted. + return total_cost < 0 ? ACCEPTED : REJECTED; } static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, From 2e212dcbe5be359f389f28784a30f295d8301915 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 11 Aug 2020 06:11:26 -0400 Subject: [PATCH 11/21] Added single move reversion for setup slack analysis (rather than taking placement snapshots). Currently experiencing consistency failures. Also updated slack analysis cost function: comparing the worst slack change across all modified clb pins --- vpr/src/base/vpr_types.h | 3 +- vpr/src/place/place.cpp | 403 ++++++++++++++++++--------------------- 2 files changed, 185 insertions(+), 221 deletions(-) diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 1e40ccfaef2..34f08d250f6 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -851,7 +851,8 @@ struct t_annealing_sched { * doPlacement: true if placement is supposed to be done in the CAD flow, false otherwise */ enum e_place_algorithm { BOUNDING_BOX_PLACE, - PATH_TIMING_DRIVEN_PLACE + PATH_TIMING_DRIVEN_PLACE, + SETUP_SLACK_ANALYSIS_PLACE }; enum e_place_effort_scaling { diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 1972c79cc80..258c23875aa 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -331,15 +331,17 @@ static void update_move_nets(int num_nets_affected); static void reset_move_nets(int num_nets_affected); static e_move_result try_swap(float t, + float crit_exponent, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, float rlim, MoveGenerator&
move_generator, - TimingInfo* timing_info, + SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, float rlim_escape_fraction, enum e_place_algorithm place_algorithm, float timing_tradeoff); @@ -357,14 +359,16 @@ static int check_placement_consistency(); static int check_block_placement_consistency(); static int check_macro_placement_consistency(); -static float starting_t(t_placer_costs* costs, +static float starting_t(float crit_exponent, + t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, t_annealing_sched annealing_sched, int max_moves, float rlim, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, - TimingInfo* timing_info, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + SetupTimingInfo* timing_info, MoveGenerator& move_generator, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, @@ -388,7 +392,7 @@ static float comp_td_connection_delay(const PlaceDelayModel* delay_model, Cluste static void comp_td_connection_delays(const PlaceDelayModel* delay_model); -static void record_setup_slacks(const PlacerSetupSlacks* setup_slacks); +static void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks); static bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks); @@ -396,7 +400,7 @@ static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); -static void invalidate_affected_connection_delays(const t_pl_blocks_to_be_moved& blocks_affected, +static void invalidate_affected_connection_delays(const std::vector& sink_pins_affected, ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info); @@ 
-410,6 +414,9 @@ static double comp_td_connection_cost(const PlaceDelayModel* delay_mode, const P static double sum_td_net_cost(ClusterNetId net); static double sum_td_costs(); +static void find_affected_sink_pins(const t_pl_blocks_to_be_moved& blocks_affected, + std::vector& sink_pins_affected); + static e_move_result assess_swap(double delta_c, double t); static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new); @@ -484,7 +491,8 @@ static void placement_inner_loop(float t, bool inner_loop_update_setup_slack, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, - SetupTimingInfo* timing_info); + SetupTimingInfo* timing_info, + enum e_place_algorithm place_algorithm); static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, const PlaceDelayModel* delay_model, @@ -516,19 +524,7 @@ static void print_resources_utilization(); static void init_annealing_state(t_annealing_state* state, const t_annealing_sched& annealing_sched, float t, float rlim, int move_lim_max, float crit_exponent); -//Placement snapshot data structures. To be optimized. 
-static ClbNetPinsMatrix connection_delay_snapshot; -static PlacerTimingCosts connection_timing_cost_snapshot; -static vtr::vector bb_coords_snapshot, bb_num_on_edges_snapshot; -static vtr::vector net_cost_snapshot; -static vtr::vector bb_updated_before_snapshot; -static vtr::vector_map block_locs_snapshot; -static vtr::Matrix grid_blocks_snapshot; - -static void take_placement_snapshot(); -static void revert_placement_snapshot(ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info); - -static e_move_result do_setup_slack_cost_analysis(const PlacerSetupSlacks* setup_slacks); +static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks); /*****************************************************************************/ void try_place(const t_placer_opts& placer_opts, @@ -647,9 +643,11 @@ void try_place(const t_placer_opts& placer_opts, atom_ctx.nlist, atom_ctx.lookup, *timing_info->timing_graph()); - //Update timing and costs + //First time compute timing and costs, compute from scratch do_update_criticalities = true; do_update_setup_slacks = true; + do_recompute_criticalities = true; + do_recompute_setup_slacks = true; update_setup_slacks_and_criticalities(first_crit_exponent, place_delay_model.get(), placer_criticalities.get(), @@ -659,7 +657,7 @@ void try_place(const t_placer_opts& placer_opts, &costs); //Initialize the setup slacks matrix - record_setup_slacks(placer_setup_slacks.get()); + commit_setup_slacks(placer_setup_slacks.get()); timing_info->set_warn_unconstrained(false); //Don't warn again about unconstrained nodes again during placement @@ -774,10 +772,11 @@ void try_place(const t_placer_opts& placer_opts, first_rlim = (float)max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); - float first_t = starting_t(&costs, &prev_inverse_costs, + float first_t = starting_t(first_crit_exponent, &costs, &prev_inverse_costs, annealing_sched, move_lim, first_rlim, place_delay_model.get(), placer_criticalities.get(), + 
placer_setup_slacks.get(), timing_info.get(), *move_generator, pin_timing_invalidator.get(), @@ -832,7 +831,8 @@ void try_place(const t_placer_opts& placer_opts, anneal_update_setup_slack, *move_generator, blocks_affected, - timing_info.get()); + timing_info.get(), + placer_opts.place_algorithm); tot_iter += state.move_lim; @@ -865,11 +865,6 @@ void try_place(const t_placer_opts& placer_opts, } while (update_annealing_state(&state, success_rat, costs, placer_opts, annealing_sched)); /* Outer loop of the simmulated annealing ends */ - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - //Take a snapshot of the current placer before doing placement quench - take_placement_snapshot(); - } - auto pre_quench_timing_stats = timing_ctx.stats; { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); @@ -905,7 +900,8 @@ void try_place(const t_placer_opts& placer_opts, quench_update_setup_slack, *move_generator, blocks_affected, - timing_info.get()); + timing_info.get(), + e_place_algorithm::SETUP_SLACK_ANALYSIS_PLACE); tot_iter += move_lim; ++num_temps; @@ -1050,8 +1046,9 @@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, pin_timing_invalidator, timing_info, costs); - //Always record the setup slacks when they are updated - record_setup_slacks(setup_slacks); + + //Always commit the setup slacks when they are updated + commit_setup_slacks(setup_slacks); *outer_crit_iter_count = 0; } @@ -1123,7 +1120,8 @@ static void placement_inner_loop(float t, bool inner_loop_update_setup_slack, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, - SetupTimingInfo* timing_info) { + SetupTimingInfo* timing_info, + enum e_place_algorithm place_algorithm) { int inner_crit_iter_count, inner_iter; int inner_placement_save_count = 0; //How many times have we dumped placement to a file this temperature? 
@@ -1138,15 +1136,16 @@ static void placement_inner_loop(float t, /* Inner loop begins */ for (inner_iter = 0; inner_iter < move_lim; inner_iter++) { - e_move_result swap_result = try_swap(t, costs, prev_inverse_costs, rlim, + e_move_result swap_result = try_swap(t, crit_exponent, costs, prev_inverse_costs, rlim, move_generator, timing_info, pin_timing_invalidator, blocks_affected, delay_model, criticalities, + setup_slacks, placer_opts.rlim_escape_fraction, - placer_opts.place_algorithm, + place_algorithm, placer_opts.timing_tradeoff); if (swap_result == ACCEPTED) { @@ -1188,47 +1187,9 @@ static void placement_inner_loop(float t, timing_info, costs); - //Currently, if we update the setup slacks within the inner loop, - //we do so to evaluate moves based upon the cost functions - //related to these setup slacks - // - //If we do not update the setup slacks, we do not alter the values - //in the setup slacks matrix. Otherwise, the incremental update - //method of the routine record_setup_slacks will become dysfunctional. 
- if (inner_loop_update_setup_slack) { - e_move_result slack_result = do_setup_slack_cost_analysis(setup_slacks); - - if (slack_result == ACCEPTED) { - //If accepted, update the setup slack matrix - //and take a snapshot of the current placement - record_setup_slacks(setup_slacks); - take_placement_snapshot(); - } else { - VTR_ASSERT(slack_result == REJECTED); - - //If rejected, undo all the moves since the last timing info update - //i.e., revert to the last placement snapshot - // - //Invalidate all the timing edges and do a new timing_info->update() - // - //Leave the setup slack matrix unchanged - revert_placement_snapshot(pin_timing_invalidator, timing_info); - - //Update timing information - do_update_criticalities = true; - do_update_setup_slacks = true; - update_setup_slacks_and_criticalities(crit_exponent, - delay_model, - criticalities, - setup_slacks, - pin_timing_invalidator, - timing_info, - costs); - - VTR_ASSERT_MSG( - verify_connection_setup_slacks(setup_slacks), - "The setup slacks should not change after reverting to the last placement snapshot and updating the timing info."); - } + //Commit the setup slacks if they are updated + if (do_update_setup_slacks) { + commit_setup_slacks(setup_slacks); } } inner_crit_iter_count++; @@ -1264,68 +1225,33 @@ static void placement_inner_loop(float t, /* Inner loop ends */ } -//Evaluate if the new slack values are acceptable using weighted average cost functions -static e_move_result do_setup_slack_cost_analysis(const PlacerSetupSlacks* setup_slacks) { +static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks) { const auto& cluster_ctx = g_vpr_ctx.clustering(); - const auto& clb_nlist = cluster_ctx.clb_nlist; - //Aggregating the total negative slack. 
Skip pins with positive slacks - float total_negative_slack = 0.f; - std::vector pins_with_negative_slack; - size_t num_pins_with_negative_slack; + //Find the original/proposed setup slacks of pins with modified values + std::vector original_setup_slacks, proposed_setup_slacks; - for (ClusterPinId pin_id : setup_slacks->pins_with_modified_setup_slack()) { - ClusterNetId net_id = clb_nlist.pin_net(pin_id); - size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); + for (ClusterPinId clb_pin : setup_slacks->pins_with_modified_setup_slack()) { + ClusterNetId net_id = clb_nlist.pin_net(clb_pin); + size_t ipin = clb_nlist.pin_net_index(clb_pin); - if (connection_setup_slack[net_id][pin_index_in_net] < 0) { - pins_with_negative_slack.push_back(pin_id); - ++num_pins_with_negative_slack; - total_negative_slack += connection_setup_slack[net_id][pin_index_in_net]; - } + original_setup_slacks.push_back(connection_setup_slack[net_id][ipin]); + proposed_setup_slacks.push_back(setup_slacks->setup_slack(net_id, ipin)); } - //Variables for storing weights and values - float weight, frac_changed; - float total_cost = 0.f; - - std::ofstream osa("a.out", std::ofstream::app); - std::ofstream osb("b.out", std::ofstream::app); + //If there are no pins with modified slack values, accept this move + //by returning an arbitrary negative number + if (original_setup_slacks.empty()) { + return -1; + } - for (ClusterPinId pin_id : pins_with_negative_slack) { - ClusterNetId net_id = clb_nlist.pin_net(pin_id); - size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); + //Sort in ascending order, from worse slack value to best + std::sort(original_setup_slacks.begin(), original_setup_slacks.end()); + std::sort(proposed_setup_slacks.begin(), proposed_setup_slacks.end()); - //The slack values in PlacerSetupSlacks have not been updated to connection_setup_slack - //These values are in the proposed state: they might be accepted or rejected - float proposed_setup_slack = 
setup_slacks->setup_slack(net_id, pin_index_in_net); - float original_setup_slack = connection_setup_slack[net_id][pin_index_in_net]; - - //The worse the slack of a pin, the more weight it is given - //Currently, first normalize, then apply the Softmax function, - //which takes the exponential of the opposite value of the - //normalized slack value and then normalize again. More negative - //slacks should take on a much larger weight. - weight = std::exp(original_setup_slack / total_negative_slack); - osa << weight << ' '; - - //The fraction by which the slack value has changed. - //Positive->good, negative->bad. - frac_changed = (proposed_setup_slack - original_setup_slack) / original_setup_slack; - osb << frac_changed << ' '; - - //Using minus due to the definition of cost: lower cost is better - total_cost -= frac_changed * weight; - } - osa << '\n'; - osb << '\n'; - osa.close(); - osb.close(); - - //Currently, as long as the total cost is negative, the moves - //by the try_swap routine are accepted. - return total_cost < 0 ? 
ACCEPTED : REJECTED; + //Compare if the worse slack value has gotten worse or better + return original_setup_slacks.front() - proposed_setup_slacks.front(); } static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, @@ -1470,14 +1396,16 @@ static bool update_annealing_state(t_annealing_state* state, return true; } -static float starting_t(t_placer_costs* costs, +static float starting_t(float crit_exponent, + t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, t_annealing_sched annealing_sched, int max_moves, float rlim, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, - TimingInfo* timing_info, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + SetupTimingInfo* timing_info, MoveGenerator& move_generator, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, @@ -1501,13 +1429,19 @@ static float starting_t(t_placer_costs* costs, /* Try one move per block. Set t high so essentially all accepted. 
*/ for (i = 0; i < move_lim; i++) { - e_move_result swap_result = try_swap(HUGE_POSITIVE_FLOAT, costs, prev_inverse_costs, rlim, + //Will not deploy setup slack analysis, so omit crit_exponenet and setup_slack + e_move_result swap_result = try_swap(HUGE_POSITIVE_FLOAT, + crit_exponent, + costs, + prev_inverse_costs, + rlim, move_generator, timing_info, pin_timing_invalidator, blocks_affected, delay_model, criticalities, + setup_slacks, placer_opts.rlim_escape_fraction, placer_opts.place_algorithm, placer_opts.timing_tradeoff); @@ -1572,15 +1506,17 @@ static void reset_move_nets(int num_nets_affected) { } static e_move_result try_swap(float t, + float crit_exponent, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, float rlim, MoveGenerator& move_generator, - TimingInfo* timing_info, + SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, float rlim_escape_fraction, enum e_place_algorithm place_algorithm, float timing_tradeoff) { @@ -1643,14 +1579,44 @@ static e_move_result try_swap(float t, //Update the block positions apply_move_blocks(blocks_affected); - // Find all the nets affected by this swap and update their costs + //Find all the nets affected by this swap and update their costs + //This routine calculates new connection delays and timing costs + //and store them in proposed_* data structures int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, delay_model, criticalities, blocks_affected, bb_delta_c, timing_delta_c); - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + + //Find all the sink pins with changed connection delays from the affected blocks + //These sink pins will be passed into the pin_timing_invalidator for sta update + std::vector sink_pins_affected; + 
find_affected_sink_pins(blocks_affected, sink_pins_affected); + + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + //Invalidates timing of modified connections for incremental timing updates + //This routine relies on comparing proposed_connection_delay and connection_delay + invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); + + //Update timing information. Only update setup slacks. + //Keep the connection criticalities and timing costs stale + //so as not to mess up the original timing driven algorithm + do_update_criticalities = false; + do_update_setup_slacks = true; + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + costs); + + delta_c = analyze_setup_slack_cost(setup_slacks); + + } else if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { /*in this case we redefine delta_c as a combination of timing and bb. * *additionally, we normalize all values, therefore delta_c is in * *relation to 1*/ @@ -1668,18 +1634,32 @@ static e_move_result try_swap(float t, costs->cost += delta_c; costs->bb_cost += bb_delta_c; + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + costs->timing_cost += timing_delta_c; + + //Commit the setup slack information + commit_setup_slacks(setup_slacks); + + //Update the connection_timing_cost and connection_delay + //values from the temporary values. + //The connections have already been invalidated and updated + //during the previous analysis stage. 
+ commit_td_cost(blocks_affected); + } + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { costs->timing_cost += timing_delta_c; //Invalidates timing of modified connections for incremental timing updates - //Must be called before commit_td_cost since it relies on comparing - //proposed_connection_delay and connection_delay - invalidate_affected_connection_delays(blocks_affected, + //This routine relies on comparing proposed_connection_delay and connection_delay + //If the setup slack analysis was not performed, the + //sink pins are yet to be invalidated. + invalidate_affected_connection_delays(sink_pins_affected, pin_timing_invalidator, timing_info); - /*update the connection_timing_cost and connection_delay - * values from the temporary values */ + //update the connection_timing_cost and connection_delay + //values from the temporary values commit_td_cost(blocks_affected); } @@ -1696,6 +1676,30 @@ static e_move_result try_swap(float t, /* Restore the place_ctx.block_locs data structures to their state before the move. */ revert_move_blocks(blocks_affected); + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + //Re-invalidate the affected sink pins + invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); + + /* Blocks are restored. 
Now Restore the timing information to pre-analysis state */ + do_update_criticalities = false; + do_update_setup_slacks = true; + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + costs); + /* + * VTR_ASSERT_MSG( + * verify_connection_setup_slacks(setup_slacks), + * "The setup slacks should not change after reverting to state before the timing info update."); + */ + revert_td_cost(blocks_affected); + } + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { revert_td_cost(blocks_affected); } @@ -1872,6 +1876,29 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, } } +static void find_affected_sink_pins(const t_pl_blocks_to_be_moved& blocks_affected, + std::vector& sink_pins_affected) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& clb_nlist = cluster_ctx.clb_nlist; + + for (ClusterPinId clb_pin : blocks_affected.affected_pins) { + //It is possible that some connections may not have changed delay.(e.g. + //For instance, if using a dx/dy delay model, this could occur if a sink + //moved to a new position with the same dx/dy from it's driver. + // + //To minimize work during the incremental STA update we do not invalidate + //such unchanged connections. + + ClusterNetId net = clb_nlist.pin_net(clb_pin); + int ipin = clb_nlist.pin_net_index(clb_pin); + + if (proposed_connection_delay[net][ipin] != connection_delay[net][ipin]) { + //Delay has changed. Must invalidate this sink pin. + sink_pins_affected.push_back(clb_pin); + } + } +} + static e_move_result assess_swap(double delta_c, double t) { /* Returns: 1 -> move accepted, 0 -> rejected. 
*/ if (delta_c <= 0) { @@ -1973,13 +2000,15 @@ static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { } } -//Copy all the current setup slacks from the PlacerSetupSlacks class -//This routine will always be incremental and correct, as it is called -//if and only if the PlacerSetupSlacks class is updated with new slack values -static void record_setup_slacks(const PlacerSetupSlacks* setup_slacks) { +//Commit all the setup slack values from the PlacerSetupSlacks class. +//This routine will be incremental and correct if and only if +//it is called immediately after each time +//update_setup_slacks_and_criticalities updates the setup slacks +//i.e. do_update_setup_slacks = true +static void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks) { const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - //Only go through sink pins with modified setup slack + //Incremental: only go through sink pins with modified setup slack for (ClusterPinId pin_id : setup_slacks->pins_with_modified_setup_slack()) { ClusterNetId net_id = clb_nlist.pin_net(pin_id); size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); @@ -2068,31 +2097,21 @@ static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { // //Relies on proposed_connection_delay and connection_delay to detect //which connections have actually had their delay changed. -static void invalidate_affected_connection_delays(const t_pl_blocks_to_be_moved& blocks_affected, +static void invalidate_affected_connection_delays(const std::vector& sink_pins_affected, ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info) { VTR_ASSERT_SAFE(timing_info); VTR_ASSERT_SAFE(pin_tedges_invalidator); - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& clb_nlist = cluster_ctx.clb_nlist; - - //Inalidate timing graph edges affected by the move - for (ClusterPinId pin : blocks_affected.affected_pins) { - //It is possible that some connections may not have changed delay.(e.g. 
+ //Invalidate timing graph edges affected by the move + for (ClusterPinId clb_pin : sink_pins_affected) { + //It is possible that some connections may not have changed delay. //For instance, if using a dx/dy delay model, this could occur if a sink //moved to a new position with the same dx/dy from it's driver. // - //To minimze work during the incremental STA update we do not invalidate + //To minimize work during the incremental STA update we do not invalidate //such unchanged connections. - - ClusterNetId net = clb_nlist.pin_net(pin); - int ipin = clb_nlist.pin_net_index(pin); - - if (proposed_connection_delay[net][ipin] != connection_delay[net][ipin]) { - //Delay changed, must invalidate - pin_tedges_invalidator->invalidate_connection(pin, timing_info); - } + pin_tedges_invalidator->invalidate_connection(clb_pin, timing_info); } } @@ -3235,59 +3254,3 @@ static void init_annealing_state(t_annealing_state* state, bool placer_needs_lookahead(const t_vpr_setup& vpr_setup) { return (vpr_setup.PlacerOpts.place_algorithm == PATH_TIMING_DRIVEN_PLACE); } - -//Recording down all the info about the placer's current state -static void take_placement_snapshot() { - const auto& place_ctx = g_vpr_ctx.placement(); - const auto& cluster_ctx = g_vpr_ctx.clustering(); - - const auto& clb_nlist = cluster_ctx.clb_nlist; - - connection_delay_snapshot = connection_delay; - //Go through every single sink pin to check if delay has been updated - for (ClusterNetId net_id : clb_nlist.nets()) { - for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { - VTR_ASSERT_MSG(connection_delay[net_id][ipin] == connection_delay_snapshot[net_id][ipin], - "Direct assignment of the delay has failed"); - } - } - - connection_timing_cost_snapshot = connection_timing_cost; - bb_coords_snapshot = bb_coords; - bb_num_on_edges_snapshot = bb_num_on_edges; - net_cost_snapshot = net_cost; - bb_updated_before_snapshot = bb_updated_before; - - block_locs_snapshot = place_ctx.block_locs; - 
grid_blocks_snapshot = place_ctx.grid_blocks; -} - -//Revert back to the recorded placer state, which is the state -//of the placer when the last timing info update took place -static void revert_placement_snapshot(ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info) { - auto& place_ctx = g_vpr_ctx.mutable_placement(); - const auto& cluster_ctx = g_vpr_ctx.clustering(); - - const auto& clb_nlist = cluster_ctx.clb_nlist; - - //Go through every single sink pin to check if delay has changed - for (ClusterNetId net_id : clb_nlist.nets()) { - for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { - if (connection_delay[net_id][ipin] != connection_delay_snapshot[net_id][ipin]) { - //Delay changed, must invalidate - ClusterPinId pin_id = clb_nlist.net_pin(net_id, ipin); - pin_tedges_invalidator->invalidate_connection(pin_id, timing_info); - connection_delay[net_id][ipin] = connection_delay_snapshot[net_id][ipin]; - } - } - } - - connection_timing_cost = connection_timing_cost_snapshot; - bb_coords = bb_coords_snapshot; - bb_num_on_edges = bb_num_on_edges_snapshot; - net_cost = net_cost_snapshot; - bb_updated_before = bb_updated_before_snapshot; - - place_ctx.block_locs = block_locs_snapshot; - place_ctx.grid_blocks = grid_blocks_snapshot; -} \ No newline at end of file From 96e65ba3fc6ccffd6191da0736fb256fd36d8fb3 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 14 Aug 2020 05:33:03 -0400 Subject: [PATCH 12/21] Corrected the timing update and reversion of setup slack analysis during the placement quench stage. Made commit_td_cost method incremental by only going through sink pins affected by the moved blocks. 
--- vpr/src/place/place.cpp | 261 +++++++++++++++++++++------------------- 1 file changed, 135 insertions(+), 126 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 258c23875aa..1d1f17832f5 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -119,6 +119,14 @@ struct t_annealing_state { int move_lim; // Current move limit }; +/* Determines if slacks/criticalities need to be updated */ +static bool do_update_criticalities = true; +static bool do_update_setup_slacks = true; + +/* Determines if slacks/criticalities need to be recomputed from scratch */ +bool do_recompute_criticalities = true; +bool do_recompute_setup_slacks = true; + constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); constexpr double MAX_INV_TIMING_COST = 1.e9; @@ -182,14 +190,6 @@ static vtr::vector net_timing_cost; //Like connection_timi static vtr::vector bb_coords, bb_num_on_edges; -/* Determines if slacks/criticalities need to be updated */ -static bool do_update_criticalities = true; -static bool do_update_setup_slacks = true; - -/* Determines if slacks/criticalities need to be recomputed from scratch */ -static bool do_recompute_criticalities = true; -static bool do_recompute_setup_slacks = true; - /* The arrays below are used to precompute the inverse of the average * * number of tracks per channel between [subhigh] and [sublow]. Access * * them as chan?_place_cost_fac[subhigh][sublow]. 
They are used to * @@ -417,6 +417,8 @@ static double sum_td_costs(); static void find_affected_sink_pins(const t_pl_blocks_to_be_moved& blocks_affected, std::vector& sink_pins_affected); +static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks); + static e_move_result assess_swap(double delta_c, double t); static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new); @@ -487,8 +489,6 @@ static void placement_inner_loop(float t, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, - bool inner_loop_update_crit, - bool inner_loop_update_setup_slack, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, SetupTimingInfo* timing_info, @@ -524,8 +524,6 @@ static void print_resources_utilization(); static void init_annealing_state(t_annealing_state* state, const t_annealing_sched& annealing_sched, float t, float rlim, int move_lim_max, float crit_exponent); -static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks); - /*****************************************************************************/ void try_place(const t_placer_opts& placer_opts, t_annealing_sched annealing_sched, @@ -648,6 +646,16 @@ void try_place(const t_placer_opts& placer_opts, do_update_setup_slacks = true; do_recompute_criticalities = true; do_recompute_setup_slacks = true; + + //As a safety measure, for the first time update, + //invalidate all timing edges via the pin invalidator. 
+ auto& clb_nlist = cluster_ctx.clb_nlist; + for (ClusterNetId net : clb_nlist.nets()) { + for (ClusterPinId pin : clb_nlist.net_sinks(net)) { + pin_timing_invalidator.get()->invalidate_connection(pin, timing_info.get()); + } + } + update_setup_slacks_and_criticalities(first_crit_exponent, place_delay_model.get(), placer_criticalities.get(), @@ -816,8 +824,6 @@ void try_place(const t_placer_opts& placer_opts, pin_timing_invalidator.get(), timing_info.get()); - bool anneal_update_crit = true, anneal_update_setup_slack = false; - placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, state.move_lim, state.crit_exponent, inner_recompute_limit, &stats, &costs, @@ -827,8 +833,6 @@ void try_place(const t_placer_opts& placer_opts, place_delay_model.get(), placer_criticalities.get(), placer_setup_slacks.get(), - anneal_update_crit, - anneal_update_setup_slack, *move_generator, blocks_affected, timing_info.get(), @@ -882,8 +886,14 @@ void try_place(const t_placer_opts& placer_opts, state.t = 0; /* freeze out */ - //Analyze setup slacks for quench - bool quench_update_crit = true, quench_update_setup_slack = true; + //Use setup slack analysis if the placer is timing driven + //TODO: make this a command line option to turn on slack analysis + enum e_place_algorithm quench_algorithm; + if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + quench_algorithm = SETUP_SLACK_ANALYSIS_PLACE; + } else { + quench_algorithm = BOUNDING_BOX_PLACE; + } /* Run inner loop again with temperature = 0 so as to accept only swaps * which reduce the cost of the placement */ @@ -896,12 +906,10 @@ void try_place(const t_placer_opts& placer_opts, place_delay_model.get(), placer_criticalities.get(), placer_setup_slacks.get(), - quench_update_crit, - quench_update_setup_slack, *move_generator, blocks_affected, timing_info.get(), - e_place_algorithm::SETUP_SLACK_ANALYSIS_PLACE); + quench_algorithm); tot_iter += move_lim; ++num_temps; @@ -1116,8 +1124,6 @@ static void 
placement_inner_loop(float t, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, - bool inner_loop_update_crit, - bool inner_loop_update_setup_slack, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, SetupTimingInfo* timing_info, @@ -1136,7 +1142,11 @@ static void placement_inner_loop(float t, /* Inner loop begins */ for (inner_iter = 0; inner_iter < move_lim; inner_iter++) { - e_move_result swap_result = try_swap(t, crit_exponent, costs, prev_inverse_costs, rlim, + e_move_result swap_result = try_swap(t, + crit_exponent, + costs, + prev_inverse_costs, + rlim, move_generator, timing_info, pin_timing_invalidator, @@ -1162,7 +1172,7 @@ static void placement_inner_loop(float t, num_swap_rejected++; } - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE || place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { /* Do we want to re-timing analyze the circuit to get updated slack and criticality values? * We do this only once in a while, since it is expensive. */ @@ -1177,8 +1187,8 @@ static void placement_inner_loop(float t, * criticalities and update the timing cost since it will change. 
*/ //Update timing information - do_update_criticalities = inner_loop_update_crit; - do_update_setup_slacks = inner_loop_update_setup_slack; + do_update_criticalities = true; + do_update_setup_slacks = true; update_setup_slacks_and_criticalities(crit_exponent, delay_model, criticalities, @@ -1186,11 +1196,8 @@ static void placement_inner_loop(float t, pin_timing_invalidator, timing_info, costs); - - //Commit the setup slacks if they are updated - if (do_update_setup_slacks) { - commit_setup_slacks(setup_slacks); - } + //Commit the setup slacks + commit_setup_slacks(setup_slacks); } inner_crit_iter_count++; } @@ -1225,35 +1232,6 @@ static void placement_inner_loop(float t, /* Inner loop ends */ } -static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks) { - const auto& cluster_ctx = g_vpr_ctx.clustering(); - const auto& clb_nlist = cluster_ctx.clb_nlist; - - //Find the original/proposed setup slacks of pins with modified values - std::vector original_setup_slacks, proposed_setup_slacks; - - for (ClusterPinId clb_pin : setup_slacks->pins_with_modified_setup_slack()) { - ClusterNetId net_id = clb_nlist.pin_net(clb_pin); - size_t ipin = clb_nlist.pin_net_index(clb_pin); - - original_setup_slacks.push_back(connection_setup_slack[net_id][ipin]); - proposed_setup_slacks.push_back(setup_slacks->setup_slack(net_id, ipin)); - } - - //If there are no pins with modified slack values, accept this move - //by returning an arbitrary negative number - if (original_setup_slacks.empty()) { - return -1; - } - - //Sort in ascending order, from worse slack value to best - std::sort(original_setup_slacks.begin(), original_setup_slacks.end()); - std::sort(proposed_setup_slacks.begin(), proposed_setup_slacks.end()); - - //Compare if the worse slack value has gotten worse or better - return original_setup_slacks.front() - proposed_setup_slacks.front(); -} - static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, const PlaceDelayModel* delay_model, 
const PlacerCriticalities* criticalities, @@ -1426,11 +1404,12 @@ static float starting_t(float crit_exponent, av = 0.; sum_of_squares = 0.; - /* Try one move per block. Set t high so essentially all accepted. */ + /* Try one move per block. Set the temperature high so essentially all accepted. */ + float t = HUGE_POSITIVE_FLOAT; for (i = 0; i < move_lim; i++) { //Will not deploy setup slack analysis, so omit crit_exponenet and setup_slack - e_move_result swap_result = try_swap(HUGE_POSITIVE_FLOAT, + e_move_result swap_result = try_swap(t, crit_exponent, costs, prev_inverse_costs, @@ -1582,6 +1561,8 @@ static e_move_result try_swap(float t, //Find all the nets affected by this swap and update their costs //This routine calculates new connection delays and timing costs //and store them in proposed_* data structures + //This routine also calculates the wiring cost, which doesn't + //depend on the timing driven data int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, delay_model, criticalities, @@ -1589,8 +1570,9 @@ static e_move_result try_swap(float t, bb_delta_c, timing_delta_c); - //Find all the sink pins with changed connection delays from the affected blocks - //These sink pins will be passed into the pin_timing_invalidator for sta update + //Find all the sink pins with changed connection delays from the affected blocks. + //These sink pins will be passed into the pin_timing_invalidator for timing update. + //They will also be added to the pin invalidator when we wish to revert a timing update. std::vector sink_pins_affected; find_affected_sink_pins(blocks_affected, sink_pins_affected); @@ -1601,9 +1583,18 @@ static e_move_result try_swap(float t, pin_timing_invalidator, timing_info); - //Update timing information. Only update setup slacks. 
- //Keep the connection criticalities and timing costs stale - //so as not to mess up the original timing driven algorithm + //Update the connection_timing_cost and connection_delay + //values from the temporary values. + commit_td_cost(blocks_affected); + + //Update timing information. Since we are analyzing setup slacks, + //we only update those values and keep the criticalities stale + //so as not to interfere with the original timing driven algorithm. + // + //Note: the timing info must be called after applying block moves + //and committing the timing driven delays and costs. + //If we wish to revert this timing update due to move rejection, + //we need to revert block moves and restore the timing values. do_update_criticalities = false; do_update_setup_slacks = true; update_setup_slacks_and_criticalities(crit_exponent, @@ -1614,6 +1605,8 @@ static e_move_result try_swap(float t, timing_info, costs); + /* Get the setup slack analysis cost */ + //TODO: calculate a weighted average of the slack cost and wiring cost delta_c = analyze_setup_slack_cost(setup_slacks); } else if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { @@ -1623,7 +1616,8 @@ static e_move_result try_swap(float t, delta_c = (1 - timing_tradeoff) * bb_delta_c * prev_inverse_costs->bb_cost + timing_tradeoff * timing_delta_c * prev_inverse_costs->timing_cost; - } else { + + } else { //place_algorithm == BOUNDING_BOX_PLACE (wiring cost) delta_c = bb_delta_c; } @@ -1635,16 +1629,12 @@ static e_move_result try_swap(float t, costs->bb_cost += bb_delta_c; if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + /* Update the timing driven cost as usual */ costs->timing_cost += timing_delta_c; //Commit the setup slack information + //The timing delay and cost values should be committed already commit_setup_slacks(setup_slacks); - - //Update the connection_timing_cost and connection_delay - //values from the temporary values. 
- //The connections have already been invalidated and updated - //during the previous analysis stage. - commit_td_cost(blocks_affected); } if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { @@ -1669,20 +1659,27 @@ static e_move_result try_swap(float t, /* Update clb data structures since we kept the move. */ commit_move_blocks(blocks_affected); - } else { /* Move was rejected. */ - /* Reset the net cost function flags first. */ + } else { //move_outcome == REJECTED + + /* Reset the net cost function flags first. */ reset_move_nets(num_nets_affected); /* Restore the place_ctx.block_locs data structures to their state before the move. */ revert_move_blocks(blocks_affected); if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + //Revert the timing delays and costs to pre-update values + //These routines must be called after reverting the block moves + //TODO: make this process incremental + comp_td_connection_delays(delay_model); + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + //Re-invalidate the affected sink pins invalidate_affected_connection_delays(sink_pins_affected, pin_timing_invalidator, timing_info); - /* Blocks are restored. 
Now Restore the timing information to pre-analysis state */ + /* Revert the timing update */ do_update_criticalities = false; do_update_setup_slacks = true; update_setup_slacks_and_criticalities(crit_exponent, @@ -1692,15 +1689,14 @@ static e_move_result try_swap(float t, pin_timing_invalidator, timing_info, costs); - /* - * VTR_ASSERT_MSG( - * verify_connection_setup_slacks(setup_slacks), - * "The setup slacks should not change after reverting to state before the timing info update."); - */ - revert_td_cost(blocks_affected); + + VTR_ASSERT_SAFE_MSG( + verify_connection_setup_slacks(setup_slacks), + "The current setup slacks should be identical to the values before the try swap timing info update."); } if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + /* Unstage the values stored in proposed_* data structures */ revert_td_cost(blocks_affected); } } @@ -1728,7 +1724,7 @@ static e_move_result try_swap(float t, check_place(*costs, delay_model, place_algorithm); #endif - return (move_outcome); + return move_outcome; } //Puts all the nets changed by the current swap into nets_to_update, @@ -1768,7 +1764,7 @@ static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm //once per net, not once per pin. 
update_net_bb(net_id, blocks_affected, iblk, blk, blk_pin); - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE || place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { //Determine the change in timing costs if required update_td_delta_costs(delay_model, *criticalities, net_id, blk_pin, blocks_affected, timing_delta_c); } @@ -1899,6 +1895,41 @@ static void find_affected_sink_pins(const t_pl_blocks_to_be_moved& blocks_affect } } +static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& clb_nlist = cluster_ctx.clb_nlist; + + //Find the original/proposed setup slacks of pins with modified values + std::vector original_setup_slacks, proposed_setup_slacks; + + auto clb_pins_modified = setup_slacks->pins_with_modified_setup_slack(); + for (ClusterPinId clb_pin : clb_pins_modified) { + ClusterNetId net_id = clb_nlist.pin_net(clb_pin); + size_t ipin = clb_nlist.pin_net_index(clb_pin); + + original_setup_slacks.push_back(connection_setup_slack[net_id][ipin]); + proposed_setup_slacks.push_back(setup_slacks->setup_slack(net_id, ipin)); + } + + //Sort in ascending order, from worse slack value to best + std::sort(original_setup_slacks.begin(), original_setup_slacks.end()); + std::sort(proposed_setup_slacks.begin(), proposed_setup_slacks.end()); + + //Check the first pair of slack values that are different + //If found, return their difference + for (size_t idiff = 0; idiff < original_setup_slacks.size(); ++idiff) { + float slack_diff = original_setup_slacks[idiff] != proposed_setup_slacks[idiff]; + + if (slack_diff != 0) { + return slack_diff; + } + } + + //If all slack values are identical(or no modified slack values), + //reject this move by returning an arbitrary positive number as cost + return 1; +} + static e_move_result assess_swap(double delta_c, double t) { /* Returns: 1 -> move accepted, 0 -> rejected. 
*/ if (delta_c <= 0) { @@ -2001,15 +2032,15 @@ static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { } //Commit all the setup slack values from the PlacerSetupSlacks class. -//This routine will be incremental and correct if and only if -//it is called immediately after each time -//update_setup_slacks_and_criticalities updates the setup slacks -//i.e. do_update_setup_slacks = true +//This incremental routine will be correct if and only if it is called +//immediately after each time update_setup_slacks_and_criticalities +//updates the setup slacks (i.e. do_update_setup_slacks = true) static void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks) { const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; //Incremental: only go through sink pins with modified setup slack - for (ClusterPinId pin_id : setup_slacks->pins_with_modified_setup_slack()) { + auto clb_pins_modified = setup_slacks->pins_with_modified_setup_slack(); + for (ClusterPinId pin_id : clb_pins_modified) { ClusterNetId net_id = clb_nlist.pin_net(pin_id); size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); @@ -2020,7 +2051,7 @@ static void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks) { static bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks) { const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - //Go through every single sink pin + //Go through every single sink pin to check that the slack values are the same for (ClusterNetId net_id : clb_nlist.nets()) { for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { if (connection_setup_slack[net_id][ipin] != setup_slacks->setup_slack(net_id, ipin)) { @@ -2033,44 +2064,22 @@ static bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks } /* Update the connection_timing_cost values from the temporary * - * values for all connections that have changed. */ + * values for all connections that have changed. 
*/ static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& clb_nlist = cluster_ctx.clb_nlist; - /* Go through all the blocks moved. */ - for (int iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) { - ClusterBlockId bnum = blocks_affected.moved_blocks[iblk].block_num; - for (ClusterPinId pin_id : cluster_ctx.clb_nlist.block_pins(bnum)) { - ClusterNetId net_id = cluster_ctx.clb_nlist.pin_net(pin_id); - - if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) - continue; - - if (cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::DRIVER) { - //This net is being driven by a moved block, recompute - //all point to point connections on this net. - for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) { - connection_delay[net_id][ipin] = proposed_connection_delay[net_id][ipin]; - proposed_connection_delay[net_id][ipin] = INVALID_DELAY; - connection_timing_cost[net_id][ipin] = proposed_connection_timing_cost[net_id][ipin]; - proposed_connection_timing_cost[net_id][ipin] = INVALID_DELAY; - } - } else { - //This pin is a net sink on a moved block - VTR_ASSERT_SAFE(cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::SINK); - - /* The following "if" prevents the value from being updated twice. 
*/ - if (!driven_by_moved_block(net_id, blocks_affected)) { - int net_pin = cluster_ctx.clb_nlist.pin_net_index(pin_id); + //Go through all the sink pins affected + for (ClusterPinId pin_id : blocks_affected.affected_pins) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + int ipin = clb_nlist.pin_net_index(pin_id); - connection_delay[net_id][net_pin] = proposed_connection_delay[net_id][net_pin]; - proposed_connection_delay[net_id][net_pin] = INVALID_DELAY; - connection_timing_cost[net_id][net_pin] = proposed_connection_timing_cost[net_id][net_pin]; - proposed_connection_timing_cost[net_id][net_pin] = INVALID_DELAY; - } - } - } /* Finished going through all the pins in the moved block */ - } /* Finished going through all the blocks moved */ + //Commit the timing delay and cost values + connection_delay[net_id][ipin] = proposed_connection_delay[net_id][ipin]; + proposed_connection_delay[net_id][ipin] = INVALID_DELAY; + connection_timing_cost[net_id][ipin] = proposed_connection_timing_cost[net_id][ipin]; + proposed_connection_timing_cost[net_id][ipin] = INVALID_DELAY; + } } //Reverts modifications to proposed_connection_delay and proposed_connection_timing_cost based on From 112bde57b797764dc9694bfbcb64ec92dcfde9e5 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 14 Aug 2020 07:07:47 -0400 Subject: [PATCH 13/21] Moved four boolean global variables controlling the timing update into a new local structure called t_placer_timing_update_mode to tidy up the code. 
--- vpr/src/place/place.cpp | 187 ++++++++++++++++++++++++++-------------- 1 file changed, 122 insertions(+), 65 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 1d1f17832f5..b6045c3139c 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -119,13 +119,15 @@ struct t_annealing_state { int move_lim; // Current move limit }; -/* Determines if slacks/criticalities need to be updated */ -static bool do_update_criticalities = true; -static bool do_update_setup_slacks = true; - -/* Determines if slacks/criticalities need to be recomputed from scratch */ -bool do_recompute_criticalities = true; -bool do_recompute_setup_slacks = true; +struct t_placer_timing_update_mode { + /* Determines if slacks/criticalities need to be updated */ + bool do_update_criticalities; + bool do_update_setup_slacks; + + /* Determines if slacks/criticalities need to be recomputed from scratch */ + bool do_recompute_criticalities; + bool do_recompute_setup_slacks; +}; constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); @@ -332,6 +334,7 @@ static void reset_move_nets(int num_nets_affected); static e_move_result try_swap(float t, float crit_exponent, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, float rlim, @@ -360,6 +363,7 @@ static int check_block_placement_consistency(); static int check_macro_placement_consistency(); static float starting_t(float crit_exponent, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, t_annealing_sched annealing_sched, @@ -455,6 +459,7 @@ static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr); static void free_try_swap_arrays(); static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, int 
num_connections, @@ -466,12 +471,22 @@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info); +static void initialize_timing_info(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs); + static void update_setup_slacks_and_criticalities(float crit_exponent, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs); static void placement_inner_loop(float t, @@ -482,6 +497,7 @@ static void placement_inner_loop(float t, float crit_exponent, int inner_recompute_limit, t_placer_statistics* stats, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, int* moves_since_cost_recompute, @@ -569,6 +585,7 @@ void try_place(const t_placer_opts& placer_opts, std::unique_ptr pin_timing_invalidator; t_pl_blocks_to_be_moved blocks_affected(cluster_ctx.clb_nlist.blocks().size()); + t_placer_timing_update_mode timing_update_mode; /* Allocated here because it goes into timing critical code where each memory allocation is expensive */ IntraLbPbPinLookup pb_gpin_lookup(device_ctx.logical_block_types); @@ -642,32 +659,14 @@ void try_place(const t_placer_opts& placer_opts, atom_ctx.lookup, *timing_info->timing_graph()); //First time compute timing and costs, compute from scratch - do_update_criticalities = true; - do_update_setup_slacks = true; - do_recompute_criticalities = true; - do_recompute_setup_slacks = true; - - //As a safety measure, for the first time update, - 
//invalidate all timing edges via the pin invalidator. - auto& clb_nlist = cluster_ctx.clb_nlist; - for (ClusterNetId net : clb_nlist.nets()) { - for (ClusterPinId pin : clb_nlist.net_sinks(net)) { - pin_timing_invalidator.get()->invalidate_connection(pin, timing_info.get()); - } - } - - update_setup_slacks_and_criticalities(first_crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - placer_setup_slacks.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); - - //Initialize the setup slacks matrix - commit_setup_slacks(placer_setup_slacks.get()); - - timing_info->set_warn_unconstrained(false); //Don't warn again about unconstrained nodes again during placement + initialize_timing_info(first_crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &timing_update_mode, + &costs); critical_path = timing_info->least_slack_critical_path(); @@ -780,7 +779,8 @@ void try_place(const t_placer_opts& placer_opts, first_rlim = (float)max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); - float first_t = starting_t(first_crit_exponent, &costs, &prev_inverse_costs, + float first_t = starting_t(first_crit_exponent, &timing_update_mode, + &costs, &prev_inverse_costs, annealing_sched, move_lim, first_rlim, place_delay_model.get(), placer_criticalities.get(), @@ -814,7 +814,8 @@ void try_place(const t_placer_opts& placer_opts, costs.cost = 1; } - outer_loop_update_timing_info(placer_opts, &costs, &prev_inverse_costs, + outer_loop_update_timing_info(placer_opts, &timing_update_mode, + &costs, &prev_inverse_costs, num_connections, state.crit_exponent, &outer_crit_iter_count, @@ -826,8 +827,7 @@ void try_place(const t_placer_opts& placer_opts, placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, state.move_lim, state.crit_exponent, inner_recompute_limit, &stats, - &costs, - &prev_inverse_costs, + &timing_update_mode, &costs, 
&prev_inverse_costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), place_delay_model.get(), @@ -873,8 +873,8 @@ void try_place(const t_placer_opts& placer_opts, { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); - outer_loop_update_timing_info(placer_opts, &costs, - &prev_inverse_costs, + outer_loop_update_timing_info(placer_opts, &timing_update_mode, + &costs, &prev_inverse_costs, num_connections, state.crit_exponent, &outer_crit_iter_count, @@ -899,8 +899,7 @@ void try_place(const t_placer_opts& placer_opts, * which reduce the cost of the placement */ placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, move_lim, state.crit_exponent, quench_recompute_limit, &stats, - &costs, - &prev_inverse_costs, + &timing_update_mode, &costs, &prev_inverse_costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), place_delay_model.get(), @@ -958,16 +957,19 @@ void try_place(const t_placer_opts& placer_opts, VTR_ASSERT(timing_info); //Update timing and costs - do_update_criticalities = true; - do_update_setup_slacks = false; + timing_update_mode.do_update_criticalities = true; + timing_update_mode.do_update_setup_slacks = true; update_setup_slacks_and_criticalities(state.crit_exponent, place_delay_model.get(), placer_criticalities.get(), placer_setup_slacks.get(), pin_timing_invalidator.get(), timing_info.get(), + &timing_update_mode, &costs); + commit_setup_slacks(placer_setup_slacks.get()); + critical_path = timing_info->least_slack_critical_path(); if (isEchoFileEnabled(E_ECHO_FINAL_PLACEMENT_TIMING_GRAPH)) { @@ -1021,6 +1023,7 @@ void try_place(const t_placer_opts& placer_opts, /* Function to update the setup slacks and criticalities before the inner loop of the annealing/quench */ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, int num_connections, @@ -1031,8 +1034,9 
@@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, PlacerSetupSlacks* setup_slacks, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info) { - if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE) + if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE) { return; + } /*at each temperature change we update these values to be used */ /*for normalizing the tradeoff between timing and wirelength (bb) */ @@ -1044,15 +1048,16 @@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, num_connections = std::max(num_connections, 1); //Avoid division by zero VTR_ASSERT(num_connections > 0); - //Update timing information and criticalities - do_update_criticalities = true; - do_update_setup_slacks = true; + //Update all timing information + timing_update_mode->do_update_criticalities = true; + timing_update_mode->do_update_setup_slacks = true; update_setup_slacks_and_criticalities(crit_exponent, delay_model, criticalities, setup_slacks, pin_timing_invalidator, timing_info, + timing_update_mode, costs); //Always commit the setup slacks when they are updated @@ -1069,6 +1074,50 @@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, prev_inverse_costs->timing_cost = min(1 / costs->timing_cost, MAX_INV_TIMING_COST); } +static void initialize_timing_info(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& clb_nlist = cluster_ctx.clb_nlist; + + //Initialize the timing update mode. 
Update both + //setup slacks and criticalities from scratch + timing_update_mode->do_update_criticalities = true; + timing_update_mode->do_update_setup_slacks = true; + timing_update_mode->do_recompute_criticalities = true; + timing_update_mode->do_recompute_setup_slacks = true; + + //As a safety measure, for the first time update, + //invalidate all timing edges via the pin invalidator + //by passing in all the clb sink pins + for (ClusterNetId net_id : clb_nlist.nets()) { + for (ClusterPinId pin_id : clb_nlist.net_sinks(net_id)) { + pin_timing_invalidator->invalidate_connection(pin_id, timing_info); + } + } + + //Perform timing info update + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + //Initialize the data structure that stores committed placer setup slacks + commit_setup_slacks(setup_slacks); + + //Don't warn again about unconstrained nodes again during placement + timing_info->set_warn_unconstrained(false); +} + //Update timing information based on current placement by running STA. 
//Record the new slack information as well as calculate the updated //criticalities and timing costs (based on the new setup slacks) @@ -1078,18 +1127,19 @@ static void update_setup_slacks_and_criticalities(float crit_exponent, PlacerSetupSlacks* setup_slacks, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs) { //Run STA to update slacks and adjusted/relaxed criticalities timing_info->update(); - if (do_update_setup_slacks) { + if (timing_update_mode->do_update_setup_slacks) { //Update placer's setup slacks - setup_slacks->update_setup_slacks(timing_info, do_recompute_setup_slacks); + setup_slacks->update_setup_slacks(timing_info, timing_update_mode->do_recompute_setup_slacks); } - if (do_update_criticalities) { + if (timing_update_mode->do_update_criticalities) { //Update placer's criticalities (e.g. sharpen with crit_exponent) - criticalities->update_criticalities(timing_info, crit_exponent, do_recompute_criticalities); + criticalities->update_criticalities(timing_info, crit_exponent, timing_update_mode->do_recompute_criticalities); //Update connection, net and total timing costs based on new criticalities #ifdef INCR_COMP_TD_COSTS @@ -1099,10 +1149,11 @@ static void update_setup_slacks_and_criticalities(float crit_exponent, #endif } - //Setup slacks and criticalities need to be in sync with the timing_info - //Otherwise, they cannot be incrementally updated on the next iteration - do_recompute_setup_slacks = !do_update_setup_slacks; - do_recompute_criticalities = !do_update_criticalities; + //Setup slacks and criticalities need to be in sync with the timing_info. + //if they are to be incrementally updated on the next iteration. + //Otherwise, a re-computation for all clb sink pins is required. 
+ timing_update_mode->do_recompute_setup_slacks = !timing_update_mode->do_update_setup_slacks; + timing_update_mode->do_recompute_criticalities = !timing_update_mode->do_update_criticalities; //Clear invalidation state pin_timing_invalidator->reset(); @@ -1117,6 +1168,7 @@ static void placement_inner_loop(float t, float crit_exponent, int inner_recompute_limit, t_placer_statistics* stats, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, int* moves_since_cost_recompute, @@ -1144,6 +1196,7 @@ static void placement_inner_loop(float t, for (inner_iter = 0; inner_iter < move_lim; inner_iter++) { e_move_result swap_result = try_swap(t, crit_exponent, + timing_update_mode, costs, prev_inverse_costs, rlim, @@ -1172,7 +1225,7 @@ static void placement_inner_loop(float t, num_swap_rejected++; } - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE || place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { /* Do we want to re-timing analyze the circuit to get updated slack and criticality values? * We do this only once in a while, since it is expensive. */ @@ -1184,19 +1237,20 @@ static void placement_inner_loop(float t, VTR_LOG("Inner loop recompute criticalities\n"); #endif /* Using the delays in connection_delay, do a timing analysis to update slacks and - * criticalities and update the timing cost since it will change. + * criticalities and update the timing cost since they will change. 
*/ - //Update timing information - do_update_criticalities = true; - do_update_setup_slacks = true; + timing_update_mode->do_update_criticalities = true; + timing_update_mode->do_update_setup_slacks = true; update_setup_slacks_and_criticalities(crit_exponent, delay_model, criticalities, setup_slacks, pin_timing_invalidator, timing_info, + timing_update_mode, costs); - //Commit the setup slacks + + //Always commit the setup slacks when they are updated commit_setup_slacks(setup_slacks); } inner_crit_iter_count++; @@ -1375,6 +1429,7 @@ static bool update_annealing_state(t_annealing_state* state, } static float starting_t(float crit_exponent, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, t_annealing_sched annealing_sched, @@ -1411,6 +1466,7 @@ static float starting_t(float crit_exponent, //Will not deploy setup slack analysis, so omit crit_exponenet and setup_slack e_move_result swap_result = try_swap(t, crit_exponent, + timing_update_mode, costs, prev_inverse_costs, rlim, @@ -1486,6 +1542,7 @@ static void reset_move_nets(int num_nets_affected) { static e_move_result try_swap(float t, float crit_exponent, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, float rlim, @@ -1595,14 +1652,15 @@ static e_move_result try_swap(float t, //and committing the timing driven delays and costs. //If we wish to revert this timing update due to move rejection, //we need to revert block moves and restore the timing values. 
- do_update_criticalities = false; - do_update_setup_slacks = true; + timing_update_mode->do_update_criticalities = false; + timing_update_mode->do_update_setup_slacks = true; update_setup_slacks_and_criticalities(crit_exponent, delay_model, criticalities, setup_slacks, pin_timing_invalidator, timing_info, + timing_update_mode, costs); /* Get the setup slack analysis cost */ @@ -1680,14 +1738,13 @@ static e_move_result try_swap(float t, timing_info); /* Revert the timing update */ - do_update_criticalities = false; - do_update_setup_slacks = true; update_setup_slacks_and_criticalities(crit_exponent, delay_model, criticalities, setup_slacks, pin_timing_invalidator, timing_info, + timing_update_mode, costs); VTR_ASSERT_SAFE_MSG( From 29b55a317c7299cafc612304c6356608efcfbe08 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 20 Aug 2020 17:22:01 -0400 Subject: [PATCH 14/21] Added vpr option --place_quench_metric to turn on/off setup slack analysis during placement quench. Possible options are: auto, timing_cost, setup_slack. 
--- vpr/src/base/SetupVPR.cpp | 2 ++ vpr/src/base/read_options.cpp | 46 +++++++++++++++++++++++++++++++++++ vpr/src/base/read_options.h | 1 + vpr/src/base/vpr_types.h | 7 ++++++ vpr/src/place/place.cpp | 29 ++++++++++++++++------ 5 files changed, 78 insertions(+), 7 deletions(-) diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index c10609e6857..546fdc2f029 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -570,6 +570,8 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts) PlacerOpts->effort_scaling = Options.place_effort_scaling; PlacerOpts->timing_update_type = Options.timing_update_type; + + PlacerOpts->place_quench_metric = Options.place_quench_metric; } static void SetupAnalysisOpts(const t_options& Options, t_analysis_opts& analysis_opts) { diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index b40e867f672..76bd56d0126 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -959,6 +959,41 @@ struct ParseTimingUpdateType { } }; +struct ParsePlaceQuenchMetric { + ConvertedValue from_str(std::string str) { + ConvertedValue conv_value; + if (str == "auto") + conv_value.set_value(e_place_quench_metric::AUTO); + else if (str == "timing_cost") + conv_value.set_value(e_place_quench_metric::TIMING_COST); + else if (str == "setup_slack") + conv_value.set_value(e_place_quench_metric::SETUP_SLACK); + else { + std::stringstream msg; + msg << "Invalid conversion from '" << str << "' to e_place_quench_metric (expected one of: " << argparse::join(default_choices(), ", ") << ")"; + conv_value.set_error(msg.str()); + } + return conv_value; + } + + ConvertedValue to_str(e_place_quench_metric val) { + ConvertedValue conv_value; + if (val == e_place_quench_metric::AUTO) + conv_value.set_value("auto"); + if (val == e_place_quench_metric::TIMING_COST) + conv_value.set_value("timing_cost"); + else { + VTR_ASSERT(val == e_place_quench_metric::SETUP_SLACK); + 
conv_value.set_value("setup_slack"); + } + return conv_value; + } + + std::vector default_choices() { + return {"auto", "timing_cost", "setup_slack"}; + } +}; + argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& args) { std::string description = "Implements the specified circuit onto the target FPGA architecture" @@ -1747,6 +1782,17 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .default_value("") .show_in(argparse::ShowIn::HELP_ONLY); + place_timing_grp.add_argument(args.place_quench_metric, "--place_quench_metric") + .help( + "Controls which cost function the placer uses during the quench stage:\n" + " * auto: VPR decides\n" + " * timing_cost: The same cost formulation as the one used during\n" + " the annealing stage (more stable)\n" + " * setup_slack: Directly uses setup slacks (in combination with wiring)\n" + " to check if the block moves should be accepted\n") + .default_value("auto") + .show_in(argparse::ShowIn::HELP_ONLY); + auto& route_grp = parser.add_argument_group("routing options"); route_grp.add_argument(args.max_router_iterations, "--max_router_iterations") diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h index e3e1307823e..5964904072a 100644 --- a/vpr/src/base/read_options.h +++ b/vpr/src/base/read_options.h @@ -130,6 +130,7 @@ struct t_options { argparse::ArgValue place_delay_model; argparse::ArgValue place_delay_model_reducer; argparse::ArgValue allowed_tiles_for_delay_model; + argparse::ArgValue place_quench_metric; /* Router Options */ argparse::ArgValue check_rr_graph; diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 34f08d250f6..9019dacba91 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -885,6 +885,12 @@ enum class e_place_delta_delay_algorithm { DIJKSTRA_EXPANSION, }; +enum class e_place_quench_metric { + TIMING_COST, + SETUP_SLACK, + AUTO +}; + struct t_placer_opts { enum e_place_algorithm place_algorithm; 
float timing_tradeoff; @@ -933,6 +939,7 @@ struct t_placer_opts { std::string allowed_tiles_for_delay_model; e_place_delta_delay_algorithm place_delta_delay_matrix_calculation_method; + e_place_quench_metric place_quench_metric; }; /* All the parameters controlling the router's operation are in this * diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index b6045c3139c..5df9a124473 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -540,6 +540,8 @@ static void print_resources_utilization(); static void init_annealing_state(t_annealing_state* state, const t_annealing_sched& annealing_sched, float t, float rlim, int move_lim_max, float crit_exponent); +static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& placer_opts); + /*****************************************************************************/ void try_place(const t_placer_opts& placer_opts, t_annealing_sched annealing_sched, @@ -887,13 +889,9 @@ void try_place(const t_placer_opts& placer_opts, state.t = 0; /* freeze out */ //Use setup slack analysis if the placer is timing driven - //TODO: make this a command line option to turn on slack analysis - enum e_place_algorithm quench_algorithm; - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - quench_algorithm = SETUP_SLACK_ANALYSIS_PLACE; - } else { - quench_algorithm = BOUNDING_BOX_PLACE; - } + //and the quench metric is SETUP_SLACK. 
Otherwise, use the + //same cost formulation as the annealing stage + auto quench_algorithm = get_placement_quench_algorithm(placer_opts); /* Run inner loop again with temperature = 0 so as to accept only swaps * which reduce the cost of the placement */ @@ -3317,6 +3315,23 @@ static void init_annealing_state(t_annealing_state* state, state->crit_exponent = crit_exponent; } +static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& placer_opts) { + e_place_algorithm place_algo = placer_opts.place_algorithm; + e_place_quench_metric quench_metric = placer_opts.place_quench_metric; + + if (place_algo == e_place_algorithm::PATH_TIMING_DRIVEN_PLACE) { + if (quench_metric == e_place_quench_metric::AUTO || quench_metric == e_place_quench_metric::TIMING_COST) { + return PATH_TIMING_DRIVEN_PLACE; + } else { + VTR_ASSERT(quench_metric == e_place_quench_metric::SETUP_SLACK); + return SETUP_SLACK_ANALYSIS_PLACE; + } + } else { + VTR_ASSERT(place_algo == e_place_algorithm::BOUNDING_BOX_PLACE); + return BOUNDING_BOX_PLACE; + } +} + bool placer_needs_lookahead(const t_vpr_setup& vpr_setup) { return (vpr_setup.PlacerOpts.place_algorithm == PATH_TIMING_DRIVEN_PLACE); } From da55abfc45eeb761c2661ece14d8335e59b47dc1 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 21 Aug 2020 18:58:05 -0400 Subject: [PATCH 15/21] Merged t_placer_costs and t_placer_prev_inverse_costs and added corresponding documentation. 
--- vpr/src/place/place.cpp | 153 +++++++++++++++++++++++++--------------- 1 file changed, 96 insertions(+), 57 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 5df9a124473..bc97f29cba5 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -92,19 +92,75 @@ struct t_placer_statistics { int success_sum; }; -struct t_placer_costs { - //Although we do nost cost calculations with float's we - //use doubles for the accumulated costs to avoid round-off, - //particularly on large designs where the magnitude of a single - //move's delta cost is small compared to the overall cost. +/** + * @brief Data structure that stores different cost values in the placer. + * + * Although we do cost calculations with float values, we use doubles + * for the accumulated costs to avoid round-off, particularly on large + * designs where the magnitude of a single move's delta cost is small + * compared to the overall cost. + * + * The cost normalization factors are updated upon every temperature change + * in the outer_loop_update_timing_info routine. They are the multiplicative + * inverses of their respective cost values when the routine is called. They + * serve to normalize the trade-off between timing and wirelength (bb). + * + * @param cost The weighted average of the wiring cost and the timing cost. + * @param bb_cost The bounding box cost, aka the wiring cost. + * @param timing_cost The timing cost, which is connection delay * criticality. + * + * @param bb_cost_norm The normalization factor for the wiring cost. + * @param timing_cost_norm The normalization factor for the timing cost, which + * is upper-bounded by the value of MAX_INV_TIMING_COST. + * + * @param MAX_INV_TIMING_COST Stops inverse timing cost from going to infinity + * with very lax timing constraints, which avoids multiplying by a + * gigantic timing_cost_norm when auto-normalizing. 
The exact value + * of this cost has relatively little impact, but should not be large + * enough to be on the order of timing costs for normal constraints. + * + * @param place_algorithm Determines how the member values are updated upon + * each temperature change during the placer annealing process. + */ +class t_placer_costs { + public: double cost; double bb_cost; double timing_cost; -}; + double bb_cost_norm; + double timing_cost_norm; + + private: + static constexpr double MAX_INV_TIMING_COST = 1.e9; + enum e_place_algorithm place_algorithm; + + public: + ///@brief Constructor that takes in the current placer algorithm. + t_placer_costs(enum e_place_algorithm algo) + : place_algorithm(algo) { + if (place_algorithm != PATH_TIMING_DRIVEN_PLACE) { + VTR_ASSERT_MSG( + place_algorithm == BOUNDING_BOX_PLACE, + "Must pass a valid placer algorithm into the placer cost structure."); + } + } -struct t_placer_prev_inverse_costs { - double bb_cost; - double timing_cost; + /** + * @brief Mutator: updates the norm factors in the outer loop. + * + * At each temperature change we update these values to be used + * for normalizing the trade-off between timing and wirelength (bb) + */ + void update_norm_factors() { + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + bb_cost_norm = 1 / bb_cost; + ///::quiet_NaN(); - -constexpr double MAX_INV_TIMING_COST = 1.e9; -/* Stops inverse timing cost from going to infinity with very lax timing constraints, - * which avoids multiplying by a gigantic prev_inverse.timing_cost when auto-normalizing. - * The exact value of this cost has relatively little impact, but should not be - * large enough to be on the order of timing costs for normal constraints. 
*/ +constexpr double INVALID_COST = std::numeric_limits::quiet_NaN(); /********************** Variables local to place.c ***************************/ @@ -336,7 +387,6 @@ static e_move_result try_swap(float t, float crit_exponent, t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, float rlim, MoveGenerator& move_generator, SetupTimingInfo* timing_info, @@ -365,7 +415,6 @@ static int check_macro_placement_consistency(); static float starting_t(float crit_exponent, t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, t_annealing_sched annealing_sched, int max_moves, float rlim, @@ -461,7 +510,6 @@ static void free_try_swap_arrays(); static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, int num_connections, float crit_exponent, int* outer_crit_iter_count, @@ -499,7 +547,6 @@ static void placement_inner_loop(float t, t_placer_statistics* stats, t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, int* moves_since_cost_recompute, ClusteredPinTimingInvalidator* pin_timing_invalidator, const PlaceDelayModel* delay_model, @@ -567,8 +614,7 @@ void try_place(const t_placer_opts& placer_opts, outer_crit_iter_count, inner_recompute_limit; float success_rat, first_crit_exponent, first_rlim; - t_placer_costs costs; - t_placer_prev_inverse_costs prev_inverse_costs; + t_placer_costs costs(placer_opts.place_algorithm); tatum::TimingPathInfo critical_path; float sTNS = NAN; @@ -684,26 +730,32 @@ void try_place(const t_placer_opts& placer_opts, outer_crit_iter_count = 1; - prev_inverse_costs.timing_cost = 1 / costs.timing_cost; - prev_inverse_costs.bb_cost = 1 / costs.bb_cost; - costs.cost = 1; /*our new cost function uses normalized 
values of */ - /*bb_cost and timing_cost, the value of cost will be reset */ - /*to 1 at each temperature when *_TIMING_DRIVEN_PLACE is true */ - } else { /*BOUNDING_BOX_PLACE */ - costs.cost = costs.bb_cost = comp_bb_cost(NORMAL); - costs.timing_cost = 0; + /** + * Initialize the normalization factors. Calling costs.update_norm_factors() here + * would fail the golden results of strong_multiclock benchmark + */ + costs.timing_cost_norm = 1 / costs.timing_cost; + costs.bb_cost_norm = 1 / costs.bb_cost; + costs.cost = 1; + + } else { //placer_opts.place_algorithm == BOUNDING_BOX_PLACE + costs.bb_cost = comp_bb_cost(NORMAL); + costs.cost = costs.bb_cost; ///bb_cost = 1 / costs->bb_cost; - /*Prevent inverse timing cost from going to infinity */ - prev_inverse_costs->timing_cost = min(1 / costs->timing_cost, MAX_INV_TIMING_COST); + costs->update_norm_factors(); ///bb_cost - + timing_tradeoff * timing_delta_c * prev_inverse_costs->timing_cost; + delta_c = (1 - timing_tradeoff) * bb_delta_c * costs->bb_cost_norm + + timing_tradeoff * timing_delta_c * costs->timing_cost_norm; } else { //place_algorithm == BOUNDING_BOX_PLACE (wiring cost) delta_c = bb_delta_c; @@ -1757,8 +1796,8 @@ static e_move_result try_swap(float t, } move_outcome_stats.delta_cost_norm = delta_c; - move_outcome_stats.delta_bb_cost_norm = bb_delta_c * prev_inverse_costs->bb_cost; - move_outcome_stats.delta_timing_cost_norm = timing_delta_c * prev_inverse_costs->timing_cost; + move_outcome_stats.delta_bb_cost_norm = bb_delta_c * costs->bb_cost_norm; + move_outcome_stats.delta_timing_cost_norm = timing_delta_c * costs->timing_cost_norm; move_outcome_stats.delta_bb_cost_abs = bb_delta_c; move_outcome_stats.delta_timing_cost_abs = timing_delta_c; From 92c416a5c76ec21f78e33733638862656e1cee1b Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 21 Aug 2020 21:32:23 -0400 Subject: [PATCH 16/21] Reduced down the argument list for starting_t, placement_inner_loop, and try_swap by passing variables using 
t_annealing_state. Also moved first move_lim determination process to a separate routine --- vpr/src/place/place.cpp | 246 ++++++++++++++++++++++------------------ 1 file changed, 133 insertions(+), 113 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index bc97f29cba5..349fdbd66ac 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -154,25 +154,40 @@ class t_placer_costs { void update_norm_factors() { if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { bb_cost_norm = 1 / bb_cost; - ///> num_blocks) this performs more - //moves (device_size ^ (2/3)) to ensure better optimization. In this case, - //more moves than num_blocks ^ (4/3) may be required, since the search space - //is larger. - float device_size = device_ctx.grid.width() * device_ctx.grid.height(); - move_lim = (int)(annealing_sched.inner_num * pow(device_size, 2. / 3.) * pow(cluster_ctx.clb_nlist.blocks().size(), 2. / 3.)); - } else { - VPR_ERROR(VPR_ERROR_PLACE, "Unrecognized placer effort scaling"); - } - VTR_LOG("Moves per temperature: %d\n", move_lim); - - /* Sometimes I want to run the router with a random placement. Avoid * - * using 0 moves to stop division by 0 and 0 length vector problems, * - * by setting move_lim to 1 (which is still too small to do any * - * significant optimization). 
*/ - if (move_lim <= 0) - move_lim = 1; + int first_move_lim = get_initial_move_lim(placer_opts, annealing_sched); + int inner_recompute_limit; if (placer_opts.inner_loop_recompute_divider != 0) { - inner_recompute_limit = (int)(0.5 + (float)move_lim / (float)placer_opts.inner_loop_recompute_divider); + inner_recompute_limit = (int)(0.5 + (float)first_move_lim / (float)placer_opts.inner_loop_recompute_divider); } else { /*don't do an inner recompute */ - inner_recompute_limit = move_lim + 1; + inner_recompute_limit = first_move_lim + 1; } int quench_recompute_limit; if (placer_opts.quench_recompute_divider != 0) { - quench_recompute_limit = (int)(0.5 + (float)move_lim / (float)placer_opts.quench_recompute_divider); + quench_recompute_limit = (int)(0.5 + (float)first_move_lim / (float)placer_opts.quench_recompute_divider); } else { /*don't do an quench recompute */ - quench_recompute_limit = move_lim + 1; + quench_recompute_limit = first_move_lim + 1; } - first_rlim = (float)max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); + /* Get the first range limiter */ + float first_rlim = float(std::max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1)); - float first_t = starting_t(first_crit_exponent, &timing_update_mode, - &costs, - annealing_sched, move_lim, first_rlim, - place_delay_model.get(), - placer_criticalities.get(), - placer_setup_slacks.get(), - timing_info.get(), - *move_generator, - pin_timing_invalidator.get(), - blocks_affected, - placer_opts); + /* Set the temperature high so essentially all swaps will be accepted */ + /* when trying to determine the starting temp for placement inner loop. 
*/ + float first_t = HUGE_POSITIVE_FLOAT; t_annealing_state state; - init_annealing_state(&state, annealing_sched, first_t, first_rlim, move_lim, first_crit_exponent); + init_annealing_state(&state, annealing_sched, first_t, first_rlim, first_move_lim, first_crit_exponent); + + /* Update the starting temperature for placement annealing to a more appropriate value */ + state.t = starting_t(&state, + &timing_update_mode, + &costs, + annealing_sched, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + timing_info.get(), + *move_generator, + pin_timing_invalidator.get(), + blocks_affected, + placer_opts); if (!placer_opts.move_stats_file.empty()) { f_move_stats_file = std::unique_ptr(vtr::fopen(placer_opts.move_stats_file.c_str(), "w"), vtr::fclose); @@ -876,8 +867,8 @@ void try_place(const t_placer_opts& placer_opts, pin_timing_invalidator.get(), timing_info.get()); - placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, - state.move_lim, state.crit_exponent, inner_recompute_limit, &stats, + placement_inner_loop(&state, num_temps, placer_opts, + inner_recompute_limit, &stats, &timing_update_mode, &costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), @@ -921,6 +912,11 @@ void try_place(const t_placer_opts& placer_opts, /* Outer loop of the simmulated annealing ends */ auto pre_quench_timing_stats = timing_ctx.stats; + + /* Start quench */ + state.t = 0; //Freeze out: only accept solutions that improve placement + state.move_lim = first_move_lim; //Revert the move limit to initial value + { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); @@ -935,8 +931,6 @@ void try_place(const t_placer_opts& placer_opts, pin_timing_invalidator.get(), timing_info.get()); - state.t = 0; /* freeze out */ - //Use setup slack analysis if the placer is timing driven //and the quench metric is SETUP_SLACK. 
Otherwise, use the //same cost formulation as the annealing stage @@ -944,8 +938,8 @@ void try_place(const t_placer_opts& placer_opts, /* Run inner loop again with temperature = 0 so as to accept only swaps * which reduce the cost of the placement */ - placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, - move_lim, state.crit_exponent, quench_recompute_limit, &stats, + placement_inner_loop(&state, num_temps, placer_opts, + quench_recompute_limit, &stats, &timing_update_mode, &costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), @@ -957,10 +951,10 @@ void try_place(const t_placer_opts& placer_opts, timing_info.get(), quench_algorithm); - tot_iter += move_lim; + tot_iter += state.move_lim; ++num_temps; - calc_placer_stats(stats, success_rat, std_dev, costs, move_lim); + calc_placer_stats(stats, success_rat, std_dev, costs, state.move_lim); if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { critical_path = timing_info->least_slack_critical_path(); @@ -1202,12 +1196,9 @@ static void update_setup_slacks_and_criticalities(float crit_exponent, } /* Function which contains the inner loop of the simulated annealing */ -static void placement_inner_loop(float t, +static void placement_inner_loop(const t_annealing_state* state, int temp_num, - float rlim, const t_placer_opts& placer_opts, - int move_lim, - float crit_exponent, int inner_recompute_limit, t_placer_statistics* stats, t_placer_timing_update_mode* timing_update_mode, @@ -1234,12 +1225,10 @@ static void placement_inner_loop(float t, inner_crit_iter_count = 1; /* Inner loop begins */ - for (inner_iter = 0; inner_iter < move_lim; inner_iter++) { - e_move_result swap_result = try_swap(t, - crit_exponent, + for (inner_iter = 0; inner_iter < state->move_lim; inner_iter++) { + e_move_result swap_result = try_swap(state, timing_update_mode, costs, - rlim, move_generator, timing_info, pin_timing_invalidator, @@ -1270,7 +1259,7 @@ static void placement_inner_loop(float t, * We do this 
only once in a while, since it is expensive. */ if (inner_crit_iter_count >= inner_recompute_limit - && inner_iter != move_lim - 1) { /*on last iteration don't recompute */ + && inner_iter != state->move_lim - 1) { /*on last iteration don't recompute */ inner_crit_iter_count = 0; #ifdef VERBOSE @@ -1281,7 +1270,7 @@ static void placement_inner_loop(float t, */ timing_update_mode->do_update_criticalities = true; timing_update_mode->do_update_setup_slacks = true; - update_setup_slacks_and_criticalities(crit_exponent, + update_setup_slacks_and_criticalities(state->crit_exponent, delay_model, criticalities, setup_slacks, @@ -1316,9 +1305,9 @@ static void placement_inner_loop(float t, if (placer_opts.placement_saves_per_temperature >= 1 && inner_iter > 0 - && (inner_iter + 1) % (move_lim / placer_opts.placement_saves_per_temperature) == 0) { + && (inner_iter + 1) % (state->move_lim / placer_opts.placement_saves_per_temperature) == 0) { std::string filename = vtr::string_fmt("placement_%03d_%03d.place", temp_num + 1, inner_placement_save_count); - VTR_LOG("Saving placement to file at temperature move %d / %d: %s\n", inner_iter, move_lim, filename.c_str()); + VTR_LOG("Saving placement to file at temperature move %d / %d: %s\n", inner_iter, state->move_lim, filename.c_str()); print_place(nullptr, nullptr, filename.c_str()); ++inner_placement_save_count; } @@ -1468,12 +1457,10 @@ static bool update_annealing_state(t_annealing_state* state, return true; } -static float starting_t(float crit_exponent, +static float starting_t(const t_annealing_state* state, t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, t_annealing_sched annealing_sched, - int max_moves, - float rlim, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, @@ -1483,31 +1470,22 @@ static float starting_t(float crit_exponent, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts) { /* Finds the starting temperature 
(hot condition). */ - - int i, num_accepted, move_lim; - double std_dev, av, sum_of_squares; /* Double important to avoid round off */ + int num_accepted = 0; + double std_dev, av = 0, sum_of_squares = 0; /* Double important to avoid round off */ if (annealing_sched.type == USER_SCHED) return (annealing_sched.init_t); auto& cluster_ctx = g_vpr_ctx.clustering(); - move_lim = min(max_moves, (int)cluster_ctx.clb_nlist.blocks().size()); - - num_accepted = 0; - av = 0.; - sum_of_squares = 0.; + /* Determines the block swap loop count. */ + int move_lim = std::min(state->move_lim_max, int(cluster_ctx.clb_nlist.blocks().size())); - /* Try one move per block. Set the temperature high so essentially all accepted. */ - float t = HUGE_POSITIVE_FLOAT; - - for (i = 0; i < move_lim; i++) { + for (int i = 0; i < move_lim; i++) { //Will not deploy setup slack analysis, so omit crit_exponenet and setup_slack - e_move_result swap_result = try_swap(t, - crit_exponent, + e_move_result swap_result = try_swap(state, timing_update_mode, costs, - rlim, move_generator, timing_info, pin_timing_invalidator, @@ -1578,11 +1556,9 @@ static void reset_move_nets(int num_nets_affected) { } } -static e_move_result try_swap(float t, - float crit_exponent, +static e_move_result try_swap(const t_annealing_state* state, t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - float rlim, MoveGenerator& move_generator, SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, @@ -1612,8 +1588,11 @@ static e_move_result try_swap(float t, //Allow some fraction of moves to not be restricted by rlim, //in the hopes of better escaping local minima + float rlim; if (rlim_escape_fraction > 0. 
&& vtr::frand() < rlim_escape_fraction) { rlim = std::numeric_limits::infinity(); + } else { + rlim = state->rlim; } //Generate a new move (perturbation) used to explore the space of possible placements @@ -1691,7 +1670,7 @@ static e_move_result try_swap(float t, //we need to revert block moves and restore the timing values. timing_update_mode->do_update_criticalities = false; timing_update_mode->do_update_setup_slacks = true; - update_setup_slacks_and_criticalities(crit_exponent, + update_setup_slacks_and_criticalities(state->crit_exponent, delay_model, criticalities, setup_slacks, @@ -1717,7 +1696,7 @@ static e_move_result try_swap(float t, } /* 1 -> move accepted, 0 -> rejected. */ - move_outcome = assess_swap(delta_c, t); + move_outcome = assess_swap(delta_c, state->t); if (move_outcome == ACCEPTED) { costs->cost += delta_c; @@ -1775,7 +1754,7 @@ static e_move_result try_swap(float t, timing_info); /* Revert the timing update */ - update_setup_slacks_and_criticalities(crit_exponent, + update_setup_slacks_and_criticalities(state->crit_exponent, delay_model, criticalities, setup_slacks, @@ -3374,3 +3353,44 @@ static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& pla bool placer_needs_lookahead(const t_vpr_setup& vpr_setup) { return (vpr_setup.PlacerOpts.place_algorithm == PATH_TIMING_DRIVEN_PLACE); } + +/** + * @brief Get the initial limit for inner loop block move attempt limit. + * + * There are two ways to scale the move limit. + * e_place_effort_scaling::CIRCUIT + * scales the move limit proportional to num_blocks ^ (4/3) + * e_place_effort_scaling::DEVICE_CIRCUIT + * scales the move limit proportional to device_size ^ (2/3) * num_blocks ^ (2/3) + * + * The second method is almost identical to the first one when the device + * is highly utilized (device_size ~ num_blocks). For low utilization devices + * (device_size >> num_blocks), the search space is larger, so the second method + * performs more moves to ensure better optimization. 
+ */ + +static int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched) { + const auto& device_ctx = g_vpr_ctx.device(); + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + auto device_size = device_ctx.grid.width() * device_ctx.grid.height(); + auto num_blocks = cluster_ctx.clb_nlist.blocks().size(); + + int move_lim; + if (placer_opts.effort_scaling == e_place_effort_scaling::CIRCUIT) { + move_lim = int(annealing_sched.inner_num * pow(num_blocks, 4. / 3.)); + } else { + VTR_ASSERT_MSG( + placer_opts.effort_scaling == e_place_effort_scaling::DEVICE_CIRCUIT, + "Unrecognized placer effort scaling"); + + move_lim = int(annealing_sched.inner_num * pow(device_size, 2. / 3.) * pow(num_blocks, 2. / 3.)); + } + + /* Avoid having a non-positive move_lim */ + move_lim = std::max(move_lim, 1); + + VTR_LOG("Moves per temperature: %d\n", move_lim); + + return move_lim; +} From 870eca625355508909e6803ab4c77196c0961300 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 21 Aug 2020 22:20:54 -0400 Subject: [PATCH 17/21] Moved t_placer_costs and t_annealing_state and related routines to placer_util.* files. --- vpr/src/place/place.cpp | 175 +---------------------------------- vpr/src/place/place_util.cpp | 90 ++++++++++++++++++ vpr/src/place/place_util.h | 103 ++++++++++++++++++++- 3 files changed, 192 insertions(+), 176 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 349fdbd66ac..325ba4d3a4c 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -58,10 +58,6 @@ using std::min; * cost computation. 0.01 means that there is a 1% error tolerance. */ #define ERROR_TOL .01 -/* The final rlim (range limit) is 1, which is the smallest value that can * - * still make progress, since an rlim of 0 wouldn't allow any swaps. 
*/ -#define FINAL_RLIM 1 - /* This defines the maximum number of swap attempts before invoking the * * once-in-a-while placement legality check as well as floating point * * variables round-offs check. */ @@ -92,104 +88,6 @@ struct t_placer_statistics { int success_sum; }; -/** - * @brief Data structure that stores different cost values in the placer. - * - * Although we do cost calculations with float values, we use doubles - * for the accumulated costs to avoid round-off, particularly on large - * designs where the magnitude of a single move's delta cost is small - * compared to the overall cost. - * - * The cost normalization factors are updated upon every temperature change - * in the outer_loop_update_timing_info routine. They are the multiplicative - * inverses of their respective cost values when the routine is called. They - * serve to normalize the trade-off between timing and wirelength (bb). - * - * @param cost The weighted average of the wiring cost and the timing cost. - * @param bb_cost The bounding box cost, aka the wiring cost. - * @param timing_cost The timing cost, which is connection delay * criticality. - * - * @param bb_cost_norm The normalization factor for the wiring cost. - * @param timing_cost_norm The normalization factor for the timing cost, which - * is upper-bounded by the value of MAX_INV_TIMING_COST. - * - * @param MAX_INV_TIMING_COST Stops inverse timing cost from going to infinity - * with very lax timing constraints, which avoids multiplying by a - * gigantic timing_cost_norm when auto-normalizing. The exact value - * of this cost has relatively little impact, but should not be large - * enough to be on the order of timing costs for normal constraints. - * - * @param place_algorithm Determines how the member values are updated upon - * each temperature change during the placer annealing process. 
- */ -class t_placer_costs { - public: - double cost; - double bb_cost; - double timing_cost; - double bb_cost_norm; - double timing_cost_norm; - - private: - static constexpr double MAX_INV_TIMING_COST = 1.e9; - enum e_place_algorithm place_algorithm; - - public: - ///@brief Constructor that takes in the current placer algorithm. - t_placer_costs(enum e_place_algorithm algo) - : place_algorithm(algo) { - if (place_algorithm != PATH_TIMING_DRIVEN_PLACE) { - VTR_ASSERT_MSG( - place_algorithm == BOUNDING_BOX_PLACE, - "Must pass a valid placer algorithm into the placer cost structure."); - } - } - - /** - * @brief Mutator: updates the norm factors in the outer loop. - * - * At each temperature change we update these values to be used - * for normalizing the trade-off between timing and wirelength (bb) - */ - void update_norm_factors() { - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - bb_cost_norm = 1 / bb_cost; - //Prevent the norm factor from going to infinity - timing_cost_norm = std::min(1 / timing_cost, MAX_INV_TIMING_COST); - cost = 1; //The value of cost will be reset to 1 if timing driven - } else { //place_algorithm == BOUNDING_BOX_PLACE - cost = bb_cost; //The cost value should be identical to the wirelength cost - } - } -}; - -/** - * @brief Stores variables that are used by the annealing process. - * - * This structure is updated by update_annealing_state() on each outer - * loop iteration. It stores various important variables that need to - * be accessed during the placement inner loop. - * - * @param t Temperature for simulated annealing. - * @param rlim Range limit for block swaps. - * @param inverse_delta_rlim Used to update crit_exponent. - * @param alpha Temperature decays factor (multiplied each outer loop iteration). - * @param restart_t Temperature used after restart due to minimum success ratio. - * @param crit_exponent Used by timing-driven placement to "sharpen" the timing criticality. - * @param move_lim_max Maximum block move limit. 
- * @param move_lim Current block move limit. - */ -struct t_annealing_state { - float t; - float rlim; - float inverse_delta_rlim; - float alpha; - float restart_t; - float crit_exponent; - int move_lim_max; - int move_lim; -}; - struct t_placer_timing_update_mode { /* Determines if slacks/criticalities need to be updated */ bool do_update_criticalities; @@ -593,12 +491,8 @@ static void print_place_status(const size_t num_temps, size_t tot_moves); static void print_resources_utilization(); -static void init_annealing_state(t_annealing_state* state, const t_annealing_sched& annealing_sched, float t, float rlim, int move_lim_max, float crit_exponent); - static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& placer_opts); -static int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched); - /*****************************************************************************/ void try_place(const t_placer_opts& placer_opts, t_annealing_sched annealing_sched, @@ -822,8 +716,8 @@ void try_place(const t_placer_opts& placer_opts, /* when trying to determine the starting temp for placement inner loop. 
*/ float first_t = HUGE_POSITIVE_FLOAT; - t_annealing_state state; - init_annealing_state(&state, annealing_sched, first_t, first_rlim, first_move_lim, first_crit_exponent); + /* Initialize annealing state variables */ + t_annealing_state state(annealing_sched, first_t, first_rlim, first_move_lim, first_crit_exponent); /* Update the starting temperature for placement annealing to a more appropriate value */ state.t = starting_t(&state, @@ -1449,7 +1343,7 @@ static bool update_annealing_state(t_annealing_state* state, update_rlim(&state->rlim, success_rat, device_ctx.grid); if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - state->crit_exponent = (1 - (state->rlim - FINAL_RLIM) * state->inverse_delta_rlim) + state->crit_exponent = (1 - (state->rlim - state->final_rlim()) * state->inverse_delta_rlim) * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) + placer_opts.td_place_exp_first; } @@ -3313,26 +3207,6 @@ static void print_resources_utilization() { VTR_LOG("\n"); } -static void init_annealing_state(t_annealing_state* state, - const t_annealing_sched& annealing_sched, - float t, - float rlim, - int move_lim_max, - float crit_exponent) { - state->alpha = annealing_sched.alpha_min; - state->t = t; - state->restart_t = t; - state->rlim = rlim; - state->inverse_delta_rlim = 1 / (rlim - FINAL_RLIM); - state->move_lim_max = std::max(1, move_lim_max); - if (annealing_sched.type == DUSTY_SCHED) { - state->move_lim = std::max(1, (int)(state->move_lim_max * annealing_sched.success_target)); - } else { - state->move_lim = state->move_lim_max; - } - state->crit_exponent = crit_exponent; -} - static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& placer_opts) { e_place_algorithm place_algo = placer_opts.place_algorithm; e_place_quench_metric quench_metric = placer_opts.place_quench_metric; @@ -3352,45 +3226,4 @@ static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& pla bool placer_needs_lookahead(const 
t_vpr_setup& vpr_setup) { return (vpr_setup.PlacerOpts.place_algorithm == PATH_TIMING_DRIVEN_PLACE); -} - -/** - * @brief Get the initial limit for inner loop block move attempt limit. - * - * There are two ways to scale the move limit. - * e_place_effort_scaling::CIRCUIT - * scales the move limit proportional to num_blocks ^ (4/3) - * e_place_effort_scaling::DEVICE_CIRCUIT - * scales the move limit proportional to device_size ^ (2/3) * num_blocks ^ (2/3) - * - * The second method is almost identical to the first one when the device - * is highly utilized (device_size ~ num_blocks). For low utilization devices - * (device_size >> num_blocks), the search space is larger, so the second method - * performs more moves to ensure better optimization. - */ - -static int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched) { - const auto& device_ctx = g_vpr_ctx.device(); - const auto& cluster_ctx = g_vpr_ctx.clustering(); - - auto device_size = device_ctx.grid.width() * device_ctx.grid.height(); - auto num_blocks = cluster_ctx.clb_nlist.blocks().size(); - - int move_lim; - if (placer_opts.effort_scaling == e_place_effort_scaling::CIRCUIT) { - move_lim = int(annealing_sched.inner_num * pow(num_blocks, 4. / 3.)); - } else { - VTR_ASSERT_MSG( - placer_opts.effort_scaling == e_place_effort_scaling::DEVICE_CIRCUIT, - "Unrecognized placer effort scaling"); - - move_lim = int(annealing_sched.inner_num * pow(device_size, 2. / 3.) * pow(num_blocks, 2. 
/ 3.)); - } - - /* Avoid having a non-positive move_lim */ - move_lim = std::max(move_lim, 1); - - VTR_LOG("Moves per temperature: %d\n", move_lim); - - return move_lim; -} +} \ No newline at end of file diff --git a/vpr/src/place/place_util.cpp b/vpr/src/place/place_util.cpp index cd010c3a853..307321607dc 100644 --- a/vpr/src/place/place_util.cpp +++ b/vpr/src/place/place_util.cpp @@ -29,3 +29,93 @@ static vtr::Matrix init_grid_blocks() { return grid_blocks; } + +///@brief Constructor: stores current placer algorithm. +t_placer_costs::t_placer_costs(enum e_place_algorithm algo) + : place_algorithm(algo) { + if (place_algorithm != PATH_TIMING_DRIVEN_PLACE) { + VTR_ASSERT_MSG( + place_algorithm == BOUNDING_BOX_PLACE, + "Must pass a valid placer algorithm into the placer cost structure."); + } +} + +/** + * @brief Mutator: updates the norm factors in the outer loop iteration. + * + * At each temperature change we update these values to be used + * for normalizing the trade-off between timing and wirelength (bb) + */ +void t_placer_costs::update_norm_factors() { + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + bb_cost_norm = 1 / bb_cost; + //Prevent the norm factor from going to infinity + timing_cost_norm = std::min(1 / timing_cost, MAX_INV_TIMING_COST); + cost = 1; //The value of cost will be reset to 1 if timing driven + } else { //place_algorithm == BOUNDING_BOX_PLACE + cost = bb_cost; //The cost value should be identical to the wirelength cost + } +} + +///@brief Constructor: Initialize all annealing state variables. 
+t_annealing_state::t_annealing_state(const t_annealing_sched& annealing_sched, + float first_t, + float first_rlim, + int first_move_lim, + float first_crit_exponent) { + alpha = annealing_sched.alpha_min; + t = first_t; + restart_t = first_t; + rlim = first_rlim; + inverse_delta_rlim = 1 / (first_rlim - FINAL_RLIM); + move_lim_max = first_move_lim; + crit_exponent = first_crit_exponent; + + //Determine the current move_lim based on the schedule type + if (annealing_sched.type == DUSTY_SCHED) { + move_lim = std::max(1, (int)(move_lim_max * annealing_sched.success_target)); + } else { + move_lim = move_lim_max; + } +} + +/** + * @brief Get the initial limit for inner loop block move attempt limit. + * + * There are two ways to scale the move limit. + * e_place_effort_scaling::CIRCUIT + * scales the move limit proportional to num_blocks ^ (4/3) + * e_place_effort_scaling::DEVICE_CIRCUIT + * scales the move limit proportional to device_size ^ (2/3) * num_blocks ^ (2/3) + * + * The second method is almost identical to the first one when the device + * is highly utilized (device_size ~ num_blocks). For low utilization devices + * (device_size >> num_blocks), the search space is larger, so the second method + * performs more moves to ensure better optimization. + */ + +int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched) { + const auto& device_ctx = g_vpr_ctx.device(); + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + auto device_size = device_ctx.grid.width() * device_ctx.grid.height(); + auto num_blocks = cluster_ctx.clb_nlist.blocks().size(); + + int move_lim; + if (placer_opts.effort_scaling == e_place_effort_scaling::CIRCUIT) { + move_lim = int(annealing_sched.inner_num * pow(num_blocks, 4. / 3.)); + } else { + VTR_ASSERT_MSG( + placer_opts.effort_scaling == e_place_effort_scaling::DEVICE_CIRCUIT, + "Unrecognized placer effort scaling"); + + move_lim = int(annealing_sched.inner_num * pow(device_size, 2. / 3.) 
* pow(num_blocks, 2. / 3.)); + } + + /* Avoid having a non-positive move_lim */ + move_lim = std::max(move_lim, 1); + + VTR_LOG("Moves per temperature: %d\n", move_lim); + + return move_lim; +} diff --git a/vpr/src/place/place_util.h b/vpr/src/place/place_util.h index f35ec854ac9..151890b668d 100644 --- a/vpr/src/place/place_util.h +++ b/vpr/src/place/place_util.h @@ -1,8 +1,101 @@ -#ifndef PLACE_UTIL_H -#define PLACE_UTIL_H -#include +#pragma once +#include "vpr_types.h" -//Initialize the placement context +///@brief Initialize the placement context void init_placement_context(); -#endif +///@brief Get the initial limit for inner loop block move attempt limit. +int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched); + +/** + * @brief Data structure that stores different cost values in the placer. + * + * Although we do cost calculations with float values, we use doubles + * for the accumulated costs to avoid round-off, particularly on large + * designs where the magnitude of a single move's delta cost is small + * compared to the overall cost. + * + * The cost normalization factors are updated upon every temperature change + * in the outer_loop_update_timing_info routine. They are the multiplicative + * inverses of their respective cost values when the routine is called. They + * serve to normalize the trade-off between timing and wirelength (bb). + * + * @param cost The weighted average of the wiring cost and the timing cost. + * @param bb_cost The bounding box cost, aka the wiring cost. + * @param timing_cost The timing cost, which is connection delay * criticality. + * + * @param bb_cost_norm The normalization factor for the wiring cost. + * @param timing_cost_norm The normalization factor for the timing cost, which + * is upper-bounded by the value of MAX_INV_TIMING_COST. 
+ * + * @param MAX_INV_TIMING_COST Stops inverse timing cost from going to infinity + * with very lax timing constraints, which avoids multiplying by a + * gigantic timing_cost_norm when auto-normalizing. The exact value + * of this cost has relatively little impact, but should not be large + * enough to be on the order of timing costs for normal constraints. + * + * @param place_algorithm Determines how the member values are updated upon + * each temperature change during the placer annealing process. + */ +class t_placer_costs { + public: + double cost; + double bb_cost; + double timing_cost; + double bb_cost_norm; + double timing_cost_norm; + + private: + static constexpr double MAX_INV_TIMING_COST = 1.e9; + enum e_place_algorithm place_algorithm; + + public: //Constructor + t_placer_costs(enum e_place_algorithm algo); + + public: //Mutator + void update_norm_factors(); +}; + +/** + * @brief Stores variables that are used by the annealing process. + * + * This structure is updated by update_annealing_state() on each outer + * loop iteration. It stores various important variables that need to + * be accessed during the placement inner loop. + * + * @param t Temperature for simulated annealing. + * @param rlim Range limit for block swaps. + * @param inverse_delta_rlim Used to update crit_exponent. + * @param alpha Temperature decays factor (multiplied each outer loop iteration). + * @param restart_t Temperature used after restart due to minimum success ratio. + * @param crit_exponent Used by timing-driven placement to "sharpen" the timing criticality. + * @param move_lim_max Maximum block move limit. + * @param move_lim Current block move limit. + * + * @param FINAL_RLIM The final rlim (range limit) is 1, which is the smallest value that + * can still make progress, since an rlim of 0 wouldn't allow any swaps. 
+ */ +class t_annealing_state { + public: + float t; + float rlim; + float inverse_delta_rlim; + float alpha; + float restart_t; + float crit_exponent; + int move_lim_max; + int move_lim; + + private: + static constexpr float FINAL_RLIM = 1.; + + public: //Constructor + t_annealing_state(const t_annealing_sched& annealing_sched, + float first_t, + float first_rlim, + int first_move_lim, + float first_crit_exponent); + + public: //Accessor + float final_rlim() const { return FINAL_RLIM; } +}; From a2685c7c9ab4c852ddfde25aaff2ac2e0749e06a Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Sun, 23 Aug 2020 19:49:15 -0400 Subject: [PATCH 18/21] Changed major place.cpp data structures from file scope to global scope. Moved delay routines to place_delay_model.*. Moved annealing update routines to place_util.*. Enhanced documentations. --- vpr/src/place/place.cpp | 303 +++++++++------------------- vpr/src/place/place_delay_model.cpp | 105 ++++++++-- vpr/src/place/place_delay_model.h | 73 +++++-- vpr/src/place/place_global.h | 34 ++++ vpr/src/place/place_util.cpp | 82 ++++++++ vpr/src/place/place_util.h | 11 + vpr/src/place/timing_place.cpp | 13 -- vpr/src/place/timing_place.h | 7 - 8 files changed, 367 insertions(+), 261 deletions(-) create mode 100644 vpr/src/place/place_global.h diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 325ba4d3a4c..a6943d5e2e6 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -44,6 +44,8 @@ #include "tatum/echo_writer.hpp" #include "tatum/TimingReporter.hpp" +#include "place_global.h" + using std::max; using std::min; @@ -101,60 +103,103 @@ struct t_placer_timing_update_mode { constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); constexpr double INVALID_COST = std::numeric_limits::quiet_NaN(); -/********************** Variables local to place.c ***************************/ - -/* Cost of a net, and a temporary cost of a net used during move assessment. 
*/ -static vtr::vector net_cost, proposed_net_cost; - -/* [0...cluster_ctx.clb_nlist.nets().size()-1] * - * A flag array to indicate whether the specific bounding box has been updated * - * in this particular swap or not. If it has been updated before, the code * - * must use the updated data, instead of the out-of-date data passed into the * - * subroutine, particularly used in try_swap(). The value NOT_UPDATED_YET * - * indicates that the net has not been updated before, UPDATED_ONCE indicated * - * that the net has been updated once, if it is going to be updated again, the * - * values from the previous update must be used. GOT_FROM_SCRATCH is only * - * applicable for nets larger than SMALL_NETS and it indicates that the * - * particular bounding box cannot be updated incrementally before, hence the * - * bounding box is got from scratch, so the bounding box would definitely be * - * right, DO NOT update again. */ -static vtr::vector bb_updated_before; - -/* - * Net connection delays based on the placement. +/******************************************************************************* + * Below is a list of definitions of data structures declared as `extern` in * + * place_global.h. These variables were originally local to the current file. * + * However, they were moved so as to facilitate moving some of the routines * + * in the current file into other source files. * + *******************************************************************************/ + +/** + * @brief Cost of a net, and a temporary cost of a net used during move assessment. + * + * Index range: [0...cluster_ctx.clb_nlist.nets().size()-1] + */ +vtr::vector net_cost, proposed_net_cost; + +/** + * @brief A flag array to indicate whether the specific bounding box has + * been updated in this particular swap or not. + * + * If it has been updated before, the code must use the updated data, instead of + * the out-of-date data passed into the subroutine, particularly used in try_swap(). 
+ * + * NOT_UPDATED_YET Indicates that the net has not been updated before. + * UPDATED_ONCE Indicates that the net has been updated once, if it is going to be + * updated again, the values from the previous update must be used. + * GOT_FROM_SCRATCH Only applicable for nets larger than SMALL_NETS. It indicates that + * the particular bounding box cannot be updated incrementally before, + * hence the bounding box is got from scratch, so the bounding box + * would definitely be right, DO NOT update again. + * + * Index range: [0...cluster_ctx.clb_nlist.nets().size()-1] + */ +vtr::vector bb_updated_before; + +/** + * @brief Net connection delays. + * + * @param connection_delay + * Delays based on the committed block positions. + * @param proposed_connection_delay + * Delays based on the proposed block positions. Only for connections + * affected by the proposed move. Otherwise, INVALID_DELAY. + * * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ -static ClbNetPinsMatrix connection_delay; //Delays based on committed block positions -static ClbNetPinsMatrix proposed_connection_delay; //Delays for proposed block positions (only - // for connections effected by move, otherwise - // INVALID_DELAY) +ClbNetPinsMatrix connection_delay, proposed_connection_delay; -static ClbNetPinsMatrix connection_setup_slack; //Setup slacks based on most recently updated timing graph +/** + * @brief Net connection setup slacks based on most recently updated timing graph. + * + * Updated with commit_setup_slacks() routine. + * + * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ +ClbNetPinsMatrix connection_setup_slack; -/* - * Timing cost of connections (i.e. criticality * delay). +/** + * @brief Net connection timing costs (i.e. criticality * delay). + * + * @param connection_timing_cost + * Costs of committed block positions. See PlacerTimingCosts. + * @param proposed_connection_timing_cost + * Costs for proposed block positions. 
Only for connection + * affected by the proposed move. Otherwise, INVALID_DELAY + * * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ -static PlacerTimingCosts connection_timing_cost; //Costs of committed block positions -static ClbNetPinsMatrix proposed_connection_timing_cost; //Costs for proposed block positions - // (only for connection effected by - // move, otherwise INVALID_DELAY) - -/* - * Timing cost of nets (i.e. sum of criticality * delay for each net sink/connection). - * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1] +PlacerTimingCosts connection_timing_cost; +ClbNetPinsMatrix proposed_connection_timing_cost; + +/** + * @brief Timing cost of nets (i.e. sum of criticality * delay for each net sink/connection). + * + * Like connection_timing_cost, but summed across net pins. Used to allow more + * efficient recalculation of timing cost if only a sub-set of nets are changed + * while maintaining numeric stability. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1] + */ +vtr::vector net_timing_cost; + +/** + * @brief Store the bounding box coordinates and the number of blocks on each + * of a net's bounding box (to allow efficient updates) respectively. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1] */ -static vtr::vector net_timing_cost; //Like connection_timing_cost, but summed - // accross net pins. Used to allow more - // efficient recalculation of timing cost - // if only a sub-set of nets are changed - // while maintaining numeric stability. +vtr::vector bb_coords, bb_num_on_edges; -/* [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the bounding box coordinates and the number of * - * blocks on each of a net's bounding box (to allow efficient updates), * - * respectively. */ +/** + * @brief The following arrays are used by the try_swap function for speed. 
+ * + * Index range: [0...cluster_ctx.clb_nlist.nets().size()-1] + */ +vtr::vector ts_bb_coord_new, ts_bb_edge_new; +std::vector ts_nets_to_update; -static vtr::vector bb_coords, bb_num_on_edges; +/********** End of definitions of variables in place_global.h **********/ /* The arrays below are used to precompute the inverse of the average * * number of tracks per channel between [subhigh] and [sublow]. Access * @@ -167,11 +212,6 @@ static vtr::vector bb_coords, bb_num_on_edges; static float** chanx_place_cost_fac; //[0...device_ctx.grid.width()-2] static float** chany_place_cost_fac; //[0...device_ctx.grid.height()-2] -/* The following arrays are used by the try_swap function for speed. */ -/* [0...cluster_ctx.clb_nlist.nets().size()-1] */ -static vtr::vector ts_bb_coord_new, ts_bb_edge_new; -static std::vector ts_nets_to_update; - /* These file-scoped variables keep track of the number of swaps * * rejected, accepted or aborted. The total number of swap attempts * * is the sum of the three number. 
*/ @@ -336,24 +376,12 @@ static float starting_t(const t_annealing_state* state, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts); -static bool update_annealing_state(t_annealing_state* state, - float success_rat, - const t_placer_costs& costs, - const t_placer_opts& placer_opts, - const t_annealing_sched& annealing_sched); - -static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid); - static int count_connections(); static double get_std_dev(int n, double sum_x_squared, double av_x); static double recompute_bb_cost(); -static float comp_td_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin); - -static void comp_td_connection_delays(const PlaceDelayModel* delay_model); - static void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks); static bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks); @@ -548,8 +576,8 @@ void try_place(const t_placer_opts& placer_opts, num_ts_called = 0; if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - /*do this before the initial placement to avoid messing up the initial placement */ - place_delay_model = alloc_lookups_and_criticalities(chan_width_dist, placer_opts, router_opts, det_routing_arch, segment_inf, directs, num_directs); + /* Do this before the initial placement to avoid messing up the initial placement */ + place_delay_model = alloc_lookups_and_delay_model(chan_width_dist, placer_opts, router_opts, det_routing_arch, segment_inf, directs, num_directs); if (isEchoFileEnabled(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL)) { place_delay_model->dump_echo(getEchoFileName(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL)); @@ -1273,84 +1301,6 @@ static double get_std_dev(int n, double sum_x_squared, double av_x) { return (std_dev); } -static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) { - /* Update the range limited to keep acceptance prob. near 0.44. 
Use * - * a floating point rlim to allow gradual transitions at low temps. */ - - float upper_lim; - - *rlim = (*rlim) * (1. - 0.44 + success_rat); - upper_lim = max(grid.width() - 1, grid.height() - 1); - *rlim = min(*rlim, upper_lim); - *rlim = max(*rlim, (float)1.); -} - -/* Update the annealing state according to the annealing schedule selected. - * USER_SCHED: A manual fixed schedule with fixed alpha and exit criteria. - * AUTO_SCHED: A more sophisticated schedule where alpha varies based on success ratio. - * DUSTY_SCHED: This schedule jumps backward and slows down in response to success ratio. - * See doc/src/vpr/dusty_sa.rst for more details. - * - * Returns true until the schedule is finished. */ -static bool update_annealing_state(t_annealing_state* state, - float success_rat, - const t_placer_costs& costs, - const t_placer_opts& placer_opts, - const t_annealing_sched& annealing_sched) { - /* Return `false` when the exit criterion is met. */ - if (annealing_sched.type == USER_SCHED) { - state->t *= annealing_sched.alpha_t; - return state->t >= annealing_sched.exit_t; - } - - auto& device_ctx = g_vpr_ctx.device(); - auto& cluster_ctx = g_vpr_ctx.clustering(); - - /* Automatic annealing schedule */ - float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size(); - - if (annealing_sched.type == DUSTY_SCHED) { - bool restart_temp = state->t < t_exit || std::isnan(t_exit); //May get nan if there are no nets - if (success_rat < annealing_sched.success_min || restart_temp) { - if (state->alpha > annealing_sched.alpha_max) return false; - state->t = state->restart_t / sqrt(state->alpha); // Take a half step from the restart temperature. 
- state->alpha = 1.0 - ((1.0 - state->alpha) * annealing_sched.alpha_decay); - } else { - if (success_rat > annealing_sched.success_target) { - state->restart_t = state->t; - } - state->t *= state->alpha; - } - state->move_lim = std::max(1, std::min(state->move_lim_max, (int)(state->move_lim_max * (annealing_sched.success_target / success_rat)))); - } else { /* annealing_sched.type == AUTO_SCHED */ - if (success_rat > 0.96) { - state->alpha = 0.5; - } else if (success_rat > 0.8) { - state->alpha = 0.9; - } else if (success_rat > 0.15 || state->rlim > 1.) { - state->alpha = 0.95; - } else { - state->alpha = 0.8; - } - state->t *= state->alpha; - - // Must be duplicated to retain previous behavior - if (state->t < t_exit || std::isnan(t_exit)) return false; - } - - // Gradually changes from the initial crit_exponent to the final crit_exponent based on how much the range limit has shrunk. - // The idea is that as the range limit shrinks (indicating we are fine-tuning a more optimized placement) we can focus more on a smaller number of critical connections, which a higher crit_exponent achieves. - update_rlim(&state->rlim, success_rat, device_ctx.grid); - - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - state->crit_exponent = (1 - (state->rlim - state->final_rlim()) * state->inverse_delta_rlim) - * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) - + placer_opts.td_place_exp_first; - } - - return true; -} - static float starting_t(const t_annealing_state* state, t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, @@ -1806,7 +1756,7 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, //This pin is a net driver on a moved block. //Re-compute all point to point connections for this net. 
for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) { - float temp_delay = comp_td_connection_delay(delay_model, net, ipin); + float temp_delay = comp_td_single_connection_delay(delay_model, net, ipin); proposed_connection_delay[net][ipin] = temp_delay; proposed_connection_timing_cost[net][ipin] = criticalities.criticality(net, ipin) * temp_delay; @@ -1828,7 +1778,7 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, if (!driven_by_moved_block(net, blocks_affected)) { int net_pin = cluster_ctx.clb_nlist.pin_net_index(pin); - float temp_delay = comp_td_connection_delay(delay_model, net, net_pin); + float temp_delay = comp_td_single_connection_delay(delay_model, net, net_pin); proposed_connection_delay[net][net_pin] = temp_delay; proposed_connection_timing_cost[net][net_pin] = criticalities.criticality(net, net_pin) * temp_delay; @@ -1935,69 +1885,6 @@ static double recompute_bb_cost() { return (cost); } -/*returns the delay of one point to point connection */ -static float comp_td_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin) { - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& place_ctx = g_vpr_ctx.placement(); - - float delay_source_to_sink = 0.; - - if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { - //Only estimate delay for signals routed through the inter-block - //routing network. TODO: Do how should we compute the delay for globals. "Global signals are assumed to have zero delay." 
- - ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id); - ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin); - - ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin); - ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin); - - int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin); - int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin); - - int source_x = place_ctx.block_locs[source_block].loc.x; - int source_y = place_ctx.block_locs[source_block].loc.y; - int sink_x = place_ctx.block_locs[sink_block].loc.x; - int sink_y = place_ctx.block_locs[sink_block].loc.y; - - /* Note: This heuristic only considers delta_x and delta_y, a much better heuristic - * would be to to create a more comprehensive lookup table. - * - * In particular this aproach does not accurately capture the effect of fast - * carry-chain connections. - */ - delay_source_to_sink = delay_model->delay(source_x, - source_y, - source_block_ipin, - sink_x, - sink_y, - sink_block_ipin); - if (delay_source_to_sink < 0) { - VPR_ERROR(VPR_ERROR_PLACE, - "in comp_td_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d) to %s (at %d,%d)\n" - "in comp_td_connection_delay: Delay is less than 0\n", - block_type_pin_index_to_name(physical_tile_type(source_block), source_block_ipin).c_str(), - source_x, source_y, - block_type_pin_index_to_name(physical_tile_type(sink_block), sink_block_ipin).c_str(), - sink_x, sink_y, - delay_source_to_sink); - } - } - - return (delay_source_to_sink); -} - -//Recompute all point to point delays, updating connection_delay -static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { - const auto& cluster_ctx = g_vpr_ctx.clustering(); - - for (auto net_id : cluster_ctx.clb_nlist.nets()) { - for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) { - connection_delay[net_id][ipin] = 
comp_td_connection_delay(delay_model, net_id, ipin); - } - } -} - //Commit all the setup slack values from the PlacerSetupSlacks class. //This incremental routine will be correct if and only if it is called //immediately after each time update_setup_slacks_and_criticalities @@ -2212,7 +2099,7 @@ static void comp_td_costs(const PlaceDelayModel* delay_model, const PlacerCritic static double comp_td_connection_cost(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, ClusterNetId net, int ipin) { VTR_ASSERT_SAFE_MSG(ipin > 0, "Shouldn't be calculating connection timing cost for driver pins"); - VTR_ASSERT_SAFE_MSG(connection_delay[net][ipin] == comp_td_connection_delay(delay_model, net, ipin), + VTR_ASSERT_SAFE_MSG(connection_delay[net][ipin] == comp_td_single_connection_delay(delay_model, net, ipin), "Connection delays should already be updated"); double conn_timing_cost = place_crit.criticality(net, ipin) * connection_delay[net][ipin]; diff --git a/vpr/src/place/place_delay_model.cpp b/vpr/src/place/place_delay_model.cpp index c30f32b3e7d..31486293231 100644 --- a/vpr/src/place/place_delay_model.cpp +++ b/vpr/src/place/place_delay_model.cpp @@ -10,6 +10,8 @@ #include "vtr_math.h" #include "vpr_error.h" +#include "place_global.h" + #ifdef VTR_ENABLE_CAPNPROTO # include "capnp/serialize.h" # include "place_delay_model.capnp.h" @@ -18,10 +20,7 @@ # include "serdes_utils.h" #endif /* VTR_ENABLE_CAPNPROTO */ -/* - * DeltaDelayModel - */ - +///@brief DeltaDelayModel methods. float DeltaDelayModel::delay(int from_x, int from_y, int /*from_pin*/, int to_x, int to_y, int /*to_pin*/) const { int delta_x = std::abs(from_x - to_x); int delta_y = std::abs(from_y - to_y); @@ -46,9 +45,11 @@ void DeltaDelayModel::dump_echo(std::string filepath) const { vtr::fclose(f); } -/* - * OverrideDelayModel - */ +const DeltaDelayModel* OverrideDelayModel::base_delay_model() const { + return base_delay_model_.get(); +} + +///@brief OverrideDelayModel methods. 
float OverrideDelayModel::delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin) const { //First check to if there is an override delay value auto& device_ctx = g_vpr_ctx.device(); @@ -136,18 +137,14 @@ float OverrideDelayModel::get_delay_override(int from_type, int from_class, int return iter->second; } -const DeltaDelayModel* OverrideDelayModel::base_delay_model() const { - return base_delay_model_.get(); -} - void OverrideDelayModel::set_base_delay_model(std::unique_ptr base_delay_model_obj) { base_delay_model_ = std::move(base_delay_model_obj); } -// When writing capnp targetted serialization, always allow compilation when -// VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception -// instead. -// +/** + * When writing capnp targeted serialization, always allow compilation when + * VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception instead. + */ #ifndef VTR_ENABLE_CAPNPROTO # define DISABLE_ERROR \ @@ -300,3 +297,81 @@ void OverrideDelayModel::write(const std::string& file) const { } #endif + +///@brief Initialize the placer delay model. +std::unique_ptr alloc_lookups_and_delay_model(t_chan_width_dist chan_width_dist, + const t_placer_opts& placer_opts, + const t_router_opts& router_opts, + t_det_routing_arch* det_routing_arch, + std::vector& segment_inf, + const t_direct_inf* directs, + const int num_directs) { + return compute_place_delay_model(placer_opts, router_opts, det_routing_arch, segment_inf, + chan_width_dist, directs, num_directs); +} + +/** + * @brief Returns the delay of one point to point connection. + * + * Only estimate delay for signals routed through the inter-block routing network. + * TODO: How should we compute the delay for globals? "Global signals are assumed to have zero delay."
+ */ +float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& place_ctx = g_vpr_ctx.placement(); + + float delay_source_to_sink = 0.; + + if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { + ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id); + ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin); + + ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin); + ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin); + + int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin); + int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin); + + int source_x = place_ctx.block_locs[source_block].loc.x; + int source_y = place_ctx.block_locs[source_block].loc.y; + int sink_x = place_ctx.block_locs[sink_block].loc.x; + int sink_y = place_ctx.block_locs[sink_block].loc.y; + + /** + * This heuristic only considers delta_x and delta_y, a much better + * heuristic would be to create a more comprehensive lookup table. + * + * In particular this approach does not accurately capture the effect + * of fast carry-chain connections. + */ + delay_source_to_sink = delay_model->delay(source_x, + source_y, + source_block_ipin, + sink_x, + sink_y, + sink_block_ipin); + if (delay_source_to_sink < 0) { + VPR_ERROR(VPR_ERROR_PLACE, + "in comp_td_single_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d) to %s (at %d,%d)\n" + "in comp_td_single_connection_delay: Delay is less than 0\n", + block_type_pin_index_to_name(physical_tile_type(source_block), source_block_ipin).c_str(), + source_x, source_y, + block_type_pin_index_to_name(physical_tile_type(sink_block), sink_block_ipin).c_str(), + sink_x, sink_y, + delay_source_to_sink); + } + } + + return (delay_source_to_sink); +} + +///@brief Recompute all point to point delays, updating `connection_delay` matrix.
+void comp_td_connection_delays(const PlaceDelayModel* delay_model) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + for (auto net_id : cluster_ctx.clb_nlist.nets()) { + for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) { + connection_delay[net_id][ipin] = comp_td_single_connection_delay(delay_model, net_id, ipin); + } + } +} \ No newline at end of file diff --git a/vpr/src/place/place_delay_model.h b/vpr/src/place/place_delay_model.h index db22db238ec..55b0558cb49 100644 --- a/vpr/src/place/place_delay_model.h +++ b/vpr/src/place/place_delay_model.h @@ -1,3 +1,9 @@ +/** + * @file + * @brief This file contains all the class and function declarations related to + * the placer delay model. For implementations, see place_delay_model.cpp. + */ + #ifndef PLACE_DELAY_MODEL_H #define PLACE_DELAY_MODEL_H @@ -20,12 +26,30 @@ # define ALWAYS_INLINE inline #endif -//Abstract interface to a placement delay model +///@brief Forward declarations. +class PlaceDelayModel; + +///@brief Initialize the placer delay model. +std::unique_ptr alloc_lookups_and_delay_model(t_chan_width_dist chan_width_dist, + const t_placer_opts& place_opts, + const t_router_opts& router_opts, + t_det_routing_arch* det_routing_arch, + std::vector& segment_inf, + const t_direct_inf* directs, + const int num_directs); + +///@brief Returns the delay of one point to point connection. +float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin); + +///@brief Recompute all point to point delays, updating `connection_delay` matrix. +void comp_td_connection_delays(const PlaceDelayModel* delay_model); + +///@brief Abstract interface to a placement delay model. class PlaceDelayModel { public: virtual ~PlaceDelayModel() = default; - // Computes place delay model. + ///@brief Computes place delay model. 
virtual void compute( RouterDelayProfiler& route_profiler, const t_placer_opts& placer_opts, @@ -33,25 +57,32 @@ class PlaceDelayModel { int longest_length) = 0; - //Returns the delay estimate between the specified block pins - // - // Either compute or read methods must be invoked before invoking - // delay. + /** + * @brief Returns the delay estimate between the specified block pins. + * + * Either compute or read methods must be invoked before invoking delay. + */ virtual float delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin) const = 0; - //Dumps the delay model to an echo file + ///@brief Dumps the delay model to an echo file. virtual void dump_echo(std::string filename) const = 0; - // Write place delay model to specified file. - // May be unimplemented, in which case method should throw an exception. + /** + * @brief Write place delay model to specified file. + * + * May be unimplemented, in which case method should throw an exception. + */ virtual void write(const std::string& file) const = 0; - // Read place delay model from specified file. - // May be unimplemented, in which case method should throw an exception. + /** + * @brief Read place delay model from specified file. + * + * May be unimplemented, in which case method should throw an exception. + */ virtual void read(const std::string& file) = 0; }; -//A simple delay model based on the distance (delta) between block locations +///@brief A simple delay model based on the distance (delta) between block locations. class DeltaDelayModel : public PlaceDelayModel { public: DeltaDelayModel() {} @@ -109,10 +140,13 @@ class OverrideDelayModel : public PlaceDelayModel { short delta_x; short delta_y; - //A combination of ALWAYS_INLINE attribute and std::lexicographical_compare - //is required for operator< to be inlined by compiler. - //Proper inlining of the function reduces place time by around 5%. 
- //For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225 + /** + * A combination of ALWAYS_INLINE attribute and std::lexicographical_compare + * is required for operator< to be inlined by compiler. Proper inlining of the + * function reduces place time by around 5%. + * + * For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225 + */ friend ALWAYS_INLINE bool operator<(const t_override& lhs, const t_override& rhs) { const short* left = reinterpret_cast(&lhs); const short* right = reinterpret_cast(&rhs); @@ -123,8 +157,11 @@ class OverrideDelayModel : public PlaceDelayModel { vtr::flat_map2 delay_overrides_; - //operator< treats memory layout of t_override as an array of short - //this requires all members of t_override are shorts and there is no padding between members of t_override + /** + * operator< treats memory layout of t_override as an array of short. + * This requires all members of t_override are shorts and there is no + * padding between members of t_override. + */ static_assert(sizeof(t_override) == sizeof(t_override::from_type) + sizeof(t_override::to_type) + sizeof(t_override::from_class) + sizeof(t_override::to_class) + sizeof(t_override::delta_x) + sizeof(t_override::delta_y), "Expect t_override to have a memory layout equivalent to an array of short (no padding)"); static_assert(sizeof(t_override::from_type) == sizeof(short), "Expect all t_override data members to be shorts"); static_assert(sizeof(t_override::to_type) == sizeof(short), "Expect all t_override data members to be shorts"); diff --git a/vpr/src/place/place_global.h b/vpr/src/place/place_global.h new file mode 100644 index 00000000000..b9615f2a328 --- /dev/null +++ b/vpr/src/place/place_global.h @@ -0,0 +1,34 @@ +/** + * @file + * @brief This file contains all the global data structures referenced across + * multiple files in ./vpr/src/place. 
+ * + * These global data structures were originally local to place.cpp, and they + * were referenced by a lot of routines local to place.cpp. However, to shorten + * the file size of place.cpp, these routines are moved to other files. + * + * Instead of elongating the argument list of the moved routines, I moved the + * data structures to here so that they can be easily shared across different + * files. + * + * For detailed descriptions on what each data structure stores, please see + * place.cpp, where these variables are defined. + */ + +#pragma once +#include +#include "vtr_vector.h" +#include "vpr_net_pins_matrix.h" +#include "timing_place.h" + +extern vtr::vector net_cost, proposed_net_cost; +extern vtr::vector bb_updated_before; +extern ClbNetPinsMatrix connection_delay; +extern ClbNetPinsMatrix proposed_connection_delay; +extern ClbNetPinsMatrix connection_setup_slack; +extern PlacerTimingCosts connection_timing_cost; +extern ClbNetPinsMatrix proposed_connection_timing_cost; +extern vtr::vector net_timing_cost; +extern vtr::vector bb_coords, bb_num_on_edges; +extern vtr::vector ts_bb_coord_new, ts_bb_edge_new; +extern std::vector ts_nets_to_update; \ No newline at end of file diff --git a/vpr/src/place/place_util.cpp b/vpr/src/place/place_util.cpp index 307321607dc..b06a5b8c9ef 100644 --- a/vpr/src/place/place_util.cpp +++ b/vpr/src/place/place_util.cpp @@ -2,6 +2,7 @@ #include "globals.h" static vtr::Matrix init_grid_blocks(); +static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid); void init_placement_context() { auto& place_ctx = g_vpr_ctx.mutable_placement(); @@ -119,3 +120,84 @@ int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sch return move_lim; } + +/** + * @brief Update the annealing state according to the annealing schedule selected. + * + * USER_SCHED: A manual fixed schedule with fixed alpha and exit criteria. 
+ * AUTO_SCHED: A more sophisticated schedule where alpha varies based on success ratio. + * DUSTY_SCHED: This schedule jumps backward and slows down in response to success ratio. + * See doc/src/vpr/dusty_sa.rst for more details. + * + * Returns true until the schedule is finished. + */ +bool update_annealing_state(t_annealing_state* state, + float success_rat, + const t_placer_costs& costs, + const t_placer_opts& placer_opts, + const t_annealing_sched& annealing_sched) { + /* Return `false` when the exit criterion is met. */ + if (annealing_sched.type == USER_SCHED) { + state->t *= annealing_sched.alpha_t; + return state->t >= annealing_sched.exit_t; + } + + auto& device_ctx = g_vpr_ctx.device(); + auto& cluster_ctx = g_vpr_ctx.clustering(); + + /* Automatic annealing schedule */ + float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size(); + + if (annealing_sched.type == DUSTY_SCHED) { + bool restart_temp = state->t < t_exit || std::isnan(t_exit); //May get nan if there are no nets + if (success_rat < annealing_sched.success_min || restart_temp) { + if (state->alpha > annealing_sched.alpha_max) return false; + state->t = state->restart_t / sqrt(state->alpha); // Take a half step from the restart temperature. + state->alpha = 1.0 - ((1.0 - state->alpha) * annealing_sched.alpha_decay); + } else { + if (success_rat > annealing_sched.success_target) { + state->restart_t = state->t; + } + state->t *= state->alpha; + } + state->move_lim = std::max(1, std::min(state->move_lim_max, (int)(state->move_lim_max * (annealing_sched.success_target / success_rat)))); + } else { /* annealing_sched.type == AUTO_SCHED */ + if (success_rat > 0.96) { + state->alpha = 0.5; + } else if (success_rat > 0.8) { + state->alpha = 0.9; + } else if (success_rat > 0.15 || state->rlim > 1.) 
{ + state->alpha = 0.95; + } else { + state->alpha = 0.8; + } + state->t *= state->alpha; + + // Must be duplicated to retain previous behavior + if (state->t < t_exit || std::isnan(t_exit)) return false; + } + + // Gradually changes from the initial crit_exponent to the final crit_exponent based on how much the range limit has shrunk. + // The idea is that as the range limit shrinks (indicating we are fine-tuning a more optimized placement) we can focus more on a smaller number of critical connections, which a higher crit_exponent achieves. + update_rlim(&state->rlim, success_rat, device_ctx.grid); + + if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + state->crit_exponent = (1 - (state->rlim - state->final_rlim()) * state->inverse_delta_rlim) + * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) + + placer_opts.td_place_exp_first; + } + + return true; +} + +/** + * @brief Update the range limit to keep acceptance prob. near 0.44. + * + * Use a floating point rlim to allow gradual transitions at low temps. + */ +static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) { + float upper_lim = std::max(grid.width() - 1, grid.height() - 1); + + *rlim *= (1. - 0.44 + success_rat); + *rlim = std::max(std::min(*rlim, upper_lim), 1.f); +} \ No newline at end of file diff --git a/vpr/src/place/place_util.h b/vpr/src/place/place_util.h index 151890b668d..46887954c3e 100644 --- a/vpr/src/place/place_util.h +++ b/vpr/src/place/place_util.h @@ -1,12 +1,23 @@ #pragma once #include "vpr_types.h" +///@brief Forward declarations. +class t_placer_costs; +class t_annealing_state; + ///@brief Initialize the placement context void init_placement_context(); ///@brief Get the initial limit for inner loop block move attempt limit. int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched); +///@brief Update the annealing state according to the annealing schedule selected.
+bool update_annealing_state(t_annealing_state* state, + float success_rat, + const t_placer_costs& costs, + const t_placer_opts& placer_opts, + const t_annealing_sched& annealing_sched); + /** * @brief Data structure that stores different cost values in the placer. * diff --git a/vpr/src/place/timing_place.cpp b/vpr/src/place/timing_place.cpp index f7d940dfd5f..4593777ce15 100644 --- a/vpr/src/place/timing_place.cpp +++ b/vpr/src/place/timing_place.cpp @@ -175,16 +175,3 @@ void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float val PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const { return vtr::make_range(cluster_pins_with_modified_setup_slack_); } - -/**************************************/ - -std::unique_ptr alloc_lookups_and_criticalities(t_chan_width_dist chan_width_dist, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - t_det_routing_arch* det_routing_arch, - std::vector& segment_inf, - const t_direct_inf* directs, - const int num_directs) { - return compute_place_delay_model(placer_opts, router_opts, det_routing_arch, segment_inf, - chan_width_dist, directs, num_directs); -} diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing_place.h index d37983730f5..b88e72af2c2 100644 --- a/vpr/src/place/timing_place.h +++ b/vpr/src/place/timing_place.h @@ -7,13 +7,6 @@ #include "place_delay_model.h" #include "vpr_net_pins_matrix.h" -std::unique_ptr alloc_lookups_and_criticalities(t_chan_width_dist chan_width_dist, - const t_placer_opts& place_opts, - const t_router_opts& router_opts, - t_det_routing_arch* det_routing_arch, - std::vector& segment_inf, - const t_direct_inf* directs, - const int num_directs); /* Usage * ===== * PlacerCriticalities returns the clustered netlist connection criticalities used by From cc4488e3511284453b386ae19f431eeca9a67760 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 24 Aug 2020 03:05:33 -0400 Subject: [PATCH 19/21] Moved timing update 
routines from place.cpp to place_timing_update.*. Enhanced documentation. --- libs/libvtrutil/src/vtr_vec_id_set.h | 1 + vpr/src/place/place.cpp | 342 ++------------------------ vpr/src/place/place_timing_update.cpp | 326 ++++++++++++++++++++++++ vpr/src/place/place_timing_update.h | 94 +++++++ 4 files changed, 442 insertions(+), 321 deletions(-) create mode 100644 vpr/src/place/place_timing_update.cpp create mode 100644 vpr/src/place/place_timing_update.h diff --git a/libs/libvtrutil/src/vtr_vec_id_set.h b/libs/libvtrutil/src/vtr_vec_id_set.h index 9e0a1f0802e..ed6620b1cdd 100644 --- a/libs/libvtrutil/src/vtr_vec_id_set.h +++ b/libs/libvtrutil/src/vtr_vec_id_set.h @@ -2,6 +2,7 @@ #define VTR_SET_H #include +#include namespace vtr { diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index a6943d5e2e6..82de9ede639 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -1,3 +1,8 @@ +/** + * @file place.cpp + * @brief This is a core file that defines the major placer routines used by VPR. 
+ */ + #include #include #include @@ -45,6 +50,7 @@ #include "tatum/TimingReporter.hpp" #include "place_global.h" +#include "place_timing_update.h" using std::max; using std::min; @@ -89,17 +95,6 @@ struct t_placer_statistics { sum_of_squares; int success_sum; }; - -struct t_placer_timing_update_mode { - /* Determines if slacks/criticalities need to be updated */ - bool do_update_criticalities; - bool do_update_setup_slacks; - - /* Determines if slacks/criticalities need to be recomputed from scratch */ - bool do_recompute_criticalities; - bool do_recompute_setup_slacks; -}; - constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); constexpr double INVALID_COST = std::numeric_limits::quiet_NaN(); @@ -232,11 +227,6 @@ static const float cross_count[50] = {/* [0..49] */ 1.0, 1.0, 1.0, 1.0828, 1.153 2.5064, 2.5356, 2.5610, 2.5864, 2.6117, 2.6371, 2.6625, 2.6887, 2.7148, 2.7410, 2.7671, 2.7933}; -static float f_update_td_costs_connections_elapsed_sec = 0.; -static float f_update_td_costs_nets_elapsed_sec = 0.; -static float f_update_td_costs_sum_nets_elapsed_sec = 0.; -static float f_update_td_costs_total_elapsed_sec = 0.; - std::unique_ptr f_move_stats_file(nullptr, vtr::fclose); #ifdef VTR_ENABLE_DEBUG_LOGGING @@ -382,10 +372,6 @@ static double get_std_dev(int n, double sum_x_squared, double av_x); static double recompute_bb_cost(); -static void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks); - -static bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks); - static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); @@ -396,14 +382,6 @@ static void invalidate_affected_connection_delays(const std::vector& sink_pins_affected); @@ -456,24 +434,6 @@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info); -static void 
initialize_timing_info(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - PlacerSetupSlacks* setup_slacks, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_timing_update_mode* timing_update_mode, - t_placer_costs* costs); - -static void update_setup_slacks_and_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - PlacerSetupSlacks* setup_slacks, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_timing_update_mode* timing_update_mode, - t_placer_costs* costs); - static void placement_inner_loop(const t_annealing_state* state, int temp_num, const t_placer_opts& placer_opts, @@ -920,8 +880,8 @@ void try_place(const t_placer_opts& placer_opts, VTR_ASSERT(timing_info); //Update timing and costs - timing_update_mode.do_update_criticalities = true; - timing_update_mode.do_update_setup_slacks = true; + timing_update_mode.update_criticalities = true; + timing_update_mode.update_setup_slacks = true; update_setup_slacks_and_criticalities(state.crit_exponent, place_delay_model.get(), placer_criticalities.get(), @@ -981,7 +941,13 @@ void try_place(const t_placer_opts& placer_opts, print_timing_stats("Placement Quench", post_quench_timing_stats, pre_quench_timing_stats); print_timing_stats("Placement Total ", timing_ctx.stats, pre_place_timing_stats); - VTR_LOG("update_td_costs: connections %g nets %g sum_nets %g total %g\n", f_update_td_costs_connections_elapsed_sec, f_update_td_costs_nets_elapsed_sec, f_update_td_costs_sum_nets_elapsed_sec, f_update_td_costs_total_elapsed_sec); + auto update_td_costs_runtime_stats = get_update_td_costs_runtime_stats(); + + VTR_LOG("update_td_costs: connections %g nets %g sum_nets %g total %g\n", + update_td_costs_runtime_stats.connections_elapsed_sec, + update_td_costs_runtime_stats.nets_elapsed_sec, + 
update_td_costs_runtime_stats.sum_nets_elapsed_sec, + update_td_costs_runtime_stats.total_elapsed_sec); } /* Function to update the setup slacks and criticalities before the inner loop of the annealing/quench */ @@ -1011,8 +977,8 @@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, VTR_ASSERT(num_connections > 0); //Update all timing information - timing_update_mode->do_update_criticalities = true; - timing_update_mode->do_update_setup_slacks = true; + timing_update_mode->update_criticalities = true; + timing_update_mode->update_setup_slacks = true; update_setup_slacks_and_criticalities(crit_exponent, delay_model, criticalities, @@ -1032,91 +998,6 @@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, costs->update_norm_factors(); ///do_update_criticalities = true; - timing_update_mode->do_update_setup_slacks = true; - timing_update_mode->do_recompute_criticalities = true; - timing_update_mode->do_recompute_setup_slacks = true; - - //As a safety measure, for the first time update, - //invalidate all timing edges via the pin invalidator - //by passing in all the clb sink pins - for (ClusterNetId net_id : clb_nlist.nets()) { - for (ClusterPinId pin_id : clb_nlist.net_sinks(net_id)) { - pin_timing_invalidator->invalidate_connection(pin_id, timing_info); - } - } - - //Perform timing info update - update_setup_slacks_and_criticalities(crit_exponent, - delay_model, - criticalities, - setup_slacks, - pin_timing_invalidator, - timing_info, - timing_update_mode, - costs); - - //Initialize the data structure that stores committed placer setup slacks - commit_setup_slacks(setup_slacks); - - //Don't warn again about unconstrained nodes again during placement - timing_info->set_warn_unconstrained(false); -} - -//Update timing information based on current placement by running STA. 
-//Record the new slack information as well as calculate the updated -//criticalities and timing costs (based on the new setup slacks) -static void update_setup_slacks_and_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - PlacerSetupSlacks* setup_slacks, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_timing_update_mode* timing_update_mode, - t_placer_costs* costs) { - //Run STA to update slacks and adjusted/relaxed criticalities - timing_info->update(); - - if (timing_update_mode->do_update_setup_slacks) { - //Update placer's setup slacks - setup_slacks->update_setup_slacks(timing_info, timing_update_mode->do_recompute_setup_slacks); - } - - if (timing_update_mode->do_update_criticalities) { - //Update placer's criticalities (e.g. sharpen with crit_exponent) - criticalities->update_criticalities(timing_info, crit_exponent, timing_update_mode->do_recompute_criticalities); - - //Update connection, net and total timing costs based on new criticalities -#ifdef INCR_COMP_TD_COSTS - update_td_costs(delay_model, *criticalities, &costs->timing_cost); -#else - comp_td_costs(delay_model, *criticalities, &costs->timing_cost); -#endif - } - - //Setup slacks and criticalities need to be in sync with the timing_info. - //if they are to be incrementally updated on the next iteration. - //Otherwise, a re-computation for all clb sink pins is required. 
- timing_update_mode->do_recompute_setup_slacks = !timing_update_mode->do_update_setup_slacks; - timing_update_mode->do_recompute_criticalities = !timing_update_mode->do_update_criticalities; - - //Clear invalidation state - pin_timing_invalidator->reset(); -} - /* Function which contains the inner loop of the simulated annealing */ static void placement_inner_loop(const t_annealing_state* state, int temp_num, @@ -1190,8 +1071,8 @@ static void placement_inner_loop(const t_annealing_state* state, /* Using the delays in connection_delay, do a timing analysis to update slacks and * criticalities and update the timing cost since they will change. */ - timing_update_mode->do_update_criticalities = true; - timing_update_mode->do_update_setup_slacks = true; + timing_update_mode->update_criticalities = true; + timing_update_mode->update_setup_slacks = true; update_setup_slacks_and_criticalities(state->crit_exponent, delay_model, criticalities, @@ -1512,8 +1393,8 @@ static e_move_result try_swap(const t_annealing_state* state, //and committing the timing driven delays and costs. //If we wish to revert this timing update due to move rejection, //we need to revert block moves and restore the timing values. - timing_update_mode->do_update_criticalities = false; - timing_update_mode->do_update_setup_slacks = true; + timing_update_mode->update_criticalities = false; + timing_update_mode->update_setup_slacks = true; update_setup_slacks_and_criticalities(state->crit_exponent, delay_model, criticalities, @@ -1885,38 +1766,6 @@ static double recompute_bb_cost() { return (cost); } -//Commit all the setup slack values from the PlacerSetupSlacks class. -//This incremental routine will be correct if and only if it is called -//immediately after each time update_setup_slacks_and_criticalities -//updates the setup slacks (i.e. 
do_update_setup_slacks = true) -static void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks) { - const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - - //Incremental: only go through sink pins with modified setup slack - auto clb_pins_modified = setup_slacks->pins_with_modified_setup_slack(); - for (ClusterPinId pin_id : clb_pins_modified) { - ClusterNetId net_id = clb_nlist.pin_net(pin_id); - size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); - - connection_setup_slack[net_id][pin_index_in_net] = setup_slacks->setup_slack(net_id, pin_index_in_net); - } -} - -static bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks) { - const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - - //Go through every single sink pin to check that the slack values are the same - for (ClusterNetId net_id : clb_nlist.nets()) { - for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { - if (connection_setup_slack[net_id][ipin] != setup_slacks->setup_slack(net_id, ipin)) { - return false; - } - } - } - - return true; -} - /* Update the connection_timing_cost values from the temporary * * values for all connections that have changed. */ static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { @@ -1991,155 +1840,6 @@ static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_b return false; } -//Incrementally updates timing cost based on the current delays and criticality estimates -// Unlike comp_td_costs() this only updates connections who's criticality has changed; -// this is a superset of those connections who's delay has changed. 
-// -// For a from-scratch recalculation see comp_td_cost() -static void update_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { - /* NB: We must be careful calculating the total timing cost incrementally, - * due to limitd floating point precision, so that we get a - * bit-identical result matching that calculated by comp_td_costs(). - * - * In particular, we can not simply calculate the incremental - * delta's caused by changed connection timing costs and adjust - * the timing cost. Due to limited precision, the results of - * floating point math operations are order dependant and we - * would get a different result. - * - * To get around this, we calculate the timing costs hierarchically - * to ensures we calculate the sum with the same order of operations - * as comp_td_costs(). - * - * See PlacerTimingCosts object used to represent connection_timing_costs - * for details. - */ - vtr::Timer t; - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& clb_nlist = cluster_ctx.clb_nlist; - - //Update the modified pin timing costs - { - vtr::Timer timer; - auto clb_pins_modified = place_crit.pins_with_modified_criticality(); - for (ClusterPinId clb_pin : clb_pins_modified) { - if (clb_nlist.pin_type(clb_pin) == PinType::DRIVER) continue; - - ClusterNetId clb_net = clb_nlist.pin_net(clb_pin); - VTR_ASSERT_SAFE(clb_net); - - if (cluster_ctx.clb_nlist.net_is_ignored(clb_net)) continue; - - int ipin = clb_nlist.pin_net_index(clb_pin); - VTR_ASSERT_SAFE(ipin >= 1 && ipin < int(clb_nlist.net_pins(clb_net).size())); - - double new_timing_cost = comp_td_connection_cost(delay_model, place_crit, clb_net, ipin); - - //Record new value - connection_timing_cost[clb_net][ipin] = new_timing_cost; - } - - f_update_td_costs_connections_elapsed_sec += timer.elapsed_sec(); - } - - //Re-total timing costs of all nets - { - vtr::Timer timer; - *timing_cost = connection_timing_cost.total_cost(); - 
f_update_td_costs_sum_nets_elapsed_sec += timer.elapsed_sec(); - } - -#ifdef VTR_ASSERT_DEBUG_ENABLED - double check_timing_cost = 0.; - comp_td_costs(delay_model, place_crit, &check_timing_cost); - VTR_ASSERT_DEBUG_MSG(check_timing_cost == *timing_cost, - "Total timing cost calculated incrementally in update_td_costs() is " - "not consistent with value calculated from scratch in comp_td_costs()"); -#endif - f_update_td_costs_total_elapsed_sec += t.elapsed_sec(); -} - -//Recomputes timing cost from scratch based on the current delays and criticality estimates -// -// For a more efficient incremental update see update_td_costs() -static void comp_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { - /* Computes the cost (from scratch) from the delays and criticalities * - * of all point to point connections, we define the timing cost of * - * each connection as criticality*delay. */ - - /* NB: We calculate the timing cost in a hierarchicl manner (first connectsion, - * then nets, then sum of nets) in order to allow it to be incrementally - * while avoiding round-off effects. See update_td_costs() for details. - */ - - auto& cluster_ctx = g_vpr_ctx.clustering(); - - for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* For each net ... */ - - if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; - - for (unsigned ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) { - float conn_timing_cost = comp_td_connection_cost(delay_model, place_crit, net_id, ipin); - - //Record new value - connection_timing_cost[net_id][ipin] = conn_timing_cost; - } - - //Store net timing cost for more efficient incremental updating - net_timing_cost[net_id] = sum_td_net_cost(net_id); - } - - /* Make sure timing cost does not go above MIN_TIMING_COST. */ - *timing_cost = sum_td_costs(); -} - -//Calculates the timing cost of the specified connection. 
-// Updates the value in connection_timing_cost -// Assumes only be called from compt_td_cost() or update_td_costs() -static double comp_td_connection_cost(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, ClusterNetId net, int ipin) { - VTR_ASSERT_SAFE_MSG(ipin > 0, "Shouldn't be calculating connection timing cost for driver pins"); - - VTR_ASSERT_SAFE_MSG(connection_delay[net][ipin] == comp_td_single_connection_delay(delay_model, net, ipin), - "Connection delays should already be updated"); - - double conn_timing_cost = place_crit.criticality(net, ipin) * connection_delay[net][ipin]; - - VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_delay[net][ipin]), - "Propsoed connection delay should already be invalidated"); - - VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_timing_cost[net][ipin]), - "Proposed connection timing cost should already be invalidated"); - - return conn_timing_cost; -} - -//Returns the timing cost of the specified 'net' based on the values in connection_timing_cost -static double sum_td_net_cost(ClusterNetId net) { - auto& cluster_ctx = g_vpr_ctx.clustering(); - - double net_td_cost = 0; - for (unsigned ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) { - net_td_cost += connection_timing_cost[net][ipin]; - } - - return net_td_cost; -} - -//Returns the total timing cost accross all nets based on the values in net_timing_cost -static double sum_td_costs() { - auto& cluster_ctx = g_vpr_ctx.clustering(); - - double td_cost = 0; - for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* For each net ... */ - - if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; - - td_cost += net_timing_cost[net_id]; - } - - return td_cost; -} - /* Finds the cost from scratch. Done only when the placement * * has been radically changed (i.e. after initial placement). * * Otherwise find the cost change incrementally. 
If method * diff --git a/vpr/src/place/place_timing_update.cpp b/vpr/src/place/place_timing_update.cpp new file mode 100644 index 00000000000..bbcc5c9bb22 --- /dev/null +++ b/vpr/src/place/place_timing_update.cpp @@ -0,0 +1,326 @@ +/** + * @file place_timing_update.cpp + * @brief Defines the routines declared in place_timing_update.h. + */ + +#include "vtr_time.h" + +#include "place_timing_update.h" +#include "place_global.h" + +///@brief Use an incremental approach to updating timing costs after re-computing criticalities +static constexpr bool INCR_COMP_TD_COSTS = true; + +///@brief File-scope variable that can be accessed via the routine get_udpate_td_costs_runtime_stats(). +static t_update_td_costs_stats update_td_costs_stats; + +///@brief Routines local to place_timing_update.cpp +static double comp_td_connection_cost(const PlaceDelayModel* delay_model, + const PlacerCriticalities& place_crit, + ClusterNetId net, + int ipin); +static double sum_td_net_cost(ClusterNetId net); +static double sum_td_costs(); + +/** + * @brief Initialize the timing information and structures in the placer. + * + * Perform first time update on the timing graph, and initialize the values within + * PlacerCriticalities, PlacerSetupSlacks, and connection_timing_cost. + */ +void initialize_timing_info(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& clb_nlist = cluster_ctx.clb_nlist; + + //Initialize the timing update mode. 
Update both + //setup slacks and criticalities from scratch + timing_update_mode->update_criticalities = true; + timing_update_mode->update_setup_slacks = true; + timing_update_mode->recompute_criticalities = true; + timing_update_mode->recompute_setup_slacks = true; + + //As a safety measure, for the first time update, + //invalidate all timing edges via the pin invalidator + //by passing in all the clb sink pins + for (ClusterNetId net_id : clb_nlist.nets()) { + for (ClusterPinId pin_id : clb_nlist.net_sinks(net_id)) { + pin_timing_invalidator->invalidate_connection(pin_id, timing_info); + } + } + + //Perform timing info update + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + //Compute timing cost from scratch + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + + //Initialize the data structure that stores committed placer setup slacks + commit_setup_slacks(setup_slacks); + + //Don't warn again about unconstrained nodes again during placement + timing_info->set_warn_unconstrained(false); +} + +/** + * @brief Update timing info based on the current block positions. + * + * Update the values stored in PlacerCriticalities and PlacerSetupSlacks. + * This routine tries its best to be incremental when it comes to updating + * these values, and branching variables are stored in `timing_update_mode`. + * For a detailed description of how these variables work, please refer to + * the declaration documentation on t_placer_timing_update_mode. + * + * If criticalities are updated, the timing costs are updated as well. + * Calling this routine to update timing_cost will produce round-off error + * in the long run, so this value will be recomputed once in a while, via + * other timing driven routines. 
+ * + * All the pins with changed connection delays have already been added into + * the ClusteredPinTimingInvalidator to allow incremental STA update. These + * changed connection delays are a direct result of moved blocks in try_swap(). + */ +void update_setup_slacks_and_criticalities(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs) { + //Run STA to update slacks and adjusted/relaxed criticalities + timing_info->update(); + + if (timing_update_mode->update_setup_slacks) { + //Update placer's setup slacks + setup_slacks->update_setup_slacks(timing_info, timing_update_mode->recompute_setup_slacks); + } + + if (timing_update_mode->update_criticalities) { + //Update placer's criticalities (e.g. sharpen with crit_exponent) + criticalities->update_criticalities(timing_info, crit_exponent, timing_update_mode->recompute_criticalities); + + //Update connection, net and total timing costs based on new criticalities + if (INCR_COMP_TD_COSTS) { + update_td_costs(delay_model, *criticalities, &costs->timing_cost); + } else { + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + } + } + + //Setup slacks and criticalities need to be in sync with the timing_info. + //if they are to be incrementally updated on the next iteration. + //Otherwise, a re-computation for all clb sink pins is required. + timing_update_mode->recompute_setup_slacks = !timing_update_mode->update_setup_slacks; + timing_update_mode->recompute_criticalities = !timing_update_mode->update_criticalities; + + //Clear invalidation state + pin_timing_invalidator->reset(); +} + +/** + * @brief Incrementally updates timing cost based on the current delays and criticality estimates. 
+ * + * Unlike comp_td_costs(), this only updates connections who's criticality has changed. + * This is a superset of those connections whose connection delay has changed. For a + * from-scratch recalculation, refer to comp_td_cost(). + * + * We must be careful calculating the total timing cost incrementally, due to limited + * floating point precision, so that we get a bit-identical result matching the one + * calculated by comp_td_costs(). + * + * In particular, we can not simply calculate the incremental delta's caused by changed + * connection timing costs and adjust the timing cost. Due to limited precision, the results + * of floating point math operations are order dependant and we would get a different result. + * + * To get around this, we calculate the timing costs hierarchically, to ensure that we + * calculate the sum with the same order of operations as comp_td_costs(). + * + * See PlacerTimingCosts object used to represent connection_timing_costs for details. + */ +void update_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { + vtr::Timer t; + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& clb_nlist = cluster_ctx.clb_nlist; + + //Update the modified pin timing costs + { + vtr::Timer timer; + auto clb_pins_modified = place_crit.pins_with_modified_criticality(); + for (ClusterPinId clb_pin : clb_pins_modified) { + if (clb_nlist.pin_type(clb_pin) == PinType::DRIVER) continue; + + ClusterNetId clb_net = clb_nlist.pin_net(clb_pin); + VTR_ASSERT_SAFE(clb_net); + + if (cluster_ctx.clb_nlist.net_is_ignored(clb_net)) continue; + + int ipin = clb_nlist.pin_net_index(clb_pin); + VTR_ASSERT_SAFE(ipin >= 1 && ipin < int(clb_nlist.net_pins(clb_net).size())); + + double new_timing_cost = comp_td_connection_cost(delay_model, place_crit, clb_net, ipin); + + //Record new value + connection_timing_cost[clb_net][ipin] = new_timing_cost; + } + + update_td_costs_stats.connections_elapsed_sec += 
timer.elapsed_sec(); + } + + //Re-total timing costs of all nets + { + vtr::Timer timer; + *timing_cost = connection_timing_cost.total_cost(); + update_td_costs_stats.sum_nets_elapsed_sec += timer.elapsed_sec(); + } + +#ifdef VTR_ASSERT_DEBUG_ENABLED + double check_timing_cost = 0.; + comp_td_costs(delay_model, place_crit, &check_timing_cost); + VTR_ASSERT_DEBUG_MSG(check_timing_cost == *timing_cost, + "Total timing cost calculated incrementally in update_td_costs() is " + "not consistent with value calculated from scratch in comp_td_costs()"); +#endif + update_td_costs_stats.total_elapsed_sec += t.elapsed_sec(); +} + +/** + * @brief Recomputes timing cost from scratch based on the current delays and criticality estimates. + * + * Computes the cost (from scratch) from the delays and criticalities of all point to point + * connections, we define the timing cost of each connection as criticality * delay. + * + * We calculate the timing cost in a hierarchical manner (first connection, then nets, then + * sum of nets) in order to allow it to be incremental while avoiding round-off effects. + * + * For a more efficient incremental update, see update_td_costs(). + */ +void comp_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + + for (auto net_id : cluster_ctx.clb_nlist.nets()) { + if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; + + for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) { + float conn_timing_cost = comp_td_connection_cost(delay_model, place_crit, net_id, ipin); + + /* Record new value */ + connection_timing_cost[net_id][ipin] = conn_timing_cost; + } + /* Store net timing cost for more efficient incremental updating */ + net_timing_cost[net_id] = sum_td_net_cost(net_id); + } + /* Make sure timing cost does not go above MIN_TIMING_COST. 
*/ + *timing_cost = sum_td_costs(); +} + +/** + * @brief Calculates the timing cost of the specified connection. + * + * This routine assumes that it is only called by either comp_td_costs() or + * update_td_costs(). Otherwise, various assertions below would fail. + */ +static double comp_td_connection_cost(const PlaceDelayModel* delay_model, + const PlacerCriticalities& place_crit, + ClusterNetId net, + int ipin) { + VTR_ASSERT_SAFE_MSG(ipin > 0, "Shouldn't be calculating connection timing cost for driver pins"); + + VTR_ASSERT_SAFE_MSG(connection_delay[net][ipin] == comp_td_single_connection_delay(delay_model, net, ipin), + "Connection delays should already be updated"); + + double conn_timing_cost = place_crit.criticality(net, ipin) * connection_delay[net][ipin]; + + VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_delay[net][ipin]), + "Propsoed connection delay should already be invalidated"); + + VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_timing_cost[net][ipin]), + "Proposed connection timing cost should already be invalidated"); + + return conn_timing_cost; +} + +///@brief Returns the timing cost of the specified 'net' based on the values in connection_timing_cost. +static double sum_td_net_cost(ClusterNetId net) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + + double net_td_cost = 0; + for (unsigned ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) { + net_td_cost += connection_timing_cost[net][ipin]; + } + + return net_td_cost; +} + +///@brief Returns the total timing cost across all nets based on the values in net_timing_cost. +static double sum_td_costs() { + auto& cluster_ctx = g_vpr_ctx.clustering(); + + double td_cost = 0; + for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* For each net ...
*/ + + if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; + + td_cost += net_timing_cost[net_id]; + } + + return td_cost; +} + +/** + * @brief Commit all the setup slack values from the PlacerSetupSlacks + * class to a vtr matrix. + * + * This incremental routine will be correct if and only if it is called + * immediately after each time update_setup_slacks_and_criticalities + * updates the setup slacks (i.e. update_setup_slacks = true). + */ +void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; + + //Incremental: only go through sink pins with modified setup slack + auto clb_pins_modified = setup_slacks->pins_with_modified_setup_slack(); + for (ClusterPinId pin_id : clb_pins_modified) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); + + connection_setup_slack[net_id][pin_index_in_net] = setup_slacks->setup_slack(net_id, pin_index_in_net); + } +} + +/** + * @brief Verify that the values in the vtr matrix matches the PlacerSetupSlacks class. + * + * Return true if all values are identical. Otherwise, return false. + */ +bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; + + //Go through every single sink pin to check that the slack values are the same + for (ClusterNetId net_id : clb_nlist.nets()) { + for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { + if (connection_setup_slack[net_id][ipin] != setup_slacks->setup_slack(net_id, ipin)) { + return false; + } + } + } + return true; +} + +///@brief Fetch the file-scope variable update_td_costs_stats in timing_place.cpp. 
+t_update_td_costs_stats get_update_td_costs_runtime_stats() { + return update_td_costs_stats; +} diff --git a/vpr/src/place/place_timing_update.h b/vpr/src/place/place_timing_update.h new file mode 100644 index 00000000000..fa5a47e8727 --- /dev/null +++ b/vpr/src/place/place_timing_update.h @@ -0,0 +1,94 @@ +/** + * @file place_timing_update.h + * @brief Stores timing update routines declarations used by the VPR placer. + */ +#pragma once +#include "timing_place.h" +#include "place_util.h" + +/// Date: Tue, 25 Aug 2020 03:46:10 -0400 Subject: [PATCH 20/21] Enchanced documentation for timing_place.*. Moved chanx, chany 2d arrays to the placement global file. ALso fixed a bug with in class static constexpr variable compilation issue. --- vpr/src/place/place.cpp | 35 ++- vpr/src/place/place_global.h | 4 +- vpr/src/place/place_timing_update.cpp | 42 +++- vpr/src/place/place_util.cpp | 12 +- vpr/src/place/place_util.h | 14 +- vpr/src/place/timing_place.cpp | 109 +++++---- vpr/src/place/timing_place.h | 307 +++++++++++++++++--------- 7 files changed, 358 insertions(+), 165 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 82de9ede639..e82753bef59 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -186,6 +186,28 @@ vtr::vector net_timing_cost; */ vtr::vector bb_coords, bb_num_on_edges; +/** + * @brief 2D arrays used to precompute the inverse of the average + * number of tracks per channel between [subhigh] and [sublow]. + * + * Access them as chan?_place_cost_fac[subhigh][sublow]. + * They are used to speed up the computation of the cost function that + * takes the length of the net bounding box in each dimension, divided + * by the average number of tracks in that direction. + * + * For other cost functions they will never be used. 
+ * + * @param chanx_place_cost_fac + * 1st dimension index range: [0...device_ctx.grid.width()-2] + * @param chany_place_cost_fac + * 1st dimension index range: [0...device_ctx.grid.height()-2] + * + * For more detailed structure allocation process and index ranges, see + * alloc_and_load_for_fast_cost_update(). + */ +float** chanx_place_cost_fac; +float** chany_place_cost_fac; + /** * @brief The following arrays are used by the try_swap function for speed. * @@ -196,17 +218,6 @@ std::vector ts_nets_to_update; /********** End of definitions of variables in place_global.h **********/ -/* The arrays below are used to precompute the inverse of the average * - * number of tracks per channel between [subhigh] and [sublow]. Access * - * them as chan?_place_cost_fac[subhigh][sublow]. They are used to * - * speed up the computation of the cost function that takes the length * - * of the net bounding box in each dimension, divided by the average * - * number of tracks in that direction; for other cost functions they * - * will never be used. * - */ -static float** chanx_place_cost_fac; //[0...device_ctx.grid.width()-2] -static float** chany_place_cost_fac; //[0...device_ctx.grid.height()-2] - /* These file-scoped variables keep track of the number of swaps * * rejected, accepted or aborted. The total number of swap attempts * * is the sum of the three number. 
*/ @@ -2813,4 +2824,4 @@ static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& pla bool placer_needs_lookahead(const t_vpr_setup& vpr_setup) { return (vpr_setup.PlacerOpts.place_algorithm == PATH_TIMING_DRIVEN_PLACE); -} \ No newline at end of file +} diff --git a/vpr/src/place/place_global.h b/vpr/src/place/place_global.h index b9615f2a328..8ab36b9d1f5 100644 --- a/vpr/src/place/place_global.h +++ b/vpr/src/place/place_global.h @@ -31,4 +31,6 @@ extern ClbNetPinsMatrix proposed_connection_timing_cost; extern vtr::vector net_timing_cost; extern vtr::vector bb_coords, bb_num_on_edges; extern vtr::vector ts_bb_coord_new, ts_bb_edge_new; -extern std::vector ts_nets_to_update; \ No newline at end of file +extern float** chanx_place_cost_fac; +extern float** chany_place_cost_fac; +extern std::vector ts_nets_to_update; diff --git a/vpr/src/place/place_timing_update.cpp b/vpr/src/place/place_timing_update.cpp index bbcc5c9bb22..fa74f97dfb5 100644 --- a/vpr/src/place/place_timing_update.cpp +++ b/vpr/src/place/place_timing_update.cpp @@ -92,6 +92,26 @@ void initialize_timing_info(float crit_exponent, * All the pins with changed connection delays have already been added into * the ClusteredPinTimingInvalidator to allow incremental STA update. These * changed connection delays are a direct result of moved blocks in try_swap(). + * + * @param crit_exponent Used to calculate `sharpened` criticalities. + * + * @param delay_model Used to calculate the delay between two locations. + * + * @param criticalities Mapping interface between atom pin criticalities + * and clb pin criticalities. + * + * @param setup_slacks Mapping interface between atom pin raw setup slacks + * and clb pin raw setup slacks. + * + * @param pin_timing_invalidator Stores all the pins that have their delay value changed + * and needs to be updated in the timing graph. + * + * @param timing_info Stores the timing graph and other important timing info. 
+ * + * @param timing_update_mode Determines what should be updated when this routine is + * called, and using incremental techniques is appropriate. + * + * @param costs Stores the updated timing cost for the whole placement. */ void update_setup_slacks_and_criticalities(float crit_exponent, const PlaceDelayModel* delay_model, @@ -284,9 +304,19 @@ static double sum_td_costs() { * @brief Commit all the setup slack values from the PlacerSetupSlacks * class to a vtr matrix. * - * This incremental routine will be correct if and only if it is called - * immediately after each time update_setup_slacks_and_criticalities - * updates the setup slacks (i.e. update_setup_slacks = true). + * This routine is incremental since it relies on the pins_with_modified_setup_slack() + * to detect which pins need to be updated and which pins do not. + * + * Therefore, it is assumed that this routine is always called immediately after + * each time update_setup_slacks_and_criticalities() updates the setup slacks + * (i.e. t_placer_timing_update_mode::update_setup_slacks = true). Otherwise, + * pins_with_modified_setup_slack() cannot accurately account for all the pins + * that have their setup slacks changed, making this routine incorrect. + * + * Currently, the only exception to the rule above is when setup slack analysis is used + * during the placement quench. The new setup slacks might be either accepted or + * rejected, so for efficiency reasons, this routine is not called if the slacks are + * rejected in the end. For more detailed info, see the try_swap() routine. */ void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks) { const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; @@ -305,6 +335,12 @@ void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks) { * @brief Verify that the values in the vtr matrix matches the PlacerSetupSlacks class. * * Return true if all values are identical. Otherwise, return false. 
+ * Used to check if the timing update has been successfully reverted if a proposed move + * is rejected when applying setup slack analysis during the placement quench. + * If successful, the setup slacks in the timing analyzer should be the same as + * the setup slacks in connection_setup_slack matrix without running commit_setup_slacks(). + * + * For more detailed info, see the try_swap() routine. */ bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks) { const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; diff --git a/vpr/src/place/place_util.cpp b/vpr/src/place/place_util.cpp index b06a5b8c9ef..bb2738580b2 100644 --- a/vpr/src/place/place_util.cpp +++ b/vpr/src/place/place_util.cpp @@ -1,19 +1,29 @@ +/** + * @file place_util.cpp + * @brief Definitions of structure routines declared in place_util.h. + */ + #include "place_util.h" #include "globals.h" +/// init_grid_blocks(); static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid); +///@brief Initialize the placement context. void init_placement_context() { auto& place_ctx = g_vpr_ctx.mutable_placement(); auto& cluster_ctx = g_vpr_ctx.clustering(); + /* Initialize the lookup of CLB block positions */ place_ctx.block_locs.clear(); place_ctx.block_locs.resize(cluster_ctx.clb_nlist.blocks().size()); + /* Initialize the reverse lookup of CLB block positions */ place_ctx.grid_blocks = init_grid_blocks(); } +///@brief Initialize `grid_blocks`, the inverse structure of `block_locs`. static vtr::Matrix init_grid_blocks() { auto& device_ctx = g_vpr_ctx.device(); @@ -200,4 +210,4 @@ static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) *rlim *= (1.
- 0.44 + success_rat); *rlim = std::max(std::min(*rlim, upper_lim), 1.f); -} \ No newline at end of file +} diff --git a/vpr/src/place/place_util.h b/vpr/src/place/place_util.h index 46887954c3e..04d94f2f38d 100644 --- a/vpr/src/place/place_util.h +++ b/vpr/src/place/place_util.h @@ -1,3 +1,9 @@ +/** + * @file place_util.h + * @brief Utility structures representing various states of the + * placement. Also contains declarations of related routines. + */ + #pragma once #include "vpr_types.h" @@ -5,7 +11,7 @@ class t_placer_costs; class t_annealing_state; -///@brief Initialize the placement context +///@brief Initialize the placement context. void init_placement_context(); ///@brief Get the initial limit for inner loop block move attempt limit. @@ -57,7 +63,7 @@ class t_placer_costs { double timing_cost_norm; private: - static constexpr double MAX_INV_TIMING_COST = 1.e9; + double MAX_INV_TIMING_COST = 1.e9; enum e_place_algorithm place_algorithm; public: //Constructor @@ -98,7 +104,7 @@ class t_annealing_state { int move_lim; private: - static constexpr float FINAL_RLIM = 1.; + float FINAL_RLIM = 1.; public: //Constructor t_annealing_state(const t_annealing_sched& annealing_sched, @@ -109,4 +115,4 @@ class t_annealing_state { public: //Accessor float final_rlim() const { return FINAL_RLIM; } -}; +}; \ No newline at end of file diff --git a/vpr/src/place/timing_place.cpp b/vpr/src/place/timing_place.cpp index 4593777ce15..ae8e1b1e27c 100644 --- a/vpr/src/place/timing_place.cpp +++ b/vpr/src/place/timing_place.cpp @@ -1,3 +1,7 @@ +/** + * @file timing_place.cpp + * @brief Stores the method definitions of classes defined in timing_place.h. + */ #include #include @@ -14,35 +18,34 @@ #include "timing_info.h" -//Use an incremental approach to updating criticalities and setup slacks? -constexpr bool INCR_UPDATE_CRITICALITIES = true; -constexpr bool INCR_UPDATE_SETUP_SLACKS = true; +///@brief Use an incremental approach to updating criticalities and setup slacks? 
+static constexpr bool INCR_UPDATE_CRITICALITIES = true, INCR_UPDATE_SETUP_SLACKS = true; -/**************************************/ - -/* Allocates space for the timing_place_crit_ data structure * - * I chunk the data to save space on large problems. */ +///@brief Allocates space for the timing_place_crit_ data structure. PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup) : clb_nlist_(clb_nlist) , pin_lookup_(netlist_pin_lookup) , timing_place_crit_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { } +/** + * @brief Updated the criticalities in the timing_place_crit_ data structure. + * + * If the criticalities are not updated immediately after each time we call + * timing_info->update(), then timing_info->pins_with_modified_setup_criticality() + * cannot accurately account for all the pins that need to be updated. In this case, + * we pass in recompute=true to update all criticalities from scratch. + * + * If the criticality exponent has changed, we also need to update from scratch. + */ void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_info, float crit_exponent, bool recompute) { - //If the criticalities are not updated immediately after each time we call - //timing_info->update(), then timing_info->pins_with_modified_setup_criticality() - //cannot accurately account for all the pins that need to be updated. - //In this case, we pass in recompute=true to update all criticalities from scratch. - // - //If the criticality exponent has changed, we also need to update from scratch. 
- - //Determine what pins need updating + /* Determine what pins need updating */ if (!recompute && crit_exponent == last_crit_exponent_ && INCR_UPDATE_CRITICALITIES) { incr_update_criticalities(timing_info); } else { recompute_criticalities(); - //Record new criticality exponent + /* Record new criticality exponent */ last_crit_exponent_ = crit_exponent; } @@ -50,7 +53,7 @@ void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_inf * For every pin on every net (or, equivalently, for every tedge ending * in that pin), timing_place_crit_ = criticality^(criticality exponent) */ - // Update the effected pins + /* Update the effected pins */ for (ClusterPinId clb_pin : cluster_pins_with_modified_criticality_) { ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); @@ -64,18 +67,21 @@ void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_inf } } +/** + * @brief Collect the cluster pins which need to be updated based on the latest timing + * analysis so that incremental updates to criticalities can be performed. + * + * Note we use the set of pins reported by the *timing_info* as having modified + * criticality, rather than those marked as modified by the timing analyzer. + * + * Since timing_info uses shifted/relaxed criticality (which depends on max required + * time and worst case slacks), additional nodes may be modified when updating the + * atom pin criticalities. + */ + void PlacerCriticalities::incr_update_criticalities(const SetupTimingInfo* timing_info) { cluster_pins_with_modified_criticality_.clear(); - //Collect the cluster pins which need to be updated based on the latest timing - //analysis - // - //Note we use the set of pins reported by the *timing_info* as having modified - //criticality, rather than those marked as modified by the timing analyzer. 
- //Since timing_info uses shifted/relaxed criticality (which depends on max - //required time and worst case slacks), additional nodes may be modified - //when updating the atom pin criticalities. - for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_criticality()) { ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); @@ -88,10 +94,15 @@ void PlacerCriticalities::incr_update_criticalities(const SetupTimingInfo* timin } } +/** + * @brief Collect all the sink pins in the netlist and prepare them update. + * + * For the incremental version, see PlacerCriticalities::incr_update_criticalities(). + */ void PlacerCriticalities::recompute_criticalities() { cluster_pins_with_modified_criticality_.clear(); - //Non-incremental: all sink pins need updating + /* Non-incremental: all sink pins need updating */ for (ClusterNetId net_id : clb_nlist_.nets()) { for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { cluster_pins_with_modified_criticality_.insert(pin_id); @@ -99,35 +110,44 @@ void PlacerCriticalities::recompute_criticalities() { } } +///@brief Override the criticality of a particular connection. void PlacerCriticalities::set_criticality(ClusterNetId net_id, int ipin, float val) { timing_place_crit_[net_id][ipin] = val; } +/** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which + * were modified by the last call to PlacerCriticalities::update_criticalities(). + */ PlacerCriticalities::pin_range PlacerCriticalities::pins_with_modified_criticality() const { return vtr::make_range(cluster_pins_with_modified_criticality_); } /**************************************/ -/* Allocates space for the timing_place_setup_slacks_ data structure */ +///@brief Allocates space for the timing_place_setup_slacks_ data structure. 
PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup) : clb_nlist_(clb_nlist) , pin_lookup_(netlist_pin_lookup) , timing_place_setup_slacks_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { } +/** + * @brief Updated the setup slacks in the timing_place_setup_slacks_ data structure. + * + * If the setup slacks are not updated immediately after each time we call + * timing_info->update(), then timing_info->pins_with_modified_setup_slack() + * cannot accurately account for all the pins that need to be updated. + * In this case, we pass in recompute=true to update all setup slacks from scratch. + */ void PlacerSetupSlacks::update_setup_slacks(const SetupTimingInfo* timing_info, bool recompute) { - //If the setup slacks are not updated immediately after each time we call - //timing_info->update(), then timing_info->pins_with_modified_setup_slack() - //cannot accurately account for all the pins that need to be updated. - //In this case, we pass in recompute=true to update all setup slacks from scratch. if (!recompute && INCR_UPDATE_SETUP_SLACKS) { incr_update_setup_slacks(timing_info); } else { recompute_setup_slacks(); } - //Update the effected pins + /* Update the effected pins */ for (ClusterPinId clb_pin : cluster_pins_with_modified_setup_slack_) { ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); @@ -138,13 +158,16 @@ void PlacerSetupSlacks::update_setup_slacks(const SetupTimingInfo* timing_info, } } +/** + * @brief Collect the cluster pins which need to be updated based on the latest timing + * analysis so that incremental updates to setup slacks can be performed. + * + * Note we use the set of pins reported by the *timing_info* as having modified + * setup slacks, rather than those marked as modified by the timing analyzer. 
+ */ void PlacerSetupSlacks::incr_update_setup_slacks(const SetupTimingInfo* timing_info) { cluster_pins_with_modified_setup_slack_.clear(); - //Collect the cluster pins which need to be updated based on the latest timing analysis - // - //Note we use the set of pins reported by the *timing_info* as having modified - //setup slacks, rather than those marked as modified by the timing analyzer. for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_slack()) { ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); @@ -157,10 +180,15 @@ void PlacerSetupSlacks::incr_update_setup_slacks(const SetupTimingInfo* timing_i } } +/** + * @brief Collect all the sink pins in the netlist and prepare them for update. + * + * For the incremental version, see PlacerSetupSlacks::incr_update_setup_slacks(). + */ void PlacerSetupSlacks::recompute_setup_slacks() { cluster_pins_with_modified_setup_slack_.clear(); - //Non-incremental: all sink pins need updating + /* Non-incremental: all sink pins need updating */ for (ClusterNetId net_id : clb_nlist_.nets()) { for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { cluster_pins_with_modified_setup_slack_.insert(pin_id); @@ -168,10 +196,15 @@ void PlacerSetupSlacks::recompute_setup_slacks() { } } +///@brief Override the setup slack of a particular connection. void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float val) { timing_place_setup_slacks_[net_id][ipin] = val; } +/** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) + * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). 
+ */ PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const { return vtr::make_range(cluster_pins_with_modified_setup_slack_); } diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing_place.h index b88e72af2c2..50042b50ea4 100644 --- a/vpr/src/place/timing_place.h +++ b/vpr/src/place/timing_place.h @@ -1,3 +1,32 @@ +/** + * @file timing_place.h + * @brief Interface used by the VPR placer to query information + * from the Tatum timing analyzer. + * + * @class PlacerSetupSlacks + * Queries connection **RAW** setup slacks, which can + * range from negative to positive values. Also maps + * atom pin setup slacks to clb pin setup slacks. + * @class PlacerCriticalities + * Query connection criticalities, which are calculated + * based on the raw setup slacks and range from 0 to 1. + * Also maps atom pin crit. to clb pin crit. + * @class PlacerTimingCosts + * Hierarchical structure used by update_td_costs() to + * maintain the order of addition operation of float values + * (to avoid round-offs) while doing incremental updates. + * + * Calculating criticalities: + * All the raw setup slack values across a single clock domain are gathered, shifted, + * and rated from best to worst. The best shifted slack value (the most positive one) + * will have a criticality of 0, while the worst shifted slack value (always 0) + * will have a criticality of 1. Criticalities are used to calculate timing costs + * for each connection (delay * criticality). + * + * For a more detailed description on how criticalities are calculated, see + * calc_relaxed_criticality() in `timing_util.cpp`. + */ + #ifndef TIMING_PLACE #define TIMING_PLACE @@ -7,32 +36,42 @@ #include "place_delay_model.h" #include "vpr_net_pins_matrix.h" -/* Usage +/** + * @brief PlacerCriticalities returns the clustered netlist connection criticalities + * used by the placer ('sharpened' by a criticality exponent). 
+ * + * Usage * ===== - * PlacerCriticalities returns the clustered netlist connection criticalities used by - * the placer ('sharpened' by a criticality exponent). This also serves to map atom - * netlist level criticalites (i.e. on AtomPinIds) to the clustered netlist (i.e. - * ClusterPinIds) used during placement. + * This class also serves to map atom netlist level criticalites (i.e. on AtomPinIds) + * to the clustered netlist (i.e. ClusterPinIds) used during placement. * - * Criticalities are calculated by calling update_criticalities(), which will - * update criticalities based on the atom netlist connection criticalities provided by - * the passed in SetupTimingInfo. This is done incrementally, based on the modified - * connections/AtomPinIds returned by SetupTimingInfo. + * Criticalities are calculated by calling update_setup_slacks_and_criticalities() and + * setting t_placer_timing_update_mode::update_criticalities to true. It will update + * criticalities based on the atom netlist connection criticalities provided by the + * passed in SetupTimingInfo. * - * The criticalities of individual connections can then be queried by calling the - * criticality() member function. + * This process can be done incrementally, based on the modified connections/AtomPinIds + * returned by SetupTimingInfo. But sometimes a recomputation is required. For detailed + * information please see the description of `t_placer_timing_update_mode` structure. * - * It also supports iterating via pins_with_modified_criticalities() through the - * clustered netlist pins/connections which have had their criticality modified by - * the last call to update_criticalities(), which is useful for incrementally + * It also supports iterating via pins_with_modified_criticalities() through the + * clustered netlist pins/connections which have had their criticality modified by + * the last call to update_criticalities(), which is useful for incrementally * re-calculating timing costs. 
* + * The criticalities of individual connections can then be queried by calling the + * criticality() member function. + * * Implementation * ============== - * To support incremental re-calculation the class saves the last criticality exponent - * passed to update_criticalites(). If the next update uses the same exponent criticalities - * can be incrementally updated. Otherwise they must be re-calculated from scratch, since - * a change in exponent changes *all* criticalities. + * To support incremental re-calculation, the class saves the last criticality exponent + * passed to PlacerCriticalities::update_criticalites(). If the next update uses the same + * exponent, criticalities can be incrementally updated. Otherwise, they must be re-calculated + * from scratch, since a change in exponent changes *all* criticalities. + * + * If the timing graph is updated while t_placer_timing_update_mode::update_criticalities is + * set to false, a re-calculation of *all* criticalities is required as well (since we don't + * know exactly which pins have changed after multiple timing updates have been performed). */ class PlacerCriticalities { public: //Types @@ -48,55 +87,79 @@ class PlacerCriticalities { PlacerCriticalities& operator=(const PlacerCriticalities& clb_nlist) = delete; public: //Accessors - //Returns the criticality of the specified connection + ///@brief Returns the criticality of the specified connection. float criticality(ClusterNetId net, int ipin) const { return timing_place_crit_[net][ipin]; } - //Returns the range of clustered netlist pins (i.e. ClusterPinIds) which were modified - //by the last call to update_criticalities() + /** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which + * were modified by the last call to PlacerCriticalities::update_criticalities(). 
+ */ pin_range pins_with_modified_criticality() const; public: //Modifiers - //Updates criticalities based on the atom netlist criticalitites provided by - //timing_info and the provided criticality_exponent. + /** + * @brief Updates criticalities based on the atom netlist criticalitites + * provided by timing_info and the provided criticality_exponent. + */ void update_criticalities(const SetupTimingInfo* timing_info, float criticality_exponent, bool recompute); - //Override the criticality of a particular connection + ///@brief Override the criticality of a particular connection. void set_criticality(ClusterNetId net, int ipin, float val); private: //Data + ///@brief The clb netlist in the placement context. const ClusteredNetlist& clb_nlist_; - const ClusteredPinAtomPinsLookup& pin_lookup_; - ClbNetPinsMatrix timing_place_crit_; /* [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ + ///@brief The lookup table that maps atom pins to clb pins. + const ClusteredPinAtomPinsLookup& pin_lookup_; - //The criticality exponent when update_criticalites() was last called (used to detect if incremental update can be used) + /** + * @brief The matrix that stores criticality value for each connection. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ + ClbNetPinsMatrix timing_place_crit_; + + /** + * The criticality exponent when update_criticalites() was last called + * (used to detect if incremental update can be used). + */ float last_crit_exponent_ = std::numeric_limits::quiet_NaN(); - //Set of pins with criticaltites modified by last call to update_criticalities() + ///@brief Set of pins with criticaltites modified by last call to update_criticalities(). vtr::vec_id_set cluster_pins_with_modified_criticality_; - //Updates criticalities: incremental V.S. from scratch + ///@brief Updates criticalities: incremental V.S. 
from scratch void incr_update_criticalities(const SetupTimingInfo* timing_info); void recompute_criticalities(); }; -/* Usage +/** + * @brief PlacerSetupSlacks returns the RAW setup slacks of clustered netlist connection. + * + * Usage * ===== - * PlacerSetupSlacks returns the clustered netlist connection setup slack used by - * the placer. This also serves to map atom netlist level slack (i.e. on AtomPinIds) - * to the clustered netlist (i.e. ClusterPinIds) used during placement. + * This also serves to map atom netlist level setup slacks (i.e. on AtomPinIds) to the + * clustered netlist (i.e. ClusterPinIds) used during placement. + * + * Setup slacks are calculated by calling update_setup_slacks_and_criticalities(), + * with t_placer_timing_update_mode::update_setup_slacks to true. It will update setup + * slacks based on the atom netlist connection setup slacks provided by the passed in + * SetupTimingInfo. * - * Setup slacks are calculated by calling update_setup_slacks(), which will - * update setup slacks based on the atom netlist connection setup slacks provided by - * the passed in SetupTimingInfo. This is done incrementally, based on the modified - * connections/AtomPinIds returned by SetupTimingInfo. + * This process can be done incrementally, based on the modified connections/AtomPinIds + * returned by SetupTimingInfo. But sometimes a recomputation is required. For detailed + * information please see the description of `t_placer_timing_update_mode` structure. * - * The setup slacks of individual connections can then be queried by calling the + * It also supports iterating via pins_with_modified_setup_slack() through the clustered + * netlist pins/connections which have had their setup slacks modified by the last call + * to update_setup_slacks(). + * + * The RAW setup slacks of individual connections can then be queried by calling the * setup_slack() member function. 
* - * It also supports iterating via pins_with_modified_setup_slack() through the - * clustered netlist pins/connections which have had their setup slacks modified by - * the last call to update_setup_slacks(). + * Note: RAW setup slacks are unlike criticalities. Their values are not confined between + * 0 and 1. Their values can be either positive or negative. */ class PlacerSetupSlacks { public: //Types @@ -112,40 +175,46 @@ class PlacerSetupSlacks { PlacerSetupSlacks& operator=(const PlacerSetupSlacks& clb_nlist) = delete; public: //Accessors - //Returns the setup slack of the specified connection + ///@brief Returns the setup slack of the specified connection. float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slacks_[net][ipin]; } - //Returns the range of clustered netlist pins (i.e. ClusterPinIds) which were modified - //by the last call to update_setup_slacks() + /** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) + * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). + */ pin_range pins_with_modified_setup_slack() const; public: //Modifiers - //Updates setup slacks based on the atom netlist setup slacks provided by timing_info + ///@brief Updates setup slacks based on the atom netlist setup slacks provided by timing_info. void update_setup_slacks(const SetupTimingInfo* timing_info, bool recompute); - //Override the setup slack of a particular connection + ///@brief Override the setup slack of a particular connection. void set_setup_slack(ClusterNetId net, int ipin, float val); private: //Data const ClusteredNetlist& clb_nlist_; const ClusteredPinAtomPinsLookup& pin_lookup_; - ClbNetPinsMatrix timing_place_setup_slacks_; /* [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ + /** + * @brief The matrix that stores raw setup slack values for each connection. 
+ * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ + ClbNetPinsMatrix timing_place_setup_slacks_; - //Set of pins with criticaltites modified by last call to update_criticalities() + ///@brief Set of pins with raw setup slacks modified by last call to update_setup_slacks() vtr::vec_id_set cluster_pins_with_modified_setup_slack_; - //Updates setup slacks: incremental V.S. from scratch + ///@brief Updates setup slacks: incremental V.S. from scratch. void incr_update_setup_slacks(const SetupTimingInfo* timing_info); void recompute_setup_slacks(); }; -/* Usage - * ===== - * PlacerTimingCosts mimics a 2D array of connection timing costs running from: - * [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] +/** + * @brief PlacerTimingCosts mimics a 2D array of connection timing costs running from: + * [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]. * - * So it can be used similar to: + * It can be used similar to: * * PlacerTimingCosts connection_timing_costs(cluster_ctx.clb_nlist); //Construct * @@ -156,53 +225,53 @@ class PlacerSetupSlacks { * * //Potentially other modifications... * - * //Calculate the updated timing cost, of all connections, incrementally based - * //on modifications + * //Calculate the updated timing cost, of all connections, + * //incrementally based on modifications * float total_timing_cost = connection_timing_costs.total_cost(); - * + * * However behind the scenes PlacerTimingCosts tracks when connection costs are modified, * and efficiently re-calculates the total timing cost incrementally based on the connections * which have had their cost modified. * - * Implementaion - * ============= - * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part + * Implementation + * ============== + * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part * of connection_costs_. 
To mimic 2d-array like access PlacerTimingCosts also uses two proxy * classes which allow indexing in the net and pin dimensions (NetProxy and ConnectionProxy * respectively). * * The first part of connection_costs_ stores intermediate sums of the connection costs for - * efficient incremental re-calculation. More concretely, connection_costs_ stores a binary + * efficient incremental re-calculation. More concretely, connection_costs_ stores a binary * tree, where leaves correspond to individual connection costs and intermediate nodes the - * partial sums of the connection costs. (The binary tree is stored implicitly in the - * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary + * partial sums of the connection costs. (The binary tree is stored implicitly in the + * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary * tree we calculate the total timing cost over all connections. * * Using a binary tree allows us to efficiently re-calculate the timing costs when only a subset * of connections are changed. This is done by 'invalidating' intermediate nodes (from leaves up - * to the root) which have ancestors (leaves) with modified connection costs. When the + * to the root) which have ancestors (leaves) with modified connection costs. When the * total_cost() method is called, it recursively walks the binary tree to re-calculate the cost. - * Only invalidated nodes are traversed, with valid nodes just returning their previously + * Only invalidated nodes are traversed, with valid nodes just returning their previously * calculated (and unchanged) value. * - * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can + * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can * be done in O(k log K) time. 
* - * It is important to note that due to limited floating point precision, floating point + * It is important to note that due to limited floating point precision, floating point * arithmetic has an order dependence (due to round-off). Using a binary tree to total * the timing connection costs allows us to incrementally update the total timign cost while - * maintianing the *same order of operations* as if it was re-computed from scratch. This + * maintianing the *same order of operations* as if it was re-computed from scratch. This * ensures we *always* get consistent results regardless of what/when connections are changed. * * Proxy Classes - * ------------- + * ============= * NetProxy is returned by PlacerTimingCost's operator[], and stores a pointer to the start of * internal storage of that net's connection costs. * - * ConnectionProxy is returnd by NetProxy's operator[], and holds a reference to a particular - * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy - * supports assignment, allowing clients to modify the connection cost. It also detects if the - * assigned value differs from the previous value and if so, calls PlacerTimingCosts's + * ConnectionProxy is returnd by NetProxy's operator[], and holds a reference to a particular + * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy + * supports assignment, allowing clients to modify the connection cost. It also detects if the + * assigned value differs from the previous value and if so, calls PlacerTimingCosts's * invalidate() method on that connection cost. 
* * PlacerTimingCosts's invalidate() method marks the cost element's ancestors as invalid (NaN) @@ -250,7 +319,9 @@ class PlacerTimingCosts { size_t num_level_before_leaves = num_nodes_in_level(ilevel - 1); VTR_ASSERT_MSG(num_leaves >= num_connections, "Need at least as many leaves as connections"); - VTR_ASSERT_MSG(num_connections == 0 || num_level_before_leaves < num_connections, "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); + VTR_ASSERT_MSG( + num_connections == 0 || num_level_before_leaves < num_connections, + "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); //We don't need to store all possible leaves if we have fewer connections //(i.e. bottom-right of tree is empty) @@ -270,16 +341,19 @@ class PlacerTimingCosts { } } - //Proxy class representing a connection cost - // Supports modification of connection cost while detecting changes and - // reporting them up to PlacerTimingCosts + /** + * @brief Proxy class representing a connection cost. + * + * Supports modification of connection cost while detecting + * changes and reporting them up to PlacerTimingCosts. + */ class ConnectionProxy { public: ConnectionProxy(PlacerTimingCosts* timing_costs, double& connection_cost) : timing_costs_(timing_costs) , connection_cost_(connection_cost) {} - //Allow clients to modify the connection cost via assignment + ///@brief Allow clients to modify the connection cost via assignment. ConnectionProxy& operator=(double new_cost) { if (new_cost != connection_cost_) { //If connection cost changed, update it, and mark it @@ -290,9 +364,11 @@ class PlacerTimingCosts { return *this; } - //Support getting the current connection cost as a double - // Useful for client code operating on the cost values (e.g. - // difference between costs) + /** + * @brief Support getting the current connection cost as a double. + * + * Useful for client code operating on the cost values (e.g. 
difference between costs). + */ operator double() { return connection_cost_; } @@ -302,15 +378,18 @@ class PlacerTimingCosts { double& connection_cost_; }; - //Proxy class representing the connection costs of a net - // Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection + /** + * @brief Proxy class representing the connection costs of a net. + * + * Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection. + */ class NetProxy { public: NetProxy(PlacerTimingCosts* timing_costs, double* net_sink_costs) : timing_costs_(timing_costs) , net_sink_costs_(net_sink_costs) {} - //Indexes into the specific net pin/connection + ///@brief Indexes into the specific net pin/connection. ConnectionProxy operator[](size_t ipin) { return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]); } @@ -320,7 +399,7 @@ class PlacerTimingCosts { double* net_sink_costs_; }; - //Indexes into the specific net + ///@brief Indexes into the specific net. NetProxy operator[](ClusterNetId net_id) { VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0); @@ -339,8 +418,10 @@ class PlacerTimingCosts { std::swap(num_levels_, other.num_levels_); } - //Calculates the total cost of all connections efficiently - //in the face of modified connection costs + /** + * @brief Calculates the total cost of all connections efficiently + * in the face of modified connection costs. + */ double total_cost() { float cost = total_cost_recurr(0); //Root @@ -351,7 +432,7 @@ class PlacerTimingCosts { } private: - //Recursively calculate and update the timing cost rooted at inode + ///@brief Recursively calculate and update the timing cost rooted at inode. double total_cost_recurr(size_t inode) { //Prune out-of-tree if (inode > connection_costs_.size() - 1) { @@ -386,12 +467,18 @@ class PlacerTimingCosts { return node_cost; } - friend ConnectionProxy; //So it can call invalidate() + ///@brief Friend-ed so it can call invalidate(). 
+ friend ConnectionProxy; void invalidate(double* invalidated_cost) { //Check pointer within range of internal storage - VTR_ASSERT_SAFE_MSG(invalidated_cost >= &connection_costs_[0], "Connection cost pointer should be after start of internal storage"); - VTR_ASSERT_SAFE_MSG(invalidated_cost <= &connection_costs_[connection_costs_.size() - 1], "Connection cost pointer should be before end of internal storage"); + VTR_ASSERT_SAFE_MSG( + invalidated_cost >= &connection_costs_[0], + "Connection cost pointer should be after start of internal storage"); + + VTR_ASSERT_SAFE_MSG( + invalidated_cost <= &connection_costs_[connection_costs_.size() - 1], + "Connection cost pointer should be before end of internal storage"); size_t icost = invalidated_cost - &connection_costs_[0]; @@ -400,7 +487,7 @@ class PlacerTimingCosts { //Invalidate parent intermediate costs up to root or first //already-invalidated parent size_t iparent = parent(icost); - ; + while (!std::isnan(connection_costs_[iparent])) { //Invalidate connection_costs_[iparent] = std::numeric_limits::quiet_NaN(); @@ -428,33 +515,41 @@ class PlacerTimingCosts { return (i - 1) / 2; } - //Returns the number of nodes in ilevel'th level - //If ilevel is negative, return 0, since the root shouldn't be counted - //as a leaf node candidate + /** + * @brief Returns the number of nodes in ilevel'th level. + * + * If ilevel is negative, return 0, since the root shouldn't + * be counted as a leaf node candidate. + */ size_t num_nodes_in_level(int ilevel) const { return ilevel < 0 ? 0 : (2 << (ilevel)); } - //Returns the total number of nodes in levels [0..ilevel] (inclusive) + ///@brief Returns the total number of nodes in levels [0..ilevel] (inclusive). size_t num_nodes_up_to_level(int ilevel) const { return (2 << (ilevel + 1)) - 1; } private: - //Vector storing the implicit binary tree of connection costs - // The actual connections are stored at the end of the vector - // (last level of the binary tree). 
The earlier portions of - // the tree are the intermediate nodes. - // - // The methods left_child()/right_child()/parent() can be used - // to traverse the tree by indicies into this vector + /** + * @brief Vector storing the implicit binary tree of connection costs. + * + * The actual connections are stored at the end of the vector + * (last level of the binary tree). The earlier portions of + * the tree are the intermediate nodes. + * + * The methods left_child()/right_child()/parent() can be used + * to traverse the tree by indicies into this vector. + */ std::vector connection_costs_; - //Vector storing the indicies of the first connection for - //each net in the netlist, used for indexing by net. + /** + * @brief Vector storing the indicies of the first connection + * for each net in the netlist, used for indexing by net. + */ vtr::vector net_start_indicies_; - //Number of levels in the binary tree + ///@brief Number of levels in the binary tree. size_t num_levels_ = 0; }; From 74d279c10c9fd00809171593e61fa1829ab957ca Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 26 Aug 2020 01:48:31 -0400 Subject: [PATCH 21/21] Added documentation for the timing driven routines used in try_swap() in place.cpp. --- vpr/src/place/place.cpp | 494 ++++++++++++++++------------ vpr/src/place/place_delay_model.cpp | 2 +- vpr/src/place/place_global.h | 3 + vpr/src/place/place_util.h | 2 +- 4 files changed, 297 insertions(+), 204 deletions(-) diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index e82753bef59..feb2c0e19fb 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -103,6 +103,8 @@ constexpr double INVALID_COST = std::numeric_limits::quiet_NaN(); * place_global.h. These variables were originally local to the current file. * * However, they were moved so as to facilitate moving some of the routines * * in the current file into other source files. 
* + * TODO: Create a single extern variable that allows access to all these data * + * structures so that these structures don't have to be declared as extern. * *******************************************************************************/ /** @@ -641,10 +643,12 @@ void try_place(const t_placer_opts& placer_opts, costs.cost = 1; } else { //placer_opts.place_algorithm == BOUNDING_BOX_PLACE + + //cost is the same as wirelength cost costs.bb_cost = comp_bb_cost(NORMAL); - costs.cost = costs.bb_cost; /// 0. && vtr::frand() < rlim_escape_fraction) { rlim = std::numeric_limits::infinity(); @@ -1331,7 +1345,7 @@ static e_move_result try_swap(const t_annealing_state* state, rlim = state->rlim; } - //Generate a new move (perturbation) used to explore the space of possible placements + /* Generate a new move (perturbation) used to explore the space of possible placements */ e_create_move create_move_outcome = move_generator.propose_move(blocks_affected, rlim); LOG_MOVE_STATS_PROPOSED(t, blocks_affected); @@ -1339,7 +1353,7 @@ static e_move_result try_swap(const t_annealing_state* state, e_move_result move_outcome = ABORTED; if (create_move_outcome == e_create_move::ABORT) { - //Proposed move is not legal -- give up on this move + /* Proposed move is not legal -- give up on this move */ clear_move_blocks(blocks_affected); LOG_MOVE_STATS_OUTCOME(std::numeric_limits::quiet_NaN(), @@ -1348,183 +1362,191 @@ static e_move_result try_swap(const t_annealing_state* state, "ABORTED", "illegal move"); move_outcome = ABORTED; - } else { - VTR_ASSERT(create_move_outcome == e_create_move::VALID); - /* - * To make evaluating the move simpler (e.g. calculating changed bounding box), - * we first move the blocks to thier new locations (apply the move to - * place_ctx.block_locs) and then computed the change in cost. If the move is - * accepted, the inverse look-up in place_ctx.grid_blocks is updated (committing - * the move). 
If the move is rejected the blocks are returned to their original - * positions (reverting place_ctx.block_locs to its original state). - * - * Note that the inverse look-up place_ctx.grid_blocks is only updated - * after move acceptance is determined, and so should not be used when - * evaluating a move. - */ + return move_outcome; + } - //Update the block positions - apply_move_blocks(blocks_affected); - - //Find all the nets affected by this swap and update their costs - //This routine calculates new connection delays and timing costs - //and store them in proposed_* data structures - //This routine also calculates the wiring cost, which doesn't - //depend on the timing driven data - int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, - delay_model, - criticalities, - blocks_affected, - bb_delta_c, - timing_delta_c); - - //Find all the sink pins with changed connection delays from the affected blocks. - //These sink pins will be passed into the pin_timing_invalidator for timing update. - //They will also be added to the pin invalidator when we wish to revert a timing update. - std::vector sink_pins_affected; - find_affected_sink_pins(blocks_affected, sink_pins_affected); + /* Move is valid. Proceed to analyze cost. */ + VTR_ASSERT(create_move_outcome == e_create_move::VALID); - if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { - //Invalidates timing of modified connections for incremental timing updates - //This routine relies on comparing proposed_connection_delay and connection_delay - invalidate_affected_connection_delays(sink_pins_affected, - pin_timing_invalidator, - timing_info); + /* + * To make evaluating the move simpler (e.g. calculating changed bounding box), + * we first move the blocks to thier new locations (apply the move to + * place_ctx.block_locs) and then computed the change in cost. If the move is + * accepted, the inverse look-up in place_ctx.grid_blocks is updated (committing + * the move). 
If the move is rejected the blocks are returned to their original + * positions (reverting place_ctx.block_locs to its original state). + * + * Note that the inverse look-up place_ctx.grid_blocks is only updated + * after move acceptance is determined, and so should not be used when + * evaluating a move. + */ - //Update the connection_timing_cost and connection_delay - //values from the temporary values. - commit_td_cost(blocks_affected); + //Update the block positions + apply_move_blocks(blocks_affected); + + //Find all the nets affected by this swap and update the wiring costs. + //This cost value doesn't depend on the timing info. + //Also find all the pins affected by the swap, and calculates new connection + //delays and timing costs and store them in proposed_* data structures. + int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, + delay_model, + criticalities, + blocks_affected, + bb_delta_c, + timing_delta_c); + + //Find all the sink pins with changed connection delays from the affected blocks. + //These sink pins will be passed into the pin_timing_invalidator for timing update. + //They will also be added to the pin invalidator when we wish to revert a timing update. + std::vector sink_pins_affected; + find_affected_sink_pins(blocks_affected, sink_pins_affected); + + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + //Invalidates timing of modified connections for incremental timing updates. + invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); - //Update timing information. Since we are analyzing setup slacks, - //we only update those values and keep the criticalities stale - //so as not to interfere with the original timing driven algorithm. - // - //Note: the timing info must be called after applying block moves - //and committing the timing driven delays and costs. 
- //If we wish to revert this timing update due to move rejection, - //we need to revert block moves and restore the timing values. - timing_update_mode->update_criticalities = false; - timing_update_mode->update_setup_slacks = true; - update_setup_slacks_and_criticalities(state->crit_exponent, - delay_model, - criticalities, - setup_slacks, - pin_timing_invalidator, - timing_info, - timing_update_mode, - costs); + //Update the connection_timing_cost and connection_delay + //values from the temporary values. + //This step is necessary for performing timing update. + commit_td_cost(blocks_affected); - /* Get the setup slack analysis cost */ - //TODO: calculate a weighted average of the slack cost and wiring cost - delta_c = analyze_setup_slack_cost(setup_slacks); + //Update timing information. Since we are analyzing setup slacks, + //we only update those values and keep the criticalities stale + //so as not to interfere with the original timing cost algorithm. + // + //Note: the timing info must be called after applying block moves + //and committing the timing driven delays and costs. + //If we wish to revert this timing update due to move rejection, + //we need to first revert block moves and restore timing values. + timing_update_mode->update_criticalities = false; + timing_update_mode->update_setup_slacks = true; + update_setup_slacks_and_criticalities(state->crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); - } else if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - /*in this case we redefine delta_c as a combination of timing and bb. 
* - *additionally, we normalize all values, therefore delta_c is in * - *relation to 1*/ + /* Get the setup slack analysis cost */ + //TODO: calculate a weighted average of the slack cost and wiring cost + delta_c = analyze_setup_slack_cost(setup_slacks); - delta_c = (1 - timing_tradeoff) * bb_delta_c * costs->bb_cost_norm - + timing_tradeoff * timing_delta_c * costs->timing_cost_norm; + } else if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + /*in this case we redefine delta_c as a combination of timing and bb. * + *additionally, we normalize all values, therefore delta_c is in * + *relation to 1*/ - } else { //place_algorithm == BOUNDING_BOX_PLACE (wiring cost) - delta_c = bb_delta_c; - } + delta_c = (1 - timing_tradeoff) * bb_delta_c * costs->bb_cost_norm + + timing_tradeoff * timing_delta_c * costs->timing_cost_norm; - /* 1 -> move accepted, 0 -> rejected. */ - move_outcome = assess_swap(delta_c, state->t); + } else { //place_algorithm == BOUNDING_BOX_PLACE (wiring cost) + delta_c = bb_delta_c; + } - if (move_outcome == ACCEPTED) { - costs->cost += delta_c; - costs->bb_cost += bb_delta_c; + /* 1 -> move accepted, 0 -> rejected. 
*/ + move_outcome = assess_swap(delta_c, state->t); - if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { - /* Update the timing driven cost as usual */ - costs->timing_cost += timing_delta_c; + if (move_outcome == ACCEPTED) { + costs->cost += delta_c; + costs->bb_cost += bb_delta_c; - //Commit the setup slack information - //The timing delay and cost values should be committed already - commit_setup_slacks(setup_slacks); - } + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + /* Update the timing driven cost as usual */ + costs->timing_cost += timing_delta_c; - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - costs->timing_cost += timing_delta_c; + //Commit the setup slack information + //The timing delay and cost values should be committed already + commit_setup_slacks(setup_slacks); + } - //Invalidates timing of modified connections for incremental timing updates - //This routine relies on comparing proposed_connection_delay and connection_delay - //If the setup slack analysis was not performed, the - //sink pins are yet to be invalidated. - invalidate_affected_connection_delays(sink_pins_affected, - pin_timing_invalidator, - timing_info); + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + costs->timing_cost += timing_delta_c; - //update the connection_timing_cost and connection_delay - //values from the temporary values - commit_td_cost(blocks_affected); - } + //Invalidates timing of modified connections for incremental timing + //updates. This routine relies on comparing proposed_connection_delay + //and connection_delay. If the setup slack analysis was not performed, + //the sink pins are yet to be invalidated. + invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); - /* update net cost functions and reset flags. 
*/ - update_move_nets(num_nets_affected); + //Update the connection_timing_cost and connection_delay + //values from the temporary values + commit_td_cost(blocks_affected); + } - /* Update clb data structures since we kept the move. */ - commit_move_blocks(blocks_affected); + /* Update net cost functions and reset flags. */ + update_move_nets(num_nets_affected); - } else { //move_outcome == REJECTED + /* Update clb data structures since we kept the move. */ + commit_move_blocks(blocks_affected); - /* Reset the net cost function flags first. */ - reset_move_nets(num_nets_affected); + } else { //move_outcome == REJECTED - /* Restore the place_ctx.block_locs data structures to their state before the move. */ - revert_move_blocks(blocks_affected); + /* Reset the net cost function flags first. */ + reset_move_nets(num_nets_affected); - if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { - //Revert the timing delays and costs to pre-update values - //These routines must be called after reverting the block moves - //TODO: make this process incremental - comp_td_connection_delays(delay_model); - comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + /* Restore the place_ctx.block_locs data structures to their state before the move. */ + revert_move_blocks(blocks_affected); - //Re-invalidate the affected sink pins - invalidate_affected_connection_delays(sink_pins_affected, - pin_timing_invalidator, - timing_info); + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + //Revert the timing delays and costs to pre-update values. + //These routines must be called after reverting the block moves + //if we wish to perform a reversion of the previous timing update. + // + //TODO: make this process incremental. Currently, all the delays + //are recomputed before all the timing costs are recomputed. 
+ comp_td_connection_delays(delay_model); + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); - /* Revert the timing update */ - update_setup_slacks_and_criticalities(state->crit_exponent, - delay_model, - criticalities, - setup_slacks, - pin_timing_invalidator, - timing_info, - timing_update_mode, - costs); + /* Re-invalidate the affected sink pins */ + invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); - VTR_ASSERT_SAFE_MSG( - verify_connection_setup_slacks(setup_slacks), - "The current setup slacks should be identical to the values before the try swap timing info update."); - } + /* Revert the timing update */ + update_setup_slacks_and_criticalities(state->crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - /* Unstage the values stored in proposed_* data structures */ - revert_td_cost(blocks_affected); - } + /* Check the consistency of the setup slack values */ + VTR_ASSERT_SAFE_MSG( + verify_connection_setup_slacks(setup_slacks), + "The current setup slacks should be identical to the values before the try swap timing info update."); + } + + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + /* Discard the values stored in proposed_* data structures */ + revert_td_cost(blocks_affected); } + } - move_outcome_stats.delta_cost_norm = delta_c; - move_outcome_stats.delta_bb_cost_norm = bb_delta_c * costs->bb_cost_norm; - move_outcome_stats.delta_timing_cost_norm = timing_delta_c * costs->timing_cost_norm; + /* Record the costs in the move outcome stats */ + move_outcome_stats.delta_cost_norm = delta_c; + move_outcome_stats.delta_bb_cost_norm = bb_delta_c * costs->bb_cost_norm; + move_outcome_stats.delta_timing_cost_norm = timing_delta_c * costs->timing_cost_norm; - move_outcome_stats.delta_bb_cost_abs = bb_delta_c; - move_outcome_stats.delta_timing_cost_abs = 
timing_delta_c; + move_outcome_stats.delta_bb_cost_abs = bb_delta_c; + move_outcome_stats.delta_timing_cost_abs = timing_delta_c; - LOG_MOVE_STATS_OUTCOME(delta_c, bb_delta_c, timing_delta_c, - (move_outcome ? "ACCEPTED" : "REJECTED"), ""); - } + LOG_MOVE_STATS_OUTCOME(delta_c, bb_delta_c, timing_delta_c, + (move_outcome ? "ACCEPTED" : "REJECTED"), ""); move_outcome_stats.outcome = move_outcome; move_generator.process_outcome(move_outcome_stats); + /* Clear the data structure containing block move info */ clear_move_blocks(blocks_affected); //VTR_ASSERT(check_macro_placement_consistency() == 0); @@ -1536,10 +1558,22 @@ static e_move_result try_swap(const t_annealing_state* state, return move_outcome; } -//Puts all the nets changed by the current swap into nets_to_update, -//and updates their bounding box. -// -//Returns the number of affected nets. +/** + * @brief Find all the nets and pins affected by this swap and update costs. + * + * Find all the nets affected by this swap and update the bounding box (wiring) + * costs. This cost function doesn't depend on the timing info. + * + * Find all the pins affected by this swap and update the timing cost. + * The timing costs are calculated by getting the new connection delays, multiplied + * by the connection criticalities returned by the timing analyzer. + * These timing costs are stored in the proposed_* data structures. + * + * The change in the bounding box cost is stored in `bb_delta_c`. + * The change in the timing cost is stored in `timing_delta_c`. + * + * @return The number of affected nets. + */ static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, @@ -1593,6 +1627,7 @@ static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm return num_affected_nets; } +///@brief Stores all the nets affected by the block moves (avoid duplicates).
static void record_affected_net(const ClusterNetId net, int& num_affected_nets) { //Record effected nets if (proposed_net_cost[net] < 0.) { @@ -1605,6 +1640,7 @@ static void record_affected_net(const ClusterNetId net, int& num_affected_nets) } } +///@brief Update the net bounding box. static void update_net_bb(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected, int iblk, @@ -1636,6 +1672,16 @@ static void update_net_bb(const ClusterNetId net, } } +/** + * @brief Get the proposed timing delay and cost based on the current block moves. + * + * Only considers the sink pins on the moved blocks, and the sink pins of the nets + * driven by the driver pins on the moved blocks. + * Add all these pins into blocks_affected.affected_pins so that we don't have to + * go through the moved blocks and gather them again in other routines. + * + * Also calculates the change in the timing cost by the proposed block moves. + */ static void update_td_delta_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& criticalities, const ClusterNetId net, @@ -1681,19 +1727,23 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, } } +/** + * @brief Find all the sink pins with changed connection delays from the affected blocks. + * + * These sink pins will be passed into the pin_timing_invalidator for timing update. + * They will also be added to the pin invalidator when we wish to revert a timing update. + * + * It is possible that some connections may not have changed delay. For instance, if + * using a dx/dy delay model, this could occur if a sink moved to a new position with + * the same dx/dy from its driver. To minimize work during the incremental STA update + * we do not invalidate such unchanged connections.
+ */ static void find_affected_sink_pins(const t_pl_blocks_to_be_moved& blocks_affected, std::vector& sink_pins_affected) { auto& cluster_ctx = g_vpr_ctx.clustering(); auto& clb_nlist = cluster_ctx.clb_nlist; for (ClusterPinId clb_pin : blocks_affected.affected_pins) { - //It is possible that some connections may not have changed delay.(e.g. - //For instance, if using a dx/dy delay model, this could occur if a sink - //moved to a new position with the same dx/dy from it's driver. - // - //To minimize work during the incremental STA update we do not invalidate - //such unchanged connections. - ClusterNetId net = clb_nlist.pin_net(clb_pin); int ipin = clb_nlist.pin_net_index(clb_pin); @@ -1704,6 +1754,24 @@ static void find_affected_sink_pins(const t_pl_blocks_to_be_moved& blocks_affect } } +/** + * @brief Check if the setup slack has gotten better or worse due to block swap. + * + * Get all the modified slack values via the PlacerSetupSlacks class, and compare + * them with the original values at these connections. Sort them and compare them + * one by one, and return the difference of the first different pair. + * + * If the new slack value is larger (better), then return a negative value so that + * the move will be accepted. If the new slack value is smaller (worse), return a + * positive value so that the move will be rejected. + * + * If no slack values have changed, then return an arbitrary positive number. A + * move resulting in no change in the slack values should probably be unnecessary. + * + * The sorting is needed to guard against the unlikely circumstance that a bad slack + * value suddenly got very good due to the block move, while a good slack value + * got very bad, perhaps even worse than the original worst slack value.
+ */ static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks) { const auto& cluster_ctx = g_vpr_ctx.clustering(); const auto& clb_nlist = cluster_ctx.clb_nlist; @@ -1739,16 +1807,24 @@ static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks) { return 1; } +/** + * @brief Decide whether to accept a move based on the probability + * calculated from the current annealing temperature. + * + * Returns: 1 -> move accepted, 0 -> rejected. + */ static e_move_result assess_swap(double delta_c, double t) { - /* Returns: 1 -> move accepted, 0 -> rejected. */ + /* A non-positive cost will always be accepted */ if (delta_c <= 0) { return ACCEPTED; } + /* If temperature is 0 and the cost is positive, guaranteed rejection */ if (t == 0.) { return REJECTED; } + /* Calculate the probability using temp and decide */ float fnum = vtr::frand(); float prob_fac = std::exp(-delta_c / t); if (prob_fac > fnum) { @@ -1758,27 +1834,31 @@ static e_move_result assess_swap(double delta_c, double t) { return REJECTED; } +/** + * @brief Recomputes the wiring cost to eliminate round-off that may have accrued. + * + * This process assumes that all the net costs have been updated. + */ static double recompute_bb_cost() { - /* Recomputes the cost to eliminate roundoff that may have accrued. * - * This routine does as little work as possible to compute this new * - * cost. */ - - double cost = 0; - auto& cluster_ctx = g_vpr_ctx.clustering(); + double cost = 0; for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* for each net ... */ if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { /* Do only if not ignored. */ /* Bounding boxes don't have to be recomputed; they're correct. */ cost += net_cost[net_id]; } } - - return (cost); + return cost; } -/* Update the connection_timing_cost values from the temporary * - * values for all connections that have changed.
+/** + * @brief Update the connection_timing_cost values from the temporary + * values for all connections that have/haven't changed. + * + * All the connections have already been gathered by blocks_affected.affected_pins + * after running the routine find_affected_nets_and_update_costs(). + */ static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { auto& cluster_ctx = g_vpr_ctx.clustering(); auto& clb_nlist = cluster_ctx.clb_nlist; @@ -1796,8 +1876,10 @@ static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { } } -//Reverts modifications to proposed_connection_delay and proposed_connection_timing_cost based on -//the move proposed in blocks_affected +/** + * @brief Reverts modifications to proposed_connection_delay and proposed_connection_timing_cost + * based on the move proposed in blocks_affected. + */ static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { #ifndef VTR_ASSERT_SAFE_ENABLED static_cast(blocks_affected); @@ -1816,10 +1898,15 @@ static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { #endif } -//Invalidates the delays of connections effected by the specified move -// -//Relies on proposed_connection_delay and connection_delay to detect -//which connections have actually had their delay changed. +/** + * @brief Invalidates the delays of connections affected by the specified move. + * + * Relies on find_affected_sink_pins() to find all the connections with different + * `proposed_connection_delay` and `connection_delay`. + * + * Invalidate all the timing graph edges associated with these sink pins via the + * ClusteredPinTimingInvalidator class.
+ */ static void invalidate_affected_connection_delays(const std::vector& sink_pins_affected, ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info) { @@ -1828,17 +1915,11 @@ static void invalidate_affected_connection_delays(const std::vectorinvalidate_connection(clb_pin, timing_info); } } -//Returns true if 'net' is driven by one of the blocks in 'blocks_affected' +///@brief Returns true if 'net' is driven by one of the blocks in 'blocks_affected'. static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected) { auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -1851,14 +1932,23 @@ static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_b return false; } -/* Finds the cost from scratch. Done only when the placement * - * has been radically changed (i.e. after initial placement). * - * Otherwise find the cost change incrementally. If method * - * check is NORMAL, we find bounding boxes that are updateable * - * for the larger nets. If method is CHECK, all bounding boxes * - * are found via the non_updateable_bb routine, to provide a * - * cost which can be used to check the correctness of the * - * other routine. */ +/** + * @brief Find the wiring cost. + * + * Find the wiring cost from scratch only when the placement has + * been radically changed (i.e. after the initial placement). + * Otherwise, find the cost change incrementally. + * + * @param method + * + * NORMAL If the method check is NORMAL, we find bounding + * boxes that are updateable for the larger nets. + * + * CHECK If the method check is CHECK, all bounding boxes + * are found via the non_updateable_bb routine to + * provide a cost which can be used to check the + * correctness of the other routine.
+ */ static double comp_bb_cost(e_cost_methods method) { double cost = 0; double expected_wirelength = 0.0; diff --git a/vpr/src/place/place_delay_model.cpp b/vpr/src/place/place_delay_model.cpp index 31486293231..e8a58db6704 100644 --- a/vpr/src/place/place_delay_model.cpp +++ b/vpr/src/place/place_delay_model.cpp @@ -374,4 +374,4 @@ void comp_td_connection_delays(const PlaceDelayModel* delay_model) { connection_delay[net_id][ipin] = comp_td_single_connection_delay(delay_model, net_id, ipin); } } -} \ No newline at end of file +} diff --git a/vpr/src/place/place_global.h b/vpr/src/place/place_global.h index 8ab36b9d1f5..fd1cc2d9a6b 100644 --- a/vpr/src/place/place_global.h +++ b/vpr/src/place/place_global.h @@ -13,6 +13,9 @@ * * For detailed descriptions on what each data structure stores, please see * place.cpp, where these variables are defined. + * + * TODO: Create a single extern variable that allows access to all these data + * structures so that these structures don't have to be declared as extern. */ #pragma once diff --git a/vpr/src/place/place_util.h b/vpr/src/place/place_util.h index 04d94f2f38d..399684ae03a 100644 --- a/vpr/src/place/place_util.h +++ b/vpr/src/place/place_util.h @@ -115,4 +115,4 @@ class t_annealing_state { public: //Accessor float final_rlim() const { return FINAL_RLIM; } -}; \ No newline at end of file +};