diff --git a/libs/libvtrutil/src/vtr_vec_id_set.h b/libs/libvtrutil/src/vtr_vec_id_set.h index 9e0a1f0802e..ed6620b1cdd 100644 --- a/libs/libvtrutil/src/vtr_vec_id_set.h +++ b/libs/libvtrutil/src/vtr_vec_id_set.h @@ -2,6 +2,7 @@ #define VTR_SET_H #include +#include namespace vtr { diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index f49d08bf6c9..e3948d6f88a 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -572,6 +572,8 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts) PlacerOpts->effort_scaling = Options.place_effort_scaling; PlacerOpts->timing_update_type = Options.timing_update_type; + + PlacerOpts->place_quench_metric = Options.place_quench_metric; } static void SetupAnalysisOpts(const t_options& Options, t_analysis_opts& analysis_opts) { diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index 64d607f7a24..53760cc9b68 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -1025,6 +1025,41 @@ struct ParseTimingUpdateType { } }; +struct ParsePlaceQuenchMetric { + ConvertedValue from_str(std::string str) { + ConvertedValue conv_value; + if (str == "auto") + conv_value.set_value(e_place_quench_metric::AUTO); + else if (str == "timing_cost") + conv_value.set_value(e_place_quench_metric::TIMING_COST); + else if (str == "setup_slack") + conv_value.set_value(e_place_quench_metric::SETUP_SLACK); + else { + std::stringstream msg; + msg << "Invalid conversion from '" << str << "' to e_place_quench_metric (expected one of: " << argparse::join(default_choices(), ", ") << ")"; + conv_value.set_error(msg.str()); + } + return conv_value; + } + + ConvertedValue to_str(e_place_quench_metric val) { + ConvertedValue conv_value; + if (val == e_place_quench_metric::AUTO) + conv_value.set_value("auto"); + else if (val == e_place_quench_metric::TIMING_COST) + conv_value.set_value("timing_cost"); + else { + VTR_ASSERT(val == e_place_quench_metric::SETUP_SLACK); + conv_value.set_value("setup_slack"); + } + return conv_value; + } + + std::vector default_choices() { + return {"auto", "timing_cost", "setup_slack"}; + } +}; + argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& args) { std::string description = "Implements the specified circuit onto the target FPGA architecture" @@ -1814,6 +1849,17 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .default_value("") .show_in(argparse::ShowIn::HELP_ONLY); + place_timing_grp.add_argument(args.place_quench_metric, "--place_quench_metric") + .help( + "Controls which cost function the placer uses during the quench stage:\n" + " * auto: VPR decides\n" + " * timing_cost: The same cost formulation as the one used during\n" + " the annealing stage (more stable)\n" + " * setup_slack: Directly uses setup slacks (in combination with wiring)\n" + " to check if the block moves should be accepted\n") + .default_value("auto") + .show_in(argparse::ShowIn::HELP_ONLY); + auto& route_grp = parser.add_argument_group("routing options"); route_grp.add_argument(args.max_router_iterations, "--max_router_iterations") diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h index 55d6b46b532..9e25e81f528 100644 --- a/vpr/src/base/read_options.h +++ b/vpr/src/base/read_options.h @@ -128,6 +128,7 @@ struct t_options { argparse::ArgValue place_delay_model; argparse::ArgValue place_delay_model_reducer; argparse::ArgValue allowed_tiles_for_delay_model; + argparse::ArgValue place_quench_metric; /* Router 
Options */ argparse::ArgValue check_rr_graph; diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 0dffc51af41..97df4413bcf 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -851,7 +851,8 @@ struct t_annealing_sched { * doPlacement: true if placement is supposed to be done in the CAD flow, false otherwise */ enum e_place_algorithm { BOUNDING_BOX_PLACE, - PATH_TIMING_DRIVEN_PLACE + PATH_TIMING_DRIVEN_PLACE, + SETUP_SLACK_ANALYSIS_PLACE }; enum e_pad_loc_type { @@ -889,6 +890,12 @@ enum class e_place_delta_delay_algorithm { DIJKSTRA_EXPANSION, }; +enum class e_place_quench_metric { + TIMING_COST, + SETUP_SLACK, + AUTO +}; + struct t_placer_opts { enum e_place_algorithm place_algorithm; float timing_tradeoff; @@ -935,6 +942,7 @@ struct t_placer_opts { std::string allowed_tiles_for_delay_model; e_place_delta_delay_algorithm place_delta_delay_matrix_calculation_method; + e_place_quench_metric place_quench_metric; }; /* All the parameters controlling the router's operation are in this * diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 07d29f914fd..321a9a2556c 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -1,3 +1,8 @@ +/** + * @file place.cpp + * @brief This is a core file that defines the major placer routines used by VPR. + */ + #include #include #include @@ -44,6 +49,9 @@ #include "tatum/echo_writer.hpp" #include "tatum/TimingReporter.hpp" +#include "place_global.h" +#include "place_timing_update.h" + using std::max; using std::min; @@ -58,10 +66,6 @@ using std::min; * cost computation. 0.01 means that there is a 1% error tolerance. */ #define ERROR_TOL .01 -/* The final rlim (range limit) is 1, which is the smallest value that can * - * still make progress, since an rlim of 0 wouldn't allow any swaps. */ -#define FINAL_RLIM 1 - /* This defines the maximum number of swap attempts before invoking the * * once-in-a-while placement legality check as well as floating point * * variables round-offs check. */ @@ -91,110 +95,130 @@ struct t_placer_statistics { sum_of_squares; int success_sum; }; +constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); +constexpr double INVALID_COST = std::numeric_limits::quiet_NaN(); + +/******************************************************************************* + * Below is a list of definitions of data structures declared as `extern` in * + * place_global.h. These variables were originally local to the current file. * + * However, they were moved so as to facilitate moving some of the routines * + * in the current file into other source files. * + * TODO: Create a single extern variable that allows access to all these data * + * structures so that these structures don't have to be declared as extern. * + *******************************************************************************/ + +/** + * @brief Cost of a net, and a temporary cost of a net used during move assessment. + * + * Index range: [0...cluster_ctx.clb_nlist.nets().size()-1] + */ +vtr::vector net_cost, proposed_net_cost; -struct t_placer_costs { - //Although we do nost cost calculations with float's we - //use doubles for the accumulated costs to avoid round-off, - //particularly on large designs where the magnitude of a single - //move's delta cost is small compared to the overall cost. 
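The removed comment above explains why the accumulated placement costs are kept as doubles even though each individual move's delta cost is computed in single precision. A tiny stand-alone illustration of the round-off this guards against (the magnitudes are invented for the example and are not taken from VPR):

#include <cstdio>

int main() {
    float  cost_f = 1.0e8f;   // large running cost, accumulated in float
    double cost_d = 1.0e8;    // same cost, accumulated in double
    const float delta = 0.5f; // one move's delta cost, tiny by comparison

    for (int i = 0; i < 1000000; ++i) {
        cost_f += delta; // 0.5 is below float's resolution at 1e8, so it is lost
        cost_d += delta; // double retains enough precision for the running sum
    }

    std::printf("float  total: %.1f\n", cost_f); // stays at 100000000.0
    std::printf("double total: %.1f\n", cost_d); // 100500000.0, as expected
    return 0;
}

This is the kind of drift that the periodic recompute-from-scratch check (with its ERROR_TOL tolerance) exists to catch.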
- double cost; - double bb_cost; - double timing_cost; -}; - -struct t_placer_prev_inverse_costs { - double bb_cost; - double timing_cost; -}; - -// Used by update_annealing_state() -struct t_annealing_state { - float t; // Temperature - float rlim; // Range limit for swaps - float inverse_delta_rlim; // used to calculate crit_exponent - float alpha; // Temperature decays by this factor each outer iteration - float restart_t; // Temperature used after restart due to minimum success ratio - float crit_exponent; // Used by timing-driven placement to "sharpen" timing criticality - int move_lim_max; // Maximum move limit - int move_lim; // Current move limit -}; +/** + * @brief A flag array to indicate whether the specific bounding box has + * been updated in this particular swap or not. + * + * If it has been updated before, the code must use the updated data, instead of + * the out-of-date data passed into the subroutine, particularly used in try_swap(). + * + * NOT_UPDATED_YET Indicates that the net has not been updated before. + * UPDATED_ONCE Indicates that the net has been updated once, if it is going to be + * updated again, the values from the previous update must be used. + * GOT_FROM_SCRATCH Only applicable for nets larger than SMALL_NETS. It indicates that + * the particular bounding box cannot be updated incrementally before, + * hence the bounding box is got from scratch, so the bounding box + * would definitely be right, DO NOT update again. + * + * Index range: [0...cluster_ctx.clb_nlist.nets().size()-1] + */ +vtr::vector bb_updated_before; -constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); +/** + * @brief Net connection delays. + * + * @param connection_delay + * Delays based on the committed block positions. + * @param proposed_connection_delay + * Delays based on the proposed block positions. Only for connections + * affected by the proposed move. Otherwise, INVALID_DELAY. + * + * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ +ClbNetPinsMatrix connection_delay, proposed_connection_delay; -constexpr double MAX_INV_TIMING_COST = 1.e9; -/* Stops inverse timing cost from going to infinity with very lax timing constraints, - * which avoids multiplying by a gigantic prev_inverse.timing_cost when auto-normalizing. - * The exact value of this cost has relatively little impact, but should not be - * large enough to be on the order of timing costs for normal constraints. */ - -/********************** Variables local to place.c ***************************/ - -/* Cost of a net, and a temporary cost of a net used during move assessment. */ -static vtr::vector net_cost, proposed_net_cost; - -/* [0...cluster_ctx.clb_nlist.nets().size()-1] * - * A flag array to indicate whether the specific bounding box has been updated * - * in this particular swap or not. If it has been updated before, the code * - * must use the updated data, instead of the out-of-date data passed into the * - * subroutine, particularly used in try_swap(). The value NOT_UPDATED_YET * - * indicates that the net has not been updated before, UPDATED_ONCE indicated * - * that the net has been updated once, if it is going to be updated again, the * - * values from the previous update must be used. 
GOT_FROM_SCRATCH is only * - * applicable for nets larger than SMALL_NETS and it indicates that the * - * particular bounding box cannot be updated incrementally before, hence the * - * bounding box is got from scratch, so the bounding box would definitely be * - * right, DO NOT update again. */ -static vtr::vector bb_updated_before; - -/* - * Net connection delays based on the placement. +/** + * @brief Net connection setup slacks based on most recently updated timing graph. + * + * Updated with commit_setup_slacks() routine. + * * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ -static ClbNetPinsMatrix connection_delay; //Delays based on commited block positions -static ClbNetPinsMatrix proposed_connection_delay; //Delays for proposed block positions (only - // for connections effected by move, otherwise - // INVALID_DELAY) +ClbNetPinsMatrix connection_setup_slack; -/* - * Timing cost of connections (i.e. criticality * delay). +/** + * @brief Net connection timing costs (i.e. criticality * delay). + * + * @param connection_timing_cost + * Costs of committed block positions. See PlacerTimingCosts. + * @param proposed_connection_timing_cost + * Costs for proposed block positions. Only for connection + * affected by the proposed move. Otherwise, INVALID_DELAY + * * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ -static PlacerTimingCosts connection_timing_cost; //Costs of commited block positions -static ClbNetPinsMatrix proposed_connection_timing_cost; //Costs for proposed block positions - // (only for connectsion effected by - // move, otherwise INVALID_DELAY) - -/* - * Timing cost of nets (i.e. sum of criticality * delay for each net sink/connection). - * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1] +PlacerTimingCosts connection_timing_cost; +ClbNetPinsMatrix proposed_connection_timing_cost; + +/** + * @brief Timing cost of nets (i.e. sum of criticality * delay for each net sink/connection). + * + * Like connection_timing_cost, but summed across net pins. Used to allow more + * efficient recalculation of timing cost if only a sub-set of nets are changed + * while maintaining numeric stability. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1] */ -static vtr::vector net_timing_cost; //Like connection_timing_cost, but summed - // accross net pins. Used to allow more - // efficient recalculation of timing cost - // if only a sub-set of nets are changed - // while maintaining numeric stability. - -/* [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the bounding box coordinates and the number of * - * blocks on each of a net's bounding box (to allow efficient updates), * - * respectively. */ - -static vtr::vector bb_coords, bb_num_on_edges; - -/* The arrays below are used to precompute the inverse of the average * - * number of tracks per channel between [subhigh] and [sublow]. Access * - * them as chan?_place_cost_fac[subhigh][sublow]. They are used to * - * speed up the computation of the cost function that takes the length * - * of the net bounding box in each dimension, divided by the average * - * number of tracks in that direction; for other cost functions they * - * will never be used. * +vtr::vector net_timing_cost; + +/** + * @brief Store the bounding box coordinates and the number of blocks on each + * of a net's bounding box (to allow efficient updates) respectively. 
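bb_num_on_edges is what makes mostly-constant-time bounding box maintenance possible: a net's box only needs to be recomputed from scratch when the block that moved was the last one sitting on a shrinking edge. A simplified, single-axis sketch of that bookkeeping (illustrative only; the placer's real t_bb handling covers more cases):

#include <algorithm>
#include <vector>

// One axis of a net's bounding box, plus how many blocks sit on each edge.
struct AxisBB {
    int lo = 0, hi = 0;     // bounding interval of the net's pins on this axis
    int n_lo = 0, n_hi = 0; // number of blocks exactly on the lo / hi edge
};

// O(#pins) rebuild: the fallback when an edge count drops to zero.
AxisBB compute_from_scratch(const std::vector<int>& coords) {
    AxisBB bb;
    bb.lo = *std::min_element(coords.begin(), coords.end());
    bb.hi = *std::max_element(coords.begin(), coords.end());
    for (int x : coords) {
        if (x == bb.lo) ++bb.n_lo;
        if (x == bb.hi) ++bb.n_hi;
    }
    return bb;
}

// O(1) update when a single block on the net moves from x_old to x_new.
// Returns false when the box must be rebuilt from scratch, i.e. the mover
// was the only block on an edge that is now shrinking.
bool update_incrementally(AxisBB& bb, int x_old, int x_new) {
    if (x_old == x_new) return true;

    if (x_old == bb.lo && x_new > bb.lo && --bb.n_lo == 0) return false;
    if (x_old == bb.hi && x_new < bb.hi && --bb.n_hi == 0) return false;

    if (x_new < bb.lo) {
        bb.lo = x_new; // box grows: the mover defines the new edge
        bb.n_lo = 1;
    } else if (x_new == bb.lo) {
        ++bb.n_lo;
    }
    if (x_new > bb.hi) {
        bb.hi = x_new;
        bb.n_hi = 1;
    } else if (x_new == bb.hi) {
        ++bb.n_hi;
    }
    return true;
}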
+ * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1] */ -static float** chanx_place_cost_fac; //[0...device_ctx.grid.width()-2] -static float** chany_place_cost_fac; //[0...device_ctx.grid.height()-2] +vtr::vector bb_coords, bb_num_on_edges; -/* The following arrays are used by the try_swap function for speed. */ -/* [0...cluster_ctx.clb_nlist.nets().size()-1] */ -static vtr::vector ts_bb_coord_new, ts_bb_edge_new; -static std::vector ts_nets_to_update; +/** + * @brief 2D arrays used to precompute the inverse of the average + * number of tracks per channel between [subhigh] and [sublow]. + * + * Access them as chan?_place_cost_fac[subhigh][sublow]. + * They are used to speed up the computation of the cost function that + * takes the length of the net bounding box in each dimension, divided + * by the average number of tracks in that direction. + * + * For other cost functions they will never be used. + * + * @param chanx_place_cost_fac + * 1st dimension index range: [0...device_ctx.grid.width()-2] + * @param chany_place_cost_fac + * 1st dimension index range: [0...device_ctx.grid.height()-2] + * + * For more detailed structure allocation process and index ranges, see + * alloc_and_load_for_fast_cost_update(). + */ +float** chanx_place_cost_fac; +float** chany_place_cost_fac; + +/** + * @brief The following arrays are used by the try_swap function for speed. + * + * Index range: [0...cluster_ctx.clb_nlist.nets().size()-1] + */ +vtr::vector ts_bb_coord_new, ts_bb_edge_new; +std::vector ts_nets_to_update; + +/********** End of definitions of variables in place_global.h **********/ /* These file-scoped variables keep track of the number of swaps * * rejected, accepted or aborted. The total number of swap attempts * @@ -216,11 +240,6 @@ static const float cross_count[50] = {/* [0..49] */ 1.0, 1.0, 1.0, 1.0828, 1.153 2.5064, 2.5356, 2.5610, 2.5864, 2.6117, 2.6371, 2.6625, 2.6887, 2.7148, 2.7410, 2.7671, 2.7933}; -static float f_update_td_costs_connections_elapsed_sec = 0.; -static float f_update_td_costs_nets_elapsed_sec = 0.; -static float f_update_td_costs_sum_nets_elapsed_sec = 0.; -static float f_update_td_costs_total_elapsed_sec = 0.; - std::unique_ptr f_move_stats_file(nullptr, vtr::fclose); #ifdef VTR_ENABLE_DEBUG_LOGGING @@ -320,16 +339,16 @@ static double comp_bb_cost(e_cost_methods method); static void update_move_nets(int num_nets_affected); static void reset_move_nets(int num_nets_affected); -static e_move_result try_swap(float t, +static e_move_result try_swap(const t_annealing_state* state, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - float rlim, MoveGenerator& move_generator, - TimingInfo* timing_info, + SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, float rlim_escape_fraction, enum e_place_algorithm place_algorithm, float timing_tradeoff); @@ -347,54 +366,39 @@ static int check_placement_consistency(); static int check_block_placement_consistency(); static int check_macro_placement_consistency(); -static float starting_t(t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, +static float starting_t(const t_annealing_state* state, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs, t_annealing_sched annealing_sched, - 
int max_moves, - float rlim, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, - TimingInfo* timing_info, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + SetupTimingInfo* timing_info, MoveGenerator& move_generator, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts); -static bool update_annealing_state(t_annealing_state* state, - float success_rat, - const t_placer_costs& costs, - const t_placer_opts& placer_opts, - const t_annealing_sched& annealing_sched); - -static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid); - static int count_connections(); static double get_std_dev(int n, double sum_x_squared, double av_x); static double recompute_bb_cost(); -static float comp_td_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin); - -static void comp_td_connection_delays(const PlaceDelayModel* delay_model); - static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); -static void invalidate_affected_connection_delays(const t_pl_blocks_to_be_moved& blocks_affected, +static void invalidate_affected_connection_delays(const std::vector& sink_pins_affected, ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info); static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected); -static void update_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost); +static void find_affected_sink_pins(const t_pl_blocks_to_be_moved& blocks_affected, + std::vector& sink_pins_affected); -static void comp_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost); - -static double comp_td_connection_cost(const PlaceDelayModel* delay_mode, const PlacerCriticalities& place_crit, ClusterNetId net, int ipin); -static double sum_td_net_cost(ClusterNetId net); -static double sum_td_costs(); +static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks); static e_move_result assess_swap(double delta_c, double t); @@ -431,41 +435,34 @@ static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr); static void free_try_swap_arrays(); -static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, - t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - int num_connections, - float crit_exponent, - int* outer_crit_iter_count, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info); - -static void recompute_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_costs* costs); - -static void placement_inner_loop(float t, +static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs, + int num_connections, + float crit_exponent, + int* outer_crit_iter_count, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info); + +static void 
placement_inner_loop(const t_annealing_state* state, int temp_num, - float rlim, const t_placer_opts& placer_opts, - int move_lim, - float crit_exponent, int inner_recompute_limit, t_placer_statistics* stats, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, int* moves_since_cost_recompute, ClusteredPinTimingInvalidator* pin_timing_invalidator, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, - SetupTimingInfo* timing_info); + SetupTimingInfo* timing_info, + enum e_place_algorithm place_algorithm); static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, const PlaceDelayModel* delay_model, @@ -495,7 +492,7 @@ static void print_place_status(const size_t num_temps, size_t tot_moves); static void print_resources_utilization(); -static void init_annealing_state(t_annealing_state* state, const t_annealing_sched& annealing_sched, float t, float rlim, int move_lim_max, float crit_exponent); +static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& placer_opts); /*****************************************************************************/ void try_place(const t_placer_opts& placer_opts, @@ -518,12 +515,10 @@ void try_place(const t_placer_opts& placer_opts, auto& timing_ctx = g_vpr_ctx.timing(); auto pre_place_timing_stats = timing_ctx.stats; - int tot_iter, moves_since_cost_recompute, width_fac, num_connections, - outer_crit_iter_count, inner_recompute_limit; - float success_rat, first_crit_exponent, first_rlim; + int tot_iter, moves_since_cost_recompute, width_fac, num_connections, outer_crit_iter_count; + float success_rat, first_crit_exponent; - t_placer_costs costs; - t_placer_prev_inverse_costs prev_inverse_costs; + t_placer_costs costs(placer_opts.place_algorithm); tatum::TimingPathInfo critical_path; float sTNS = NAN; @@ -537,10 +532,12 @@ void try_place(const t_placer_opts& placer_opts, std::shared_ptr placement_delay_calc; std::unique_ptr place_delay_model; std::unique_ptr move_generator; + std::unique_ptr placer_setup_slacks; std::unique_ptr placer_criticalities; std::unique_ptr pin_timing_invalidator; t_pl_blocks_to_be_moved blocks_affected(cluster_ctx.clb_nlist.blocks().size()); + t_placer_timing_update_mode timing_update_mode; /* Allocated here because it goes into timing critical code where each memory allocation is expensive */ IntraLbPbPinLookup pb_gpin_lookup(device_ctx.logical_block_types); @@ -552,8 +549,8 @@ void try_place(const t_placer_opts& placer_opts, num_ts_called = 0; if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - /*do this before the initial placement to avoid messing up the initial placement */ - place_delay_model = alloc_lookups_and_criticalities(chan_width_dist, placer_opts, router_opts, det_routing_arch, segment_inf, directs, num_directs); + /* Do this before the initial placement to avoid messing up the initial placement */ + place_delay_model = alloc_lookups_and_delay_model(chan_width_dist, placer_opts, router_opts, det_routing_arch, segment_inf, directs, num_directs); if (isEchoFileEnabled(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL)) { place_delay_model->dump_echo(getEchoFileName(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL)); @@ -604,6 +601,8 @@ void try_place(const t_placer_opts& placer_opts, timing_info = make_setup_timing_info(placement_delay_calc, placer_opts.timing_update_type); + placer_setup_slacks = 
std::make_unique(cluster_ctx.clb_nlist, netlist_pin_lookup); + placer_criticalities = std::make_unique(cluster_ctx.clb_nlist, netlist_pin_lookup); pin_timing_invalidator = std::make_unique(cluster_ctx.clb_nlist, @@ -611,15 +610,15 @@ void try_place(const t_placer_opts& placer_opts, atom_ctx.nlist, atom_ctx.lookup, *timing_info->timing_graph()); - //Update timing and costs - recompute_criticalities(first_crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); - - timing_info->set_warn_unconstrained(false); //Don't warn again about unconstrained nodes again during placement + //First time compute timing and costs, compute from scratch + initialize_timing_info(first_crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &timing_update_mode, + &costs); critical_path = timing_info->least_slack_critical_path(); @@ -635,26 +634,34 @@ void try_place(const t_placer_opts& placer_opts, outer_crit_iter_count = 1; - prev_inverse_costs.timing_cost = 1 / costs.timing_cost; - prev_inverse_costs.bb_cost = 1 / costs.bb_cost; - costs.cost = 1; /*our new cost function uses normalized values of */ - /*bb_cost and timing_cost, the value of cost will be reset */ - /*to 1 at each temperature when *_TIMING_DRIVEN_PLACE is true */ - } else { /*BOUNDING_BOX_PLACE */ - costs.cost = costs.bb_cost = comp_bb_cost(NORMAL); - costs.timing_cost = 0; + /** + * Initialize the normalization factors. Calling costs.update_norm_factors() here + * would fail the golden results of strong_multiclock benchmark + */ + costs.timing_cost_norm = 1 / costs.timing_cost; + costs.bb_cost_norm = 1 / costs.bb_cost; + costs.cost = 1; + + } else { //placer_opts.place_algorithm == BOUNDING_BOX_PLACE + + //cost is the same as wirelength cost + costs.bb_cost = comp_bb_cost(NORMAL); + costs.cost = costs.bb_cost; + + //Timing cost and normalization factors are not used + costs.timing_cost = INVALID_COST; + costs.timing_cost_norm = INVALID_COST; + costs.bb_cost_norm = INVALID_COST; + outer_crit_iter_count = 0; num_connections = 0; first_crit_exponent = 0; - - prev_inverse_costs.timing_cost = 0; /*inverses not used */ - prev_inverse_costs.bb_cost = 0; } //Sanity check that initial placement is legal check_place(costs, place_delay_model.get(), placer_criticalities.get(), placer_opts.place_algorithm); - //Initial pacement statistics + //Initial placement statistics VTR_LOG("Initial placement cost: %g bb_cost: %g td_cost: %g\n", costs.cost, costs.bb_cost, costs.timing_cost); if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { @@ -687,63 +694,47 @@ void try_place(const t_placer_opts& placer_opts, print_place(nullptr, nullptr, filename.c_str()); } - int move_lim = 1; - if (placer_opts.effort_scaling == e_place_effort_scaling::CIRCUIT) { - //This scales the move limit proportional to num_blocks ^ (4/3) - move_lim = (int)(annealing_sched.inner_num * pow(cluster_ctx.clb_nlist.blocks().size(), 1.3333)); - } else if (placer_opts.effort_scaling == e_place_effort_scaling::DEVICE_CIRCUIT) { - //This scales the move limit proportional to device_size ^ (2/3) * num_blocks ^ (2/3) - // - //For highly utilized devices (device_size ~ num_blocks) this is the same as - //num_blocks ^ (4/3). - // - //For low utilization devices (device_size >> num_blocks) this performs more - //moves (device_size ^ (2/3)) to ensure better optimization. 
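The removed block above is replaced by a call to get_initial_move_lim(); its definition is not part of this hunk, but the scaling rules it has to implement are the ones this old comment describes. A stand-alone sketch of the two effort-scaling formulas, assuming the same inputs (annealing_sched.inner_num, the clustered block count, and the device grid dimensions):

#include <algorithm>
#include <cmath>
#include <cstddef>

enum class EffortScaling { CIRCUIT, DEVICE_CIRCUIT };

// Moves per temperature, following the two scaling rules described above.
// Illustrative sketch only, not the body of get_initial_move_lim().
int initial_move_lim_sketch(EffortScaling scaling,
                            float inner_num,        // annealing_sched.inner_num
                            std::size_t num_blocks, // clustered netlist blocks
                            std::size_t grid_width,
                            std::size_t grid_height) {
    double move_lim;
    if (scaling == EffortScaling::CIRCUIT) {
        // Proportional to num_blocks ^ (4/3).
        move_lim = inner_num * std::pow((double)num_blocks, 4.0 / 3.0);
    } else {
        // Proportional to device_size ^ (2/3) * num_blocks ^ (2/3); for highly
        // utilized devices this degenerates to num_blocks ^ (4/3) as well.
        double device_size = (double)(grid_width * grid_height);
        move_lim = inner_num * std::pow(device_size, 2.0 / 3.0)
                   * std::pow((double)num_blocks, 2.0 / 3.0);
    }
    // Never return 0: a run with a random placement still needs at least one
    // move per temperature to avoid divide-by-zero and empty-vector problems.
    return std::max(1, (int)move_lim);
}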
In this case, - //more moves than num_blocks ^ (4/3) may be required, since the search space - //is larger. - float device_size = device_ctx.grid.width() * device_ctx.grid.height(); - move_lim = (int)(annealing_sched.inner_num * pow(device_size, 2. / 3.) * pow(cluster_ctx.clb_nlist.blocks().size(), 2. / 3.)); - } else { - VPR_ERROR(VPR_ERROR_PLACE, "Unrecognized placer effort scaling"); - } - VTR_LOG("Moves per temperature: %d\n", move_lim); - - /* Sometimes I want to run the router with a random placement. Avoid * - * using 0 moves to stop division by 0 and 0 length vector problems, * - * by setting move_lim to 1 (which is still too small to do any * - * significant optimization). */ - if (move_lim <= 0) - move_lim = 1; + int first_move_lim = get_initial_move_lim(placer_opts, annealing_sched); + int inner_recompute_limit; if (placer_opts.inner_loop_recompute_divider != 0) { - inner_recompute_limit = (int)(0.5 + (float)move_lim / (float)placer_opts.inner_loop_recompute_divider); + inner_recompute_limit = (int)(0.5 + (float)first_move_lim / (float)placer_opts.inner_loop_recompute_divider); } else { /*don't do an inner recompute */ - inner_recompute_limit = move_lim + 1; + inner_recompute_limit = first_move_lim + 1; } int quench_recompute_limit; if (placer_opts.quench_recompute_divider != 0) { - quench_recompute_limit = (int)(0.5 + (float)move_lim / (float)placer_opts.quench_recompute_divider); + quench_recompute_limit = (int)(0.5 + (float)first_move_lim / (float)placer_opts.quench_recompute_divider); } else { /*don't do an quench recompute */ - quench_recompute_limit = move_lim + 1; + quench_recompute_limit = first_move_lim + 1; } - first_rlim = (float)max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); + /* Get the first range limiter */ + float first_rlim = float(std::max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1)); - float first_t = starting_t(&costs, &prev_inverse_costs, - annealing_sched, move_lim, first_rlim, - place_delay_model.get(), - placer_criticalities.get(), - timing_info.get(), - *move_generator, - pin_timing_invalidator.get(), - blocks_affected, - placer_opts); + /* Set the temperature high so essentially all swaps will be accepted */ + /* when trying to determine the starting temp for placement inner loop. 
*/ + float first_t = HUGE_POSITIVE_FLOAT; + + /* Initialize annealing state variables */ + t_annealing_state state(annealing_sched, first_t, first_rlim, first_move_lim, first_crit_exponent); - t_annealing_state state; - init_annealing_state(&state, annealing_sched, first_t, first_rlim, move_lim, first_crit_exponent); + /* Update the starting temperature for placement annealing to a more appropriate value */ + state.t = starting_t(&state, + &timing_update_mode, + &costs, + annealing_sched, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + timing_info.get(), + *move_generator, + pin_timing_invalidator.get(), + blocks_affected, + placer_opts); if (!placer_opts.move_stats_file.empty()) { f_move_stats_file = std::unique_ptr(vtr::fopen(placer_opts.move_stats_file.c_str(), "w"), vtr::fclose); @@ -761,30 +752,30 @@ void try_place(const t_placer_opts& placer_opts, /* Outer loop of the simulated annealing begins */ do { vtr::Timer temperature_timer; - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - costs.cost = 1; - } - outer_loop_recompute_criticalities(placer_opts, &costs, &prev_inverse_costs, - num_connections, - state.crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get()); - - placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, - state.move_lim, state.crit_exponent, inner_recompute_limit, &stats, - &costs, - &prev_inverse_costs, + outer_loop_update_timing_info(placer_opts, &timing_update_mode, + &costs, + num_connections, + state.crit_exponent, + &outer_crit_iter_count, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get()); + + placement_inner_loop(&state, num_temps, placer_opts, + inner_recompute_limit, &stats, + &timing_update_mode, &costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), *move_generator, blocks_affected, - timing_info.get()); + timing_info.get(), + placer_opts.place_algorithm); tot_iter += state.move_lim; @@ -818,39 +809,49 @@ void try_place(const t_placer_opts& placer_opts, /* Outer loop of the simmulated annealing ends */ auto pre_quench_timing_stats = timing_ctx.stats; + + /* Start quench */ + state.t = 0; //Freeze out: only accept solutions that improve placement + state.move_lim = first_move_lim; //Revert the move limit to initial value + { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); - outer_loop_recompute_criticalities(placer_opts, &costs, - &prev_inverse_costs, - num_connections, - state.crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get()); - - state.t = 0; /* freeze out */ + outer_loop_update_timing_info(placer_opts, &timing_update_mode, + &costs, + num_connections, + state.crit_exponent, + &outer_crit_iter_count, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get()); + + //Use setup slack analysis if the placer is timing driven + //and the quench metric is SETUP_SLACK. 
Otherwise, use the + //same cost formulation as the annealing stage + auto quench_algorithm = get_placement_quench_algorithm(placer_opts); /* Run inner loop again with temperature = 0 so as to accept only swaps * which reduce the cost of the placement */ - placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, - move_lim, state.crit_exponent, quench_recompute_limit, &stats, - &costs, - &prev_inverse_costs, + placement_inner_loop(&state, num_temps, placer_opts, + quench_recompute_limit, &stats, + &timing_update_mode, &costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), *move_generator, blocks_affected, - timing_info.get()); + timing_info.get(), + quench_algorithm); - tot_iter += move_lim; + tot_iter += state.move_lim; ++num_temps; - calc_placer_stats(stats, success_rat, std_dev, costs, move_lim); + calc_placer_stats(stats, success_rat, std_dev, costs, state.move_lim); if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { critical_path = timing_info->least_slack_critical_path(); @@ -894,12 +895,18 @@ void try_place(const t_placer_opts& placer_opts, VTR_ASSERT(timing_info); //Update timing and costs - recompute_criticalities(state.crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); + timing_update_mode.update_criticalities = true; + timing_update_mode.update_setup_slacks = true; + update_setup_slacks_and_criticalities(state.crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &timing_update_mode, + &costs); + + commit_setup_slacks(placer_setup_slacks.get()); critical_path = timing_info->least_slack_critical_path(); @@ -949,22 +956,30 @@ void try_place(const t_placer_opts& placer_opts, print_timing_stats("Placement Quench", post_quench_timing_stats, pre_quench_timing_stats); print_timing_stats("Placement Total ", timing_ctx.stats, pre_place_timing_stats); - VTR_LOG("update_td_costs: connections %g nets %g sum_nets %g total %g\n", f_update_td_costs_connections_elapsed_sec, f_update_td_costs_nets_elapsed_sec, f_update_td_costs_sum_nets_elapsed_sec, f_update_td_costs_total_elapsed_sec); + auto update_td_costs_runtime_stats = get_update_td_costs_runtime_stats(); + + VTR_LOG("update_td_costs: connections %g nets %g sum_nets %g total %g\n", + update_td_costs_runtime_stats.connections_elapsed_sec, + update_td_costs_runtime_stats.nets_elapsed_sec, + update_td_costs_runtime_stats.sum_nets_elapsed_sec, + update_td_costs_runtime_stats.total_elapsed_sec); } -/* Function to recompute the criticalities before the inner loop of the annealing */ -static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, - t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - int num_connections, - float crit_exponent, - int* outer_crit_iter_count, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info) { - if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE) +/* Function to update the setup slacks and criticalities before the inner loop of the annealing/quench */ +static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs, + int num_connections, + float crit_exponent, + int* 
outer_crit_iter_count, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info) { + if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE) { return; + } /*at each temperature change we update these values to be used */ /*for normalizing the tradeoff between timing and wirelength (bb) */ @@ -976,67 +991,45 @@ static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, num_connections = std::max(num_connections, 1); //Avoid division by zero VTR_ASSERT(num_connections > 0); - //Update timing information - recompute_criticalities(crit_exponent, - delay_model, - criticalities, - pin_timing_invalidator, - timing_info, - costs); + //Update all timing information + timing_update_mode->update_criticalities = true; + timing_update_mode->update_setup_slacks = true; + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + //Always commit the setup slacks when they are updated + commit_setup_slacks(setup_slacks); + *outer_crit_iter_count = 0; } (*outer_crit_iter_count)++; - /*at each temperature change we update these values to be used */ - /*for normalizing the tradeoff between timing and wirelength (bb) */ - prev_inverse_costs->bb_cost = 1 / costs->bb_cost; - /*Prevent inverse timing cost from going to infinity */ - prev_inverse_costs->timing_cost = min(1 / costs->timing_cost, MAX_INV_TIMING_COST); -} - -//Update timing information based on current placement by running STA to get new slacks, -//and calculate updated criticalities and timing costs -static void recompute_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_costs* costs) { - //Run STA to update slacks and adjusted/relaxed criticalities - timing_info->update(); - - //Update placer'criticalities (e.g. sharpen with crit_exponent) - criticalities->update_criticalities(timing_info, crit_exponent); - - //Update connection, net and total timing costs based on new criticalities -#ifdef INCR_COMP_TD_COSTS - update_td_costs(delay_model, *criticalities, &costs->timing_cost); -#else - comp_td_costs(delay_model, *criticalities, &costs->timing_cost); -#endif - - //Clear invalidation state - pin_timing_invalidator->reset(); + costs->update_norm_factors(); ///move_lim; inner_iter++) { + e_move_result swap_result = try_swap(state, + timing_update_mode, + costs, move_generator, timing_info, pin_timing_invalidator, blocks_affected, delay_model, criticalities, + setup_slacks, placer_opts.rlim_escape_fraction, - placer_opts.place_algorithm, + place_algorithm, placer_opts.timing_tradeoff); if (swap_result == ACCEPTED) { @@ -1072,7 +1068,7 @@ static void placement_inner_loop(float t, num_swap_accepted++; } else if (swap_result == ABORTED) { num_swap_aborted++; - } else { // swap_result == REJECTED + } else { //swap_result == REJECTED num_swap_rejected++; } @@ -1081,21 +1077,28 @@ static void placement_inner_loop(float t, * We do this only once in a while, since it is expensive. 
*/ if (inner_crit_iter_count >= inner_recompute_limit - && inner_iter != move_lim - 1) { /*on last iteration don't recompute */ + && inner_iter != state->move_lim - 1) { /*on last iteration don't recompute */ inner_crit_iter_count = 0; #ifdef VERBOSE VTR_LOG("Inner loop recompute criticalities\n"); #endif /* Using the delays in connection_delay, do a timing analysis to update slacks and - * criticalities and update the timing cost since it will change. + * criticalities and update the timing cost since they will change. */ - recompute_criticalities(crit_exponent, - delay_model, - criticalities, - pin_timing_invalidator, - timing_info, - costs); + timing_update_mode->update_criticalities = true; + timing_update_mode->update_setup_slacks = true; + update_setup_slacks_and_criticalities(state->crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + //Always commit the setup slacks when they are updated + commit_setup_slacks(setup_slacks); } inner_crit_iter_count++; } @@ -1109,7 +1112,7 @@ static void placement_inner_loop(float t, /* Lines below prevent too much round-off error from accumulating * in the cost over many iterations (due to incremental updates). - * This round-off can lead to error checks failing because the cost + * This round-off can lead to error checks failing because the cost * is different from what you get when you recompute from scratch. */ ++(*moves_since_cost_recompute); @@ -1120,9 +1123,9 @@ static void placement_inner_loop(float t, if (placer_opts.placement_saves_per_temperature >= 1 && inner_iter > 0 - && (inner_iter + 1) % (move_lim / placer_opts.placement_saves_per_temperature) == 0) { + && (inner_iter + 1) % (state->move_lim / placer_opts.placement_saves_per_temperature) == 0) { std::string filename = vtr::string_fmt("placement_%03d_%03d.place", temp_num + 1, inner_placement_save_count); - VTR_LOG("Saving placement to file at temperature move %d / %d: %s\n", inner_iter, move_lim, filename.c_str()); + VTR_LOG("Saving placement to file at temperature move %d / %d: %s\n", inner_iter, state->move_lim, filename.c_str()); print_place(nullptr, nullptr, filename.c_str()); ++inner_placement_save_count; } @@ -1194,122 +1197,42 @@ static double get_std_dev(int n, double sum_x_squared, double av_x) { return (std_dev); } -static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) { - /* Update the range limited to keep acceptance prob. near 0.44. Use * - * a floating point rlim to allow gradual transitions at low temps. */ - - float upper_lim; - - *rlim = (*rlim) * (1. - 0.44 + success_rat); - upper_lim = max(grid.width() - 1, grid.height() - 1); - *rlim = min(*rlim, upper_lim); - *rlim = max(*rlim, (float)1.); -} - -/* Update the annealing state according to the annealing schedule selected. - * USER_SCHED: A manual fixed schedule with fixed alpha and exit criteria. - * AUTO_SCHED: A more sophisticated schedule where alpha varies based on success ratio. - * DUSTY_SCHED: This schedule jumps backward and slows down in response to success ratio. - * See doc/src/vpr/dusty_sa.rst for more details. - * - * Returns true until the schedule is finished. */ -static bool update_annealing_state(t_annealing_state* state, - float success_rat, - const t_placer_costs& costs, - const t_placer_opts& placer_opts, - const t_annealing_sched& annealing_sched) { - /* Return `false` when the exit criterion is met. 
*/ - if (annealing_sched.type == USER_SCHED) { - state->t *= annealing_sched.alpha_t; - return state->t >= annealing_sched.exit_t; - } - - auto& device_ctx = g_vpr_ctx.device(); - auto& cluster_ctx = g_vpr_ctx.clustering(); - - /* Automatic annealing schedule */ - float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size(); - - if (annealing_sched.type == DUSTY_SCHED) { - bool restart_temp = state->t < t_exit || std::isnan(t_exit); //May get nan if there are no nets - if (success_rat < annealing_sched.success_min || restart_temp) { - if (state->alpha > annealing_sched.alpha_max) return false; - state->t = state->restart_t / sqrt(state->alpha); // Take a half step from the restart temperature. - state->alpha = 1.0 - ((1.0 - state->alpha) * annealing_sched.alpha_decay); - } else { - if (success_rat > annealing_sched.success_target) { - state->restart_t = state->t; - } - state->t *= state->alpha; - } - state->move_lim = std::max(1, std::min(state->move_lim_max, (int)(state->move_lim_max * (annealing_sched.success_target / success_rat)))); - } else { /* annealing_sched.type == AUTO_SCHED */ - if (success_rat > 0.96) { - state->alpha = 0.5; - } else if (success_rat > 0.8) { - state->alpha = 0.9; - } else if (success_rat > 0.15 || state->rlim > 1.) { - state->alpha = 0.95; - } else { - state->alpha = 0.8; - } - state->t *= state->alpha; - - // Must be duplicated to retain previous behavior - if (state->t < t_exit || std::isnan(t_exit)) return false; - } - - // Gradually changes from the initial crit_exponent to the final crit_exponent based on how much the range limit has shrunk. - // The idea is that as the range limit shrinks (indicating we are fine-tuning a more optimized placement) we can focus more on a smaller number of critical connections, which a higher crit_exponent achieves. - update_rlim(&state->rlim, success_rat, device_ctx.grid); - - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - state->crit_exponent = (1 - (state->rlim - FINAL_RLIM) * state->inverse_delta_rlim) - * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) - + placer_opts.td_place_exp_first; - } - - return true; -} - -static float starting_t(t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, +static float starting_t(const t_annealing_state* state, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs, t_annealing_sched annealing_sched, - int max_moves, - float rlim, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, - TimingInfo* timing_info, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + SetupTimingInfo* timing_info, MoveGenerator& move_generator, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts) { /* Finds the starting temperature (hot condition). */ - - int i, num_accepted, move_lim; - double std_dev, av, sum_of_squares; /* Double important to avoid round off */ + int num_accepted = 0; + double std_dev, av = 0, sum_of_squares = 0; /* Double important to avoid round off */ if (annealing_sched.type == USER_SCHED) return (annealing_sched.init_t); auto& cluster_ctx = g_vpr_ctx.clustering(); - move_lim = min(max_moves, (int)cluster_ctx.clb_nlist.blocks().size()); - - num_accepted = 0; - av = 0.; - sum_of_squares = 0.; - - /* Try one move per block. Set t high so essentially all accepted. */ + /* Determines the block swap loop count. 
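The starting_t() routine that begins above estimates the initial annealing temperature by attempting roughly one swap per block while essentially every move is accepted, then looking at the spread of the costs it saw. A condensed sketch of that idea follows; the 20x multiplier is the classic annealing rule of thumb and is stated here as an assumption, not a quote of the VPR code:

#include <cmath>
#include <vector>

// Estimate a "hot" starting temperature from the costs observed while almost
// every move is accepted.
double estimate_starting_temperature(const std::vector<double>& accepted_costs) {
    const double n = (double)accepted_costs.size();
    if (n < 2.0) return 0.0;

    double av = 0.0, sum_of_squares = 0.0; // doubles to avoid round-off, as above
    for (double c : accepted_costs) {
        av += c;
        sum_of_squares += c * c;
    }
    av /= n;

    // Sample standard deviation, in the same spirit as get_std_dev().
    double variance = (sum_of_squares - n * av * av) / (n - 1.0);
    double std_dev = variance > 0.0 ? std::sqrt(variance) : 0.0;

    // Start hot enough that cost swings of several standard deviations are
    // still routinely accepted.
    return 20.0 * std_dev;
}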
*/ + int move_lim = std::min(state->move_lim_max, int(cluster_ctx.clb_nlist.blocks().size())); - for (i = 0; i < move_lim; i++) { - e_move_result swap_result = try_swap(HUGE_POSITIVE_FLOAT, costs, prev_inverse_costs, rlim, + for (int i = 0; i < move_lim; i++) { + //Setup slack analysis is not deployed here, so crit_exponent and setup_slacks are unused + e_move_result swap_result = try_swap(state, + timing_update_mode, + costs, move_generator, timing_info, pin_timing_invalidator, blocks_affected, delay_model, criticalities, + setup_slacks, placer_opts.rlim_escape_fraction, placer_opts.place_algorithm, placer_opts.timing_tradeoff); @@ -1373,43 +1296,56 @@ static void reset_move_nets(int num_nets_affected) { } } -static e_move_result try_swap(float t, +/** + * @brief Picks some block and moves it to another spot. + * + * If the new location is empty, the block is moved there directly. If the new location + * is occupied, the two blocks are switched. Due to the different sizes of the blocks, + * this block switching may occur multiple times. It might also cause the + * current swap attempt to abort due to an inability to find suitable locations + * for the moved blocks. + * + * The move generator will record all the switched blocks in the variable + * `blocks_affected`. Afterwards, the move will be assessed by the chosen + * cost formulation. Currently, there are three ways to assess the move cost, + * corresponding to the values of the enum type `e_place_algorithm`. + * + * @return Whether the block swap is accepted, rejected or aborted. + */ +static e_move_result try_swap(const t_annealing_state* state, + t_placer_timing_update_mode* timing_update_mode, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - float rlim, MoveGenerator& move_generator, - TimingInfo* timing_info, + SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, float rlim_escape_fraction, enum e_place_algorithm place_algorithm, float timing_tradeoff) { - /* Picks some block and moves it to another spot. If this spot is * - * occupied, switch the blocks. Assess the change in cost function. * - * rlim is the range limiter. * - * Returns whether the swap is accepted, rejected or aborted. * - * Passes back the new value of the cost functions. */ - num_ts_called++; MoveOutcomeStats move_outcome_stats; - /* I'm using negative values of proposed_net_cost as a flag, so DO NOT * - * use cost functions that can go negative. */ + /* I'm using negative values of proposed_net_cost as a flag, */ + /* so DO NOT use cost functions that can go negative. */ - double delta_c = 0; /* Change in cost due to this swap. */ - double bb_delta_c = 0; - double timing_delta_c = 0; + double delta_c = 0; //Change in cost due to this swap. + double bb_delta_c = 0; //Change in the bounding box (wiring) cost. + double timing_delta_c = 0; //Change in the timing cost (delay * criticality). - //Allow some fraction of moves to not be restricted by rlim, - //in the hopes of better escaping local minima + /* Allow some fraction of moves to not be restricted by rlim, */ + /* in the hopes of better escaping local minima. */ + float rlim; if (rlim_escape_fraction > 0. 
&& vtr::frand() < rlim_escape_fraction) { rlim = std::numeric_limits::infinity(); + } else { + rlim = state->rlim; } - //Generate a new move (perturbation) used to explore the space of possible placements + /* Generate a new move (perturbation) used to explore the space of possible placements */ e_create_move create_move_outcome = move_generator.propose_move(blocks_affected, rlim); LOG_MOVE_STATS_PROPOSED(t, blocks_affected); @@ -1417,7 +1353,7 @@ static e_move_result try_swap(float t, e_move_result move_outcome = ABORTED; if (create_move_outcome == e_create_move::ABORT) { - //Proposed move is not legal -- give up on this move + /* Proposed move is not legal -- give up on this move */ clear_move_blocks(blocks_affected); LOG_MOVE_STATS_OUTCOME(std::numeric_limits::quiet_NaN(), @@ -1426,98 +1362,191 @@ static e_move_result try_swap(float t, "ABORTED", "illegal move"); move_outcome = ABORTED; - } else { - VTR_ASSERT(create_move_outcome == e_create_move::VALID); - /* - * To make evaluating the move simpler (e.g. calculating changed bounding box), - * we first move the blocks to thier new locations (apply the move to - * place_ctx.block_locs) and then computed the change in cost. If the move is - * accepted, the inverse look-up in place_ctx.grid_blocks is updated (committing - * the move). If the move is rejected the blocks are returned to their original - * positions (reverting place_ctx.block_locs to its original state). - * - * Note that the inverse look-up place_ctx.grid_blocks is only updated - * after move acceptance is determined, and so should not be used when - * evaluating a move. - */ + return move_outcome; + } - //Update the block positions - apply_move_blocks(blocks_affected); + /* Move is valid. Proceed to analyze cost. */ + VTR_ASSERT(create_move_outcome == e_create_move::VALID); - // Find all the nets affected by this swap and update their costs - int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, - delay_model, - criticalities, - blocks_affected, - bb_delta_c, - timing_delta_c); - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - /*in this case we redefine delta_c as a combination of timing and bb. * - *additionally, we normalize all values, therefore delta_c is in * - *relation to 1*/ + /* + * To make evaluating the move simpler (e.g. calculating changed bounding box), + * we first move the blocks to their new locations (apply the move to + * place_ctx.block_locs) and then compute the change in cost. If the move is + * accepted, the inverse look-up in place_ctx.grid_blocks is updated (committing + * the move). If the move is rejected the blocks are returned to their original + * positions (reverting place_ctx.block_locs to its original state). + * + * Note that the inverse look-up place_ctx.grid_blocks is only updated + * after move acceptance is determined, and so should not be used when + * evaluating a move. + */ - delta_c = (1 - timing_tradeoff) * bb_delta_c * prev_inverse_costs->bb_cost - + timing_tradeoff * timing_delta_c * prev_inverse_costs->timing_cost; - } else { - delta_c = bb_delta_c; + //Update the block positions + apply_move_blocks(blocks_affected); + + //Find all the nets affected by this swap and update the wiring costs. + //This cost value doesn't depend on the timing info. + //Also find all the pins affected by the swap, and calculate new connection + //delays and timing costs and store them in the proposed_* data structures. 
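The big comment above describes the overall shape of try_swap(): tentatively apply the move, evaluate the delta cost with whichever formulation is active, then either commit or revert. A schematic of that flow using the standard Metropolis acceptance rule (the callables are placeholders standing in for apply_move_blocks(), the cost evaluation, commit_move_blocks()/update_move_nets() and revert_move_blocks()):

#include <cmath>

enum class MoveResult { Accepted, Rejected };

// Evaluate-then-commit-or-revert skeleton for a single proposed move.
template <typename ApplyFn, typename DeltaFn, typename CommitFn, typename RevertFn>
MoveResult evaluate_move(ApplyFn apply_blocks,
                         DeltaFn compute_delta_cost,
                         CommitFn commit,
                         RevertFn revert,
                         double temperature,
                         double rand01) {
    apply_blocks();                        // tentatively move the blocks
    double delta_c = compute_delta_cost(); // assess with the chosen cost formulation

    // Always take improvements; take degradations with probability
    // exp(-delta/T) while T > 0 (so the quench at T = 0 only accepts gains).
    bool accept = (delta_c <= 0.0)
                  || (temperature > 0.0 && rand01 < std::exp(-delta_c / temperature));

    if (accept) {
        commit(); // update the inverse lookup and the committed cost structures
        return MoveResult::Accepted;
    }
    revert();     // restore block positions and any tentative cost values
    return MoveResult::Rejected;
}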
+ int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, + delay_model, + criticalities, + blocks_affected, + bb_delta_c, + timing_delta_c); + + //Find all the sink pins with changed connection delays from the affected blocks. + //These sink pins will be passed into the pin_timing_invalidator for timing update. + //They will also be added to the pin invalidator when we wish to revert a timing update. + std::vector sink_pins_affected; + find_affected_sink_pins(blocks_affected, sink_pins_affected); + + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + //Invalidates timing of modified connections for incremental timing updates. + invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); + + //Update the connection_timing_cost and connection_delay + //values from the temporary values. + //This step is necessary for performing timing update. + commit_td_cost(blocks_affected); + + //Update timing information. Since we are analyzing setup slacks, + //we only update those values and keep the criticalities stale + //so as not to interfere with the original timing cost algorithm. + // + //Note: the timing info must be called after applying block moves + //and committing the timing driven delays and costs. + //If we wish to revert this timing update due to move rejection, + //we need to first revert block moves and restore timing values. + timing_update_mode->update_criticalities = false; + timing_update_mode->update_setup_slacks = true; + update_setup_slacks_and_criticalities(state->crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + /* Get the setup slack analysis cost */ + //TODO: calculate a weighted average of the slack cost and wiring cost + delta_c = analyze_setup_slack_cost(setup_slacks); + + } else if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + /*in this case we redefine delta_c as a combination of timing and bb. * + *additionally, we normalize all values, therefore delta_c is in * + *relation to 1*/ + + delta_c = (1 - timing_tradeoff) * bb_delta_c * costs->bb_cost_norm + + timing_tradeoff * timing_delta_c * costs->timing_cost_norm; + + } else { //place_algorithm == BOUNDING_BOX_PLACE (wiring cost) + delta_c = bb_delta_c; + } + + /* 1 -> move accepted, 0 -> rejected. */ + move_outcome = assess_swap(delta_c, state->t); + + if (move_outcome == ACCEPTED) { + costs->cost += delta_c; + costs->bb_cost += bb_delta_c; + + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + /* Update the timing driven cost as usual */ + costs->timing_cost += timing_delta_c; + + //Commit the setup slack information + //The timing delay and cost values should be committed already + commit_setup_slacks(setup_slacks); } - /* 1 -> move accepted, 0 -> rejected. */ - move_outcome = assess_swap(delta_c, t); - - if (move_outcome == ACCEPTED) { - costs->cost += delta_c; - costs->bb_cost += bb_delta_c; - - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - costs->timing_cost += timing_delta_c; + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + costs->timing_cost += timing_delta_c; + + //Invalidates timing of modified connections for incremental timing + //updates. This routine relies on comparing proposed_connection_delay + //and connection_delay. If the setup slack analysis was not performed, + //the sink pins are yet to be invalidated. 
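In the SETUP_SLACK_ANALYSIS_PLACE branch above, the move is judged by analyze_setup_slack_cost(), whose body lies outside this hunk. One plausible shape for such a slack-based metric, shown purely as an illustration rather than as the actual implementation, is to compare the most critical of the changed setup slacks before and after the move:

#include <algorithm>
#include <cstddef>
#include <vector>

// Compare the most critical setup slacks before and after a move. Returns a
// negative value when the worst slacks improve, so it plugs into the usual
// "delta_c < 0 is good" convention. Hypothetical helper for illustration.
double slack_based_delta(std::vector<float> slacks_before,
                         std::vector<float> slacks_after) {
    // Most critical (smallest) slacks first.
    std::sort(slacks_before.begin(), slacks_before.end());
    std::sort(slacks_after.begin(), slacks_after.end());

    const std::size_t n = std::min(slacks_before.size(), slacks_after.size());
    for (std::size_t i = 0; i < n; ++i) {
        if (slacks_after[i] != slacks_before[i]) {
            // Positive delta (bad) when the worst affected slack got worse,
            // negative delta (good) when it improved.
            return (double)slacks_before[i] - (double)slacks_after[i];
        }
    }
    return 0.0; // No change among the compared slacks.
}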
+ invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); + + //Update the connection_timing_cost and connection_delay + //values from the temporary values + commit_td_cost(blocks_affected); + } - //Invalidates timing of modified connections for incremental timing updates - //Must be called before commit_td_cost since it relies on comparing - //proposed_connection_delay and connection_delay - invalidate_affected_connection_delays(blocks_affected, - pin_timing_invalidator, - timing_info); + /* Update net cost functions and reset flags. */ + update_move_nets(num_nets_affected); - /*update the connection_timing_cost and connection_delay - * values from the temporary values */ - commit_td_cost(blocks_affected); - } + /* Update clb data structures since we kept the move. */ + commit_move_blocks(blocks_affected); - /* update net cost functions and reset flags. */ - update_move_nets(num_nets_affected); + } else { //move_outcome == REJECTED - /* Update clb data structures since we kept the move. */ - commit_move_blocks(blocks_affected); + /* Reset the net cost function flags first. */ + reset_move_nets(num_nets_affected); - } else { /* Move was rejected. */ - /* Reset the net cost function flags first. */ - reset_move_nets(num_nets_affected); + /* Restore the place_ctx.block_locs data structures to their state before the move. */ + revert_move_blocks(blocks_affected); - /* Restore the place_ctx.block_locs data structures to their state before the move. */ - revert_move_blocks(blocks_affected); + if (place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { + //Revert the timing delays and costs to pre-update values. + //These routines must be called after reverting the block moves + //if we wish to perform a reversion of the previous timing update. + // + //TODO: make this process incremental. Currently, all the delays + //are recomputed before all the timing costs are recomputed. 
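// A stand-alone sketch (illustrative only) of the comparison rule that
// analyze_setup_slack_cost() applies: sort both slack vectors ascending (worst
// first) and return the difference of the first pair that differs, so a negative
// result means the worst changed slack improved and the quench accepts the move.
#include <algorithm>
#include <cstdio>
#include <vector>

static float slack_cost(std::vector<float> original, std::vector<float> proposed) {
    std::sort(original.begin(), original.end());
    std::sort(proposed.begin(), proposed.end());
    for (size_t i = 0; i < original.size(); ++i) {
        float diff = original[i] - proposed[i]; // proposed > original => improvement => negative
        if (diff != 0) return diff;
    }
    return 1; // nothing changed: return a positive cost so the move is rejected
}

int main() {
    // Hypothetical slacks in seconds: the worst connection improves from -2 ns to -1 ns.
    float cost = slack_cost({-2e-9f, 0.5e-9f}, {-1e-9f, 0.4e-9f});
    std::printf("setup slack cost = %g (negative => accept)\n", cost);
}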
+ comp_td_connection_delays(delay_model); + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + + /* Re-invalidate the affected sink pins */ + invalidate_affected_connection_delays(sink_pins_affected, + pin_timing_invalidator, + timing_info); + + /* Revert the timing update */ + update_setup_slacks_and_criticalities(state->crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + /* Check the consistency of the setup slack values */ + VTR_ASSERT_SAFE_MSG( + verify_connection_setup_slacks(setup_slacks), + "The current setup slacks should be identical to the values before the try swap timing info update."); + } - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - revert_td_cost(blocks_affected); - } + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + /* Discard the values stored in proposed_* data structures */ + revert_td_cost(blocks_affected); } + } - move_outcome_stats.delta_cost_norm = delta_c; - move_outcome_stats.delta_bb_cost_norm = bb_delta_c * prev_inverse_costs->bb_cost; - move_outcome_stats.delta_timing_cost_norm = timing_delta_c * prev_inverse_costs->timing_cost; + /* Record the costs in the move outcome stats */ + move_outcome_stats.delta_cost_norm = delta_c; + move_outcome_stats.delta_bb_cost_norm = bb_delta_c * costs->bb_cost_norm; + move_outcome_stats.delta_timing_cost_norm = timing_delta_c * costs->timing_cost_norm; - move_outcome_stats.delta_bb_cost_abs = bb_delta_c; - move_outcome_stats.delta_timing_cost_abs = timing_delta_c; + move_outcome_stats.delta_bb_cost_abs = bb_delta_c; + move_outcome_stats.delta_timing_cost_abs = timing_delta_c; - LOG_MOVE_STATS_OUTCOME(delta_c, bb_delta_c, timing_delta_c, - (move_outcome ? "ACCEPTED" : "REJECTED"), ""); - } + LOG_MOVE_STATS_OUTCOME(delta_c, bb_delta_c, timing_delta_c, + (move_outcome ? "ACCEPTED" : "REJECTED"), ""); move_outcome_stats.outcome = move_outcome; move_generator.process_outcome(move_outcome_stats); + /* Clear the data structure containing block move info */ clear_move_blocks(blocks_affected); //VTR_ASSERT(check_macro_placement_consistency() == 0); @@ -1526,13 +1555,25 @@ static e_move_result try_swap(float t, check_place(*costs, delay_model, place_algorithm); #endif - return (move_outcome); + return move_outcome; } -//Puts all the nets changed by the current swap into nets_to_update, -//and updates their bounding box. -// -//Returns the number of affected nets. +/** + * @brief Find all the nets and pins affected by this swap and update costs. + * + * Find all the nets affected by this swap and update the bouding box (wiring) + * costs. This cost function doesn't depend on the timing info. + * + * Find all the pins affected by this swap and update the timing cost. + * The timing costs are calculated by getting the new connection delays, multiplied + * by the connection criticalities returned by the timing analyzer. + * These timing costs are stored in the proposed_* data structures. + * + * The change in the bounding box cost is stored in `bb_delta_c`. + * The change in the timing cost is stored in `timing_delta_c`. + * + * @return The number of affected nets. + */ static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, @@ -1566,7 +1607,7 @@ static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm //once per net, not once per pin. 
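// A stand-alone sketch (illustrative only) of the annealing acceptance test that
// assess_swap() above implements: improving moves are always kept, while worsening
// moves are kept with probability exp(-delta_c / t), which shrinks as the
// temperature drops.
#include <cmath>
#include <cstdio>
#include <random>

static bool accept_move(double delta_c, double t, std::mt19937& rng) {
    if (delta_c <= 0) return true;
    if (t == 0.) return false;
    std::uniform_real_distribution<double> uniform(0.0, 1.0);
    return std::exp(-delta_c / t) > uniform(rng);
}

int main() {
    std::mt19937 rng(42);
    int kept = 0;
    for (int i = 0; i < 1000; ++i)
        kept += accept_move(/*delta_c=*/0.1, /*t=*/0.2, rng);
    std::printf("kept %d of 1000 worsening moves (expected ~ exp(-0.5) ~ 607)\n", kept);
}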
update_net_bb(net_id, blocks_affected, iblk, blk, blk_pin); - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE || place_algorithm == SETUP_SLACK_ANALYSIS_PLACE) { //Determine the change in timing costs if required update_td_delta_costs(delay_model, *criticalities, net_id, blk_pin, blocks_affected, timing_delta_c); } @@ -1586,6 +1627,7 @@ static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm return num_affected_nets; } +///@brief Stores all the nets affected by the block moves (avoid duplicates). static void record_affected_net(const ClusterNetId net, int& num_affected_nets) { //Record effected nets if (proposed_net_cost[net] < 0.) { @@ -1598,6 +1640,7 @@ static void record_affected_net(const ClusterNetId net, int& num_affected_nets) } } +///@brief Update the net bounding box. static void update_net_bb(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected, int iblk, @@ -1629,6 +1672,16 @@ static void update_net_bb(const ClusterNetId net, } } +/** + * @brief Get the proposed timing delay and cost based on the current block moves. + * + * Only considers the sink pins on the moved blocks, and the sink pins of the nets + * driven by the driver pins on the moved blocks. + * Add all these pins into blocks_affected.affected_pins so that we don't have to + * go through the moved blocks and gather them again in other routines. + * + * Also calculates the change in the timing cost by the proposed block moves. + */ static void update_td_delta_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& criticalities, const ClusterNetId net, @@ -1641,7 +1694,7 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, //This pin is a net driver on a moved block. //Re-compute all point to point connections for this net. for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) { - float temp_delay = comp_td_connection_delay(delay_model, net, ipin); + float temp_delay = comp_td_single_connection_delay(delay_model, net, ipin); proposed_connection_delay[net][ipin] = temp_delay; proposed_connection_timing_cost[net][ipin] = criticalities.criticality(net, ipin) * temp_delay; @@ -1663,7 +1716,7 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, if (!driven_by_moved_block(net, blocks_affected)) { int net_pin = cluster_ctx.clb_nlist.pin_net_index(pin); - float temp_delay = comp_td_connection_delay(delay_model, net, net_pin); + float temp_delay = comp_td_single_connection_delay(delay_model, net, net_pin); proposed_connection_delay[net][net_pin] = temp_delay; proposed_connection_timing_cost[net][net_pin] = criticalities.criticality(net, net_pin) * temp_delay; @@ -1674,16 +1727,104 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, } } +/** + * @brief Find all the sink pins with changed connection delays from the affected blocks. + * + * These sink pins will be passed into the pin_timing_invalidator for timing update. + * They will also be added to the pin invalidator when we wish to revert a timing update. + * + * It is possible that some connections may not have changed delay. For instance, if + * using a dx/dy delay model, this could occur if a sink moved to a new position with + * the same dx/dy from it's driver. To minimize work during the incremental STA update + * we do not invalidate such unchanged connections. 
+ */
+static void find_affected_sink_pins(const t_pl_blocks_to_be_moved& blocks_affected,
+                                    std::vector<ClusterPinId>& sink_pins_affected) {
+    auto& cluster_ctx = g_vpr_ctx.clustering();
+    auto& clb_nlist = cluster_ctx.clb_nlist;
+
+    for (ClusterPinId clb_pin : blocks_affected.affected_pins) {
+        ClusterNetId net = clb_nlist.pin_net(clb_pin);
+        int ipin = clb_nlist.pin_net_index(clb_pin);
+
+        if (proposed_connection_delay[net][ipin] != connection_delay[net][ipin]) {
+            //Delay has changed. Must invalidate this sink pin.
+            sink_pins_affected.push_back(clb_pin);
+        }
+    }
+}
+
+/**
+ * @brief Check if the setup slack has gotten better or worse due to the block swap.
+ *
+ * Get all the modified slack values via the PlacerSetupSlacks class, and compare
+ * them with the original values at these connections. Sort them and compare them
+ * one by one, and return the difference of the first different pair.
+ *
+ * If the new slack value is larger (better), then return a negative value so that
+ * the move will be accepted. If the new slack value is smaller (worse), return a
+ * positive value so that the move will be rejected.
+ *
+ * If no slack values have changed, then return an arbitrary positive number. A
+ * move resulting in no change in the slack values is probably unnecessary.
+ *
+ * The sorting is needed to guard against the unlikely circumstance where a bad slack
+ * value suddenly becomes very good due to the block move, while a good slack value
+ * becomes very bad, perhaps even worse than the original worst slack value.
+ */
+static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks) {
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
+    const auto& clb_nlist = cluster_ctx.clb_nlist;
+
+    //Find the original/proposed setup slacks of pins with modified values
+    std::vector<float> original_setup_slacks, proposed_setup_slacks;
+
+    auto clb_pins_modified = setup_slacks->pins_with_modified_setup_slack();
+    for (ClusterPinId clb_pin : clb_pins_modified) {
+        ClusterNetId net_id = clb_nlist.pin_net(clb_pin);
+        size_t ipin = clb_nlist.pin_net_index(clb_pin);
+
+        original_setup_slacks.push_back(connection_setup_slack[net_id][ipin]);
+        proposed_setup_slacks.push_back(setup_slacks->setup_slack(net_id, ipin));
+    }
+
+    //Sort in ascending order, from the worst slack value to the best
+    std::sort(original_setup_slacks.begin(), original_setup_slacks.end());
+    std::sort(proposed_setup_slacks.begin(), proposed_setup_slacks.end());
+
+    //Check the first pair of slack values that are different
+    //If found, return their difference
+    for (size_t idiff = 0; idiff < original_setup_slacks.size(); ++idiff) {
+        float slack_diff = original_setup_slacks[idiff] - proposed_setup_slacks[idiff];
+
+        if (slack_diff != 0) {
+            return slack_diff;
+        }
+    }
+
+    //If all slack values are identical (or there are no modified slack values),
+    //reject this move by returning an arbitrary positive number as cost
+    return 1;
+}
+
+/**
+ * @brief Decide whether to accept a move based on the probability
+ *        calculated from the current annealing temperature.
+ *
+ * Returns: 1 -> move accepted, 0 -> rejected.
+ */
 static e_move_result assess_swap(double delta_c, double t) {
-    /* Returns: 1 -> move accepted, 0 -> rejected. */
+    /* A non-positive cost will always be accepted */
     if (delta_c <= 0) {
         return ACCEPTED;
     }

+    /* If temperature is 0 and the cost is positive, guaranteed rejection */
     if (t == 0.)
{ return REJECTED; } + /* Calculated the probability using temp and decide */ float fnum = vtr::frand(); float prob_fac = std::exp(-delta_c / t); if (prob_fac > fnum) { @@ -1693,131 +1834,52 @@ static e_move_result assess_swap(double delta_c, double t) { return REJECTED; } +/** + * @brief Recomputes the wiring cost to eliminate round-off that may have accrued. + * + * This process assumes that all the net costs have been updated. + */ static double recompute_bb_cost() { - /* Recomputes the cost to eliminate roundoff that may have accrued. * - * This routine does as little work as possible to compute this new * - * cost. */ - - double cost = 0; - auto& cluster_ctx = g_vpr_ctx.clustering(); + double cost = 0; for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* for each net ... */ if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { /* Do only if not ignored. */ /* Bounding boxes don't have to be recomputed; they're correct. */ cost += net_cost[net_id]; } } - - return (cost); -} - -/*returns the delay of one point to point connection */ -static float comp_td_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin) { - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& place_ctx = g_vpr_ctx.placement(); - - float delay_source_to_sink = 0.; - - if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { - //Only estimate delay for signals routed through the inter-block - //routing network. TODO: Do how should we compute the delay for globals. "Global signals are assumed to have zero delay." - - ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id); - ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin); - - ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin); - ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin); - - int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin); - int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin); - - int source_x = place_ctx.block_locs[source_block].loc.x; - int source_y = place_ctx.block_locs[source_block].loc.y; - int sink_x = place_ctx.block_locs[sink_block].loc.x; - int sink_y = place_ctx.block_locs[sink_block].loc.y; - - /* Note: This heuristic only considers delta_x and delta_y, a much better heuristic - * would be to to create a more comprehensive lookup table. - * - * In particular this aproach does not accurately capture the effect of fast - * carry-chain connections. 
- */ - delay_source_to_sink = delay_model->delay(source_x, - source_y, - source_block_ipin, - sink_x, - sink_y, - sink_block_ipin); - if (delay_source_to_sink < 0) { - VPR_ERROR(VPR_ERROR_PLACE, - "in comp_td_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d) to %s (at %d,%d)\n" - "in comp_td_connection_delay: Delay is less than 0\n", - block_type_pin_index_to_name(physical_tile_type(source_block), source_block_ipin).c_str(), - source_x, source_y, - block_type_pin_index_to_name(physical_tile_type(sink_block), sink_block_ipin).c_str(), - sink_x, sink_y, - delay_source_to_sink); - } - } - - return (delay_source_to_sink); -} - -//Recompute all point to point delays, updating connection_delay -static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { - auto& cluster_ctx = g_vpr_ctx.clustering(); - - for (auto net_id : cluster_ctx.clb_nlist.nets()) { - for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) { - connection_delay[net_id][ipin] = comp_td_connection_delay(delay_model, net_id, ipin); - } - } + return cost; } -/* Update the connection_timing_cost values from the temporary * - * values for all connections that have changed. */ +/** + * @brief Update the connection_timing_cost values from the temporary + * values for all connections that have/haven't changed. + * + * All the connections have already been gathered by blocks_affected.affected_pins + * after running the routine find_affected_nets_and_update_costs(). + */ static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& clb_nlist = cluster_ctx.clb_nlist; - /* Go through all the blocks moved. */ - for (int iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) { - ClusterBlockId bnum = blocks_affected.moved_blocks[iblk].block_num; - for (ClusterPinId pin_id : cluster_ctx.clb_nlist.block_pins(bnum)) { - ClusterNetId net_id = cluster_ctx.clb_nlist.pin_net(pin_id); - - if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) - continue; - - if (cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::DRIVER) { - //This net is being driven by a moved block, recompute - //all point to point connections on this net. - for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) { - connection_delay[net_id][ipin] = proposed_connection_delay[net_id][ipin]; - proposed_connection_delay[net_id][ipin] = INVALID_DELAY; - connection_timing_cost[net_id][ipin] = proposed_connection_timing_cost[net_id][ipin]; - proposed_connection_timing_cost[net_id][ipin] = INVALID_DELAY; - } - } else { - //This pin is a net sink on a moved block - VTR_ASSERT_SAFE(cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::SINK); - - /* The following "if" prevents the value from being updated twice. 
*/ - if (!driven_by_moved_block(net_id, blocks_affected)) { - int net_pin = cluster_ctx.clb_nlist.pin_net_index(pin_id); + //Go through all the sink pins affected + for (ClusterPinId pin_id : blocks_affected.affected_pins) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + int ipin = clb_nlist.pin_net_index(pin_id); - connection_delay[net_id][net_pin] = proposed_connection_delay[net_id][net_pin]; - proposed_connection_delay[net_id][net_pin] = INVALID_DELAY; - connection_timing_cost[net_id][net_pin] = proposed_connection_timing_cost[net_id][net_pin]; - proposed_connection_timing_cost[net_id][net_pin] = INVALID_DELAY; - } - } - } /* Finished going through all the pins in the moved block */ - } /* Finished going through all the blocks moved */ + //Commit the timing delay and cost values + connection_delay[net_id][ipin] = proposed_connection_delay[net_id][ipin]; + proposed_connection_delay[net_id][ipin] = INVALID_DELAY; + connection_timing_cost[net_id][ipin] = proposed_connection_timing_cost[net_id][ipin]; + proposed_connection_timing_cost[net_id][ipin] = INVALID_DELAY; + } } -//Reverts modifications to proposed_connection_delay and proposed_connection_timing_cost based on -//the move proposed in blocks_affected +/** + * @brief Reverts modifications to proposed_connection_delay and proposed_connection_timing_cost + * based on the move proposed in blocks_affected. + */ static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { #ifndef VTR_ASSERT_SAFE_ENABLED static_cast(blocks_affected); @@ -1836,39 +1898,28 @@ static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { #endif } -//Invalidates the delays of connections effected by the specified move -// -//Relies on proposed_connection_delay and connection_delay to detect -//which connections have actually had their delay changed. -static void invalidate_affected_connection_delays(const t_pl_blocks_to_be_moved& blocks_affected, +/** + * @brief Invalidates the delays of connections effected by the specified move. + * + * Relies on find_affected_sink_pins() to find all the connections with different + * `proposed_connection_delay` and `connection_delay`. + * + * Invalidate all the timing graph edges associated with these sink pins via the + * ClusteredPinTimingInvalidator class. + */ +static void invalidate_affected_connection_delays(const std::vector& sink_pins_affected, ClusteredPinTimingInvalidator* pin_tedges_invalidator, TimingInfo* timing_info) { VTR_ASSERT_SAFE(timing_info); VTR_ASSERT_SAFE(pin_tedges_invalidator); - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& clb_nlist = cluster_ctx.clb_nlist; - - //Inalidate timing graph edges affected by the move - for (ClusterPinId pin : blocks_affected.affected_pins) { - //It is possible that some connections may not have changed delay.(e.g. - //For instance, if using a dx/dy delay model, this could occur if a sink - //moved to a new position with the same dx/dy from it's driver. - // - //To minimze work during the incremental STA update we do not invalidate - //such unchanged connections. 
- - ClusterNetId net = clb_nlist.pin_net(pin); - int ipin = clb_nlist.pin_net_index(pin); - - if (proposed_connection_delay[net][ipin] != connection_delay[net][ipin]) { - //Delay changed, must invalidate - pin_tedges_invalidator->invalidate_connection(pin, timing_info); - } + //Invalidate timing graph edges affected by the move + for (ClusterPinId clb_pin : sink_pins_affected) { + pin_tedges_invalidator->invalidate_connection(clb_pin, timing_info); } } -//Returns true if 'net' is driven by one of the blocks in 'blocks_affected' +///@brief Returns true if 'net' is driven by one of the blocks in 'blocks_affected'. static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected) { auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -1881,163 +1932,23 @@ static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_b return false; } -//Incrementally updates timing cost based on the current delays and criticality estimates -// Unlike comp_td_costs() this only updates connections who's criticality has changed; -// this is a superset of those connections who's delay has changed. -// -// For a from-scratch recalculation see comp_td_cost() -static void update_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { - /* NB: We must be careful calculating the total timing cost incrementally, - * due to limitd floating point precision, so that we get a - * bit-identical result matching that calculated by comp_td_costs(). - * - * In particular, we can not simply calculate the incremental - * delta's caused by changed connection timing costs and adjust - * the timing cost. Due to limited precision, the results of - * floating point math operations are order dependant and we - * would get a different result. - * - * To get around this, we calculate the timing costs hierarchically - * to ensures we calculate the sum with the same order of operations - * as comp_td_costs(). - * - * See PlacerTimingCosts object used to represent connection_timing_costs - * for details. 
- */ - vtr::Timer t; - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& clb_nlist = cluster_ctx.clb_nlist; - - //Update the modified pin timing costs - { - vtr::Timer timer; - auto clb_pins_modified = place_crit.pins_with_modified_criticality(); - for (ClusterPinId clb_pin : clb_pins_modified) { - if (clb_nlist.pin_type(clb_pin) == PinType::DRIVER) continue; - - ClusterNetId clb_net = clb_nlist.pin_net(clb_pin); - VTR_ASSERT_SAFE(clb_net); - - if (cluster_ctx.clb_nlist.net_is_ignored(clb_net)) continue; - - int ipin = clb_nlist.pin_net_index(clb_pin); - VTR_ASSERT_SAFE(ipin >= 0 && ipin < int(clb_nlist.net_pins(clb_net).size())); - - double new_timing_cost = comp_td_connection_cost(delay_model, place_crit, clb_net, ipin); - - //Record new value - connection_timing_cost[clb_net][ipin] = new_timing_cost; - } - - f_update_td_costs_connections_elapsed_sec += timer.elapsed_sec(); - } - - //Re-total timing costs of all nets - { - vtr::Timer timer; - *timing_cost = connection_timing_cost.total_cost(); - f_update_td_costs_sum_nets_elapsed_sec += timer.elapsed_sec(); - } - -#ifdef VTR_ASSERT_DEBUG_ENABLED - double check_timing_cost = 0.; - comp_td_costs(delay_model, place_crit, &check_timing_cost); - VTR_ASSERT_DEBUG_MSG(check_timing_cost == *timing_cost, - "Total timing cost calculated incrementally in update_td_costs() is " - "not consistent with value calculated from scratch in comp_td_costs()"); -#endif - f_update_td_costs_total_elapsed_sec += t.elapsed_sec(); -} - -//Recomputes timing cost from scratch based on the current delays and criticality estimates -// -// For a more efficient incremental update see update_td_costs() -static void comp_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { - /* Computes the cost (from scratch) from the delays and criticalities * - * of all point to point connections, we define the timing cost of * - * each connection as criticality*delay. */ - - /* NB: We calculate the timing cost in a hierarchicl manner (first connectsion, - * then nets, then sum of nets) in order to allow it to be incrementally - * while avoiding round-off effects. See update_td_costs() for details. - */ - - auto& cluster_ctx = g_vpr_ctx.clustering(); - - for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* For each net ... */ - - if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; - - for (unsigned ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) { - float conn_timing_cost = comp_td_connection_cost(delay_model, place_crit, net_id, ipin); - - //Record new value - connection_timing_cost[net_id][ipin] = conn_timing_cost; - } - - //Store net timing cost for more efficient incremental updating - net_timing_cost[net_id] = sum_td_net_cost(net_id); - } - - /* Make sure timing cost does not go above MIN_TIMING_COST. */ - *timing_cost = sum_td_costs(); -} - -//Calculates the timing cost of the specified connection. 
-// Updates the value in connection_timing_cost -// Assumes only be called from compt_td_cost() or update_td_costs() -static double comp_td_connection_cost(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, ClusterNetId net, int ipin) { - VTR_ASSERT_SAFE_MSG(ipin > 0, "Shouldn't be calculating connection timing cost for driver pins"); - - VTR_ASSERT_SAFE_MSG(connection_delay[net][ipin] == comp_td_connection_delay(delay_model, net, ipin), - "Connection delays should already be updated"); - - double conn_timing_cost = place_crit.criticality(net, ipin) * connection_delay[net][ipin]; - - VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_delay[net][ipin]), - "Propsoed connection delay should already be invalidated"); - - VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_timing_cost[net][ipin]), - "Proposed connection timing cost should already be invalidated"); - - return conn_timing_cost; -} - -//Returns the timing cost of the specified 'net' based on the values in connection_timing_cost -static double sum_td_net_cost(ClusterNetId net) { - auto& cluster_ctx = g_vpr_ctx.clustering(); - - double net_td_cost = 0; - for (unsigned ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) { - net_td_cost += connection_timing_cost[net][ipin]; - } - - return net_td_cost; -} - -//Returns the total timing cost accross all nets based on the values in net_timing_cost -static double sum_td_costs() { - auto& cluster_ctx = g_vpr_ctx.clustering(); - - double td_cost = 0; - for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* For each net ... */ - - if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; - - td_cost += net_timing_cost[net_id]; - } - - return td_cost; -} - -/* Finds the cost from scratch. Done only when the placement * - * has been radically changed (i.e. after initial placement). * - * Otherwise find the cost change incrementally. If method * - * check is NORMAL, we find bounding boxes that are updateable * - * for the larger nets. If method is CHECK, all bounding boxes * - * are found via the non_updateable_bb routine, to provide a * - * cost which can be used to check the correctness of the * - * other routine. */ +/** + * @brief Find the wiring cost. + * + * Find the wiring cost from scratch only when the placement has + * been radically changed (i.e. after the initial placement). + * Otherwise, find the cost change incrementally. + * + * @param method + * + * NORMAL If the method check is NORMAL, we find boudning + * boxes that are updateable for the larger nets. + * + * CHECK If the method check is CHECK, all bounding boxes + * are found via the non_updateable_bb routine to + * provide a cost which can be used to check the + * correctness of the other routine. 
+ */ static double comp_bb_cost(e_cost_methods method) { double cost = 0; double expected_wirelength = 0.0; @@ -2096,6 +2007,8 @@ static void alloc_and_load_placement_structs(float place_cost_exp, connection_delay = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.f); proposed_connection_delay = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.f); + connection_setup_slack = make_net_pins_matrix(cluster_ctx.clb_nlist, std::numeric_limits::infinity()); + connection_timing_cost = PlacerTimingCosts(cluster_ctx.clb_nlist); proposed_connection_timing_cost = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.); net_timing_cost.resize(num_nets, 0.); @@ -2137,6 +2050,7 @@ static void free_placement_structs(const t_placer_opts& placer_opts) { if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { vtr::release_memory(connection_timing_cost); vtr::release_memory(connection_delay); + vtr::release_memory(connection_setup_slack); vtr::release_memory(proposed_connection_timing_cost); vtr::release_memory(proposed_connection_delay); @@ -2981,24 +2895,21 @@ static void print_resources_utilization() { VTR_LOG("\n"); } -static void init_annealing_state(t_annealing_state* state, - const t_annealing_sched& annealing_sched, - float t, - float rlim, - int move_lim_max, - float crit_exponent) { - state->alpha = annealing_sched.alpha_min; - state->t = t; - state->restart_t = t; - state->rlim = rlim; - state->inverse_delta_rlim = 1 / (rlim - FINAL_RLIM); - state->move_lim_max = std::max(1, move_lim_max); - if (annealing_sched.type == DUSTY_SCHED) { - state->move_lim = std::max(1, (int)(state->move_lim_max * annealing_sched.success_target)); +static e_place_algorithm get_placement_quench_algorithm(const t_placer_opts& placer_opts) { + e_place_algorithm place_algo = placer_opts.place_algorithm; + e_place_quench_metric quench_metric = placer_opts.place_quench_metric; + + if (place_algo == e_place_algorithm::PATH_TIMING_DRIVEN_PLACE) { + if (quench_metric == e_place_quench_metric::AUTO || quench_metric == e_place_quench_metric::TIMING_COST) { + return PATH_TIMING_DRIVEN_PLACE; + } else { + VTR_ASSERT(quench_metric == e_place_quench_metric::SETUP_SLACK); + return SETUP_SLACK_ANALYSIS_PLACE; + } } else { - state->move_lim = state->move_lim_max; + VTR_ASSERT(place_algo == e_place_algorithm::BOUNDING_BOX_PLACE); + return BOUNDING_BOX_PLACE; } - state->crit_exponent = crit_exponent; } bool placer_needs_lookahead(const t_vpr_setup& vpr_setup) { diff --git a/vpr/src/place/place_delay_model.cpp b/vpr/src/place/place_delay_model.cpp index c30f32b3e7d..e8a58db6704 100644 --- a/vpr/src/place/place_delay_model.cpp +++ b/vpr/src/place/place_delay_model.cpp @@ -10,6 +10,8 @@ #include "vtr_math.h" #include "vpr_error.h" +#include "place_global.h" + #ifdef VTR_ENABLE_CAPNPROTO # include "capnp/serialize.h" # include "place_delay_model.capnp.h" @@ -18,10 +20,7 @@ # include "serdes_utils.h" #endif /* VTR_ENABLE_CAPNPROTO */ -/* - * DeltaDelayModel - */ - +///@brief DeltaDelayModel methods. float DeltaDelayModel::delay(int from_x, int from_y, int /*from_pin*/, int to_x, int to_y, int /*to_pin*/) const { int delta_x = std::abs(from_x - to_x); int delta_y = std::abs(from_y - to_y); @@ -46,9 +45,11 @@ void DeltaDelayModel::dump_echo(std::string filepath) const { vtr::fclose(f); } -/* - * OverrideDelayModel - */ +const DeltaDelayModel* OverrideDelayModel::base_delay_model() const { + return base_delay_model_.get(); +} + +///@brief OverrideDelayModel methods. 
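// A stand-alone sketch (illustrative only, with stand-in enums) of the quench
// algorithm selection performed by get_placement_quench_algorithm() above: the new
// --place_quench_metric option only takes effect when the anneal itself is timing
// driven; a bounding-box anneal keeps using the wiring cost during the quench.
#include <cassert>
#include <cstdio>

enum class PlaceAlgo { BOUNDING_BOX, PATH_TIMING_DRIVEN, SETUP_SLACK_ANALYSIS };
enum class QuenchMetric { AUTO, TIMING_COST, SETUP_SLACK };

static PlaceAlgo quench_algorithm(PlaceAlgo anneal_algo, QuenchMetric metric) {
    if (anneal_algo == PlaceAlgo::PATH_TIMING_DRIVEN) {
        if (metric == QuenchMetric::SETUP_SLACK) return PlaceAlgo::SETUP_SLACK_ANALYSIS;
        return PlaceAlgo::PATH_TIMING_DRIVEN; // AUTO or TIMING_COST
    }
    assert(anneal_algo == PlaceAlgo::BOUNDING_BOX);
    return PlaceAlgo::BOUNDING_BOX;
}

int main() {
    PlaceAlgo algo = quench_algorithm(PlaceAlgo::PATH_TIMING_DRIVEN, QuenchMetric::SETUP_SLACK);
    std::printf("quench uses setup slack analysis: %s\n",
                algo == PlaceAlgo::SETUP_SLACK_ANALYSIS ? "yes" : "no");
}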
float OverrideDelayModel::delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin) const { //First check to if there is an override delay value auto& device_ctx = g_vpr_ctx.device(); @@ -136,18 +137,14 @@ float OverrideDelayModel::get_delay_override(int from_type, int from_class, int return iter->second; } -const DeltaDelayModel* OverrideDelayModel::base_delay_model() const { - return base_delay_model_.get(); -} - void OverrideDelayModel::set_base_delay_model(std::unique_ptr base_delay_model_obj) { base_delay_model_ = std::move(base_delay_model_obj); } -// When writing capnp targetted serialization, always allow compilation when -// VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception -// instead. -// +/** + * When writing capnp targetted serialization, always allow compilation when + * VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception instead. + */ #ifndef VTR_ENABLE_CAPNPROTO # define DISABLE_ERROR \ @@ -300,3 +297,81 @@ void OverrideDelayModel::write(const std::string& file) const { } #endif + +///@brief Initialize the placer delay model. +std::unique_ptr alloc_lookups_and_delay_model(t_chan_width_dist chan_width_dist, + const t_placer_opts& placer_opts, + const t_router_opts& router_opts, + t_det_routing_arch* det_routing_arch, + std::vector& segment_inf, + const t_direct_inf* directs, + const int num_directs) { + return compute_place_delay_model(placer_opts, router_opts, det_routing_arch, segment_inf, + chan_width_dist, directs, num_directs); +} + +/** + * @brief Returns the delay of one point to point connection. + * + * Only estimate delay for signals routed through the inter-block routing network. + * TODO: Do how should we compute the delay for globals. "Global signals are assumed to have zero delay." + */ +float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& place_ctx = g_vpr_ctx.placement(); + + float delay_source_to_sink = 0.; + + if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { + ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id); + ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin); + + ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin); + ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin); + + int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin); + int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin); + + int source_x = place_ctx.block_locs[source_block].loc.x; + int source_y = place_ctx.block_locs[source_block].loc.y; + int sink_x = place_ctx.block_locs[sink_block].loc.x; + int sink_y = place_ctx.block_locs[sink_block].loc.y; + + /** + * This heuristic only considers delta_x and delta_y, a much better + * heuristic would be to to create a more comprehensive lookup table. + * + * In particular this approach does not accurately capture the effect + * of fast carry-chain connections. 
+ */ + delay_source_to_sink = delay_model->delay(source_x, + source_y, + source_block_ipin, + sink_x, + sink_y, + sink_block_ipin); + if (delay_source_to_sink < 0) { + VPR_ERROR(VPR_ERROR_PLACE, + "in comp_td_single_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d) to %s (at %d,%d)\n" + "in comp_td_single_connection_delay: Delay is less than 0\n", + block_type_pin_index_to_name(physical_tile_type(source_block), source_block_ipin).c_str(), + source_x, source_y, + block_type_pin_index_to_name(physical_tile_type(sink_block), sink_block_ipin).c_str(), + sink_x, sink_y, + delay_source_to_sink); + } + } + + return (delay_source_to_sink); +} + +///@brief Recompute all point to point delays, updating `connection_delay` matrix. +void comp_td_connection_delays(const PlaceDelayModel* delay_model) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + for (auto net_id : cluster_ctx.clb_nlist.nets()) { + for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) { + connection_delay[net_id][ipin] = comp_td_single_connection_delay(delay_model, net_id, ipin); + } + } +} diff --git a/vpr/src/place/place_delay_model.h b/vpr/src/place/place_delay_model.h index db22db238ec..55b0558cb49 100644 --- a/vpr/src/place/place_delay_model.h +++ b/vpr/src/place/place_delay_model.h @@ -1,3 +1,9 @@ +/** + * @file + * @brief This file contains all the class and function declarations related to + * the placer delay model. For implementations, see place_delay_model.cpp. + */ + #ifndef PLACE_DELAY_MODEL_H #define PLACE_DELAY_MODEL_H @@ -20,12 +26,30 @@ # define ALWAYS_INLINE inline #endif -//Abstract interface to a placement delay model +///@brief Forward declarations. +class PlaceDelayModel; + +///@brief Initialize the placer delay model. +std::unique_ptr alloc_lookups_and_delay_model(t_chan_width_dist chan_width_dist, + const t_placer_opts& place_opts, + const t_router_opts& router_opts, + t_det_routing_arch* det_routing_arch, + std::vector& segment_inf, + const t_direct_inf* directs, + const int num_directs); + +///@brief Returns the delay of one point to point connection. +float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin); + +///@brief Recompute all point to point delays, updating `connection_delay` matrix. +void comp_td_connection_delays(const PlaceDelayModel* delay_model); + +///@brief Abstract interface to a placement delay model. class PlaceDelayModel { public: virtual ~PlaceDelayModel() = default; - // Computes place delay model. + ///@brief Computes place delay model. virtual void compute( RouterDelayProfiler& route_profiler, const t_placer_opts& placer_opts, @@ -33,25 +57,32 @@ class PlaceDelayModel { int longest_length) = 0; - //Returns the delay estimate between the specified block pins - // - // Either compute or read methods must be invoked before invoking - // delay. + /** + * @brief Returns the delay estimate between the specified block pins. + * + * Either compute or read methods must be invoked before invoking delay. + */ virtual float delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin) const = 0; - //Dumps the delay model to an echo file + ///@brief Dumps the delay model to an echo file. virtual void dump_echo(std::string filename) const = 0; - // Write place delay model to specified file. - // May be unimplemented, in which case method should throw an exception. + /** + * @brief Write place delay model to specified file. 
+ * + * May be unimplemented, in which case method should throw an exception. + */ virtual void write(const std::string& file) const = 0; - // Read place delay model from specified file. - // May be unimplemented, in which case method should throw an exception. + /** + * @brief Read place delay model from specified file. + * + * May be unimplemented, in which case method should throw an exception. + */ virtual void read(const std::string& file) = 0; }; -//A simple delay model based on the distance (delta) between block locations +///@brief A simple delay model based on the distance (delta) between block locations. class DeltaDelayModel : public PlaceDelayModel { public: DeltaDelayModel() {} @@ -109,10 +140,13 @@ class OverrideDelayModel : public PlaceDelayModel { short delta_x; short delta_y; - //A combination of ALWAYS_INLINE attribute and std::lexicographical_compare - //is required for operator< to be inlined by compiler. - //Proper inlining of the function reduces place time by around 5%. - //For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225 + /** + * A combination of ALWAYS_INLINE attribute and std::lexicographical_compare + * is required for operator< to be inlined by compiler. Proper inlining of the + * function reduces place time by around 5%. + * + * For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225 + */ friend ALWAYS_INLINE bool operator<(const t_override& lhs, const t_override& rhs) { const short* left = reinterpret_cast(&lhs); const short* right = reinterpret_cast(&rhs); @@ -123,8 +157,11 @@ class OverrideDelayModel : public PlaceDelayModel { vtr::flat_map2 delay_overrides_; - //operator< treats memory layout of t_override as an array of short - //this requires all members of t_override are shorts and there is no padding between members of t_override + /** + * operator< treats memory layout of t_override as an array of short. + * This requires all members of t_override are shorts and there is no + * padding between members of t_override. + */ static_assert(sizeof(t_override) == sizeof(t_override::from_type) + sizeof(t_override::to_type) + sizeof(t_override::from_class) + sizeof(t_override::to_class) + sizeof(t_override::delta_x) + sizeof(t_override::delta_y), "Expect t_override to have a memory layout equivalent to an array of short (no padding)"); static_assert(sizeof(t_override::from_type) == sizeof(short), "Expect all t_override data members to be shorts"); static_assert(sizeof(t_override::to_type) == sizeof(short), "Expect all t_override data members to be shorts"); diff --git a/vpr/src/place/place_global.h b/vpr/src/place/place_global.h new file mode 100644 index 00000000000..fd1cc2d9a6b --- /dev/null +++ b/vpr/src/place/place_global.h @@ -0,0 +1,39 @@ +/** + * @file + * @brief This file contains all the global data structures referenced across + * multiple files in ./vpr/src/place. + * + * These global data structures were originally local to place.cpp, and they + * were referenced by a lot of routines local to place.cpp. However, to shorten + * the file size of place.cpp, these routines are moved to other files. + * + * Instead of elongating the argument list of the moved routines, I moved the + * data structures to here so that they can be easily shared across different + * files. + * + * For detailed descriptions on what each data structure stores, please see + * place.cpp, where these variables are defined. 
+ * + * TODO: Create a single extern variable that allows access to all these data + * structures so that these structures don't have to be declared as extern. + */ + +#pragma once +#include +#include "vtr_vector.h" +#include "vpr_net_pins_matrix.h" +#include "timing_place.h" + +extern vtr::vector net_cost, proposed_net_cost; +extern vtr::vector bb_updated_before; +extern ClbNetPinsMatrix connection_delay; +extern ClbNetPinsMatrix proposed_connection_delay; +extern ClbNetPinsMatrix connection_setup_slack; +extern PlacerTimingCosts connection_timing_cost; +extern ClbNetPinsMatrix proposed_connection_timing_cost; +extern vtr::vector net_timing_cost; +extern vtr::vector bb_coords, bb_num_on_edges; +extern vtr::vector ts_bb_coord_new, ts_bb_edge_new; +extern float** chanx_place_cost_fac; +extern float** chany_place_cost_fac; +extern std::vector ts_nets_to_update; diff --git a/vpr/src/place/place_timing_update.cpp b/vpr/src/place/place_timing_update.cpp new file mode 100644 index 00000000000..fa74f97dfb5 --- /dev/null +++ b/vpr/src/place/place_timing_update.cpp @@ -0,0 +1,362 @@ +/** + * @file place_timing_update.cpp + * @brief Defines the routines declared in place_timing_update.h. + */ + +#include "vtr_time.h" + +#include "place_timing_update.h" +#include "place_global.h" + +///@brief Use an incremental approach to updating timing costs after re-computing criticalities +static constexpr bool INCR_COMP_TD_COSTS = true; + +///@brief File-scope variable that can be accessed via the routine get_udpate_td_costs_runtime_stats(). +static t_update_td_costs_stats update_td_costs_stats; + +///@brief Routines local to place_timing_update.cpp +static double comp_td_connection_cost(const PlaceDelayModel* delay_model, + const PlacerCriticalities& place_crit, + ClusterNetId net, + int ipin); +static double sum_td_net_cost(ClusterNetId net); +static double sum_td_costs(); + +/** + * @brief Initialize the timing information and structures in the placer. + * + * Perform first time update on the timing graph, and initialize the values within + * PlacerCriticalities, PlacerSetupSlacks, and connection_timing_cost. + */ +void initialize_timing_info(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& clb_nlist = cluster_ctx.clb_nlist; + + //Initialize the timing update mode. 
Update both + //setup slacks and criticalities from scratch + timing_update_mode->update_criticalities = true; + timing_update_mode->update_setup_slacks = true; + timing_update_mode->recompute_criticalities = true; + timing_update_mode->recompute_setup_slacks = true; + + //As a safety measure, for the first time update, + //invalidate all timing edges via the pin invalidator + //by passing in all the clb sink pins + for (ClusterNetId net_id : clb_nlist.nets()) { + for (ClusterPinId pin_id : clb_nlist.net_sinks(net_id)) { + pin_timing_invalidator->invalidate_connection(pin_id, timing_info); + } + } + + //Perform timing info update + update_setup_slacks_and_criticalities(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + timing_update_mode, + costs); + + //Compute timing cost from scratch + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + + //Initialize the data structure that stores committed placer setup slacks + commit_setup_slacks(setup_slacks); + + //Don't warn again about unconstrained nodes again during placement + timing_info->set_warn_unconstrained(false); +} + +/** + * @brief Update timing info based on the current block positions. + * + * Update the values stored in PlacerCriticalities and PlacerSetupSlacks. + * This routine tries its best to be incremental when it comes to updating + * these values, and branching variables are stored in `timing_update_mode`. + * For a detailed description of how these variables work, please refer to + * the declaration documentation on t_placer_timing_update_mode. + * + * If criticalities are updated, the timing costs are updated as well. + * Calling this routine to update timing_cost will produce round-off error + * in the long run, so this value will be recomputed once in a while, via + * other timing driven routines. + * + * All the pins with changed connection delays have already been added into + * the ClusteredPinTimingInvalidator to allow incremental STA update. These + * changed connection delays are a direct result of moved blocks in try_swap(). + * + * @param crit_exponent Used to calculate `sharpened` criticalities. + * + * @param delay_model Used to calculate the delay between two locations. + * + * @param criticalities Mapping interface between atom pin criticalities + * and clb pin criticalities. + * + * @param setup_slacks Mapping interface between atom pin raw setup slacks + * and clb pin raw setup slacks. + * + * @param pin_timing_invalidator Stores all the pins that have their delay value changed + * and needs to be updated in the timing graph. + * + * @param timing_info Stores the timing graph and other important timing info. + * + * @param timing_update_mode Determines what should be updated when this routine is + * called, and using incremental techniques is appropriate. + * + * @param costs Stores the updated timing cost for the whole placement. 
+ */ +void update_setup_slacks_and_criticalities(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_timing_update_mode* timing_update_mode, + t_placer_costs* costs) { + //Run STA to update slacks and adjusted/relaxed criticalities + timing_info->update(); + + if (timing_update_mode->update_setup_slacks) { + //Update placer's setup slacks + setup_slacks->update_setup_slacks(timing_info, timing_update_mode->recompute_setup_slacks); + } + + if (timing_update_mode->update_criticalities) { + //Update placer's criticalities (e.g. sharpen with crit_exponent) + criticalities->update_criticalities(timing_info, crit_exponent, timing_update_mode->recompute_criticalities); + + //Update connection, net and total timing costs based on new criticalities + if (INCR_COMP_TD_COSTS) { + update_td_costs(delay_model, *criticalities, &costs->timing_cost); + } else { + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + } + } + + //Setup slacks and criticalities need to be in sync with the timing_info. + //if they are to be incrementally updated on the next iteration. + //Otherwise, a re-computation for all clb sink pins is required. + timing_update_mode->recompute_setup_slacks = !timing_update_mode->update_setup_slacks; + timing_update_mode->recompute_criticalities = !timing_update_mode->update_criticalities; + + //Clear invalidation state + pin_timing_invalidator->reset(); +} + +/** + * @brief Incrementally updates timing cost based on the current delays and criticality estimates. + * + * Unlike comp_td_costs(), this only updates connections who's criticality has changed. + * This is a superset of those connections whose connection delay has changed. For a + * from-scratch recalculation, refer to comp_td_cost(). + * + * We must be careful calculating the total timing cost incrementally, due to limited + * floating point precision, so that we get a bit-identical result matching the one + * calculated by comp_td_costs(). + * + * In particular, we can not simply calculate the incremental delta's caused by changed + * connection timing costs and adjust the timing cost. Due to limited precision, the results + * of floating point math operations are order dependant and we would get a different result. + * + * To get around this, we calculate the timing costs hierarchically, to ensure that we + * calculate the sum with the same order of operations as comp_td_costs(). + * + * See PlacerTimingCosts object used to represent connection_timing_costs for details. 
+ */ +void update_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { + vtr::Timer t; + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& clb_nlist = cluster_ctx.clb_nlist; + + //Update the modified pin timing costs + { + vtr::Timer timer; + auto clb_pins_modified = place_crit.pins_with_modified_criticality(); + for (ClusterPinId clb_pin : clb_pins_modified) { + if (clb_nlist.pin_type(clb_pin) == PinType::DRIVER) continue; + + ClusterNetId clb_net = clb_nlist.pin_net(clb_pin); + VTR_ASSERT_SAFE(clb_net); + + if (cluster_ctx.clb_nlist.net_is_ignored(clb_net)) continue; + + int ipin = clb_nlist.pin_net_index(clb_pin); + VTR_ASSERT_SAFE(ipin >= 1 && ipin < int(clb_nlist.net_pins(clb_net).size())); + + double new_timing_cost = comp_td_connection_cost(delay_model, place_crit, clb_net, ipin); + + //Record new value + connection_timing_cost[clb_net][ipin] = new_timing_cost; + } + + update_td_costs_stats.connections_elapsed_sec += timer.elapsed_sec(); + } + + //Re-total timing costs of all nets + { + vtr::Timer timer; + *timing_cost = connection_timing_cost.total_cost(); + update_td_costs_stats.sum_nets_elapsed_sec += timer.elapsed_sec(); + } + +#ifdef VTR_ASSERT_DEBUG_ENABLED + double check_timing_cost = 0.; + comp_td_costs(delay_model, place_crit, &check_timing_cost); + VTR_ASSERT_DEBUG_MSG(check_timing_cost == *timing_cost, + "Total timing cost calculated incrementally in update_td_costs() is " + "not consistent with value calculated from scratch in comp_td_costs()"); +#endif + update_td_costs_stats.total_elapsed_sec += t.elapsed_sec(); +} + +/** + * @brief Recomputes timing cost from scratch based on the current delays and criticality estimates. + * + * Computes the cost (from scratch) from the delays and criticalities of all point to point + * connections, we define the timing cost of each connection as criticality * delay. + * + * We calculate the timing cost in a hierarchical manner (first connection, then nets, then + * sum of nets) in order to allow it to be incremental while avoiding round-off effects. + * + * For a more efficient incremental update, see update_td_costs(). + */ +void comp_td_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& place_crit, double* timing_cost) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + + for (auto net_id : cluster_ctx.clb_nlist.nets()) { + if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; + + for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) { + float conn_timing_cost = comp_td_connection_cost(delay_model, place_crit, net_id, ipin); + + /* Record new value */ + connection_timing_cost[net_id][ipin] = conn_timing_cost; + } + /* Store net timing cost for more efficient incremental updating */ + net_timing_cost[net_id] = sum_td_net_cost(net_id); + } + /* Make sure timing cost does not go above MIN_TIMING_COST. */ + *timing_cost = sum_td_costs(); +} + +/** + * @brief Calculates the timing cost of the specified connection. + * + * This routine assumes that it is only called either compt_td_cost() or + * update_td_costs(). Otherwise, various assertions below would fail. 
+ */ +static double comp_td_connection_cost(const PlaceDelayModel* delay_model, + const PlacerCriticalities& place_crit, + ClusterNetId net, + int ipin) { + VTR_ASSERT_SAFE_MSG(ipin > 0, "Shouldn't be calculating connection timing cost for driver pins"); + + VTR_ASSERT_SAFE_MSG(connection_delay[net][ipin] == comp_td_single_connection_delay(delay_model, net, ipin), + "Connection delays should already be updated"); + + double conn_timing_cost = place_crit.criticality(net, ipin) * connection_delay[net][ipin]; + + VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_delay[net][ipin]), + "Propsoed connection delay should already be invalidated"); + + VTR_ASSERT_SAFE_MSG(std::isnan(proposed_connection_timing_cost[net][ipin]), + "Proposed connection timing cost should already be invalidated"); + + return conn_timing_cost; +} + +///@brief Returns the timing cost of the specified 'net' based on the values in connection_timing_cost. +static double sum_td_net_cost(ClusterNetId net) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + + double net_td_cost = 0; + for (unsigned ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) { + net_td_cost += connection_timing_cost[net][ipin]; + } + + return net_td_cost; +} + +///@brief Returns the total timing cost accross all nets based on the values in net_timing_cost. +static double sum_td_costs() { + auto& cluster_ctx = g_vpr_ctx.clustering(); + + double td_cost = 0; + for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* For each net ... */ + + if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) continue; + + td_cost += net_timing_cost[net_id]; + } + + return td_cost; +} + +/** + * @brief Commit all the setup slack values from the PlacerSetupSlacks + * class to a vtr matrix. + * + * This routine is incremental since it relies on the pins_with_modified_setup_slack() + * to detect which pins need to be updated and which pins do not. + * + * Therefore, it is assumed that this routine is always called immediately after + * each time update_setup_slacks_and_criticalities() updates the setup slacks + * (i.e. t_placer_timing_update_mode::update_setup_slacks = true). Otherwise, + * pins_with_modified_setup_slack() cannot accurately account for all the pins + * that have their setup slacks changed, making this routine incorrect. + * + * Currently, the only exception to the rule above is when setup slack analysis is used + * during the placement quench. The new setup slacks might be either accepted or + * rejected, so for efficiency reasons, this routine is not called if the slacks are + * rejected in the end. For more detailed info, see the try_swap() routine. + */ +void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; + + //Incremental: only go through sink pins with modified setup slack + auto clb_pins_modified = setup_slacks->pins_with_modified_setup_slack(); + for (ClusterPinId pin_id : clb_pins_modified) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); + + connection_setup_slack[net_id][pin_index_in_net] = setup_slacks->setup_slack(net_id, pin_index_in_net); + } +} + +/** + * @brief Verify that the values in the vtr matrix matches the PlacerSetupSlacks class. + * + * Return true if all values are identical. Otherwise, return false. + * Used to check if the timing update has been succesfully revereted if a proposed move + * is rejected when applying setup slack analysis during the placement quench. 
+ * If successful, the setup slacks in the timing analyzer should be the same as + * the setup slacks in connection_setup_slack matrix without running commit_setup_slacks(). + * + * For more detailed info, see the try_swap() routine. + */ +bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; + + //Go through every single sink pin to check that the slack values are the same + for (ClusterNetId net_id : clb_nlist.nets()) { + for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { + if (connection_setup_slack[net_id][ipin] != setup_slacks->setup_slack(net_id, ipin)) { + return false; + } + } + } + return true; +} + +///@brief Fetch the file-scope variable update_td_costs_stats in timing_place.cpp. +t_update_td_costs_stats get_update_td_costs_runtime_stats() { + return update_td_costs_stats; +} diff --git a/vpr/src/place/place_timing_update.h b/vpr/src/place/place_timing_update.h new file mode 100644 index 00000000000..fa5a47e8727 --- /dev/null +++ b/vpr/src/place/place_timing_update.h @@ -0,0 +1,94 @@ +/** + * @file place_timing_update.h + * @brief Stores timing update routines declarations used by the VPR placer. + */ +#pragma once +#include "timing_place.h" +#include "place_util.h" + +/// init_grid_blocks(); +static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid); +///@brief Initialize the placement context. void init_placement_context() { auto& place_ctx = g_vpr_ctx.mutable_placement(); auto& cluster_ctx = g_vpr_ctx.clustering(); + /* Intialize the lookup of CLB block positions */ place_ctx.block_locs.clear(); place_ctx.block_locs.resize(cluster_ctx.clb_nlist.blocks().size()); + /* Initialize the reverse lookup of CLB block positions */ place_ctx.grid_blocks = init_grid_blocks(); } +///@brief Initialize `grid_blocks`, the inverse structure of `block_locs`. static vtr::Matrix init_grid_blocks() { auto& device_ctx = g_vpr_ctx.device(); @@ -29,3 +40,174 @@ static vtr::Matrix init_grid_blocks() { return grid_blocks; } + +///@brief Constructor: stores current placer algorithm. +t_placer_costs::t_placer_costs(enum e_place_algorithm algo) + : place_algorithm(algo) { + if (place_algorithm != PATH_TIMING_DRIVEN_PLACE) { + VTR_ASSERT_MSG( + place_algorithm == BOUNDING_BOX_PLACE, + "Must pass a valid placer algorithm into the placer cost structure."); + } +} + +/** + * @brief Mutator: updates the norm factors in the outer loop iteration. + * + * At each temperature change we update these values to be used + * for normalizing the trade-off between timing and wirelength (bb) + */ +void t_placer_costs::update_norm_factors() { + if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + bb_cost_norm = 1 / bb_cost; + //Prevent the norm factor from going to infinity + timing_cost_norm = std::min(1 / timing_cost, MAX_INV_TIMING_COST); + cost = 1; //The value of cost will be reset to 1 if timing driven + } else { //place_algorithm == BOUNDING_BOX_PLACE + cost = bb_cost; //The cost value should be identical to the wirelength cost + } +} + +///@brief Constructor: Initialize all annealing state variables. 
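// A stand-alone sketch (illustrative only) of t_placer_costs::update_norm_factors()
// above: the inverse of the current totals becomes the norm factor, so per-move
// deltas in try_swap() are expressed relative to the whole placement. The
// MAX_INV_TIMING_COST cap and the numbers in main() are made-up illustrative values.
#include <algorithm>
#include <cstdio>

static constexpr double MAX_INV_TIMING_COST = 1.e9; // illustrative cap

struct Costs {
    double bb_cost = 0, timing_cost = 0;
    double bb_cost_norm = 0, timing_cost_norm = 0, cost = 0;

    void update_norm_factors() {
        bb_cost_norm = 1 / bb_cost;
        timing_cost_norm = std::min(1 / timing_cost, MAX_INV_TIMING_COST); // avoid infinity
        cost = 1; // timing-driven placement renormalizes the total cost to 1
    }
};

int main() {
    Costs costs;
    costs.bb_cost = 1500.0;
    costs.timing_cost = 2.0e-8;
    costs.update_norm_factors();
    std::printf("bb_cost_norm = %g, timing_cost_norm = %g\n",
                costs.bb_cost_norm, costs.timing_cost_norm);
}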
+t_annealing_state::t_annealing_state(const t_annealing_sched& annealing_sched, + float first_t, + float first_rlim, + int first_move_lim, + float first_crit_exponent) { + alpha = annealing_sched.alpha_min; + t = first_t; + restart_t = first_t; + rlim = first_rlim; + inverse_delta_rlim = 1 / (first_rlim - FINAL_RLIM); + move_lim_max = first_move_lim; + crit_exponent = first_crit_exponent; + + //Determine the current move_lim based on the schedule type + if (annealing_sched.type == DUSTY_SCHED) { + move_lim = std::max(1, (int)(move_lim_max * annealing_sched.success_target)); + } else { + move_lim = move_lim_max; + } +} + +/** + * @brief Get the initial limit for inner loop block move attempt limit. + * + * There are two ways to scale the move limit. + * e_place_effort_scaling::CIRCUIT + * scales the move limit proportional to num_blocks ^ (4/3) + * e_place_effort_scaling::DEVICE_CIRCUIT + * scales the move limit proportional to device_size ^ (2/3) * num_blocks ^ (2/3) + * + * The second method is almost identical to the first one when the device + * is highly utilized (device_size ~ num_blocks). For low utilization devices + * (device_size >> num_blocks), the search space is larger, so the second method + * performs more moves to ensure better optimization. + */ + +int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched) { + const auto& device_ctx = g_vpr_ctx.device(); + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + auto device_size = device_ctx.grid.width() * device_ctx.grid.height(); + auto num_blocks = cluster_ctx.clb_nlist.blocks().size(); + + int move_lim; + if (placer_opts.effort_scaling == e_place_effort_scaling::CIRCUIT) { + move_lim = int(annealing_sched.inner_num * pow(num_blocks, 4. / 3.)); + } else { + VTR_ASSERT_MSG( + placer_opts.effort_scaling == e_place_effort_scaling::DEVICE_CIRCUIT, + "Unrecognized placer effort scaling"); + + move_lim = int(annealing_sched.inner_num * pow(device_size, 2. / 3.) * pow(num_blocks, 2. / 3.)); + } + + /* Avoid having a non-positive move_lim */ + move_lim = std::max(move_lim, 1); + + VTR_LOG("Moves per temperature: %d\n", move_lim); + + return move_lim; +} + +/** + * @brief Update the annealing state according to the annealing schedule selected. + * + * USER_SCHED: A manual fixed schedule with fixed alpha and exit criteria. + * AUTO_SCHED: A more sophisticated schedule where alpha varies based on success ratio. + * DUSTY_SCHED: This schedule jumps backward and slows down in response to success ratio. + * See doc/src/vpr/dusty_sa.rst for more details. + * + * Returns true until the schedule is finished. + */ +bool update_annealing_state(t_annealing_state* state, + float success_rat, + const t_placer_costs& costs, + const t_placer_opts& placer_opts, + const t_annealing_sched& annealing_sched) { + /* Return `false` when the exit criterion is met. 
*/ + if (annealing_sched.type == USER_SCHED) { + state->t *= annealing_sched.alpha_t; + return state->t >= annealing_sched.exit_t; + } + + auto& device_ctx = g_vpr_ctx.device(); + auto& cluster_ctx = g_vpr_ctx.clustering(); + + /* Automatic annealing schedule */ + float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size(); + + if (annealing_sched.type == DUSTY_SCHED) { + bool restart_temp = state->t < t_exit || std::isnan(t_exit); //May get nan if there are no nets + if (success_rat < annealing_sched.success_min || restart_temp) { + if (state->alpha > annealing_sched.alpha_max) return false; + state->t = state->restart_t / sqrt(state->alpha); // Take a half step from the restart temperature. + state->alpha = 1.0 - ((1.0 - state->alpha) * annealing_sched.alpha_decay); + } else { + if (success_rat > annealing_sched.success_target) { + state->restart_t = state->t; + } + state->t *= state->alpha; + } + state->move_lim = std::max(1, std::min(state->move_lim_max, (int)(state->move_lim_max * (annealing_sched.success_target / success_rat)))); + } else { /* annealing_sched.type == AUTO_SCHED */ + if (success_rat > 0.96) { + state->alpha = 0.5; + } else if (success_rat > 0.8) { + state->alpha = 0.9; + } else if (success_rat > 0.15 || state->rlim > 1.) { + state->alpha = 0.95; + } else { + state->alpha = 0.8; + } + state->t *= state->alpha; + + // Must be duplicated to retain previous behavior + if (state->t < t_exit || std::isnan(t_exit)) return false; + } + + // Gradually changes from the initial crit_exponent to the final crit_exponent based on how much the range limit has shrunk. + // The idea is that as the range limit shrinks (indicating we are fine-tuning a more optimized placement) we can focus more on a smaller number of critical connections, which a higher crit_exponent achieves. + update_rlim(&state->rlim, success_rat, device_ctx.grid); + + if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + state->crit_exponent = (1 - (state->rlim - state->final_rlim()) * state->inverse_delta_rlim) + * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) + + placer_opts.td_place_exp_first; + } + + return true; +} + +/** + * @brief Update the range limited to keep acceptance prob. near 0.44. + * + * Use a floating point rlim to allow gradual transitions at low temps. + */ +static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) { + float upper_lim = std::max(grid.width() - 1, grid.height() - 1); + + *rlim *= (1. - 0.44 + success_rat); + *rlim = std::max(std::min(*rlim, upper_lim), 1.f); +} diff --git a/vpr/src/place/place_util.h b/vpr/src/place/place_util.h index f35ec854ac9..399684ae03a 100644 --- a/vpr/src/place/place_util.h +++ b/vpr/src/place/place_util.h @@ -1,8 +1,118 @@ -#ifndef PLACE_UTIL_H -#define PLACE_UTIL_H -#include +/** + * @file place_util.h + * @brief Utility structures representing various states of the + * placement. Also contains declarations of related routines. + */ -//Initialize the placement context +#pragma once +#include "vpr_types.h" + +///@brief Forward declarations. +class t_placer_costs; +class t_annealing_state; + +///@brief Initialize the placement context. void init_placement_context(); -#endif +///@brief Get the initial limit for inner loop block move attempt limit. +int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched); + +///@brief Update the annealing state according to the annealing schedule selected. 
+bool update_annealing_state(t_annealing_state* state, + float success_rat, + const t_placer_costs& costs, + const t_placer_opts& placer_opts, + const t_annealing_sched& annealing_sched); + +/** + * @brief Data structure that stores different cost values in the placer. + * + * Although we do cost calculations with float values, we use doubles + * for the accumulated costs to avoid round-off, particularly on large + * designs where the magnitude of a single move's delta cost is small + * compared to the overall cost. + * + * The cost normalization factors are updated upon every temperature change + * in the outer_loop_update_timing_info routine. They are the multiplicative + * inverses of their respective cost values when the routine is called. They + * serve to normalize the trade-off between timing and wirelength (bb). + * + * @param cost The weighted average of the wiring cost and the timing cost. + * @param bb_cost The bounding box cost, aka the wiring cost. + * @param timing_cost The timing cost, which is connection delay * criticality. + * + * @param bb_cost_norm The normalization factor for the wiring cost. + * @param timing_cost_norm The normalization factor for the timing cost, which + * is upper-bounded by the value of MAX_INV_TIMING_COST. + * + * @param MAX_INV_TIMING_COST Stops inverse timing cost from going to infinity + * with very lax timing constraints, which avoids multiplying by a + * gigantic timing_cost_norm when auto-normalizing. The exact value + * of this cost has relatively little impact, but should not be large + * enough to be on the order of timing costs for normal constraints. + * + * @param place_algorithm Determines how the member values are updated upon + * each temperature change during the placer annealing process. + */ +class t_placer_costs { + public: + double cost; + double bb_cost; + double timing_cost; + double bb_cost_norm; + double timing_cost_norm; + + private: + double MAX_INV_TIMING_COST = 1.e9; + enum e_place_algorithm place_algorithm; + + public: //Constructor + t_placer_costs(enum e_place_algorithm algo); + + public: //Mutator + void update_norm_factors(); +}; + +/** + * @brief Stores variables that are used by the annealing process. + * + * This structure is updated by update_annealing_state() on each outer + * loop iteration. It stores various important variables that need to + * be accessed during the placement inner loop. + * + * @param t Temperature for simulated annealing. + * @param rlim Range limit for block swaps. + * @param inverse_delta_rlim Used to update crit_exponent. + * @param alpha Temperature decays factor (multiplied each outer loop iteration). + * @param restart_t Temperature used after restart due to minimum success ratio. + * @param crit_exponent Used by timing-driven placement to "sharpen" the timing criticality. + * @param move_lim_max Maximum block move limit. + * @param move_lim Current block move limit. + * + * @param FINAL_RLIM The final rlim (range limit) is 1, which is the smallest value that + * can still make progress, since an rlim of 0 wouldn't allow any swaps. 
+ */ +class t_annealing_state { + public: + float t; + float rlim; + float inverse_delta_rlim; + float alpha; + float restart_t; + float crit_exponent; + int move_lim_max; + int move_lim; + + private: + float FINAL_RLIM = 1.; + + public: //Constructor + t_annealing_state(const t_annealing_sched& annealing_sched, + float first_t, + float first_rlim, + int first_move_lim, + float first_crit_exponent); + + public: //Accessor + float final_rlim() const { return FINAL_RLIM; } +}; diff --git a/vpr/src/place/timing_place.cpp b/vpr/src/place/timing_place.cpp index e62eab6c894..ae8e1b1e27c 100644 --- a/vpr/src/place/timing_place.cpp +++ b/vpr/src/place/timing_place.cpp @@ -1,3 +1,7 @@ +/** + * @file timing_place.cpp + * @brief Stores the method definitions of classes defined in timing_place.h. + */ #include #include @@ -14,71 +18,42 @@ #include "timing_info.h" -//Use an incremental approach to updaing criticalities? -constexpr bool INCR_UPDATE_CRITICALITIES = true; +///@brief Use an incremental approach to updating criticalities and setup slacks? +static constexpr bool INCR_UPDATE_CRITICALITIES = true, INCR_UPDATE_SETUP_SLACKS = true; -/**************************************/ - -/* Allocates space for the timing_place_crit_ data structure * - * I chunk the data to save space on large problems. */ +///@brief Allocates space for the timing_place_crit_ data structure. PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup) : clb_nlist_(clb_nlist) , pin_lookup_(netlist_pin_lookup) , timing_place_crit_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { } -/**************************************/ -void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_info, float crit_exponent) { +/** + * @brief Updated the criticalities in the timing_place_crit_ data structure. + * + * If the criticalities are not updated immediately after each time we call + * timing_info->update(), then timing_info->pins_with_modified_setup_criticality() + * cannot accurately account for all the pins that need to be updated. In this case, + * we pass in recompute=true to update all criticalities from scratch. + * + * If the criticality exponent has changed, we also need to update from scratch. + */ +void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_info, float crit_exponent, bool recompute) { + /* Determine what pins need updating */ + if (!recompute && crit_exponent == last_crit_exponent_ && INCR_UPDATE_CRITICALITIES) { + incr_update_criticalities(timing_info); + } else { + recompute_criticalities(); + + /* Record new criticality exponent */ + last_crit_exponent_ = crit_exponent; + } + /* Performs a 1-to-1 mapping from criticality to timing_place_crit_. 
* For every pin on every net (or, equivalently, for every tedge ending * in that pin), timing_place_crit_ = criticality^(criticality exponent) */ - //Determine what pins need updating - if (INCR_UPDATE_CRITICALITIES) { - cluster_pins_with_modified_criticality_.clear(); - if (crit_exponent != last_crit_exponent_) { - //Criticality exponent changed, must re-calculate criticalities for *all* sink pins - for (ClusterNetId net_id : clb_nlist_.nets()) { - for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { - cluster_pins_with_modified_criticality_.insert(pin_id); - } - } - - //Record new criticality exponent - last_crit_exponent_ = crit_exponent; - } else { - //Criticality exponent unchanged - // - //Collect the cluster pins which need to be updated based on the latest timing - //analysis - // - //Note we use the set of pins reported by the *timing_info* as having modified - //criticality, rather than those marked as modified by the timing analyzer. - //Since timing_info uses shifted/relaxed criticality (which depends on max - //required time and worst case slacks), additional nodes may be modified - //when updating the atom pin criticalities. - - for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_criticality()) { - ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); - - //Some atom pins correspond to connections which are completely - //contained within a cluster, and hence have no corresponding - //clustered pin. - if (!clb_pin) continue; - - cluster_pins_with_modified_criticality_.insert(clb_pin); - } - } - } else { - //Non-incremental: all pins and nets need updating - for (ClusterNetId net_id : clb_nlist_.nets()) { - for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { - cluster_pins_with_modified_criticality_.insert(pin_id); - } - } - } - - //Update the effected pins + /* Update the effected pins */ for (ClusterPinId clb_pin : cluster_pins_with_modified_criticality_) { ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); @@ -92,21 +67,144 @@ void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_inf } } +/** + * @brief Collect the cluster pins which need to be updated based on the latest timing + * analysis so that incremental updates to criticalities can be performed. + * + * Note we use the set of pins reported by the *timing_info* as having modified + * criticality, rather than those marked as modified by the timing analyzer. + * + * Since timing_info uses shifted/relaxed criticality (which depends on max required + * time and worst case slacks), additional nodes may be modified when updating the + * atom pin criticalities. + */ + +void PlacerCriticalities::incr_update_criticalities(const SetupTimingInfo* timing_info) { + cluster_pins_with_modified_criticality_.clear(); + + for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_criticality()) { + ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); + + //Some atom pins correspond to connections which are completely + //contained within a cluster, and hence have no corresponding + //clustered pin. + if (!clb_pin) continue; + + cluster_pins_with_modified_criticality_.insert(clb_pin); + } +} + +/** + * @brief Collect all the sink pins in the netlist and prepare them update. + * + * For the incremental version, see PlacerCriticalities::incr_update_criticalities(). 
+ */ +void PlacerCriticalities::recompute_criticalities() { + cluster_pins_with_modified_criticality_.clear(); + + /* Non-incremental: all sink pins need updating */ + for (ClusterNetId net_id : clb_nlist_.nets()) { + for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { + cluster_pins_with_modified_criticality_.insert(pin_id); + } + } +} + +///@brief Override the criticality of a particular connection. void PlacerCriticalities::set_criticality(ClusterNetId net_id, int ipin, float val) { timing_place_crit_[net_id][ipin] = val; } +/** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which + * were modified by the last call to PlacerCriticalities::update_criticalities(). + */ PlacerCriticalities::pin_range PlacerCriticalities::pins_with_modified_criticality() const { return vtr::make_range(cluster_pins_with_modified_criticality_); } -std::unique_ptr alloc_lookups_and_criticalities(t_chan_width_dist chan_width_dist, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - t_det_routing_arch* det_routing_arch, - std::vector& segment_inf, - const t_direct_inf* directs, - const int num_directs) { - return compute_place_delay_model(placer_opts, router_opts, det_routing_arch, segment_inf, - chan_width_dist, directs, num_directs); +/**************************************/ + +///@brief Allocates space for the timing_place_setup_slacks_ data structure. +PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup) + : clb_nlist_(clb_nlist) + , pin_lookup_(netlist_pin_lookup) + , timing_place_setup_slacks_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { +} + +/** + * @brief Updated the setup slacks in the timing_place_setup_slacks_ data structure. + * + * If the setup slacks are not updated immediately after each time we call + * timing_info->update(), then timing_info->pins_with_modified_setup_slack() + * cannot accurately account for all the pins that need to be updated. + * In this case, we pass in recompute=true to update all setup slacks from scratch. + */ +void PlacerSetupSlacks::update_setup_slacks(const SetupTimingInfo* timing_info, bool recompute) { + if (!recompute && INCR_UPDATE_SETUP_SLACKS) { + incr_update_setup_slacks(timing_info); + } else { + recompute_setup_slacks(); + } + + /* Update the effected pins */ + for (ClusterPinId clb_pin : cluster_pins_with_modified_setup_slack_) { + ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); + int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); + + float clb_pin_setup_slack = calculate_clb_net_pin_setup_slack(*timing_info, pin_lookup_, clb_pin); + + timing_place_setup_slacks_[clb_net][pin_index_in_net] = clb_pin_setup_slack; + } +} + +/** + * @brief Collect the cluster pins which need to be updated based on the latest timing + * analysis so that incremental updates to setup slacks can be performed. + * + * Note we use the set of pins reported by the *timing_info* as having modified + * setup slacks, rather than those marked as modified by the timing analyzer. + */ +void PlacerSetupSlacks::incr_update_setup_slacks(const SetupTimingInfo* timing_info) { + cluster_pins_with_modified_setup_slack_.clear(); + + for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_slack()) { + ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); + + //Some atom pins correspond to connections which are completely + //contained within a cluster, and hence have no corresponding + //clustered pin. 
+ if (!clb_pin) continue; + + cluster_pins_with_modified_setup_slack_.insert(clb_pin); + } +} + +/** + * @brief Collect all the sink pins in the netlist and prepare them update. + * + * For the incremental version, see PlacerSetupSlacks::incr_update_setup_slacks(). + */ +void PlacerSetupSlacks::recompute_setup_slacks() { + cluster_pins_with_modified_setup_slack_.clear(); + + /* Non-incremental: all sink pins need updating */ + for (ClusterNetId net_id : clb_nlist_.nets()) { + for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { + cluster_pins_with_modified_setup_slack_.insert(pin_id); + } + } +} + +///@brief Override the setup slack of a particular connection. +void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float val) { + timing_place_setup_slacks_[net_id][ipin] = val; +} + +/** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) + * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). + */ +PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const { + return vtr::make_range(cluster_pins_with_modified_setup_slack_); } diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing_place.h index c3d8a41c3a1..50042b50ea4 100644 --- a/vpr/src/place/timing_place.h +++ b/vpr/src/place/timing_place.h @@ -1,3 +1,32 @@ +/** + * @file timing_place.h + * @brief Interface used by the VPR placer to query information + * from the Tatum timing analyzer. + * + * @class PlacerSetupSlacks + * Queries connection **RAW** setup slacks, which can + * range from negative to positive values. Also maps + * atom pin setup slacks to clb pin setup slacks. + * @class PlacerCriticalities + * Query connection criticalities, which are calculuated + * based on the raw setup slacks and ranges from 0 to 1. + * Also maps atom pin crit. to clb pin crit. + * @class PlacerTimingCosts + * Hierarchical structure used by update_td_costs() to + * maintain the order of addition operation of float values + * (to avoid round-offs) while doing incremental updates. + * + * Calculating criticalities: + * All the raw setup slack values across a single clock domain are gathered, shifted, + * and rated from best to worst. The best shifted slack value (the most positive one) + * will have a criticality of 0, while the worse shifted slack value (always 0) + * will have a criticality of 1. Criticalities are used to calculated timing costs + * for each connection (delay * criticality). + * + * For a more detailed description on how criticalities are calculated, see + * calc_relaxed_criticality() in `timing_util.cpp`. + */ + #ifndef TIMING_PLACE #define TIMING_PLACE @@ -7,39 +36,42 @@ #include "place_delay_model.h" #include "vpr_net_pins_matrix.h" -std::unique_ptr alloc_lookups_and_criticalities(t_chan_width_dist chan_width_dist, - const t_placer_opts& place_opts, - const t_router_opts& router_opts, - t_det_routing_arch* det_routing_arch, - std::vector& segment_inf, - const t_direct_inf* directs, - const int num_directs); -/* Usage +/** + * @brief PlacerCriticalities returns the clustered netlist connection criticalities + * used by the placer ('sharpened' by a criticality exponent). + * + * Usage * ===== - * PlacerCriticalities returns the clustered netlist connection criticalities used by - * the placer ('sharpened' by a criticality exponent). This also serves to map atom - * netlist level criticalites (i.e. on AtomPinIds) to the clustered netlist (i.e. - * ClusterPinIds) used during placement. 
+ * This class also serves to map atom netlist level criticalites (i.e. on AtomPinIds) + * to the clustered netlist (i.e. ClusterPinIds) used during placement. * - * Criticalities are calculated by calling update_criticalities(), which will - * update criticalities based on the atom netlist connection criticalities provided by - * the passed in SetupTimingInfo. This is done incrementally, based on the modified - * connections/AtomPinIds returned by SetupTimingInfo. + * Criticalities are calculated by calling update_setup_slacks_and_criticalities() and + * setting t_placer_timing_update_mode::update_criticalities to true. It will update + * criticalities based on the atom netlist connection criticalities provided by the + * passed in SetupTimingInfo. * - * The criticalities of individual connections can then be queried by calling the - * criticality() member function. + * This process can be done incrementally, based on the modified connections/AtomPinIds + * returned by SetupTimingInfo. But sometimes a recomputation is required. For detailed + * information please see the description of `t_placer_timing_update_mode` structure. * - * It also supports iterating via pins_with_modified_criticalities() through the - * clustered netlist pins/connections which have had their criticality modified by - * the last call to update_criticalities(), which is useful for incrementally + * It also supports iterating via pins_with_modified_criticalities() through the + * clustered netlist pins/connections which have had their criticality modified by + * the last call to update_criticalities(), which is useful for incrementally * re-calculating timing costs. * + * The criticalities of individual connections can then be queried by calling the + * criticality() member function. + * * Implementation * ============== - * To support incremental re-calculation the class saves the last criticality exponent - * passed to update_criticalites(). If the next update uses the same exponent criticalities - * can be incrementally updated. Otherwise they must be re-calculated from scratch, since - * a change in exponent changes *all* criticalities. + * To support incremental re-calculation, the class saves the last criticality exponent + * passed to PlacerCriticalities::update_criticalites(). If the next update uses the same + * exponent, criticalities can be incrementally updated. Otherwise, they must be re-calculated + * from scratch, since a change in exponent changes *all* criticalities. + * + * If the timing graph is updated while t_placer_timing_update_mode::update_criticalities is + * set to false, a re-calculation of *all* criticalities is required as well (since we don't + * know exactly which pins have changed after multiple timing updates have been performed). */ class PlacerCriticalities { public: //Types @@ -55,40 +87,134 @@ class PlacerCriticalities { PlacerCriticalities& operator=(const PlacerCriticalities& clb_nlist) = delete; public: //Accessors - //Returns the criticality of the specified connection + ///@brief Returns the criticality of the specified connection. float criticality(ClusterNetId net, int ipin) const { return timing_place_crit_[net][ipin]; } - //Returns the range of clustered netlist pins (i.e. ClusterPinIds) which were modified - //by the last call to update_criticalities() + /** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which + * were modified by the last call to PlacerCriticalities::update_criticalities(). 
+ */ pin_range pins_with_modified_criticality() const; public: //Modifiers - //Incrementally updates criticalities based on the atom netlist criticalitites provied by - //timing_info and the provided criticality_exponent. - void update_criticalities(const SetupTimingInfo* timing_info, float criticality_exponent); + /** + * @brief Updates criticalities based on the atom netlist criticalitites + * provided by timing_info and the provided criticality_exponent. + */ + void update_criticalities(const SetupTimingInfo* timing_info, float criticality_exponent, bool recompute); - //Override the criticality of a particular connection + ///@brief Override the criticality of a particular connection. void set_criticality(ClusterNetId net, int ipin, float val); private: //Data + ///@brief The clb netlist in the placement context. const ClusteredNetlist& clb_nlist_; - const ClusteredPinAtomPinsLookup& pin_lookup_; - ClbNetPinsMatrix timing_place_crit_; /* [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ + ///@brief The lookup table that maps atom pins to clb pins. + const ClusteredPinAtomPinsLookup& pin_lookup_; - //The criticality exponent when update_criticalites() was last called (used to detect if incremental update can be used) + /** + * @brief The matrix that stores criticality value for each connection. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ + ClbNetPinsMatrix timing_place_crit_; + + /** + * The criticality exponent when update_criticalites() was last called + * (used to detect if incremental update can be used). + */ float last_crit_exponent_ = std::numeric_limits::quiet_NaN(); - //Set of pins with criticaltites modified by last call to update_criticalities() + ///@brief Set of pins with criticaltites modified by last call to update_criticalities(). vtr::vec_id_set cluster_pins_with_modified_criticality_; + + ///@brief Updates criticalities: incremental V.S. from scratch + void incr_update_criticalities(const SetupTimingInfo* timing_info); + void recompute_criticalities(); }; -/* Usage +/** + * @brief PlacerSetupSlacks returns the RAW setup slacks of clustered netlist connection. + * + * Usage * ===== - * PlacerTimingCosts mimics a 2D array of connection timing costs running from: - * [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + * This also serves to map atom netlist level setup slacks (i.e. on AtomPinIds) to the + * clustered netlist (i.e. ClusterPinIds) used during placement. + * + * Setup slacks are calculated by calling update_setup_slacks_and_criticalities(), + * with t_placer_timing_update_mode::update_setup_slacks to true. It will update setup + * slacks based on the atom netlist connection setup slacks provided by the passed in + * SetupTimingInfo. + * + * This process can be done incrementally, based on the modified connections/AtomPinIds + * returned by SetupTimingInfo. But sometimes a recomputation is required. For detailed + * information please see the description of `t_placer_timing_update_mode` structure. + * + * It also supports iterating via pins_with_modified_setup_slack() through the clustered + * netlist pins/connections which have had their setup slacks modified by the last call + * to update_setup_slacks(). + * + * The RAW setup slacks of individual connections can then be queried by calling the + * setup_slack() member function. + * + * Note: RAW setup slacks are unlike criticalities. Their values are not confined between + * 0 and 1. Their values can be either positive or negative. 
+ */ +class PlacerSetupSlacks { + public: //Types + typedef vtr::vec_id_set::iterator pin_iterator; + typedef vtr::vec_id_set::iterator net_iterator; + + typedef vtr::Range pin_range; + typedef vtr::Range net_range; + + public: //Lifetime + PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup); + PlacerSetupSlacks(const PlacerSetupSlacks& clb_nlist) = delete; + PlacerSetupSlacks& operator=(const PlacerSetupSlacks& clb_nlist) = delete; + + public: //Accessors + ///@brief Returns the setup slack of the specified connection. + float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slacks_[net][ipin]; } + + /** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) + * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). + */ + pin_range pins_with_modified_setup_slack() const; + + public: //Modifiers + ///@brief Updates setup slacks based on the atom netlist setup slacks provided by timing_info. + void update_setup_slacks(const SetupTimingInfo* timing_info, bool recompute); + + ///@brief Override the setup slack of a particular connection. + void set_setup_slack(ClusterNetId net, int ipin, float val); + + private: //Data + const ClusteredNetlist& clb_nlist_; + const ClusteredPinAtomPinsLookup& pin_lookup_; + + /** + * @brief The matrix that stores raw setup slack values for each connection. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ + ClbNetPinsMatrix timing_place_setup_slacks_; + + ///@brief Set of pins with raw setup slacks modified by last call to update_criticalities() + vtr::vec_id_set cluster_pins_with_modified_setup_slack_; + + ///@brief Updates setup slacks: incremental V.S. from scratch. + void incr_update_setup_slacks(const SetupTimingInfo* timing_info); + void recompute_setup_slacks(); +}; + +/** + * @brief PlacerTimingCosts mimics a 2D array of connection timing costs running from: + * [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]. * - * So it can be used similar to: + * It can be used similar to: * * PlacerTimingCosts connection_timing_costs(cluster_ctx.clb_nlist); //Construct * @@ -99,53 +225,53 @@ class PlacerCriticalities { * * //Potentially other modifications... * - * //Calculate the updated timing cost, of all connections, incrementally based - * //on modifications + * //Calculate the updated timing cost, of all connections, + * //incrementally based on modifications * float total_timing_cost = connection_timing_costs.total_cost(); - * + * * However behind the scenes PlacerTimingCosts tracks when connection costs are modified, * and efficiently re-calculates the total timing cost incrementally based on the connections * which have had their cost modified. * - * Implementaion - * ============= - * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part + * Implementation + * ============== + * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part * of connection_costs_. To mimic 2d-array like access PlacerTimingCosts also uses two proxy * classes which allow indexing in the net and pin dimensions (NetProxy and ConnectionProxy * respectively). * * The first part of connection_costs_ stores intermediate sums of the connection costs for - * efficient incremental re-calculation. More concretely, connection_costs_ stores a binary + * efficient incremental re-calculation. 
More concretely, connection_costs_ stores a binary * tree, where leaves correspond to individual connection costs and intermediate nodes the - * partial sums of the connection costs. (The binary tree is stored implicitly in the - * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary + * partial sums of the connection costs. (The binary tree is stored implicitly in the + * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary * tree we calculate the total timing cost over all connections. * * Using a binary tree allows us to efficiently re-calculate the timing costs when only a subset * of connections are changed. This is done by 'invalidating' intermediate nodes (from leaves up - * to the root) which have ancestors (leaves) with modified connection costs. When the + * to the root) which have ancestors (leaves) with modified connection costs. When the * total_cost() method is called, it recursively walks the binary tree to re-calculate the cost. - * Only invalidated nodes are traversed, with valid nodes just returning their previously + * Only invalidated nodes are traversed, with valid nodes just returning their previously * calculated (and unchanged) value. * - * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can + * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can * be done in O(k log K) time. * - * It is important to note that due to limited floating point precision, floating point + * It is important to note that due to limited floating point precision, floating point * arithmetic has an order dependence (due to round-off). Using a binary tree to total * the timing connection costs allows us to incrementally update the total timign cost while - * maintianing the *same order of operations* as if it was re-computed from scratch. This + * maintianing the *same order of operations* as if it was re-computed from scratch. This * ensures we *always* get consistent results regardless of what/when connections are changed. * * Proxy Classes - * ------------- + * ============= * NetProxy is returned by PlacerTimingCost's operator[], and stores a pointer to the start of * internal storage of that net's connection costs. * - * ConnectionProxy is returnd by NetProxy's operator[], and holds a reference to a particular - * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy - * supports assignment, allowing clients to modify the connection cost. It also detects if the - * assigned value differs from the previous value and if so, calls PlacerTimingCosts's + * ConnectionProxy is returnd by NetProxy's operator[], and holds a reference to a particular + * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy + * supports assignment, allowing clients to modify the connection cost. It also detects if the + * assigned value differs from the previous value and if so, calls PlacerTimingCosts's * invalidate() method on that connection cost. 
* * PlacerTimingCosts's invalidate() method marks the cost element's ancestors as invalid (NaN) @@ -193,7 +319,9 @@ class PlacerTimingCosts { size_t num_level_before_leaves = num_nodes_in_level(ilevel - 1); VTR_ASSERT_MSG(num_leaves >= num_connections, "Need at least as many leaves as connections"); - VTR_ASSERT_MSG(num_connections == 0 || num_level_before_leaves < num_connections, "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); + VTR_ASSERT_MSG( + num_connections == 0 || num_level_before_leaves < num_connections, + "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); //We don't need to store all possible leaves if we have fewer connections //(i.e. bottom-right of tree is empty) @@ -213,16 +341,19 @@ class PlacerTimingCosts { } } - //Proxy class representing a connection cost - // Supports modification of connection cost while detecting changes and - // reporting them up to PlacerTimingCosts + /** + * @brief Proxy class representing a connection cost. + * + * Supports modification of connection cost while detecting + * changes and reporting them up to PlacerTimingCosts. + */ class ConnectionProxy { public: ConnectionProxy(PlacerTimingCosts* timing_costs, double& connection_cost) : timing_costs_(timing_costs) , connection_cost_(connection_cost) {} - //Allow clients to modify the connection cost via assignment + ///@brief Allow clients to modify the connection cost via assignment. ConnectionProxy& operator=(double new_cost) { if (new_cost != connection_cost_) { //If connection cost changed, update it, and mark it @@ -233,9 +364,11 @@ class PlacerTimingCosts { return *this; } - //Support getting the current connection cost as a double - // Useful for client code operating on the cost values (e.g. - // difference between costs) + /** + * @brief Support getting the current connection cost as a double. + * + * Useful for client code operating on the cost values (e.g. difference between costs). + */ operator double() { return connection_cost_; } @@ -245,15 +378,18 @@ class PlacerTimingCosts { double& connection_cost_; }; - //Proxy class representing the connection costs of a net - // Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection + /** + * @brief Proxy class representing the connection costs of a net. + * + * Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection. + */ class NetProxy { public: NetProxy(PlacerTimingCosts* timing_costs, double* net_sink_costs) : timing_costs_(timing_costs) , net_sink_costs_(net_sink_costs) {} - //Indexes into the specific net pin/connection + ///@brief Indexes into the specific net pin/connection. ConnectionProxy operator[](size_t ipin) { return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]); } @@ -263,7 +399,7 @@ class PlacerTimingCosts { double* net_sink_costs_; }; - //Indexes into the specific net + ///@brief Indexes into the specific net. NetProxy operator[](ClusterNetId net_id) { VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0); @@ -282,8 +418,10 @@ class PlacerTimingCosts { std::swap(num_levels_, other.num_levels_); } - //Calculates the total cost of all connections efficiently - //in the face of modified connection costs + /** + * @brief Calculates the total cost of all connections efficiently + * in the face of modified connection costs. 
+ */ double total_cost() { float cost = total_cost_recurr(0); //Root @@ -294,7 +432,7 @@ class PlacerTimingCosts { } private: - //Recursively calculate and update the timing cost rooted at inode + ///@brief Recursively calculate and update the timing cost rooted at inode. double total_cost_recurr(size_t inode) { //Prune out-of-tree if (inode > connection_costs_.size() - 1) { @@ -329,12 +467,18 @@ class PlacerTimingCosts { return node_cost; } - friend ConnectionProxy; //So it can call invalidate() + ///@brief Friend-ed so it can call invalidate(). + friend ConnectionProxy; void invalidate(double* invalidated_cost) { //Check pointer within range of internal storage - VTR_ASSERT_SAFE_MSG(invalidated_cost >= &connection_costs_[0], "Connection cost pointer should be after start of internal storage"); - VTR_ASSERT_SAFE_MSG(invalidated_cost <= &connection_costs_[connection_costs_.size() - 1], "Connection cost pointer should be before end of internal storage"); + VTR_ASSERT_SAFE_MSG( + invalidated_cost >= &connection_costs_[0], + "Connection cost pointer should be after start of internal storage"); + + VTR_ASSERT_SAFE_MSG( + invalidated_cost <= &connection_costs_[connection_costs_.size() - 1], + "Connection cost pointer should be before end of internal storage"); size_t icost = invalidated_cost - &connection_costs_[0]; @@ -343,7 +487,7 @@ class PlacerTimingCosts { //Invalidate parent intermediate costs up to root or first //already-invalidated parent size_t iparent = parent(icost); - ; + while (!std::isnan(connection_costs_[iparent])) { //Invalidate connection_costs_[iparent] = std::numeric_limits::quiet_NaN(); @@ -371,33 +515,41 @@ class PlacerTimingCosts { return (i - 1) / 2; } - //Returns the number of nodes in ilevel'th level - //If ilevel is negative, return 0, since the root shouldn't be counted - //as a leaf node candidate + /** + * @brief Returns the number of nodes in ilevel'th level. + * + * If ilevel is negative, return 0, since the root shouldn't + * be counted as a leaf node candidate. + */ size_t num_nodes_in_level(int ilevel) const { return ilevel < 0 ? 0 : (2 << (ilevel)); } - //Returns the total number of nodes in levels [0..ilevel] (inclusive) + ///@brief Returns the total number of nodes in levels [0..ilevel] (inclusive). size_t num_nodes_up_to_level(int ilevel) const { return (2 << (ilevel + 1)) - 1; } private: - //Vector storing the implicit binary tree of connection costs - // The actual connections are stored at the end of the vector - // (last level of the binary tree). The earlier portions of - // the tree are the intermediate nodes. - // - // The methods left_child()/right_child()/parent() can be used - // to traverse the tree by indicies into this vector + /** + * @brief Vector storing the implicit binary tree of connection costs. + * + * The actual connections are stored at the end of the vector + * (last level of the binary tree). The earlier portions of + * the tree are the intermediate nodes. + * + * The methods left_child()/right_child()/parent() can be used + * to traverse the tree by indicies into this vector. + */ std::vector connection_costs_; - //Vector storing the indicies of the first connection for - //each net in the netlist, used for indexing by net. + /** + * @brief Vector storing the indicies of the first connection + * for each net in the netlist, used for indexing by net. + */ vtr::vector net_start_indicies_; - //Number of levels in the binary tree + ///@brief Number of levels in the binary tree. 
size_t num_levels_ = 0; }; diff --git a/vpr/src/timing/timing_util.cpp b/vpr/src/timing/timing_util.cpp index 6dd2c06d249..5bff2ac8324 100644 --- a/vpr/src/timing/timing_util.cpp +++ b/vpr/src/timing/timing_util.cpp @@ -579,6 +579,23 @@ float calculate_clb_net_pin_criticality(const SetupTimingInfo& timing_info, cons return clb_pin_crit; } +//Return the setup slack of a net's pin in the CLB netlist +float calculate_clb_net_pin_setup_slack(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin) { + //There may be multiple atom netlist pins connected to this CLB pin + float clb_pin_setup_slack = std::numeric_limits::quiet_NaN(); + + for (const auto atom_pin : pin_lookup.connected_atom_pins(clb_pin)) { + //Take the worst of the atom pin slacks as the CLB pin slack + if (std::isnan(clb_pin_setup_slack)) { + clb_pin_setup_slack = timing_info.setup_pin_slack(atom_pin); + } else { + clb_pin_setup_slack = std::min(clb_pin_setup_slack, timing_info.setup_pin_slack(atom_pin)); + } + } + + return clb_pin_setup_slack; +} + //Returns the worst (maximum) criticality of the set of slack tags specified. Requires the maximum //required time and worst slack for all domain pairs represent by the slack tags // diff --git a/vpr/src/timing/timing_util.h b/vpr/src/timing/timing_util.h index 87f6b86787b..682771e9763 100644 --- a/vpr/src/timing/timing_util.h +++ b/vpr/src/timing/timing_util.h @@ -183,6 +183,9 @@ class ClusteredPinTimingInvalidator { //Return the criticality of a net's pin in the CLB netlist float calculate_clb_net_pin_criticality(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin); +//Return the setup slack of a net's pin in the CLB netlist +float calculate_clb_net_pin_setup_slack(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin); + //Returns the worst (maximum) criticality of the set of slack tags specified. Requires the maximum //required time and worst slack for all domain pairs represent by the slack tags //
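
The sketches below illustrate, in standalone C++, the mechanisms this patch documents; they are not part of the patch, and every identifier other than the quoted VPR routine names is illustrative. First, the quantity accumulated by comp_td_connection_cost(), sum_td_net_cost() and sum_td_costs(): each sink connection costs criticality * delay, a net costs the sum over its sink pins (pin 0 is the driver and is never costed), and the total skips ignored nets.

    #include <cstddef>
    #include <vector>

    // One entry per pin of a net; index 0 is the driver and is never costed.
    struct NetTiming {
        bool ignored = false;            // mirrors clb_nlist.net_is_ignored()
        std::vector<double> delay;       // connection delay per pin
        std::vector<double> criticality; // sharpened criticality per pin, in [0, 1]
    };

    // Timing cost of one net: sum over sink pins of criticality * delay.
    double net_timing_cost(const NetTiming& net) {
        double cost = 0.0;
        for (std::size_t ipin = 1; ipin < net.delay.size(); ++ipin) {
            cost += net.criticality[ipin] * net.delay[ipin];
        }
        return cost;
    }

    // Total timing cost: sum over all non-ignored nets.
    double total_timing_cost(const std::vector<NetTiming>& nets) {
        double total = 0.0;
        for (const NetTiming& net : nets) {
            if (net.ignored) continue;
            total += net_timing_cost(net);
        }
        return total;
    }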
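
get_initial_move_lim() picks one of the two effort-scaling formulas described earlier in the patch. A small sketch that evaluates both for a hypothetical 10,000-block design with inner_num = 1.0: on a fully used 100x100 grid the two modes agree up to rounding, while on a 200x200 grid the DEVICE_CIRCUIT mode performs roughly 2.5x more moves per temperature.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // CIRCUIT:        inner_num * num_blocks^(4/3)
    int circuit_scaled(float inner_num, double num_blocks) {
        return std::max(1, int(inner_num * std::pow(num_blocks, 4. / 3.)));
    }

    // DEVICE_CIRCUIT: inner_num * device_size^(2/3) * num_blocks^(2/3)
    int device_circuit_scaled(float inner_num, double device_size, double num_blocks) {
        return std::max(1, int(inner_num * std::pow(device_size, 2. / 3.) * std::pow(num_blocks, 2. / 3.)));
    }

    int main() {
        // Hypothetical 10,000-block design on two grid sizes.
        std::printf("circuit only:             %d\n", circuit_scaled(1.0f, 10000));
        std::printf("device*circuit (100x100): %d\n", device_circuit_scaled(1.0f, 100 * 100, 10000));
        std::printf("device*circuit (200x200): %d\n", device_circuit_scaled(1.0f, 200 * 200, 10000));
        return 0;
    }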
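
update_rlim() and the crit_exponent update in update_annealing_state() reduce to two small pure functions: nudge the range limit so the move acceptance rate stays near 0.44 (clamped between 1 and the largest grid dimension), then interpolate the criticality exponent between its first and last values according to how far rlim has shrunk toward its final value of 1. A sketch under those assumptions:

    #include <algorithm>

    // Grow rlim when more than 44% of moves are accepted, shrink it otherwise,
    // and keep it within [1, upper_lim] (upper_lim is the largest grid dimension minus 1).
    float updated_rlim(float rlim, float success_rate, float upper_lim) {
        rlim *= (1.f - 0.44f + success_rate);
        return std::clamp(rlim, 1.f, upper_lim);
    }

    // Interpolate the criticality exponent between exp_first and exp_last based on
    // how far rlim has shrunk from first_rlim down to final_rlim (1 in the patch).
    float updated_crit_exponent(float rlim, float first_rlim, float final_rlim,
                                float exp_first, float exp_last) {
        float shrink_fraction = 1.f - (rlim - final_rlim) / (first_rlim - final_rlim);
        return exp_first + shrink_fraction * (exp_last - exp_first);
    }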
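
The PlacerTimingCosts comment describes an implicit binary tree of partial sums in which a modified leaf NaN-invalidates its ancestors and total_cost() lazily re-sums only the invalidated subtrees, giving O(k log K) updates for k modified connections out of K. The toy class below demonstrates the same technique over a plain BFS-layout array; it is an illustration of the idea, not the class's actual storage scheme (for instance, unset leaves simply count as zero here):

    #include <cmath>
    #include <cstddef>
    #include <limits>
    #include <vector>

    // Implicit binary tree in BFS layout: nodes_[0] is the root, the children of
    // node i are 2i+1 and 2i+2, and the last num_leaves_ entries are the leaves.
    class IncrementalSum {
      public:
        explicit IncrementalSum(std::size_t num_leaves)
            : num_leaves_(num_leaves)
            , nodes_(2 * num_leaves, std::numeric_limits<double>::quiet_NaN()) {}

        // Write a leaf cost; if it changed, invalidate (NaN) its ancestors up to
        // the first already-invalid node, so total() only re-sums what changed.
        void set(std::size_t leaf, double value) {
            std::size_t i = nodes_.size() - num_leaves_ + leaf;
            if (nodes_[i] == value) return; //unchanged, nothing to invalidate
            nodes_[i] = value;
            for (i = (i - 1) / 2; !std::isnan(nodes_[i]); i = (i - 1) / 2) {
                nodes_[i] = std::numeric_limits<double>::quiet_NaN();
                if (i == 0) break;
            }
        }

        double total() { return sum(0); }

      private:
        double sum(std::size_t i) {
            if (i >= nodes_.size()) return 0.;                    //out of tree
            if (i >= nodes_.size() - num_leaves_)                 //leaf: unset counts as 0
                return std::isnan(nodes_[i]) ? 0. : nodes_[i];
            if (!std::isnan(nodes_[i])) return nodes_[i];         //cached subtree sum still valid
            nodes_[i] = sum(2 * i + 1) + sum(2 * i + 2);          //re-sum only invalidated subtrees
            return nodes_[i];
        }

        std::size_t num_leaves_;
        std::vector<double> nodes_;
    };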
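
Finally, calculate_clb_net_pin_setup_slack() maps a CLB pin to the worst (minimum) setup slack of the atom pins behind it, starting from NaN so the first real slack always wins. The same reduction over a plain vector of floats:

    #include <algorithm>
    #include <cmath>
    #include <limits>
    #include <vector>

    // Worst (most negative) setup slack over the atom pins feeding one CLB pin.
    // Returns NaN if the pin maps to no atom pins.
    float worst_setup_slack(const std::vector<float>& atom_pin_slacks) {
        float worst = std::numeric_limits<float>::quiet_NaN();
        for (float slack : atom_pin_slacks) {
            worst = std::isnan(worst) ? slack : std::min(worst, slack);
        }
        return worst;
    }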