diff --git a/doc/src/vpr/command_line_usage.rst b/doc/src/vpr/command_line_usage.rst index 5b48457cfd6..94a47f3e6d9 100644 --- a/doc/src/vpr/command_line_usage.rst +++ b/doc/src/vpr/command_line_usage.rst @@ -717,6 +717,44 @@ If any of init_t, exit_t or alpha_t is specified, the user schedule, with a fixe **Default:** ``0.0`` +.. _dusty_sa_options: +Setting any of the following options selects `Dusty's annealing schedule <dusty_sa.html>`_. + +.. option:: --alpha_min + + The minimum (starting) update factor (alpha) used. + Ranges between 0 and alpha_max. + + **Default:** ``0.2`` + +.. option:: --alpha_max + + The maximum (stopping) update factor (alpha) used after which simulated annealing will complete. + Ranges between alpha_min and 1. + + **Default:** ``0.9`` + +.. option:: --alpha_decay + + The rate at which alpha will approach 1: ``alpha(n) = 1 - (1 - alpha(n-1)) * alpha_decay``. + Ranges between 0 and 1. + + **Default:** ``0.7`` + +.. option:: --anneal_success_min + + The minimum success ratio after which the temperature will reset to maintain the target success ratio. + Ranges between 0 and anneal_success_target. + + **Default:** ``0.1`` + +.. option:: --anneal_success_target + + The temperature after each reset is selected to keep this target success ratio. + Ranges between anneal_success_min and 1. + + **Default:** ``0.25`` + .. _timing_driven_placer_options: Timing-Driven Placer Options diff --git a/doc/src/vpr/dusty_sa.rst b/doc/src/vpr/dusty_sa.rst new file mode 100644 index 00000000000..5231c2dfd65 --- /dev/null +++ b/doc/src/vpr/dusty_sa.rst @@ -0,0 +1,22 @@ +Dusty's Simulated Annealing Schedule +==================================== + +This simulated annealing schedule is designed to quickly characterize the search space and maintain a target success ratio (the ratio of accepted moves). + +It starts at the minimum alpha (``--alpha_min``) to allow it to quickly find the target. + +For each alpha, the temperature decays by a factor of alpha after each outer loop iteration. 
+ +The last temperature at which the success ratio was still above the target (``--anneal_success_target``) is recorded; after hitting the minimum success ratio (``--anneal_success_min``), the temperature resets to a little before the recorded temperature (half a step hotter), and the alpha parameter itself moves toward 1 according to ``--alpha_decay``. + +The effect of this is many fast, but progressively slower, sweeps in temperature, focused where they can make the most effective progress. Unlike fixed and adaptive schedules that monotonically decrease temperature, this allows the global properties of the search space to affect the schedule. + +In addition, move_lim (which controls the number of iterations in the inner loop) is scaled with the target success ratio over the current success ratio, which reduces the time to reach the target ratio. + +The schedule terminates when the maximum alpha (``--alpha_max``) is reached. Termination is ensured by the narrowing range between the recorded upper temperature and the minimum success ratio, which will eventually cause alpha to reach its maximum. + +This algorithm was inspired by Lester Ingber's adaptive simulated annealing algorithm [ASA93]_. + +See ``update_annealing_state()`` in ``place.cpp`` for the algorithm details. + +.. [ASA93] Ingber, Lester. "Adaptive simulated annealing (ASA)." Global optimization C-code, Caltech Alumni Association, Pasadena, CA (1993). 
diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index 4ec08f21787..d3cf7d3f0c1 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -420,6 +420,31 @@ static void SetupAnnealSched(const t_options& Options, VPR_FATAL_ERROR(VPR_ERROR_OTHER, "inner_num must be greater than 0.\n"); } + AnnealSched->alpha_min = Options.PlaceAlphaMin; + if (AnnealSched->alpha_min >= 1 || AnnealSched->alpha_min <= 0) { + VPR_FATAL_ERROR(VPR_ERROR_OTHER, "alpha_min must be between 0 and 1 exclusive.\n"); + } + + AnnealSched->alpha_max = Options.PlaceAlphaMax; + if (AnnealSched->alpha_max >= 1 || AnnealSched->alpha_max <= AnnealSched->alpha_min) { + VPR_FATAL_ERROR(VPR_ERROR_OTHER, "alpha_max must be between alpha_min and 1 exclusive.\n"); + } + + AnnealSched->alpha_decay = Options.PlaceAlphaDecay; + if (AnnealSched->alpha_decay >= 1 || AnnealSched->alpha_decay <= 0) { + VPR_FATAL_ERROR(VPR_ERROR_OTHER, "alpha_decay must be between 0 and 1 exclusive.\n"); + } + + AnnealSched->success_min = Options.PlaceSuccessMin; + if (AnnealSched->success_min >= 1 || AnnealSched->success_min <= 0) { + VPR_FATAL_ERROR(VPR_ERROR_OTHER, "success_min must be between 0 and 1 exclusive.\n"); + } + + AnnealSched->success_target = Options.PlaceSuccessTarget; /* --anneal_success_min is documented to range up to the target, so enforce success_min < success_target here (mirrors the alpha_max check above). */ + if (AnnealSched->success_target >= 1 || AnnealSched->success_target <= AnnealSched->success_min) { + VPR_FATAL_ERROR(VPR_ERROR_OTHER, "success_target must be between success_min and 1 exclusive.\n"); + } + AnnealSched->type = Options.anneal_sched_type; } diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp index 263752ab99d..25462b4012b 100644 --- a/vpr/src/base/ShowSetup.cpp +++ b/vpr/src/base/ShowSetup.cpp @@ -155,6 +155,9 @@ static void ShowAnnealSched(const t_annealing_sched& AnnealSched) { case USER_SCHED: VTR_LOG("USER_SCHED\n"); break; + case DUSTY_SCHED: + VTR_LOG("DUSTY_SCHED\n"); + break; default: VTR_LOG_ERROR("Unknown annealing schedule\n"); } @@ -165,6 +168,12 @@ static void ShowAnnealSched(const t_annealing_sched& 
AnnealSched) { VTR_LOG("AnnealSched.init_t: %f\n", AnnealSched.init_t); VTR_LOG("AnnealSched.alpha_t: %f\n", AnnealSched.alpha_t); VTR_LOG("AnnealSched.exit_t: %f\n", AnnealSched.exit_t); + } else if (DUSTY_SCHED == AnnealSched.type) { + VTR_LOG("AnnealSched.alpha_min: %f\n", AnnealSched.alpha_min); + VTR_LOG("AnnealSched.alpha_max: %f\n", AnnealSched.alpha_max); + VTR_LOG("AnnealSched.alpha_decay: %f\n", AnnealSched.alpha_decay); + VTR_LOG("AnnealSched.success_min: %f\n", AnnealSched.success_min); + VTR_LOG("AnnealSched.success_target: %f\n", AnnealSched.success_target); } } diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index 13436b3f278..9a537706ce6 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -1566,6 +1566,36 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .default_value("0.8") .show_in(argparse::ShowIn::HELP_ONLY); + place_grp.add_argument(args.PlaceAlphaMin, "--alpha_min") + .help( + "For placement using Dusty's annealing schedule. Minimum (starting) value of alpha.") + .default_value("0.2") + .show_in(argparse::ShowIn::HELP_ONLY); + + place_grp.add_argument(args.PlaceAlphaMax, "--alpha_max") + .help( + "For placement using Dusty's annealing schedule. Maximum (stopping) value of alpha.") + .default_value("0.9") + .show_in(argparse::ShowIn::HELP_ONLY); + + place_grp.add_argument(args.PlaceAlphaDecay, "--alpha_decay") + .help( + "For placement using Dusty's annealing schedule. The value that alpha is scaled by after reset.") + .default_value("0.7") + .show_in(argparse::ShowIn::HELP_ONLY); + + place_grp.add_argument(args.PlaceSuccessMin, "--anneal_success_min") + .help( + "For placement using Dusty's annealing schedule. 
Minimum success ratio when annealing before resetting the temperature to maintain the target success ratio.") + .default_value("0.1") + .show_in(argparse::ShowIn::HELP_ONLY); + + place_grp.add_argument(args.PlaceSuccessTarget, "--anneal_success_target") + .help( + "For placement using Dusty's annealing schedule. Target success ratio when annealing.") + .default_value("0.25") + .show_in(argparse::ShowIn::HELP_ONLY); + place_grp.add_argument(args.pad_loc_file, "--fix_pins") .help( "Fixes I/O pad locations during placement. Valid options:\n" @@ -2192,13 +2222,19 @@ void set_conditional_defaults(t_options& args) { args.quench_recompute_divider.set(args.inner_loop_recompute_divider, Provenance::INFERRED); } - //Are we using the automatic, or user-specified annealing schedule? - if (args.PlaceInitT.provenance() == Provenance::SPECIFIED - || args.PlaceExitT.provenance() == Provenance::SPECIFIED - || args.PlaceAlphaT.provenance() == Provenance::SPECIFIED) { + //Which schedule? + if (args.PlaceAlphaMin.provenance() == Provenance::SPECIFIED // Any of these flags select Dusty's schedule + || args.PlaceAlphaMax.provenance() == Provenance::SPECIFIED + || args.PlaceAlphaDecay.provenance() == Provenance::SPECIFIED + || args.PlaceSuccessMin.provenance() == Provenance::SPECIFIED + || args.PlaceSuccessTarget.provenance() == Provenance::SPECIFIED) { + args.anneal_sched_type.set(DUSTY_SCHED, Provenance::INFERRED); + } else if (args.PlaceInitT.provenance() == Provenance::SPECIFIED // Any of these flags select a manual schedule + || args.PlaceExitT.provenance() == Provenance::SPECIFIED + || args.PlaceAlphaT.provenance() == Provenance::SPECIFIED) { args.anneal_sched_type.set(USER_SCHED, Provenance::INFERRED); } else { - args.anneal_sched_type.set(AUTO_SCHED, Provenance::INFERRED); + args.anneal_sched_type.set(AUTO_SCHED, Provenance::INFERRED); // Otherwise use the automatic schedule } //Are the pad locations specified? 
diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h index 520771188f5..8aaca2db130 100644 --- a/vpr/src/base/read_options.h +++ b/vpr/src/base/read_options.h @@ -97,6 +97,11 @@ struct t_options { argparse::ArgValue PlaceInitT; argparse::ArgValue PlaceExitT; argparse::ArgValue PlaceAlphaT; + argparse::ArgValue PlaceAlphaMin; + argparse::ArgValue PlaceAlphaMax; + argparse::ArgValue PlaceAlphaDecay; + argparse::ArgValue PlaceSuccessMin; + argparse::ArgValue PlaceSuccessTarget; argparse::ArgValue anneal_sched_type; argparse::ArgValue PlaceAlgorithm; argparse::ArgValue pad_loc_type; diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 6e5ace8535a..a97724548d5 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -486,6 +486,7 @@ enum class e_timing_update_type { /* Timing data structures end */ enum sched_type { AUTO_SCHED, + DUSTY_SCHED, USER_SCHED }; /* Annealing schedule */ @@ -806,6 +807,18 @@ struct t_annealing_sched { float init_t; float alpha_t; float exit_t; + + /* Parameters for DUSTY_SCHED * + * Alpha starts at alpha_min and steps toward 1 by `alpha_decay` at * + * each temperature restart; annealing ends at the first restart * + * after alpha exceeds alpha_max. A restart happens when the success * + * ratio drops below `success_min` (or t falls below the exit * + * temperature); the restart temperature tracks the last t whose * + * success ratio exceeded `success_target`. */ + float alpha_min; + float alpha_max; + float alpha_decay; + float success_min; + float success_target; }; /* Various options for the placer. * diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index ef6f0ba8c74..195088ba971 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -58,6 +58,10 @@ using std::min; * cost computation. 0.01 means that there is a 1% error tolerance. */ #define ERROR_TOL .01 +/* The final rlim (range limit) is 1, which is the smallest value that can * + * still make progress, since an rlim of 0 wouldn't allow any swaps. 
*/ +#define FINAL_RLIM 1 + /* This defines the maximum number of swap attempts before invoking the * * once-in-a-while placement legality check as well as floating point * * variables round-offs check. */ @@ -103,6 +107,18 @@ struct t_placer_prev_inverse_costs { double timing_cost; }; +// Used by update_annealing_state() +struct t_annealing_state { + float t; // Temperature + float rlim; // Range limit for swaps + float inverse_delta_rlim; // used to calculate crit_exponent + float alpha; // Temperature decays by this factor each outer iteration + float restart_t; // Temperature used after restart due to minimum success ratio + float crit_exponent; // Used by timing-driven placement to "sharpen" timing criticality + int move_lim_max; // Maximum move limit + int move_lim; // Current move limit +}; + constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); constexpr double MAX_INV_TIMING_COST = 1.e9; @@ -344,12 +360,14 @@ static float starting_t(t_placer_costs* costs, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts); -static void update_t(float* t, float rlim, float success_rat, t_annealing_sched annealing_sched); +static bool update_annealing_state(t_annealing_state* state, + float success_rat, + const t_placer_costs& costs, + const t_placer_opts& placer_opts, + const t_annealing_sched& annealing_sched); static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid); -static int exit_crit(float t, float cost, t_annealing_sched annealing_sched); - static int count_connections(); static double get_std_dev(int n, double sum_x_squared, double av_x); @@ -465,7 +483,7 @@ static void print_place_status_header(); static void print_place_status(const size_t num_temps, const float elapsed_sec, const float t, - const float oldt, + const float alpha, const t_placer_statistics& stats, const float cpd, const float sTNS, @@ -477,6 +495,8 @@ static void print_place_status(const size_t num_temps, size_t tot_moves); static void 
print_resources_utilization(); +static void init_annealing_state(t_annealing_state* state, const t_annealing_sched& annealing_sched, float t, float rlim, int move_lim_max, float crit_exponent); + /*****************************************************************************/ void try_place(const t_placer_opts& placer_opts, t_annealing_sched annealing_sched, @@ -498,11 +518,9 @@ void try_place(const t_placer_opts& placer_opts, auto& timing_ctx = g_vpr_ctx.timing(); auto pre_place_timing_stats = timing_ctx.stats; - int tot_iter, move_lim = 0, moves_since_cost_recompute, width_fac, num_connections, - outer_crit_iter_count, inner_recompute_limit; - float t, success_rat, rlim, - oldt = 0, crit_exponent, - first_rlim, final_rlim, inverse_delta_rlim; + int tot_iter, moves_since_cost_recompute, width_fac, num_connections, + outer_crit_iter_count, inner_recompute_limit; + float success_rat, first_crit_exponent, first_rlim; t_placer_costs costs; t_placer_prev_inverse_costs prev_inverse_costs; @@ -567,7 +585,7 @@ void try_place(const t_placer_opts& placer_opts, if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { costs.bb_cost = comp_bb_cost(NORMAL); - crit_exponent = placer_opts.td_place_exp_first; /*this will be modified when rlim starts to change */ + first_crit_exponent = placer_opts.td_place_exp_first; /*this will be modified when rlim starts to change */ num_connections = count_connections(); VTR_LOG("\n"); @@ -594,7 +612,7 @@ void try_place(const t_placer_opts& placer_opts, atom_ctx.lookup, *timing_info->timing_graph()); //Update timing and costs - recompute_criticalities(crit_exponent, + recompute_criticalities(first_crit_exponent, place_delay_model.get(), placer_criticalities.get(), pin_timing_invalidator.get(), @@ -627,7 +645,7 @@ void try_place(const t_placer_opts& placer_opts, costs.timing_cost = 0; outer_crit_iter_count = 0; num_connections = 0; - crit_exponent = 0; + first_crit_exponent = 0; prev_inverse_costs.timing_cost = 0; /*inverses not used */ 
prev_inverse_costs.bb_cost = 0; @@ -669,6 +687,7 @@ void try_place(const t_placer_opts& placer_opts, print_place(nullptr, nullptr, filename.c_str()); } + int move_lim = 1; if (placer_opts.effort_scaling == e_place_effort_scaling::CIRCUIT) { //This scales the move limit proportional to num_blocks ^ (4/3) move_lim = (int)(annealing_sched.inner_num * pow(cluster_ctx.clb_nlist.blocks().size(), 1.3333)); @@ -711,24 +730,20 @@ void try_place(const t_placer_opts& placer_opts, quench_recompute_limit = move_lim + 1; } - rlim = (float)max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); + first_rlim = (float)max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); - first_rlim = rlim; /*used in timing-driven placement for exponent computation */ - final_rlim = 1; - inverse_delta_rlim = 1 / (first_rlim - final_rlim); + float first_t = starting_t(&costs, &prev_inverse_costs, + annealing_sched, move_lim, first_rlim, + place_delay_model.get(), + placer_criticalities.get(), + timing_info.get(), + *move_generator, + pin_timing_invalidator.get(), + blocks_affected, + placer_opts); - t = starting_t(&costs, - &prev_inverse_costs, - annealing_sched, - move_lim, - rlim, - place_delay_model.get(), - placer_criticalities.get(), - timing_info.get(), - *move_generator, - pin_timing_invalidator.get(), - blocks_affected, - placer_opts); + t_annealing_state state; + init_annealing_state(&state, annealing_sched, first_t, first_rlim, move_lim, first_crit_exponent); if (!placer_opts.move_stats_file.empty()) { f_move_stats_file = std::unique_ptr(vtr::fopen(placer_opts.move_stats_file.c_str(), "w"), vtr::fclose); @@ -743,8 +758,8 @@ void try_place(const t_placer_opts& placer_opts, VTR_LOG("\n"); print_place_status_header(); - /* Outer loop of the simmulated annealing begins */ - while (exit_crit(t, costs.cost, annealing_sched) == 0) { + /* Outer loop of the simulated annealing begins */ + do { vtr::Timer temperature_timer; if (placer_opts.place_algorithm == 
PATH_TIMING_DRIVEN_PLACE) { costs.cost = 1; @@ -752,15 +767,15 @@ void try_place(const t_placer_opts& placer_opts, outer_loop_recompute_criticalities(placer_opts, &costs, &prev_inverse_costs, num_connections, - crit_exponent, + state.crit_exponent, &outer_crit_iter_count, place_delay_model.get(), placer_criticalities.get(), pin_timing_invalidator.get(), timing_info.get()); - placement_inner_loop(t, num_temps, rlim, placer_opts, - move_lim, crit_exponent, inner_recompute_limit, &stats, + placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, + state.move_lim, state.crit_exponent, inner_recompute_limit, &stats, &costs, &prev_inverse_costs, &moves_since_cost_recompute, @@ -771,12 +786,10 @@ void try_place(const t_placer_opts& placer_opts, blocks_affected, timing_info.get()); - tot_iter += move_lim; + tot_iter += state.move_lim; - calc_placer_stats(stats, success_rat, std_dev, costs, move_lim); + calc_placer_stats(stats, success_rat, std_dev, costs, state.move_lim); - oldt = t; /* for finding and printing alpha. 
*/ - update_t(&t, rlim, success_rat, annealing_sched); ++num_temps; if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { @@ -787,28 +800,22 @@ void try_place(const t_placer_opts& placer_opts, print_place_status(num_temps, temperature_timer.elapsed_sec(), - t, oldt, + state.t, state.alpha, stats, critical_path.delay(), sTNS, sWNS, - success_rat, std_dev, rlim, crit_exponent, tot_iter); + success_rat, std_dev, state.rlim, state.crit_exponent, tot_iter); sprintf(msg, "Cost: %g BB Cost %g TD Cost %g Temperature: %g", - costs.cost, costs.bb_cost, costs.timing_cost, t); + costs.cost, costs.bb_cost, costs.timing_cost, state.t); update_screen(ScreenUpdatePriority::MINOR, msg, PLACEMENT, timing_info); - update_rlim(&rlim, success_rat, device_ctx.grid); - - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - crit_exponent = (1 - (rlim - final_rlim) * inverse_delta_rlim) - * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) - + placer_opts.td_place_exp_first; - } #ifdef VERBOSE if (getEchoEnabled()) { print_clb_placement("first_iteration_clb_placement.echo"); } #endif - } /* Outer loop of the simmulated annealing ends */ + } while (update_annealing_state(&state, success_rat, costs, placer_opts, annealing_sched)); + /* Outer loop of the simulated annealing ends */ auto pre_quench_timing_stats = timing_ctx.stats; { /* Quench */ @@ -817,19 +824,19 @@ void try_place(const t_placer_opts& placer_opts, outer_loop_recompute_criticalities(placer_opts, &costs, &prev_inverse_costs, num_connections, - crit_exponent, + state.crit_exponent, &outer_crit_iter_count, place_delay_model.get(), placer_criticalities.get(), pin_timing_invalidator.get(), timing_info.get()); - t = 0; /* freeze out */ + state.t = 0; /* freeze out */ /* Run inner loop again with temperature = 0 so as to accept only swaps * which reduce the cost of the placement */ - placement_inner_loop(t, num_temps, rlim, placer_opts, - move_lim, crit_exponent, quench_recompute_limit, &stats, + 
placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, + move_lim, state.crit_exponent, quench_recompute_limit, &stats, &costs, &prev_inverse_costs, &moves_since_cost_recompute, @@ -839,7 +846,6 @@ void try_place(const t_placer_opts& placer_opts, *move_generator, blocks_affected, timing_info.get()); - oldt = t; tot_iter += move_lim; ++num_temps; @@ -855,9 +861,9 @@ void try_place(const t_placer_opts& placer_opts, float quench_elapsed_sec = temperature_timer.elapsed_sec(); print_place_status(num_temps, quench_elapsed_sec, - t, oldt, stats, + state.t, state.alpha, stats, critical_path.delay(), sTNS, sWNS, - success_rat, std_dev, rlim, crit_exponent, tot_iter); + success_rat, std_dev, state.rlim, state.crit_exponent, tot_iter); } auto post_quench_timing_stats = timing_ctx.stats; @@ -888,7 +894,7 @@ void try_place(const t_placer_opts& placer_opts, VTR_ASSERT(timing_info); //Update timing and costs - recompute_criticalities(crit_exponent, + recompute_criticalities(state.crit_exponent, place_delay_model.get(), placer_criticalities.get(), pin_timing_invalidator.get(), @@ -1200,49 +1206,70 @@ static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) *rlim = max(*rlim, (float)1.); } -/* Update the temperature according to the annealing schedule selected. */ -static void update_t(float* t, float rlim, float success_rat, t_annealing_sched annealing_sched) { - /* float fac; */ - +/* Update the annealing state according to the annealing schedule selected. + * USER_SCHED: A manual fixed schedule with fixed alpha and exit criteria. + * AUTO_SCHED: A more sophisticated schedule where alpha varies based on success ratio. + * DUSTY_SCHED: This schedule jumps backward and slows down in response to success ratio. + * See doc/src/vpr/dusty_sa.rst for more details. + * + * Returns true until the schedule is finished. 
*/ +static bool update_annealing_state(t_annealing_state* state, + float success_rat, + const t_placer_costs& costs, + const t_placer_opts& placer_opts, + const t_annealing_sched& annealing_sched) { + /* Return `false` when the exit criterion is met. */ if (annealing_sched.type == USER_SCHED) { - *t = annealing_sched.alpha_t * (*t); - } else { /* AUTO_SCHED */ - if (success_rat > 0.96) { - *t = (*t) * 0.5; - } else if (success_rat > 0.8) { - *t = (*t) * 0.9; - } else if (success_rat > 0.15 || rlim > 1.) { - *t = (*t) * 0.95; - } else { - *t = (*t) * 0.8; - } + state->t *= annealing_sched.alpha_t; /* NOTE(review): state->alpha is not set to alpha_t on this path, so the alpha passed to print_place_status() stays at the value assigned in init_annealing_state() (alpha_min) -- confirm whether the status line should report alpha_t. */ + return state->t >= annealing_sched.exit_t; /* NOTE(review): this early return skips the update_rlim()/crit_exponent update at the end of this function, which the removed outer-loop code ran for every schedule type -- confirm this is intended for USER_SCHED. */ } -} -static int exit_crit(float t, float cost, t_annealing_sched annealing_sched) { - /* Return 1 when the exit criterion is met. */ + auto& device_ctx = g_vpr_ctx.device(); + auto& cluster_ctx = g_vpr_ctx.clustering(); - if (annealing_sched.type == USER_SCHED) { - if (t < annealing_sched.exit_t) { - return (1); + /* Automatic annealing schedule */ + float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size(); + + if (annealing_sched.type == DUSTY_SCHED) { + bool restart_temp = state->t < t_exit || std::isnan(t_exit); //May get nan if there are no nets + if (success_rat < annealing_sched.success_min || restart_temp) { + if (state->alpha > annealing_sched.alpha_max) return false; + state->t = state->restart_t / sqrt(state->alpha); // Take a half step from the restart temperature. 
+ state->alpha = 1.0 - ((1.0 - state->alpha) * annealing_sched.alpha_decay); } else { - return (0); + if (success_rat > annealing_sched.success_target) { + state->restart_t = state->t; + } + state->t *= state->alpha; } - } + state->move_lim = std::max(1, (int)std::min((double)state->move_lim_max, (double)state->move_lim_max * (annealing_sched.success_target / success_rat))); /* Scale the move limit by target/current success ratio, clamped to [1, move_lim_max]. The clamp is done in floating point before the int cast: success_rat == 0 makes the ratio +inf, and converting an out-of-range value to int is undefined behavior. */ + } else { /* annealing_sched.type == AUTO_SCHED */ + if (success_rat > 0.96) { + state->alpha = 0.5; + } else if (success_rat > 0.8) { + state->alpha = 0.9; + } else if (success_rat > 0.15 || state->rlim > 1.) { + state->alpha = 0.95; + } else { + state->alpha = 0.8; + } + state->t *= state->alpha; - auto& cluster_ctx = g_vpr_ctx.clustering(); + // Must be duplicated to retain previous behavior + if (state->t < t_exit || std::isnan(t_exit)) return false; + } - /* Automatic annealing schedule */ - float t_exit = 0.005 * cost / cluster_ctx.clb_nlist.nets().size(); + // Gradually changes from the initial crit_exponent to the final crit_exponent based on how much the range limit has shrunk. + // The idea is that as the range limit shrinks (indicating we are fine-tuning a more optimized placement) we can focus more on a smaller number of critical connections, which a higher crit_exponent achieves. 
+ update_rlim(&state->rlim, success_rat, device_ctx.grid); - if (t < t_exit) { - return (1); - } else if (std::isnan(t_exit)) { - //May get nan if there are no nets - return (1); - } else { - return (0); + if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + state->crit_exponent = (1 - (state->rlim - FINAL_RLIM) * state->inverse_delta_rlim) + * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) + + placer_opts.td_place_exp_first; } + + return true; } static float starting_t(t_placer_costs* costs, @@ -2889,7 +2916,7 @@ static void print_place_status_header() { static void print_place_status(const size_t num_temps, const float elapsed_sec, const float t, - const float oldt, + const float alpha, const t_placer_statistics& stats, const float cpd, const float sTNS, @@ -2908,19 +2935,13 @@ static void print_place_status(const size_t num_temps, "%7.3f %7.4f %6.1f %8.2f", num_temps, elapsed_sec, - oldt, + t, stats.av_cost, stats.av_bb_cost, stats.av_timing_cost, 1e9 * cpd, 1e9 * sTNS, 1e9 * sWNS, acc_rate, std_dev, rlim, crit_exponent); pretty_print_uint(" ", tot_moves, 9, 3); - float alpha; - if (oldt == 0.) 
{ - alpha = 0.; - } else { - alpha = t / oldt; - } VTR_LOG(" %6.3f\n", alpha); fflush(stdout); } @@ -2960,6 +2981,26 @@ static void print_resources_utilization() { VTR_LOG("\n"); } +static void init_annealing_state(t_annealing_state* state, + const t_annealing_sched& annealing_sched, + float t, + float rlim, + int move_lim_max, + float crit_exponent) { + state->alpha = annealing_sched.alpha_min; + state->t = t; + state->restart_t = t; + state->rlim = rlim; + state->inverse_delta_rlim = 1 / (rlim - FINAL_RLIM); + state->move_lim_max = std::max(1, move_lim_max); + if (annealing_sched.type == DUSTY_SCHED) { + state->move_lim = std::max(1, (int)(state->move_lim_max * annealing_sched.success_target)); + } else { + state->move_lim = state->move_lim_max; + } + state->crit_exponent = crit_exponent; +} + bool placer_needs_lookahead(const t_vpr_setup& vpr_setup) { return (vpr_setup.PlacerOpts.place_algorithm == PATH_TIMING_DRIVEN_PLACE); } diff --git a/vtr_flow/parse/pass_requirements/common/pass_requirements.vpr_route_min_chan_width.txt b/vtr_flow/parse/pass_requirements/common/pass_requirements.vpr_route_min_chan_width.txt index 0dd1d66a33b..8d5c033fc16 100644 --- a/vtr_flow/parse/pass_requirements/common/pass_requirements.vpr_route_min_chan_width.txt +++ b/vtr_flow/parse/pass_requirements/common/pass_requirements.vpr_route_min_chan_width.txt @@ -1,7 +1,7 @@ #VPR metrics at minimum channel width #Routing Metrics -min_chan_width;Range(0.80,1.30) +min_chan_width;Range(0.25,1.30) routed_wirelength;RangeAbs(0.60,1.50,5) #Area metrics diff --git a/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_pack_place.txt b/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_pack_place.txt index fef86e396d1..424e1890153 100644 --- a/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_pack_place.txt +++ b/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_pack_place.txt @@ -2,6 +2,6 @@ %include "../common/pass_requirements.vpr_pack_place.txt" 
#Timing metrics -placed_CPD_est;Range(0.80,1.40) -placed_setup_TNS_est;Range(0.80,1.40) -placed_setup_WNS_est;Range(0.80,1.40) +placed_CPD_est;Range(0.50,1.40) +placed_setup_TNS_est;Range(0.50,1.40) +placed_setup_WNS_est;Range(0.50,1.40) diff --git a/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_route_fixed_chan_width.txt b/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_route_fixed_chan_width.txt index 4c9000ccf98..029e013d010 100644 --- a/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_route_fixed_chan_width.txt +++ b/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_route_fixed_chan_width.txt @@ -2,9 +2,9 @@ %include "../common/pass_requirements.vpr_route_fixed_chan_width.txt" #Timing metrics -critical_path_delay;Range(0.80,1.40) -geomean_nonvirtual_intradomain_critical_path_delay;Range(0.80,1.40) -setup_TNS;Range(0.80,1.40) -setup_WNS;Range(0.80,1.40) +critical_path_delay;Range(0.50,1.40) +geomean_nonvirtual_intradomain_critical_path_delay;Range(0.50,1.40) +setup_TNS;Range(0.50,1.40) +setup_WNS;Range(0.50,1.40) #hold_TNS;Range(0.05,20.00) #hold_WNS;Range(0.05,20.00) diff --git a/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_route_relaxed_chan_width.txt b/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_route_relaxed_chan_width.txt index e9e5bd04b04..d4b3903dc01 100644 --- a/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_route_relaxed_chan_width.txt +++ b/vtr_flow/parse/pass_requirements/timing/pass_requirements.vpr_route_relaxed_chan_width.txt @@ -11,9 +11,9 @@ crit_path_routing_area_per_tile;Range(0.8,1.3) #Run-time Metrics crit_path_route_time;RangeAbs(0.10,10.0,2) #Timing Metrics -critical_path_delay;Range(0.80,1.40) -geomean_nonvirtual_intradomain_critical_path_delay;Range(0.80,1.40) -setup_TNS;Range(0.80,1.40) -setup_WNS;Range(0.80,1.40) +critical_path_delay;Range(0.50,1.40) +geomean_nonvirtual_intradomain_critical_path_delay;Range(0.50,1.40) 
+setup_TNS;Range(0.50,1.40) +setup_WNS;Range(0.50,1.40) #hold_TNS;Range(0.05,20.00) #hold_WNS;Range(0.05,20.00) diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly/titan_quick_qor/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly/titan_quick_qor/config/config.txt index 41c81f248e2..2e4c7d43020 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_nightly/titan_quick_qor/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_nightly/titan_quick_qor/config/config.txt @@ -66,4 +66,4 @@ pass_requirements_file=pass_requirements_vpr_titan.txt #A large number of routing iterations is set to ensure the router doesn't give up to easily on the larger benchmarks #To be more run-time comparable to commercial tools like Quartus, we run with higher placer effort (inner_num=2) and lower astar_fac (1.0) #Set a 24hr timeout so they don't run forever -script_params=-starting_stage vpr --route_chan_width 300 --max_router_iterations 400 --router_lookahead map -timeout 86400 +script_params=-starting_stage vpr --route_chan_width 300 --max_router_iterations 400 --router_lookahead map -timeout 86400 --seed 197 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly/vtr_bidir/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly/vtr_bidir/config/config.txt index 4268abaa631..704d5d7b142 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_nightly/vtr_bidir/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_nightly/vtr_bidir/config/config.txt @@ -47,5 +47,5 @@ pass_requirements_file=pass_requirements.txt #We increase the critical path router iterations beyond the default 50, to avoid #spurrious routing failures at relaxed channel width (since we know they should #be routable via the minimum channel width search) -script_params=-starting_stage vpr -track_memory_usage -crit_path_router_iterations 60 +script_params=-starting_stage vpr -track_memory_usage -crit_path_router_iterations 60 --seed 250