diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 9991523ca79..82eceb426dd 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -63,10 +63,6 @@ using std::min; * cost computation. 0.01 means that there is a 1% error tolerance. */ #define ERROR_TOL .01 -/* The final rlim (range limit) is 1, which is the smallest value that can * - * still make progress, since an rlim of 0 wouldn't allow any swaps. */ -#define FINAL_RLIM 1 - /* This defines the maximum number of swap attempts before invoking the * * once-in-a-while placement legality check as well as floating point * * variables round-offs check. */ @@ -89,19 +85,8 @@ enum e_cost_methods { CHECK }; -struct t_placer_statistics { - double av_cost, av_bb_cost, av_timing_cost, - sum_of_squares; - int success_sum; -}; - constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); - -constexpr double MAX_INV_TIMING_COST = 1.e9; -/* Stops inverse timing cost from going to infinity with very lax timing constraints, - * which avoids multiplying by a gigantic prev_inverse.timing_cost when auto-normalizing. - * The exact value of this cost has relatively little impact, but should not be - * large enough to be on the order of timing costs for normal constraints. 
*/ +constexpr float INVALID_COST = std::numeric_limits::quiet_NaN(); /********************** Variables local to place.c ***************************/ @@ -262,11 +247,8 @@ static double comp_bb_cost(e_cost_methods method); static void update_move_nets(int num_nets_affected); static void reset_move_nets(int num_nets_affected); -static e_move_result try_swap(float t, - float crit_exponent, +static e_move_result try_swap(const t_annealing_state* state, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - float rlim, MoveGenerator& move_generator, SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, @@ -291,12 +273,9 @@ static int check_placement_consistency(); static int check_block_placement_consistency(); static int check_macro_placement_consistency(); -static float starting_t(float crit_exponent, +static float starting_t(const t_annealing_state* state, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, t_annealing_sched annealing_sched, - int max_moves, - float rlim, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, @@ -306,18 +285,8 @@ static float starting_t(float crit_exponent, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts); -static bool update_annealing_state(t_annealing_state* state, - float success_rat, - const t_placer_costs& costs, - const t_placer_opts& placer_opts, - const t_annealing_sched& annealing_sched); - -static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid); - static int count_connections(); -static double get_std_dev(int n, double sum_x_squared, double av_x); - static double recompute_bb_cost(); static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); @@ -369,7 +338,6 @@ static void free_try_swap_arrays(); static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, t_placer_costs* costs, - t_placer_prev_inverse_costs* 
prev_inverse_costs, int num_connections, float crit_exponent, int* outer_crit_iter_count, @@ -379,16 +347,11 @@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, ClusteredPinTimingInvalidator* pin_timing_invalidator, SetupTimingInfo* timing_info); -static void placement_inner_loop(float t, - int temp_num, - float rlim, +static void placement_inner_loop(const t_annealing_state* state, const t_placer_opts& placer_opts, - int move_lim, - float crit_exponent, int inner_recompute_limit, t_placer_statistics* stats, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, int* moves_since_cost_recompute, ClusteredPinTimingInvalidator* pin_timing_invalidator, const PlaceDelayModel* delay_model, @@ -404,32 +367,21 @@ static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, const PlacerCriticalities* criticalities, t_placer_costs* costs); -static void calc_placer_stats(t_placer_statistics& stats, float& success_rat, double& std_dev, const t_placer_costs& costs, const int move_lim); - static void generate_post_place_timing_reports(const t_placer_opts& placer_opts, const t_analysis_opts& analysis_opts, const SetupTimingInfo& timing_info, const PlacementDelayCalculator& delay_calc); static void print_place_status_header(); -static void print_place_status(const size_t num_temps, - const float elapsed_sec, - const float t, - const float alpha, +static void print_place_status(const t_annealing_state& state, const t_placer_statistics& stats, - const float cpd, - const float sTNS, - const float sWNS, - const float acc_rate, - const float std_dev, - const float rlim, - const float crit_exponent, + float elapsed_sec, + float cpd, + float sTNS, + float sWNS, size_t tot_moves); static void print_resources_utilization(); -void transform_blocks_affected(t_pl_blocks_to_be_moved blocksAffected); -static void init_annealing_state(t_annealing_state* state, const t_annealing_sched& annealing_sched, float t, float rlim, int 
move_lim_max, float crit_exponent); - /*****************************************************************************/ void try_place(const t_placer_opts& placer_opts, t_annealing_sched annealing_sched, @@ -456,16 +408,15 @@ void try_place(const t_placer_opts& placer_opts, int tot_iter, moves_since_cost_recompute, width_fac, num_connections, outer_crit_iter_count, inner_recompute_limit; - float success_rat, first_crit_exponent, first_rlim; + float first_crit_exponent, first_rlim, first_t; + int first_move_lim; - t_placer_costs costs; - t_placer_prev_inverse_costs prev_inverse_costs; + t_placer_costs costs(placer_opts.place_algorithm); tatum::TimingPathInfo critical_path; float sTNS = NAN; float sWNS = NAN; - double std_dev; char msg[vtr::bufsize]; t_placer_statistics stats; @@ -573,7 +524,7 @@ void try_place(const t_placer_opts& placer_opts, critical_path = timing_info->least_slack_critical_path(); - //Write out the initial timing echo file + /* Write out the initial timing echo file */ if (isEchoFileEnabled(E_ECHO_INITIAL_PLACEMENT_TIMING_GRAPH)) { tatum::write_echo(getEchoFileName(E_ECHO_INITIAL_PLACEMENT_TIMING_GRAPH), *timing_ctx.graph, *timing_ctx.constraints, *placement_delay_calc, timing_info->analyzer()); @@ -585,20 +536,27 @@ void try_place(const t_placer_opts& placer_opts, outer_crit_iter_count = 1; - prev_inverse_costs.timing_cost = 1 / costs.timing_cost; - prev_inverse_costs.bb_cost = 1 / costs.bb_cost; - costs.cost = 1; /*our new cost function uses normalized values of */ - /*bb_cost and timing_cost, the value of cost will be reset */ - /*to 1 at each temperature when *_TIMING_DRIVEN_PLACE is true */ - } else { /*BOUNDING_BOX_PLACE */ - costs.cost = costs.bb_cost = comp_bb_cost(NORMAL); - costs.timing_cost = 0; + /* Initialize the normalization factors. 
Calling costs.update_norm_factors() * + * here would fail the golden results of strong_sdc benchmark */ + costs.timing_cost_norm = 1 / costs.timing_cost; + costs.bb_cost_norm = 1 / costs.bb_cost; + costs.cost = 1; + } else { + VTR_ASSERT(placer_opts.place_algorithm == BOUNDING_BOX_PLACE); + + /* Total cost is the same as wirelength cost */ + costs.bb_cost = comp_bb_cost(NORMAL); + costs.cost = costs.bb_cost; + + /* Timing cost and normalization factors are not used */ + costs.timing_cost = INVALID_COST; + costs.timing_cost_norm = INVALID_COST; + costs.bb_cost_norm = INVALID_COST; + + /* Other initializations */ outer_crit_iter_count = 0; num_connections = 0; first_crit_exponent = 0; - - prev_inverse_costs.timing_cost = 0; /*inverses not used */ - prev_inverse_costs.bb_cost = 0; } //Sanity check that initial placement is legal @@ -637,65 +595,44 @@ void try_place(const t_placer_opts& placer_opts, print_place(nullptr, nullptr, filename.c_str()); } - int move_lim = 1; - if (placer_opts.effort_scaling == e_place_effort_scaling::CIRCUIT) { - //This scales the move limit proportional to num_blocks ^ (4/3) - move_lim = (int)(annealing_sched.inner_num * pow(cluster_ctx.clb_nlist.blocks().size(), 1.3333)); - } else if (placer_opts.effort_scaling == e_place_effort_scaling::DEVICE_CIRCUIT) { - //This scales the move limit proportional to device_size ^ (2/3) * num_blocks ^ (2/3) - // - //For highly utilized devices (device_size ~ num_blocks) this is the same as - //num_blocks ^ (4/3). - // - //For low utilization devices (device_size >> num_blocks) this performs more - //moves (device_size ^ (2/3)) to ensure better optimization. In this case, - //more moves than num_blocks ^ (4/3) may be required, since the search space - //is larger. - float device_size = device_ctx.grid.width() * device_ctx.grid.height(); - move_lim = (int)(annealing_sched.inner_num * pow(device_size, 2. / 3.) * pow(cluster_ctx.clb_nlist.blocks().size(), 2. 
/ 3.)); - } else { - VPR_ERROR(VPR_ERROR_PLACE, "Unrecognized placer effort scaling"); - } - VTR_LOG("Moves per temperature: %d\n", move_lim); - - /* Sometimes I want to run the router with a random placement. Avoid * - * using 0 moves to stop division by 0 and 0 length vector problems, * - * by setting move_lim to 1 (which is still too small to do any * - * significant optimization). */ - if (move_lim <= 0) - move_lim = 1; + first_move_lim = get_initial_move_lim(placer_opts, annealing_sched); if (placer_opts.inner_loop_recompute_divider != 0) { - inner_recompute_limit = (int)(0.5 + (float)move_lim / (float)placer_opts.inner_loop_recompute_divider); + inner_recompute_limit = (int)(0.5 + (float)first_move_lim / (float)placer_opts.inner_loop_recompute_divider); } else { /*don't do an inner recompute */ - inner_recompute_limit = move_lim + 1; + inner_recompute_limit = first_move_lim + 1; } int quench_recompute_limit; if (placer_opts.quench_recompute_divider != 0) { - quench_recompute_limit = (int)(0.5 + (float)move_lim / (float)placer_opts.quench_recompute_divider); + quench_recompute_limit = (int)(0.5 + (float)first_move_lim / (float)placer_opts.quench_recompute_divider); } else { /*don't do an quench recompute */ - quench_recompute_limit = move_lim + 1; + quench_recompute_limit = first_move_lim + 1; } + /* Get the first range limiter */ first_rlim = (float)max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); - float first_t = starting_t(first_crit_exponent, - &costs, &prev_inverse_costs, - annealing_sched, move_lim, first_rlim, - place_delay_model.get(), - placer_criticalities.get(), - placer_setup_slacks.get(), - timing_info.get(), - *move_generator, - pin_timing_invalidator.get(), - blocks_affected, - placer_opts); - - t_annealing_state state; - init_annealing_state(&state, annealing_sched, first_t, first_rlim, move_lim, first_crit_exponent); + /* Set the temperature high so essentially all swaps will be accepted */ + /* when trying to determine the 
starting temp for placement inner loop. */ + first_t = HUGE_POSITIVE_FLOAT; + + t_annealing_state state(annealing_sched, first_t, first_rlim, first_move_lim, first_crit_exponent); + + /* Update the starting temperature for placement annealing to a more appropriate value */ + state.t = starting_t(&state, + &costs, + annealing_sched, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + timing_info.get(), + *move_generator, + pin_timing_invalidator.get(), + blocks_affected, + placer_opts); if (!placer_opts.move_stats_file.empty()) { f_move_stats_file = std::unique_ptr(vtr::fopen(placer_opts.move_stats_file.c_str(), "w"), vtr::fclose); @@ -704,7 +641,6 @@ void try_place(const t_placer_opts& placer_opts, tot_iter = 0; moves_since_cost_recompute = 0; - int num_temps = 0; #ifdef ENABLE_ANALYTIC_PLACE // Analytic placer: When enabled, skip most of the annealing and go straight to quench @@ -720,12 +656,9 @@ void try_place(const t_placer_opts& placer_opts, /* Outer loop of the simulated annealing begins */ do { vtr::Timer temperature_timer; - if (placer_opts.place_algorithm.is_timing_driven()) { - costs.cost = 1; - } outer_loop_update_timing_info(placer_opts, - &costs, &prev_inverse_costs, + &costs, num_connections, state.crit_exponent, &outer_crit_iter_count, @@ -735,9 +668,9 @@ void try_place(const t_placer_opts& placer_opts, pin_timing_invalidator.get(), timing_info.get()); - placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, - state.move_lim, state.crit_exponent, inner_recompute_limit, &stats, - &costs, &prev_inverse_costs, + placement_inner_loop(&state, placer_opts, + inner_recompute_limit, &stats, + &costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), place_delay_model.get(), @@ -749,10 +682,7 @@ void try_place(const t_placer_opts& placer_opts, placer_opts.place_algorithm); tot_iter += state.move_lim; - - calc_placer_stats(stats, success_rat, std_dev, costs, state.move_lim); - - ++num_temps; + 
++state.num_temps; if (placer_opts.place_algorithm.is_timing_driven()) { critical_path = timing_info->least_slack_critical_path(); @@ -760,12 +690,7 @@ void try_place(const t_placer_opts& placer_opts, sWNS = timing_info->setup_worst_negative_slack(); } - print_place_status(num_temps, - temperature_timer.elapsed_sec(), - state.t, state.alpha, - stats, - critical_path.delay(), sTNS, sWNS, - success_rat, std_dev, state.rlim, state.crit_exponent, tot_iter); + print_place_status(state, stats, temperature_timer.elapsed_sec(), critical_path.delay(), sTNS, sWNS, tot_iter); sprintf(msg, "Cost: %g BB Cost %g TD Cost %g Temperature: %g", costs.cost, costs.bb_cost, costs.timing_cost, state.t); @@ -776,7 +701,7 @@ void try_place(const t_placer_opts& placer_opts, print_clb_placement("first_iteration_clb_placement.echo"); } #endif - } while (update_annealing_state(&state, success_rat, costs, placer_opts, annealing_sched)); + } while (state.outer_loop_update(stats.success_rate, costs, placer_opts, annealing_sched)); /* Outer loop of the simmulated annealing ends */ #ifdef ENABLE_ANALYTIC_PLACE @@ -784,12 +709,16 @@ void try_place(const t_placer_opts& placer_opts, quench: #endif /* ENABLE_ANALYTIC_PLACE */ + /* Start Quench */ + state.t = 0; //Freeze out: only accept solutions that improve placement. + state.move_lim = state.move_lim_max; //Revert the move limit to initial value. 
+ auto pre_quench_timing_stats = timing_ctx.stats; { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); outer_loop_update_timing_info(placer_opts, - &costs, &prev_inverse_costs, + &costs, num_connections, state.crit_exponent, &outer_crit_iter_count, @@ -799,13 +728,11 @@ void try_place(const t_placer_opts& placer_opts, pin_timing_invalidator.get(), timing_info.get()); - state.t = 0; /* freeze out */ - /* Run inner loop again with temperature = 0 so as to accept only swaps * which reduce the cost of the placement */ - placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, - move_lim, state.crit_exponent, quench_recompute_limit, &stats, - &costs, &prev_inverse_costs, + placement_inner_loop(&state, placer_opts, + quench_recompute_limit, &stats, + &costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), place_delay_model.get(), @@ -816,10 +743,8 @@ void try_place(const t_placer_opts& placer_opts, timing_info.get(), placer_opts.place_quench_algorithm); - tot_iter += move_lim; - ++num_temps; - - calc_placer_stats(stats, success_rat, std_dev, costs, move_lim); + tot_iter += state.move_lim; + ++state.num_temps; if (placer_opts.place_quench_algorithm.is_timing_driven()) { critical_path = timing_info->least_slack_critical_path(); @@ -827,17 +752,12 @@ void try_place(const t_placer_opts& placer_opts, sWNS = timing_info->setup_worst_negative_slack(); } - float quench_elapsed_sec = temperature_timer.elapsed_sec(); - print_place_status(num_temps, - quench_elapsed_sec, - state.t, state.alpha, stats, - critical_path.delay(), sTNS, sWNS, - success_rat, std_dev, state.rlim, state.crit_exponent, tot_iter); + print_place_status(state, stats, temperature_timer.elapsed_sec(), critical_path.delay(), sTNS, sWNS, tot_iter); } auto post_quench_timing_stats = timing_ctx.stats; if (placer_opts.placement_saves_per_temperature >= 1) { - std::string filename = vtr::string_fmt("placement_%03d_%03d.place", num_temps + 1, 0); + std::string filename = 
vtr::string_fmt("placement_%03d_%03d.place", state.num_temps + 1, 0); VTR_LOG("Saving final placement to file: %s\n", filename.c_str()); print_place(nullptr, nullptr, filename.c_str()); } @@ -905,7 +825,7 @@ void try_place(const t_placer_opts& placer_opts, float reject_rate = (float)num_swap_rejected / total_swap_attempts; float accept_rate = (float)num_swap_accepted / total_swap_attempts; float abort_rate = (float)num_swap_aborted / total_swap_attempts; - VTR_LOG("Placement number of temperatures: %d\n", num_temps); + VTR_LOG("Placement number of temperatures: %d\n", state.num_temps); VTR_LOG("Placement total # of swap attempts: %*d\n", num_swap_print_digits, total_swap_attempts); VTR_LOG("\tSwaps accepted: %*d (%4.1f %%)\n", num_swap_print_digits, num_swap_accepted, 100 * accept_rate); VTR_LOG("\tSwaps rejected: %*d (%4.1f %%)\n", num_swap_print_digits, num_swap_rejected, 100 * reject_rate); @@ -928,7 +848,6 @@ void try_place(const t_placer_opts& placer_opts, /* Function to update the setup slacks and criticalities before the inner loop of the annealing/quench */ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, int num_connections, float crit_exponent, int* outer_crit_iter_count, @@ -964,24 +883,16 @@ static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, } (*outer_crit_iter_count)++; - /*at each temperature change we update these values to be used */ - /*for normalizing the tradeoff between timing and wirelength (bb) */ - prev_inverse_costs->bb_cost = 1 / costs->bb_cost; - /*Prevent inverse timing cost from going to infinity */ - prev_inverse_costs->timing_cost = min(1 / costs->timing_cost, MAX_INV_TIMING_COST); + /* Update the cost normalization factors */ + costs->update_norm_factors(); } /* Function which contains the inner loop of the simulated annealing */ -static void placement_inner_loop(float t, - int temp_num, - float rlim, +static 
void placement_inner_loop(const t_annealing_state* state, const t_placer_opts& placer_opts, - int move_lim, - float crit_exponent, int inner_recompute_limit, t_placer_statistics* stats, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, int* moves_since_cost_recompute, ClusteredPinTimingInvalidator* pin_timing_invalidator, const PlaceDelayModel* delay_model, @@ -995,21 +906,14 @@ static void placement_inner_loop(float t, int inner_placement_save_count = 0; //How many times have we dumped placement to a file this temperature? - stats->av_cost = 0.; - stats->av_bb_cost = 0.; - stats->av_timing_cost = 0.; - stats->sum_of_squares = 0.; - stats->success_sum = 0; + stats->reset(); inner_crit_iter_count = 1; /* Inner loop begins */ - for (inner_iter = 0; inner_iter < move_lim; inner_iter++) { - e_move_result swap_result = try_swap(t, - crit_exponent, + for (inner_iter = 0; inner_iter < state->move_lim; inner_iter++) { + e_move_result swap_result = try_swap(state, costs, - prev_inverse_costs, - rlim, move_generator, timing_info, pin_timing_invalidator, @@ -1023,11 +927,7 @@ static void placement_inner_loop(float t, if (swap_result == ACCEPTED) { /* Move was accepted. Update statistics that are useful for the annealing schedule. */ - stats->success_sum++; - stats->av_cost += costs->cost; - stats->av_bb_cost += costs->bb_cost; - stats->av_timing_cost += costs->timing_cost; - stats->sum_of_squares += (costs->cost) * (costs->cost); + stats->single_swap_update(*costs); num_swap_accepted++; } else if (swap_result == ABORTED) { num_swap_aborted++; @@ -1040,14 +940,14 @@ static void placement_inner_loop(float t, * We do this only once in a while, since it is expensive. 
*/ if (inner_crit_iter_count >= inner_recompute_limit - && inner_iter != move_lim - 1) { /*on last iteration don't recompute */ + && inner_iter != state->move_lim - 1) { /*on last iteration don't recompute */ inner_crit_iter_count = 0; #ifdef VERBOSE VTR_LOG("Inner loop recompute criticalities\n"); #endif //Update all timing related classes - perform_full_timing_update(crit_exponent, + perform_full_timing_update(state->crit_exponent, delay_model, criticalities, setup_slacks, @@ -1078,14 +978,16 @@ static void placement_inner_loop(float t, if (placer_opts.placement_saves_per_temperature >= 1 && inner_iter > 0 - && (inner_iter + 1) % (move_lim / placer_opts.placement_saves_per_temperature) == 0) { - std::string filename = vtr::string_fmt("placement_%03d_%03d.place", temp_num + 1, inner_placement_save_count); - VTR_LOG("Saving placement to file at temperature move %d / %d: %s\n", inner_iter, move_lim, filename.c_str()); + && (inner_iter + 1) % (state->move_lim / placer_opts.placement_saves_per_temperature) == 0) { + std::string filename = vtr::string_fmt("placement_%03d_%03d.place", state->num_temps + 1, inner_placement_save_count); + VTR_LOG("Saving placement to file at temperature move %d / %d: %s\n", inner_iter, state->move_lim, filename.c_str()); print_place(nullptr, nullptr, filename.c_str()); ++inner_placement_save_count; } } - /* Inner loop ends */ + + /* Calculate the success_rate and std_dev of the costs. */ + stats->calc_iteration_stats(*costs, state->move_lim); } static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, @@ -1131,118 +1033,10 @@ static int count_connections() { return (count); } -static double get_std_dev(int n, double sum_x_squared, double av_x) { - /* Returns the standard deviation of data set x. There are n sample points, * - * sum_x_squared is the summation over n of x^2 and av_x is the average x. * - * All operations are done in double precision, since round off error can be * - * a problem in the initial temp. 
std_dev calculation for big circuits. */ - - double std_dev; - - if (n <= 1) - std_dev = 0.; - else - std_dev = (sum_x_squared - n * av_x * av_x) / (double)(n - 1); - - if (std_dev > 0.) /* Very small variances sometimes round negative */ - std_dev = sqrt(std_dev); - else - std_dev = 0.; - - return (std_dev); -} - -static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) { - /* Update the range limited to keep acceptance prob. near 0.44. Use * - * a floating point rlim to allow gradual transitions at low temps. */ - - float upper_lim; - - *rlim = (*rlim) * (1. - 0.44 + success_rat); - upper_lim = max(grid.width() - 1, grid.height() - 1); - *rlim = min(*rlim, upper_lim); - *rlim = max(*rlim, (float)1.); -} - -/* Update the annealing state according to the annealing schedule selected. - * USER_SCHED: A manual fixed schedule with fixed alpha and exit criteria. - * AUTO_SCHED: A more sophisticated schedule where alpha varies based on success ratio. - * DUSTY_SCHED: This schedule jumps backward and slows down in response to success ratio. - * See doc/src/vpr/dusty_sa.rst for more details. - * - * Returns true until the schedule is finished. */ -static bool update_annealing_state(t_annealing_state* state, - float success_rat, - const t_placer_costs& costs, - const t_placer_opts& placer_opts, - const t_annealing_sched& annealing_sched) { -#ifndef NO_GRAPHICS - t_draw_state* draw_state = get_draw_state_vars(); - if (draw_state->list_of_breakpoints.size() != 0) - //update temperature in the current information variable - get_bp_state_globals()->get_glob_breakpoint_state()->temp_count++; -#endif - - /* Return `false` when the exit criterion is met. 
*/ - if (annealing_sched.type == USER_SCHED) { - state->t *= annealing_sched.alpha_t; - return state->t >= annealing_sched.exit_t; - } - - auto& device_ctx = g_vpr_ctx.device(); - auto& cluster_ctx = g_vpr_ctx.clustering(); - - /* Automatic annealing schedule */ - float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size(); - - if (annealing_sched.type == DUSTY_SCHED) { - bool restart_temp = state->t < t_exit || std::isnan(t_exit); //May get nan if there are no nets - if (success_rat < annealing_sched.success_min || restart_temp) { - if (state->alpha > annealing_sched.alpha_max) return false; - state->t = state->restart_t / sqrt(state->alpha); // Take a half step from the restart temperature. - state->alpha = 1.0 - ((1.0 - state->alpha) * annealing_sched.alpha_decay); - } else { - if (success_rat > annealing_sched.success_target) { - state->restart_t = state->t; - } - state->t *= state->alpha; - } - state->move_lim = std::max(1, std::min(state->move_lim_max, (int)(state->move_lim_max * (annealing_sched.success_target / success_rat)))); - } else { /* annealing_sched.type == AUTO_SCHED */ - if (success_rat > 0.96) { - state->alpha = 0.5; - } else if (success_rat > 0.8) { - state->alpha = 0.9; - } else if (success_rat > 0.15 || state->rlim > 1.) { - state->alpha = 0.95; - } else { - state->alpha = 0.8; - } - state->t *= state->alpha; - - // Must be duplicated to retain previous behavior - if (state->t < t_exit || std::isnan(t_exit)) return false; - } - - // Gradually changes from the initial crit_exponent to the final crit_exponent based on how much the range limit has shrunk. - // The idea is that as the range limit shrinks (indicating we are fine-tuning a more optimized placement) we can focus more on a smaller number of critical connections, which a higher crit_exponent achieves. 
- update_rlim(&state->rlim, success_rat, device_ctx.grid); - - if (placer_opts.place_algorithm.is_timing_driven()) { - state->crit_exponent = (1 - (state->rlim - FINAL_RLIM) * state->inverse_delta_rlim) - * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) - + placer_opts.td_place_exp_first; - } - - return true; -} - -static float starting_t(float crit_exponent, +///@brief Find the starting temperature for the annealing loop. +static float starting_t(const t_annealing_state* state, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, t_annealing_sched annealing_sched, - int max_moves, - float rlim, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, @@ -1251,32 +1045,26 @@ static float starting_t(float crit_exponent, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const t_placer_opts& placer_opts) { - /* Finds the starting temperature (hot condition). */ - - int i, num_accepted, move_lim; - double std_dev, av, sum_of_squares; /* Double important to avoid round off */ - - if (annealing_sched.type == USER_SCHED) + /* Use user-specified value for the initial temperature. */ + if (annealing_sched.type == USER_SCHED) { return (annealing_sched.init_t); + } auto& cluster_ctx = g_vpr_ctx.clustering(); - move_lim = min(max_moves, (int)cluster_ctx.clb_nlist.blocks().size()); + /* Use to calculate the average of cost when swap is accepted. */ + int num_accepted = 0; - num_accepted = 0; - av = 0.; - sum_of_squares = 0.; + /* Use double types to avoid round off. */ + double av = 0., sum_of_squares = 0.; - /* Try one move per block. Set the temperature high so essentially all accepted. */ - float t = HUGE_POSITIVE_FLOAT; + /* Determines the block swap loop count. 
*/ + int move_lim = std::min(state->move_lim_max, (int)cluster_ctx.clb_nlist.blocks().size()); - for (i = 0; i < move_lim; i++) { + for (int i = 0; i < move_lim; i++) { //Will not deploy setup slack analysis, so omit crit_exponenet and setup_slack - e_move_result swap_result = try_swap(t, - crit_exponent, + e_move_result swap_result = try_swap(state, costs, - prev_inverse_costs, - rlim, move_generator, timing_info, pin_timing_invalidator, @@ -1300,18 +1088,19 @@ static float starting_t(float crit_exponent, } } - if (num_accepted != 0) - av /= num_accepted; - else - av = 0.; + /* Take the average of the accepted swaps' cost values. */ + av = num_accepted > 0 ? (av / num_accepted) : 0.; - std_dev = get_std_dev(num_accepted, sum_of_squares, av); + /* Get the standard deviation. */ + double std_dev = get_std_dev(num_accepted, sum_of_squares, av); + /* Print warning if not all swaps are accepted. */ if (num_accepted != move_lim) { VTR_LOG_WARN("Starting t: %d of %d configurations accepted.\n", num_accepted, move_lim); } #ifdef VERBOSE + /* Print stats related to finding the initial temp. */ VTR_LOG("std_dev: %g, average cost: %g, starting temp: %g\n", std_dev, av, 20. * std_dev); #endif @@ -1347,11 +1136,24 @@ static void reset_move_nets(int num_nets_affected) { } } -static e_move_result try_swap(float t, - float crit_exponent, +/** + * @brief Picks some block and moves it to another spot. + * + * If the new location is empty, directly move the block. If the new location + * is occupied, switch the blocks. Due to the different sizes of the blocks, + * this block switching may occur multiple times. It might also cause the + * current swap attempt to abort due to inability to find suitable locations + * for moved blocks. + * + * The move generator will record all the switched blocks in the variable + * `blocks_affected`. Afterwards, the move will be assessed by the chosen + * cost formulation. Currently, there are three ways to assess move cost,
Currently, there are three ways to assess move cost, + * which are stored in the enum type `e_place_algorithm`. + * + * @return Whether the block swap is accepted, rejected or aborted. + */ +static e_move_result try_swap(const t_annealing_state* state, t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - float rlim, MoveGenerator& move_generator, SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, @@ -1362,30 +1164,27 @@ static e_move_result try_swap(float t, float rlim_escape_fraction, const t_place_algorithm& place_algorithm, float timing_tradeoff) { - /* Picks some block and moves it to another spot. If this spot is * - * occupied, switch the blocks. Assess the change in cost function. * - * rlim is the range limiter. * - * Returns whether the swap is accepted, rejected or aborted. * - * Passes back the new value of the cost functions. */ - num_ts_called++; MoveOutcomeStats move_outcome_stats; - /* I'm using negative values of proposed_net_cost as a flag, so DO NOT * - * use cost functions that can go negative. */ + /* I'm using negative values of proposed_net_cost as a flag, * + * so DO NOT use cost functions that can go negative. */ - double delta_c = 0; /* Change in cost due to this swap. */ - double bb_delta_c = 0; - double timing_delta_c = 0; + double delta_c = 0; //Change in cost due to this swap. + double bb_delta_c = 0; //Change in the bounding box (wiring) cost. + double timing_delta_c = 0; //Change in the timing cost (delay * criticality). - //Allow some fraction of moves to not be restricted by rlim, - //in the hopes of better escaping local minima + /* Allow some fraction of moves to not be restricted by rlim, */ + /* in the hopes of better escaping local minima. */ + float rlim; if (rlim_escape_fraction > 0. 
&& vtr::frand() < rlim_escape_fraction) { rlim = std::numeric_limits::infinity(); + } else { + rlim = state->rlim; } - //Generate a new move (perturbation) used to explore the space of possible placements + /* Generate a new move (perturbation) used to explore the space of possible placements. */ e_create_move create_move_outcome = move_generator.propose_move(blocks_affected, rlim); LOG_MOVE_STATS_PROPOSED(t, blocks_affected); @@ -1393,9 +1192,6 @@ static e_move_result try_swap(float t, e_move_result move_outcome = ABORTED; if (create_move_outcome == e_create_move::ABORT) { - //Proposed move is not legal -- give up on this move - clear_move_blocks(blocks_affected); - LOG_MOVE_STATS_OUTCOME(std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN(), @@ -1407,25 +1203,24 @@ static e_move_result try_swap(float t, /* * To make evaluating the move simpler (e.g. calculating changed bounding box), - * we first move the blocks to thier new locations (apply the move to - * place_ctx.block_locs) and then computed the change in cost. If the move is - * accepted, the inverse look-up in place_ctx.grid_blocks is updated (committing - * the move). If the move is rejected the blocks are returned to their original - * positions (reverting place_ctx.block_locs to its original state). + * we first move the blocks to their new locations (apply the move to + * place_ctx.block_locs) and then compute the change in cost. If the move + * is accepted, the inverse look-up in place_ctx.grid_blocks is updated + * (committing the move). If the move is rejected, the blocks are returned to + * their original positions (reverting place_ctx.block_locs to its original state). * - * Note that the inverse look-up place_ctx.grid_blocks is only updated - * after move acceptance is determined, and so should not be used when - * evaluating a move. 
+ * Note that the inverse look-up place_ctx.grid_blocks is only updated after + * move acceptance is determined, so it should not be used when evaluating a move. */ - //Update the block positions + /* Update the block positions */ apply_move_blocks(blocks_affected); - //Find all the nets affected by this swap and update their costs - //This routine calculates new connection delays and timing costs - //and store them in proposed_* data structures - //This routine also calculates the wiring cost, which doesn't - //depend on the timing driven data + //Find all the nets affected by this swap and update the wiring costs. + //This cost value doesn't depend on the timing info. + // + //Also find all the pins affected by the swap, and calculates new connection + //delays and timing costs and store them in proposed_* data structures. int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, delay_model, criticalities, @@ -1433,32 +1228,31 @@ static e_move_result try_swap(float t, bb_delta_c, timing_delta_c); - //For setup slack analysis, we first do a timing analysis to get the newest slack values - //resulted from the proposed block moves. If the move turns out to be accepted, we keep - //the updated slack values and commit the block moves. If rejected, we reject the proposed - //block moves and revert this timing analysis. + //For setup slack analysis, we first do a timing analysis to get the newest + //slack values resulted from the proposed block moves. If the move turns out + //to be accepted, we keep the updated slack values and commit the block moves. + //If rejected, we reject the proposed block moves and revert this timing analysis. if (place_algorithm == SLACK_TIMING_PLACE) { - //Gather all the connections with modified delays for incremental timing updates. - //This routine relies on comparing proposed_connection_delay and connection_delay. + /* Invalidates timing of modified connections for incremental timing updates. 
*/ invalidate_affected_connections(blocks_affected, pin_timing_invalidator, timing_info); - //Update the connection_timing_cost and connection_delay - //values from the temporary values. + /* Update the connection_timing_cost and connection_delay * + * values from the temporary values. */ commit_td_cost(blocks_affected); - //Update timing information. Since we are analyzing setup slacks, - //we only update those values and keep the criticalities stale - //so as not to interfere with the original timing driven algorithm. - // - //Note: the timing info must be updated after applying block moves - //and committing the timing driven delays and costs. - //If we wish to revert this timing update due to move rejection, - //we need to revert block moves and restore the timing values. + /* Update timing information. Since we are analyzing setup slacks, * + * we only update those values and keep the criticalities stale * + * so as not to interfere with the original timing driven algorithm. * + * + * Note: the timing info must be updated after applying block moves * + * and committing the timing driven delays and costs. * + * If we wish to revert this timing update due to move rejection, * + * we need to revert block moves and restore the timing values. */ criticalities->disable_update(); setup_slacks->enable_update(); - update_timing_classes(crit_exponent, + update_timing_classes(state->crit_exponent, timing_info, criticalities, setup_slacks, @@ -1467,21 +1261,18 @@ static e_move_result try_swap(float t, /* Get the setup slack analysis cost */ //TODO: calculate a weighted average of the slack cost and wiring cost delta_c = analyze_setup_slack_cost(setup_slacks); - } else if (place_algorithm == CRITICALITY_TIMING_PLACE) { - /*in this case we redefine delta_c as a combination of timing and bb. 
* - *additionally, we normalize all values, therefore delta_c is in * - *relation to 1*/ - delta_c = (1 - timing_tradeoff) * bb_delta_c * prev_inverse_costs->bb_cost - + timing_tradeoff * timing_delta_c * prev_inverse_costs->timing_cost; - + /* Take delta_c as a combination of timing and wiring cost. In + * addition to `timing_tradeoff`, we normalize the cost values */ + delta_c = (1 - timing_tradeoff) * bb_delta_c * costs->bb_cost_norm + + timing_tradeoff * timing_delta_c * costs->timing_cost_norm; } else { - VTR_ASSERT(place_algorithm == BOUNDING_BOX_PLACE); + VTR_ASSERT_SAFE(place_algorithm == BOUNDING_BOX_PLACE); delta_c = bb_delta_c; } /* 1 -> move accepted, 0 -> rejected. */ - move_outcome = assess_swap(delta_c, t); + move_outcome = assess_swap(delta_c, state->t); if (move_outcome == ACCEPTED) { costs->cost += delta_c; @@ -1499,26 +1290,26 @@ static e_move_result try_swap(float t, if (place_algorithm == CRITICALITY_TIMING_PLACE) { costs->timing_cost += timing_delta_c; - //Invalidates timing of modified connections for incremental timing updates - //This routine relies on comparing proposed_connection_delay and connection_delay - //If the setup slack analysis was not performed, the - //sink pins are yet to be invalidated. + /* Invalidates timing of modified connections for incremental * + * timing updates. These invalidations are accumulated for a * + * big timing update in the outer loop. */ invalidate_affected_connections(blocks_affected, pin_timing_invalidator, timing_info); - //update the connection_timing_cost and connection_delay - //values from the temporary values + /* Update the connection_timing_cost and connection_delay * + * values from the temporary values. */ commit_td_cost(blocks_affected); } - /* update net cost functions and reset flags. */ + /* Update net cost functions and reset flags. */ update_move_nets(num_nets_affected); /* Update clb data structures since we kept the move. 
*/ commit_move_blocks(blocks_affected); - } else { //move_outcome == REJECTED + } else { + VTR_ASSERT_SAFE(move_outcome == REJECTED); /* Reset the net cost function flags first. */ reset_move_nets(num_nets_affected); @@ -1527,21 +1318,21 @@ static e_move_result try_swap(float t, revert_move_blocks(blocks_affected); if (place_algorithm == SLACK_TIMING_PLACE) { - //Revert the timing delays and costs to pre-update values - //These routines must be called after reverting the block moves + /* Revert the timing delays and costs to pre-update values. */ + /* These routines must be called after reverting the block moves. */ //TODO: make this process incremental comp_td_connection_delays(delay_model); comp_td_costs(delay_model, *criticalities, &costs->timing_cost); - //Re-invalidate the affected sink pins since the proposed move is - //rejected, and the same blocks are reverted to their original - //positions. The affected sink pins should stay the same. + /* Re-invalidate the affected sink pins since the proposed * + * move is rejected, and the same blocks are reverted to * + * their original positions. 
*/ invalidate_affected_connections(blocks_affected, pin_timing_invalidator, timing_info); /* Revert the timing update */ - update_timing_classes(crit_exponent, + update_timing_classes(state->crit_exponent, timing_info, criticalities, setup_slacks, @@ -1559,8 +1350,8 @@ static e_move_result try_swap(float t, } move_outcome_stats.delta_cost_norm = delta_c; - move_outcome_stats.delta_bb_cost_norm = bb_delta_c * prev_inverse_costs->bb_cost; - move_outcome_stats.delta_timing_cost_norm = timing_delta_c * prev_inverse_costs->timing_cost; + move_outcome_stats.delta_bb_cost_norm = bb_delta_c * costs->bb_cost_norm; + move_outcome_stats.delta_timing_cost_norm = timing_delta_c * costs->timing_cost_norm; move_outcome_stats.delta_bb_cost_abs = bb_delta_c; move_outcome_stats.delta_timing_cost_abs = timing_delta_c; @@ -1578,6 +1369,8 @@ static e_move_result try_swap(float t, stop_placement_and_check_breakopints(blocks_affected, move_outcome, delta_c, bb_delta_c, timing_delta_c); # endif #endif + + /* Clear the data structure containing block move info */ clear_move_blocks(blocks_affected); //VTR_ASSERT(check_macro_placement_consistency() == 0); @@ -1589,10 +1382,28 @@ static e_move_result try_swap(float t, return move_outcome; } -//Puts all the nets changed by the current swap into nets_to_update, -//and updates their bounding box. -// -//Returns the number of affected nets. +/** + * @brief Find all the nets and pins affected by this swap and update costs. + * + * Find all the nets affected by this swap and update the bounding box (wiring) + * costs. This cost function doesn't depend on the timing info. + * + * Find all the connections affected by this swap and update the timing cost. + * For a connection to be affected, it not only needs to be on or driven by + * a block, but it also needs to have its delay changed. Otherwise, it will + * not be added to the affected_pins structure. + * + * For more, see update_td_delta_costs(). 
+ * + * The timing costs are calculated by getting the new connection delays, + * multiplied by the connection criticalities returned by the timing + * analyzer. These timing costs are stored in the proposed_* data structures. + * + * The change in the bounding box cost is stored in `bb_delta_c`. + * The change in the timing cost is stored in `timing_delta_c`. + * + * @return The number of affected nets. + */ static int find_affected_nets_and_update_costs(const t_place_algorithm& place_algorithm, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, @@ -1605,37 +1416,35 @@ static int find_affected_nets_and_update_costs(const t_place_algorithm& place_al int num_affected_nets = 0; - //Go through all the blocks moved + /* Go through all the blocks moved. */ for (int iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) { ClusterBlockId blk = blocks_affected.moved_blocks[iblk].block_num; - //Go through all the pins in the moved block + /* Go through all the pins in the moved block. */ for (ClusterPinId blk_pin : cluster_ctx.clb_nlist.block_pins(blk)) { ClusterNetId net_id = cluster_ctx.clb_nlist.pin_net(blk_pin); VTR_ASSERT_SAFE_MSG(net_id, "Only valid nets should be found in compressed netlist block pins"); if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) - continue; //TODO: do we require anyting special here for global nets. "Global nets are assumed to span the whole chip, and do not effect costs" + //TODO: Do we require anyting special here for global nets? + //"Global nets are assumed to span the whole chip, and do not effect costs." + continue; - //Record effected nets + /* Record effected nets */ record_affected_net(net_id, num_affected_nets); - //Update the net bounding boxes - // - //Do not update the net cost here since it should only be updated - //once per net, not once per pin. + /* Update the net bounding boxes. 
*/ update_net_bb(net_id, blocks_affected, iblk, blk, blk_pin); if (place_algorithm.is_timing_driven()) { - /* Determine the change in connection delay and timing cost */ + /* Determine the change in connection delay and timing cost. */ update_td_delta_costs(delay_model, *criticalities, net_id, blk_pin, blocks_affected, timing_delta_c); } } } - /* Now update the bounding box costs (since the net bounding boxes are up-to-date). - * The cost is only updated once per net. - */ + /* Now update the bounding box costs (since the net bounding * + * boxes are up-to-date). The cost is only updated once per net. */ for (int inet_affected = 0; inet_affected < num_affected_nets; inet_affected++) { ClusterNetId net_id = ts_nets_to_update[inet_affected]; @@ -1646,18 +1455,25 @@ static int find_affected_nets_and_update_costs(const t_place_algorithm& place_al return num_affected_nets; } +///@brief Record effected nets. static void record_affected_net(const ClusterNetId net, int& num_affected_nets) { - //Record effected nets + /* Record effected nets. */ if (proposed_net_cost[net] < 0.) { - //Net not marked yet. + /* Net not marked yet. */ ts_nets_to_update[num_affected_nets] = net; num_affected_nets++; - //Flag to say we've marked this net. + /* Flag to say we've marked this net. */ proposed_net_cost[net] = 1.; } } +/** + * @brief Update the net bounding boxes. + * + * Do not update the net cost here since it should only + * be updated once per net, not once per pin. 
+ */ static void update_net_bb(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected, int iblk, @@ -2800,21 +2616,6 @@ static void free_try_swap_arrays() { g_vpr_ctx.mutable_placement().compressed_block_grids.clear(); } -static void calc_placer_stats(t_placer_statistics& stats, float& success_rat, double& std_dev, const t_placer_costs& costs, const int move_lim) { - success_rat = ((float)stats.success_sum) / move_lim; - if (stats.success_sum == 0) { - stats.av_cost = costs.cost; - stats.av_bb_cost = costs.bb_cost; - stats.av_timing_cost = costs.timing_cost; - } else { - stats.av_cost /= stats.success_sum; - stats.av_bb_cost /= stats.success_sum; - stats.av_timing_cost /= stats.success_sum; - } - - std_dev = get_std_dev(stats.success_sum, stats.sum_of_squares, stats.av_cost); -} - static void generate_post_place_timing_reports(const t_placer_opts& placer_opts, const t_analysis_opts& analysis_opts, const SetupTimingInfo& timing_info, @@ -2848,18 +2649,12 @@ static void print_place_status_header() { VTR_LOG("---- ------ ------- ------- ---------- ---------- ------- ---------- -------- ------- ------- ------ -------- --------- ------\n"); } -static void print_place_status(const size_t num_temps, - const float elapsed_sec, - const float t, - const float alpha, +static void print_place_status(const t_annealing_state& state, const t_placer_statistics& stats, - const float cpd, - const float sTNS, - const float sWNS, - const float acc_rate, - const float std_dev, - const float rlim, - const float crit_exponent, + float elapsed_sec, + float cpd, + float sTNS, + float sWNS, size_t tot_moves) { VTR_LOG( "%4zu " @@ -2868,16 +2663,16 @@ static void print_place_status(const size_t num_temps, "%7.3f %10.2f %-10.5g " "%7.3f % 10.3g % 8.3f " "%7.3f %7.4f %6.1f %8.2f", - num_temps, + state.num_temps, elapsed_sec, - t, + state.t, stats.av_cost, stats.av_bb_cost, stats.av_timing_cost, 1e9 * cpd, 1e9 * sTNS, 1e9 * sWNS, - acc_rate, std_dev, rlim, crit_exponent); + 
stats.success_rate, stats.std_dev, state.rlim, state.crit_exponent); pretty_print_uint(" ", tot_moves, 9, 3); - VTR_LOG(" %6.3f\n", alpha); + VTR_LOG(" %6.3f\n", state.alpha); fflush(stdout); } @@ -2916,26 +2711,6 @@ static void print_resources_utilization() { VTR_LOG("\n"); } -static void init_annealing_state(t_annealing_state* state, - const t_annealing_sched& annealing_sched, - float t, - float rlim, - int move_lim_max, - float crit_exponent) { - state->alpha = annealing_sched.alpha_min; - state->t = t; - state->restart_t = t; - state->rlim = rlim; - state->inverse_delta_rlim = 1 / (rlim - FINAL_RLIM); - state->move_lim_max = std::max(1, move_lim_max); - if (annealing_sched.type == DUSTY_SCHED) { - state->move_lim = std::max(1, (int)(state->move_lim_max * annealing_sched.success_target)); - } else { - state->move_lim = state->move_lim_max; - } - state->crit_exponent = crit_exponent; -} - bool placer_needs_lookahead(const t_vpr_setup& vpr_setup) { return (vpr_setup.PlacerOpts.place_algorithm.is_timing_driven()); } diff --git a/vpr/src/place/place_util.cpp b/vpr/src/place/place_util.cpp index 4090156474d..80695ff5686 100644 --- a/vpr/src/place/place_util.cpp +++ b/vpr/src/place/place_util.cpp @@ -1,36 +1,330 @@ /** * @file place_util.cpp - * @brief Definitions of structure routines declared in place_util.h. + * @brief Definitions of structure methods and routines declared in place_util.h. + * These are mainly utility functions used by the placer. */ #include "place_util.h" #include "globals.h" +#include "draw_global.h" +/* File-scope routines */ static vtr::Matrix init_grid_blocks(); +/** + * @brief Initialize the placer's block-grid dual direction mapping. + * + * Forward direction - block to grid: place_ctx.block_locs. + * Reverse direction - grid to block: place_ctx.grid_blocks. + * + * Initialize both of them to empty states. 
+ */ void init_placement_context() { auto& place_ctx = g_vpr_ctx.mutable_placement(); auto& cluster_ctx = g_vpr_ctx.clustering(); + /* Intialize the lookup of CLB block positions */ place_ctx.block_locs.clear(); place_ctx.block_locs.resize(cluster_ctx.clb_nlist.blocks().size()); + /* Initialize the reverse lookup of CLB block positions */ place_ctx.grid_blocks = init_grid_blocks(); } +/** + * @brief Initialize `grid_blocks`, the inverse structure of `block_locs`. + * + * The container at each grid block location should have a length equal to the + * subtile capacity of that block. Unused subtile would be marked EMPTY_BLOCK_ID. + */ static vtr::Matrix init_grid_blocks() { auto& device_ctx = g_vpr_ctx.device(); + /* Structure should have the same dimensions as the grid. */ auto grid_blocks = vtr::Matrix({device_ctx.grid.width(), device_ctx.grid.height()}); + for (size_t x = 0; x < device_ctx.grid.width(); ++x) { for (size_t y = 0; y < device_ctx.grid.height(); ++y) { auto type = device_ctx.grid[x][y].type; + grid_blocks[x][y].blocks.resize(type->capacity, EMPTY_BLOCK_ID); + } + } + return grid_blocks; +} - int capacity = type->capacity; +/** + * @brief Mutator: updates the norm factors in the outer loop iteration. + * + * At each temperature change we update these values to be used + * for normalizing the trade-off between timing and wirelength (bb) + */ +void t_placer_costs::update_norm_factors() { + if (place_algorithm.is_timing_driven()) { + bb_cost_norm = 1 / bb_cost; + //Prevent the norm factor from going to infinity + timing_cost_norm = std::min(1 / timing_cost, MAX_INV_TIMING_COST); + cost = 1; //The value of cost will be reset to 1 if timing driven + } else { + VTR_ASSERT_SAFE(place_algorithm == BOUNDING_BOX_PLACE); + cost = bb_cost; //The cost value should be identical to the wirelength cost + } +} + +///@brief Constructor: Initialize all annealing state variables and macros. 
+t_annealing_state::t_annealing_state(const t_annealing_sched& annealing_sched, + float first_t, + float first_rlim, + int first_move_lim, + float first_crit_exponent) { + num_temps = 0; + alpha = annealing_sched.alpha_min; + t = first_t; + restart_t = first_t; + rlim = first_rlim; + move_lim_max = first_move_lim; + crit_exponent = first_crit_exponent; + + /* Determine the current move_lim based on the schedule type */ + if (annealing_sched.type == DUSTY_SCHED) { + move_lim = std::max(1, (int)(move_lim_max * annealing_sched.success_target)); + } else { + move_lim = move_lim_max; + } + + /* Store this inverse value for speed when updating crit_exponent. */ + INVERSE_DELTA_RLIM = 1 / (first_rlim - FINAL_RLIM); + + /* The range limit cannot exceed the largest grid size. */ + auto& grid = g_vpr_ctx.device().grid; + UPPER_RLIM = std::max(grid.width() - 1, grid.height() - 1); +} + +/** + * @brief Get the initial limit for inner loop block move attempt limit. + * + * There are two ways to scale the move limit. + * e_place_effort_scaling::CIRCUIT + * scales the move limit proportional to num_blocks ^ (4/3) + * e_place_effort_scaling::DEVICE_CIRCUIT + * scales the move limit proportional to device_size ^ (2/3) * num_blocks ^ (2/3) + * + * The second method is almost identical to the first one when the device + * is highly utilized (device_size ~ num_blocks). For low utilization devices + * (device_size >> num_blocks), the search space is larger, so the second method + * performs more moves to ensure better optimization. 
+ */ +int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched) { + const auto& device_ctx = g_vpr_ctx.device(); + const auto& cluster_ctx = g_vpr_ctx.clustering(); + + float device_size = device_ctx.grid.width() * device_ctx.grid.height(); + size_t num_blocks = cluster_ctx.clb_nlist.blocks().size(); + + int move_lim; + if (placer_opts.effort_scaling == e_place_effort_scaling::CIRCUIT) { + move_lim = int(annealing_sched.inner_num * pow(num_blocks, 1.3333)); + } else { + VTR_ASSERT(placer_opts.effort_scaling == e_place_effort_scaling::DEVICE_CIRCUIT); + move_lim = int(annealing_sched.inner_num * pow(device_size, 2. / 3.) * pow(num_blocks, 2. / 3.)); + } + + /* Avoid having a non-positive move_lim */ + move_lim = std::max(move_lim, 1); + + VTR_LOG("Moves per temperature: %d\n", move_lim); + + return move_lim; +} + +/** + * @brief Update the annealing state according to the annealing schedule selected. + * + * USER_SCHED: A manual fixed schedule with fixed alpha and exit criteria. + * AUTO_SCHED: A more sophisticated schedule where alpha varies based on success ratio. + * DUSTY_SCHED: This schedule jumps backward and slows down in response to success ratio. + * See doc/src/vpr/dusty_sa.rst for more details. + * + * @return True->continues the annealing. False->exits the annealing. + */ +bool t_annealing_state::outer_loop_update(float success_rate, + const t_placer_costs& costs, + const t_placer_opts& placer_opts, + const t_annealing_sched& annealing_sched) { +#ifndef NO_GRAPHICS + t_draw_state* draw_state = get_draw_state_vars(); + if (draw_state->list_of_breakpoints.size() != 0) { + /* Update temperature in the current information variable. */ + get_bp_state_globals()->get_glob_breakpoint_state()->temp_count++; + } +#endif + + if (annealing_sched.type == USER_SCHED) { + /* Update t with user specified alpha. */ + t *= annealing_sched.alpha_t; + + /* Check if the exit criterion is met. 
*/ + bool exit_anneal = t >= annealing_sched.exit_t; + + return exit_anneal; + } - grid_blocks[x][y].blocks.resize(capacity, EMPTY_BLOCK_ID); + /* Automatically determine exit temperature. */ + auto& cluster_ctx = g_vpr_ctx.clustering(); + float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size(); + + if (annealing_sched.type == DUSTY_SCHED) { + /* May get nan if there are no nets */ + bool restart_temp = t < t_exit || std::isnan(t_exit); + + /* If the success rate or the temperature is * + * too low, reset the temperature and alpha. */ + if (success_rate < annealing_sched.success_min || restart_temp) { + /* Only exit anneal when alpha gets too large. */ + if (alpha > annealing_sched.alpha_max) { + return false; + } + /* Take a half step from the restart temperature. */ + t = restart_t / sqrt(alpha); + /* Update alpha. */ + alpha = 1.0 - ((1.0 - alpha) * annealing_sched.alpha_decay); + } else { + /* If the success rate is promising, next time * + * reset t to the current annealing temperature. */ + if (success_rate > annealing_sched.success_target) { + restart_t = t; + } + /* Update t. */ + t *= alpha; + } + + /* Update move lim. */ + update_move_lim(annealing_sched.success_target, success_rate); + } else { + VTR_ASSERT_SAFE(annealing_sched.type == AUTO_SCHED); + /* Automatically adjust alpha according to success rate. */ + if (success_rate > 0.96) { + alpha = 0.5; + } else if (success_rate > 0.8) { + alpha = 0.9; + } else if (success_rate > 0.15 || rlim > 1.) { + alpha = 0.95; + } else { + alpha = 0.8; + } + /* Update temp. */ + t *= alpha; + /* Must be duplicated to retain previous behavior. */ + if (t < t_exit || std::isnan(t_exit)) { + return false; } } - return grid_blocks; + /* Update the range limiter. */ + update_rlim(success_rate); + + /* If using timing driven algorithm, update the crit_exponent. */ + if (placer_opts.place_algorithm.is_timing_driven()) { + update_crit_exponent(placer_opts); + } + + /* Continues the annealing. 
*/ + return true; +} + +/** + * @brief Update the range limiter to keep acceptance prob. near 0.44. + * + * Use a floating point rlim to allow gradual transitions at low temps. + * The range is bounded by 1 (FINAL_RLIM) and the grid size (UPPER_RLIM). + */ +void t_annealing_state::update_rlim(float success_rate) { + rlim *= (1. - 0.44 + success_rate); + rlim = std::min(rlim, UPPER_RLIM); + rlim = std::max(rlim, FINAL_RLIM); +} + +/** + * @brief Update the criticality exponent. + * + * When rlim shrinks towards the FINAL_RLIM value (indicating + * that we are fine-tuning a more optimized placement), we can + * focus more on a smaller number of critical connections. + * To achieve this, we make the crit_exponent sharper, so that + * critical connections would become more critical than before. + * + * We calculate how close rlim is to its final value comparing + * to its initial value. Then, we apply the same scaling factor + * on the crit_exponent so that it lands on the suitable value + * between td_place_exp_first and td_place_exp_last. The scaling + * factor is calculated and applied linearly. + */ +void t_annealing_state::update_crit_exponent(const t_placer_opts& placer_opts) { + /* If rlim == FINAL_RLIM, then scale == 0. */ + float scale = 1 - (rlim - FINAL_RLIM) * INVERSE_DELTA_RLIM; + + /* Apply the scaling factor on crit_exponent. */ + crit_exponent = scale * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) + + placer_opts.td_place_exp_first; +} + +/** + * @brief Update the move limit based on the success rate. + * + * The value is bounded between 1 and move_lim_max. 
+ */ +void t_annealing_state::update_move_lim(float success_target, float success_rate) { + move_lim = move_lim_max * (success_target / success_rate); + move_lim = std::min(move_lim, move_lim_max); + move_lim = std::max(move_lim, 1); +} + +void t_placer_statistics::reset() { + av_cost = 0.; + av_bb_cost = 0.; + av_timing_cost = 0.; + sum_of_squares = 0.; + success_sum = 0; + success_rate = 0.; + std_dev = 0.; +} + +void t_placer_statistics::single_swap_update(const t_placer_costs& costs) { + success_sum++; + av_cost += costs.cost; + av_bb_cost += costs.bb_cost; + av_timing_cost += costs.timing_cost; + sum_of_squares += (costs.cost) * (costs.cost); +} + +void t_placer_statistics::calc_iteration_stats(const t_placer_costs& costs, int move_lim) { + if (success_sum == 0) { + av_cost = costs.cost; + av_bb_cost = costs.bb_cost; + av_timing_cost = costs.timing_cost; + } else { + av_cost /= success_sum; + av_bb_cost /= success_sum; + av_timing_cost /= success_sum; + } + success_rate = success_sum / float(move_lim); + std_dev = get_std_dev(success_sum, sum_of_squares, av_cost); +} + +/** + * @brief Returns the standard deviation of data set x. + * + * There are n sample points, sum_x_squared is the summation over n of x^2 and av_x + * is the average x. All operations are done in double precision, since round off + * error can be a problem in the initial temp. std_dev calculation for big circuits. + */ +double get_std_dev(int n, double sum_x_squared, double av_x) { + double std_dev; + if (n <= 1) { + std_dev = 0.; + } else { + std_dev = (sum_x_squared - n * av_x * av_x) / (double)(n - 1); + } + + /* Very small variances sometimes round negative. */ + return (std_dev > 0.) ? 
sqrt(std_dev) : 0.; } diff --git a/vpr/src/place/place_util.h b/vpr/src/place/place_util.h index 0534ba662a4..818df5d6b4e 100644 --- a/vpr/src/place/place_util.h +++ b/vpr/src/place/place_util.h @@ -1,37 +1,206 @@ /** * @file place_util.h * @brief Utility structures representing various states of the - * placement. Also contains declarations of related routines. + * placement and utility functions used by the placer. */ #pragma once +#include "vpr_types.h" -struct t_placer_costs { - //Although we do nost cost calculations with float's we - //use doubles for the accumulated costs to avoid round-off, - //particularly on large designs where the magnitude of a single - //move's delta cost is small compared to the overall cost. +/** + * @brief Data structure that stores different cost values in the placer. + * + * Although we do cost calculations with float values, we use doubles + * for the accumulated costs to avoid round-off, particularly on large + * designs where the magnitude of a single move's delta cost is small + * compared to the overall cost. + * + * To balance the trade-off between timing and wirelength (bb) cost, the + * change in costs produced by block swaps are divided by the final cost + * values of the previous iteration. However, the divisions are expensive, + * so we store their multiplicative inverses when they are updated in + * the outer loop routines to speed up the normalization process. + * + * @param cost The weighted average of the wiring cost and the timing cost. + * @param bb_cost The bounding box cost, aka the wiring cost. + * @param timing_cost The timing cost, which is connection delay * criticality. + * + * @param bb_cost_norm The normalization factor for the wiring cost. + * @param timing_cost_norm The normalization factor for the timing cost, which + * is upper-bounded by the value of MAX_INV_TIMING_COST. 
/**
 * @brief Data structure that stores different cost values in the placer.
 *
 * Although we do cost calculations with float values, we use doubles
 * for the accumulated costs to avoid round-off, particularly on large
 * designs where the magnitude of a single move's delta cost is small
 * compared to the overall cost.
 *
 * To balance the trade-off between timing and wirelength (bb) cost, the
 * change in costs produced by block swaps are divided by the final cost
 * values of the previous iteration. However, the divisions are expensive,
 * so we store their multiplicative inverses when they are updated in
 * the outer loop routines to speed up the normalization process.
 *
 * The constructor only stores the place algorithm; the cost members are
 * not given initial values by this class.
 *
 * @param cost The weighted average of the wiring cost and the timing cost.
 * @param bb_cost The bounding box cost, aka the wiring cost.
 * @param timing_cost The timing cost, which is connection delay * criticality.
 *
 * @param bb_cost_norm The normalization factor for the wiring cost.
 * @param timing_cost_norm The normalization factor for the timing cost, which
 *              is upper-bounded by the value of MAX_INV_TIMING_COST.
 *
 * @param MAX_INV_TIMING_COST Stops inverse timing cost from going to infinity
 *              with very lax timing constraints, which avoids multiplying by a
 *              gigantic timing_cost_norm when auto-normalizing. The exact value
 *              of this cost has relatively little impact, but should not be large
 *              enough to be on the order of timing costs for normal constraints.
 *
 * @param place_algorithm Determines how the member values are updated upon
 *              each temperature change during the placer annealing process.
 */
class t_placer_costs {
  public: //members
    double cost;
    double bb_cost;
    double timing_cost;
    double bb_cost_norm;
    double timing_cost_norm;

  public: //Constructor
    t_placer_costs(t_place_algorithm algo)
        : place_algorithm(algo) {}

  public: //Mutator
    ///@brief Refresh the normalization factors and the total cost.
    void update_norm_factors();

  private:
    double MAX_INV_TIMING_COST = 1.e9;
    t_place_algorithm place_algorithm;
};
/**
 * @brief Stores variables that are used by the annealing process.
 *
 * This structure is updated by outer_loop_update() on each outer
 * loop iteration. It stores various important variables that need to
 * be accessed during the placement inner loop.
 *
 * Private variables are not given accessor functions. They serve as
 * macros originally defined in place.cpp as global scope variables.
 *
 * Public members:
 *   @param t
 *              Temperature for simulated annealing.
 *   @param restart_t
 *              Temperature used after restart due to minimum success ratio.
 *              Currently only used and updated by DUSTY_SCHED.
 *   @param alpha
 *              Temperature decay factor (multiplied each outer loop iteration).
 *   @param num_temps
 *              The count of how many temperature iterations have passed.
 *
 *   @param rlim
 *              Range limit for block swaps.
 *              Currently only updated by DUSTY_SCHED and AUTO_SCHED.
 *   @param crit_exponent
 *              Used by timing-driven placement to "sharpen" the timing criticality.
 *              Depends on rlim. Currently only updated by DUSTY_SCHED and AUTO_SCHED.
 *   @param move_lim
 *              Current block move limit.
 *              Currently only updated by DUSTY_SCHED.
 *   @param move_lim_max
 *              Maximum block move limit.
 *
 * Private members:
 *   @param UPPER_RLIM
 *              The upper limit for the range limiter value.
 *   @param FINAL_RLIM
 *              The final rlim (range limit) is 1, which is the smallest value that
 *              can still make progress, since an rlim of 0 wouldn't allow any swaps.
 *   @param INVERSE_DELTA_RLIM
 *              Reciprocal of (initial rlim - FINAL_RLIM), used to update
 *              crit_exponent. See update_crit_exponent() for more.
 *
 * Mutators:
 *   @param outer_loop_update()
 *              Update the annealing state variables in the placement outer loop.
 *   @param update_rlim(), update_crit_exponent(), update_move_lim()
 *              Subroutines used by the main routine outer_loop_update().
 *              They are declared inline here and defined in place_util.cpp.
 */
class t_annealing_state {
  public:
    float t;
    float restart_t;
    float alpha;
    int num_temps;

    float rlim;
    float crit_exponent;
    int move_lim;
    int move_lim_max;

  private:
    float UPPER_RLIM;
    float FINAL_RLIM = 1.;
    float INVERSE_DELTA_RLIM;

  public: //Constructor
    t_annealing_state(const t_annealing_sched& annealing_sched,
                      float first_t,
                      float first_rlim,
                      int first_move_lim,
                      float first_crit_exponent);

  public: //Mutator
    ///@brief Returns true to continue the annealing, false to exit.
    bool outer_loop_update(float success_rate,
                           const t_placer_costs& costs,
                           const t_placer_opts& placer_opts,
                           const t_annealing_sched& annealing_sched);

  private: //Mutator
    inline void update_rlim(float success_rate);
    inline void update_crit_exponent(const t_placer_opts& placer_opts);
    inline void update_move_lim(float success_target, float success_rate);
};
minimum success ratio - float crit_exponent; // Used by timing-driven placement to "sharpen" timing criticality - int move_lim_max; // Maximum move limit - int move_lim; // Current move limit +/** + * @brief Stores statistics produced by a single annealing iteration. + * + * This structure is refreshed at the beginning of every annealing loop + * by calling reset(). Whenever a block swap move is accepted, this + * structure calls single_swap_update() to update its variables. At the + * end of the current iteration, it calls calc_iteration_stats() to + * summarize the results (success_rate & std_dev of the total costs). + * + * In terms of calculating statistics for total cost, we mean that we + * operate upon the set of placer cost values gathered after every + * accepted block move. + * + * @param av_cost + * Average total cost. Cost formulation depends on + * the place algorithm currently being used. + * @param av_bb_cost + * Average bounding box (wiring) cost. + * @param av_timing_cost + * Average timing cost (delay * criticality). + * @param sum_of_squares + * Sum of squares of the total cost. + * @param success_sum + * Number of accepted block swaps for the current iteration. + * @param success_rate + * num_accepted / total_trials for the current iteration. + * @param std_dev + * Standard deviation of the total cost. + * + */ +class t_placer_statistics { + public: + double av_cost; + double av_bb_cost; + double av_timing_cost; + double sum_of_squares; + int success_sum; + float success_rate; + double std_dev; + + public: //Constructor + t_placer_statistics() { reset(); } + + public: //Mutator + ///@brief Clear all data fields. + void reset(); + + ///@brief Calculate placer success rate and cost std_dev for this iteration. + void calc_iteration_stats(const t_placer_costs& costs, int move_lim); + + ///@brief Update stats when a single swap move has been accepted.
+ void single_swap_update(const t_placer_costs& costs); }; -//Initialize the placement context +///@brief Initialize the placer's block-grid dual direction mapping. void init_placement_context(); + +///@brief Get the initial limit for inner loop block move attempt limit. +int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sched& annealing_sched); + +///@brief Returns the standard deviation of data set x. +double get_std_dev(int n, double sum_x_squared, double av_x);