diff --git a/doc/src/vpr/command_line_usage.rst b/doc/src/vpr/command_line_usage.rst index aa6e97d3e16..f3f9f5caa0f 100644 --- a/doc/src/vpr/command_line_usage.rst +++ b/doc/src/vpr/command_line_usage.rst @@ -701,16 +701,24 @@ If any of init_t, exit_t or alpha_t is specified, the user schedule, with a fixe **Default:** ````. -.. option:: --place_algorithm {bounding_box | path_timing_driven} +.. option:: --place_algorithm {bounding_box | criticality_timing | slack_timing} Controls the algorithm used by the placer. - ``bounding_box`` focuses purely on minimizing the bounding box wirelength of the circuit. + ``bounding_box`` Focuses purely on minimizing the bounding box wirelength of the circuit. Turns off timing analysis if specified. - ``path_timing_driven`` focuses on minimizing both wirelength and the critical path delay. + ``criticality_timing`` Focuses on minimizing both the wirelength and the connection timing costs (criticality * delay). + ``slack_timing`` Focuses on improving the circuit slack values to reduce critical path delay. - **Default:** ``path_timing_driven`` + **Default:** ``criticality_timing`` + +.. option:: --place_quench_algorithm {bounding_box | criticality_timing | slack_timing} + + Controls the algorithm used by the placer during placement quench. + The algorithm options have identical functionality as the ones used by the option ``--place_algorithm``. If specified, it overrides the option ``--place_algorithm`` during placement quench. + + **Default:** ``criticality_timing`` .. option:: --place_chan_width diff --git a/vpr/src/base/CheckSetup.cpp b/vpr/src/base/CheckSetup.cpp index c3ee3ca59b2..cd914374764 100644 --- a/vpr/src/base/CheckSetup.cpp +++ b/vpr/src/base/CheckSetup.cpp @@ -23,7 +23,7 @@ void CheckSetup(const t_packer_opts& PackerOpts, } if ((GLOBAL == RouterOpts.route_type) - && (BOUNDING_BOX_PLACE != PlacerOpts.place_algorithm)) { + && (PlacerOpts.place_algorithm.is_timing_driven())) { /* Works, but very weird. Can't optimize timing well, since you're * not doing proper architecture delay modelling. 
*/ VTR_LOG_WARN( @@ -32,7 +32,7 @@ void CheckSetup(const t_packer_opts& PackerOpts, } if ((false == Timing.timing_analysis_enabled) - && (PlacerOpts.place_algorithm == PATH_TIMING_DRIVEN_PLACE)) { + && (PlacerOpts.place_algorithm.is_timing_driven())) { /* May work, not tested */ VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Timing analysis must be enabled for timing-driven placement.\n"); diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index b318c13e4cb..5eeb45d61a7 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -534,6 +534,7 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts) PlacerOpts->td_place_exp_last = Options.place_exp_last; PlacerOpts->place_algorithm = Options.PlaceAlgorithm; + PlacerOpts->place_quench_algorithm = Options.PlaceQuenchAlgorithm; PlacerOpts->constraints_file = Options.constraints_file; diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp index a2a17b56f66..e7127fc3ff9 100644 --- a/vpr/src/base/ShowSetup.cpp +++ b/vpr/src/base/ShowSetup.cpp @@ -499,12 +499,15 @@ static void ShowPlacerOpts(const t_placer_opts& PlacerOpts, if ((PLACE_ONCE == PlacerOpts.place_freq) || (PLACE_ALWAYS == PlacerOpts.place_freq)) { VTR_LOG("PlacerOpts.place_algorithm: "); - switch (PlacerOpts.place_algorithm) { + switch (PlacerOpts.place_algorithm.get()) { case BOUNDING_BOX_PLACE: VTR_LOG("BOUNDING_BOX_PLACE\n"); break; - case PATH_TIMING_DRIVEN_PLACE: - VTR_LOG("PATH_TIMING_DRIVEN_PLACE\n"); + case CRITICALITY_TIMING_PLACE: + VTR_LOG("CRITICALITY_TIMING_PLACE\n"); + break; + case SLACK_TIMING_PLACE: + VTR_LOG("SLACK_TIMING_PLACE\n"); break; default: VTR_LOG_ERROR("Unknown placement algorithm\n"); @@ -533,7 +536,7 @@ static void ShowPlacerOpts(const t_placer_opts& PlacerOpts, VTR_LOG("PlacerOpts.place_chan_width: %d\n", PlacerOpts.place_chan_width); - if (PATH_TIMING_DRIVEN_PLACE == PlacerOpts.place_algorithm) { + if (PlacerOpts.place_algorithm.is_timing_driven()) { VTR_LOG("PlacerOpts.inner_loop_recompute_divider: %d\n", PlacerOpts.inner_loop_recompute_divider); VTR_LOG("PlacerOpts.recompute_crit_iter: %d\n", PlacerOpts.recompute_crit_iter); VTR_LOG("PlacerOpts.timing_tradeoff: %f\n", PlacerOpts.timing_tradeoff); diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index 2a8ea5a230e..bb9907448ab 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -350,13 +350,22 @@ struct ParsePlaceDeltaDelayAlgorithm { struct ParsePlaceAlgorithm { ConvertedValue from_str(std::string str) { ConvertedValue conv_value; - if (str == "bounding_box") + if (str == "bounding_box") { conv_value.set_value(BOUNDING_BOX_PLACE); - else if (str == "path_timing_driven") - conv_value.set_value(PATH_TIMING_DRIVEN_PLACE); - else { + } else if (str == "criticality_timing") { + conv_value.set_value(CRITICALITY_TIMING_PLACE); + } else if (str == "slack_timing") { + conv_value.set_value(SLACK_TIMING_PLACE); + } else { std::stringstream msg; - msg << "Invalid conversion from '" << str << "' to e_router_algorithm (expected one of: " << argparse::join(default_choices(), ", ") << ")"; + msg << "Invalid conversion from '" << str << "' to e_place_algorithm (expected one of: " << argparse::join(default_choices(), ", ") << ")"; + + //Deprecated option: "path_timing_driven" -> PATH_DRIVEN_TIMING_PLACE + //New option: "criticality_timing" -> CRITICALITY_TIMING_PLACE + if (str == "path_timing_driven") { + msg << "\nDeprecated option: 'path_timing_driven'. 
It has been renamed to 'criticality_timing'"; + } + conv_value.set_error(msg.str()); } return conv_value; @@ -364,17 +373,19 @@ struct ParsePlaceAlgorithm { ConvertedValue to_str(e_place_algorithm val) { ConvertedValue conv_value; - if (val == BOUNDING_BOX_PLACE) + if (val == BOUNDING_BOX_PLACE) { conv_value.set_value("bounding_box"); - else { - VTR_ASSERT(val == PATH_TIMING_DRIVEN_PLACE); - conv_value.set_value("path_timing_driven"); + } else if (val == CRITICALITY_TIMING_PLACE) { + conv_value.set_value("criticality_timing"); + } else { + VTR_ASSERT(val == SLACK_TIMING_PLACE); + conv_value.set_value("slack_timing"); } return conv_value; } std::vector default_choices() { - return {"bounding_box", "path_timing_driven"}; + return {"bounding_box", "criticality_timing", "slack_timing"}; } }; @@ -1679,9 +1690,25 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .show_in(argparse::ShowIn::HELP_ONLY); place_grp.add_argument(args.PlaceAlgorithm, "--place_algorithm") - .help("Controls which placement algorithm is used") - .default_value("path_timing_driven") - .choices({"bounding_box", "path_timing_driven"}) + .help( + "Controls which placement algorithm is used. Valid options:\n" + " * bounding_box: Focuses purely on minimizing the bounding box wirelength of the circuit. Turns off timing analysis if specified.\n" + " * criticality_timing: Focuses on minimizing both the wirelength and the connection timing costs (criticality * delay).\n" + " * slack_timing: Focuses on improving the circuit slack values to reduce critical path delay.\n") + .default_value("criticality_timing") + .choices({"bounding_box", "criticality_timing", "slack_timing"}) + .show_in(argparse::ShowIn::HELP_ONLY); + + place_grp.add_argument(args.PlaceQuenchAlgorithm, "--place_quench_algorithm") + .help( + "Controls which placement algorithm is used during placement quench.\n" + "If specified, it overrides the option --place_algorithm during placement quench.\n" + "Valid options:\n" + " * bounding_box: Focuses purely on minimizing the bounding box wirelength of the circuit. Turns off timing analysis if specified.\n" + " * criticality_timing: Focuses on minimizing both the wirelength and the connection timing costs (criticality * delay).\n" + " * slack_timing: Focuses on improving the circuit slack values to reduce critical path delay.\n") + .default_value("criticality_timing") + .choices({"bounding_box", "criticality_timing", "slack_timing"}) .show_in(argparse::ShowIn::HELP_ONLY); place_grp.add_argument(args.PlaceChanWidth, "--place_chan_width") @@ -2314,12 +2341,17 @@ void set_conditional_defaults(t_options& args) { //Which placement algorithm to use? if (args.PlaceAlgorithm.provenance() != Provenance::SPECIFIED) { if (args.timing_analysis) { - args.PlaceAlgorithm.set(PATH_TIMING_DRIVEN_PLACE, Provenance::INFERRED); + args.PlaceAlgorithm.set(CRITICALITY_TIMING_PLACE, Provenance::INFERRED); } else { args.PlaceAlgorithm.set(BOUNDING_BOX_PLACE, Provenance::INFERRED); } } + //Which placement algorithm to use during placement quench? 
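+    //If not specified on the command line, the quench algorithm simply inherits whatever --place_algorithm resolved to above (whether specified or inferred).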
+ if (args.PlaceQuenchAlgorithm.provenance() != Provenance::SPECIFIED) { + args.PlaceQuenchAlgorithm.set(args.PlaceAlgorithm, Provenance::INFERRED); + } + //Place chan width follows Route chan width if unspecified if (args.PlaceChanWidth.provenance() != Provenance::SPECIFIED && args.RouteChanWidth.provenance() == Provenance::SPECIFIED) { args.PlaceChanWidth.set(args.RouteChanWidth.value(), Provenance::INFERRED); diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h index 3ac214d20a2..e377c70bd7f 100644 --- a/vpr/src/base/read_options.h +++ b/vpr/src/base/read_options.h @@ -104,6 +104,7 @@ struct t_options { argparse::ArgValue PlaceSuccessTarget; argparse::ArgValue anneal_sched_type; argparse::ArgValue PlaceAlgorithm; + argparse::ArgValue PlaceQuenchAlgorithm; argparse::ArgValue pad_loc_type; argparse::ArgValue PlaceChanWidth; argparse::ArgValue place_rlim_escape_fraction; diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 91174023c63..105a2eb84d3 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -829,29 +829,82 @@ struct t_annealing_sched { float success_target; }; -/* Various options for the placer. * - * place_algorithm: BOUNDING_BOX_PLACE or PATH_TIMING_DRIVEN_PLACE * - * timing_tradeoff: When TIMING_DRIVEN_PLACE mode, what is the tradeoff * - * timing driven and BOUNDING_BOX_PLACE. * - * place_cost_exp: Power to which denominator is raised for linear_cong. * - * place_chan_width: The channel width assumed if only one placement is * - * performed. * - * pad_loc_type: Are pins free to move during placement or fixed randomly. * - * constraints_file: File used to lock block locations during placement. * - * place_freq: Should the placement be skipped, done once, or done for each * - * channel width in the binary search. * - * recompute_crit_iter: how many temperature stages pass before we recompute * - * criticalities based on average point to point delay * - * inner_loop_crit_divider: (move_lim/inner_loop_crit_divider) determines how* - * many inner_loop iterations pass before a recompute of * - * criticalities is done. * - * td_place_exp_first: exponent that is used on the timing_driven criticlity * - * it is the value that the exponent starts at. * - * td_place_exp_last: value that the criticality exponent will be at the end * - * doPlacement: true if placement is supposed to be done in the CAD flow, false otherwise */ +/****************************************************************** + * Placer data types + *******************************************************************/ + +/** + * @brief Types of placement algorithms used in the placer. + * + * @param BOUNDING_BOX_PLACE + * Focuses purely on minimizing the bounding + * box wirelength of the circuit. + * @param CRITICALITY_TIMING_PLACE + * Focuses on minimizing both the wirelength and the + * connection timing costs (criticality * delay). + * @param SLACK_TIMING_PLACE + * Focuses on improving the circuit slack values + * to reduce critical path delay. + * + * The default is to use CRITICALITY_TIMING_PLACE. BOUNDING_BOX_PLACE + * is used when there is no timing information available (wiring only). + * SLACK_TIMING_PLACE is mainly feasible during placement quench. + */ enum e_place_algorithm { BOUNDING_BOX_PLACE, - PATH_TIMING_DRIVEN_PLACE + CRITICALITY_TIMING_PLACE, + SLACK_TIMING_PLACE +}; + +/** + * @brief Provides a wrapper around enum e_place_algorithm. 
+ * + * Supports the method is_timing_driven(), which allows flexible updates + * to the placer algorithms if more timing driven placement strategies + * are added in the future. This method is used across various placement + * setup files, and it can be useful for major placer routines as well. + * + * More methods can be added to this class if the placement strategies + * are further divided into more categories in the future. + * + * Also supports assignments and comparisons between t_place_algorithm + * and e_place_algorithm so as not to break existing code. + */ +class t_place_algorithm { + public: + //Constructors + t_place_algorithm() = default; + t_place_algorithm(e_place_algorithm _algo) + : algo(_algo) {} + ~t_place_algorithm() = default; + + //Assignment operators + t_place_algorithm& operator=(const t_place_algorithm& rhs) { + algo = rhs.algo; + return *this; + } + t_place_algorithm& operator=(e_place_algorithm rhs) { + algo = rhs; + return *this; + } + + //Equality operators + bool operator==(const t_place_algorithm& rhs) const { return algo == rhs.algo; } + bool operator==(e_place_algorithm rhs) const { return algo == rhs; } + bool operator!=(const t_place_algorithm& rhs) const { return algo != rhs.algo; } + bool operator!=(e_place_algorithm rhs) const { return algo != rhs; } + + ///@brief Check if the algorithm belongs to the timing driven category. + inline bool is_timing_driven() const { + return algo == CRITICALITY_TIMING_PLACE || algo == SLACK_TIMING_PLACE; + } + + ///@brief Accessor: returns the underlying e_place_algorithm enum value. + e_place_algorithm get() const { return algo; } + + private: + ///@brief The underlying algorithm. Default set to CRITICALITY_TIMING_PLACE. + e_place_algorithm algo = e_place_algorithm::CRITICALITY_TIMING_PLACE; }; enum e_pad_loc_type { @@ -859,6 +912,7 @@ enum e_pad_loc_type { RANDOM }; +///@brief Used to calculate the inner placer loop's block swapping limit move_lim. enum e_place_effort_scaling { CIRCUIT, /// bb_updated_before; * Net connection delays based on the placement. * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ -static ClbNetPinsMatrix connection_delay; //Delays based on commited block positions +static ClbNetPinsMatrix connection_delay; //Delays based on committed block positions static ClbNetPinsMatrix proposed_connection_delay; //Delays for proposed block positions (only - // for connections effected by move, otherwise - // INVALID_DELAY) +// for connections affected by move, otherwise +// INVALID_DELAY) + +static ClbNetPinsMatrix connection_setup_slack; //Setup slacks based on most recently updated timing graph /* * Timing cost of connections (i.e. criticality * delay). * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ -static PlacerTimingCosts connection_timing_cost; //Costs of commited block positions +static PlacerTimingCosts connection_timing_cost; //Costs of committed block positions static ClbNetPinsMatrix proposed_connection_timing_cost; //Costs for proposed block positions - // (only for connectsion effected by - // move, otherwise INVALID_DELAY) +// (only for connections affected by +// move, otherwise INVALID_DELAY) /* * Timing cost of nets (i.e. sum of criticality * delay for each net sink/connection). * Index ranges: [0..cluster_ctx.clb_nlist.nets().size()-1] */ static vtr::vector net_timing_cost; //Like connection_timing_cost, but summed - // accross net pins.
Used to allow more - // efficient recalculation of timing cost - // if only a sub-set of nets are changed - // while maintaining numeric stability. +// accross net pins. Used to allow more +// efficient recalculation of timing cost +// if only a sub-set of nets are changed +// while maintaining numeric stability. /* [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the bounding box coordinates and the number of * * blocks on each of a net's bounding box (to allow efficient updates), * @@ -332,40 +334,44 @@ static void update_move_nets(int num_nets_affected); static void reset_move_nets(int num_nets_affected); static e_move_result try_swap(float t, + float crit_exponent, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, float rlim, MoveGenerator& move_generator, - TimingInfo* timing_info, + SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, float rlim_escape_fraction, - enum e_place_algorithm place_algorithm, + const t_place_algorithm& place_algorithm, float timing_tradeoff); static void check_place(const t_placer_costs& costs, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, - enum e_place_algorithm place_algorithm); + const t_place_algorithm& place_algorithm); static int check_placement_costs(const t_placer_costs& costs, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, - enum e_place_algorithm place_algorithm); + const t_place_algorithm& place_algorithm); static int check_placement_consistency(); static int check_block_placement_consistency(); static int check_macro_placement_consistency(); -static float starting_t(t_placer_costs* costs, +static float starting_t(float crit_exponent, + t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, t_annealing_sched annealing_sched, int max_moves, float rlim, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, - TimingInfo* timing_info, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + SetupTimingInfo* timing_info, MoveGenerator& move_generator, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, @@ -389,13 +395,17 @@ static float comp_td_connection_delay(const PlaceDelayModel* delay_model, Cluste static void comp_td_connection_delays(const PlaceDelayModel* delay_model); +static void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks); + +static bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks); + static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected); -static void invalidate_affected_connection_delays(const t_pl_blocks_to_be_moved& blocks_affected, - ClusteredPinTimingInvalidator* pin_tedges_invalidator, - TimingInfo* timing_info); +static void invalidate_affected_connections(const t_pl_blocks_to_be_moved& blocks_affected, + ClusteredPinTimingInvalidator* pin_tedges_invalidator, + TimingInfo* timing_info); static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected); @@ -407,13 +417,15 @@ static double comp_td_connection_cost(const PlaceDelayModel* delay_mode, const P static double sum_td_net_cost(ClusterNetId net); static double sum_td_costs(); +static float 
analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks); + static e_move_result assess_swap(double delta_c, double t); static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new); static void update_bb(ClusterNetId net_id, t_bb* bb_coord_new, t_bb* bb_edge_new, int xold, int yold, int xnew, int ynew); -static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm, +static int find_affected_nets_and_update_costs(const t_place_algorithm& place_algorithm, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, t_pl_blocks_to_be_moved& blocks_affected, @@ -442,23 +454,43 @@ static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr); static void free_try_swap_arrays(); -static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, - t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - int num_connections, - float crit_exponent, - int* outer_crit_iter_count, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info); - -static void recompute_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info, - t_placer_costs* costs); +static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, + t_placer_costs* costs, + t_placer_prev_inverse_costs* prev_inverse_costs, + int num_connections, + float crit_exponent, + int* outer_crit_iter_count, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info); + +static void initialize_timing_info(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_costs* costs); + +static void update_timing_classes(float crit_exponent, + SetupTimingInfo* timing_info, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator); + +static void update_timing_cost(const PlaceDelayModel* delay_model, + const PlacerCriticalities* criticalities, + double* timing_cost); + +static void perform_full_timing_update(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_costs* costs); static void placement_inner_loop(float t, int temp_num, @@ -474,9 +506,11 @@ static void placement_inner_loop(float t, ClusteredPinTimingInvalidator* pin_timing_invalidator, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, - SetupTimingInfo* timing_info); + SetupTimingInfo* timing_info, + const t_place_algorithm& place_algorithm); static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, const PlaceDelayModel* delay_model, @@ -550,6 +584,7 @@ void try_place(const t_placer_opts& placer_opts, std::shared_ptr placement_delay_calc; std::unique_ptr place_delay_model; std::unique_ptr move_generator; + std::unique_ptr 
placer_setup_slacks; std::unique_ptr placer_criticalities; std::unique_ptr pin_timing_invalidator; @@ -564,7 +599,7 @@ void try_place(const t_placer_opts& placer_opts, num_swap_aborted = 0; num_ts_called = 0; - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm.is_timing_driven()) { /*do this before the initial placement to avoid messing up the initial placement */ place_delay_model = alloc_lookups_and_criticalities(chan_width_dist, placer_opts, router_opts, det_routing_arch, segment_inf, directs, num_directs); @@ -595,7 +630,7 @@ void try_place(const t_placer_opts& placer_opts, /* Gets initial cost and loads bounding boxes. */ - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm.is_timing_driven()) { costs.bb_cost = comp_bb_cost(NORMAL); first_crit_exponent = placer_opts.td_place_exp_first; /*this will be modified when rlim starts to change */ @@ -617,6 +652,8 @@ void try_place(const t_placer_opts& placer_opts, timing_info = make_setup_timing_info(placement_delay_calc, placer_opts.timing_update_type); + placer_setup_slacks = std::make_unique(cluster_ctx.clb_nlist, netlist_pin_lookup); + placer_criticalities = std::make_unique(cluster_ctx.clb_nlist, netlist_pin_lookup); pin_timing_invalidator = std::make_unique(cluster_ctx.clb_nlist, @@ -624,15 +661,14 @@ void try_place(const t_placer_opts& placer_opts, atom_ctx.nlist, atom_ctx.lookup, *timing_info->timing_graph()); - //Update timing and costs - recompute_criticalities(first_crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); - - timing_info->set_warn_unconstrained(false); //Don't warn again about unconstrained nodes again during placement + //First time compute timing and costs, compute from scratch + initialize_timing_info(first_crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &costs); critical_path = timing_info->least_slack_critical_path(); @@ -651,9 +687,9 @@ void try_place(const t_placer_opts& placer_opts, prev_inverse_costs.timing_cost = 1 / costs.timing_cost; prev_inverse_costs.bb_cost = 1 / costs.bb_cost; costs.cost = 1; /*our new cost function uses normalized values of */ - /*bb_cost and timing_cost, the value of cost will be reset */ - /*to 1 at each temperature when *_TIMING_DRIVEN_PLACE is true */ - } else { /*BOUNDING_BOX_PLACE */ + /*bb_cost and timing_cost, the value of cost will be reset */ + /*to 1 at each temperature when *_TIMING_DRIVEN_PLACE is true */ + } else { /*BOUNDING_BOX_PLACE */ costs.cost = costs.bb_cost = comp_bb_cost(NORMAL); costs.timing_cost = 0; outer_crit_iter_count = 0; @@ -670,7 +706,7 @@ void try_place(const t_placer_opts& placer_opts, //Initial pacement statistics VTR_LOG("Initial placement cost: %g bb_cost: %g td_cost: %g\n", costs.cost, costs.bb_cost, costs.timing_cost); - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm.is_timing_driven()) { VTR_LOG("Initial placement estimated Critical Path Delay (CPD): %g ns\n", 1e9 * critical_path.delay()); VTR_LOG("Initial placement estimated setup Total Negative Slack (sTNS): %g ns\n", @@ -745,10 +781,12 @@ void try_place(const t_placer_opts& placer_opts, first_rlim = (float)max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); - float first_t = starting_t(&costs, &prev_inverse_costs, + float first_t = 
starting_t(first_crit_exponent, + &costs, &prev_inverse_costs, annealing_sched, move_lim, first_rlim, place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), timing_info.get(), *move_generator, pin_timing_invalidator.get(), @@ -774,30 +812,33 @@ void try_place(const t_placer_opts& placer_opts, /* Outer loop of the simulated annealing begins */ do { vtr::Timer temperature_timer; - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm.is_timing_driven()) { costs.cost = 1; } - outer_loop_recompute_criticalities(placer_opts, &costs, &prev_inverse_costs, - num_connections, - state.crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get()); + outer_loop_update_timing_info(placer_opts, + &costs, &prev_inverse_costs, + num_connections, + state.crit_exponent, + &outer_crit_iter_count, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get()); placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, state.move_lim, state.crit_exponent, inner_recompute_limit, &stats, - &costs, - &prev_inverse_costs, + &costs, &prev_inverse_costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), *move_generator, blocks_affected, - timing_info.get()); + timing_info.get(), + placer_opts.place_algorithm); tot_iter += state.move_lim; @@ -805,7 +846,7 @@ void try_place(const t_placer_opts& placer_opts, ++num_temps; - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm.is_timing_driven()) { critical_path = timing_info->least_slack_critical_path(); sTNS = timing_info->setup_total_negative_slack(); sWNS = timing_info->setup_worst_negative_slack(); @@ -834,15 +875,16 @@ void try_place(const t_placer_opts& placer_opts, { /* Quench */ vtr::ScopedFinishTimer temperature_timer("Placement Quench"); - outer_loop_recompute_criticalities(placer_opts, &costs, - &prev_inverse_costs, - num_connections, - state.crit_exponent, - &outer_crit_iter_count, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get()); + outer_loop_update_timing_info(placer_opts, + &costs, &prev_inverse_costs, + num_connections, + state.crit_exponent, + &outer_crit_iter_count, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get()); state.t = 0; /* freeze out */ @@ -850,22 +892,23 @@ void try_place(const t_placer_opts& placer_opts, * which reduce the cost of the placement */ placement_inner_loop(state.t, num_temps, state.rlim, placer_opts, move_lim, state.crit_exponent, quench_recompute_limit, &stats, - &costs, - &prev_inverse_costs, + &costs, &prev_inverse_costs, &moves_since_cost_recompute, pin_timing_invalidator.get(), place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), *move_generator, blocks_affected, - timing_info.get()); + timing_info.get(), + placer_opts.place_quench_algorithm); tot_iter += move_lim; ++num_temps; calc_placer_stats(stats, success_rat, std_dev, costs, move_lim); - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_quench_algorithm.is_timing_driven()) { critical_path = timing_info->least_slack_critical_path(); sTNS = timing_info->setup_total_negative_slack(); sWNS = 
timing_info->setup_worst_negative_slack(); @@ -902,17 +945,16 @@ void try_place(const t_placer_opts& placer_opts, VTR_LOG("Swaps called: %d\n", num_ts_called); report_aborted_moves(); - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm.is_timing_driven()) { //Final timing estimate VTR_ASSERT(timing_info); - - //Update timing and costs - recompute_criticalities(state.crit_exponent, - place_delay_model.get(), - placer_criticalities.get(), - pin_timing_invalidator.get(), - timing_info.get(), - &costs); + perform_full_timing_update(state.crit_exponent, + place_delay_model.get(), + placer_criticalities.get(), + placer_setup_slacks.get(), + pin_timing_invalidator.get(), + timing_info.get(), + &costs); critical_path = timing_info->least_slack_critical_path(); @@ -965,19 +1007,21 @@ void try_place(const t_placer_opts& placer_opts, VTR_LOG("update_td_costs: connections %g nets %g sum_nets %g total %g\n", f_update_td_costs_connections_elapsed_sec, f_update_td_costs_nets_elapsed_sec, f_update_td_costs_sum_nets_elapsed_sec, f_update_td_costs_total_elapsed_sec); } -/* Function to recompute the criticalities before the inner loop of the annealing */ -static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, - t_placer_costs* costs, - t_placer_prev_inverse_costs* prev_inverse_costs, - int num_connections, - float crit_exponent, - int* outer_crit_iter_count, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - SetupTimingInfo* timing_info) { - if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE) +/* Function to update the setup slacks and criticalities before the inner loop of the annealing/quench */ +static void outer_loop_update_timing_info(const t_placer_opts& placer_opts, + t_placer_costs* costs, + t_placer_prev_inverse_costs* prev_inverse_costs, + int num_connections, + float crit_exponent, + int* outer_crit_iter_count, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info) { + if (!placer_opts.place_algorithm.is_timing_driven()) { return; + } /*at each temperature change we update these values to be used */ /*for normalizing the tradeoff between timing and wirelength (bb) */ @@ -989,13 +1033,15 @@ static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, num_connections = std::max(num_connections, 1); //Avoid division by zero VTR_ASSERT(num_connections > 0); - //Update timing information - recompute_criticalities(crit_exponent, - delay_model, - criticalities, - pin_timing_invalidator, - timing_info, - costs); + //Update all timing related classes + perform_full_timing_update(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + costs); + *outer_crit_iter_count = 0; } (*outer_crit_iter_count)++; @@ -1007,29 +1053,135 @@ static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts, prev_inverse_costs->timing_cost = min(1 / costs->timing_cost, MAX_INV_TIMING_COST); } -//Update timing information based on current placement by running STA to get new slacks, -//and calculate updated criticalities and timing costs -static void recompute_criticalities(float crit_exponent, - const PlaceDelayModel* delay_model, - PlacerCriticalities* criticalities, - ClusteredPinTimingInvalidator* pin_timing_invalidator, - 
SetupTimingInfo* timing_info, - t_placer_costs* costs) { - //Run STA to update slacks and adjusted/relaxed criticalities +static void initialize_timing_info(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_costs* costs) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& clb_nlist = cluster_ctx.clb_nlist; + + //As a safety measure, for the first time update, + //invalidate all timing edges via the pin invalidator + //by passing in all the clb sink pins + for (ClusterNetId net_id : clb_nlist.nets()) { + for (ClusterPinId pin_id : clb_nlist.net_sinks(net_id)) { + pin_timing_invalidator->invalidate_connection(pin_id, timing_info); + } + } + + //Perform first time update for all timing related classes + perform_full_timing_update(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + costs); + + //Don't warn again about unconstrained nodes again during placement + timing_info->set_warn_unconstrained(false); +} + +/** + * @brief Update timing information based on the current block positions. + * + * Run STA to update the timing info class. + * + * Update the values stored in PlacerCriticalities and PlacerSetupSlacks + * if they are enabled to update. To enable updating, call their respective + * enable_update() method. See their documentation for more detailed info. + * + * If criticalities are updated, the timing driven costs should be updated + * as well by calling update_timing_cost(). Calling this routine to update + * timing_cost will produce round-off error in the long run due to its + * incremental nature, so the timing cost value will be recomputed once in + * a while, via other timing driven routines. + * + * If setup slacks are updated, then normally they should be committed to + * `connection_setup_slack` via commit_setup_slacks() routine. However, + * sometimes new setup slack values are not committed immediately if we + * expect to revert the current timing update in the near future, or if + * we wish to compare the new slack values to the original ones. + * + * All the pins with changed connection delays have already been added into + * the ClusteredPinTimingInvalidator to allow incremental STA update. These + * changed connection delays are a direct result of moved blocks in try_swap(). + */ +static void update_timing_classes(float crit_exponent, + SetupTimingInfo* timing_info, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator) { + /* Run STA to update slacks and adjusted/relaxed criticalities. */ timing_info->update(); - //Update placer'criticalities (e.g. sharpen with crit_exponent) + /* Update the placer's criticalities (e.g. sharpen with crit_exponent). */ criticalities->update_criticalities(timing_info, crit_exponent); - //Update connection, net and total timing costs based on new criticalities + /* Update the placer's raw setup slacks. */ + setup_slacks->update_setup_slacks(timing_info); + + /* Clear invalidation state. */ + pin_timing_invalidator->reset(); +} + +/** + * @brief Update the timing driven (td) costs. + * + * This routine either uses incremental update_td_costs(), or updates + * from scratch using comp_td_costs(). 
By default, it is incremental + * by iterating over the set of clustered netlist connections/pins + * returned by PlacerCriticalities::pins_with_modified_criticality(). + * + * Hence, this routine should always be called when PlacerCriticalities + * is enabled for update in update_timing_classes(). Otherwise, the + * incremental method will no longer be correct. + */ +static void update_timing_cost(const PlaceDelayModel* delay_model, + const PlacerCriticalities* criticalities, + double* timing_cost) { #ifdef INCR_COMP_TD_COSTS - update_td_costs(delay_model, *criticalities, &costs->timing_cost); + update_td_costs(delay_model, *criticalities, timing_cost); #else - comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + comp_td_costs(delay_model, *criticalities, timing_cost); #endif +} - //Clear invalidation state - pin_timing_invalidator->reset(); +/** + * @brief Update all timing related classes, variables, and structures. + * + * This routine exists to reduce code duplication, as the placer routines + * often need to update all of the timing related state. + * + * Updates: SetupTimingInfo, PlacerCriticalities, PlacerSetupSlacks, + * timing_cost, connection_setup_slack. + */ +static void perform_full_timing_update(float crit_exponent, + const PlaceDelayModel* delay_model, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + ClusteredPinTimingInvalidator* pin_timing_invalidator, + SetupTimingInfo* timing_info, + t_placer_costs* costs) { + /* Update all timing related classes. */ + criticalities->enable_update(); + setup_slacks->enable_update(); + update_timing_classes(crit_exponent, + timing_info, + criticalities, + setup_slacks, + pin_timing_invalidator); + + /* Update the timing cost with new connection criticalities. */ + update_timing_cost(delay_model, + criticalities, + &costs->timing_cost); + + /* Commit the setup slacks since they are updated. */ + commit_setup_slacks(setup_slacks); } /* Function which contains the inner loop of the simulated annealing */ @@ -1047,9 +1199,11 @@ static void placement_inner_loop(float t, ClusteredPinTimingInvalidator* pin_timing_invalidator, const PlaceDelayModel* delay_model, PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, MoveGenerator& move_generator, t_pl_blocks_to_be_moved& blocks_affected, - SetupTimingInfo* timing_info) { + SetupTimingInfo* timing_info, + const t_place_algorithm& place_algorithm) { int inner_crit_iter_count, inner_iter; int inner_placement_save_count = 0; //How many times have we dumped placement to a file this temperature? @@ -1064,15 +1218,20 @@ static void placement_inner_loop(float t, /* Inner loop begins */ for (inner_iter = 0; inner_iter < move_lim; inner_iter++) { - e_move_result swap_result = try_swap(t, costs, prev_inverse_costs, rlim, + e_move_result swap_result = try_swap(t, + crit_exponent, + costs, + prev_inverse_costs, + rlim, move_generator, timing_info, pin_timing_invalidator, blocks_affected, delay_model, criticalities, + setup_slacks, placer_opts.rlim_escape_fraction, - placer_opts.place_algorithm, + place_algorithm, placer_opts.timing_tradeoff); if (swap_result == ACCEPTED) { @@ -1089,7 +1248,7 @@ static void placement_inner_loop(float t, num_swap_rejected++; } - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (place_algorithm.is_timing_driven()) { /* Do we want to re-timing analyze the circuit to get updated slack and criticality values? * We do this only once in a while, since it is expensive.
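* (How often this happens is governed by inner_recompute_limit, tracked with inner_crit_iter_count.)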
*/ @@ -1100,15 +1259,14 @@ static void placement_inner_loop(float t, #ifdef VERBOSE VTR_LOG("Inner loop recompute criticalities\n"); #endif - /* Using the delays in connection_delay, do a timing analysis to update slacks and - * criticalities and update the timing cost since it will change. - */ - recompute_criticalities(crit_exponent, - delay_model, - criticalities, - pin_timing_invalidator, - timing_info, - costs); + //Update all timing related classes + perform_full_timing_update(crit_exponent, + delay_model, + criticalities, + setup_slacks, + pin_timing_invalidator, + timing_info, + costs); } inner_crit_iter_count++; } @@ -1122,7 +1280,7 @@ static void placement_inner_loop(float t, /* Lines below prevent too much round-off error from accumulating * in the cost over many iterations (due to incremental updates). - * This round-off can lead to error checks failing because the cost + * This round-off can lead to error checks failing because the cost * is different from what you get when you recompute from scratch. */ ++(*moves_since_cost_recompute); @@ -1155,7 +1313,7 @@ static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, } costs->bb_cost = new_bb_cost; - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm.is_timing_driven()) { double new_timing_cost = 0.; comp_td_costs(delay_model, *criticalities, &new_timing_cost); if (fabs(new_timing_cost - costs->timing_cost) > costs->timing_cost * ERROR_TOL) { @@ -1281,7 +1439,7 @@ static bool update_annealing_state(t_annealing_state* state, // The idea is that as the range limit shrinks (indicating we are fine-tuning a more optimized placement) we can focus more on a smaller number of critical connections, which a higher crit_exponent achieves. update_rlim(&state->rlim, success_rat, device_ctx.grid); - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm.is_timing_driven()) { state->crit_exponent = (1 - (state->rlim - FINAL_RLIM) * state->inverse_delta_rlim) * (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first) + placer_opts.td_place_exp_first; @@ -1290,14 +1448,16 @@ static bool update_annealing_state(t_annealing_state* state, return true; } -static float starting_t(t_placer_costs* costs, +static float starting_t(float crit_exponent, + t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, t_annealing_sched annealing_sched, int max_moves, float rlim, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, - TimingInfo* timing_info, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, + SetupTimingInfo* timing_info, MoveGenerator& move_generator, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, @@ -1318,16 +1478,23 @@ static float starting_t(t_placer_costs* costs, av = 0.; sum_of_squares = 0.; - /* Try one move per block. Set t high so essentially all accepted. */ + /* Try one move per block. Set the temperature high so essentially all accepted. 
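+     * The cost statistics gathered over these trial moves (av, sum_of_squares) are then used to choose the initial annealing temperature.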
*/ + float t = HUGE_POSITIVE_FLOAT; for (i = 0; i < move_lim; i++) { - e_move_result swap_result = try_swap(HUGE_POSITIVE_FLOAT, costs, prev_inverse_costs, rlim, + //Will not deploy setup slack analysis, so omit crit_exponenet and setup_slack + e_move_result swap_result = try_swap(t, + crit_exponent, + costs, + prev_inverse_costs, + rlim, move_generator, timing_info, pin_timing_invalidator, blocks_affected, delay_model, criticalities, + setup_slacks, placer_opts.rlim_escape_fraction, placer_opts.place_algorithm, placer_opts.timing_tradeoff); @@ -1392,17 +1559,19 @@ static void reset_move_nets(int num_nets_affected) { } static e_move_result try_swap(float t, + float crit_exponent, t_placer_costs* costs, t_placer_prev_inverse_costs* prev_inverse_costs, float rlim, MoveGenerator& move_generator, - TimingInfo* timing_info, + SetupTimingInfo* timing_info, ClusteredPinTimingInvalidator* pin_timing_invalidator, t_pl_blocks_to_be_moved& blocks_affected, const PlaceDelayModel* delay_model, - const PlacerCriticalities* criticalities, + PlacerCriticalities* criticalities, + PlacerSetupSlacks* setup_slacks, float rlim_escape_fraction, - enum e_place_algorithm place_algorithm, + const t_place_algorithm& place_algorithm, float timing_tradeoff) { /* Picks some block and moves it to another spot. If this spot is * * occupied, switch the blocks. Assess the change in cost function. * @@ -1463,21 +1632,62 @@ static e_move_result try_swap(float t, //Update the block positions apply_move_blocks(blocks_affected); - // Find all the nets affected by this swap and update their costs + //Find all the nets affected by this swap and update their costs + //This routine calculates new connection delays and timing costs + //and store them in proposed_* data structures + //This routine also calculates the wiring cost, which doesn't + //depend on the timing driven data int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, delay_model, criticalities, blocks_affected, bb_delta_c, timing_delta_c); - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + + //For setup slack analysis, we first do a timing analysis to get the newest slack values + //resulted from the proposed block moves. If the move turns out to be accepted, we keep + //the updated slack values and commit the block moves. If rejected, we reject the proposed + //block moves and revert this timing analysis. + if (place_algorithm == SLACK_TIMING_PLACE) { + //Gather all the connections with modified delays for incremental timing updates. + //This routine relies on comparing proposed_connection_delay and connection_delay. + invalidate_affected_connections(blocks_affected, + pin_timing_invalidator, + timing_info); + + //Update the connection_timing_cost and connection_delay + //values from the temporary values. + commit_td_cost(blocks_affected); + + //Update timing information. Since we are analyzing setup slacks, + //we only update those values and keep the criticalities stale + //so as not to interfere with the original timing driven algorithm. + // + //Note: the timing info must be updated after applying block moves + //and committing the timing driven delays and costs. + //If we wish to revert this timing update due to move rejection, + //we need to revert block moves and restore the timing values. 
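+            //Only the setup slacks are refreshed by this update; criticality updates stay disabled, so the criticality * delay costs are left untouched by the slack analysis.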
+ criticalities->disable_update(); + setup_slacks->enable_update(); + update_timing_classes(crit_exponent, + timing_info, + criticalities, + setup_slacks, + pin_timing_invalidator); + + /* Get the setup slack analysis cost */ + //TODO: calculate a weighted average of the slack cost and wiring cost + delta_c = analyze_setup_slack_cost(setup_slacks); + + } else if (place_algorithm == CRITICALITY_TIMING_PLACE) { /*in this case we redefine delta_c as a combination of timing and bb. * *additionally, we normalize all values, therefore delta_c is in * *relation to 1*/ - delta_c = (1 - timing_tradeoff) * bb_delta_c * prev_inverse_costs->bb_cost + timing_tradeoff * timing_delta_c * prev_inverse_costs->timing_cost; + } else { + VTR_ASSERT(place_algorithm == BOUNDING_BOX_PLACE); delta_c = bb_delta_c; } @@ -1488,18 +1698,28 @@ static e_move_result try_swap(float t, costs->cost += delta_c; costs->bb_cost += bb_delta_c; - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (place_algorithm == SLACK_TIMING_PLACE) { + /* Update the timing driven cost as usual */ + costs->timing_cost += timing_delta_c; + + //Commit the setup slack information + //The timing delay and cost values should be committed already + commit_setup_slacks(setup_slacks); + } + + if (place_algorithm == CRITICALITY_TIMING_PLACE) { costs->timing_cost += timing_delta_c; //Invalidates timing of modified connections for incremental timing updates - //Must be called before commit_td_cost since it relies on comparing - //proposed_connection_delay and connection_delay - invalidate_affected_connection_delays(blocks_affected, - pin_timing_invalidator, - timing_info); - - /*update the connection_timing_cost and connection_delay - * values from the temporary values */ + //This routine relies on comparing proposed_connection_delay and connection_delay + //If the setup slack analysis was not performed, the + //sink pins are yet to be invalidated. + invalidate_affected_connections(blocks_affected, + pin_timing_invalidator, + timing_info); + + //update the connection_timing_cost and connection_delay + //values from the temporary values commit_td_cost(blocks_affected); } @@ -1509,14 +1729,42 @@ static e_move_result try_swap(float t, /* Update clb data structures since we kept the move. */ commit_move_blocks(blocks_affected); - } else { /* Move was rejected. */ - /* Reset the net cost function flags first. */ + } else { //move_outcome == REJECTED + + /* Reset the net cost function flags first. */ reset_move_nets(num_nets_affected); /* Restore the place_ctx.block_locs data structures to their state before the move. */ revert_move_blocks(blocks_affected); - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (place_algorithm == SLACK_TIMING_PLACE) { + //Revert the timing delays and costs to pre-update values + //These routines must be called after reverting the block moves + //TODO: make this process incremental + comp_td_connection_delays(delay_model); + comp_td_costs(delay_model, *criticalities, &costs->timing_cost); + + //Re-invalidate the affected sink pins since the proposed move is + //rejected, and the same blocks are reverted to their original + //positions. The affected sink pins should stay the same. 
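+            //Re-running update_timing_classes() on the restored delays below brings PlacerSetupSlacks back in sync with connection_setup_slack, which the assertion afterwards double-checks.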
+ invalidate_affected_connections(blocks_affected, + pin_timing_invalidator, + timing_info); + + /* Revert the timing update */ + update_timing_classes(crit_exponent, + timing_info, + criticalities, + setup_slacks, + pin_timing_invalidator); + + VTR_ASSERT_SAFE_MSG( + verify_connection_setup_slacks(setup_slacks), + "The current setup slacks should be identical to the values before the try swap timing info update."); + } + + if (place_algorithm == CRITICALITY_TIMING_PLACE) { + /* Unstage the values stored in proposed_* data structures */ revert_td_cost(blocks_affected); } } @@ -1547,14 +1795,14 @@ static e_move_result try_swap(float t, check_place(*costs, delay_model, place_algorithm); #endif - return (move_outcome); + return move_outcome; } //Puts all the nets changed by the current swap into nets_to_update, //and updates their bounding box. // //Returns the number of affected nets. -static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm, +static int find_affected_nets_and_update_costs(const t_place_algorithm& place_algorithm, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, t_pl_blocks_to_be_moved& blocks_affected, @@ -1587,8 +1835,8 @@ static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm //once per net, not once per pin. update_net_bb(net_id, blocks_affected, iblk, blk, blk_pin); - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { - //Determine the change in timing costs if required + if (place_algorithm.is_timing_driven()) { + /* Determine the change in connection delay and timing cost */ update_td_delta_costs(delay_model, *criticalities, net_id, blk_pin, blocks_affected, timing_delta_c); } } @@ -1650,6 +1898,35 @@ static void update_net_bb(const ClusterNetId net, } } +/** + * @brief Calculate the new connection delay and timing cost of all the + * sink pins affected by moving a specific pin to a new location. + * Also calculates the total change in the timing cost. + * + * Assumes that the blocks have been moved to the proposed new locations. + * Otherwise, the routine comp_td_connection_delay() will not be able to + * calculate the most up to date connection delay estimation value. + * + * If the moved pin is a driver pin, then all the sink connections that are + * driven by this driver pin are considered. + * + * If the moved pin is a sink pin, then it is the only pin considered. But + * in some cases, the sink is already accounted for if it is also driven + * by a driver pin located on a moved block. Computing it again would double + * count its affect on the total timing cost change (delta_timing_cost). + * + * It is possible for some connections to have unchanged delays. For instance, + * if we are using a dx/dy delay model, this could occur if a sink pin moved + * to a new position with the same dx/dy from its net's driver pin. + * + * We skip these connections with unchanged delay values as their delay need + * not be updated. Their timing costs also do not require any update, since + * the criticalities values are always kept stale/unchanged during an block + * swap attempt. (Unchanged Delay * Unchanged Criticality = Unchanged Cost) + * + * This is also done to minimize the number of timing node/edge invalidations + * for incremental static timing analysis (incremental STA). 
+ */ static void update_td_delta_costs(const PlaceDelayModel* delay_model, const PlacerCriticalities& criticalities, const ClusterNetId net, @@ -1659,42 +1936,103 @@ static void update_td_delta_costs(const PlaceDelayModel* delay_model, auto& cluster_ctx = g_vpr_ctx.clustering(); if (cluster_ctx.clb_nlist.pin_type(pin) == PinType::DRIVER) { - //This pin is a net driver on a moved block. - //Re-compute all point to point connections for this net. + /* This pin is a net driver on a moved block. */ + /* Recompute all point to point connection delays for the net sinks. */ for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) { float temp_delay = comp_td_connection_delay(delay_model, net, ipin); - proposed_connection_delay[net][ipin] = temp_delay; + /* If the delay hasn't changed, do not mark this pin as affected */ + if (temp_delay == connection_delay[net][ipin]) { + continue; + } + /* Calculate proposed delay and cost values */ + proposed_connection_delay[net][ipin] = temp_delay; proposed_connection_timing_cost[net][ipin] = criticalities.criticality(net, ipin) * temp_delay; delta_timing_cost += proposed_connection_timing_cost[net][ipin] - connection_timing_cost[net][ipin]; + /* Record this connection in blocks_affected.affected_pins */ ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net, ipin); blocks_affected.affected_pins.push_back(sink_pin); } } else { - //This pin is a net sink on a moved block + /* This pin is a net sink on a moved block */ VTR_ASSERT_SAFE(cluster_ctx.clb_nlist.pin_type(pin) == PinType::SINK); - //If this net is being driven by a moved block, we do not - //need to compute the change in the timing cost (here) since it will - //be computed by the net's driver pin (since the driver block moved). - // - //Computing it here would double count the change, and mess up the - //delta_timing_cost value. + /* Check if this sink's net is driven by a moved block */ if (!driven_by_moved_block(net, blocks_affected)) { - int net_pin = cluster_ctx.clb_nlist.pin_net_index(pin); + /* Get the sink pin index in the net */ + int ipin = cluster_ctx.clb_nlist.pin_net_index(pin); - float temp_delay = comp_td_connection_delay(delay_model, net, net_pin); - proposed_connection_delay[net][net_pin] = temp_delay; + float temp_delay = comp_td_connection_delay(delay_model, net, ipin); + /* If the delay hasn't changed, do not mark this pin as affected */ + if (temp_delay == connection_delay[net][ipin]) { + return; + } - proposed_connection_timing_cost[net][net_pin] = criticalities.criticality(net, net_pin) * temp_delay; - delta_timing_cost += proposed_connection_timing_cost[net][net_pin] - connection_timing_cost[net][net_pin]; + /* Calculate proposed delay and cost values */ + proposed_connection_delay[net][ipin] = temp_delay; + proposed_connection_timing_cost[net][ipin] = criticalities.criticality(net, ipin) * temp_delay; + delta_timing_cost += proposed_connection_timing_cost[net][ipin] - connection_timing_cost[net][ipin]; + /* Record this connection in blocks_affected.affected_pins */ blocks_affected.affected_pins.push_back(pin); } } } +/** + * @brief Check if the setup slack has gotten better or worse due to a block swap. + * + * Get all the modified slack values via the PlacerSetupSlacks class, and compare + * them with the original values at these connections. Sort them and compare them + * one by one, and return the difference of the first different pair.
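+ * + * A small hypothetical example: if the modified connections had original slacks {-1.2, 0.3} and proposed slacks {-0.9, 0.1}, both lists are sorted and the first differing pair is -1.2 vs -0.9, so the routine returns -1.2 - (-0.9) = -0.3. A negative return value means the worst affected slack has improved, so the move will be accepted.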
+ * + * If the new slack value is larger (better), return a negative value so that + * the move will be accepted. If the new slack value is smaller (worse), return a + * positive value so that the move will be rejected. + * + * If no slack values have changed, return an arbitrary positive number. A + * move that does not change any slack values brings no benefit, so it is rejected. + * + * The sorting is needed to handle the unlikely circumstance in which a bad slack + * value suddenly gets much better due to the block move, while a good slack value + * gets much worse, perhaps even worse than the original worst slack value. + */ +static float analyze_setup_slack_cost(const PlacerSetupSlacks* setup_slacks) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& clb_nlist = cluster_ctx.clb_nlist; + + //Find the original/proposed setup slacks of pins with modified values + std::vector original_setup_slacks, proposed_setup_slacks; + + auto clb_pins_modified = setup_slacks->pins_with_modified_setup_slack(); + for (ClusterPinId clb_pin : clb_pins_modified) { + ClusterNetId net_id = clb_nlist.pin_net(clb_pin); + size_t ipin = clb_nlist.pin_net_index(clb_pin); + + original_setup_slacks.push_back(connection_setup_slack[net_id][ipin]); + proposed_setup_slacks.push_back(setup_slacks->setup_slack(net_id, ipin)); + } + + //Sort in ascending order, from the worst slack value to the best + std::sort(original_setup_slacks.begin(), original_setup_slacks.end()); + std::sort(proposed_setup_slacks.begin(), proposed_setup_slacks.end()); + + //Check the first pair of slack values that are different + //If found, return their difference + for (size_t idiff = 0; idiff < original_setup_slacks.size(); ++idiff) { + float slack_diff = original_setup_slacks[idiff] - proposed_setup_slacks[idiff]; + + if (slack_diff != 0) { + return slack_diff; + } + } + + //If all slack values are identical (or no modified slack values), + //reject this move by returning an arbitrary positive number as cost. + return 1; +} + static e_move_result assess_swap(double delta_c, double t) { /* Returns: 1 -> move accepted, 0 -> rejected. */ if (delta_c <= 0) { @@ -1787,7 +2125,7 @@ static float comp_td_connection_delay(const PlaceDelayModel* delay_model, Cluste //Recompute all point to point delays, updating connection_delay static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { - auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& cluster_ctx = g_vpr_ctx.clustering(); for (auto net_id : cluster_ctx.clb_nlist.nets()) { for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) { @@ -1796,45 +2134,83 @@ static void comp_td_connection_delays(const PlaceDelayModel* delay_model) { } } -/* Update the connection_timing_cost values from the temporary * - * values for all connections that have changed. */ -static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { - auto& cluster_ctx = g_vpr_ctx.clustering(); +/** + * @brief Commit all the setup slack values from the PlacerSetupSlacks + * class to `connection_setup_slack`. + * + * This routine is incremental since it relies on the pins_with_modified_setup_slack() + * to detect which pins need to be updated and which pins do not. + * + * Therefore, it is assumed that this routine is always called immediately after + * each time update_timing_classes() is called with setup slack update enabled.
+ * Otherwise, pins_with_modified_setup_slack() cannot accurately account for all + * the pins that have their setup slacks changed, making this routine incorrect. + * + * Currently, the only exception to the rule above is when setup slack analysis is used + * during the placement quench. The new setup slacks might be either accepted or + * rejected, so for efficiency reasons, this routine is not called if the slacks are + * rejected in the end. For more detailed info, see the try_swap() routine. + */ +static void commit_setup_slacks(const PlacerSetupSlacks* setup_slacks) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - /* Go through all the blocks moved. */ - for (int iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) { - ClusterBlockId bnum = blocks_affected.moved_blocks[iblk].block_num; - for (ClusterPinId pin_id : cluster_ctx.clb_nlist.block_pins(bnum)) { - ClusterNetId net_id = cluster_ctx.clb_nlist.pin_net(pin_id); + //Incremental: only go through sink pins with modified setup slack + auto clb_pins_modified = setup_slacks->pins_with_modified_setup_slack(); + for (ClusterPinId pin_id : clb_pins_modified) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + size_t pin_index_in_net = clb_nlist.pin_net_index(pin_id); - if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) - continue; + connection_setup_slack[net_id][pin_index_in_net] = setup_slacks->setup_slack(net_id, pin_index_in_net); + } +} - if (cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::DRIVER) { - //This net is being driven by a moved block, recompute - //all point to point connections on this net. - for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) { - connection_delay[net_id][ipin] = proposed_connection_delay[net_id][ipin]; - proposed_connection_delay[net_id][ipin] = INVALID_DELAY; - connection_timing_cost[net_id][ipin] = proposed_connection_timing_cost[net_id][ipin]; - proposed_connection_timing_cost[net_id][ipin] = INVALID_DELAY; - } - } else { - //This pin is a net sink on a moved block - VTR_ASSERT_SAFE(cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::SINK); +/** + * @brief Verify that the values in `connection_setup_slack` matches PlacerSetupSlacks. + * + * Return true if all connection values are identical. Otherwise, return false. + * + * Currently, this routine is called to check if the timing update has been successfully + * reverted after a proposed move is rejected when applying setup slack analysis during + * the placement quench. If successful, the setup slacks in PlacerSetupSlacks should be + * the same as the values in `connection_setup_slack` without running commit_setup_slacks(). + * For more detailed info, see the try_swap() routine. + */ +static bool verify_connection_setup_slacks(const PlacerSetupSlacks* setup_slacks) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; + + //Go through every single sink pin to check that the slack values are the same + for (ClusterNetId net_id : clb_nlist.nets()) { + for (size_t ipin = 1; ipin < clb_nlist.net_pins(net_id).size(); ++ipin) { + if (connection_setup_slack[net_id][ipin] != setup_slacks->setup_slack(net_id, ipin)) { + return false; + } + } + } + return true; +} - /* The following "if" prevents the value from being updated twice. */ - if (!driven_by_moved_block(net_id, blocks_affected)) { - int net_pin = cluster_ctx.clb_nlist.pin_net_index(pin_id); +/** + * @brief Update the connection_timing_cost values from the temporary + * values for all connections that have/haven't changed. 
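The relationship between commit_setup_slacks() and verify_connection_setup_slacks() can be captured in a small toy model; this is not VPR code, only an illustration of the contract: on an accepted move the analyzer's new slacks are committed, while on a rejected move the commit is skipped and the reverted timing update must leave the analyzer matching the committed values again.

// Toy model of the commit/verify contract (stand-in types, not VPR code).
#include <cassert>
#include <map>
#include <utility>

int main() {
    using Conn = std::pair<int, int>; // (net, ipin) stand-in
    std::map<Conn, float> committed{{{0, 1}, -0.5f}}; // plays the role of connection_setup_slack
    std::map<Conn, float> analyzer = committed;       // plays the role of PlacerSetupSlacks

    analyzer[{0, 1}] = 0.2f; // a proposed move changes a slack in the analyzer only

    bool accepted = true; // outcome of assess_swap() in the real flow
    if (accepted) {
        committed[{0, 1}] = analyzer[{0, 1}]; // commit_setup_slacks() analogue
    } else {
        // Rejected: no commit; instead the move and timing update are undone,
        // which must bring the analyzer back in line with the committed values.
        analyzer[{0, 1}] = -0.5f; // stand-in for the reverted timing update
    }
    assert(committed == analyzer); // verify_connection_setup_slacks() analogue
    return 0;
}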
+ * + * All the connections have already been gathered by blocks_affected.affected_pins + * after running the routine find_affected_nets_and_update_costs() in try_swap(). + */ +static void commit_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& clb_nlist = cluster_ctx.clb_nlist; - connection_delay[net_id][net_pin] = proposed_connection_delay[net_id][net_pin]; - proposed_connection_delay[net_id][net_pin] = INVALID_DELAY; - connection_timing_cost[net_id][net_pin] = proposed_connection_timing_cost[net_id][net_pin]; - proposed_connection_timing_cost[net_id][net_pin] = INVALID_DELAY; - } - } - } /* Finished going through all the pins in the moved block */ - } /* Finished going through all the blocks moved */ + //Go through all the sink pins affected + for (ClusterPinId pin_id : blocks_affected.affected_pins) { + ClusterNetId net_id = clb_nlist.pin_net(pin_id); + int ipin = clb_nlist.pin_net_index(pin_id); + + //Commit the timing delay and cost values + connection_delay[net_id][ipin] = proposed_connection_delay[net_id][ipin]; + proposed_connection_delay[net_id][ipin] = INVALID_DELAY; + connection_timing_cost[net_id][ipin] = proposed_connection_timing_cost[net_id][ipin]; + proposed_connection_timing_cost[net_id][ipin] = INVALID_DELAY; + } } //Reverts modifications to proposed_connection_delay and proposed_connection_timing_cost based on @@ -1857,35 +2233,24 @@ static void revert_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) { #endif } -//Invalidates the delays of connections effected by the specified move -// -//Relies on proposed_connection_delay and connection_delay to detect -//which connections have actually had their delay changed. -static void invalidate_affected_connection_delays(const t_pl_blocks_to_be_moved& blocks_affected, - ClusteredPinTimingInvalidator* pin_tedges_invalidator, - TimingInfo* timing_info) { +/** + * @brief Invalidates the connections affected by the specified block moves. + * + * All the connections recorded in blocks_affected.affected_pins have different + * values for `proposed_connection_delay` and `connection_delay`. + * + * Invalidate all the timing graph edges associated with these connections via + * the ClusteredPinTimingInvalidator class. + */ +static void invalidate_affected_connections(const t_pl_blocks_to_be_moved& blocks_affected, + ClusteredPinTimingInvalidator* pin_tedges_invalidator, + TimingInfo* timing_info) { VTR_ASSERT_SAFE(timing_info); VTR_ASSERT_SAFE(pin_tedges_invalidator); - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& clb_nlist = cluster_ctx.clb_nlist; - - //Inalidate timing graph edges affected by the move + /* Invalidate timing graph edges affected by the move */ for (ClusterPinId pin : blocks_affected.affected_pins) { - //It is possible that some connections may not have changed delay.(e.g. - //For instance, if using a dx/dy delay model, this could occur if a sink - //moved to a new position with the same dx/dy from it's driver. - // - //To minimze work during the incremental STA update we do not invalidate - //such unchanged connections. 
- - ClusterNetId net = clb_nlist.pin_net(pin); - int ipin = clb_nlist.pin_net_index(pin); - - if (proposed_connection_delay[net][ipin] != connection_delay[net][ipin]) { - //Delay changed, must invalidate - pin_tedges_invalidator->invalidate_connection(pin, timing_info); - } + pin_tedges_invalidator->invalidate_connection(pin, timing_info); } } @@ -1942,7 +2307,7 @@ static void update_td_costs(const PlaceDelayModel* delay_model, const PlacerCrit if (cluster_ctx.clb_nlist.net_is_ignored(clb_net)) continue; int ipin = clb_nlist.pin_net_index(clb_pin); - VTR_ASSERT_SAFE(ipin >= 0 && ipin < int(clb_nlist.net_pins(clb_net).size())); + VTR_ASSERT_SAFE(ipin >= 1 && ipin < int(clb_nlist.net_pins(clb_net).size())); double new_timing_cost = comp_td_connection_cost(delay_model, place_crit, clb_net, ipin); @@ -2111,12 +2476,14 @@ static void alloc_and_load_placement_structs(float place_cost_exp, max_pins_per_clb = max(max_pins_per_clb, type.num_pins); } - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm.is_timing_driven()) { /* Allocate structures associated with timing driven placement */ /* [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ connection_delay = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.f); proposed_connection_delay = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.f); + connection_setup_slack = make_net_pins_matrix(cluster_ctx.clb_nlist, std::numeric_limits::infinity()); + connection_timing_cost = PlacerTimingCosts(cluster_ctx.clb_nlist); proposed_connection_timing_cost = make_net_pins_matrix(cluster_ctx.clb_nlist, 0.); net_timing_cost.resize(num_nets, 0.); @@ -2155,9 +2522,10 @@ static void alloc_and_load_placement_structs(float place_cost_exp, /* Frees the major structures needed by the placer (and not needed * * elsewhere). */ static void free_placement_structs(const t_placer_opts& placer_opts) { - if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (placer_opts.place_algorithm.is_timing_driven()) { vtr::release_memory(connection_timing_cost); vtr::release_memory(connection_delay); + vtr::release_memory(connection_setup_slack); vtr::release_memory(proposed_connection_timing_cost); vtr::release_memory(proposed_connection_delay); @@ -2714,7 +3082,7 @@ static void alloc_and_load_for_fast_cost_update(float place_cost_exp) { static void check_place(const t_placer_costs& costs, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, - enum e_place_algorithm place_algorithm) { + const t_place_algorithm& place_algorithm) { /* Checks that the placement has not confused our data structures. * * i.e. the clb and block structures agree about the locations of * * every block, blocks are in legal spots, etc. 
Also recomputes * @@ -2741,7 +3109,7 @@ static void check_place(const t_placer_costs& costs, static int check_placement_costs(const t_placer_costs& costs, const PlaceDelayModel* delay_model, const PlacerCriticalities* criticalities, - enum e_place_algorithm place_algorithm) { + const t_place_algorithm& place_algorithm) { int error = 0; double bb_cost_check; double timing_cost_check; @@ -2753,7 +3121,7 @@ static int check_placement_costs(const t_placer_costs& costs, error++; } - if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) { + if (place_algorithm.is_timing_driven()) { comp_td_costs(delay_model, *criticalities, &timing_cost_check); //VTR_LOG("timing_cost recomputed from scratch: %g\n", timing_cost_check); if (fabs(timing_cost_check - costs.timing_cost) > costs.timing_cost * ERROR_TOL) { @@ -3023,7 +3391,7 @@ static void init_annealing_state(t_annealing_state* state, } bool placer_needs_lookahead(const t_vpr_setup& vpr_setup) { - return (vpr_setup.PlacerOpts.place_algorithm == PATH_TIMING_DRIVEN_PLACE); + return (vpr_setup.PlacerOpts.place_algorithm.is_timing_driven()); } //transforms the vector moved_blocks to a vector of ints and adds it in glob_breakpoint_state diff --git a/vpr/src/place/timing_place.cpp b/vpr/src/place/timing_place.cpp index e62eab6c894..d4dfbcc6f52 100644 --- a/vpr/src/place/timing_place.cpp +++ b/vpr/src/place/timing_place.cpp @@ -1,3 +1,8 @@ +/** + * @file timing_place.cpp + * @brief Stores the method definitions of classes defined in timing_place.h. + */ + #include #include @@ -14,71 +19,46 @@ #include "timing_info.h" -//Use an incremental approach to updaing criticalities? -constexpr bool INCR_UPDATE_CRITICALITIES = true; - -/**************************************/ - -/* Allocates space for the timing_place_crit_ data structure * - * I chunk the data to save space on large problems. */ +///@brief Allocates space for the timing_place_crit_ data structure. PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup) : clb_nlist_(clb_nlist) , pin_lookup_(netlist_pin_lookup) , timing_place_crit_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { } -/**************************************/ +/** + * @brief Updated the criticalities in the timing_place_crit_ data structure. + * + * If the criticalities are not updated immediately after each time we call + * timing_info->update(), then timing_info->pins_with_modified_setup_criticality() + * cannot accurately account for all the pins that need to be updated. In this case, + * `recompute_required` would be true, and we update all criticalities from scratch. + * + * If the criticality exponent has changed, we also need to update from scratch. + */ void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_info, float crit_exponent) { - /* Performs a 1-to-1 mapping from criticality to timing_place_crit_. - * For every pin on every net (or, equivalently, for every tedge ending - * in that pin), timing_place_crit_ = criticality^(criticality exponent) */ + /* If update is not enabled, exit the routine. 
*/ + if (!update_enabled) { + /* re-computation is required on the next iteration */ + recompute_required = true; + return; + } - //Determine what pins need updating - if (INCR_UPDATE_CRITICALITIES) { - cluster_pins_with_modified_criticality_.clear(); - if (crit_exponent != last_crit_exponent_) { - //Criticality exponent changed, must re-calculate criticalities for *all* sink pins - for (ClusterNetId net_id : clb_nlist_.nets()) { - for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { - cluster_pins_with_modified_criticality_.insert(pin_id); - } - } - - //Record new criticality exponent - last_crit_exponent_ = crit_exponent; - } else { - //Criticality exponent unchanged - // - //Collect the cluster pins which need to be updated based on the latest timing - //analysis - // - //Note we use the set of pins reported by the *timing_info* as having modified - //criticality, rather than those marked as modified by the timing analyzer. - //Since timing_info uses shifted/relaxed criticality (which depends on max - //required time and worst case slacks), additional nodes may be modified - //when updating the atom pin criticalities. - - for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_criticality()) { - ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); - - //Some atom pins correspond to connections which are completely - //contained within a cluster, and hence have no corresponding - //clustered pin. - if (!clb_pin) continue; - - cluster_pins_with_modified_criticality_.insert(clb_pin); - } - } + /* Determine what pins need updating */ + if (!recompute_required && crit_exponent == last_crit_exponent_) { + incr_update_criticalities(timing_info); } else { - //Non-incremental: all pins and nets need updating - for (ClusterNetId net_id : clb_nlist_.nets()) { - for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { - cluster_pins_with_modified_criticality_.insert(pin_id); - } - } + recompute_criticalities(); + + /* Record new criticality exponent */ + last_crit_exponent_ = crit_exponent; } - //Update the effected pins + /* Performs a 1-to-1 mapping from criticality to timing_place_crit_. + * For every pin on every net (or, equivalently, for every tedge ending + * in that pin), timing_place_crit_ = criticality^(criticality exponent) */ + + /* Update the affected pins */ for (ClusterPinId clb_pin : cluster_pins_with_modified_criticality_) { ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); @@ -90,16 +70,176 @@ void PlacerCriticalities::update_criticalities(const SetupTimingInfo* timing_inf * criticality by taking it to some power, crit_exponent (between 1 and 8 by default). */ timing_place_crit_[clb_net][pin_index_in_net] = pow(clb_pin_crit, crit_exponent); } + + /* Criticalities updated. In sync with timing info. */ + /* Can be incrementally updated on the next iteration */ + recompute_required = false; +} + +/** + * @brief Collect the cluster pins which need to be updated based on the latest timing + * analysis so that incremental updates to criticalities can be performed. + * + * Note we use the set of pins reported by the *timing_info* as having modified + * criticality, rather than those marked as modified by the timing analyzer. + * + * Since timing_info uses shifted/relaxed criticality (which depends on max required + * time and worst case slacks), additional nodes may be modified when updating the + * atom pin criticalities. 
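The effect of the criticality exponent, and why a change in the exponent forces a from-scratch update, can be seen numerically in a short sketch (the raw criticalities below are made up; the 1-to-8 exponent range follows the comment in update_criticalities() above):

// Numeric sketch of criticality sharpening: crit^exponent (made-up values).
#include <cmath>
#include <cstdio>

int main() {
    const float raw_crits[] = {0.2f, 0.6f, 0.9f, 1.0f};
    const float exponents[] = {1.0f, 8.0f};
    for (float exponent : exponents) {
        std::printf("exponent = %.0f:", exponent);
        for (float crit : raw_crits) {
            // A larger exponent suppresses moderately critical connections while
            // near-critical ones stay close to 1. Since every stored value depends
            // on the exponent, changing it invalidates *all* criticalities at once.
            std::printf("  %.3f", std::pow(crit, exponent));
        }
        std::printf("\n");
    }
    return 0;
}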
+ */ + +void PlacerCriticalities::incr_update_criticalities(const SetupTimingInfo* timing_info) { + cluster_pins_with_modified_criticality_.clear(); + + for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_criticality()) { + ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); + + //Some atom pins correspond to connections which are completely + //contained within a cluster, and hence have no corresponding + //clustered pin. + if (!clb_pin) continue; + + cluster_pins_with_modified_criticality_.insert(clb_pin); + } +} + +/** + * @brief Collect all the sink pins in the netlist and prepare them for update. + * + * For the incremental version, see PlacerCriticalities::incr_update_criticalities(). + */ +void PlacerCriticalities::recompute_criticalities() { + cluster_pins_with_modified_criticality_.clear(); + + /* Non-incremental: all sink pins need updating */ + for (ClusterNetId net_id : clb_nlist_.nets()) { + for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { + cluster_pins_with_modified_criticality_.insert(pin_id); + } + } } -void PlacerCriticalities::set_criticality(ClusterNetId net_id, int ipin, float val) { - timing_place_crit_[net_id][ipin] = val; +///@brief Override the criticality of a particular connection. +void PlacerCriticalities::set_criticality(ClusterNetId net_id, int ipin, float crit_val) { + VTR_ASSERT_SAFE_MSG(ipin > 0, "The pin should not be a driver pin (ipin != 0)"); + VTR_ASSERT_SAFE_MSG(ipin < int(clb_nlist_.net_pins(net_id).size()), "The pin index in net should be smaller than fanout"); + + timing_place_crit_[net_id][ipin] = crit_val; } +/** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which + * were modified by the last call to PlacerCriticalities::update_criticalities(). + */ PlacerCriticalities::pin_range PlacerCriticalities::pins_with_modified_criticality() const { return vtr::make_range(cluster_pins_with_modified_criticality_); } +/**************************************/ + +///@brief Allocates space for the timing_place_setup_slacks_ data structure. +PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup) + : clb_nlist_(clb_nlist) + , pin_lookup_(netlist_pin_lookup) + , timing_place_setup_slacks_(make_net_pins_matrix<float>(clb_nlist_, std::numeric_limits<float>::quiet_NaN())) { +} + +/** + * @brief Updates the setup slacks in the timing_place_setup_slacks_ data structure. + * + * If the setup slacks are not updated immediately after each time we call + * timing_info->update(), then timing_info->pins_with_modified_setup_slack() + * cannot accurately account for all the pins that need to be updated. + * + * In this case, `recompute_required` would be true, and we update all setup slacks + * from scratch. + */ +void PlacerSetupSlacks::update_setup_slacks(const SetupTimingInfo* timing_info) { + /* If update is not enabled, exit the routine.
*/ + if (!update_enabled) { + /* re-computation is required on the next iteration */ + recompute_required = true; + return; + } + + /* Determine what pins need updating */ + if (!recompute_required) { + incr_update_setup_slacks(timing_info); + } else { + recompute_setup_slacks(); + } + + /* Update the affected pins */ + for (ClusterPinId clb_pin : cluster_pins_with_modified_setup_slack_) { + ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); + int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); + + float clb_pin_setup_slack = calculate_clb_net_pin_setup_slack(*timing_info, pin_lookup_, clb_pin); + + timing_place_setup_slacks_[clb_net][pin_index_in_net] = clb_pin_setup_slack; + } + + /* Setup slacks updated. In sync with timing info. */ + /* Can be incrementally updated on the next iteration. */ + recompute_required = false; +} + +/** + * @brief Collect the cluster pins which need to be updated based on the latest timing + * analysis so that incremental updates to setup slacks can be performed. + * + * Note we use the set of pins reported by the *timing_info* as having modified + * setup slacks, rather than those marked as modified by the timing analyzer. + */ +void PlacerSetupSlacks::incr_update_setup_slacks(const SetupTimingInfo* timing_info) { + cluster_pins_with_modified_setup_slack_.clear(); + + for (AtomPinId atom_pin : timing_info->pins_with_modified_setup_slack()) { + ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); + + //Some atom pins correspond to connections which are completely + //contained within a cluster, and hence have no corresponding + //clustered pin. + if (!clb_pin) continue; + + cluster_pins_with_modified_setup_slack_.insert(clb_pin); + } +} + +/** + * @brief Collect all the sink pins in the netlist and prepare them update. + * + * For the incremental version, see PlacerSetupSlacks::incr_update_setup_slacks(). + */ +void PlacerSetupSlacks::recompute_setup_slacks() { + cluster_pins_with_modified_setup_slack_.clear(); + + /* Non-incremental: all sink pins need updating */ + for (ClusterNetId net_id : clb_nlist_.nets()) { + for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { + cluster_pins_with_modified_setup_slack_.insert(pin_id); + } + } +} + +///@brief Override the setup slack of a particular connection. +void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float slack_val) { + VTR_ASSERT_SAFE_MSG(ipin > 0, "The pin should not be a driver pin (ipin != 0)"); + VTR_ASSERT_SAFE_MSG(ipin < int(clb_nlist_.net_pins(net_id).size()), "The pin index in net should be smaller than fanout"); + + timing_place_setup_slacks_[net_id][ipin] = slack_val; +} + +/** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) + * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). + */ +PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const { + return vtr::make_range(cluster_pins_with_modified_setup_slack_); +} + +/**************************************/ + std::unique_ptr alloc_lookups_and_criticalities(t_chan_width_dist chan_width_dist, const t_placer_opts& placer_opts, const t_router_opts& router_opts, diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing_place.h index c3d8a41c3a1..74996de4a5a 100644 --- a/vpr/src/place/timing_place.h +++ b/vpr/src/place/timing_place.h @@ -1,3 +1,39 @@ +/** + * @file timing_place.h + * @brief Interface used by the VPR placer to query information + * from the Tatum timing analyzer. 
+ * + * @class PlacerSetupSlacks + * Queries connection **RAW** setup slacks, which can + * range from negative to positive values. Also maps + * atom pin setup slacks to clb pin setup slacks. + * @class PlacerCriticalities + * Query connection criticalities, which are calculuated + * based on the raw setup slacks and ranges from 0 to 1. + * Also maps atom pin crit. to clb pin crit. + * @class PlacerTimingCosts + * Hierarchical structure used by update_td_costs() to + * maintain the order of addition operation of float values + * (to avoid round-offs) while doing incremental updates. + * + * Calculating criticalities: + * All the raw setup slack values across a single clock domain are gathered + * and rated from the best to the worst in terms of criticalities. In order + * to calculate criticalities, all the slack values need to be non-negative. + * Hence, if the worst slack is negative, all the slack values are shifted + * by the value of the worst slack so that the value is at least 0. If the + * worst slack is positive, then no shift happens. + * + * The best (shifted) slack (the most positive one) will have a criticality of 0. + * The worst (shifted) slack value will have a criticality of 1. + * + * Criticalities are used to calculated timing costs for each connection. + * The formula is cost = delay * criticality. + * + * For a more detailed description on how criticalities are calculated, see + * calc_relaxed_criticality() in `timing_util.cpp`. + */ + #ifndef TIMING_PLACE #define TIMING_PLACE @@ -14,32 +50,46 @@ std::unique_ptr alloc_lookups_and_criticalities(t_chan_width_di std::vector& segment_inf, const t_direct_inf* directs, const int num_directs); -/* Usage + +/** + * @brief PlacerCriticalities returns the clustered netlist connection criticalities + * used by the placer ('sharpened' by a criticality exponent). + * + * Usage * ===== - * PlacerCriticalities returns the clustered netlist connection criticalities used by - * the placer ('sharpened' by a criticality exponent). This also serves to map atom - * netlist level criticalites (i.e. on AtomPinIds) to the clustered netlist (i.e. - * ClusterPinIds) used during placement. + * This class also serves to map atom netlist level criticalites (i.e. on AtomPinIds) + * to the clustered netlist (i.e. ClusterPinIds) used during placement. * - * Criticalities are calculated by calling update_criticalities(), which will - * update criticalities based on the atom netlist connection criticalities provided by - * the passed in SetupTimingInfo. This is done incrementally, based on the modified - * connections/AtomPinIds returned by SetupTimingInfo. + * Criticalities are updated by update_criticalities(), given that `update_enabled` is + * set to true. It will update criticalities based on the atom netlist connection + * criticalities provided by the passed in SetupTimingInfo. * - * The criticalities of individual connections can then be queried by calling the - * criticality() member function. + * This process can be done incrementally, based on the modified connections/AtomPinIds + * returned by SetupTimingInfo. However, the set returned only reflects the connections + * changed by the last call to the timing info update. * - * It also supports iterating via pins_with_modified_criticalities() through the - * clustered netlist pins/connections which have had their criticality modified by - * the last call to update_criticalities(), which is useful for incrementally - * re-calculating timing costs. 
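For intuition only, here is a simplified sketch of the slack-shifting scheme described in the header comment above; it is not the exact VPR formula (the authoritative computation, including multiple clock domains and relaxed required times, is calc_relaxed_criticality() in timing_util.cpp):

// Intuition-only sketch: shift raw slacks so the worst is at least 0, then map
// the worst shifted slack to criticality 1 and the best to criticality 0.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> slacks = {-2.0f, -0.5f, 1.0f, 3.0f}; // made-up raw setup slacks (ns)

    float worst = *std::min_element(slacks.begin(), slacks.end());
    float shift = worst < 0 ? -worst : 0.0f; // only shift when the worst slack is negative

    std::vector<float> shifted;
    for (float s : slacks) shifted.push_back(s + shift);
    float best = *std::max_element(shifted.begin(), shifted.end());

    for (size_t i = 0; i < slacks.size(); ++i) {
        float crit = 1.0f - shifted[i] / best; // worst -> 1.0, best -> 0.0
        std::printf("raw slack %5.1f ns  ->  criticality %.2f\n", slacks[i], crit);
    }
    return 0;
}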
+ * Therefore, if SetupTimingInfo is updated twice in succession without criticalities + * getting updated (update_enabled = false), the returned set cannot account for all + * the connections that have been modified. In this case, we flag `recompute_required` + * as true, and we recompute the criticalities for every connection to ensure that + * they are all up to date. Hence, each time update_setup_slacks_and_criticalities() + * is called, we assign `recompute_required` the opposite value of `update_enabled`. + * + * This class also maps/transforms the modified atom connections/pins returned by the + * timing info into modified clustered netlist connections/pins after calling + * update_criticalities(). The interface then enables users to iterate over this range + * via pins_with_modified_criticalities(). This is useful for incrementally re-calculating + * the timing costs. + * + * The criticalities of individual connections can then be queried by calling the + * criticality() member function. * * Implementation * ============== - * To support incremental re-calculation the class saves the last criticality exponent - * passed to update_criticalites(). If the next update uses the same exponent criticalities - * can be incrementally updated. Otherwise they must be re-calculated from scratch, since - * a change in exponent changes *all* criticalities. + * To support incremental re-calculation, the class saves the last criticality exponent + * passed to PlacerCriticalities::update_criticalities(). If the next update uses the same + * exponent, criticalities can be incrementally updated. Otherwise, they must be re-calculated + * from scratch, since a change in exponent changes *all* criticalities. */ class PlacerCriticalities { public: //Types @@ -55,40 +105,175 @@ class PlacerCriticalities { PlacerCriticalities& operator=(const PlacerCriticalities& clb_nlist) = delete; public: //Accessors - //Returns the criticality of the specified connection + ///@brief Returns the criticality of the specified connection. float criticality(ClusterNetId net, int ipin) const { return timing_place_crit_[net][ipin]; } - //Returns the range of clustered netlist pins (i.e. ClusterPinIds) which were modified - //by the last call to update_criticalities() + /** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which + * were modified by the last call to PlacerCriticalities::update_criticalities(). + */ pin_range pins_with_modified_criticality() const; public: //Modifiers - //Incrementally updates criticalities based on the atom netlist criticalitites provied by - //timing_info and the provided criticality_exponent. + /** + * @brief Updates criticalities based on the atom netlist criticalities + * provided by timing_info and the provided criticality_exponent. + * + * Should consistently call this method after the most recent timing analysis to + * keep the criticalities stored in this class in sync with the timing analyzer. + * If out of sync, then the criticalities cannot be incrementally updated + * during the next timing analysis iteration. + */ void update_criticalities(const SetupTimingInfo* timing_info, float criticality_exponent); - //Override the criticality of a particular connection - void set_criticality(ClusterNetId net, int ipin, float val); + ///@brief Override the criticality of a particular connection. + void set_criticality(ClusterNetId net, int ipin, float crit_val); + + ///@brief Set `update_enabled` to true.
+ void enable_update() { update_enabled = true; } + + ///@brief Set `update_enabled` to false. + void disable_update() { update_enabled = false; } private: //Data + ///@brief The clb netlist in the placement context. const ClusteredNetlist& clb_nlist_; - const ClusteredPinAtomPinsLookup& pin_lookup_; - ClbNetPinsMatrix<float> timing_place_crit_; /* [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */ + ///@brief The lookup table that maps atom pins to clb pins. + const ClusteredPinAtomPinsLookup& pin_lookup_; - //The criticality exponent when update_criticalites() was last called (used to detect if incremental update can be used) + /** + * @brief The matrix that stores the criticality value for each connection. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ + ClbNetPinsMatrix<float> timing_place_crit_; + + /** + * The criticality exponent when update_criticalities() was last called + * (used to detect if incremental update can be used). + */ float last_crit_exponent_ = std::numeric_limits<float>::quiet_NaN(); - //Set of pins with criticaltites modified by last call to update_criticalities() + ///@brief Set of pins with criticalities modified by last call to update_criticalities(). vtr::vec_id_set<ClusterPinId> cluster_pins_with_modified_criticality_; + + ///@brief Incremental update. See timing_place.cpp for more. + void incr_update_criticalities(const SetupTimingInfo* timing_info); + + ///@brief From scratch update. See timing_place.cpp for more. + void recompute_criticalities(); + + ///@brief Flag that turns on/off the update_criticalities() routine. + bool update_enabled = true; + + /** + * @brief Flag that checks if criticalities need to be recomputed for all connections. + * + * Used by the method update_criticalities(). An incremental update is not possible + * if this method wasn't called after the previous timing info update. + */ + bool recompute_required = true; }; -/* Usage +/** + * @brief PlacerSetupSlacks returns the RAW setup slacks of clustered netlist connections. + * + * Usage * ===== - * PlacerTimingCosts mimics a 2D array of connection timing costs running from: - * [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + * This class mirrors PlacerCriticalities in both its methods and its members. The only + * difference is that this class deals with RAW setup slacks returned by SetupTimingInfo + * rather than criticalities. See the documentation on PlacerCriticalities for more. * - * So it can be used similar to: + * RAW setup slacks are unlike criticalities: their values are not confined between + * 0 and 1, and they can be either positive or negative. + * + * This class also provides iteration over the clustered netlist connections/pins whose + * setup slacks were modified by the last call to update_setup_slacks(). However, this + * utility is mainly used for incrementally committing the setup slack values into the + * structure `connection_setup_slack` used by many placer routines.
+ */ +class PlacerSetupSlacks { + public: //Types + typedef vtr::vec_id_set<ClusterPinId>::iterator pin_iterator; + typedef vtr::vec_id_set<ClusterNetId>::iterator net_iterator; + + typedef vtr::Range<pin_iterator> pin_range; + typedef vtr::Range<net_iterator> net_range; + + public: //Lifetime + PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, const ClusteredPinAtomPinsLookup& netlist_pin_lookup); + PlacerSetupSlacks(const PlacerSetupSlacks& clb_nlist) = delete; + PlacerSetupSlacks& operator=(const PlacerSetupSlacks& clb_nlist) = delete; + + public: //Accessors + ///@brief Returns the setup slack of the specified connection. + float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slacks_[net][ipin]; } + + /** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) + * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). + */ + pin_range pins_with_modified_setup_slack() const; + + public: //Modifiers + /** + * @brief Updates setup slacks based on the atom netlist setup slacks provided + * by timing_info. + * + * Should consistently call this method after the most recent timing analysis to + * keep the setup slacks stored in this class in sync with the timing analyzer. + * If out of sync, then the setup slacks cannot be incrementally updated + * during the next timing analysis iteration. + */ + void update_setup_slacks(const SetupTimingInfo* timing_info); + + ///@brief Override the setup slack of a particular connection. + void set_setup_slack(ClusterNetId net, int ipin, float slack_val); + + ///@brief Set `update_enabled` to true. + void enable_update() { update_enabled = true; } + + ///@brief Set `update_enabled` to false. + void disable_update() { update_enabled = false; } + + private: //Data + const ClusteredNetlist& clb_nlist_; + const ClusteredPinAtomPinsLookup& pin_lookup_; + + /** + * @brief The matrix that stores raw setup slack values for each connection. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ + ClbNetPinsMatrix<float> timing_place_setup_slacks_; + + ///@brief Set of pins with raw setup slacks modified by last call to update_setup_slacks() + vtr::vec_id_set<ClusterPinId> cluster_pins_with_modified_setup_slack_; + + ///@brief Incremental update. See timing_place.cpp for more. + void incr_update_setup_slacks(const SetupTimingInfo* timing_info); + + ///@brief From scratch update. See timing_place.cpp for more. + void recompute_setup_slacks(); + + ///@brief Flag that turns on/off the update_setup_slacks() routine. + bool update_enabled = true; + + /** + * @brief Flag that checks if setup slacks need to be recomputed for all connections. + * + * Used by the method update_setup_slacks(). An incremental update is not possible + * if this method wasn't called after the previous timing info update. + */ + bool recompute_required = true; +}; + +/** + * @brief PlacerTimingCosts mimics a 2D array of connection timing costs running from: + * [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]. + * + * It can be used similarly to: * * PlacerTimingCosts connection_timing_costs(cluster_ctx.clb_nlist); //Construct * * @@ -99,53 +284,53 @@ class PlacerCriticalities { * * //Potentially other modifications...
* - * //Calculate the updated timing cost, of all connections, incrementally based - * //on modifications + * //Calculate the updated timing cost, of all connections, + * //incrementally based on modifications * float total_timing_cost = connection_timing_costs.total_cost(); - * + * * However behind the scenes PlacerTimingCosts tracks when connection costs are modified, * and efficiently re-calculates the total timing cost incrementally based on the connections * which have had their cost modified. * - * Implementaion - * ============= - * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part + * Implementation + * ============== + * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part * of connection_costs_. To mimic 2d-array like access PlacerTimingCosts also uses two proxy * classes which allow indexing in the net and pin dimensions (NetProxy and ConnectionProxy * respectively). * * The first part of connection_costs_ stores intermediate sums of the connection costs for - * efficient incremental re-calculation. More concretely, connection_costs_ stores a binary + * efficient incremental re-calculation. More concretely, connection_costs_ stores a binary * tree, where leaves correspond to individual connection costs and intermediate nodes the - * partial sums of the connection costs. (The binary tree is stored implicitly in the - * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary + * partial sums of the connection costs. (The binary tree is stored implicitly in the + * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary * tree we calculate the total timing cost over all connections. * * Using a binary tree allows us to efficiently re-calculate the timing costs when only a subset * of connections are changed. This is done by 'invalidating' intermediate nodes (from leaves up - * to the root) which have ancestors (leaves) with modified connection costs. When the + * to the root) which have ancestors (leaves) with modified connection costs. When the * total_cost() method is called, it recursively walks the binary tree to re-calculate the cost. - * Only invalidated nodes are traversed, with valid nodes just returning their previously + * Only invalidated nodes are traversed, with valid nodes just returning their previously * calculated (and unchanged) value. * - * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can + * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can * be done in O(k log K) time. * - * It is important to note that due to limited floating point precision, floating point + * It is important to note that due to limited floating point precision, floating point * arithmetic has an order dependence (due to round-off). Using a binary tree to total - * the timing connection costs allows us to incrementally update the total timign cost while - * maintianing the *same order of operations* as if it was re-computed from scratch. This + * the timing connection costs allows us to incrementally update the total timing cost while + * maintianing the *same order of operations* as if it was re-computed from scratch. This * ensures we *always* get consistent results regardless of what/when connections are changed. 
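A much-simplified analogue of this structure is sketched below; it is not the actual PlacerTimingCosts layout (which sizes its tree from the netlist and stores connections net by net), but it demonstrates the two mechanisms described above: marking ancestor sums NaN when a leaf cost changes, and re-summing only the invalidated subtrees in a fixed order.

// Simplified implicit-binary-tree sum with NaN invalidation (not VPR code).
// Assumes the number of leaves is a power of two; root at index 1, leaves at [n, 2n).
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

class IncrementalSum {
  public:
    explicit IncrementalSum(size_t num_leaves)
        : n_(num_leaves), tree_(2 * num_leaves, 0.0) {}

    void set(size_t leaf, double value) {
        size_t i = n_ + leaf;
        if (tree_[i] == value) return; // unchanged costs invalidate nothing
        tree_[i] = value;
        // Walk up, marking cached partial sums stale; stop at the first
        // already-invalidated ancestor (its ancestors are stale too).
        for (i /= 2; i >= 1 && !std::isnan(tree_[i]); i /= 2) {
            tree_[i] = std::numeric_limits<double>::quiet_NaN();
        }
    }

    double total() { return total_recurr(1); } // ~O(k log K) for k modified leaves

  private:
    double total_recurr(size_t i) {
        if (i >= n_) return tree_[i];               // leaf: actual connection cost
        if (!std::isnan(tree_[i])) return tree_[i]; // still-valid partial sum
        tree_[i] = total_recurr(2 * i) + total_recurr(2 * i + 1);
        return tree_[i];
    }

    size_t n_;
    std::vector<double> tree_;
};

int main() {
    IncrementalSum costs(4);
    for (size_t i = 0; i < 4; ++i) costs.set(i, 1.0);
    std::printf("total = %.1f\n", costs.total()); // 4.0
    costs.set(2, 5.0);                            // one leaf changes
    std::printf("total = %.1f\n", costs.total()); // 8.0; only two partial sums recomputed
    return 0;
}

Because the additions always happen in the same tree order, the incremental total matches a from-scratch recomputation exactly, which is the round-off consistency property the comment above emphasizes.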
* * Proxy Classes - * ------------- + * ============= * NetProxy is returned by PlacerTimingCost's operator[], and stores a pointer to the start of * internal storage of that net's connection costs. * - * ConnectionProxy is returnd by NetProxy's operator[], and holds a reference to a particular - * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy - * supports assignment, allowing clients to modify the connection cost. It also detects if the - * assigned value differs from the previous value and if so, calls PlacerTimingCosts's + * ConnectionProxy is returned by NetProxy's operator[], and holds a reference to a particular + * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy + * supports assignment, allowing clients to modify the connection cost. It also detects if the + * assigned value differs from the previous value and if so, calls PlacerTimingCosts's * invalidate() method on that connection cost. * * PlacerTimingCosts's invalidate() method marks the cost element's ancestors as invalid (NaN) @@ -193,7 +378,9 @@ class PlacerTimingCosts { size_t num_level_before_leaves = num_nodes_in_level(ilevel - 1); VTR_ASSERT_MSG(num_leaves >= num_connections, "Need at least as many leaves as connections"); - VTR_ASSERT_MSG(num_connections == 0 || num_level_before_leaves < num_connections, "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); + VTR_ASSERT_MSG( + num_connections == 0 || num_level_before_leaves < num_connections, + "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); //We don't need to store all possible leaves if we have fewer connections //(i.e. bottom-right of tree is empty) @@ -213,16 +400,19 @@ class PlacerTimingCosts { } } - //Proxy class representing a connection cost - // Supports modification of connection cost while detecting changes and - // reporting them up to PlacerTimingCosts + /** + * @brief Proxy class representing a connection cost. + * + * Supports modification of connection cost while detecting + * changes and reporting them up to PlacerTimingCosts. + */ class ConnectionProxy { public: ConnectionProxy(PlacerTimingCosts* timing_costs, double& connection_cost) : timing_costs_(timing_costs) , connection_cost_(connection_cost) {} - //Allow clients to modify the connection cost via assignment + ///@brief Allow clients to modify the connection cost via assignment. ConnectionProxy& operator=(double new_cost) { if (new_cost != connection_cost_) { //If connection cost changed, update it, and mark it @@ -233,9 +423,11 @@ class PlacerTimingCosts { return *this; } - //Support getting the current connection cost as a double - // Useful for client code operating on the cost values (e.g. - // difference between costs) + /** + * @brief Support getting the current connection cost as a double. + * + * Useful for client code operating on the cost values (e.g. difference between costs). + */ operator double() { return connection_cost_; } @@ -245,15 +437,18 @@ class PlacerTimingCosts { double& connection_cost_; }; - //Proxy class representing the connection costs of a net - // Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection + /** + * @brief Proxy class representing the connection costs of a net. + * + * Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection. 
+ */ class NetProxy { public: NetProxy(PlacerTimingCosts* timing_costs, double* net_sink_costs) : timing_costs_(timing_costs) , net_sink_costs_(net_sink_costs) {} - //Indexes into the specific net pin/connection + ///@brief Indexes into the specific net pin/connection. ConnectionProxy operator[](size_t ipin) { return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]); } @@ -263,7 +458,7 @@ class PlacerTimingCosts { double* net_sink_costs_; }; - //Indexes into the specific net + ///@brief Indexes into the specific net. NetProxy operator[](ClusterNetId net_id) { VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0); @@ -282,8 +477,10 @@ class PlacerTimingCosts { std::swap(num_levels_, other.num_levels_); } - //Calculates the total cost of all connections efficiently - //in the face of modified connection costs + /** + * @brief Calculates the total cost of all connections efficiently + * in the face of modified connection costs. + */ double total_cost() { float cost = total_cost_recurr(0); //Root @@ -294,7 +491,7 @@ class PlacerTimingCosts { } private: - //Recursively calculate and update the timing cost rooted at inode + ///@brief Recursively calculate and update the timing cost rooted at inode. double total_cost_recurr(size_t inode) { //Prune out-of-tree if (inode > connection_costs_.size() - 1) { @@ -329,12 +526,18 @@ class PlacerTimingCosts { return node_cost; } - friend ConnectionProxy; //So it can call invalidate() + ///@brief Friend-ed so it can call invalidate(). + friend ConnectionProxy; void invalidate(double* invalidated_cost) { //Check pointer within range of internal storage - VTR_ASSERT_SAFE_MSG(invalidated_cost >= &connection_costs_[0], "Connection cost pointer should be after start of internal storage"); - VTR_ASSERT_SAFE_MSG(invalidated_cost <= &connection_costs_[connection_costs_.size() - 1], "Connection cost pointer should be before end of internal storage"); + VTR_ASSERT_SAFE_MSG( + invalidated_cost >= &connection_costs_[0], + "Connection cost pointer should be after start of internal storage"); + + VTR_ASSERT_SAFE_MSG( + invalidated_cost <= &connection_costs_[connection_costs_.size() - 1], + "Connection cost pointer should be before end of internal storage"); size_t icost = invalidated_cost - &connection_costs_[0]; @@ -343,7 +546,7 @@ class PlacerTimingCosts { //Invalidate parent intermediate costs up to root or first //already-invalidated parent size_t iparent = parent(icost); - ; + while (!std::isnan(connection_costs_[iparent])) { //Invalidate connection_costs_[iparent] = std::numeric_limits::quiet_NaN(); @@ -371,33 +574,41 @@ class PlacerTimingCosts { return (i - 1) / 2; } - //Returns the number of nodes in ilevel'th level - //If ilevel is negative, return 0, since the root shouldn't be counted - //as a leaf node candidate + /** + * @brief Returns the number of nodes in ilevel'th level. + * + * If ilevel is negative, return 0, since the root shouldn't + * be counted as a leaf node candidate. + */ size_t num_nodes_in_level(int ilevel) const { return ilevel < 0 ? 0 : (2 << (ilevel)); } - //Returns the total number of nodes in levels [0..ilevel] (inclusive) + ///@brief Returns the total number of nodes in levels [0..ilevel] (inclusive). size_t num_nodes_up_to_level(int ilevel) const { return (2 << (ilevel + 1)) - 1; } private: - //Vector storing the implicit binary tree of connection costs - // The actual connections are stored at the end of the vector - // (last level of the binary tree). The earlier portions of - // the tree are the intermediate nodes. 
- // - // The methods left_child()/right_child()/parent() can be used - // to traverse the tree by indicies into this vector + /** + * @brief Vector storing the implicit binary tree of connection costs. + * + * The actual connections are stored at the end of the vector + * (last level of the binary tree). The earlier portions of + * the tree are the intermediate nodes. + * + * The methods left_child()/right_child()/parent() can be used + * to traverse the tree by indicies into this vector. + */ std::vector connection_costs_; - //Vector storing the indicies of the first connection for - //each net in the netlist, used for indexing by net. + /** + * @brief Vector storing the indicies of the first connection + * for each net in the netlist, used for indexing by net. + */ vtr::vector net_start_indicies_; - //Number of levels in the binary tree + ///@brief Number of levels in the binary tree. size_t num_levels_ = 0; }; diff --git a/vpr/src/timing/timing_util.cpp b/vpr/src/timing/timing_util.cpp index 6dd2c06d249..d1da2fbc164 100644 --- a/vpr/src/timing/timing_util.cpp +++ b/vpr/src/timing/timing_util.cpp @@ -564,10 +564,13 @@ std::map count_clock_fanouts(const tatum::TimingGraph& } /* - * Slack and criticality calculation utilities + * Criticalities and setup slacks calculation utilities */ -//Return the criticality of a net's pin in the CLB netlist +/** + * @brief Returns the criticality of a net's pin in the CLB netlist. + * Assumes that the timing graph is correct and up to date. + */ float calculate_clb_net_pin_criticality(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin) { //There may be multiple atom netlist pins connected to this CLB pin float clb_pin_crit = 0.; @@ -579,6 +582,21 @@ float calculate_clb_net_pin_criticality(const SetupTimingInfo& timing_info, cons return clb_pin_crit; } +/** + * @brief Returns the raw setup slack of a net's pin in the CLB netlist. + * Assumes that the timing graph is correct and up to date. + */ +float calculate_clb_net_pin_setup_slack(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin) { + //There may be multiple atom netlist pins connected to this CLB pin + float clb_pin_setup_slack = std::numeric_limits::infinity(); + for (const auto atom_pin : pin_lookup.connected_atom_pins(clb_pin)) { + //Take the worst/minimum of the atom pin slack as the CLB pin slack + clb_pin_setup_slack = std::min(clb_pin_setup_slack, timing_info.setup_pin_slack(atom_pin)); + } + + return clb_pin_setup_slack; +} + //Returns the worst (maximum) criticality of the set of slack tags specified. Requires the maximum //required time and worst slack for all domain pairs represent by the slack tags // diff --git a/vpr/src/timing/timing_util.h b/vpr/src/timing/timing_util.h index 87f6b86787b..682771e9763 100644 --- a/vpr/src/timing/timing_util.h +++ b/vpr/src/timing/timing_util.h @@ -183,6 +183,9 @@ class ClusteredPinTimingInvalidator { //Return the criticality of a net's pin in the CLB netlist float calculate_clb_net_pin_criticality(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin); +//Return the setup slack of a net's pin in the CLB netlist +float calculate_clb_net_pin_setup_slack(const SetupTimingInfo& timing_info, const ClusteredPinAtomPinsLookup& pin_lookup, ClusterPinId clb_pin); + //Returns the worst (maximum) criticality of the set of slack tags specified. 
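As a small illustration of the "worst of the connected atom pins" rule used by calculate_clb_net_pin_setup_slack() above (the atom pin slack values here are hypothetical):

// The CLB pin inherits the minimum (most pessimistic) setup slack of its atom pins.
#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
    std::vector<float> atom_pin_slacks = {0.8f, -0.3f, 0.1f}; // hypothetical values
    float clb_pin_slack = std::numeric_limits<float>::infinity();
    for (float s : atom_pin_slacks) {
        clb_pin_slack = std::min(clb_pin_slack, s);
    }
    std::printf("clb pin setup slack = %.1f\n", clb_pin_slack); // prints -0.3
    return 0;
}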
Requires the maximum //required time and worst slack for all domain pairs represent by the slack tags // diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_quench_slack/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_quench_slack/config/config.txt new file mode 100644 index 00000000000..c61444daf19 --- /dev/null +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_quench_slack/config/config.txt @@ -0,0 +1,27 @@ +############################################## +# Configuration file for running experiments +############################################## + +# Path to directory of circuits to use +circuits_dir=benchmarks/verilog + +# Path to directory of architectures to use +archs_dir=arch/timing + +# Add circuits to list to sweep +circuit_list_add=stereovision3.v + +# Add architectures to list to sweep +arch_list_add=k6_N10_mem32K_40nm.xml + +# Parse info and how to parse +parse_file=vpr_standard.txt + +# How to parse QoR info +qor_parse_file=qor_standard.txt + +# Pass requirements +pass_requirements_file=pass_requirements.txt + +# Script parameters +script_params = --place_quench_algorithm slack_timing diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_quench_slack/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_quench_slack/config/golden_results.txt new file mode 100644 index 00000000000..5053a6f6894 --- /dev/null +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_quench_slack/config/golden_results.txt @@ -0,0 +1,2 @@ +arch circuit script_params vtr_flow_elapsed_time error odin_synth_time max_odin_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_time placed_wirelength_est place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time +k6_N10_mem32K_40nm.xml stereovision3.v common 2.19 0.07 9296 4 0.16 -1 -1 32824 -1 -1 19 11 0 0 success v8.0.0-2579-g270d1efd9-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 7.5.0 on Linux-4.15.0-60-generic x86_64 2020-09-04T06:15:46 betzgrp-wintermute.eecg.utoronto.ca /home/hubingra/master/vtr-verilog-to-routing/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_place_quench_slack/run003/k6_N10_mem32K_40nm.xml/stereovision3.v/common 
28964 11 30 262 292 2 104 60 7 7 49 clb auto 0.05 453 0.24 0.13 2.18141 -165.789 -2.18141 2.0954 0.12497 0.10019 0.156789 0.124805 26 608 25 1.07788e+06 1.02399e+06 65453.8 1335.79 0.27 0.252669 0.202403 608 25 973 2367 87670 24993 2.53264 2.50992 -189.166 -2.53264 0 0 80140.9 1635.53 0.03 0.0187426 0.0157532 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt index 5cf098b2f77..e59cabff1c1 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt @@ -70,3 +70,4 @@ regression_tests/vtr_reg_strong/strong_timing_update_type regression_tests/vtr_reg_strong/strong_timing_update_diff regression_tests/vtr_reg_strong/strong_blocks_with_no_inputs regression_tests/vtr_reg_strong/strong_fix_clusters +regression_tests/vtr_reg_strong/strong_place_quench_slack