diff --git a/vpr/src/analytical_place/full_legalizer.cpp b/vpr/src/analytical_place/full_legalizer.cpp index 78150b70305..6981908c479 100644 --- a/vpr/src/analytical_place/full_legalizer.cpp +++ b/vpr/src/analytical_place/full_legalizer.cpp @@ -58,7 +58,7 @@ std::unique_ptr make_full_legalizer(e_ap_full_legalizer full_lega const APNetlist& ap_netlist, const AtomNetlist& atom_netlist, const Prepacker& prepacker, - t_vpr_setup& vpr_setup, + const t_vpr_setup& vpr_setup, const t_arch& arch, const DeviceGrid& device_grid) { switch (full_legalizer_type) { @@ -513,8 +513,8 @@ void APPack::legalize(const PartialPlacement& p_placement) { } // Run the Packer stage with the flat placement as a hint. - try_pack(&vpr_setup_.PackerOpts, - &vpr_setup_.AnalysisOpts, + try_pack(vpr_setup_.PackerOpts, + vpr_setup_.AnalysisOpts, arch_, vpr_setup_.RoutingArch, vpr_setup_.PackerRRGraph, diff --git a/vpr/src/analytical_place/full_legalizer.h b/vpr/src/analytical_place/full_legalizer.h index 62c42d1b722..3532022760d 100644 --- a/vpr/src/analytical_place/full_legalizer.h +++ b/vpr/src/analytical_place/full_legalizer.h @@ -37,7 +37,7 @@ class FullLegalizer { FullLegalizer(const APNetlist& ap_netlist, const AtomNetlist& atom_netlist, const Prepacker& prepacker, - t_vpr_setup& vpr_setup, + const t_vpr_setup& vpr_setup, const t_arch& arch, const DeviceGrid& device_grid) : ap_netlist_(ap_netlist) @@ -68,7 +68,7 @@ class FullLegalizer { /// @brief The VPR setup options passed into the VPR flow. This must be /// mutable since some parts of packing modify the options. - t_vpr_setup& vpr_setup_; + const t_vpr_setup& vpr_setup_; /// @brief Information on the architecture of the FPGA. 
const t_arch& arch_; @@ -84,7 +84,7 @@ std::unique_ptr make_full_legalizer(e_ap_full_legalizer full_lega const APNetlist& ap_netlist, const AtomNetlist& atom_netlist, const Prepacker& prepacker, - t_vpr_setup& vpr_setup, + const t_vpr_setup& vpr_setup, const t_arch& arch, const DeviceGrid& device_grid); diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index eb9af5943ad..fe046fd932e 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -591,10 +591,6 @@ void SetupPackerOpts(const t_options& Options, PackerOpts->feasible_block_array_size = Options.pack_feasible_block_array_size; PackerOpts->use_attraction_groups = Options.use_attraction_groups; - //TODO: document? - PackerOpts->inter_cluster_net_delay = 1.0; /* DEFAULT */ - PackerOpts->auto_compute_inter_cluster_net_delay = true; - PackerOpts->device_layout = Options.device_layout; PackerOpts->timing_update_type = Options.timing_update_type; diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp index b1de3da9729..712fa5619c1 100644 --- a/vpr/src/base/ShowSetup.cpp +++ b/vpr/src/base/ShowSetup.cpp @@ -757,7 +757,6 @@ static void ShowPackerOpts(const t_packer_opts& PackerOpts) { } VTR_LOG("PackerOpts.connection_driven: %s", (PackerOpts.connection_driven ? "true\n" : "false\n")); VTR_LOG("PackerOpts.global_clocks: %s", (PackerOpts.global_clocks ? "true\n" : "false\n")); - VTR_LOG("PackerOpts.inter_cluster_net_delay: %f\n", PackerOpts.inter_cluster_net_delay); VTR_LOG("PackerOpts.timing_driven: %s", (PackerOpts.timing_driven ? 
"true\n" : "false\n")); VTR_LOG("PackerOpts.target_external_pin_util: %s", vtr::join(PackerOpts.target_external_pin_util, " ").c_str()); VTR_LOG("\n"); diff --git a/vpr/src/base/vpr_api.cpp b/vpr/src/base/vpr_api.cpp index afd4c211160..3e2c0fd4f48 100644 --- a/vpr/src/base/vpr_api.cpp +++ b/vpr/src/base/vpr_api.cpp @@ -620,7 +620,7 @@ bool vpr_pack(t_vpr_setup& vpr_setup, const t_arch& arch) { const Prepacker prepacker(g_vpr_ctx.atom().netlist(), g_vpr_ctx.device().logical_block_types); - return try_pack(&vpr_setup.PackerOpts, &vpr_setup.AnalysisOpts, + return try_pack(vpr_setup.PackerOpts, vpr_setup.AnalysisOpts, arch, vpr_setup.RoutingArch, vpr_setup.PackerRRGraph, prepacker, diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 56e68526277..78f28407612 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -717,9 +717,7 @@ struct t_packer_opts { enum e_cluster_seed cluster_seed_type; float alpha; float beta; - float inter_cluster_net_delay; float target_device_utilization; - bool auto_compute_inter_cluster_net_delay; e_unrelated_clustering allow_unrelated_clustering; bool connection_driven; int pack_verbosity; diff --git a/vpr/src/pack/cluster_util.cpp b/vpr/src/pack/cluster_util.cpp index 0a5dcd88577..dd307168a36 100644 --- a/vpr/src/pack/cluster_util.cpp +++ b/vpr/src/pack/cluster_util.cpp @@ -2,17 +2,13 @@ #include #include -#include "PreClusterTimingGraphResolver.h" -#include "PreClusterDelayCalculator.h" #include "atom_netlist.h" #include "attraction_groups.h" #include "cluster_legalizer.h" #include "clustered_netlist.h" -#include "concrete_timing_info.h" +#include "globals.h" #include "output_clustering.h" #include "prepack.h" -#include "tatum/TimingReporter.hpp" -#include "tatum/echo_writer.hpp" #include "vpr_context.h" /*Print the contents of each cluster to an echo file*/ @@ -67,58 +63,6 @@ static void echo_clusters(char* filename, const ClusterLegalizer& cluster_legali fclose(fp); } -void calc_init_packing_timing(const 
t_packer_opts& packer_opts, - const t_analysis_opts& analysis_opts, - const Prepacker& prepacker, - std::shared_ptr& clustering_delay_calc, - std::shared_ptr& timing_info, - vtr::vector& atom_criticality) { - const AtomContext& atom_ctx = g_vpr_ctx.atom(); - - /* - * Initialize the timing analyzer - */ - clustering_delay_calc = std::make_shared(atom_ctx.netlist(), atom_ctx.lookup(), packer_opts.inter_cluster_net_delay, prepacker); - timing_info = make_setup_timing_info(clustering_delay_calc, packer_opts.timing_update_type); - - //Calculate the initial timing - timing_info->update(); - - if (isEchoFileEnabled(E_ECHO_PRE_PACKING_TIMING_GRAPH)) { - auto& timing_ctx = g_vpr_ctx.timing(); - tatum::write_echo(getEchoFileName(E_ECHO_PRE_PACKING_TIMING_GRAPH), - *timing_ctx.graph, *timing_ctx.constraints, *clustering_delay_calc, timing_info->analyzer()); - - tatum::NodeId debug_tnode = id_or_pin_name_to_tnode(analysis_opts.echo_dot_timing_graph_node); - write_setup_timing_graph_dot(getEchoFileName(E_ECHO_PRE_PACKING_TIMING_GRAPH) + std::string(".dot"), - *timing_info, debug_tnode); - } - - { - auto& timing_ctx = g_vpr_ctx.timing(); - PreClusterTimingGraphResolver resolver(atom_ctx.netlist(), - atom_ctx.lookup(), *timing_ctx.graph, *clustering_delay_calc); - resolver.set_detail_level(analysis_opts.timing_report_detail); - - tatum::TimingReporter timing_reporter(resolver, *timing_ctx.graph, - *timing_ctx.constraints); - - timing_reporter.report_timing_setup( - "pre_pack.report_timing.setup.rpt", - *timing_info->setup_analyzer(), - analysis_opts.timing_report_npaths); - } - - //Calculate true criticalities of each block - for (AtomBlockId blk : atom_ctx.netlist().blocks()) { - for (AtomPinId in_pin : atom_ctx.netlist().block_input_pins(blk)) { - //Max criticality over incoming nets - float crit = timing_info->setup_pin_criticality(in_pin); - atom_criticality[blk] = std::max(atom_criticality[blk], crit); - } - } -} - void check_and_output_clustering(ClusterLegalizer& 
cluster_legalizer, const t_packer_opts& packer_opts, const std::unordered_set& is_clock, diff --git a/vpr/src/pack/cluster_util.h b/vpr/src/pack/cluster_util.h index 8f74ed9c91a..4f4c2b5bec8 100644 --- a/vpr/src/pack/cluster_util.h +++ b/vpr/src/pack/cluster_util.h @@ -11,10 +11,6 @@ class AttractionInfo; class ClusterBlockId; class ClusterLegalizer; class ClusteredNetlist; -class PreClusterDelayCalculator; -class Prepacker; -class SetupTimingInfo; -class t_pack_molecule; struct AtomContext; /** @@ -26,16 +22,6 @@ struct AtomContext; /* Clustering helper functions */ /***********************************/ -/* - * @brief Calculate the initial timing at the start of packing stage. - */ -void calc_init_packing_timing(const t_packer_opts& packer_opts, - const t_analysis_opts& analysis_opts, - const Prepacker& prepacker, - std::shared_ptr& clustering_delay_calc, - std::shared_ptr& timing_info, - vtr::vector& atom_criticality); - /* * @brief Check clustering legality and output it. */ diff --git a/vpr/src/pack/greedy_candidate_selector.cpp b/vpr/src/pack/greedy_candidate_selector.cpp index 26a0f7f2ec3..b202035ec59 100644 --- a/vpr/src/pack/greedy_candidate_selector.cpp +++ b/vpr/src/pack/greedy_candidate_selector.cpp @@ -10,6 +10,7 @@ #include #include #include +#include "PreClusterTimingManager.h" #include "appack_context.h" #include "flat_placement_types.h" #include "flat_placement_utils.h" @@ -90,7 +91,7 @@ GreedyCandidateSelector::GreedyCandidateSelector( const std::unordered_set& is_clock, const std::unordered_set& is_global, const std::unordered_set& net_output_feeds_driving_block_input, - const SetupTimingInfo& timing_info, + const PreClusterTimingManager& pre_cluster_timing_manager, const APPackContext& appack_ctx, int log_verbosity) : atom_netlist_(atom_netlist) @@ -103,7 +104,7 @@ GreedyCandidateSelector::GreedyCandidateSelector( , is_clock_(is_clock) , is_global_(is_global) , net_output_feeds_driving_block_input_(net_output_feeds_driving_block_input) - , 
timing_info_(timing_info) + , pre_cluster_timing_manager_(pre_cluster_timing_manager) , appack_ctx_(appack_ctx) , rng_(0) { @@ -544,12 +545,15 @@ void GreedyCandidateSelector::update_timing_gain_values( if (net_output_feeds_driving_block_input_.count(net_id) != 0) pins = atom_netlist_.net_sinks(net_id); + // Get the setup timing info used to compute timing gain terms. + const SetupTimingInfo& timing_info = pre_cluster_timing_manager_.get_timing_info(); + if (net_relation_to_clustered_block == e_net_relation_to_clustered_block::OUTPUT && !is_global_.count(net_id)) { for (AtomPinId pin_id : pins) { AtomBlockId blk_id = atom_netlist_.pin_block(pin_id); if (!cluster_legalizer.is_atom_clustered(blk_id)) { - double timing_gain = timing_info_.setup_pin_criticality(pin_id); + double timing_gain = timing_info.setup_pin_criticality(pin_id); if (cluster_gain_stats.timing_gain.count(blk_id) == 0) { cluster_gain_stats.timing_gain[blk_id] = 0; @@ -569,7 +573,7 @@ void GreedyCandidateSelector::update_timing_gain_values( if (!cluster_legalizer.is_atom_clustered(new_blk_id)) { for (AtomPinId pin_id : atom_netlist_.net_sinks(net_id)) { - double timing_gain = timing_info_.setup_pin_criticality(pin_id); + double timing_gain = timing_info.setup_pin_criticality(pin_id); if (cluster_gain_stats.timing_gain.count(new_blk_id) == 0) { cluster_gain_stats.timing_gain[new_blk_id] = 0; diff --git a/vpr/src/pack/greedy_candidate_selector.h b/vpr/src/pack/greedy_candidate_selector.h index 89931662a54..2b3eb23a1f5 100644 --- a/vpr/src/pack/greedy_candidate_selector.h +++ b/vpr/src/pack/greedy_candidate_selector.h @@ -26,8 +26,8 @@ class AtomNetlist; class AttractionInfo; class FlatPlacementInfo; +class PreClusterTimingManager; class Prepacker; -class SetupTimingInfo; class t_pack_high_fanout_thresholds; struct t_model; struct t_molecule_stats; @@ -225,9 +225,10 @@ class GreedyCandidateSelector { * The set of nets whose output feeds the block that drives * itself. 
This may cause double-counting in the gain * calculations and needs special handling. - * @param timing_info - * Setup timing info for this Atom Netlist. Used to incorporate - * timing / criticality into the gain calculation. + * @param pre_cluster_timing_manager + * Timing manager that holds the information on timing of + * different connections in the circuit. Used for computing + * the timing gain terms. * @param appack_ctx * The APPack context which contains options for the flat * placement guided packing. @@ -244,7 +245,7 @@ class GreedyCandidateSelector { const std::unordered_set& is_clock, const std::unordered_set& is_global, const std::unordered_set& net_output_feeds_driving_block_input, - const SetupTimingInfo& timing_info, + const PreClusterTimingManager& pre_cluster_timing_manager, const APPackContext& appack_ctx, int log_verbosity); @@ -565,8 +566,9 @@ class GreedyCandidateSelector { /// drive them. const std::unordered_set& net_output_feeds_driving_block_input_; - /// @brief Setup timing info used to help select critical candidates to pack. - const SetupTimingInfo& timing_info_; + /// @brief The pre-clustering timing manager which holds the timing information + /// of the primitive netlist. + const PreClusterTimingManager& pre_cluster_timing_manager_; /// @brief Inter-block nets within a finalized cluster. Used for finding /// transitive candidates. 
diff --git a/vpr/src/pack/greedy_clusterer.cpp b/vpr/src/pack/greedy_clusterer.cpp index 6312c8be79c..7673005af93 100644 --- a/vpr/src/pack/greedy_clusterer.cpp +++ b/vpr/src/pack/greedy_clusterer.cpp @@ -79,6 +79,7 @@ GreedyClusterer::GreedyClusterer(const t_packer_opts& packer_opts, const t_pack_high_fanout_thresholds& high_fanout_thresholds, const std::unordered_set& is_clock, const std::unordered_set& is_global, + const PreClusterTimingManager& pre_cluster_timing_manager, const APPackContext& appack_ctx) : packer_opts_(packer_opts) , analysis_opts_(analysis_opts) @@ -87,6 +88,7 @@ GreedyClusterer::GreedyClusterer(const t_packer_opts& packer_opts, , high_fanout_thresholds_(high_fanout_thresholds) , is_clock_(is_clock) , is_global_(is_global) + , pre_cluster_timing_manager_(pre_cluster_timing_manager) , appack_ctx_(appack_ctx) , primitive_candidate_block_types_(identify_primitive_candidate_block_types()) , log_verbosity_(packer_opts.pack_verbosity) @@ -113,18 +115,6 @@ GreedyClusterer::do_clustering(ClusterLegalizer& cluster_legalizer, t_cluster_progress_stats clustering_stats; clustering_stats.num_molecules = prepacker.molecules().size(); - // TODO: Create a ClusteringTimingManager class. - // This code relies on the prepacker, once the prepacker is moved to - // the constructor, this code can also move to the constructor. - std::shared_ptr clustering_delay_calc; - std::shared_ptr timing_info; - // Default criticalities set to zero (e.g. if not timing driven) - vtr::vector atom_criticality(atom_netlist_.blocks().size(), 0.f); - if (packer_opts_.timing_driven) { - calc_init_packing_timing(packer_opts_, analysis_opts_, prepacker, - clustering_delay_calc, timing_info, atom_criticality); - } - // Calculate the max molecule stats, which is used for gain calculation. 
const t_molecule_stats max_molecule_stats = prepacker.calc_max_molecule_stats(atom_netlist_); @@ -140,7 +130,7 @@ GreedyClusterer::do_clustering(ClusterLegalizer& cluster_legalizer, is_clock_, is_global_, net_output_feeds_driving_block_input_, - *timing_info, + pre_cluster_timing_manager_, appack_ctx_, log_verbosity_); @@ -149,7 +139,7 @@ GreedyClusterer::do_clustering(ClusterLegalizer& cluster_legalizer, prepacker, packer_opts_.cluster_seed_type, max_molecule_stats, - atom_criticality); + pre_cluster_timing_manager_); // Pick the first seed molecule. PackMoleculeId seed_mol_id = seed_selector.get_next_seed(prepacker, diff --git a/vpr/src/pack/greedy_clusterer.h b/vpr/src/pack/greedy_clusterer.h index e246d9c679d..4c805ffa594 100644 --- a/vpr/src/pack/greedy_clusterer.h +++ b/vpr/src/pack/greedy_clusterer.h @@ -22,7 +22,7 @@ class AtomNetlist; class AttractionInfo; class DeviceContext; class GreedyCandidateSelector; -class SetupTimingInfo; +class PreClusterTimingManager; class t_pack_high_fanout_thresholds; struct t_analysis_opts; struct t_clustering_data; @@ -76,6 +76,11 @@ class GreedyClusterer { * The set of global nets in the Atom Netlist. These will be * routed on special dedicated networks, and hence are less * relavent to locality / attraction. + * @param pre_cluster_timing_manager + * Timing manager class which holds the timing information of + * the primitive netlist. Used by the seed selector to select + * critical seeds and the candidate selector to select + * timing critical candidates. * @param appack_ctx * The APPack state. This contains the options used to * configure APPack and the flat placement. 
@@ -87,6 +92,7 @@ class GreedyClusterer { const t_pack_high_fanout_thresholds& high_fanout_thresholds, const std::unordered_set& is_clock, const std::unordered_set& is_global, + const PreClusterTimingManager& pre_cluster_timing_manager, const APPackContext& appack_ctx); /** @@ -233,6 +239,9 @@ class GreedyClusterer { /// @brief A set of atom nets which are considered as global nets. const std::unordered_set& is_global_; + /// @brief Timing manager class which holds the primitive-level timing information. + const PreClusterTimingManager& pre_cluster_timing_manager_; + /// @brief The APPack state. This is used by the candidate selector to try /// and propose better candidates based on a flat placement. const APPackContext& appack_ctx_; diff --git a/vpr/src/pack/greedy_seed_selector.cpp b/vpr/src/pack/greedy_seed_selector.cpp index 592ddced59d..c9d1b9397c5 100644 --- a/vpr/src/pack/greedy_seed_selector.cpp +++ b/vpr/src/pack/greedy_seed_selector.cpp @@ -9,6 +9,7 @@ #include #include +#include "PreClusterTimingManager.h" #include "flat_placement_types.h" #include "atom_netlist.h" #include "cluster_legalizer.h" @@ -167,10 +168,21 @@ GreedySeedSelector::GreedySeedSelector(const AtomNetlist& atom_netlist, const Prepacker& prepacker, const e_cluster_seed seed_type, const t_molecule_stats& max_molecule_stats, - const vtr::vector& atom_criticality) + const PreClusterTimingManager& pre_cluster_timing_manager) : seed_atoms_(atom_netlist.blocks().begin(), atom_netlist.blocks().end()) { // Seed atoms list is initialized with all atoms in the atom netlist. + // Pre-compute the criticality of each atom + // Default criticalities set to zero (e.g. if not timing driven) + vtr::vector atom_criticality(atom_netlist.blocks().size(), 0.0f); + if (pre_cluster_timing_manager.is_valid()) { + // If the timing manager is valid (meaning the packing is timing driven) + // compute the criticality of each atom. 
+ for (AtomBlockId atom_blk_id : atom_netlist.blocks()) { + atom_criticality[atom_blk_id] = pre_cluster_timing_manager.calc_atom_setup_criticality(atom_blk_id, atom_netlist); + } + } + // Maintain a lookup table of the seed gain for each atom. This will be // used to sort the seed atoms. // Initially all gains are zero. diff --git a/vpr/src/pack/greedy_seed_selector.h b/vpr/src/pack/greedy_seed_selector.h index 16bbbc7cf19..5f152f65236 100644 --- a/vpr/src/pack/greedy_seed_selector.h +++ b/vpr/src/pack/greedy_seed_selector.h @@ -14,6 +14,7 @@ // Forward declarations class AtomNetlist; class ClusterLegalizer; +class PreClusterTimingManager; struct t_molecule_stats; /** @@ -44,14 +45,15 @@ class GreedySeedSelector { * @param max_molecule_stats * The maximum stats over all molecules. Used for normalizing * terms in the gain. - * @param atom_criticality - * The timing criticality of each atom. + * @param pre_cluster_timing_manager + * Timing manager class for the primitive netlist. Used to + * compute the criticalities of seeds. */ GreedySeedSelector(const AtomNetlist& atom_netlist, const Prepacker& prepacker, const e_cluster_seed seed_type, const t_molecule_stats& max_molecule_stats, - const vtr::vector& atom_criticality); + const PreClusterTimingManager& pre_cluster_timing_manager); /** * @brief Propose a new seed molecule to start a new cluster with. 
If no diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index 5f4f2849b52..ae1cde8244d 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -2,23 +2,23 @@ #include "pack.h" #include -#include "appack_context.h" -#include "flat_placement_types.h" +#include "PreClusterTimingManager.h" #include "SetupGrid.h" +#include "appack_context.h" #include "attraction_groups.h" #include "cluster_legalizer.h" #include "cluster_util.h" #include "constraints_report.h" +#include "flat_placement_types.h" #include "globals.h" #include "greedy_clusterer.h" #include "partition_region.h" -#include "physical_types_util.h" #include "prepack.h" +#include "stats.h" #include "verify_flat_placement.h" #include "vpr_context.h" #include "vpr_error.h" #include "vpr_types.h" -#include "stats.h" #include "vtr_assert.h" #include "vtr_log.h" @@ -27,32 +27,8 @@ static bool try_size_device_grid(const t_arch& arch, float target_device_utilization, const std::string& device_layout_name); -/** - * Since the parameters of a switch may change as a function of its fanin, - * to get an estimation of inter-cluster delays we need a reasonable estimation - * of the fan-ins of switches that connect clusters together. 
These switches are - * 1) opin to wire switch - * 2) wire to wire switch - * 3) wire to ipin switch - * We can estimate the fan-in of these switches based on the Fc_in/Fc_out of - * a logic block, and the switch block Fs value - */ -static void get_intercluster_switch_fanin_estimates(const t_arch& arch, - const t_det_routing_arch& routing_arch, - const std::string& device_layout, - const int wire_segment_length, - int* opin_switch_fanin, - int* wire_switch_fanin, - int* ipin_switch_fanin); - -static float get_arch_switch_info(short switch_index, int switch_fanin, float& Tdel_switch, float& R_switch, float& Cout_switch); - -static float approximate_inter_cluster_delay(const t_arch& arch, - const t_det_routing_arch& routing_arch, - const std::string& device_layout); - -bool try_pack(t_packer_opts* packer_opts, - const t_analysis_opts* analysis_opts, +bool try_pack(const t_packer_opts& packer_opts, + const t_analysis_opts& analysis_opts, const t_arch& arch, const t_det_routing_arch& routing_arch, std::vector* lb_type_rr_graphs, @@ -65,7 +41,7 @@ bool try_pack(t_packer_opts* packer_opts, DeviceContext& mutable_device_ctx = g_vpr_ctx.mutable_device(); std::unordered_set is_clock, is_global; - VTR_LOG("Begin packing '%s'.\n", packer_opts->circuit_file_name.c_str()); + VTR_LOG("Begin packing '%s'.\n", packer_opts.circuit_file_name.c_str()); is_clock = alloc_and_load_is_clock(); is_global.insert(is_clock.begin(), is_clock.end()); @@ -91,6 +67,17 @@ bool try_pack(t_packer_opts* packer_opts, */ AttractionInfo attraction_groups(false); + // Setup pre-clustering timing analysis + PreClusterTimingManager pre_cluster_timing_manager(packer_opts.timing_driven, + atom_ctx.netlist(), + atom_ctx.lookup(), + prepacker, + packer_opts.timing_update_type, + arch, + routing_arch, + packer_opts.device_layout, + analysis_opts); + // We keep track of the overfilled partition regions from all pack iterations in // this vector. 
This is so that if the first iteration fails due to overfilled // partition regions, and it fails again, we can carry over the previous failed @@ -115,34 +102,23 @@ bool try_pack(t_packer_opts* packer_opts, } } - if (packer_opts->auto_compute_inter_cluster_net_delay) { - float interc_delay = UNDEFINED; - if (packer_opts->timing_driven) { - interc_delay = approximate_inter_cluster_delay(arch, - routing_arch, - packer_opts->device_layout); - } - packer_opts->inter_cluster_net_delay = interc_delay; - VTR_LOG("Using inter-cluster delay: %g\n", packer_opts->inter_cluster_net_delay); - } - // During clustering, a block is related to un-clustered primitives with nets. // This relation has three types: low fanout, high fanout, and transitive // high_fanout_thresholds stores the threshold for nets to a block type to // be considered high fanout. - t_pack_high_fanout_thresholds high_fanout_thresholds(packer_opts->high_fanout_threshold); + t_pack_high_fanout_thresholds high_fanout_thresholds(packer_opts.high_fanout_threshold); bool allow_unrelated_clustering = false; - if (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::ON) { + if (packer_opts.allow_unrelated_clustering == e_unrelated_clustering::ON) { allow_unrelated_clustering = true; - } else if (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::OFF) { + } else if (packer_opts.allow_unrelated_clustering == e_unrelated_clustering::OFF) { allow_unrelated_clustering = false; } bool balance_block_type_util = false; - if (packer_opts->balance_block_type_utilization == e_balance_block_type_util::ON) { + if (packer_opts.balance_block_type_utilization == e_balance_block_type_util::ON) { balance_block_type_util = true; - } else if (packer_opts->balance_block_type_utilization == e_balance_block_type_util::OFF) { + } else if (packer_opts.balance_block_type_utilization == e_balance_block_type_util::OFF) { balance_block_type_util = false; } @@ -151,11 +127,11 @@ bool try_pack(t_packer_opts* 
packer_opts, ClusterLegalizer cluster_legalizer(atom_ctx.netlist(), prepacker, lb_type_rr_graphs, - packer_opts->target_external_pin_util, + packer_opts.target_external_pin_util, high_fanout_thresholds, ClusterLegalizationStrategy::SKIP_INTRA_LB_ROUTE, - packer_opts->enable_pin_feasibility_filter, - packer_opts->pack_verbosity); + packer_opts.enable_pin_feasibility_filter, + packer_opts.pack_verbosity); VTR_LOG("Packing with pin utilization targets: %s\n", cluster_legalizer.get_target_external_pin_util().to_string().c_str()); VTR_LOG("Packing with high fanout thresholds: %s\n", high_fanout_thresholds.to_string().c_str()); @@ -163,13 +139,14 @@ bool try_pack(t_packer_opts* packer_opts, APPackContext appack_ctx(flat_placement_info, device_ctx.grid); // Initialize the greedy clusterer. - GreedyClusterer clusterer(*packer_opts, - *analysis_opts, + GreedyClusterer clusterer(packer_opts, + analysis_opts, atom_ctx.netlist(), arch, high_fanout_thresholds, is_clock, is_global, + pre_cluster_timing_manager, appack_ctx); g_vpr_ctx.mutable_atom().mutable_lookup().set_atom_pb_bimap_lock(true); @@ -187,7 +164,7 @@ bool try_pack(t_packer_opts* packer_opts, mutable_device_ctx); //Try to size/find a device - bool fits_on_device = try_size_device_grid(arch, num_used_type_instances, packer_opts->target_device_utilization, packer_opts->device_layout); + bool fits_on_device = try_size_device_grid(arch, num_used_type_instances, packer_opts.target_device_utilization, packer_opts.device_layout); /* We use this bool to determine the cause for the clustering not being dense enough. If the clustering * is not dense enough and there are floorplan constraints, it is presumed that the constraints are the cause @@ -205,11 +182,11 @@ bool try_pack(t_packer_opts* packer_opts, //1st pack attempt was unsuccessful (i.e. 
not dense enough) and we have control of unrelated clustering // //Turn it on to increase packing density - if (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::AUTO) { + if (packer_opts.allow_unrelated_clustering == e_unrelated_clustering::AUTO) { VTR_ASSERT(allow_unrelated_clustering == false); allow_unrelated_clustering = true; } - if (packer_opts->balance_block_type_utilization == e_balance_block_type_util::AUTO) { + if (packer_opts.balance_block_type_utilization == e_balance_block_type_util::AUTO) { VTR_ASSERT(balance_block_type_util == false); balance_block_type_util = true; } @@ -313,7 +290,7 @@ bool try_pack(t_packer_opts* packer_opts, g_vpr_ctx.mutable_atom().mutable_lookup().set_atom_pb_bimap_lock(false); g_vpr_ctx.mutable_atom().mutable_lookup().set_atom_to_pb_bimap(cluster_legalizer.atom_pb_lookup()); //check clustering and output it - check_and_output_clustering(cluster_legalizer, *packer_opts, is_clock, &arch); + check_and_output_clustering(cluster_legalizer, packer_opts, is_clock, &arch); VTR_LOG("\n"); VTR_LOG("Netlist conversion complete.\n"); @@ -322,24 +299,6 @@ bool try_pack(t_packer_opts* packer_opts, return true; } -static float get_arch_switch_info(short switch_index, int switch_fanin, float& Tdel_switch, float& R_switch, float& Cout_switch) { - /* Fetches delay, resistance and output capacitance of the architecture switch at switch_index. - * Returns the total delay through the switch. Used to calculate inter-cluster net delay. */ - - /* The intrinsic delay may depend on fanin to the switch. If the delay map of a - * switch from the architecture file has multiple (#inputs, delay) entries, we - * interpolate/extrapolate to get the delay at 'switch_fanin'. 
*/ - auto& device_ctx = g_vpr_ctx.device(); - - Tdel_switch = device_ctx.arch_switch_inf[switch_index].Tdel(switch_fanin); - R_switch = device_ctx.arch_switch_inf[switch_index].R; - Cout_switch = device_ctx.arch_switch_inf[switch_index].Cout; - - /* The delay through a loaded switch is its intrinsic (unloaded) - * delay plus the product of its resistance and output capacitance. */ - return Tdel_switch + R_switch * Cout_switch; -} - std::unordered_set alloc_and_load_is_clock() { /* Looks through all the atom blocks to find and mark all the clocks, by setting * the corresponding entry by adding the clock to is_clock. @@ -409,132 +368,3 @@ static bool try_size_device_grid(const t_arch& arch, return fits_on_device; } - -static void get_intercluster_switch_fanin_estimates(const t_arch& arch, - const t_det_routing_arch& routing_arch, - const std::string& device_layout, - const int wire_segment_length, - int* opin_switch_fanin, - int* wire_switch_fanin, - int* ipin_switch_fanin) { - // W is unknown pre-packing, so *if* we need W here, we will assume a value of 100 - constexpr int W = 100; - - //Build a dummy 10x10 device to determine the 'best' block type to use - auto grid = create_device_grid(device_layout, arch.grid_layouts, 10, 10); - - auto type = find_most_common_tile_type(grid); - /* get Fc_in/out for most common block (e.g. 
logic blocks) */ - VTR_ASSERT(!type->fc_specs.empty()); - - //Estimate the maximum Fc_in/Fc_out - float Fc_in = 0.f; - float Fc_out = 0.f; - for (const t_fc_specification& fc_spec : type->fc_specs) { - float Fc = fc_spec.fc_value; - - if (fc_spec.fc_value_type == e_fc_value_type::ABSOLUTE) { - //Convert to estimated fractional - Fc /= W; - } - VTR_ASSERT_MSG(Fc >= 0 && Fc <= 1., "Fc should be fractional"); - - for (int ipin : fc_spec.pins) { - e_pin_type pin_type = get_pin_type_from_pin_physical_num(type, ipin); - - if (pin_type == DRIVER) { - Fc_out = std::max(Fc, Fc_out); - } else { - VTR_ASSERT(pin_type == RECEIVER); - Fc_in = std::max(Fc, Fc_in); - } - } - } - - /* Estimates of switch fan-in are done as follows: - * 1) opin to wire switch: - * 2 CLBs connect to a channel, each with #opins/4 pins. Each pin has Fc_out*W - * switches, and then we assume the switches are distributed evenly over the W wires. - * In the unidirectional case, all these switches are then crammed down to W/wire_segment_length wires. - * - * Unidirectional: 2 * #opins_per_side * Fc_out * wire_segment_length - * Bidirectional: 2 * #opins_per_side * Fc_out - * - * 2) wire to wire switch - * A wire segment in a switchblock connects to Fs other wires. Assuming these connections are evenly - * distributed, each target wire receives Fs connections as well. In the unidirectional case, - * source wires can only connect to W/wire_segment_length wires. - * - * Unidirectional: Fs * wire_segment_length - * Bidirectional: Fs - * - * 3) wire to ipin switch - * An input pin of a CLB simply receives Fc_in connections. 
- * - * Unidirectional: Fc_in - * Bidirectional: Fc_in - */ - - /* Fan-in to opin/ipin/wire switches depends on whether the architecture is unidirectional/bidirectional */ - (*opin_switch_fanin) = 2.f * type->num_drivers / 4.f * Fc_out; - (*wire_switch_fanin) = routing_arch.Fs; - (*ipin_switch_fanin) = Fc_in; - if (routing_arch.directionality == UNI_DIRECTIONAL) { - /* adjustments to opin-to-wire and wire-to-wire switch fan-ins */ - (*opin_switch_fanin) *= wire_segment_length; - (*wire_switch_fanin) *= wire_segment_length; - } else if (routing_arch.directionality == BI_DIRECTIONAL) { - /* no adjustments need to be made here */ - } else { - VPR_FATAL_ERROR(VPR_ERROR_PACK, "Unrecognized directionality: %d\n", - (int)routing_arch.directionality); - } -} - -static float approximate_inter_cluster_delay(const t_arch& arch, - const t_det_routing_arch& routing_arch, - const std::string& device_layout) { - - /* If needed, estimate inter-cluster delay. Assume the average routing hop goes out of - * a block through an opin switch to a length-4 wire, then through a wire switch to another - * length-4 wire, then through a wire-to-ipin-switch into another block. */ - constexpr int wire_segment_length = 4; - - /* We want to determine a reasonable fan-in to the opin, wire, and ipin switches, based - * on which the intercluster delays can be estimated. The fan-in of a switch influences its - * delay. 
- * - * The fan-in of the switch depends on the architecture (unidirectional/bidirectional), as - * well as Fc_in/out and Fs */ - int opin_switch_fanin, wire_switch_fanin, ipin_switch_fanin; - get_intercluster_switch_fanin_estimates(arch, routing_arch, device_layout, wire_segment_length, &opin_switch_fanin, - &wire_switch_fanin, &ipin_switch_fanin); - - float Tdel_opin_switch, R_opin_switch, Cout_opin_switch; - float opin_switch_del = get_arch_switch_info(arch.Segments[0].arch_opin_switch, opin_switch_fanin, - Tdel_opin_switch, R_opin_switch, Cout_opin_switch); - - float Tdel_wire_switch, R_wire_switch, Cout_wire_switch; - float wire_switch_del = get_arch_switch_info(arch.Segments[0].arch_wire_switch, wire_switch_fanin, - Tdel_wire_switch, R_wire_switch, Cout_wire_switch); - - float Tdel_wtoi_switch, R_wtoi_switch, Cout_wtoi_switch; - float wtoi_switch_del = get_arch_switch_info(routing_arch.wire_to_arch_ipin_switch, ipin_switch_fanin, - Tdel_wtoi_switch, R_wtoi_switch, Cout_wtoi_switch); - - float Rmetal = arch.Segments[0].Rmetal; - float Cmetal = arch.Segments[0].Cmetal; - - /* The delay of a wire with its driving switch is the switch delay plus the - * product of the equivalent resistance and capacitance experienced by the wire. */ - - float first_wire_seg_delay = opin_switch_del - + (R_opin_switch + Rmetal * (float)wire_segment_length / 2) - * (Cout_opin_switch + Cmetal * (float)wire_segment_length); - float second_wire_seg_delay = wire_switch_del - + (R_wire_switch + Rmetal * (float)wire_segment_length / 2) - * (Cout_wire_switch + Cmetal * (float)wire_segment_length); - - /* multiply by 4 to get a more conservative estimate */ - return 4 * (first_wire_seg_delay + second_wire_seg_delay + wtoi_switch_del); -} diff --git a/vpr/src/pack/pack.h b/vpr/src/pack/pack.h index 2d22a8dc230..c0cb1a4581f 100644 --- a/vpr/src/pack/pack.h +++ b/vpr/src/pack/pack.h @@ -31,8 +31,8 @@ struct t_packer_opts; * provided by the user as a hint for packing. 
Will be invalid if * there is no flat placement information provided. */ -bool try_pack(t_packer_opts* packer_opts, - const t_analysis_opts* analysis_opts, +bool try_pack(const t_packer_opts& packer_opts, + const t_analysis_opts& analysis_opts, const t_arch& arch, const t_det_routing_arch& routing_arch, std::vector* lb_type_rr_graphs, diff --git a/vpr/src/timing/PreClusterTimingManager.cpp b/vpr/src/timing/PreClusterTimingManager.cpp new file mode 100644 index 00000000000..ec3b6a44958 --- /dev/null +++ b/vpr/src/timing/PreClusterTimingManager.cpp @@ -0,0 +1,276 @@ +/** + * @file + * @author Alex Singer + * @date April 2025 + * @brief Implementation of the pre-cluster timing manager class. + */ + +#include "PreClusterTimingManager.h" +#include +#include +#include "PreClusterDelayCalculator.h" +#include "PreClusterTimingGraphResolver.h" +#include "SetupGrid.h" +#include "atom_lookup.h" +#include "atom_netlist.h" +#include "atom_netlist_fwd.h" +#include "concrete_timing_info.h" +#include "physical_types_util.h" +#include "prepack.h" +#include "tatum/TimingReporter.hpp" +#include "tatum/echo_writer.hpp" +#include "vpr_types.h" +#include "vtr_assert.h" + +/** + * Since the parameters of a switch may change as a function of its fanin, + * to get an estimation of inter-cluster delays we need a reasonable estimation + * of the fan-ins of switches that connect clusters together. 
These switches are + * 1) opin to wire switch + * 2) wire to wire switch + * 3) wire to ipin switch + * We can estimate the fan-in of these switches based on the Fc_in/Fc_out of + * a logic block, and the switch block Fs value + */ +static void get_intercluster_switch_fanin_estimates(const t_arch& arch, + const t_det_routing_arch& routing_arch, + const std::string& device_layout, + const int wire_segment_length, + int* opin_switch_fanin, + int* wire_switch_fanin, + int* ipin_switch_fanin); + +static float get_arch_switch_info(short switch_index, int switch_fanin, float& Tdel_switch, float& R_switch, float& Cout_switch); + +static float approximate_inter_cluster_delay(const t_arch& arch, + const t_det_routing_arch& routing_arch, + const std::string& device_layout); + +PreClusterTimingManager::PreClusterTimingManager(bool timing_driven, + const AtomNetlist& atom_netlist, + const AtomLookup& atom_lookup, + const Prepacker& prepacker, + e_timing_update_type timing_update_type, + const t_arch& arch, + const t_det_routing_arch& routing_arch, + const std::string& device_layout, + const t_analysis_opts& analysis_opts) { + + // If the flow is not timing driven, do not initialize any of the timing + // objects and set the valid flag to false. This allows this object to be + // passed through the VPR flow when timing is turned off. + if (!timing_driven) { + is_valid_ = false; + return; + } + is_valid_ = true; + + // Approximate the inter-cluster delay + // FIXME: This can probably be simplified. It can also be improved using + // AP information. 
+ float inter_cluster_net_delay = approximate_inter_cluster_delay(arch, routing_arch, device_layout); + VTR_LOG("Using inter-cluster delay: %g\n", inter_cluster_net_delay); + + // Initialize the timing analyzer + clustering_delay_calc_ = std::make_shared(atom_netlist, + atom_lookup, + inter_cluster_net_delay, + prepacker); + timing_info_ = make_setup_timing_info(clustering_delay_calc_, timing_update_type); + + // Calculate the initial timing + timing_info_->update(); + + // Create the echo file if requested. + if (isEchoFileEnabled(E_ECHO_PRE_PACKING_TIMING_GRAPH)) { + auto& timing_ctx = g_vpr_ctx.timing(); + tatum::write_echo(getEchoFileName(E_ECHO_PRE_PACKING_TIMING_GRAPH), + *timing_ctx.graph, *timing_ctx.constraints, *clustering_delay_calc_, timing_info_->analyzer()); + + tatum::NodeId debug_tnode = id_or_pin_name_to_tnode(analysis_opts.echo_dot_timing_graph_node); + write_setup_timing_graph_dot(getEchoFileName(E_ECHO_PRE_PACKING_TIMING_GRAPH) + std::string(".dot"), + *timing_info_, debug_tnode); + } + + // Write a timing report. + { + auto& timing_ctx = g_vpr_ctx.timing(); + PreClusterTimingGraphResolver resolver(atom_netlist, + atom_lookup, + *timing_ctx.graph, + *clustering_delay_calc_); + resolver.set_detail_level(analysis_opts.timing_report_detail); + + tatum::TimingReporter timing_reporter(resolver, *timing_ctx.graph, + *timing_ctx.constraints); + + timing_reporter.report_timing_setup( + "pre_pack.report_timing.setup.rpt", + *timing_info_->setup_analyzer(), + analysis_opts.timing_report_npaths); + } +} + +static float approximate_inter_cluster_delay(const t_arch& arch, + const t_det_routing_arch& routing_arch, + const std::string& device_layout) { + + /* If needed, estimate inter-cluster delay. Assume the average routing hop goes out of + * a block through an opin switch to a length-4 wire, then through a wire switch to another + * length-4 wire, then through a wire-to-ipin-switch into another block. 
*/ + constexpr int wire_segment_length = 4; + + /* We want to determine a reasonable fan-in to the opin, wire, and ipin switches, based + * on which the intercluster delays can be estimated. The fan-in of a switch influences its + * delay. + * + * The fan-in of the switch depends on the architecture (unidirectional/bidirectional), as + * well as Fc_in/out and Fs */ + int opin_switch_fanin, wire_switch_fanin, ipin_switch_fanin; + get_intercluster_switch_fanin_estimates(arch, routing_arch, device_layout, wire_segment_length, &opin_switch_fanin, + &wire_switch_fanin, &ipin_switch_fanin); + + float Tdel_opin_switch, R_opin_switch, Cout_opin_switch; + float opin_switch_del = get_arch_switch_info(arch.Segments[0].arch_opin_switch, opin_switch_fanin, + Tdel_opin_switch, R_opin_switch, Cout_opin_switch); + + float Tdel_wire_switch, R_wire_switch, Cout_wire_switch; + float wire_switch_del = get_arch_switch_info(arch.Segments[0].arch_wire_switch, wire_switch_fanin, + Tdel_wire_switch, R_wire_switch, Cout_wire_switch); + + float Tdel_wtoi_switch, R_wtoi_switch, Cout_wtoi_switch; + float wtoi_switch_del = get_arch_switch_info(routing_arch.wire_to_arch_ipin_switch, ipin_switch_fanin, + Tdel_wtoi_switch, R_wtoi_switch, Cout_wtoi_switch); + + float Rmetal = arch.Segments[0].Rmetal; + float Cmetal = arch.Segments[0].Cmetal; + + /* The delay of a wire with its driving switch is the switch delay plus the + * product of the equivalent resistance and capacitance experienced by the wire. 
*/ + + float first_wire_seg_delay = opin_switch_del + + (R_opin_switch + Rmetal * (float)wire_segment_length / 2) + * (Cout_opin_switch + Cmetal * (float)wire_segment_length); + float second_wire_seg_delay = wire_switch_del + + (R_wire_switch + Rmetal * (float)wire_segment_length / 2) + * (Cout_wire_switch + Cmetal * (float)wire_segment_length); + + /* multiply by 4 to get a more conservative estimate */ + return 4 * (first_wire_seg_delay + second_wire_seg_delay + wtoi_switch_del); +} + +static float get_arch_switch_info(short switch_index, int switch_fanin, float& Tdel_switch, float& R_switch, float& Cout_switch) { + /* Fetches delay, resistance and output capacitance of the architecture switch at switch_index. + * Returns the total delay through the switch. Used to calculate inter-cluster net delay. */ + + /* The intrinsic delay may depend on fanin to the switch. If the delay map of a + * switch from the architecture file has multiple (#inputs, delay) entries, we + * interpolate/extrapolate to get the delay at 'switch_fanin'. */ + auto& device_ctx = g_vpr_ctx.device(); + + Tdel_switch = device_ctx.arch_switch_inf[switch_index].Tdel(switch_fanin); + R_switch = device_ctx.arch_switch_inf[switch_index].R; + Cout_switch = device_ctx.arch_switch_inf[switch_index].Cout; + + /* The delay through a loaded switch is its intrinsic (unloaded) + * delay plus the product of its resistance and output capacitance. 
*/ + return Tdel_switch + R_switch * Cout_switch; +} + +static void get_intercluster_switch_fanin_estimates(const t_arch& arch, + const t_det_routing_arch& routing_arch, + const std::string& device_layout, + const int wire_segment_length, + int* opin_switch_fanin, + int* wire_switch_fanin, + int* ipin_switch_fanin) { + // W is unknown pre-packing, so *if* we need W here, we will assume a value of 100 + constexpr int W = 100; + + //Build a dummy 10x10 device to determine the 'best' block type to use + auto grid = create_device_grid(device_layout, arch.grid_layouts, 10, 10); + + auto type = find_most_common_tile_type(grid); + /* get Fc_in/out for most common block (e.g. logic blocks) */ + VTR_ASSERT(!type->fc_specs.empty()); + + //Estimate the maximum Fc_in/Fc_out + float Fc_in = 0.f; + float Fc_out = 0.f; + for (const t_fc_specification& fc_spec : type->fc_specs) { + float Fc = fc_spec.fc_value; + + if (fc_spec.fc_value_type == e_fc_value_type::ABSOLUTE) { + //Convert to estimated fractional + Fc /= W; + } + VTR_ASSERT_MSG(Fc >= 0 && Fc <= 1., "Fc should be fractional"); + + for (int ipin : fc_spec.pins) { + e_pin_type pin_type = get_pin_type_from_pin_physical_num(type, ipin); + + if (pin_type == DRIVER) { + Fc_out = std::max(Fc, Fc_out); + } else { + VTR_ASSERT(pin_type == RECEIVER); + Fc_in = std::max(Fc, Fc_in); + } + } + } + + /* Estimates of switch fan-in are done as follows: + * 1) opin to wire switch: + * 2 CLBs connect to a channel, each with #opins/4 pins. Each pin has Fc_out*W + * switches, and then we assume the switches are distributed evenly over the W wires. + * In the unidirectional case, all these switches are then crammed down to W/wire_segment_length wires. + * + * Unidirectional: 2 * #opins_per_side * Fc_out * wire_segment_length + * Bidirectional: 2 * #opins_per_side * Fc_out + * + * 2) wire to wire switch + * A wire segment in a switchblock connects to Fs other wires. 
Assuming these connections are evenly + * distributed, each target wire receives Fs connections as well. In the unidirectional case, + * source wires can only connect to W/wire_segment_length wires. + * + * Unidirectional: Fs * wire_segment_length + * Bidirectional: Fs + * + * 3) wire to ipin switch + * An input pin of a CLB simply receives Fc_in connections. + * + * Unidirectional: Fc_in + * Bidirectional: Fc_in + */ + + /* Fan-in to opin/ipin/wire switches depends on whether the architecture is unidirectional/bidirectional */ + (*opin_switch_fanin) = 2.f * type->num_drivers / 4.f * Fc_out; + (*wire_switch_fanin) = routing_arch.Fs; + (*ipin_switch_fanin) = Fc_in; + if (routing_arch.directionality == UNI_DIRECTIONAL) { + /* adjustments to opin-to-wire and wire-to-wire switch fan-ins */ + (*opin_switch_fanin) *= wire_segment_length; + (*wire_switch_fanin) *= wire_segment_length; + } else if (routing_arch.directionality == BI_DIRECTIONAL) { + /* no adjustments need to be made here */ + } else { + VPR_FATAL_ERROR(VPR_ERROR_PACK, "Unrecognized directionality: %d\n", + (int)routing_arch.directionality); + } +} + +float PreClusterTimingManager::calc_atom_setup_criticality(AtomBlockId blk_id, + const AtomNetlist& atom_netlist) const { + VTR_ASSERT_SAFE_MSG(is_valid_, + "PreClusterTimingManager has not been initialized"); + VTR_ASSERT_SAFE_MSG(blk_id.is_valid(), + "Invalid block ID"); + + float crit = 0.0f; + for (AtomPinId in_pin : atom_netlist.block_input_pins(blk_id)) { + // Max criticality over incoming nets + float pin_crit = timing_info_->setup_pin_criticality(in_pin); + crit = std::max(crit, pin_crit); + } + + return crit; +} diff --git a/vpr/src/timing/PreClusterTimingManager.h b/vpr/src/timing/PreClusterTimingManager.h new file mode 100644 index 00000000000..f76489b8ee9 --- /dev/null +++ b/vpr/src/timing/PreClusterTimingManager.h @@ -0,0 +1,108 @@ +/** + * @file + * @author Alex Singer + * @date April 2025 + * @brief Manager class for pre-cluster (primitive-level) 
timing analysis. + */ + +#pragma once + +#include +#include +#include "vpr_types.h" +#include "vtr_assert.h" + +// Forward declarations. +class AtomLookup; +class AtomNetlist; +class PreClusterDelayCalculator; +class Prepacker; +class SetupTimingInfo; + +/** + * @brief Pre-cluster timing manager class. + * + * This class encapsulates the timing computations used prior to clustering. + * This maintains all of the state necessary to perform these timing computations. + */ +class PreClusterTimingManager { + public: + /** + * @brief Constructor for the manager class. + * + * If timing_driven is set to true, this constructor will perform a setup + * timing analysis with a pre-clustered delay model. The delay model uses + * the primitive delays specified in the architecture file and a simple + * estimate of routing (a typical routing delay based on the wire delays + * found in the architecture, and more specific delays for direct connections + * like carry chains whose use we already know from the pre-packing). + * + * @param timing_driven + * Whether this class should compute timing information or not. This + * may seem counter-intuitive, but this class still needs to exist + * even if timing is turned off. This will not initialize anything + * and set the valid flag to false if we are not timing driven. + * @param atom_netlist + * The primitive netlist to perform timing analysis over. + * @param atom_lookup + * A lookup between the primitives and their timing nodes. + * @param prepacker + * The prepacker object used to prepack primitives into molecules. + * @param timing_update_type + * The type of timing update this class should perform. + * @param arch + * The architecture. + * @param routing_arch + * The routing architecture. + * @param analysis_opts + * Options for the timing analysis in VPR. 
+ */ + PreClusterTimingManager(bool timing_driven, + const AtomNetlist& atom_netlist, + const AtomLookup& atom_lookup, + const Prepacker& prepacker, + e_timing_update_type timing_update_type, + const t_arch& arch, + const t_det_routing_arch& routing_arch, + const std::string& device_layout, + const t_analysis_opts& analysis_opts); + + /** + * @brief Calculates the setup criticality of the given primitive block. + * + * Currently defined as the maximum criticality over the block inputs. + */ + float calc_atom_setup_criticality(AtomBlockId blk_id, + const AtomNetlist& atom_netlist) const; + + /** + * @brief Returns whether or not the pre-cluster timing manager was + * initialized (i.e. timing information can be computed). + */ + bool is_valid() const { + return is_valid_; + } + + /** + * @brief Get a reference to the setup timing info. + */ + const SetupTimingInfo& get_timing_info() const { + VTR_ASSERT_SAFE_MSG(is_valid_, + "Timing manager has not been initialized"); + return *timing_info_; + } + + private: + /// @brief A valid flag used to signify if the pre-cluster timing manager + /// class has been initialized or not. For example, if the flow is + /// not timing-driven, then this class will just be a shell which + /// should not have any timing information (but the object exists). + bool is_valid_; + + /// @brief The delay calculator used for computing timing. + std::shared_ptr clustering_delay_calc_; + + /// @brief The setup timing info used for getting the timing of edges + /// in the timing graph. + std::shared_ptr timing_info_; +};