diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a1a2150b8..8f86cefb29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,9 +42,6 @@ option(VTR_ENABLE_CAPNPROTO "Enable capnproto binary serialization support in VP #Allow the user to decide whether to compile the server module option(VPR_USE_SERVER "Specify whether vpr enables the server mode" ON) -#Allow the user to enable/disable VPR analytic placement -#VPR option --enable_analytic_placer is also required for Analytic Placement -option(VPR_ANALYTIC_PLACE "Enable analytic placement in VPR." ON) option(VPR_ENABLE_INTERCHANGE "Enable FPGA interchange." ON) option(VPR_ENABLE_NOC_SAT_ROUTING "Enable NoC SAT routing." OFF) diff --git a/vpr/CMakeLists.txt b/vpr/CMakeLists.txt index 67d9bcbd25..530928ac61 100644 --- a/vpr/CMakeLists.txt +++ b/vpr/CMakeLists.txt @@ -91,21 +91,6 @@ else () message(STATUS "Eigen3: Not Found. Some features may be disabled.") endif (TARGET Eigen3::Eigen) -#VPR_ANALYTIC_PLACE is initialized in the root CMakeLists -# NOTE: This is the cluster-level Analytical Placement which existed before the -# flat Analytical Placement flow. -if(${VPR_ANALYTIC_PLACE}) - message(STATUS "VPR Analytic Placement: Requested") - if (TARGET Eigen3::Eigen) - message(STATUS "VPR Analytic Placement dependency (Eigen3): Found") - message(STATUS "VPR Analytic Placement: Enabled") - target_compile_definitions(libvpr PUBLIC -DENABLE_ANALYTIC_PLACE) - else () - message(STATUS "VPR Analytic Placement dependency (Eigen3): Not Found (Download manually with sudo apt install libeigen3-dev, and rebuild)") - message(STATUS "VPR Analytic Placement: Disabled") - endif(TARGET Eigen3::Eigen) -endif() - if (${VPR_ENABLE_NOC_SAT_ROUTING}) message(STATUS "VPR NoC SAT Routing: Requested") find_package(ortools CONFIG REQUIRED) diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index 3c5d7e06d1..676f6378bd 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -673,7 +673,6 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts) PlacerOpts->effort_scaling = Options.place_effort_scaling; PlacerOpts->timing_update_type = Options.timing_update_type; - PlacerOpts->enable_analytic_placer = Options.enable_analytic_placer; PlacerOpts->place_static_move_prob = vtr::vector(Options.place_static_move_prob.value().begin(), Options.place_static_move_prob.value().end()); PlacerOpts->place_high_fanout_net = Options.place_high_fanout_net; diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index 6b73aa5238..553a964d30 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -2244,13 +2244,6 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio .default_value("0") .show_in(argparse::ShowIn::HELP_ONLY); - place_grp.add_argument(args.enable_analytic_placer, "--enable_analytic_placer") - .help( - "Enables the analytic placer. " - "Once analytic placement is done, the result is passed through the quench phase of the annealing placer for local improvement") - .default_value("false") - .show_in(argparse::ShowIn::HELP_ONLY); - place_grp.add_argument(args.place_static_move_prob, "--place_static_move_prob") .help( "The percentage probabilities of different moves in Simulated Annealing placement. " diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h index bc16219446..5e26d36725 100644 --- a/vpr/src/base/read_options.h +++ b/vpr/src/base/read_options.h @@ -139,7 +139,6 @@ struct t_options { argparse::ArgValue placement_saves_per_temperature; argparse::ArgValue place_effort_scaling; argparse::ArgValue place_delta_delay_matrix_calculation_method; - argparse::ArgValue enable_analytic_placer; argparse::ArgValue> place_static_move_prob; argparse::ArgValue place_high_fanout_net; argparse::ArgValue place_bounding_box_mode; diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index ea8ea02481..c9b17f94ce 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -1071,14 +1071,6 @@ struct t_placer_opts { std::string allowed_tiles_for_delay_model; e_place_delta_delay_algorithm place_delta_delay_matrix_calculation_method; - - /* - * @brief enables the analytic placer. - * - * Once analytic placement is done, the result is passed through the quench phase - * of the annealing placer for local improvement - */ - bool enable_analytic_placer; }; /****************************************************************** diff --git a/vpr/src/place/analytic_placer.cpp b/vpr/src/place/analytic_placer.cpp deleted file mode 100644 index e460c5bd58..0000000000 --- a/vpr/src/place/analytic_placer.cpp +++ /dev/null @@ -1,865 +0,0 @@ -#include "place_macro.h" -#ifdef ENABLE_ANALYTIC_PLACE - -#include "analytic_placer.h" - -// The eigen library contains a warning in GCC13 for a null dereference. This -// causes the CI build to fail due to the warning. Ignoring the warning for -// these include files. Using push to return to the state of GCC diagnostics. -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wnull-dereference" -#include -#include -// Pop the GCC diagnostics state back to what it was before. -#pragma GCC diagnostic pop - -#include -#include -#include - -#include "vpr_types.h" -#include "vtr_time.h" -#include "read_place.h" -#include "globals.h" -#include "vtr_log.h" -#include "cut_spreader.h" -#include "vpr_utils.h" -#include "place_util.h" - -// Templated struct for constructing and solving matrix equations in analytic placer -template -struct EquationSystem { - EquationSystem(size_t rows, size_t cols) { - A.resize(cols); - rhs.resize(rows); - } - - // A[col] is an entire column of the sparse matrix - // each entry in A[col][index] is a pair with {row_number, matrix value}. - // - // The strategy of skipping 0 row entries in each column enables easy conversion to - // Compressed Column Storage scheme supported by Eigen to reduce memory consumption - // and increase performance - std::vector>> A; - // right hand side vector, i.e. b in Ax = b - std::vector rhs; - - // System of equation is reset by: - // Clearing all entries in A's column, but size of A (number of columns) is preserved - // right hand side vector is set to default value of its templated type - void reset() { - for (auto& col : A) - col.clear(); - std::fill(rhs.begin(), rhs.end(), T()); - } - - // Add val to the matrix entry at (row, col) - // create entry if it doesn't exist - void add_coeff(int row, int col, T val) { - auto& A_col = A.at(col); - // Binary search for the row entry in column col - int begin_i = 0, end_i = int(A_col.size()) - 1; - while (begin_i <= end_i) { - int i = (begin_i + end_i) / 2; - if (A_col.at(i).first == row) { - A_col.at(i).second += val; - return; - } - if (A_col.at(i).first > row) - end_i = i - 1; - else - begin_i = i + 1; - } - A_col.insert(A_col.begin() + begin_i, std::make_pair(row, val)); - } - - // Add val to the "row"-th entry of right hand side vector - void add_rhs(int row, T val) { rhs[row] += val; } - - // Solving Ax = b, using current x as an initial guess, returns x by reference. - // (x must be of correct size, A and rhs must have their entries filled in) - // tolerance is residual error from solver: |Ax-b|/|b|, 1e-5 works well, - // can be tuned in ap_cfg in AnalyticPlacer constructor - void solve(std::vector& x, float tolerance) { - using namespace Eigen; - - VTR_ASSERT(x.size() == A.size()); - - // Converting A into SparseMatrix format from Eigen - VectorXd vec_x_guess(x.size()), vec_rhs(rhs.size()); - SparseMatrix mat(A.size(), A.size()); - - std::vector colnnz; // vector containing number of entries in each column - for (auto& A_col : A) - colnnz.push_back(int(A_col.size())); - mat.reserve(colnnz); // reserve memory for mat depending on number of entries in each row - for (int col = 0; col < int(A.size()); col++) { - auto& A_col = A.at(col); - for (auto& row_entry : A_col) - mat.insert(row_entry.first, col) = row_entry.second; - } - - // use current value of x as guess for iterative solver - for (int i_row = 0; i_row < int(x.size()); i_row++) - vec_x_guess[i_row] = x.at(i_row); - - for (int i_row = 0; i_row < int(rhs.size()); i_row++) - vec_rhs[i_row] = rhs.at(i_row); - - ConjugateGradient, Lower | Upper> solver; - solver.setTolerance(tolerance); - VectorXd x_res = solver.compute(mat).solveWithGuess(vec_rhs, vec_x_guess); - for (int i_row = 0; i_row < int(x.size()); i_row++) - x.at(i_row) = x_res[i_row]; - } -}; - -// Stop optimizing once this many iterations of solve-legalize lead to negligible wirelength improvement -constexpr int HEAP_STALLED_ITERATIONS_STOP = 15; - -/* - * AnalyticPlacer constructor - * Currently only initializing AP configuration parameters - * Placement & device info is accessed via g_vpr_ctx - */ - -AnalyticPlacer::AnalyticPlacer(BlkLocRegistry& blk_loc_registry, - const PlaceMacros& place_macros) - : blk_loc_registry_ref_(blk_loc_registry) - , place_macros_(place_macros) { - //Eigen::initParallel(); - - // TODO: PlacerHeapCfg should be externally configured & supplied - // TODO: tune these parameters for better performance - ap_cfg.alpha = 0.1; // anchoring strength, after first AP iteration the legal position of each block - // becomes anchors. In the next AP iteration, pseudo-connection between each blocks - // current location and its anchor is formed with strength (alph * iter) - // @see build_equations() - - ap_cfg.beta = 1; // utilization factor, <= 1, used to determine if a cut-spreading region is - // overutilized with the formula: bool overutilized = (num_blks / num_tiles) > beta - // for beta < 1, a region must have more tiles than logical blks to not be overutilized - - ap_cfg.solverTolerance = 1e-5; // solver parameter, refers to residual error from solver, defined as |Ax-b|/|b| - - ap_cfg.buildSolveIter = 5; // number of build-solve iteration when calculating placement, used in - // build_solve_direction() - // for each build-solve iteration, the solution from previous build-solve iteration - // is used as a guess for the iterative solver. therefore more buildSolveIter should - // should improve result at the expense of runtime - - // following two parameters are used in CutSpreader::expand_regions(). - // they determine the number of steps to expand in x or y direction before switching to expand in the other direction. - ap_cfg.spread_scale_x = 1; - ap_cfg.spread_scale_y = 1; - - // following two timing parameters are used to add timing weights in matrix equation, currently not used - // see comment in add_pin_to_pin_connection() for usage - ap_cfg.criticalityExponent = 1; - ap_cfg.timingWeight = 10; -} - -/* - * Main function of analytic placement - * Takes the random initial placement from place.cpp through g_vpr_ctx - * Repeat the following until stopping criteria is met: - * * Formulate and solve equations in x & y directions for 1 type of logical block - * * Instantiate CutSpreader to spread and strict_legalize - * - * The final legal placement is passed back to annealer in g_vpr_ctx.mutable_placement() - */ -void AnalyticPlacer::ap_place() { - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - - vtr::ScopedStartFinishTimer timer("Analytic Placement"); - - init(); // transfer placement from g_vpr_ctx to AnalyticPlacer data members - build_legal_locations(); - int hpwl = total_hpwl(); - VTR_LOG("Creating analytic placement for %d cells, random placement hpwl = %d.\n", - int(clb_nlist.blocks().size()), int(hpwl)); - - // the order in which different logical block types are placed; - // going through ap_runs once completes 1 iteration of AP - std::vector ap_runs; - std::unordered_set all_blktypes; // set of all logical block types - - // setup ap_runs, run build/solve/legalize once for every block type - // each type is placed separately, but influenced by the current location of other types - for (auto blk : place_blks) { - if (!all_blktypes.count(clb_nlist.block_type(blk))) { - ap_runs.push_back(clb_nlist.block_type(blk)); - all_blktypes.insert(clb_nlist.block_type(blk)); - } - } - - // setup and solve matrix multiple times for all logic block types before main loop - // this helps eliminating randomness from initial placement (when placing one block type, the random placement - // of the other types may have residual effect on the result, since not all blocks are solved at the same time) - for (int i = 0; i < 1; i++) { // can tune number of iterations - for (auto run : ap_runs) { - build_solve_type(run, -1); - } - } - - int iter = 0, stalled = 0; - // variables for stats - int solved_hpwl = 0, spread_hpwl = 0, legal_hpwl = 0, best_hpwl = std::numeric_limits::max(); - float iter_start, iter_t, run_start, run_t, solve_t, spread_start, spread_t, legal_start, legal_t; - - print_AP_status_header(); - - // main loop for AP - // stopping criteria: stop after HEAP_STALLED_ITERATIONS_STOP iterations of no improvement - while (stalled < HEAP_STALLED_ITERATIONS_STOP) { - // TODO: investigate better stopping criteria - iter_start = timer.elapsed_sec(); - for (auto blk_type : ap_runs) { // for each type of logic blocks - run_start = timer.elapsed_sec(); - - // lower bound placement for blk_type - // build and solve matrix equation for blocks of type "blk_type" in both x and y directions - build_solve_type(blk_type, iter); - solve_t = timer.elapsed_sec() - run_start; - solved_hpwl = total_hpwl(); - // lower bound placement complete - - // upper bound placement - // cut-spreading logic blocks of type "blk_type", this will mostly legalize lower bound placement - spread_start = timer.elapsed_sec(); - CutSpreader spreader{this, blk_type}; // Legalizer - if (blk_type->name != "io") { - /* skip cut-spreading for IO blocks; they tend to cluster on 1 edge of the FPGA due to how cut-spreader works - * in HeAP, cut-spreading is invoked only on LUT, DSP, RAM etc. - * here, greedy legalization by spreader.strict_legalize() should be sufficient for IOs - */ - spreader.cutSpread(); - update_macros(); - spread_hpwl = total_hpwl(); - spread_t = timer.elapsed_sec() - spread_start; - } else { - spread_hpwl = -1; - spread_t = 0; - } - - // greedy legalizer for fully legal placement - legal_start = timer.elapsed_sec(); - spreader.strict_legalize(); // greedy legalization snaps blocks to the closest legal location - update_macros(); - legal_t = timer.elapsed_sec() - legal_start; - legal_hpwl = total_hpwl(); - - // upper bound placement complete - - run_t = timer.elapsed_sec() - run_start; - print_run_stats(iter, timer.elapsed_sec(), run_t, blk_type->name.c_str(), solve_blks.size(), solve_t, - spread_t, legal_t, solved_hpwl, spread_hpwl, legal_hpwl); - } - - // TODO: update timing info here after timing weights are implemented in build_equations() - - if (legal_hpwl < best_hpwl) { - best_hpwl = legal_hpwl; - stalled = 0; - } else { - ++stalled; - } - - // update legal locations for all blocks for pseudo-connections in next iteration - for (auto& bl : blk_locs) { - bl.legal_loc = bl.loc; - } - iter_t = timer.elapsed_sec() - iter_start; - print_iter_stats(iter, iter_t, timer.elapsed_sec(), best_hpwl, stalled); - ++iter; - } -} - -// build matrix equations and solve for block type "run" in both x and y directions -// macro member positions are updated after solving -void AnalyticPlacer::build_solve_type(t_logical_block_type_ptr run, int iter) { - setup_solve_blks(run); - // build and solve matrix equation for both x, y - // passing -1 as iter to build_solve_direction() signals build_equation() not to add pseudo-connections - build_solve_direction(false, (iter == 0) ? -1 : iter, ap_cfg.buildSolveIter); - build_solve_direction(true, (iter == 0) ? -1 : iter, ap_cfg.buildSolveIter); - update_macros(); // update macro member locations, since only macro head is solved -} - -// build legal_pos similar to initial_placement.cpp -// Go through the placement grid and saving all legal positions for each type of sub_tile -// (stored in legal_pos). For a type of sub_tile_t found in tile_t, legal_pos[tile_t][sub_tile_t] -// gives a vector containing all positions (t_pl_loc type) for this sub_tile_t. -void AnalyticPlacer::build_legal_locations() { - // invoking same function used in initial_placement.cpp (can ignore function name) - alloc_and_load_legal_placement_locations(legal_pos); -} - -// transfer initial placement from g_vpr_ctx to AnalyticPlacer data members, such as: blk_locs, place_blks -// initialize other data members -void AnalyticPlacer::init() { - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - auto& init_block_locs = blk_loc_registry_ref_.block_locs(); - - for (auto blk_id : clb_nlist.blocks()) { - blk_locs.insert(blk_id, BlockLocation{}); - blk_locs[blk_id].loc = init_block_locs[blk_id].loc; // transfer of initial placement - row_num.insert(blk_id, DONT_SOLVE); // no blocks are moved by default, until they are setup in setup_solve_blks() - } - - // only blocks with connections are considered - auto has_connections = [&](ClusterBlockId blk_id) { - for (auto pin : clb_nlist.block_pins(blk_id)) { - int logical_pin_index = clb_nlist.pin_logical_index(pin); - if (clb_nlist.block_net(blk_id, logical_pin_index) != ClusterNetId::INVALID()) - return true; - } - return false; - }; - - for (auto blk_id : clb_nlist.blocks()) { - if (!init_block_locs[blk_id].is_fixed && has_connections(blk_id)) - // not fixed and has connections - // matrix equation is formulated based on connections, so requires at least one connection - if (place_macros_.get_imacro_from_iblk(blk_id) == NO_MACRO || place_macros_.macro_head(blk_id) == blk_id) { - // not in macro or head of macro - // for macro, only the head (base) block of the macro is a free variable, the location of other macro - // blocks can be calculated using offset of the head. They are not free variables in the equation system - place_blks.push_back(blk_id); - } - } -} - -// get hpwl of a net, taken from place.cpp get_bb_from_scratch() -// TODO: factor out this function from place.cpp and put into vpr_util -int AnalyticPlacer::get_net_hpwl(ClusterNetId net_id) { - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - int max_x = g_vpr_ctx.device().grid.width(); - int max_y = g_vpr_ctx.device().grid.height(); - - // position is not accurate for tiles spanning multiple grid locations - // need to add pin offset in that case: physical_tile_type(bnum)->pin_width_offset[pnum] - // see place.cpp get_non_updateable_bb(); - // TODO: map net_pin to tile_pin and add pin offset to x, y locations (refer to place.cpp) - ClusterBlockId bnum = clb_nlist.net_driver_block(net_id); - int x = std::max(std::min(blk_locs[bnum].loc.x, max_x - 1), 1); - int y = std::max(std::min(blk_locs[bnum].loc.y, max_y - 1), 1); - - vtr::Rect bb = {x, y, x, y}; - - for (auto pin_id : clb_nlist.net_sinks(net_id)) { - bnum = clb_nlist.pin_block(pin_id); - x = std::max(std::min(blk_locs[bnum].loc.x, max_x - 1), 1); - y = std::max(std::min(blk_locs[bnum].loc.y, max_y - 1), 1); - - bb.expand_bounding_box({x, y, x, y}); - } - - return (bb.ymax() - bb.ymin()) + (bb.xmax() - bb.xmin()); -} - -// get hpwl for all nets -int AnalyticPlacer::total_hpwl() { - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - - int hpwl = 0; - for (auto net_id : clb_nlist.nets()) { - if (!clb_nlist.net_is_ignored(net_id)) { - hpwl += get_net_hpwl(net_id); - } - } - return hpwl; -} - -/* - * Setup the blocks of type blkTypes (ex. clb, io) to be solved. These blocks are put into - * solve_blks vector. Each of them is a free variable in the matrix equation (thus excluding - * macro members, as they are formulated into the equation for the macro's head) - * A row number is assigned to each of these blocks, which corresponds to its equation in - * the matrix (the equation acquired from differentiating the objective function w.r.t its - * x or y location). - */ -void AnalyticPlacer::setup_solve_blks(t_logical_block_type_ptr blkTypes) { - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - - int row = 0; - solve_blks.clear(); - // clear row_num of all cells, so no blocks are solved - for (auto& blk : row_num) { - blk = DONT_SOLVE; - } - // update blks to be solved/placed, excluding macro members (macro head included) - for (auto blk_id : place_blks) { // find blocks of type blkTypes in place_blks - if (blkTypes == (clb_nlist.block_type(blk_id))) { - row_num[blk_id] = row++; - solve_blks.push_back(blk_id); - } - } - // update row_num of macro members - for (auto& macro : place_macros_.macros()) { - for (auto& member : macro.members) { - row_num[member.blk_index] = row_num[place_macros_.macro_head(member.blk_index)]; - } - } -} - -/* - * Update the location of all members of all macros based on location of macro_head - * since only macro_head is solved (connections to macro members are also taken into account - * when formulating the matrix equations), an update for members is necessary - */ -void AnalyticPlacer::update_macros() { - for (auto& macro : place_macros_.macros()) { - ClusterBlockId head_id = macro.members[0].blk_index; - bool mac_can_be_placed = macro_can_be_placed(macro, blk_locs[head_id].loc, false, blk_loc_registry_ref_); - - //if macro can not be placed in this head pos, change the head pos - if (!mac_can_be_placed) { - size_t macro_size = macro.members.size(); - blk_locs[head_id].loc -= macro.members[macro_size - 1].offset; - } - - //macro should be placed successfully after changing the head position - VTR_ASSERT(macro_can_be_placed(macro, blk_locs[head_id].loc, false, blk_loc_registry_ref_)); - - //update other member's location based on head pos - for (auto member = ++macro.members.begin(); member != macro.members.end(); ++member) { - blk_locs[member->blk_index].loc = blk_locs[head_id].loc + member->offset; - } - } -} - -/* - * Build and solve in one direction - * Solved solutions are written back to block_locs[blk].rawx/rawy for double float raw solution, - * rounded int solutions are written back to block_locs[blk].loc, for each blk in solve_blks - * - * yaxis chooses x or y location of each block from blk_locs to formulate the matrix equation. - * true for y-directed, false for x-directed - * - * iter is the number of AnalyticPlacement iterations (solving and legalizing all types of logic - * blocks once). When iter != -1, at least one iteration has completed. It signals build_equations() - * to create pseudo-connections between each block and its prior legal position. - * - * build_solve_iter determines number of iterations of building and solving for the iterative solver, - * the solution from the previous build-solve iteration is used as a guess for the iterative solver. - * More build_solve_iter means better result, with runtime tradeoff. This parameter can be - * tuned for better performance. - */ -void AnalyticPlacer::build_solve_direction(bool yaxis, int iter, int build_solve_iter) { - for (int i = 0; i < build_solve_iter; i++) { - EquationSystem esx(solve_blks.size(), solve_blks.size()); - build_equations(esx, yaxis, iter); - solve_equations(esx, yaxis); - } -} - -/* - * stamp 1 weight for a connection on matrix or rhs vector. - * - * Block "eqn" specifies which equation (row in matrix system) the weight is added into. - * let eqn have row_num i, var have row_num j (which is also the column in eqn that corresponds to var). - * - * if eqn is not movable, return (eqn doesn't really have an equation as it's not a free variable) - * if var is movable, weight is added in matrix [j][i] - * if var is not movable, (var_pos * weight) is added in rhs vector[j] - * if var is a macro member, weight is added in matrix [j][i], and (-offset_from_head_block * weight) is added to rhs vector[j] - * - * for detailed derivation see comment for add_pin_to_pin_connection() - */ -void AnalyticPlacer::stamp_weight_on_matrix(EquationSystem& es, - bool dir, - ClusterBlockId var, - ClusterBlockId eqn, - double weight) { - // Return the x or y position of a block - auto blk_p = [&](ClusterBlockId blk_id) { return dir ? blk_locs[blk_id].loc.y : blk_locs[blk_id].loc.x; }; - - int eqn_row = row_num[eqn]; - if (eqn_row == DONT_SOLVE) // if eqn is not of the right type or is locked down - return; - int v_pos = blk_p(var); - int var_row = row_num[var]; - if (var_row != DONT_SOLVE) { // var is movable, stamp weight on matrix - es.add_coeff(eqn_row, var_row, weight); - } else { // var is not movable, stamp weight on rhs vector - es.add_rhs(eqn_row, -v_pos * weight); - } - if (place_macros_.get_imacro_from_iblk(var) != NO_MACRO) { // var is part of a macro, stamp on rhs vector - auto& members = place_macros_[place_macros_.get_imacro_from_iblk(var)].members; - for (auto& member : members) { // go through macro members to find the right member block - if (member.blk_index == var) - es.add_rhs(eqn_row, -(dir ? member.offset.y : member.offset.x) * weight); - } - } -} - -/* - * Add weights to matrix for the pin-to-pin connection between bound_blk and this_blk (bound2bound model) - * - * The matrix A in system of equation Ax=b is a symmetric sparse matrix. - * Each row of A corresponds to an equation for a free variable. This equation is acquired by differentiating - * the objective function with respect to the free variable (movable block's x or y location) and setting it - * to 0. - * - * Pin-to-pin connection between 2 movable blocks (call them b1 and b2, with connection weight W12) is the - * simplest case. Differentiating with respect to b1 and setting to 0 produces W12 * b1 - W12 * b2 = 0, where - * b1, b2 are the location variables to calculate. When cast into matrix form, the row number of this equation - * corresponds to b1. Let's assume b1 and b2's equations are in rows i, j. Row number for each free variable also - * indicates its position in other variable's equation. In our example, assume there are 5 free variables (free - * blocks), and i=2, j=4. Then, after adding weights to b1's equation, the system will look like the following: - * | x x x x x | |x | = | x | - * | 0 W12 0 -W12 0 | |b1| = | 0 | - * | x x x x x | * |x | = | x | - * | x x x x x | |b2| = | x | - * | x x x x x | |x | = | x | - * Differentiating with respect to b2 will result in same equation except flipped signs for the weight. This creates - * symmetry in the matrix, resulting in: - * | x x x x x | |x | = | x | - * | 0 W12 0 -W12 0 | |b1| = | 0 | - * | x x x x x | * |x | = | x | - * | 0 -W12 0 W12 0 | |b2| = | 0 | - * | x x x x x | |x | = | x | - * To generalize, for movable blocks b1, b2 in row i,j, with connection weight W, the W is added to matrix position - * [i][i] and [j][j], -W added to [i][j] and [j][i]. This is why stamp_weight_on_matrix is invoked 4 times below. - * - * Special Case: immovable/fixed block. - * Assume b2 in the above example is fixed, then it does not have an equation in the system as it's not a free variable. - * The new equation is now W12 * b1 = W12 * b2, where b2 is just a constant. (This makes sense as b1=b2 is optimal, - * since it has wirelength of 0). The matrix equation now looks like the following: - * | x x x x x | |x | = | x | - * | 0 W12 0 0 0 | |b1| = |W12*b2| - * | x x x x x | * |x | = | x | - * | x x x x x | |x | = | x | - * | x x x x x | |x | = | x | - * - * Special Case: connection to macro member. - * Assume b1 is the head block of a macro, b3 is its macro member with offset d. b3 has a connection with movable block - * b2, with weight W23. b3's location is then (b1 + d). The new equation w.r.t. b1 is W23 * (b1 + d - b2) = 0. - * New equation w.r.t. b3 is symmetrical, producing matrix: - * | x x x x x | |x | = | x | - * | 0 W23 0 -W23 0 | |b1| = |-W23*d| - * | x x x x x | * |x | = | x | - * | 0 -W23 0 W23 0 | |b2| = | W23*d| - * | x x x x x | |x | = | x | - * As shown here, connection to macro members are formulated into macro's head block's equation. This is why macro members - * are not formulated in equation system. - * - * EquationSystem is passed in for adding weights, dir selects x/y direction, num_pins is used in weight calculation - * (bound2bound model). bound_pin and this_pin specifies the 2 pins in the connection (one of them is always bound_pin). - */ -void AnalyticPlacer::add_pin_to_pin_connection(EquationSystem& es, - bool dir, - int num_pins, - ClusterPinId bound_pin, - ClusterPinId this_pin) { - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - - if (this_pin == bound_pin) - // no connection if 2 pins are the same - return; - - // this_blk and bound_blk locations may not be accurate for larger tiles spanning multiple grid locations - // need block_locs[blk_id].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum] - // however, in order to do so, need place_sync_external_block_connections(blk_id) for all blocks - // TODO: map logical pin to physical pin and add this offset for more accurate pin location - ClusterBlockId this_blk = clb_nlist.pin_block(this_pin); - int this_pos = dir ? blk_locs[this_blk].loc.y : blk_locs[this_blk].loc.x; - ClusterBlockId bound_blk = clb_nlist.pin_block(bound_pin); - int bound_pos = dir ? blk_locs[bound_blk].loc.y : blk_locs[bound_blk].loc.x; - // implementing the bound-to-bound net model detailed in HeAP paper, where each bound blk has (num_pins - 1) connections - // (bound_pos - this_pos) in the denominator "linearizes" the quadratic term (bound_pos - this_pos)^2 in the objective function - // This ensures that the objective function target HPWL, rather than quadratic wirelength. - double weight = 1.0 / ((num_pins - 1) * std::max(1, std::abs(bound_pos - this_pos))); - - /* - * TODO: adding timing weights to matrix entries - *if (this_pin != 0){ - * weight *= (1.0 + tmpCfg.timingWeight * std::pow(place_crit.criticality(net_id, this_pin), tmgCfg.criticalityExponent)); - * } - */ - - stamp_weight_on_matrix(es, dir, this_blk, this_blk, weight); - stamp_weight_on_matrix(es, dir, this_blk, bound_blk, -weight); - stamp_weight_on_matrix(es, dir, bound_blk, bound_blk, weight); - stamp_weight_on_matrix(es, dir, bound_blk, this_blk, -weight); -} - -// Build the system of equations for either X or Y -void AnalyticPlacer::build_equations(EquationSystem& es, bool yaxis, int iter) { - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - - // Return the x or y position of a block - auto blk_p = [&](ClusterBlockId blk_id) { return yaxis ? blk_locs[blk_id].loc.y : blk_locs[blk_id].loc.x; }; - // Return legal position from legalization, after first iteration - auto legal_p = [&](ClusterBlockId blk_id) { return yaxis ? blk_locs[blk_id].legal_loc.y : blk_locs[blk_id].legal_loc.x; }; - es.reset(); - - /* - * Bound2bound model is used in HeAP: - * For each net, the left-most and right-most (or down, up in y direction) are bound blocks - * These 2 blocks form connections with each other and all the other blocks (internal blocks) - * These connections are used to formulate the matrix equation - */ - for (auto net_id : clb_nlist.nets()) { - if (clb_nlist.net_is_ignored(net_id) - || clb_nlist.net_driver(net_id) == ClusterPinId::INVALID() - || clb_nlist.net_sinks(net_id).empty()) { - // ensure net is not ignored (ex. clk nets), has valid driver, has at least 1 sink - continue; - } - - // find the 2 bound pins (min and max pin) - ClusterPinId min_pin = ClusterPinId::INVALID(), max_pin = ClusterPinId::INVALID(); - int min_pos = std::numeric_limits::max(), max_pos = std::numeric_limits::min(); - for (auto pin_id : clb_nlist.net_pins(net_id)) { - int pos = blk_p(clb_nlist.pin_block(pin_id)); - if (pos < min_pos) { - min_pos = pos; - min_pin = pin_id; - } - if (pos > max_pos) { - max_pos = pos; - max_pin = pin_id; - } - } - VTR_ASSERT(min_pin != ClusterPinId::INVALID()); - VTR_ASSERT(max_pin != ClusterPinId::INVALID()); - - int num_pins = clb_nlist.net_pins(net_id).size(); - for (int ipin = 0; ipin < num_pins; ipin++) { - ClusterPinId pin_id = clb_nlist.net_pin(net_id, ipin); - // for each pin in net, connect to 2 bound pins (bound2bound model) - add_pin_to_pin_connection(es, yaxis, num_pins, min_pin, pin_id); - if (pin_id != min_pin) - // avoid adding min_pin to max_pin connection twice - add_pin_to_pin_connection(es, yaxis, num_pins, max_pin, pin_id); - } - } - - // Add pseudo-connections to anchor points (legalized position for each block) after first iteration - // These pseudo-connections pull blocks towards their legal locations, which tends to reduce overlaps in the placement, - // also so that the next iteration of build-solving matrix doesn't destroy the placement from last iteration. - // As weight increases with number of iterations, solver's solution converges with the legal placement. - if (iter != -1) { // if not the first AP iteration - for (size_t row = 0; row < solve_blks.size(); row++) { - int l_pos = legal_p(solve_blks.at(row)); // legalized position from last iteration (anchors) - int solver_blk_pos = blk_p(solve_blks.at(row)); // matrix solved block position from last iteration - - // weight increases with iteration --> psudo-connection strength increases to force convergence to legal placement - // weight is also higher for blocks that haven't moved much from their solver location to their legal location - double weight = ap_cfg.alpha * iter / std::max(1, std::abs(l_pos - solver_blk_pos)); - - // Adding coefficient to Matrix[row][row] and adding weight to rhs vector is equivalent to adding connection - // to an immovable block at legal position. - // The equation becomes Weight * (blk_pos - legal_pos) = 0, where blk_pos is the variable to solve in rhs[row], - // legal_pos is a constant - // see comment for add_pin_to_pin_connection() -> special_case: immovable/fixed block - es.add_coeff(row, row, weight); - es.add_rhs(row, weight * l_pos); - } - } -} - -/* - * Solve the system of equations - * A formulated system of equation es is passed in - * yaxis represents if it's x-directed or y-directed location problem - * Solved solution is moved to loc, rawx, rawy in blk_locs for each block - */ -void AnalyticPlacer::solve_equations(EquationSystem& es, bool yaxis) { - int max_x = g_vpr_ctx.device().grid.width(); - int max_y = g_vpr_ctx.device().grid.height(); - - auto blk_pos = [&](ClusterBlockId blk_id) { return yaxis ? blk_locs[blk_id].rawy : blk_locs[blk_id].rawx; }; - std::vector solve_blks_pos; // each row of solve_blks_pos is a free variable (movable block of the right type to be placed) - // put current location of solve_blks into solve_blks_pos as guess for iterative solver - std::transform(solve_blks.begin(), solve_blks.end(), std::back_inserter(solve_blks_pos), blk_pos); - es.solve(solve_blks_pos, ap_cfg.solverTolerance); - - // move solved locations of solve_blks from solve_blks_pos into blk_locs - // ensure that new location is strictly within [0, grid.width/height - 1]; - for (size_t i_row = 0; i_row < solve_blks_pos.size(); i_row++) - if (yaxis) { - blk_locs[solve_blks.at(i_row)].rawy = std::max(0.0, solve_blks_pos.at(i_row)); - blk_locs[solve_blks.at(i_row)].loc.y = std::min(max_y - 1, std::max(0, int(solve_blks_pos.at(i_row) + 0.5))); - } else { - blk_locs[solve_blks.at(i_row)].rawx = std::max(0.0, solve_blks_pos.at(i_row)); - blk_locs[solve_blks.at(i_row)].loc.x = std::min(max_x - 1, std::max(0, int(solve_blks_pos.at(i_row) + 0.5))); - } -} - -// Debug use, finds # of blocks on each tile location -void AnalyticPlacer::find_overlap(vtr::Matrix& overlap) { - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - size_t max_x = g_vpr_ctx.device().grid.width(); - size_t max_y = g_vpr_ctx.device().grid.height(); - - overlap.resize({max_y, max_x}, 0); - - for (auto blk : clb_nlist.blocks()) { - overlap[blk_locs[blk].loc.y][blk_locs[blk].loc.x] += 1; - } -} - -// prints a simple figure of FPGA fabric, with numbers on each tile showing usage -// called in AnalyticPlacer::print_place() -std::string AnalyticPlacer::print_overlap(vtr::Matrix& overlap, FILE* fp) { - int max_x = g_vpr_ctx.device().grid.width(); - int max_y = g_vpr_ctx.device().grid.height(); - - std::string out = ""; - fprintf(fp, "%5s", ""); - for (int i = 0; i < max_x; i++) { - fprintf(fp, "%-5d", i); - } - fprintf(fp, "\n%4s", ""); - fprintf(fp, "%s\n", std::string(5 * max_x + 2, '-').c_str()); - for (int i = 0; i < max_y; i++) { - fprintf(fp, "%-4d|", i); - for (int j = 0; j < max_x; j++) { - int count = overlap[i][j]; - fprintf(fp, "%-5s", ((count == 0) ? "0" : std::to_string(count)).c_str()); - } - fprintf(fp, "|\n"); - } - fprintf(fp, "%4s", ""); - fprintf(fp, "%s\n", std::string(5 * max_x + 2, '-').c_str()); - return out; -} - -/* - * Prints the location of each block, and a simple drawing of FPGA fabric, showing num of blocks on each tile - * Very useful for debugging - * Usage: - * std::string filename = vtr::string_fmt("%s.post_AP.place", clb_nlist.netlist_name().substr(0, clb_nlist.netlist_name().size()-4).c_str()); - * print_place(filename.c_str()); - */ -void AnalyticPlacer::print_place(const char* place_file) { - const DeviceContext& device_ctx = g_vpr_ctx.device(); - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - auto& block_locs = blk_loc_registry_ref_.block_locs(); - - FILE* fp; - - fp = fopen(place_file, "w"); - - fprintf(fp, "Netlist_File: %s Netlist_ID: %s\n", - clb_nlist.netlist_name().c_str(), - clb_nlist.netlist_id().c_str()); - fprintf(fp, "Array size: %zu x %zu logic blocks\n\n", device_ctx.grid.width(), device_ctx.grid.height()); - fprintf(fp, "%-25s %-18s %-12s %-25s %-5s %-5s %-10s %-14s %-8s\n", - "block name", - "logic block type", - "pb_type", - "pb_name", - "x", - "y", - "subblk", - "block number", - "is_fixed"); - fprintf(fp, "%-25s %-18s %-12s %-25s %-5s %-5s %-10s %-14s %-8s\n", - "----------", - "----------------", - "-------", - "-------", - "--", - "--", - "------", - "------------", - "--------"); - - if (!block_locs.empty()) { //Only if placement exists - for (auto blk_id : clb_nlist.blocks()) { - fprintf(fp, "%-25s %-18s %-12s %-25s %-5d %-5d %-10d #%-13zu %-8s\n", - clb_nlist.block_name(blk_id).c_str(), - clb_nlist.block_type(blk_id)->name.c_str(), - clb_nlist.block_type(blk_id)->pb_type->name, - clb_nlist.block_pb(blk_id)->name, - blk_locs[blk_id].loc.x, - blk_locs[blk_id].loc.y, - blk_locs[blk_id].loc.sub_tile, - size_t(blk_id), - (block_locs[blk_id].is_fixed ? "true" : "false")); - } - fprintf(fp, "\ntotal_HPWL: %d\n", total_hpwl()); - vtr::Matrix overlap; - find_overlap(overlap); - fprintf(fp, "Occupancy diagram: \n"); - print_overlap(overlap, fp); - } - fclose(fp); -} - -void AnalyticPlacer::print_AP_status_header() { - VTR_LOG("\n"); - VTR_LOG("---- ------ ------ -------- ------- | ------ --------- ------ ------ ------ ------ -------- -------- --------\n"); - VTR_LOG("Iter Time Iter Best Stall | Run BlockType Solve Solve Spread Legal Solved Spread Legal\n"); - VTR_LOG(" Time hpwl | Time Block Time Time Time hpwl hpwl hpwl\n"); - VTR_LOG(" (sec) (sec) | (sec) Num (sec) (sec) (sec) \n"); - VTR_LOG("---- ------ ------ -------- ------- | ------ --------- ------ ------ ------ ------ -------- -------- --------\n"); -} - -void AnalyticPlacer::print_run_stats(const int iter, - const float time, - const float runTime, - const char* blockType, - const int blockNum, - const float solveTime, - const float spreadTime, - const float legalTime, - const int solvedHPWL, - const int spreadHPWL, - const int legalHPWL) { - VTR_LOG( - "%4zu " - "%6.3f " - " | " - "%6.3f " - "%9s " - "%6d " - "%6.3f " - "%6.3f " - "%6.3f " - "%8d " - "%8d " - "%8d \n", - iter, - time, - runTime, - blockType, - blockNum, - solveTime, - spreadTime, - legalTime, - solvedHPWL, - spreadHPWL, - legalHPWL); -} - -void AnalyticPlacer::print_iter_stats(const int iter, - const float iterTime, - const float time, - const int bestHPWL, - const int stall) { - VTR_LOG( - "%4zu " - "%6.3f " - "%6.3f " - "%8d " - "%7d |\n", - iter, - time, - iterTime, - bestHPWL, - stall); - VTR_LOG(" |\n"); -} - -// sentinel for blks not solved in current iteration -int DONT_SOLVE = std::numeric_limits::max(); - -// sentinel for blks not part of a placement macro -int NO_MACRO = -1; - -#endif /* ENABLE_ANALYTIC_PLACE */ diff --git a/vpr/src/place/analytic_placer.h b/vpr/src/place/analytic_placer.h deleted file mode 100644 index 86e3148185..0000000000 --- a/vpr/src/place/analytic_placer.h +++ /dev/null @@ -1,326 +0,0 @@ -#ifndef VPR_ANALYTIC_PLACEMENT_H -#define VPR_ANALYTIC_PLACEMENT_H - -#ifdef ENABLE_ANALYTIC_PLACE -/** - * @file - * @brief This file implements the analytic placer, described as lower-bound placement in SimPL. It formulates - * the placement problem into a set of linear equations, in the form of a matrix equation. Solving the matrix - * equation gives the minimum of the objective function, in this case wirelength. The result placement, although - * most optimal in terms of optimization, thus the name lower-bound placement, almost always is not legal. This - * lower-bound solution is then legalized using Cut-Spreading (@see cut_spreader.h). - * - ************************************************************************************************************** - * Algorithm Overview * - ************************************************************************************************************** - * - * The most common objective function for placement is the sum of half-perimeter wirelengths (HPWL) over all nets. - * Efficient AP techniques approximate this objective function with a function that can be minimized efficiently. - * - * First, all multi-pin nets are converted into a set of 2-pin connections. In SimPL/HeAP, the Bound2bound net - * model is used. For each multi-pin net, the blocks with the minimum and maximum locations (in either x or y directon - * as build-solve operates on only 1 direction at a time) on a net (so-called bound-blocks) are connected to each - * other and to each internal block on the net. In other words, for a p-terminal net, each internal block has 2 connections, - * one to each bound block, and each bound block has p-1 connections, one to every block other than itself. - * - * Then, the weighted sum of the squared lengths of these 2-pin connections are minimized. This objective function - * can be separated into x and y components and cast in matrix form. To minimize this degree-2 polynomial, partial - * derivative is taken with respect to each variable. Setting the resulting system of linear equations to 0 gives - * the following equation (only x direction shown): - * Qx = -c - * where Q is a matrix capturing all connection between movable objects (objects to solve), x is a vector of all - * movable block locations (free variables), and c is a vector representing connections between movable and fixed objects. - * *** for detailed derivation and an example, refer to comments for add_pin_to_pin_connection() in analytic_placer.cpp. - * - * After formulating Q and c, a standard off-the-shelf solver (Eigen package) is used to solve for x. This completes - * the lower-bound placement. - * - * However, since the objective function does not take placement constraints into consideration, the generated - * solution is not legal. It generally has many blocks overlapping with one another, and the blocks may be on - * incompatible physical tiles. To legalize this solution, a geometric partitioning and spreading technique, introduced - * in SimPL, is used (@see cut_sreader.h). This completes the upper-bound placement. - * - * After the completion of 1 iteration of lower-bound & upper-bound placement, artificial pseudo connections are created - * between each block and its target location in the legalized overlap-free placement. When the mathematical system is - * again formulated and solved, the pseudo connections pull blocks towards their target locations, which tends to reduce - * overlaps in the placement. The strength of pseudo-connections increase with iterations, making lower-bound and - * upper-bound solutions converge. - * - * This process of formulating the system, solving, and legalizing is repeated until sufficiently good placement is - * acquired. Currently the stopping criterion is HEAP_STALLED_ITERATIONS_STOP iterations without improvement in total_hpwl. - * - * - * Parameters to tweak & things to try out - * ======================================= - * Currently the QoR of AP+quench combination is slightly worse than SA. See PR #1504 for comparison. - * The following parameters/things can be tweaked to find the best configuration: - * - * * Stopping criteria when to stop AP iterations, see (AnalyticPlacer::ap_place()) - * * PlacerHeapCfg.alpha anchoring strength of pseudo-connection - * * PlacerHeapCfg.beta overutilization factor (@see CutSpreader::SpreaderRegion.overused()) - * * PlacerHeapCfg.timingWeight implement timing in AP (@see AnalyticPlacer::build_equations()) - * * PlacerHeapCfg.criticality same as above - * * Interaction with SA: - * * init_t Initial temperature of annealer after AP (currently init_t = 0) - * * quench inner_num how much swapping in quenching to attemp - * * quench_recompute_limit frequency of criticality update in quenching to improve quench results - * - * @cite SimPL - * Original analytic placer with cut-spreading legalizing was intended for ASIC design, proposed in SimPL. - * SimPL: An Effective Placement Algorithm, Myung-Chul Kim, Dong-Jin Lee and Igor L. Markov - * http://www.ece.umich.edu/cse/awards/pdfs/iccad10-simpl.pdf - * - * @cite HeAP - * FPGA adaptation of SimPL, targeting FPGAs with heterogeneous blocks located at discrete locations. - * Analytical Placement for Heterogeneous FPGAs, Marcel Gort and Jason H. Anderson - * https://janders.eecg.utoronto.ca/pdfs/marcelfpl12.pdf - * - * @cite nextpnr - * An implementation of HeAP, which the cut-spreader and legalizer here is based off of. Implementation details - * have been modified for the architecture and netlist specification of VTR, and better performance. - * nextpnr -- Next Generation Place and Route, placer_heap, David Shah - * https://github.com/YosysHQ/nextpnr - */ - -#include "vpr_context.h" -#include "PlacementDelayCalculator.h" - -class PlaceMacros; - -/* - * @brief Templated struct for constructing and solving matrix equations in analytic placer - * Eigen library is used in EquationSystem::solve() - */ -template -struct EquationSystem; - -// sentinel for blks not solved in current iteration -extern int DONT_SOLVE; - -// sentinel for blks not part of a placement macro -extern int NO_MACRO; - -class AnalyticPlacer { - public: - /* - * @brief Constructor of AnalyticPlacer, currently initializes AnalyticPlacerCfg for the analytic placer - * To tune these parameters, change directly in constructor - */ - AnalyticPlacer() = delete; - explicit AnalyticPlacer(BlkLocRegistry& blk_loc_registry, const PlaceMacros& place_macros); - - /* - * @brief main function of analytic placement - * Takes the random initial placement from place.cpp through g_vpr_ctx - * Repeat the following until stopping criteria is met: - * * Formulate and solve equations in x, y directions for 1 type of logial block - * * Instantiate CutSpreader to spread and strict_legalize() to strictly legalize - * - * The final legal placement is passed back to annealer in g_vpr_ctx.mutable_placement() - */ - void ap_place(); - - private: - // for CutSpreader to access placement info from solver (legal_pos, block_locs, etc). - friend class CutSpreader; - - // AP parameters that can influence it's behavior - struct AnalyticPlacerCfg { - float alpha; // anchoring strength of pseudo-connections - float beta; // over-utilization factor - int criticalityExponent; // not currently used, @see build_equations() - int timingWeight; // not currently used, @see build_equations() - float solverTolerance; // parameter of the solver - int buildSolveIter; // build_solve iterations for iterative solver - int spread_scale_x, spread_scale_y; // see CutSpreader::expand_regions() - }; - - AnalyticPlacerCfg ap_cfg; // TODO: PlacerHeapCfg should be externally configured & supplied - - // Lokup of all sub_tiles by sub_tile type - // legal_pos[0..device_ctx.num_block_types-1][0..num_sub_tiles - 1][0..num_legal - 1] = t_pl_loc for a single - // placement location of the proper tile type and sub_tile type. - std::vector>> legal_pos; - - // row number in the system of linear equations for each block - // which corresponds to the equation produced by differentiating objective function w.r.t that block location - vtr::vector_map row_num; - - // Encapsulates 3 types of locations for each logic block - struct BlockLocation { - t_pl_loc loc; // real, up-to-date location of the logic block in the AP process - // first initiated with initial random placement from g_vpr_ctx - // then, eath time after solving equations, it's updated with rounded - // raw solutions from solver - // finally, it is accessed and modified by legalizer to store legal placement - // at the end of each AP iteration - - t_pl_loc legal_loc; // legalized location, used to create psudo connections in the next AP iteration - // updated in AP main loop in ap_place() at the end of each iteration - - double rawx, rawy; // raw location storing float result from matrix solver - // used by cut_speader to spread out logic blocks using linear interpolation - }; - - // Lookup from blockID to block location - vtr::vector_map blk_locs; - - // reference to the placement location variables - BlkLocRegistry& blk_loc_registry_ref_; - - // Reference to the placement macros. - const PlaceMacros& place_macros_; - - /* - * The set of blks of different types to be placed by AnalyticPlacement process, - * i.e. the free variable blocks. - * Excludes non-head macro blocks (blocks part of placement macros but not the head), fixed blocks, and blocks - * with no connections. - */ - std::vector place_blks; - - // blocks of the same type to be solved in the current formulation of matrix equation - // which are a subset of place_blks - std::vector solve_blks; - - /* - * Prints the location of each block, and a simple drawing of FPGA fabric, showing num of blocks on each tile - * Very useful for debugging - * See implementation for usage - */ - void print_place(const char* place_file); - - //build fast lookup of compatible tiles/subtiles by tile, x, y, subtiles - void build_fast_tiles(); - - // build legal_pos - void build_legal_locations(); - - // build blk_locs based on initial placement from place_ctx. - // put blocks that needs to be placed in place_blks; - void init(); - - // get hpwl for a net - int get_net_hpwl(ClusterNetId net_id); - - // get hpwl for all nets - int total_hpwl(); - - // build matrix equations and solve for block type "run" in both x and y directions - // macro member positions are updated after solving - // iter is used to determine pseudo-connection strength - void build_solve_type(t_logical_block_type_ptr run, int iter); - - /* - * Setup the blocks of type blkTypes (ex. clb, io) to be solved. These blocks are put into - * solve_blks vector. Each of them is a free variable in the matrix equation (thus excluding - * macro members, as they are formulated into the equation for the macro's head) - * A row number is assigned to each of these blocks, which corresponds to its equation in - * the matrix (the equation acquired from differentiating the objective function w.r.t its - * x or y location). - */ - void setup_solve_blks(t_logical_block_type_ptr blkTypes); - - /* - * Update the location of all members of all macros based on location of macro_head - * since only macro_head is solved (connections to macro members are also taken into account - * when formulating the matrix equations), a location update for members is necessary - */ - void update_macros(); - - /* - * Build and solve in one direction - * yaxis chooses x or y location of each block from blk_locs to formulate the matrix equation - * Solved solutions are written back to block_locs[blk].rawx/rawy for double float raw solution, - * rounded int solutions are written back to block_locs[blk].loc, for each blk in solve_blks - * - * iter is the number of AnalyticPlacement iterations (solving and legalizing all types of logic - * blocks once). When iter != -1, at least one iteration has completed. It signals build_equations() - * to create pseudo-connections between each block and its prior legal position. - * - * build_solve_iter determines number of iterations of building and solving for the iterative solver - * (i.e. more build_solve_iter means better result, with runtime tradeoff. This parameter can be - * tuned for better performance) - * the solution from the previous build-solve iteration is used as a guess for the iterative solver - */ - void build_solve_direction(bool yaxis, int iter, int build_solve_iter); - - /* - * Stamp 1 weight for 1 connection on matrix or rhs vector - * if var is movable objects, weight is added on matrix - * if var is immovable objects, weight*-var_pos is added on rhs - * if var is a macro member (not macro head), weight*-offset_from_macro_head is added on rhs - * - * for detailed derivation and examples, see comments for add_pin_to_pin_connection() in analytic_placer.cpp - */ - void stamp_weight_on_matrix(EquationSystem& es, - bool dir, - ClusterBlockId var, - ClusterBlockId eqn, - double weight); - - /* - * Add weights for connection between bound_pin and this_pin into matrix - * Calculate weight for connection and stamp them into appropriate position in matrix by invoking - * stamp_weight_on_matrix() multiple times. For more detail, see comments in implementation. - */ - void add_pin_to_pin_connection(EquationSystem& es, - bool dir, - int num_pins, - ClusterPinId bound_pin, - ClusterPinId this_pin); - - /* - * Build the system of equations for either X or Y - * When iter != -1, for each block, psudo-conenction to its prior legal location is formed, - * the strength is determined by ap_cfg.alpha and iter - */ - void build_equations(EquationSystem& es, bool yaxis, int iter = -1); - - /* - * Solve the system of equations passed in by es, for the set of blocks in data member solve_blks - * yaxis is used to select current x or y location of these blocks from blk_locs - * this current location is provided to iterative solver as a guess - * the solved location is written back to blk_locs, and is used as guess for the next - * iteration of solving (@see build_solve_direct()) - */ - void solve_equations(EquationSystem& es, bool yaxis); - - /* - * Debug use - * finds # of blocks on each tile location, returned in overlap matrix - */ - void find_overlap(vtr::Matrix& overlap); - - /* - * Debug use - * prints a simple figure of FPGA fabric, with numbers on each tile showing usage. - * called in AnalyticPlacer::print_place() - */ - std::string print_overlap(vtr::Matrix& overlap, FILE* fp); - - // header of VTR_LOG for AP - void print_AP_status_header(); - - void print_run_stats(const int iter, - const float time, - const float runTime, - const char* blockType, - const int blockNum, - const float solveTime, - const float spreadTime, - const float legalTime, - const int solvedHPWL, - const int spreadHPWL, - const int legalHPWL); - - void print_iter_stats(const int iter, - const float iterTime, - const float time, - const int bestHPWL, - const int stall); -}; - -#endif /* ENABLE_ANALYTIC_PLACE */ - -#endif /* VPR_ANALYTIC_PLACEMENT_H */ diff --git a/vpr/src/place/cut_spreader.cpp b/vpr/src/place/cut_spreader.cpp deleted file mode 100644 index 9dfe17f83c..0000000000 --- a/vpr/src/place/cut_spreader.cpp +++ /dev/null @@ -1,1174 +0,0 @@ -#include "place_macro.h" -#ifdef ENABLE_ANALYTIC_PLACE - -#include "cut_spreader.h" -#include -#include -#include -#include - -#include "analytic_placer.h" -#include "vpr_types.h" -#include "vtr_time.h" -#include "globals.h" -#include "vtr_log.h" -#include "place_util.h" -#include "grid_block.h" - -// sentinel for base case in CutSpreader (i.e. only 1 block left in region) -constexpr std::pair BASE_CASE = {-2, -2}; - -// sentinel for cut-spreading fail, the other direction is run next -constexpr std::pair CUT_FAIL = {-1, -1}; - -// sentinel for a grid location that is not covered by any regions, for reg_id_at_grid data member -constexpr int AP_NO_REGION = -1; - -/* - * Constructor of CutSpreader - * @param analytic_placer: used to access AnalyticPlacer data members (lower-bound solutions) - * @param blk_t: logical block type to legalize - */ -CutSpreader::CutSpreader(AnalyticPlacer* analytic_placer, t_logical_block_type_ptr blk_t) - : ap(analytic_placer) - , blk_type(blk_t) { - // builds n_subtiles_at_location data member, which is a quick lookup of number of compatible subtiles at x, y. - size_t max_x = g_vpr_ctx.device().grid.width(); - size_t max_y = g_vpr_ctx.device().grid.height(); - subtiles_at_location.resize({max_x, max_y}); - for (auto& tile : blk_type->equivalent_tiles) { - for (auto sub_tile : tile->sub_tiles) { - // find all sub_tile types compatible with blk_t - auto result = std::find(sub_tile.equivalent_sites.begin(), sub_tile.equivalent_sites.end(), blk_type); - if (result != sub_tile.equivalent_sites.end()) { - for (auto loc : ap->legal_pos.at(tile->index).at(sub_tile.index)) { - subtiles_at_location[loc.x][loc.y].push_back(loc); - } - } - } - } -} - -/* - * @brief: Executes the cut-spreader algorithm described in algorithm overview in header file. - * Does not include strict_legalize so placement result is not guaranteed to be legal. - * Strict_legalize must be run after for legal placement result, and for legal placement to - * be passed to annealer through vpr_ctx. - * - * Input placement is passed by data members (blk_locs) in analytic_placer - * - * @return result placement is passed to strict legalizer by modifying blk_locs in analytic_placer - */ -void CutSpreader::cutSpread() { - init(); // initialize data members based on solved solutions from AnalyticPlacer - find_overused_regions(); //find all overused regions bordered by non-overused regions - expand_regions(); // expand overused regions until they have enough sub_tiles to accommodate their logic blks - - /* - * workqueue is a FIFO queue used to recursively cut-spread. - * - * In the region vector, the regions not in merged_regions (not absorbed in expansion process) - * are the initial regions placed in workqueue to cut-spread. - * - * After each of these initial regions are cut and spread, their child sub-regions - * (left and right) are placed at the back of workqueue, with alternated cut direction. - * This process continues until base case of region with only 1 block is reached, - * indicated by BASE_CASE return value. - * - * Return value of CUT_FAIL indicates that cutting is unsuccessful. This usually happens - * when regions are quite small: for example, region only has 1 column so a vertical cut - * is impossible. In this case cut in the other direction is attempted. - */ - std::queue> workqueue; - - // put initial regions into workqueue - for (auto& r : regions) { - if (!merged_regions.count(r.id)) - workqueue.emplace(r.id, false); - } - - while (!workqueue.empty()) { - auto front = workqueue.front(); - workqueue.pop(); - auto& r = regions.at(front.first); - - auto res = cut_region(r, front.second); - if (res == BASE_CASE) // only 1 block left, base case - continue; - if (res != CUT_FAIL) { // cut-spread successful - // place children regions in workqueue - workqueue.emplace(res.first, !front.second); - workqueue.emplace(res.second, !front.second); - } else { // cut-spread unsuccessful - auto res2 = cut_region(r, !front.second); // try other direction - if (res2 != CUT_FAIL) { - // place children regions in workqueue - workqueue.emplace(res2.first, front.second); - workqueue.emplace(res2.second, front.second); - } - } - } -} - -// setup CutSpreader data structures using information from AnalyticPlacer -void CutSpreader::init() { - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - const auto& place_macros = ap->place_macros_; - - size_t max_x = g_vpr_ctx.device().grid.width(); - size_t max_y = g_vpr_ctx.device().grid.height(); - - occupancy.resize({max_x, max_y}, 0); - macro_extent.resize({max_x, max_y}); - reg_id_at_grid.resize({max_x, max_y}, AP_NO_REGION); - blk_extents.resize(ap->blk_locs.size(), vtr::Rect{-1, -1, -1, -1}); - blks_at_location.resize({max_x, max_y}, std::vector{}); - - // Initialize occupancy matrix, reg_id_at_grid and macros matrix - for (int x = 0; x < (int)max_x; x++) { - for (int y = 0; y < (int)max_y; y++) { - occupancy[x][y] = 0; - reg_id_at_grid[x][y] = AP_NO_REGION; - macro_extent[x][y] = {x, y, x, y}; - } - } - - // lambda function to absorb x, y in blk's macro's extent - auto set_macro_ext = [&](ClusterBlockId blk, int x, int y) { - if (blk_extents[blk] == vtr::Rect{-1, -1, -1, -1}) { - blk_extents.update(blk, {x, y, x, y}); - } else { - blk_extents[blk].expand_bounding_box({x, y, x, y}); - } - }; - - for (size_t i = 0; i < ap->blk_locs.size(); i++) { // loop through ap->blk_locs - auto blk = ClusterBlockId{(int)i}; - if (clb_nlist.block_type(blk) == blk_type) { - auto loc = ap->blk_locs[blk].loc; - occupancy[loc.x][loc.y]++; - // compute extent of macro member - if (place_macros.get_imacro_from_iblk(blk) != NO_MACRO) { // if blk is a macro member - // only update macro heads' extent in blk_extents - set_macro_ext(place_macros.macro_head(blk), loc.x, loc.y); - } - } - } - - for (size_t i = 0; i < ap->blk_locs.size(); i++) { // loop through ap->blk_locs - ClusterBlockId blk = ClusterBlockId{(int)i}; - if (clb_nlist.block_type(blk) == blk_type) { - // Transfer macro extents to the actual macros structure; - if (place_macros.get_imacro_from_iblk(blk) != NO_MACRO) { // if blk is a macro member - // update macro_extent for all macro members in macros - // for single blocks (not in macro), macros[x][y] = {x, y, x, y} - vtr::Rect& me = blk_extents[place_macros.macro_head(blk)]; - auto loc = ap->blk_locs[blk].loc; - auto& lme = macro_extent[loc.x][loc.y]; - lme.expand_bounding_box(me); - } - } - } - - // get solved_solution from AnalyticPlacer - for (auto blk : ap->solve_blks) { - if (clb_nlist.block_type(blk) == blk_type) - blks_at_location[ap->blk_locs[blk].loc.x][ap->blk_locs[blk].loc.y].push_back(blk); - } -} - -int CutSpreader::occ_at(int x, int y) { - //TODO: layer_num should be passed - if (!is_loc_on_chip({x, y, 0})) { - return 0; - } - return occupancy[x][y]; -} - -int CutSpreader::tiles_at(int x, int y) { - //TODO: layer_num should be passed - if (!is_loc_on_chip({x, y, 0})) { - return 0; - } - return int(subtiles_at_location[x][y].size()); -} - -/* - * When expanding a region, it might overlap with another region, one of them (merger) will absorb - * the other (mergee) by merging. @see expand_regions() below; - * - * Merge mergee into merged by: - * * change group id at mergee grids to merged id - * * adds all n_blks and n_tiles from mergee to merged region - * * grow merged to include all mergee grids - */ -void CutSpreader::merge_regions(SpreaderRegion& merged, SpreaderRegion& mergee) { - for (int x = mergee.bb.xmin(); x <= mergee.bb.xmax(); x++) - for (int y = mergee.bb.ymin(); y <= mergee.bb.ymax(); y++) { - //TODO: layer_num should be passed - if (!is_loc_on_chip({x, y, 0})) { //location is not within the chip - continue; - } - //x and y might belong to "merged" region already, no further action is required - if (merged.id == reg_id_at_grid[x][y]) { - continue; - } - reg_id_at_grid[x][y] = merged.id; //change group id at mergee grids to merged id - //adds all n_blks and n_tiles from mergee to merged region - merged.n_blks += occ_at(x, y); - merged.n_tiles += tiles_at(x, y); - } - merged_regions.insert(mergee.id); // all merged_regions are ignored in main loop - grow_region(merged, mergee.bb); // grow merged to include all mergee grids -} - -/* - * grow r to include a rectangular region rect_to_include - * - * when init == true, grow_region() initializes SpreaderRegion r - * in this case, both r and rect_to_include contains the same 1 tile location: the initial overused tile - * see find_overused_regions where SpreaderRegion r is created. - * this tile location is processed although it's technically included. - */ -void CutSpreader::grow_region(SpreaderRegion& r, vtr::Rect rect_to_include, bool init) { - // when given location is within SpreaderRegion - if ((r.bb.contains(rect_to_include)) && !init) - return; - - vtr::Rect r_old = r.bb; - r_old.set_xmin(r.bb.xmin() + (init ? 1 : 0)); // ensure the initial location is processed in the for-loop later, when init == 1 - r.bb.expand_bounding_box(rect_to_include); - - auto process_location = [&](int x, int y) { - //x and y should represent a location on the chip, otherwise no processing is required - //TODO: layer_num should be passed - if (!is_loc_on_chip({x, y, 0})) { - return; - } - // kicks in only when grid is not claimed, claimed by another region, or part of a macro - // Merge with any overlapping regions - if (reg_id_at_grid[x][y] == AP_NO_REGION) { - r.n_tiles += tiles_at(x, y); - r.n_blks += occ_at(x, y); - } - if (reg_id_at_grid[x][y] != AP_NO_REGION && reg_id_at_grid[x][y] != r.id) - merge_regions(r, regions.at(reg_id_at_grid[x][y])); - reg_id_at_grid[x][y] = r.id; - // Grow to cover any macros - auto& macro_bb = macro_extent[x][y]; - grow_region(r, macro_bb); - }; - // process new areas after including rect_to_include, while avoiding double counting old region - for (int x = r.bb.xmin(); x < r_old.xmin(); x++) - for (int y = r.bb.ymin(); y <= r.bb.ymax(); y++) - process_location(x, y); - for (int x = r_old.xmax() + 1; x <= r.bb.xmax(); x++) - for (int y = r.bb.ymin(); y <= r.bb.ymax(); y++) - process_location(x, y); - for (int y = r.bb.ymin(); y < r_old.ymin(); y++) - for (int x = r.bb.xmin(); x <= r.bb.xmax(); x++) - process_location(x, y); - for (int y = r_old.ymax() + 1; y <= r.bb.ymax(); y++) - for (int x = r.bb.xmin(); x <= r.bb.xmax(); x++) - process_location(x, y); -} - -// Find overutilized regions surrounded by non-overutilized regions -void CutSpreader::find_overused_regions() { - int max_x = g_vpr_ctx.device().grid.width(); - int max_y = g_vpr_ctx.device().grid.height(); - for (int x = 0; x < max_x; x++) - for (int y = 0; y < max_y; y++) { - if (reg_id_at_grid[x][y] != AP_NO_REGION || (occ_at(x, y) <= tiles_at(x, y))) - // already in a region or not over-utilized - continue; - - // create new overused region - int id = int(regions.size()); - reg_id_at_grid[x][y] = id; - SpreaderRegion reg; - reg.id = id; - reg.bb = {x, y, x, y}; - reg.n_tiles = reg.n_blks = 0; - reg.n_tiles += tiles_at(x, y); - reg.n_blks += occ_at(x, y); - - // initialize reg and ensure it covers macros - grow_region(reg, {x, y, x, y}, true); - - bool expanded = true; - while (expanded) { - expanded = false; - // keep expanding in x and y, until expansion in x, y cannot find overutilised blks - - // try expanding in x - if (reg.bb.xmax() < max_x - 1) { - bool over_occ_x = false; - for (int y1 = reg.bb.ymin(); y1 <= reg.bb.ymax(); y1++) { - if (occ_at(reg.bb.xmax() + 1, y1) > tiles_at(reg.bb.xmax() + 1, y1)) { - over_occ_x = true; - break; - } - } - if (over_occ_x) { - expanded = true; - grow_region(reg, {reg.bb.xmin(), reg.bb.ymin(), reg.bb.xmax() + 1, reg.bb.ymax()}); - } - } - // try expanding in y - if (reg.bb.ymax() < max_y - 1) { - bool over_occ_y = false; - for (int x1 = reg.bb.xmin(); x1 <= reg.bb.xmax(); x1++) { - if (occ_at(x1, reg.bb.ymax() + 1) > tiles_at(x1, reg.bb.ymax() + 1)) { - over_occ_y = true; - break; - } - } - if (over_occ_y) { - expanded = true; - grow_region(reg, {reg.bb.xmin(), reg.bb.ymin(), reg.bb.xmax(), reg.bb.ymax() + 1}); - } - } - } - regions.push_back(reg); - } -} - -/* - * Expand all utilized regions until they satisfy n_tiles * beta >= n_blocks - * If overutilized regions overlap in this process, they are merged - */ -void CutSpreader::expand_regions() { - int max_x = g_vpr_ctx.device().grid.width(); - int max_y = g_vpr_ctx.device().grid.height(); - - std::queue overused_regions; - float beta = ap->ap_cfg.beta; - for (auto& r : regions) - // if region is not merged and is overused, move into overused_regions queue - if (!merged_regions.count(r.id) && r.overused(beta)) - overused_regions.push(r.id); - - while (!overused_regions.empty()) { // expand all overused regions - int rid = overused_regions.front(); - overused_regions.pop(); - if (merged_regions.count(rid)) - continue; - auto& reg = regions.at(rid); - while (reg.overused(beta)) { - bool changed = false; - - // spread_scale determines steps in x or y direction to expand each time - for (int j = 0; j < ap->ap_cfg.spread_scale_x; j++) { - if (reg.bb.xmin() > 0) { // expand in -x direction - grow_region(reg, {reg.bb.xmin() - 1, reg.bb.ymin(), reg.bb.xmax(), reg.bb.ymax()}); - changed = true; - if (!reg.overused(beta)) - break; - } - if (reg.bb.xmax() < max_x - 1) { // expand in +x direction - grow_region(reg, {reg.bb.xmin(), reg.bb.ymin(), reg.bb.xmax() + 1, reg.bb.ymax()}); - changed = true; - if (!reg.overused(beta)) - break; - } - } - - for (int j = 0; j < ap->ap_cfg.spread_scale_y; j++) { - if (reg.bb.ymin() > 0) { // expand in -y direction - grow_region(reg, {reg.bb.xmin(), reg.bb.ymin() - 1, reg.bb.xmax(), reg.bb.ymax()}); - changed = true; - if (!reg.overused(beta)) - break; - } - if (reg.bb.ymax() < max_y - 1) { // expand in +y direction - grow_region(reg, {reg.bb.xmin(), reg.bb.ymin(), reg.bb.xmax(), reg.bb.ymax() + 1}); - changed = true; - if (!reg.overused(beta)) - break; - } - } - VTR_ASSERT(changed || reg.n_tiles >= reg.n_blks); - } - VTR_ASSERT(reg.n_blks <= reg.n_tiles); - } -} - -/* - * Recursive cut-based spreading in HeAP paper - * "left" denotes "-x, -y", "right" denotes "+x, +y" depending on dir - * - * @param r region to cut & spread - * @param dir direction, true for y, false for x - * - * @return a pair of sub-region IDs created from cutting region r. - * BASE_CASE if base case is reached - * CUT_FAIL if cut unsuccessful, need to cut in the other direction - */ -std::pair CutSpreader::cut_region(SpreaderRegion& r, bool dir) { - const DeviceContext& device_ctx = g_vpr_ctx.device(); - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - const auto& place_macros = ap->place_macros_; - - // TODO: CutSpreader is not compatible with 3D FPGA - VTR_ASSERT(device_ctx.grid.get_num_layers() == 1); - int layer_num = 0; - - std::vector cut_blks; - init_cut_blks(r, cut_blks); // copy all logic blocks to cut into cut_blks - - // Trim the boundaries of the region in axis-of-interest, skipping any rows/cols without any tiles of the right type - int trimmed_l, trimmed_r; - std::pair(trimmed_l, trimmed_r) = trim_region(r, dir); - - // base case (only 1 block left in region) - if (cut_blks.size() == 1) { - // ensure placement of last block is on right type of tile - auto blk = cut_blks.at(0); - auto& tiles_type = clb_nlist.block_type(blk)->equivalent_tiles; - auto loc = ap->blk_locs[blk].loc; - if (std::find(tiles_type.begin(), tiles_type.end(), device_ctx.grid.get_physical_type({loc.x, loc.y, loc.layer})) == tiles_type.end()) { - // logic block type doesn't match tile type - // exhaustive search for tile of right type - // this search should be fast as region must be small at this point (only 1 logic block left) - for (int x = r.bb.xmin(); x <= r.bb.xmax(); x++) - for (int y = r.bb.ymin(); y <= r.bb.ymax(); y++) { - if (std::find(tiles_type.begin(), tiles_type.end(), device_ctx.grid.get_physical_type({x, y, layer_num})) != tiles_type.end()) { - VTR_ASSERT(blks_at_location[x][y].empty()); - ap->blk_locs[blk].rawx = x; - ap->blk_locs[blk].rawy = y; - ap->blk_locs[blk].loc.x = x; - ap->blk_locs[blk].loc.y = y; - blks_at_location[x][y].push_back(blk); - blks_at_location[loc.x][loc.y].clear(); - return BASE_CASE; - } - } - } - return BASE_CASE; - } - - // sort blks based on raw location - std::stable_sort(cut_blks.begin(), cut_blks.end(), [&](const ClusterBlockId a, const ClusterBlockId b) { - return dir ? (ap->blk_locs[a].rawy < ap->blk_locs[b].rawy) : (ap->blk_locs[a].rawx < ap->blk_locs[b].rawx); - }); - - /* - * Generate initial source cut. It cuts logic blocks in region r into 2 partitions. - * Initially, ensure that both partitions have similar numbers of logic blocks. - * Find the midpoint (in terms of total block size, including macros) in sorted cut_blks - * This is the initial source cut - */ - int clearance_l, clearance_r; - int pivot = initial_source_cut(r, cut_blks, dir, clearance_l, clearance_r); - - /* - * Generate initial target cut. It cuts the physical tiles into 2 sub-areas, into which - * the 2 partitions of logic blocks will be placed. - * - * The difference in utilization (# blocks / # tiles) should be smallest, while meeting - * clearance requirement for macros - */ - int left_blks_n, right_blks_n, left_tiles_n, right_tiles_n; - int best_tgt_cut = initial_target_cut(r, cut_blks, pivot, dir, trimmed_l, trimmed_r, - clearance_l, clearance_r, left_blks_n, right_blks_n, left_tiles_n, right_tiles_n); - if (best_tgt_cut == -1) // target cut fails clearance requirement for macros - return CUT_FAIL; - - // Once target_cut is acquired, define left and right subareas - // The boundaries are defined using the trimmed edges and best target cut - // The n_tiles will be final while n_blks may change by perturbing the source cut to eliminate - // overutilization in subareas - SpreaderRegion rl, rr; - rl.id = int(regions.size()); - rl.bb = dir ? vtr::Rect{r.bb.xmin(), trimmed_l, r.bb.xmax(), best_tgt_cut} - : vtr::Rect{trimmed_l, r.bb.ymin(), best_tgt_cut, r.bb.ymax()}; - rl.n_blks = left_blks_n; - rl.n_tiles = left_tiles_n; - rr.id = int(regions.size()) + 1; - rr.bb = dir ? vtr::Rect{r.bb.xmin(), best_tgt_cut + 1, r.bb.xmax(), trimmed_r} - : vtr::Rect{best_tgt_cut + 1, r.bb.ymin(), trimmed_r, r.bb.ymax()}; - rr.n_blks = right_blks_n; - rr.n_tiles = right_tiles_n; - // change the region IDs in each subarea's grid location to subarea's id - for (int x = rl.bb.xmin(); x <= rl.bb.xmax(); x++) - for (int y = rl.bb.ymin(); y <= rl.bb.ymax(); y++) - reg_id_at_grid[x][y] = rl.id; - for (int x = rr.bb.xmin(); x <= rr.bb.xmax(); x++) - for (int y = rr.bb.ymin(); y <= rr.bb.ymax(); y++) - reg_id_at_grid[x][y] = rr.id; - - /* - * Perturb source cut to eliminate over-utilization - * This is done by moving logic blocks from overused subarea to the other subarea one at a time - * until they are no longer overused. - */ - // while left subarea is over-utilized, move logic blocks to the right subarea one at a time - while (pivot > 0 && rl.overused(ap->ap_cfg.beta)) { - auto& move_blk = cut_blks.at(pivot); - int size = (place_macros.get_imacro_from_iblk(move_blk) != NO_MACRO) ? place_macros[place_macros.get_imacro_from_iblk(move_blk)].members.size() : 1; - rl.n_blks -= size; - rr.n_blks += size; - pivot--; - } - // while right subarea is over-utilized, move logic blocks to the left subarea one at a time - while (pivot < int(cut_blks.size()) - 1 && rr.overused(ap->ap_cfg.beta)) { - auto& move_blk = cut_blks.at(pivot + 1); - int size = (place_macros.get_imacro_from_iblk(move_blk) != NO_MACRO) ? place_macros[place_macros.get_imacro_from_iblk(move_blk)].members.size() : 1; - rl.n_blks += size; - rr.n_blks -= size; - pivot++; - } - - // within each subarea, spread the logic blocks into bins to make them more evenly spread out - linear_spread_subarea(cut_blks, dir, 0, pivot + 1, rl); - linear_spread_subarea(cut_blks, dir, pivot + 1, cut_blks.size(), rr); - - // push subareas back to regions so that they can be accessed by their IDs later - regions.push_back(rl); - regions.push_back(rr); - - return std::make_pair(rl.id, rr.id); -} - -// copy all logic blocks to cut into cut_blks -void CutSpreader::init_cut_blks(SpreaderRegion& r, std::vector& cut_blks) { - cut_blks.clear(); - for (int x = r.bb.xmin(); x <= r.bb.xmax(); x++) { - for (int y = r.bb.ymin(); y <= r.bb.ymax(); y++) { - std::copy(blks_at_location[x][y].begin(), blks_at_location[x][y].end(), std::back_inserter(cut_blks)); - } - } -} - -/* - * Trim the boundaries of the region r in axis-of-interest dir, skipping any rows/cols without - * tiles of the right type. - * Afterwards, move blocks in trimmed locations to new trimmed boundaries - */ -std::pair CutSpreader::trim_region(SpreaderRegion& r, bool dir) { - int bb_min = dir ? r.bb.ymin() : r.bb.xmin(); - int bb_max = dir ? r.bb.ymax() : r.bb.xmax(); - int trimmed_l = bb_min, trimmed_r = bb_max; - bool have_tiles = false; - while (trimmed_l < bb_max && !have_tiles) { // trim from left - for (int i = bb_min; i <= bb_max; i++) - if (tiles_at(dir ? i : trimmed_l, dir ? trimmed_l : i) > 0) { - have_tiles = true; - break; - } - if (!have_tiles) // trim when the row/col doesn't have tiles - trimmed_l++; - } - - have_tiles = false; - while (trimmed_r > bb_min && !have_tiles) { // trim from right - for (int i = bb_min; i <= bb_max; i++) - if (tiles_at(dir ? i : trimmed_r, dir ? trimmed_r : i) > 0) { - have_tiles = true; - break; - } - if (!have_tiles) // trim when the row/col doesn't have tiles - trimmed_r--; - } - - // move blocks from trimmed locations to new boundaries - for (int x = r.bb.xmin(); x < (dir ? r.bb.xmax() + 1 : trimmed_l); x++) { - for (int y = r.bb.ymin(); y < (dir ? trimmed_l : r.bb.ymax() + 1); y++) { - for (auto& blk : blks_at_location[x][y]) { - // new location is the closest trimmed boundary - int blk_new_x = dir ? x : trimmed_l, blk_new_y = dir ? trimmed_l : y; - ap->blk_locs[blk].rawx = blk_new_x; - ap->blk_locs[blk].rawy = blk_new_y; - ap->blk_locs[blk].loc.x = blk_new_x; - ap->blk_locs[blk].loc.y = blk_new_y; - blks_at_location[blk_new_x][blk_new_y].push_back(blk); - } - blks_at_location[x][y].clear(); // clear blocks at old location - } - } - - for (int x = (dir ? r.bb.xmin() : trimmed_r + 1); x <= r.bb.xmax(); x++) { - for (int y = (dir ? trimmed_r + 1 : r.bb.ymin()); y <= r.bb.ymax(); y++) { - for (auto& blk : blks_at_location[x][y]) { - // new location is the closest trimmed boundary - int blk_new_x = dir ? x : trimmed_r, blk_new_y = dir ? trimmed_r : y; - ap->blk_locs[blk].rawx = blk_new_x; - ap->blk_locs[blk].rawy = blk_new_y; - ap->blk_locs[blk].loc.x = blk_new_x; - ap->blk_locs[blk].loc.y = blk_new_y; - blks_at_location[blk_new_x][blk_new_y].push_back(blk); - } - blks_at_location[x][y].clear(); // clear blocks at old location - } - } - - return {trimmed_l, trimmed_r}; -} - -/* - * generate the initial source_cut for region r, ensure there is enough clearance on either side of the - * initial cut to accommodate macros - * returns the initial source cut (index into cut_blks) - * returns the clearance in clearance_l, clearance_r - * returns -1 if cannot generate initial source_cut (not enough clearance for macros) - * - * see CutSpreader::cut_region() invocation of initial_source_cut for more detail - */ -int CutSpreader::initial_source_cut(SpreaderRegion& r, - std::vector& cut_blks, - bool dir, - int& clearance_l, - int& clearance_r) { - const auto& place_macros = ap->place_macros_; - - // pivot is the midpoint of cut_blks in terms of total block size (counting macro members) - // this ensures the initial partitions have similar number of blocks - int pivot_blks = 0; // midpoint in terms of total number of blocks - int pivot = 0; // midpoint in terms of index of cut_blks - for (auto& blk : cut_blks) { - // if blk is part of macro (only macro heads in cut_blks, no macro members), add that macro's size - pivot_blks += (place_macros.get_imacro_from_iblk(blk) != NO_MACRO) ? place_macros[place_macros.get_imacro_from_iblk(blk)].members.size() : 1; - if (pivot_blks >= r.n_blks / 2) - break; - pivot++; - } - if (pivot >= int(cut_blks.size())) - pivot = int(cut_blks.size()) - 1; - - // Find clearance required on either side of the pivot - // i.e. minimum distance from left and right bounds of region to pivot - // (no cut within clearance to accommodate macros) - clearance_l = 0, clearance_r = 0; - for (size_t i = 0; i < cut_blks.size(); i++) { - int size; - if (blk_extents.count(cut_blks.at(i))) { - auto& be = blk_extents[cut_blks.at(i)]; - size = dir ? (be.ymax() - be.ymin() + 1) : (be.xmax() - be.xmin() + 1); - } else { - size = 1; - } - if (int(i) < pivot) - clearance_l = std::max(clearance_l, size); - else - clearance_r = std::max(clearance_r, size); - } - return pivot; -} - -/* - * generate the initial target_cut for region r, ensure that utilization in 2 subareas are closest possible - * while meeting clearance requirements for macros - * returns best target cut - */ -int CutSpreader::initial_target_cut(SpreaderRegion& r, - std::vector& cut_blks, - int init_source_cut, - bool dir, - int trimmed_l, - int trimmed_r, - int clearance_l, - int clearance_r, - int& left_blks_n, - int& right_blks_n, - int& left_tiles_n, - int& right_tiles_n) { - const auto& place_macros = ap->place_macros_; - - // To achieve smallest difference in utilization, first move all tiles to right partition - left_blks_n = 0, right_blks_n = 0; - left_tiles_n = 0, right_tiles_n = r.n_tiles; - // count number of blks in each partition, from initial source cut - for (int i = 0; i <= init_source_cut; i++) - left_blks_n += (place_macros.get_imacro_from_iblk(cut_blks.at(i)) != NO_MACRO) ? place_macros[place_macros.get_imacro_from_iblk(cut_blks.at(i))].members.size() : 1; - for (int i = init_source_cut + 1; i < int(cut_blks.size()); i++) - right_blks_n += (place_macros.get_imacro_from_iblk(cut_blks.at(i)) != NO_MACRO) ? place_macros[place_macros.get_imacro_from_iblk(cut_blks.at(i))].members.size() : 1; - - int best_tgt_cut = -1; - double best_deltaU = std::numeric_limits::max(); - - // sweep source cut from left to right, moving tiles from right partition to the left - // calculate the difference in utilization for all target cuts, return the best result - for (int i = trimmed_l; i <= trimmed_r; i++) { - int slither_tiles = 0; - for (int j = dir ? r.bb.xmin() : r.bb.ymin(); j <= (dir ? r.bb.xmax() : r.bb.ymax()); j++) { - slither_tiles += dir ? tiles_at(j, i) : tiles_at(i, j); - } - - left_tiles_n += slither_tiles; - right_tiles_n -= slither_tiles; - - if (((i - trimmed_l) + 1) >= clearance_l && ((trimmed_r - i) + 1) >= clearance_r) { - // if solution accommodates macro clearances - // compare difference in utilization - double tmpU = std::abs(double(left_blks_n) / double(std::max(left_tiles_n, 1)) - double(right_blks_n) / double(std::max(right_tiles_n, 1))); - if (tmpU < best_deltaU) { - best_deltaU = tmpU; - best_tgt_cut = i; - } - } - } - - if (best_tgt_cut == -1) // failed clearance requirement for macros - return best_tgt_cut; - - // update number of tiles for each subarea - left_tiles_n = 0, right_tiles_n = 0; - for (int x = r.bb.xmin(); x <= (dir ? r.bb.xmax() : best_tgt_cut); x++) - for (int y = r.bb.ymin(); y <= (dir ? best_tgt_cut : r.bb.ymax()); y++) - left_tiles_n += tiles_at(x, y); - for (int x = dir ? r.bb.xmin() : (best_tgt_cut + 1); x <= r.bb.xmax(); x++) - for (int y = dir ? (best_tgt_cut + 1) : r.bb.ymin(); y <= r.bb.ymax(); y++) - right_tiles_n += tiles_at(x, y); - - if (left_tiles_n == 0 || right_tiles_n == 0) - // target cut failed since all tiles are still in one subarea - return -1; - - return best_tgt_cut; -} - -/* - * Spread blocks in subarea by linear interpolation - * blks_start and blks_end are indices into cut_blks. The blks between these indices will be spread by: - * * first split the subarea boundaries (area_l and area_r) - * into min(number_of_logic_blocks_in_subarea, 10) number of bins. - * * split the logic blocks into the corresponding number of groups - * * place the logic blocks from their group to their bin, by linear interpolation using their original - * locations to map to a new location in the bin. - */ -void CutSpreader::linear_spread_subarea(std::vector& cut_blks, - bool dir, - int blks_start, - int blks_end, - SpreaderRegion& sub_area) { - double area_l = dir ? sub_area.bb.ymin() : sub_area.bb.xmin(); // left boundary - double area_r = dir ? sub_area.bb.ymax() : sub_area.bb.xmax(); // right boundary - int N = blks_end - blks_start; // number of logic blocks in subarea - if (N <= 2) { // only 1 bin, skip binning and directly linear interpolate - for (int i = blks_start; i < blks_end; i++) { - auto& pos = dir ? ap->blk_locs[cut_blks.at(i)].rawy - : ap->blk_locs[cut_blks.at(i)].rawx; - pos = area_l + (i - blks_start) * ((area_r - area_l) / N); - } - } else { - // Split tiles into K bins, split blocks into K groups - // Since cut_blks are sorted, to specify block groups, only need the index of the left and right block - // Each block group has its original left and right bounds, the goal is to map this group's bound into - // bin's bounds, and assign new locations to blocks using linear interpolation - int K = std::min(N, 10); // number of bins/groups - std::vector> bin_bounds; // (0-th group's first block, 0-th bin's left bound) - bin_bounds.emplace_back(blks_start, area_l); - for (int i_bin = 1; i_bin < K; i_bin++) - // find i-th group's first block, i-th bin's left bound - bin_bounds.emplace_back(blks_start + (N * i_bin) / K, area_l + ((area_r - area_l + 0.99) * i_bin) / K); - bin_bounds.emplace_back(blks_end, area_r + 0.99); // find K-th group's last block, K-th bin's right bound - for (int i_bin = 0; i_bin < K; i_bin++) { - auto &bl = bin_bounds.at(i_bin), br = bin_bounds.at(i_bin + 1); // i-th bin's left and right bound - // i-th group's original bounds (left and right most block's original location) - double group_left = dir ? ap->blk_locs[cut_blks.at(bl.first)].rawy - : ap->blk_locs[cut_blks.at(bl.first)].rawx; - double group_right = dir ? ap->blk_locs[cut_blks.at(br.first - 1)].rawy - : ap->blk_locs[cut_blks.at(br.first - 1)].rawx; - double bin_left = bl.second; - double bin_right = br.second; - // mapping from i-th block group's original bounds to i-th bin's bounds - double mapping = (bin_right - bin_left) / std::max(0.00001, group_right - group_left); // prevent division by 0 - // map blks in i-th group to new location in i-th bin using linear interpolation - for (int i_blk = bl.first; i_blk < br.first; i_blk++) { - // new location is stored back into rawx/rawy - auto& blk_pos = dir ? ap->blk_locs[cut_blks.at(i_blk)].rawy - : ap->blk_locs[cut_blks.at(i_blk)].rawx; - - blk_pos = bin_left + mapping * (blk_pos - group_left); // linear interpolation - } - } - } - - // Update blks_at_location for each block with their new location - for (int x = sub_area.bb.xmin(); x <= sub_area.bb.xmax(); x++) - for (int y = sub_area.bb.ymin(); y <= sub_area.bb.ymax(); y++) { - blks_at_location[x][y].clear(); - } - for (int i_blk = blks_start; i_blk < blks_end; i_blk++) { - auto& bl = ap->blk_locs[cut_blks[i_blk]]; - bl.loc.x = std::min(sub_area.bb.xmax(), std::max(sub_area.bb.xmin(), int(bl.rawx))); - bl.loc.y = std::min(sub_area.bb.ymax(), std::max(sub_area.bb.ymin(), int(bl.rawy))); - blks_at_location[bl.loc.x][bl.loc.y].push_back(cut_blks[i_blk]); - } -} - -/* - * @brief: Greedy strict legalize using algorithm described in algorithm overview above. - * - * Input illegal placement from data members (blk_locs) in analytic_placer - * - * @return: both ap->blk_locs and vpr_ctx.mutable_placement() are modified with legal placement, - * to be used in next solve/spread/legalize iteration or to pass back to annealer. - */ -void CutSpreader::strict_legalize() { - auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - const auto& block_locs = ap->blk_loc_registry_ref_.block_locs(); - const auto& place_macros = ap->place_macros_; - int max_x = g_vpr_ctx.device().grid.width(); - int max_y = g_vpr_ctx.device().grid.height(); - - // clear the location of all blocks in place_ctx - for (auto blk : clb_nlist.blocks()) { - if (!block_locs[blk].is_fixed && (ap->row_num[blk] != DONT_SOLVE || (place_macros.get_imacro_from_iblk(blk) != NO_MACRO && ap->row_num[place_macros.macro_head(blk)] != DONT_SOLVE))) { - unbind_tile(block_locs[blk].loc); - } - } - - // Greedy largest-macro-first approach - // put all blocks being placed in current AP in priority_queue "remaining" with the priority being the - // length of the macro they are in (for single blocks, priority = 1). - // This prioritizes the placement of longest macros over single blocks - std::priority_queue> remaining; - for (ClusterBlockId blk : ap->solve_blks) { - if (place_macros.get_imacro_from_iblk(blk) != NO_MACRO) { // blk is head block of a macro (only head blks are solved) - remaining.emplace(place_macros[place_macros.get_imacro_from_iblk(blk)].members.size(), blk); - } else { - remaining.emplace(1, blk); - } - } - - /* - * ripup_radius determines at which point already placed single logic blocks will be "ripped up" for placement of - * the current block. Specifically, when radius of random selection (determined by availability of compatible sub_tiles) - * is larger than ripup_radius, occupied sub_tiles are also considered for blk's placement (not just unoccupied sub_tiles). - * - * Therefore, a small ripup_radius honors the current location of blk (from spreading) more, as it allows placement at - * occupied sub_tiles when random selection radius around current location is still small. When ripup_radius is large, - * blk can only search unoccupied sub_tiles in a large area before it can rip up placed blks. This will make blk more likely - * to stray far from current location. - * - * ripup_radius is doubled every time outer while-loop executes (ap->solve_blks.size()) times, - * i.e. after trying to place each block once, if there's still block to place (some block displaced/ripped up other blocks), - * ripup_radius is doubled, allowing these ripped up blocks to look for unoccupied sub_tiles in a larger area. - * - * Only applies for single blocks - */ - int ripup_radius = 2; - // num of iters of outer most while loop, cleared when it equals the number of blocks that needs to be place for this - // build-solve-legalize iteration. When cleared, ripup_radius is doubled. - int total_iters = 0; - // total_iters without clearing, used for time-out - int total_iters_noreset = 0; - - // outer while loop, each loop iteration aims to place one solve_blk (either single blk or head blk of a macro) - while (!remaining.empty()) { - auto top = remaining.top(); - remaining.pop(); - ClusterBlockId blk = top.second; - - if (is_placed(blk)) // ignore if already placed - continue; - - int radius = 0; // radius of 0 means initial candidate location is the current location of blk after spreading - int iter = 0; // iterations of the inner while-loop, used for timeout - - /* - * iter_at_radius: number of inner-loop iterations (number of proposed candidate locations) at current radius - * used to determine whether to explore more candidate locations (iter_at_radius < explore limit) - * or take the current best_subtile for blk - * - * only applies for single blocks - */ - int iter_at_radius = 0; - bool placed = false; // flag for inner-loop - t_pl_loc best_subtile = t_pl_loc{}; // current best candidate with smallest best_inp_len, only for single blocks - int best_inp_len = std::numeric_limits::max(); // used to choose best_subtile, only for single blocks - - total_iters++; - total_iters_noreset++; - - // clear total_iters and double ripup_radius when all solve_blks have been attempted to place once - if (total_iters > int(ap->solve_blks.size())) { - total_iters = 0; - ripup_radius = std::min(std::max(max_x - 1, max_y - 1), ripup_radius * 2); - } - - // timeout - // VTR_ASSERT(total_iters_noreset <= std::max(5000, 8 * int(clb_nlist.blocks().size()))); - - while (!placed) { // while blk is not placed - // timeout - VTR_ASSERT(iter <= std::max(10000, 3 * int(clb_nlist.blocks().size()))); - - // randomly choose a location within radius around current location (given by spreading) - int nx = rand() % (2 * radius + 1) + std::max(ap->blk_locs[blk].loc.x - radius, 0); - int ny = rand() % (2 * radius + 1) + std::max(ap->blk_locs[blk].loc.y - radius, 0); - - iter++; - iter_at_radius++; - if (iter >= (10 * (radius + 1))) { // a heuristic to determine when to increase radius - // check if there's sub_tiles of right type within radius. - // If no, increase radius until at least 1 compatible sub_tile is found - radius = std::min(std::max(max_x - 1, max_y - 1), radius + 1); - while (radius < std::max(max_x - 1, max_y - 1)) { - // search every location within radius for compatible sub_tiles - for (int x = std::max(0, ap->blk_locs[blk].loc.x - radius); - x <= std::min(max_x - 1, ap->blk_locs[blk].loc.x + radius); - x++) { - for (int y = std::max(0, ap->blk_locs[blk].loc.y - radius); - y <= std::min(max_y - 1, ap->blk_locs[blk].loc.y + radius); - y++) { - if (subtiles_at_location[x][y].size() > 0) // compatible sub_tiles found within radius - goto notempty; - } - } - // no sub_tiles found, increase radius - radius = std::min(std::max(max_x - 1, max_y - 1), radius + 1); - } - notempty: - iter_at_radius = 0; - iter = 0; - } - - if (nx < 0 || nx >= max_x || ny < 0 || ny >= max_y || subtiles_at_location[nx][ny].empty()) - // try another random location if candidate location is illegal or has no sub_tiles - continue; - - /* - * explore_limit determines when to stop exploring for better sub_tiles for blk - * When explore_limit is not met (iter_at_radius < explore_limit), each candidate sub_tile is evaluated based on - * their resulting total input wirelength (a heuristic) for blk. - * When explore_limit is met and a best_sub_tile is found, blk is placed there. - * - * Only applies for single blocks - * @see comments for try_place_blk() - */ - int explore_limit = 2 * radius; - - // if blk is not a macro member - if (place_macros.get_imacro_from_iblk(blk) == NO_MACRO) { - placed = try_place_blk(blk, - nx, - ny, - radius > ripup_radius, // bool ripup_radius_met - iter_at_radius >= explore_limit, // bool exceeds_explore_limit - best_inp_len, - best_subtile, - remaining); - } else { - placed = try_place_macro(blk, - nx, - ny, - remaining); - } - } - } -} - -/* - * Helper function in strict_legalize() - * Place blk on sub_tile location by modifying place_ctx.grid_blocks, place_ctx.block_locs, and ap->blk_locs[blk].loc - */ -void CutSpreader::bind_tile(t_pl_loc sub_tile, ClusterBlockId blk) { - auto& grid_blocks = ap->blk_loc_registry_ref_.mutable_grid_blocks(); - auto& block_locs = ap->blk_loc_registry_ref_.mutable_block_locs(); - - VTR_ASSERT(grid_blocks.block_at_location(sub_tile) == ClusterBlockId::INVALID()); - VTR_ASSERT(block_locs[blk].is_fixed == false); - grid_blocks.set_block_at_location(sub_tile, blk); - block_locs[blk].loc = sub_tile; - grid_blocks.increment_usage({sub_tile.x, sub_tile.y, sub_tile.layer}); - ap->blk_locs[blk].loc = sub_tile; -} - -/* - * Helper function in strict_legalize() - * Remove placement at sub_tile location by clearing place_ctx.block_locs and place_Ctx.grid_blocks - */ -void CutSpreader::unbind_tile(t_pl_loc sub_tile) { - auto& grid_blocks = ap->blk_loc_registry_ref_.mutable_grid_blocks(); - auto& block_locs = ap->blk_loc_registry_ref_.mutable_block_locs(); - - VTR_ASSERT(grid_blocks.block_at_location(sub_tile) != ClusterBlockId::INVALID()); - ClusterBlockId blk = grid_blocks.block_at_location(sub_tile); - VTR_ASSERT(block_locs[blk].is_fixed == false); - block_locs[blk].loc = t_pl_loc{}; - grid_blocks.set_block_at_location(sub_tile, ClusterBlockId::INVALID()); - grid_blocks.decrement_usage({sub_tile.x, sub_tile.y, sub_tile.layer}); -} - -/* - * Helper function in strict_legalze() - * Check if the block is placed in place_ctx (place_ctx.block_locs[blk] has a location that matches - * the block in place_ctx.grid_blocks) - */ -bool CutSpreader::is_placed(ClusterBlockId blk) { - const auto& grid_blocks = ap->blk_loc_registry_ref_.grid_blocks(); - const auto& block_locs = ap->blk_loc_registry_ref_.block_locs(); - - if (block_locs[blk].loc != t_pl_loc{}) { - auto loc = block_locs[blk].loc; - VTR_ASSERT(grid_blocks.block_at_location(loc) == blk); - return true; - } - return false; -} - -/* - * Sub-routine of strict_legalize() - * Tries to place a single block "blk" at a candidate location nx, ny. Returns whether the blk is successfully placed. - * - * If number of iterations at current radius has exceeded the exploration limit (exceeds_explore_limit), - * and a candidate sub_tile is already found (best_subtile), then candidate location is ignored, and blk is - * placed in best_subtile. - * - * Else, if exploration limit is not exceeded, the sub_tiles at nx, ny are evaluated on the blk's resulting total - * input wirelength (a heuristic). If this total input wirelength is shorter than current best_inp_len, it becomes - * the new best_subtile. - * If exploration limit is exceeded and no candidate sub_tile is available in (best_subtile), then blk is placed at - * next compatible sub_tile at candidate location nx, ny. - * - * If blk displaces a logic block by taking its sub_tile, the displaced logic block is put back into remaining queue. - */ -bool CutSpreader::try_place_blk(ClusterBlockId blk, - int nx, - int ny, - bool ripup_radius_met, - bool exceeds_explore_limit, - int& best_inp_len, - t_pl_loc& best_subtile, - std::priority_queue>& remaining) { - const auto& grid_blocks = ap->blk_loc_registry_ref_.grid_blocks(); - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - const auto& place_macros = ap->place_macros_; - - // iteration at current radius has exceeded exploration limit, and a candidate sub_tile (best_subtile) is found - // then blk is placed in best_subtile - if (exceeds_explore_limit && best_subtile != t_pl_loc{}) { - // find the logic block bound to (placed on) best_subtile - ClusterBlockId bound_blk = grid_blocks.block_at_location(best_subtile); - if (bound_blk) { // if best_subtile has a logic block - unbind_tile(best_subtile); // clear bound_block and best_subtile's placement info - remaining.emplace(1, bound_blk); // put bound_blk back into remaining blocks to place - } - bind_tile(best_subtile, blk); // place blk on best_subtile - return true; - } - - // if exploration limit is not met or a candidate sub_tile is not found yet - for (auto sub_t : subtiles_at_location[nx][ny]) { // for each available sub_tile at random location - ClusterBlockId bound_blk = grid_blocks.block_at_location(sub_t); // logic blk at [nx, ny] - if (bound_blk == ClusterBlockId::INVALID() - || ripup_radius_met - || rand() % (20000) < 10) { - /* conditions when a sub_tile at nx, ny is considered: - * - sub_tile is not occupied (no bound_blk) - * - occupied sub_tile is considered when: - * 1) current radius > ripup-radius. (see strict_legalize() for more details) - * OR - * 2) a 0.05% chance of acceptance. - */ - if (bound_blk && place_macros.get_imacro_from_iblk(bound_blk) != NO_MACRO) - // do not sub_tiles when the block placed on it is part of a macro, as they have higher priority - continue; - if (!exceeds_explore_limit) { // if still in exploration phase, find best_subtile with smallest best_inp_len - int input_len = 0; - // find all input pins and add up input wirelength - for (auto pin : clb_nlist.block_input_pins(blk)) { - ClusterNetId net = clb_nlist.pin_net(pin); - if (net == ClusterNetId::INVALID() - || clb_nlist.net_is_ignored(net) - || clb_nlist.net_driver(net) == ClusterPinId::INVALID()) - continue; - ClusterBlockId driver = clb_nlist.pin_block(clb_nlist.net_driver(net)); - auto driver_loc = ap->blk_locs[driver].loc; - input_len += std::abs(driver_loc.x - nx) + std::abs(driver_loc.y - ny); - } - if (input_len < best_inp_len) { - // update best_subtile - best_inp_len = input_len; - best_subtile = sub_t; - } - break; - } else { // exploration phase passed and still no best_subtile yet, choose the next compatible sub_tile - if (bound_blk) { - remaining.emplace(1, bound_blk); - unbind_tile(sub_t); // remove bound_blk and place blk on sub_t - } - bind_tile(sub_t, blk); - return true; - } - } - } - return false; -} - -/* - * Sub-routine of strict_legalize() - * - * Tries to place the macro with the head block on candidate location nx, ny. Returns if the macro is successfully placed. - * - * For each possible macro placement starting from nx, ny, if any block's position in the macro does not have compatible - * sub_tiles or overlaps with another macro, the placement is impossible. - * - * If a possible placement is found, it's applied to all blocks. - */ -bool CutSpreader::try_place_macro(ClusterBlockId blk, - int nx, - int ny, - std::priority_queue>& remaining) { - const auto& place_macros = ap->place_macros_; - const auto& grid_blocks = ap->blk_loc_registry_ref_.grid_blocks(); - const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; - - for (auto sub_t : subtiles_at_location[nx][ny]) { - std::vector> targets; // contains the target placement location for each macro block - std::queue> visit; // visit goes through all macro members once - visit.emplace(blk, sub_t); // push head block and target sub_tile first - bool placement_impossible = false; // once set to true, break while loop and try next sub_t - while (!visit.empty()) { // go through every macro block - ClusterBlockId visit_blk = visit.front().first; - VTR_ASSERT(!is_placed(visit_blk)); - t_pl_loc target = visit.front().second; // target location - visit.pop(); - - // ensure the target location has compatible tile - auto blk_t = clb_nlist.block_type(blk); - auto result = std::find(blk_t->equivalent_tiles.begin(), blk_t->equivalent_tiles.end(), g_vpr_ctx.device().grid.get_physical_type({target.x, target.y, target.layer})); - if (result == blk_t->equivalent_tiles.end()) { - placement_impossible = true; - break; - } - - // if the target location has a logic block, ensure it's not part of a macro - // because a macro placed before the current one has higher priority (longer chain) - ClusterBlockId bound = grid_blocks.block_at_location(target); - if (bound && place_macros.get_imacro_from_iblk(bound) != NO_MACRO) { - placement_impossible = true; - break; - } - // place macro block into target vector along with its target location - targets.emplace_back(visit_blk, target); - if (place_macros.macro_head(visit_blk) == visit_blk) { // if visit_blk is the head block of the macro - // push all macro members to visit queue along with their calculated positions - const std::vector& members = place_macros[place_macros.get_imacro_from_iblk(blk)].members; - for (auto member = members.begin() + 1; member != members.end(); ++member) { - t_pl_loc mloc = target + member->offset; // calculate member_loc using (head blk location + offset) - visit.emplace(member->blk_index, mloc); - } - } - } - - if (!placement_impossible) { // if placement is possible, apply this placement - for (auto& target : targets) { - ClusterBlockId bound = grid_blocks.block_at_location(target.second); - if (bound) { - // if target location has a logic block, displace it and put it in remaining queue to be placed later - unbind_tile(target.second); - remaining.emplace(1, bound); - } - bind_tile(target.second, target.first); - } - return true; - } - } - return false; -} - -#endif /* ENABLE_ANALYTIC_PLACE */ diff --git a/vpr/src/place/cut_spreader.h b/vpr/src/place/cut_spreader.h deleted file mode 100644 index 5be5f0ed79..0000000000 --- a/vpr/src/place/cut_spreader.h +++ /dev/null @@ -1,378 +0,0 @@ -#ifndef VPR_SRC_PLACE_LEGALIZER_H_ -#define VPR_SRC_PLACE_LEGALIZER_H_ - -#ifdef ENABLE_ANALYTIC_PLACE - -/** - * @file - * @brief This file defines the cut-spreader class with a greedy legalizer as a member method. - * Cut-spreader roughly legalizes overutilized tiles present in illegal placement from the matrix equation - * solution (lower-bound placement), using geometric partitioning to recursively cut and spread tiles within - * these regions, eliminating most overutilizations. - * Legalizer then strictly legalizes the placement using a greedy strategy, ensuring logic block to physical - * subtile type-matching and eliminating all overutilizations. This completes the lower-bound placement. - * - ************************************************************************************************************** - * Algorithm Overview * - ************************************************************************************************************** - * The solution produced by the solver almost always contains 2 types of illegality: overutilization and - * logical-physical type mismatch. - * - * Cut-Spreader - * ============ - * To resolve overutilization, a recursive partitioning-style placement approach is used. It consists of the following - * steps: - * - * find_overused_regions & expand_regions - * -------------------------------------- - * @see find_overused_regions() - * @see expand_regions() - * The first step is to find an area of the FPGA that is overutilized for which the blocks contained within - * must be spread to a larger area. To obtain this overutilized area, adjacent locations on the FPGA that are - * occupied by more than one block (also overutilized) are repeatedly clustered together, until all clusters - * are bordered on all sides by non-overutilized locations. Next, the area is expanded in both the x and y - * directions until it's large enough to accommodate all blocks contained. Overutilization is defined as follows: - * (Occupancy / Capacity) > beta, where beta is a constant <=1, currently defined in AnalyticPlacer::PlacerHeapCfg. - * - * cut_region - * ---------- - * @see cut_region() - * @see run() - * In the second step, two cuts are generated: a source cut and a target cut. The source cut pertains to the blocks - * being placed; the target cut pertains to the area into which the blocks are placed. The source cut splits the - * blocks into two partitions, while the target cut splits the area into two sub-areas, into which the blocks in - * each partition are spread. Two objectives are minimized during this process: the imbalance between the number of - * blocks in each partition, and the difference in the utilization (Occupancy / Capacity) of each subarea. - * - * To generate the source cut, the logic blocks are sorted by their raw_x or raw_y location, depending on the - * orientation of the desired cut. After sorting, a pivot is chosen, all blocks to the left of the pivot are - * assigned to the left partition, and all blocks to the right are assigned to the right partition (we use left - * for left/top, right for right/bottom in x/y directions respectively in the implementation). - * - * The target cut is an x or y cut of the area such that all blocks in each partition fit in their respective - * subareas, and the difference in utilization is minimized. This is difficult due to the discrete nature of FPGA - * architecture. To achieve this, first assign all tiles to right region, then move tiles to the left one by one, - * calculating the difference in utilization for each move. The move with smallest delta_utilization is chosen as - * the target cut. To eliminate possible overutilization in either subarea, perturb the source cut by moving a - * single block from the over-utilized sub-area to the other, until neither is overutilized. - * - * Next, the blocks in sub-areas are spread to distribute them evenly. We split the sub-area into 10 equally-sized - * bins and logic blocks into 10 equal-capacity source bins. Then linearly interpolate to map blocks from their - * original locations in their source bins to new spread location in target bins. - * - * This cutting and spreading is repeated recursively. The cut-spreading process returns the left and right - * (or top and bottom) subareas, which are pushed into a workqueue FIFO. Their direction of cut in the next - * cut-spreading process is alternated, i.e. if the first cut is in y direction, the resulting left and right - * sub-areas are further cut in x direction, each producing 2 subareas top and bottom, and so forth. - * The first region in the FIFO is then popped and goes through cut-spreading. This process is repeated until the base - * case of only 1 block in the region is reached. At this point the placement is mostly not overutilized and ready - * for strict legalization. - * - * ************************************************************************************************************** - * - * Strict Legalizer - * ================ - * @see strict_legalize() - * Strict Legalizer ensures that the placement is strictly legal. It does so using a greedy approach. - * - * All blocks are sorted in descending macro lengths order and put in a priority queue (only macro heads are - * considered, while the rest of the macro members are ignored; single blocks have length 1). Each block goes through - * the following procedure: - * - * * Find all compatible sub_tile types, based on which all potential sub_tile locations are found (this process is - * made computationally cheap by legal_pos data member in AnalyticPlacer) - * * Within a radius (starting from 0) of the block's currently location, randomly choose a location as candidate. - * * If the block is a single block (not in macro), multiple candidates are potentially chosen, and the one that - * results in the smallest input wirelength (sum of wirelengths to its inputs) for the block is chosen. - * * If the block is a macro head, the location for all member blocks are calculated using member offsets. If all - * member locations have compatible sub_tile and does not overlap with another macro, then place the macro. - * * In either case, if the candidate fails to satisfy legality constraints, the radius may increase (depending on - * number of iterations at current radius), and a new candidate will be chosen. - * - * - * @cite SimPL - * Original analytic placer with cut-spreading legalizing was intended for ASIC design, proposed in SimPL. - * SimPL: An Effective Placement Algorithm, Myung-Chul Kim, Dong-Jin Lee and Igor L. Markov - * http://www.ece.umich.edu/cse/awards/pdfs/iccad10-simpl.pdf - * - * @cite HeAP - * FPGA adaptation of SimPL, targeting FPGAs with heterogeneous blocks located at discrete locations. - * Analytical Placement for Heterogeneous FPGAs, Marcel Gort and Jason H. Anderson - * https://janders.eecg.utoronto.ca/pdfs/marcelfpl12.pdf - * - * @cite nextpnr - * An implementation of HeAP, which the cut-spreader and legalizer here is based off of. Implementation details - * have been modified for the architecture and netlist specification of VTR, and better performance. - * nextpnr -- Next Generation Place and Route, placer_heap, David Shah - * https://github.com/YosysHQ/nextpnr - * - */ -#include "vpr_context.h" -#include - -// declaration of used types; -class AnalyticPlacer; -struct t_logical_block_type; - -// Cut-spreader, as described in HeAP/SimPL papers -class CutSpreader { - public: - /* - * @brief: Constructor of CutSpreader - * - * @param analytic_placer pointer to the analytic_placer that invokes this instance of CutSpreader. - * passed for CutSpreader to directly access data members in analytic_placer such as - * blk_locs, blk_info, solve_blks, etc, without re-packaging the data to pass to - * CutSpreader. - * - * @param blk_t logical_block_type for CutSpreader to legalize. Currently can only legalize one - * type each time. - */ - CutSpreader(AnalyticPlacer* analytic_placer, t_logical_block_type_ptr blk_t); - - /* - * @brief: Executes the cut-spreader algorithm described in algorithm overview above. - * Does not include strict_legalize so placement result is not guaranteed to be legal. - * Strict_legalize must run after for legal placement result, and for legal placement to - * be passed to annealer through vpr_ctx. - * - * Input placement is passed by data members (blk_locs) in analytic_placer - * - * @return result placement is passed to strict legalizer by modifying blk_locs in analytic_placer - */ - void cutSpread(); - - /* - * @brief: Greedy strict legalize using algorithm described in algorithm overview above. - * - * Input illegal placement from data members (blk_locs) in analytic_placer, - * previously modified by cut_spreader - * - * @return: both ap->blk_locs and vpr_ctx.mutable_placement() are modified with legal placement, - * to be used in next solve/spread/legalize iteration or to pass back to annealer. - */ - void strict_legalize(); - - private: - // pointer to analytic_placer to access its data members - AnalyticPlacer* ap; - - // block type to legalize - t_logical_block_type_ptr blk_type; - - // struct describing regions on FPGA to cut_spread - struct SpreaderRegion { - int id; // index of regions in regions vector - vtr::Rect bb; // bounding box of the region - int n_blks, n_tiles; // number of netlist blocks and compatible tiles (placement locations) - bool overused(float beta) const { - // determines whether region is overutilized: overused = (Occupancy / Capacity) > beta - if (n_blks > beta * n_tiles) - return true; - else - return false; - } - }; - - // Utilization of each tile, indexed by x, y - vtr::Matrix occupancy; - - // Region ID of each tile, indexed by x, y. AP_NO_REGION if not covered by any region - // Used to check ownership of a grid position by region. - vtr::Matrix reg_id_at_grid; - - // Extent of macro at x, y location. If blk is not in any macros, it only covers a single location - vtr::Matrix> macro_extent; - - // List of logic blocks of blk_type at x, y location, indexed by x, y - // ex. to find all logic blocks occupying location x, y, blks_at_location[x][y] gives a vector of - // block IDs at that location - vtr::Matrix> blks_at_location; - - // List of all compatible sub_tiles for the type of blocks being cut-spread, at location x, y. - // usage: subtiles_at_location[x][y] - vtr::Matrix> subtiles_at_location; - - // List of all SpreaderRegion, index of vector members is the id of the region - std::vector regions; - - // List of all merged_regions, these regions are merged in larger regions and should be skipped when - // recursively cut_spreading. Each entry is the region's ID, which is also the index into the regions vector - std::unordered_set merged_regions; - - // Lookup of macro's extent by block ID. If block is a single block, it contains only 1 tile location - vtr::vector_map> blk_extents; - - // Setup CutSpreader data structures using information from AnalyticPlacer - // including blks_at_location, macros, groups, etc. - void init(); - - // Returns number of logical blocks at x, y location - int occ_at(int x, int y); - - // Returns number of compatible sub_tiles at x, y location - int tiles_at(int x, int y); - - /* - * When expanding a region, it might overlap with another region, one of them (merger) will absorb - * the other (mergee) by merging. @see expand_regions() below; - * - * Merge mergee into merged by: - * * change group id at mergee grids to merged id - * * adds all n_blks and n_tiles from mergee to merged region - * * grow merged to include all mergee grids - */ - void merge_regions(SpreaderRegion& merged, SpreaderRegion& mergee); - - /* - * Grow region r to include a rectangular region - * Pass init = true if first time calling for a newly created region - */ - void grow_region(SpreaderRegion& r, vtr::Rect rect_to_include, bool init = false); - - /* - * Expand all over-utilized regions until they satisfy n_tiles * beta >= n_blocks - * If overutilized regions overlap in this process, they are merged - */ - void expand_regions(); - - /* - * Find overutilized regions surrounded by non-overutilized regions - * Start off at an overutilized tile and expand in x, y directions 1 step at a time in both directions - * until the region is surrounded by non-overutilized regions. - */ - void find_overused_regions(); - - // copy all logic blocks that needs to be cut into cut_blks - void init_cut_blks(SpreaderRegion& r, std::vector& cut_blks); - - /* - * generate the initial source_cut for region r, ensure there is enough clearance on either side of the - * initial cut to accommodate macros - * returns the initial source cut (index into cut_blks) - * returns the clearance in clearance_l, clearance_r - * returns -1 if cannot generate initial source_cut (not enough clearance for macros) - */ - int initial_source_cut(SpreaderRegion& r, - std::vector& cut_blks, - bool dir, - int& clearance_l, - int& clearance_r); - - /* - * generate the initial target_cut for region r, ensure that utilization in 2 subareas are closest possible - * while meeting clearance requirements for macros - * returns best target cut - * returns the resulting number of blocks in left and right partitions in left_blks_n, right_blks_n - * returns the resulting number of tiles in left and right subareas in left_tiles_n, right_tiles_n - */ - int initial_target_cut(SpreaderRegion& r, - std::vector& cut_blks, - int init_source_cut, - bool dir, - int trimmed_l, - int trimmed_r, - int clearance_l, - int clearance_r, - int& left_blks_n, - int& right_blks_n, - int& left_tiles_n, - int& right_tiles_n); - - /* - * Trim the boundaries of the region in axis-of-interest, skipping any rows/cols without any tiles - * of the right type. - * Afterwards, move blocks in trimmed locations to new trimmed boundaries - */ - std::pair trim_region(SpreaderRegion& r, bool dir); - - /* - * Spread blocks in subarea by linear interpolation - * blks_start and blks_end are indices into cut_blks. The blks between these indices will be spread by: - * * first split the subarea (between boundaries area_l and area_r) into - * min(number_of_logic_blocks_in_subarea, 10) number of bins. - * * split the logic blocks into the corresponding number of groups - * * place the logic blocks from their group to their bin, by linear interpolation using their original - * locations to map to a new location in the bin. - */ - void linear_spread_subarea(std::vector& cut_blks, - bool dir, - int blks_start, - int blks_end, - SpreaderRegion& sub_area); - - /* - * Recursive cut-based spreading in HeAP paper - * "left" denotes "-x, -y", "right" denotes "+x, +y" depending on dir - * - * @param r region to cut & spread - * @param dir direction, true for y, false for x - * - * @return a pair of sub-region IDs created from cutting region r. - * BASE_CASE if base case is reached - * CUT_FAIL if cut unsuccessful, need to cut in the other direction - */ - std::pair cut_region(SpreaderRegion& r, bool dir); - - /* - * Helper function in strict_legalize() - * Place blk on sub_tile location by modifying place_ctx.grid_blocks and place_ctx.block_locs - */ - void bind_tile(t_pl_loc sub_tile, ClusterBlockId blk); - - /* - * Helper function in strict_legalize() - * Remove placement at sub_tile location by clearing place_ctx.block_locs and place_Ctx.grid_blocks - */ - void unbind_tile(t_pl_loc sub_tile); - - /* - * Helper function in strict_legalize() - * Check if the block is placed in place_ctx (place_ctx.block_locs[blk] has a location that matches - * the block in place_ctx.grid_blocks) - */ - bool is_placed(ClusterBlockId blk); - - /* - * Sub-routine of strict_legalize() - * Tries to place a single block "blk" at a candidate location nx, ny. Returns whether the blk is succesfully placed. - * - * If number of iterations at current radius has exceeded the exploration limit (exceeds_explore_limit), - * and a candidate sub_tile is already found (best_subtile), then candidate location is ignored, and blk is - * placed in best_subtile. - * - * Else, if exploration limit is not exceeded, the subtiles at nx, ny are evaluated on the blk's resulting total - * input wirelength (a heuristic). If this total input wirelength is shorter than current best_inp_len, it becomes - * the new best_subtile. - * If exploration limit is exceeded and no candidate sub_tile is available in (best_subtile), then blk is placed at - * next sub_tile at candidate location nx, ny. - * - * If blk displaces a logic block by taking its sub_tile, the displaced logic block is put back into remaining queue. - */ - bool try_place_blk(ClusterBlockId blk, - int nx, - int ny, - bool ripup_radius_met, - bool exceeds_need_to_explore, - int& best_inp_len, - t_pl_loc& best_subtile, - std::priority_queue>& remaining); - - /* - * Sub-routine of strict_legalize() - * - * Tries to place the macro with the head block on candidate location nx, ny. Returns if the macro is successfully placed. - * - * For each possible macro placement starting from nx, ny, if any block's position in the macro does not have compatible - * sub_tiles or overlaps with another macro, the placement is impossible. - * - * If a possible placement is found, it's applied to all blocks. - */ - bool try_place_macro(ClusterBlockId blk, - int nx, - int ny, - std::priority_queue>& remaining); -}; -#endif /* ENABLE_ANALYTIC_PLACE */ - -#endif /* VPR_SRC_PLACE_LEGALIZER_H_ */ diff --git a/vpr/src/place/placer.cpp b/vpr/src/place/placer.cpp index d850fb0144..b2ea7078e9 100644 --- a/vpr/src/place/placer.cpp +++ b/vpr/src/place/placer.cpp @@ -11,7 +11,6 @@ #include "vtr_time.h" #include "draw.h" #include "read_place.h" -#include "analytic_placer.h" #include "initial_placement.h" #include "load_flat_place.h" #include "concrete_timing_info.h" @@ -94,19 +93,6 @@ Placer::Placer(const Netlist<>& net_list, print_place(nullptr, nullptr, placer_opts.write_initial_place_file.c_str(), placer_state_.block_locs()); } -#ifdef ENABLE_ANALYTIC_PLACE - /* - * Cluster-level Analytic Placer: - * Passes in the initial_placement via vpr_context, and passes its placement back via locations marked on - * both the clb_netlist and the gird. - * Most of anneal is disabled later by setting initial temperature to 0 and only further optimizes in quench - */ - if (placer_opts.enable_analytic_placer) { - AnalyticPlacer{blk_loc_registry, place_macros}.ap_place(); - } - -#endif /* ENABLE_ANALYTIC_PLACE */ - // Update physical pin values for (const ClusterBlockId block_id : cluster_ctx.clb_nlist.blocks()) { blk_loc_registry.place_sync_external_block_connections(block_id); @@ -291,12 +277,6 @@ void Placer::place() { const auto& timing_ctx = g_vpr_ctx.timing(); const auto& cluster_ctx = g_vpr_ctx.clustering(); bool analytic_place_enabled = false; -#ifdef ENABLE_ANALYTIC_PLACE - // Cluster-level analytic placer: when enabled, skip most of the annealing and go straight to quench - if (placer_opts_.enable_analytic_placer) { - analytic_place_enabled = true; - } -#endif if (!analytic_place_enabled && !quench_only_) { // Table header diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_analytic_placer/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_analytic_placer/config/config.txt deleted file mode 100644 index b1a0c92126..0000000000 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_analytic_placer/config/config.txt +++ /dev/null @@ -1,28 +0,0 @@ -############################################## -# Configuration file for running experiments -############################################## - -# Path to directory of circuits to use -circuits_dir=benchmarks/verilog - -# Path to directory of architectures to use -archs_dir=arch/timing - -# Add circuits to list to sweep -circuit_list_add=ch_intrinsics.v - -# Add architectures to list to sweep -arch_list_add=k6_frac_N10_mem32K_40nm.xml - -# Parse info and how to parse -parse_file=vpr_standard.txt - -# How to parse QoR info -qor_parse_file=qor_standard.txt - -# Pass requirements -pass_requirements_file=pass_requirements.txt - -# Script parameters -#script_params="" -script_params = -track_memory_usage --enable_analytic_placer true diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_analytic_placer/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_analytic_placer/config/golden_results.txt deleted file mode 100644 index 249ce143da..0000000000 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_analytic_placer/config/golden_results.txt +++ /dev/null @@ -1,2 +0,0 @@ -arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time initial_placed_wirelength_est placed_wirelength_est total_swap accepted_swap rejected_swap aborted_swap place_mem place_time place_quench_time initial_placed_CPD_est placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time ap_mem ap_time ap_full_legalizer_mem ap_full_legalizer_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time -k6_frac_N10_mem32K_40nm.xml ch_intrinsics.v common 1.60 vpr 66.94 MiB -1 -1 0.22 18440 3 0.06 -1 -1 33128 -1 -1 68 99 1 0 success v8.0.0-12648-g259ceba57-dirty release IPO VTR_ASSERT_LEVEL=2 Clang 18.1.3 on Linux-6.8.0-58-generic x86_64 2025-05-06T12:34:13 betzgrp-wintermute /home/zhan6738/VTR/vtr-verilog-to-routing/vtr_flow/tasks 68544 99 130 344 474 1 228 298 12 12 144 clb auto 27.5 MiB 0.10 863 800 1293 264 867 162 66.9 MiB 0.04 0.00 1.86362 1.90582 -117.68 -1.90582 1.90582 0.09 0.000566314 0.000530143 0.0035014 0.00338303 -1 -1 -1 -1 40 1473 16 5.66058e+06 4.21279e+06 333335. 2314.82 0.32 0.11664 0.106115 12666 64609 -1 1318 11 405 616 29250 9869 1.99389 1.99389 -129.176 -1.99389 -0.260939 -0.108257 419432. 2912.72 0.01 0.03 0.04 -1 -1 0.01 0.0187502 0.0175858 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt index 443a41856f..303cdc93c8 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/task_list.txt @@ -13,7 +13,6 @@ regression_tests/vtr_reg_strong/strong_ap/qp_hybrid_analytical_solver regression_tests/vtr_reg_strong/strong_ap/lp_b2b_analytical_solver regression_tests/vtr_reg_strong/strong_absorb_buffers regression_tests/vtr_reg_strong/strong_analysis_only -regression_tests/vtr_reg_strong/strong_analytic_placer regression_tests/vtr_reg_strong/strong_bidir regression_tests/vtr_reg_strong/strong_binary regression_tests/vtr_reg_strong/strong_binary_heap diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong_odin/strong_analytic_placer/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong_odin/strong_analytic_placer/config/config.txt deleted file mode 100644 index 4dd0bc69c7..0000000000 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong_odin/strong_analytic_placer/config/config.txt +++ /dev/null @@ -1,28 +0,0 @@ -############################################## -# Configuration file for running experiments -############################################## - -# Path to directory of circuits to use -circuits_dir=benchmarks/verilog - -# Path to directory of architectures to use -archs_dir=arch/timing - -# Add circuits to list to sweep -circuit_list_add=ch_intrinsics.v - -# Add architectures to list to sweep -arch_list_add=k6_frac_N10_mem32K_40nm.xml - -# Parse info and how to parse -parse_file=vpr_standard.txt - -# How to parse QoR info -qor_parse_file=qor_standard.txt - -# Pass requirements -pass_requirements_file=pass_requirements.txt - -# Script parameters -#script_params="" -script_params =-start odin -track_memory_usage --enable_analytic_placer true diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong_odin/strong_analytic_placer/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong_odin/strong_analytic_placer/config/golden_results.txt deleted file mode 100644 index 300983f84f..0000000000 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong_odin/strong_analytic_placer/config/golden_results.txt +++ /dev/null @@ -1,2 +0,0 @@ -arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time initial_placed_wirelength_est placed_wirelength_est total_swap accepted_swap rejected_swap aborted_swap place_mem place_time place_quench_time initial_placed_CPD_est placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time ap_mem ap_time ap_full_legalizer_mem ap_full_legalizer_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time -k6_frac_N10_mem32K_40nm.xml ch_intrinsics.v common 3.81 odin 100.50 MiB 2.18 102912 -1 -1 3 0.19 -1 -1 34104 -1 -1 68 99 1 0 success v8.0.0-12648-g259ceba57-dirty release IPO VTR_ASSERT_LEVEL=2 Clang 18.1.3 on Linux-6.8.0-58-generic x86_64 2025-05-06T12:34:13 betzgrp-wintermute /home/zhan6738/VTR/vtr-verilog-to-routing/vtr_flow/tasks 67628 99 130 363 493 1 252 298 12 12 144 clb auto 26.8 MiB 0.07 1168 1026 1293 339 819 135 66.0 MiB 0.05 0.00 2.24785 2.1902 -216.85 -2.1902 2.1902 0.09 0.000548458 0.000511942 0.00340329 0.00328177 -1 -1 -1 -1 38 1909 17 5.66058e+06 4.21279e+06 319130. 2216.18 0.37 0.116514 0.106136 12522 62564 -1 1585 12 545 712 54323 17831 2.61371 2.61371 -231.046 -2.61371 0 0 406292. 2821.48 0.01 0.03 0.04 -1 -1 0.01 0.0185263 0.0173288 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong_odin/task_list.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong_odin/task_list.txt index 37eedf040f..4df0977db1 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong_odin/task_list.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong_odin/task_list.txt @@ -1,6 +1,5 @@ regression_tests/vtr_reg_strong_odin/strong_absorb_buffers regression_tests/vtr_reg_strong_odin/strong_analysis_only -regression_tests/vtr_reg_strong_odin/strong_analytic_placer regression_tests/vtr_reg_strong_odin/strong_bidir regression_tests/vtr_reg_strong_odin/strong_binary regression_tests/vtr_reg_strong_odin/strong_blocks_with_no_inputs