Skip to content

Commit a2685c7

Browse files
committed
Changed major place.cpp data structures from file scope to global scope. Moved delay routines to place_delay_model.*. Moved annealing update routines to place_util.*. Enhanced documentation.
1 parent 870eca6 commit a2685c7

File tree

8 files changed

+367
-261
lines changed

8 files changed

+367
-261
lines changed

vpr/src/place/place.cpp

Lines changed: 95 additions & 208 deletions
Large diffs are not rendered by default.

vpr/src/place/place_delay_model.cpp

Lines changed: 90 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
#include "vtr_math.h"
1111
#include "vpr_error.h"
1212

13+
#include "place_global.h"
14+
1315
#ifdef VTR_ENABLE_CAPNPROTO
1416
# include "capnp/serialize.h"
1517
# include "place_delay_model.capnp.h"
@@ -18,10 +20,7 @@
1820
# include "serdes_utils.h"
1921
#endif /* VTR_ENABLE_CAPNPROTO */
2022

21-
/*
22-
* DeltaDelayModel
23-
*/
24-
23+
///@brief DeltaDelayModel methods.
2524
float DeltaDelayModel::delay(int from_x, int from_y, int /*from_pin*/, int to_x, int to_y, int /*to_pin*/) const {
2625
int delta_x = std::abs(from_x - to_x);
2726
int delta_y = std::abs(from_y - to_y);
@@ -46,9 +45,11 @@ void DeltaDelayModel::dump_echo(std::string filepath) const {
4645
vtr::fclose(f);
4746
}
4847

49-
/*
50-
* OverrideDelayModel
51-
*/
48+
///@brief Returns a non-owning pointer to the base delay model; ownership stays with `base_delay_model_`.
///       NOTE(review): presumably null until set_base_delay_model() has been called — confirm.
const DeltaDelayModel* OverrideDelayModel::base_delay_model() const {
49+
return base_delay_model_.get();
50+
}
51+
52+
///@brief OverrideDelayModel methods.
5253
float OverrideDelayModel::delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin) const {
5354
//First check to if there is an override delay value
5455
auto& device_ctx = g_vpr_ctx.device();
@@ -136,18 +137,14 @@ float OverrideDelayModel::get_delay_override(int from_type, int from_class, int
136137
return iter->second;
137138
}
138139

139-
const DeltaDelayModel* OverrideDelayModel::base_delay_model() const {
140-
return base_delay_model_.get();
141-
}
142-
143140
///@brief Takes ownership of `base_delay_model_obj` and stores it as the base delay model,
///       replacing whatever `base_delay_model_` held before.
void OverrideDelayModel::set_base_delay_model(std::unique_ptr<DeltaDelayModel> base_delay_model_obj) {
144141
base_delay_model_ = std::move(base_delay_model_obj);
145142
}
146143

147-
// When writing capnp targetted serialization, always allow compilation when
148-
// VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception
149-
// instead.
150-
//
144+
/**
145+
* When writing capnp targeted serialization, always allow compilation when
146+
* VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception instead.
147+
*/
151148
#ifndef VTR_ENABLE_CAPNPROTO
152149

153150
# define DISABLE_ERROR \
@@ -300,3 +297,81 @@ void OverrideDelayModel::write(const std::string& file) const {
300297
}
301298

302299
#endif
300+
301+
///@brief Initialize the placer delay model.
302+
std::unique_ptr<PlaceDelayModel> alloc_lookups_and_delay_model(t_chan_width_dist chan_width_dist,
303+
const t_placer_opts& placer_opts,
304+
const t_router_opts& router_opts,
305+
t_det_routing_arch* det_routing_arch,
306+
std::vector<t_segment_inf>& segment_inf,
307+
const t_direct_inf* directs,
308+
const int num_directs) {
309+
return compute_place_delay_model(placer_opts, router_opts, det_routing_arch, segment_inf,
310+
chan_width_dist, directs, num_directs);
311+
}
312+
313+
/**
314+
* @brief Returns the delay of one point to point connection.
315+
*
316+
* Only estimate delay for signals routed through the inter-block routing network.
317+
* TODO: Do how should we compute the delay for globals. "Global signals are assumed to have zero delay."
318+
*/
319+
float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin) {
320+
auto& cluster_ctx = g_vpr_ctx.clustering();
321+
auto& place_ctx = g_vpr_ctx.placement();
322+
323+
float delay_source_to_sink = 0.;
324+
325+
if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
326+
ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id);
327+
ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin);
328+
329+
ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin);
330+
ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin);
331+
332+
int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin);
333+
int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin);
334+
335+
int source_x = place_ctx.block_locs[source_block].loc.x;
336+
int source_y = place_ctx.block_locs[source_block].loc.y;
337+
int sink_x = place_ctx.block_locs[sink_block].loc.x;
338+
int sink_y = place_ctx.block_locs[sink_block].loc.y;
339+
340+
/**
341+
* This heuristic only considers delta_x and delta_y, a much better
342+
* heuristic would be to to create a more comprehensive lookup table.
343+
*
344+
* In particular this approach does not accurately capture the effect
345+
* of fast carry-chain connections.
346+
*/
347+
delay_source_to_sink = delay_model->delay(source_x,
348+
source_y,
349+
source_block_ipin,
350+
sink_x,
351+
sink_y,
352+
sink_block_ipin);
353+
if (delay_source_to_sink < 0) {
354+
VPR_ERROR(VPR_ERROR_PLACE,
355+
"in comp_td_single_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d) to %s (at %d,%d)\n"
356+
"in comp_td_single_connection_delay: Delay is less than 0\n",
357+
block_type_pin_index_to_name(physical_tile_type(source_block), source_block_ipin).c_str(),
358+
source_x, source_y,
359+
block_type_pin_index_to_name(physical_tile_type(sink_block), sink_block_ipin).c_str(),
360+
sink_x, sink_y,
361+
delay_source_to_sink);
362+
}
363+
}
364+
365+
return (delay_source_to_sink);
366+
}
367+
368+
///@brief Recompute all point to point delays, updating `connection_delay` matrix.
369+
void comp_td_connection_delays(const PlaceDelayModel* delay_model) {
370+
const auto& cluster_ctx = g_vpr_ctx.clustering();
371+
372+
for (auto net_id : cluster_ctx.clb_nlist.nets()) {
373+
for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) {
374+
connection_delay[net_id][ipin] = comp_td_single_connection_delay(delay_model, net_id, ipin);
375+
}
376+
}
377+
}

vpr/src/place/place_delay_model.h

Lines changed: 55 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
/**
2+
* @file
3+
* @brief This file contains all the class and function declarations related to
4+
* the placer delay model. For implementations, see place_delay_model.cpp.
5+
*/
6+
17
#ifndef PLACE_DELAY_MODEL_H
28
#define PLACE_DELAY_MODEL_H
39

@@ -20,38 +26,63 @@
2026
# define ALWAYS_INLINE inline
2127
#endif
2228

23-
//Abstract interface to a placement delay model
29+
///@brief Forward declarations.
30+
class PlaceDelayModel;
31+
32+
///@brief Initialize the placer delay model.
33+
std::unique_ptr<PlaceDelayModel> alloc_lookups_and_delay_model(t_chan_width_dist chan_width_dist,
34+
const t_placer_opts& place_opts,
35+
const t_router_opts& router_opts,
36+
t_det_routing_arch* det_routing_arch,
37+
std::vector<t_segment_inf>& segment_inf,
38+
const t_direct_inf* directs,
39+
const int num_directs);
40+
41+
///@brief Returns the delay of one point to point connection.
42+
float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin);
43+
44+
///@brief Recompute all point to point delays, updating `connection_delay` matrix.
45+
void comp_td_connection_delays(const PlaceDelayModel* delay_model);
46+
47+
///@brief Abstract interface to a placement delay model.
2448
class PlaceDelayModel {
2549
public:
2650
virtual ~PlaceDelayModel() = default;
2751

28-
// Computes place delay model.
52+
///@brief Computes place delay model.
2953
virtual void compute(
3054
RouterDelayProfiler& route_profiler,
3155
const t_placer_opts& placer_opts,
3256
const t_router_opts& router_opts,
3357
int longest_length)
3458
= 0;
3559

36-
//Returns the delay estimate between the specified block pins
37-
//
38-
// Either compute or read methods must be invoked before invoking
39-
// delay.
60+
/**
61+
* @brief Returns the delay estimate between the specified block pins.
62+
*
63+
* Either compute or read methods must be invoked before invoking delay.
64+
*/
4065
virtual float delay(int from_x, int from_y, int from_pin, int to_x, int to_y, int to_pin) const = 0;
4166

42-
//Dumps the delay model to an echo file
67+
///@brief Dumps the delay model to an echo file.
4368
virtual void dump_echo(std::string filename) const = 0;
4469

45-
// Write place delay model to specified file.
46-
// May be unimplemented, in which case method should throw an exception.
70+
/**
71+
* @brief Write place delay model to specified file.
72+
*
73+
* May be unimplemented, in which case method should throw an exception.
74+
*/
4775
virtual void write(const std::string& file) const = 0;
4876

49-
// Read place delay model from specified file.
50-
// May be unimplemented, in which case method should throw an exception.
77+
/**
78+
* @brief Read place delay model from specified file.
79+
*
80+
* May be unimplemented, in which case method should throw an exception.
81+
*/
5182
virtual void read(const std::string& file) = 0;
5283
};
5384

54-
//A simple delay model based on the distance (delta) between block locations
85+
///@brief A simple delay model based on the distance (delta) between block locations.
5586
class DeltaDelayModel : public PlaceDelayModel {
5687
public:
5788
DeltaDelayModel() {}
@@ -109,10 +140,13 @@ class OverrideDelayModel : public PlaceDelayModel {
109140
short delta_x;
110141
short delta_y;
111142

112-
//A combination of ALWAYS_INLINE attribute and std::lexicographical_compare
113-
//is required for operator< to be inlined by compiler.
114-
//Proper inlining of the function reduces place time by around 5%.
115-
//For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225
143+
/**
144+
* A combination of ALWAYS_INLINE attribute and std::lexicographical_compare
145+
* is required for operator< to be inlined by compiler. Proper inlining of the
146+
* function reduces place time by around 5%.
147+
*
148+
* For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225
149+
*/
116150
friend ALWAYS_INLINE bool operator<(const t_override& lhs, const t_override& rhs) {
117151
const short* left = reinterpret_cast<const short*>(&lhs);
118152
const short* right = reinterpret_cast<const short*>(&rhs);
@@ -123,8 +157,11 @@ class OverrideDelayModel : public PlaceDelayModel {
123157

124158
vtr::flat_map2<t_override, float> delay_overrides_;
125159

126-
//operator< treats memory layout of t_override as an array of short
127-
//this requires all members of t_override are shorts and there is no padding between members of t_override
160+
/**
161+
* operator< treats memory layout of t_override as an array of short.
162+
* This requires all members of t_override are shorts and there is no
163+
* padding between members of t_override.
164+
*/
128165
static_assert(sizeof(t_override) == sizeof(t_override::from_type) + sizeof(t_override::to_type) + sizeof(t_override::from_class) + sizeof(t_override::to_class) + sizeof(t_override::delta_x) + sizeof(t_override::delta_y), "Expect t_override to have a memory layout equivalent to an array of short (no padding)");
129166
static_assert(sizeof(t_override::from_type) == sizeof(short), "Expect all t_override data members to be shorts");
130167
static_assert(sizeof(t_override::to_type) == sizeof(short), "Expect all t_override data members to be shorts");

vpr/src/place/place_global.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/**
2+
* @file
3+
* @brief This file contains all the global data structures referenced across
4+
* multiple files in ./vpr/src/place.
5+
*
6+
* These global data structures were originally local to place.cpp, and they
7+
* were referenced by a lot of routines local to place.cpp. However, to shorten
8+
* the file size of place.cpp, these routines are moved to other files.
9+
*
10+
* Instead of elongating the argument list of the moved routines, I moved the
11+
* data structures here so that they can be easily shared across different
12+
* files.
13+
*
14+
* For detailed descriptions on what each data structure stores, please see
15+
* place.cpp, where these variables are defined.
16+
*/
17+
18+
#pragma once
19+
#include <vector>
20+
#include "vtr_vector.h"
21+
#include "vpr_net_pins_matrix.h"
22+
#include "timing_place.h"
23+
24+
extern vtr::vector<ClusterNetId, double> net_cost, proposed_net_cost;
25+
extern vtr::vector<ClusterNetId, char> bb_updated_before;
26+
extern ClbNetPinsMatrix<float> connection_delay;
27+
extern ClbNetPinsMatrix<float> proposed_connection_delay;
28+
extern ClbNetPinsMatrix<float> connection_setup_slack;
29+
extern PlacerTimingCosts connection_timing_cost;
30+
extern ClbNetPinsMatrix<double> proposed_connection_timing_cost;
31+
extern vtr::vector<ClusterNetId, double> net_timing_cost;
32+
extern vtr::vector<ClusterNetId, t_bb> bb_coords, bb_num_on_edges;
33+
extern vtr::vector<ClusterNetId, t_bb> ts_bb_coord_new, ts_bb_edge_new;
34+
extern std::vector<ClusterNetId> ts_nets_to_update;

vpr/src/place/place_util.cpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "globals.h"
33

44
static vtr::Matrix<t_grid_blocks> init_grid_blocks();
5+
static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid);
56

67
void init_placement_context() {
78
auto& place_ctx = g_vpr_ctx.mutable_placement();
@@ -119,3 +120,84 @@ int get_initial_move_lim(const t_placer_opts& placer_opts, const t_annealing_sch
119120

120121
return move_lim;
121122
}
123+
124+
/**
125+
* @brief Update the annealing state according to the annealing schedule selected.
126+
*
127+
* USER_SCHED: A manual fixed schedule with fixed alpha and exit criteria.
128+
* AUTO_SCHED: A more sophisticated schedule where alpha varies based on success ratio.
129+
* DUSTY_SCHED: This schedule jumps backward and slows down in response to success ratio.
130+
* See doc/src/vpr/dusty_sa.rst for more details.
131+
*
132+
* Returns true until the schedule is finished.
133+
*/
134+
bool update_annealing_state(t_annealing_state* state,
135+
float success_rat,
136+
const t_placer_costs& costs,
137+
const t_placer_opts& placer_opts,
138+
const t_annealing_sched& annealing_sched) {
139+
/* Return `false` when the exit criterion is met. */
140+
if (annealing_sched.type == USER_SCHED) {
141+
state->t *= annealing_sched.alpha_t;
142+
return state->t >= annealing_sched.exit_t;
143+
}
144+
145+
auto& device_ctx = g_vpr_ctx.device();
146+
auto& cluster_ctx = g_vpr_ctx.clustering();
147+
148+
/* Automatic annealing schedule */
149+
float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size();
150+
151+
if (annealing_sched.type == DUSTY_SCHED) {
152+
bool restart_temp = state->t < t_exit || std::isnan(t_exit); //May get nan if there are no nets
153+
if (success_rat < annealing_sched.success_min || restart_temp) {
154+
if (state->alpha > annealing_sched.alpha_max) return false;
155+
state->t = state->restart_t / sqrt(state->alpha); // Take a half step from the restart temperature.
156+
state->alpha = 1.0 - ((1.0 - state->alpha) * annealing_sched.alpha_decay);
157+
} else {
158+
if (success_rat > annealing_sched.success_target) {
159+
state->restart_t = state->t;
160+
}
161+
state->t *= state->alpha;
162+
}
163+
state->move_lim = std::max(1, std::min(state->move_lim_max, (int)(state->move_lim_max * (annealing_sched.success_target / success_rat))));
164+
} else { /* annealing_sched.type == AUTO_SCHED */
165+
if (success_rat > 0.96) {
166+
state->alpha = 0.5;
167+
} else if (success_rat > 0.8) {
168+
state->alpha = 0.9;
169+
} else if (success_rat > 0.15 || state->rlim > 1.) {
170+
state->alpha = 0.95;
171+
} else {
172+
state->alpha = 0.8;
173+
}
174+
state->t *= state->alpha;
175+
176+
// Must be duplicated to retain previous behavior
177+
if (state->t < t_exit || std::isnan(t_exit)) return false;
178+
}
179+
180+
// Gradually changes from the initial crit_exponent to the final crit_exponent based on how much the range limit has shrunk.
181+
// The idea is that as the range limit shrinks (indicating we are fine-tuning a more optimized placement) we can focus more on a smaller number of critical connections, which a higher crit_exponent achieves.
182+
update_rlim(&state->rlim, success_rat, device_ctx.grid);
183+
184+
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
185+
state->crit_exponent = (1 - (state->rlim - state->final_rlim()) * state->inverse_delta_rlim)
186+
* (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first)
187+
+ placer_opts.td_place_exp_first;
188+
}
189+
190+
return true;
191+
}
192+
193+
/**
194+
* @brief Update the range limited to keep acceptance prob. near 0.44.
195+
*
196+
* Use a floating point rlim to allow gradual transitions at low temps.
197+
*/
198+
static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) {
199+
float upper_lim = std::max(grid.width() - 1, grid.height() - 1);
200+
201+
*rlim *= (1. - 0.44 + success_rat);
202+
*rlim = std::max(std::min(*rlim, upper_lim), 1.f);
203+
}

0 commit comments

Comments
 (0)