Skip to content

Commit 91cb6c1

Browse files
committed
[ParallelRouter] Added Customized Heap and Occupancy Profiling for MQ
Added a customized heap with an index-from-one optimization and the ability to drain/clear the heap directly. Added a heap occupancy profiling method to gain insight into the MQ heap occupancy and workload.
1 parent e4af5f5 commit 91cb6c1

File tree

4 files changed

+104
-1
lines changed

4 files changed

+104
-1
lines changed

vpr/src/route/multi_queue_priority_queue.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ void MultiQueuePriorityQueue::init_heap(const DeviceGrid& grid) {
2020
}
2121

2222
bool MultiQueuePriorityQueue::try_pop(pq_prio_t &prio, RRNodeId &node) {
23-
auto tmp = pq_->tryPop();
23+
auto tmp = pq_->tryPopWithMinPrio();
2424
if (!tmp) {
2525
return false;
2626
} else {

vpr/src/route/multi_queue_priority_queue.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@ class MultiQueuePriorityQueue {
3434
void build_heap();
3535
// Returns the push counter reported by the underlying queue (pq_->getNumPushes()).
inline uint64_t getNumPushes() const { return pq_->getNumPushes(); }
3636
// Returns the pop counter reported by the underlying queue (pq_->getNumPops()).
inline uint64_t getNumPops() const { return pq_->getNumPops(); }
37+
// Returns the value of pq_->getQueueOccupancy(); used by the PROFILE_HEAP_OCCUPANCY
// instrumentation in the parallel connection router.
// NOTE(review): presumably the number of entries currently held across the queues — confirm in MQ_IO.
inline uint64_t getHeapOccupancy() const { return pq_->getQueueOccupancy(); }
3738
// Forwards to pq_->reset().
// NOTE(review): exactly which state (entries, counters, min-prio bound) is reset is defined by MQ_IO — confirm there.
inline void reset() { pq_->reset(); }
39+
// Forwards min_prio to pq_->setMinPrioForPop().
// NOTE(review): presumably this is the bound consulted by tryPopWithMinPrio() in try_pop() — confirm in MQ_IO.
inline void setMinPrio(const pq_prio_t min_prio) { pq_->setMinPrioForPop(min_prio); }
3840

3941
private:
4042
MQ_IO* pq_;

vpr/src/route/parallel_connection_router.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,8 +382,22 @@ void ParallelConnectionRouter::timing_driven_route_connection_from_heap_thread_f
382382
// cheapest t_heap in current route tree to be expanded on
383383
float new_total_cost;
384384
RRNodeId inode;
385+
#ifdef PROFILE_HEAP_OCCUPANCY
386+
unsigned count = 0;
387+
if (thread_idx == 0) {
388+
heap_occ_profile_ << size_t(sink_node) << "\n";
389+
}
390+
#endif
385391
// While the heap is not empty do
386392
while (heap_.try_pop(new_total_cost, inode)) {
393+
#ifdef PROFILE_HEAP_OCCUPANCY
394+
if (thread_idx == 0) {
395+
if (count % (1000 / mq_num_threads) == 0) {
396+
heap_occ_profile_ << count << " " << heap_.getHeapOccupancy() << "\n";
397+
}
398+
count ++;
399+
}
400+
#endif
387401
// update_router_stats(router_stats_,
388402
// false,
389403
// cheapest->index,
@@ -392,6 +406,7 @@ void ParallelConnectionRouter::timing_driven_route_connection_from_heap_thread_f
392406
// Should we explore the neighbors of this node?
393407

394408
if (inode == sink_node) {
409+
heap_.setMinPrio(new_total_cost);
395410
continue;
396411
}
397412

vpr/src/route/parallel_connection_router.h

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
#include "router_stats.h"
1212
#include "spatial_route_tree_lookup.h"
1313

14+
#include <fstream>
#include <stdexcept>
15+
16+
#define ENABLE_CORE_AFFINITY
17+
1418
#define VPR_PARALLEL_CONNECTION_ROUTER_USE_MULTI_QUEUE
1519
// #define VPR_PARALLEL_CONNECTION_ROUTER_USE_ONE_TBB
1620

@@ -110,6 +114,41 @@ class barrier_spin_t {
110114

111115
using barrier_t = barrier_spin_t;
112116

117+
/**
 * @brief Splits a string into tokens at every occurrence of a delimiter.
 *
 * Empty tokens are preserved: a leading delimiter, a trailing delimiter, or two
 * consecutive delimiters each produce an empty string. The result therefore
 * always holds (number of delimiters + 1) tokens, so an empty input yields a
 * single empty token.
 *
 * @param str       Text to split. Taken by const reference so callers do not pay for a copy.
 * @param delimiter Character separating the tokens; it never appears in the output.
 * @return The tokens in their order of appearance.
 */
inline std::vector<std::string> get_tokens_split_by_delimiter(const std::string& str, char delimiter) {
    std::vector<std::string> tokens;

    std::string::size_type token_start = 0;
    std::string::size_type delim_pos = str.find(delimiter);
    while (delim_pos != std::string::npos) {
        tokens.push_back(str.substr(token_start, delim_pos - token_start));
        token_start = delim_pos + 1;
        delim_pos = str.find(delimiter, token_start);
    }

    // Whatever follows the last delimiter (possibly nothing) is the final token.
    // token_start is at most str.size() here, which substr() accepts.
    tokens.push_back(str.substr(token_start));
    return tokens;
}
131+
132+
inline std::vector<size_t> parse_core_affinity_list(std::string str) {
133+
std::vector<size_t> core_affinity_list;
134+
std::vector<std::string> lv1_tokens_split_by_comma = get_tokens_split_by_delimiter(str, ',');
135+
for (const auto &l1_token : lv1_tokens_split_by_comma) {
136+
std::vector<std::string> lv2_tokens_split_by_dash = get_tokens_split_by_delimiter(l1_token, '-');
137+
size_t num_lv2_tokens = lv2_tokens_split_by_dash.size();
138+
assert(num_lv2_tokens == 1 || num_lv2_tokens == 2);
139+
if (num_lv2_tokens == 2) {
140+
int start_core_id = std::stoi(lv2_tokens_split_by_dash[0]);
141+
int end_core_id = std::stoi(lv2_tokens_split_by_dash[1]);
142+
for (int i = start_core_id; i <= end_core_id; ++i) {
143+
core_affinity_list.push_back(i);
144+
}
145+
} else {
146+
core_affinity_list.push_back(std::stoi(lv2_tokens_split_by_dash[0]));
147+
}
148+
}
149+
return core_affinity_list;
150+
}
151+
113152
// Prune the heap when it contains 4x the number of nodes in the RR graph.
114153
// constexpr size_t kHeapPruneFactor = 4;
115154

@@ -154,17 +193,59 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
154193
std::cout << "#T=" << mq_num_threads << " #Q=" << mq_num_queues << std::endl << std::flush;
155194
sub_threads_.resize(mq_num_threads-1);
156195
thread_barrier_.init();
196+
197+
#ifdef PROFILE_HEAP_OCCUPANCY
198+
heap_occ_profile_.open("occupancy.txt", std::ios::trunc);
199+
#endif
200+
201+
#ifdef ENABLE_CORE_AFFINITY
202+
std::vector<size_t> thread_core_affinity_mapping;
203+
if (std::getenv("VPR_CORE_AFFINITY")) {
204+
thread_core_affinity_mapping = parse_core_affinity_list(std::getenv("VPR_CORE_AFFINITY"));
205+
assert(thread_core_affinity_mapping.size() == mq_num_threads);
206+
} else {
207+
for (size_t i = 0; i < mq_num_threads; ++i) {
208+
thread_core_affinity_mapping.push_back(i);
209+
}
210+
}
211+
#endif
212+
157213
for (size_t i = 0 ; i < mq_num_threads - 1; ++i) {
158214
sub_threads_[i] = std::thread(&ParallelConnectionRouter::timing_driven_route_connection_from_heap_sub_thread_wrapper, this, i + 1 /*0: main thread*/);
215+
// Create a cpu_set_t object representing a set of CPUs. Clear it and mark only the core mapped to this sub-thread (thread_core_affinity_mapping[i + 1]) as set.
216+
#ifdef ENABLE_CORE_AFFINITY
217+
cpu_set_t cpuset;
218+
CPU_ZERO(&cpuset);
219+
CPU_SET(thread_core_affinity_mapping[i + 1], &cpuset);
220+
int rc = pthread_setaffinity_np(sub_threads_[i].native_handle(),
221+
sizeof(cpu_set_t), &cpuset);
222+
if (rc != 0) {
223+
VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc);
224+
}
225+
#endif
159226
sub_threads_[i].detach();
160227
}
228+
#ifdef ENABLE_CORE_AFFINITY
229+
cpu_set_t cpuset;
230+
CPU_ZERO(&cpuset);
231+
CPU_SET(thread_core_affinity_mapping[0], &cpuset);
232+
int rc = pthread_setaffinity_np(pthread_self(),
233+
sizeof(cpu_set_t), &cpuset);
234+
if (rc != 0) {
235+
VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc);
236+
}
237+
#endif
161238
}
162239

163240
/**
 * Flags the router as shutting down, passes the thread barrier once more, logs
 * the accumulated SSSP time, and (when heap-occupancy profiling is compiled in)
 * closes the profiling output file.
 */
~ParallelConnectionRouter() {
    // Set the flag BEFORE waiting on the barrier: the order matters because the
    // barrier is what releases the other participants.
    // NOTE(review): presumably the detached sub-threads check is_router_destroying_
    // after the barrier and then exit — confirm against the sub-thread wrapper.
    is_router_destroying_ = true;
    thread_barrier_.wait();

    // sssp_total_time is kept in microseconds; divide by 1e6 to report seconds.
    VTR_LOG("Parallel Connection Router is being destroyed. Time spent computing SSSP: %g seconds.\n", this->sssp_total_time.count() / 1000000.0);

#ifdef PROFILE_HEAP_OCCUPANCY
    heap_occ_profile_.close();
#endif
}
169250

170251
// Clears the modified list. Should be called after reset_path_costs
@@ -424,6 +505,11 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
424505

425506
// Timing
426507
std::chrono::microseconds sssp_total_time{0};
508+
509+
// Profiling
510+
#ifdef PROFILE_HEAP_OCCUPANCY
511+
std::ofstream heap_occ_profile_;
512+
#endif
427513
};
428514

429515
#endif /* _PARALLEL_CONNECTION_ROUTER_H */

0 commit comments

Comments
 (0)