|
#include "router_stats.h"
#include "spatial_route_tree_lookup.h"

#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>
| 15 | + |
| 16 | +#define ENABLE_CORE_AFFINITY |
| 17 | + |
14 | 18 | #define VPR_PARALLEL_CONNECTION_ROUTER_USE_MULTI_QUEUE
|
15 | 19 | // #define VPR_PARALLEL_CONNECTION_ROUTER_USE_ONE_TBB
|
16 | 20 |
|
@@ -110,6 +114,41 @@ class barrier_spin_t {
|
110 | 114 |
|
111 | 115 | using barrier_t = barrier_spin_t;
|
112 | 116 |
|
/**
 * @brief Splits a string into tokens at every occurrence of a delimiter.
 *
 * Empty tokens are preserved: consecutive delimiters, or a delimiter at the
 * very start or end of the input, produce empty strings. The result therefore
 * always holds exactly (number of delimiters + 1) tokens, and an empty input
 * yields a single empty token.
 *
 * @param str        Text to split. Taken by const reference so the input is
 *                   not copied on every call.
 * @param delimiter  Character separating the tokens; never part of a token.
 * @return The tokens, in order of appearance.
 */
inline std::vector<std::string> get_tokens_split_by_delimiter(const std::string& str, char delimiter) {
    std::vector<std::string> tokens;

    std::string::size_type token_begin = 0;
    std::string::size_type delimiter_pos = str.find(delimiter);
    while (delimiter_pos != std::string::npos) {
        tokens.push_back(str.substr(token_begin, delimiter_pos - token_begin));
        token_begin = delimiter_pos + 1;
        delimiter_pos = str.find(delimiter, token_begin);
    }

    // Whatever follows the last delimiter (possibly nothing) is the final token.
    tokens.push_back(str.substr(token_begin));
    return tokens;
}
| 131 | + |
| 132 | +inline std::vector<size_t> parse_core_affinity_list(std::string str) { |
| 133 | + std::vector<size_t> core_affinity_list; |
| 134 | + std::vector<std::string> lv1_tokens_split_by_comma = get_tokens_split_by_delimiter(str, ','); |
| 135 | + for (const auto &l1_token : lv1_tokens_split_by_comma) { |
| 136 | + std::vector<std::string> lv2_tokens_split_by_dash = get_tokens_split_by_delimiter(l1_token, '-'); |
| 137 | + size_t num_lv2_tokens = lv2_tokens_split_by_dash.size(); |
| 138 | + assert(num_lv2_tokens == 1 || num_lv2_tokens == 2); |
| 139 | + if (num_lv2_tokens == 2) { |
| 140 | + int start_core_id = std::stoi(lv2_tokens_split_by_dash[0]); |
| 141 | + int end_core_id = std::stoi(lv2_tokens_split_by_dash[1]); |
| 142 | + for (int i = start_core_id; i <= end_core_id; ++i) { |
| 143 | + core_affinity_list.push_back(i); |
| 144 | + } |
| 145 | + } else { |
| 146 | + core_affinity_list.push_back(std::stoi(lv2_tokens_split_by_dash[0])); |
| 147 | + } |
| 148 | + } |
| 149 | + return core_affinity_list; |
| 150 | +} |
| 151 | + |
113 | 152 | // Prune the heap when it contains 4x the number of nodes in the RR graph.
|
114 | 153 | // constexpr size_t kHeapPruneFactor = 4;
|
115 | 154 |
|
@@ -154,17 +193,59 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
|
154 | 193 | std::cout << "#T=" << mq_num_threads << " #Q=" << mq_num_queues << std::endl << std::flush;
|
155 | 194 | sub_threads_.resize(mq_num_threads-1);
|
156 | 195 | thread_barrier_.init();
|
| 196 | + |
| 197 | +#ifdef PROFILE_HEAP_OCCUPANCY |
| 198 | + heap_occ_profile_.open("occupancy.txt", std::ios::trunc); |
| 199 | +#endif |
| 200 | + |
| 201 | +#ifdef ENABLE_CORE_AFFINITY |
| 202 | + std::vector<size_t> thread_core_affinity_mapping; |
| 203 | + if (std::getenv("VPR_CORE_AFFINITY")) { |
| 204 | + thread_core_affinity_mapping = parse_core_affinity_list(std::getenv("VPR_CORE_AFFINITY")); |
| 205 | + assert(thread_core_affinity_mapping.size() == mq_num_threads); |
| 206 | + } else { |
| 207 | + for (size_t i = 0; i < mq_num_threads; ++i) { |
| 208 | + thread_core_affinity_mapping.push_back(i); |
| 209 | + } |
| 210 | + } |
| 211 | +#endif |
| 212 | + |
157 | 213 | for (size_t i = 0 ; i < mq_num_threads - 1; ++i) {
|
158 | 214 | sub_threads_[i] = std::thread(&ParallelConnectionRouter::timing_driven_route_connection_from_heap_sub_thread_wrapper, this, i + 1 /*0: main thread*/);
|
| 215 | + // Create a cpu_set_t object representing a set of CPUs. Clear it and mark only the core mapped to sub-thread i + 1 as set. |
| 216 | +#ifdef ENABLE_CORE_AFFINITY |
| 217 | + cpu_set_t cpuset; |
| 218 | + CPU_ZERO(&cpuset); |
| 219 | + CPU_SET(thread_core_affinity_mapping[i + 1], &cpuset); |
| 220 | + int rc = pthread_setaffinity_np(sub_threads_[i].native_handle(), |
| 221 | + sizeof(cpu_set_t), &cpuset); |
| 222 | + if (rc != 0) { |
| 223 | + VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc); |
| 224 | + } |
| 225 | +#endif |
159 | 226 | sub_threads_[i].detach();
|
160 | 227 | }
|
| 228 | +#ifdef ENABLE_CORE_AFFINITY |
| 229 | + cpu_set_t cpuset; |
| 230 | + CPU_ZERO(&cpuset); |
| 231 | + CPU_SET(thread_core_affinity_mapping[0], &cpuset); |
| 232 | + int rc = pthread_setaffinity_np(pthread_self(), |
| 233 | + sizeof(cpu_set_t), &cpuset); |
| 234 | + if (rc != 0) { |
| 235 | + VTR_LOG("Error calling pthread_setaffinity_np: %d\n", rc); |
| 236 | + } |
| 237 | +#endif |
161 | 238 | }
|
162 | 239 |
|
163 | 240 | ~ParallelConnectionRouter() {
|
164 | 241 | is_router_destroying_ = true;
|
165 | 242 | thread_barrier_.wait();
|
166 | 243 |
|
167 | 244 | VTR_LOG("Parallel Connection Router is being destroyed. Time spent computing SSSP: %g seconds\n.", this->sssp_total_time.count() / 1000000.0);
|
| 245 | + |
| 246 | +#ifdef PROFILE_HEAP_OCCUPANCY |
| 247 | + heap_occ_profile_.close(); |
| 248 | +#endif |
168 | 249 | }
|
169 | 250 |
|
170 | 251 | // Clears the modified list. Should be called after reset_path_costs
|
@@ -424,6 +505,11 @@ class ParallelConnectionRouter : public ConnectionRouterInterface {
|
424 | 505 |
|
425 | 506 | // Timing
|
426 | 507 | std::chrono::microseconds sssp_total_time{0};
|
| 508 | + |
| 509 | + // Profiling |
| 510 | +#ifdef PROFILE_HEAP_OCCUPANCY |
| 511 | + std::ofstream heap_occ_profile_; |
| 512 | +#endif |
427 | 513 | };
|
428 | 514 |
|
429 | 515 | #endif /* _PARALLEL_CONNECTION_ROUTER_H */
|
0 commit comments