diff --git a/libs/libvtrutil/src/vtr_prefix_sum.h b/libs/libvtrutil/src/vtr_prefix_sum.h new file mode 100644 index 00000000000..030cce9d3d3 --- /dev/null +++ b/libs/libvtrutil/src/vtr_prefix_sum.h @@ -0,0 +1,283 @@ +/** + * @file + * @author Alex Singer + * @date February 2025 + * @brief Definition of the Prefix Sum class which enables O(1) time-complexity + * sums over regions of an unchanging grid of values. + */ + +#pragma once + +#include +#include +#include "vtr_assert.h" +#include "vtr_ndmatrix.h" + +namespace vtr { + +/** + * @brief 1D Prefix Sum manager class. + * + * Given an array of values, it may be necessary to find the sum of values + * within a continuous sub-section of the array. If this operation needs to be + * performed many times, this may be expensive in runtime to calculate. + * + * If the array of values does not change, we can create a prefix sum which will + * allow us to get the sum of values in some continuous sub-section of the array + * in O(1) time, instead of O(k) time where k is the number of values in the + * sub-section. + * + * This class has a space complexity of O(l) where l is the length of the array + * of values. + * + * + * Static Array of Values Example (values stored in a vector): + * + * std::vector vals = {...}; + * + * // Build the Prefix Sum + * vtr::PrefixSum1D prefix_sum(vals); + * + * // Compute the sum of the values between index 3 and 7 of the array (inclusive) + * float sum = prefix_sum.get_sum(3, 7); + * + * + * Dynamic Vector of Values Example (values derived at runtime): + * + * // Build the Prefix Sum using a lambda + * vtr::PrefixSum1D prefix_sum(length, [&](size_t x) { + * // This lambda returns the value that would be in the array at index x. + * return static_cast(x * x); + * }); + * + * // Compute the sum of the values between index 0 and 5 of the array (inclusive) + * float sum = prefix_sum.get_sum(0, 5); + */ +template +class PrefixSum1D { +public: + PrefixSum1D() = default; + + /** + * @brief Construct the 1D prefix sum. + * + * This pre-computes the sums of values in the array, making it faster to + * get the sum of sub-regions of the array later. + * + * This constructor has a time complexity of O(length) + * + * @param length + * The length of the array to a make a prefix sum of. + * @param lookup + * A lambda function which will return the value in the array at + * the given x index. This is a lambda to allow a prefix sum to be + * created, even if the values in the array are not stored in a + * vector (may be computed on the spot). + * @param zero + * What is zero for this data type. For most basic data types (like + * int float, etc.) this parameter can be ignored; for more complex + * data classes (like multi-dimensional vectors) this is necessary + * to be passed in. + */ + PrefixSum1D(size_t length, std::function lookup, T zero = T()) + : prefix_sum_(length + 1, zero) { + // The first value in the prefix sum is already initialized to 0. + + // Initialize the prefix sum. The prefix sum at position x is the sum + // of all values in the original array from 0 to x - 1. + for (size_t x = 1; x < length + 1; x++) { + prefix_sum_[x] = prefix_sum_[x - 1] + lookup(x - 1); + } + } + + /** + * @brief Construct the 1D prefix sum from a vector. + */ + PrefixSum1D(std::vector vals, T zero = T()) + : PrefixSum1D(vals.size(), + [&](size_t x) noexcept { + return vals[x]; + }, + zero) {} + + /** + * @brief Get the sum of all values in the original array of values between + * lower_x and upper_x (inclusive). + * + * Inclusive means that the sum will include the values at lower_x and + * upper_x. + * + * This method has O(1) time complexity. + */ + T get_sum(size_t lower_x, size_t upper_x) const { + // Some safety asserts. + VTR_ASSERT_SAFE_MSG(lower_x <= upper_x, "lower_x is larger than upper_x"); + VTR_ASSERT_SAFE_MSG(lower_x < prefix_sum_.size() - 1, "lower_x out of range"); + VTR_ASSERT_SAFE_MSG(upper_x < prefix_sum_.size() - 1, "upper_x out of range"); + + // The sum of the region lower_x to upper_x inclusive is equal to + // - The sum from 0 to upper_x + // - Minus the sum from 0 to lower_x - 1 + // Note: These are all offset by 1 since the first value is zero. This + // saves us from having to do bound checking. + return prefix_sum_[upper_x + 1] - prefix_sum_[lower_x]; + } + +private: + /** + * @brief The 1D prefix sum of the original array of values. + * + * Index x of the prefix sum contains the sum of all values in the original + * array from 0 to x - 1. The first value in this array is 0. By setting the + * first value in the array to 0, we can avoid bound checking. This data + * structure has the special property that the sum of any sub-array can be + * computed in O(1) time. + */ + std::vector prefix_sum_; +}; + +/** + * @brief 2D Prefix Sum manager class. + * + * Given a 2D grid of values, it may be necessary to find the sum of values + * within some rectangular sub-region of that grid. If this operation needs to + * be performed many times, this may be expensive in runtime to calculate. + * + * If the grid of values does not change, we can create a prefix sum which will + * allow us to get the sum of values in some rectangular sub-region of the + * grid in O(1) time, instead of O(k) time where k is the number of values + * in the region. + * + * This class has a space complexity of O(w * h) where w and h are the width + * and height of the grid of values. + * + * + * Static Matrix of Values Example (values stored in a matrix): + * + * vtr::NdMatrix vals({w, h}); + * + * // ... Initialize vals + * + * // Build the Prefix Sum + * vtr::PrefixSum2D prefix_sum(vals); + * + * // Compute the sum of the rectangular region from (1, 2) to (3, 4) inclusive. + * float sum = prefix_sum.get_sum(1, 2, 3, 4); + * + * + * Dynamic Matrix of Values Example (values derived at runtime): + * + * // Build the Prefix Sum using a lambda + * vtr::PrefixSum2D prefix_sum(w, h, [&](size_t x, size_t y) { + * // This lambda returns the value that would be in the matrix at (x, y) + * return (x + y) / 2.f; + * }); + * + * // Compute the sum of the rectangular region from (0, 4) to (3, 5) inclusive. + * float sum = prefix_sum.get_sum(0, 4, 3, 5); + */ +template +class PrefixSum2D { +public: + PrefixSum2D() = default; + + /** + * @brief Construct the 2D prefix sum. + * + * This pre-computes the sums of values in the grid, making it faster to + * get the sum of sub-regions of the grid later. + * + * This constructor has a time complexity of O(w * h). + * + * @param w + * The width of the grid of values to make a prefix sum over. + * @param h + * The height of the grid of values to make a prefix sum over. + * @param lookup + * A lambda function which will return the value in the grid at the + * given x, y position. This is a lambda to allow a prefix sum to + * be created, even if the values in the grid are not stored in + * a matrix (may be computed at runtime). + * @param zero + * What is zero for this data type. For most basic data types (like + * int, float, etc.) this parameter can be ignored; for more complex + * data classes (like multi-dimensional vectors) this is necessary + * to be passed in. + */ + PrefixSum2D(size_t w, size_t h, std::function lookup, T zero = T()) + : prefix_sum_({w + 1, h + 1}, zero) { + // The first row and first column should already be initialized to zero. + + // Initialize the prefix sum. The prefix sum at position (x, y) is the + // sum of all values in the original matrix in the rectangle from (0, 0) + // to (x - 1, y - 1) inclusive. + for (size_t x = 1; x < w + 1; x++) { + for (size_t y = 1; y < h + 1; y++) { + prefix_sum_[x][y] = prefix_sum_[x - 1][y] + + prefix_sum_[x][y - 1] + + lookup(x - 1, y - 1) - + prefix_sum_[x - 1][y - 1]; + } + } + } + + /** + * @brief Constructs a 2D prefix sum from a 2D grid of values. + */ + PrefixSum2D(const vtr::NdMatrix& vals, T zero = T()) + : PrefixSum2D(vals.dim_size(0), + vals.dim_size(1), + [&](size_t x, size_t y) { + return vals[x][y]; + }, + zero) {} + + /** + * @brief Get the sum of all values in the original grid of values between + * x = [lower_x, upper_x] and y = [lower_y, upper_y]. + * + * This sum is inclusive, so it also sums the values at (upper_x, upper_y). + * + * This method has O(1) time complexity. + */ + T get_sum(size_t lower_x, size_t lower_y, size_t upper_x, size_t upper_y) const { + // Some safety asserts. + VTR_ASSERT_SAFE_MSG(lower_x <= upper_x, "lower_x is larger than upper_x"); + VTR_ASSERT_SAFE_MSG(lower_y <= upper_y, "lower_y is larger than upper_y"); + VTR_ASSERT_SAFE_MSG(lower_x < prefix_sum_.dim_size(0) - 1, "lower_x out of range"); + VTR_ASSERT_SAFE_MSG(upper_x < prefix_sum_.dim_size(0) - 1, "upper_x out of range"); + VTR_ASSERT_SAFE_MSG(lower_y < prefix_sum_.dim_size(1) - 1, "lower_y out of range"); + VTR_ASSERT_SAFE_MSG(upper_y < prefix_sum_.dim_size(1) - 1, "upper_y out of range"); + + // The sum of the region (lower_x, lower_y) to (upper_x, upper_y) + // inclusive is equal to: + // - The sum of the region (0, 0) to (upper_x, upper_y) + // - Minus the sum of the region (0, 0) to (lower_x - 1, upper_y) + // - Remove the part below the region + // - Minus the sum of the region (0, 0) to (upper_x, lower_y - 1) + // - Remove the part left of the region + // - Plus the sum of the region (0, 0) to (lower_x - 1, lower_y - 1) + // - Add back on the lower-left corner which was subtracted twice. + // Note: all of these are offset by 1 since the first row and column + // are all zeros. This allows us to avoid bounds checking when + // lower_x or lower_y are 0. + return prefix_sum_[upper_x + 1][upper_y + 1] - prefix_sum_[lower_x][upper_y + 1] + - prefix_sum_[upper_x + 1][lower_y] + + prefix_sum_[lower_x][lower_y]; + } + +private: + /** + * @brief The 2D prefix sum of the original grid of values. + * + * Position (x, y) of the prefix sum contains the sum of all values in the + * rectangle (0, 0) -> (x - 1, y - 1) inclusive. The first row and column + * are all zeros. By setting these to zero, we can avoid bound checking. + * This data structure has the special property that the sum of any + * rectangular region can be computed in O(1) time. + */ + vtr::NdMatrix prefix_sum_; +}; + +} // namespace vtr + diff --git a/libs/libvtrutil/test/test_prefix_sum.cpp b/libs/libvtrutil/test/test_prefix_sum.cpp new file mode 100644 index 00000000000..6babecf9167 --- /dev/null +++ b/libs/libvtrutil/test/test_prefix_sum.cpp @@ -0,0 +1,99 @@ +/** + * @file + * @author Alex Singer + * @date February 2025 + * @brief Test cases for the Prefix Sum class in vtr_util. + */ + +#include "catch2/catch_test_macros.hpp" + +#include "vtr_ndmatrix.h" +#include "vtr_prefix_sum.h" + +using namespace Catch; + +TEST_CASE("PrefixSum1D", "[vtr_prefix_sum/PrefixSum1D]") { + // Construct a 1D array to compute the prefix sum over. + std::vector vals = {1.f, 7.f, 2.f, 2.f, 5.f, 6.f, 1.f, 9.f, 1.f, 3.f}; + + // Construct the Prefix Sum. + vtr::PrefixSum1D prefix_sum(vals); + + // Check that the sum of each length 1 region is the original value. + SECTION("construction") { + for (size_t x = 0; x < vals.size(); x++) { + float sum_val = prefix_sum.get_sum(x, x); + REQUIRE(sum_val == vals[x]); + } + } + + float sum_of_all_vals = 0.f; + for (size_t x = 0; x < vals.size(); x++) { + sum_of_all_vals += vals[x]; + } + + // Check that get_sum is working on some testcases. + SECTION("get_sum") { + REQUIRE(prefix_sum.get_sum(0, vals.size() - 1) == sum_of_all_vals); + REQUIRE(prefix_sum.get_sum(0, 2) == 10.f); + REQUIRE(prefix_sum.get_sum(7, 9) == 13.f); + REQUIRE(prefix_sum.get_sum(2, 5) == 15.f); + } +} + +TEST_CASE("PrefixSum2D", "[vtr_prefix_sum/PrefixSum2D]") { + // Construct a 2D grid to compute the prefix sum over. + vtr::NdMatrix vals({4, 4}); + /* + * [ 1 3 9 2 ] + * [ 2 4 0 8 ] + * [ 3 7 1 3 ] + * [ 5 6 9 2 ] + */ + vals[0][0] = 5.f; + vals[1][0] = 6.f; + vals[2][0] = 9.f; + vals[3][0] = 2.f; + vals[0][1] = 3.f; + vals[1][1] = 7.f; + vals[2][1] = 1.f; + vals[3][1] = 3.f; + vals[0][2] = 2.f; + vals[1][2] = 4.f; + vals[2][2] = 0.f; + vals[3][2] = 8.f; + vals[0][3] = 1.f; + vals[1][3] = 3.f; + vals[2][3] = 9.f; + vals[3][3] = 2.f; + + // Construct the Prefix Sum. + vtr::PrefixSum2D prefix_sum(vals); + + // Check that the sum of each 1x1 region is the original value. + SECTION("construction") { + for (size_t x = 0; x < 4; x++) { + for (size_t y = 0; y < 4; y++) { + float sum_val = prefix_sum.get_sum(x, y, x, y); + REQUIRE(sum_val == vals[x][y]); + } + } + } + + float sum_of_all_vals = 0; + for (size_t x = 0; x < 4; x++) { + for (size_t y = 0; y < 4; y++) { + sum_of_all_vals += vals[x][y]; + } + } + + // Check that get_sum is working on some testcases. + SECTION("get_sum") { + REQUIRE(prefix_sum.get_sum(0, 0, 3, 3) == sum_of_all_vals); + REQUIRE(prefix_sum.get_sum(1, 1, 2, 2) == 12.f); + REQUIRE(prefix_sum.get_sum(0, 0, 3, 0) == 22.f); + REQUIRE(prefix_sum.get_sum(0, 0, 0, 3) == 11.f); + REQUIRE(prefix_sum.get_sum(1, 2, 2, 3) == 16.f); + } +} + diff --git a/vpr/src/place/net_cost_handler.cpp b/vpr/src/place/net_cost_handler.cpp index 7e4d36cf54c..cebdcd1762e 100644 --- a/vpr/src/place/net_cost_handler.cpp +++ b/vpr/src/place/net_cost_handler.cpp @@ -33,8 +33,8 @@ #include "place_timing_update.h" #include "vtr_math.h" #include "vtr_ndmatrix.h" -#include "vtr_ndoffsetmatrix.h" #include "PlacerCriticalities.h" +#include "vtr_prefix_sum.h" #include @@ -154,8 +154,8 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts, void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_() { const auto& device_ctx = g_vpr_ctx.device(); - const int grid_height = (int)device_ctx.grid.height(); - const int grid_width = (int)device_ctx.grid.width(); + const size_t grid_height = device_ctx.grid.height(); + const size_t grid_width = device_ctx.grid.width(); /* These arrays contain accumulative channel width between channel zero and * the channel specified by the given index. The accumulated channel width @@ -165,37 +165,28 @@ void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_() { * acc_chan?_width_[high] - acc_chan?_width_[low - 1] * This returns the total number of tracks between channels 'low' and 'high', * including tracks in these channels. - * - * Channel -1 doesn't exist, so we can say it has zero tracks. We need to be able - * to access these arrays with index -1 to handle cases where the lower channel is 0. */ - acc_chanx_width_ = vtr::NdOffsetMatrix({{{-1, grid_height}}}); - acc_chany_width_ = vtr::NdOffsetMatrix({{{-1, grid_width}}}); - - // initialize the first element (index -1) with zero - acc_chanx_width_[-1] = 0; - for (int y = 0; y < grid_height; y++) { - acc_chanx_width_[y] = acc_chanx_width_[y - 1] + device_ctx.chan_width.x_list[y]; + acc_chanx_width_ = vtr::PrefixSum1D(grid_height, [&](size_t y) noexcept { + int chan_x_width = device_ctx.chan_width.x_list[y]; /* If the number of tracks in a channel is zero, two consecutive elements take the same * value. This can lead to a division by zero in get_chanxy_cost_fac_(). To avoid this * potential issue, we assume that the channel width is at least 1. */ - if (acc_chanx_width_[y] == acc_chanx_width_[y - 1]) { - acc_chanx_width_[y]++; - } - } + if (chan_x_width == 0) + return 1; - // initialize the first element (index -1) with zero - acc_chany_width_[-1] = 0; - for (int x = 0; x < grid_width; x++) { - acc_chany_width_[x] = acc_chany_width_[x - 1] + device_ctx.chan_width.y_list[x]; + return chan_x_width; + }); + acc_chany_width_ = vtr::PrefixSum1D(grid_width, [&](size_t x) noexcept { + int chan_y_width = device_ctx.chan_width.y_list[x]; // to avoid a division by zero - if (acc_chany_width_[x] == acc_chany_width_[x - 1]) { - acc_chany_width_[x]++; - } - } + if (chan_y_width == 0) + return 1; + + return chan_y_width; + }); if (is_multi_layer_) { alloc_and_load_for_fast_vertical_cost_update_(); @@ -209,8 +200,6 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() { const size_t grid_height = device_ctx.grid.height(); const size_t grid_width = device_ctx.grid.width(); - acc_tile_num_inter_die_conn_ = vtr::NdMatrix({grid_width, grid_height}, 0); - vtr::NdMatrix tile_num_inter_die_conn({grid_width, grid_height}, 0.); /* @@ -255,26 +244,11 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() { } // Step 2: Calculate prefix sum of the inter-die connectivity up to and including the channel at (x, y). - acc_tile_num_inter_die_conn_[0][0] = tile_num_inter_die_conn[0][0]; - // Initialize the first row and column - for (size_t x = 1; x < device_ctx.grid.width(); x++) { - acc_tile_num_inter_die_conn_[x][0] = acc_tile_num_inter_die_conn_[x-1][0] + - tile_num_inter_die_conn[x][0]; - } - - for (size_t y = 1; y < device_ctx.grid.height(); y++) { - acc_tile_num_inter_die_conn_[0][y] = acc_tile_num_inter_die_conn_[0][y-1] + - tile_num_inter_die_conn[0][y]; - } - - for (size_t x_high = 1; x_high < device_ctx.grid.width(); x_high++) { - for (size_t y_high = 1; y_high < device_ctx.grid.height(); y_high++) { - acc_tile_num_inter_die_conn_[x_high][y_high] = acc_tile_num_inter_die_conn_[x_high-1][y_high] + - acc_tile_num_inter_die_conn_[x_high][y_high-1] + - tile_num_inter_die_conn[x_high][y_high] - - acc_tile_num_inter_die_conn_[x_high-1][y_high-1]; - } - } + acc_tile_num_inter_die_conn_ = vtr::PrefixSum2D(grid_width, + grid_height, + [&](size_t x, size_t y) { + return (int)tile_num_inter_die_conn[x][y]; + }); } std::pair NetCostHandler::comp_bb_cost(e_cost_methods method) { @@ -1504,22 +1478,10 @@ double NetCostHandler::get_net_wirelength_from_layer_bb_(ClusterNetId net_id) { } float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) { - int num_inter_dir_conn; - - if (bb.xmin == 0 && bb.ymin == 0) { - num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax]; - } else if (bb.xmin == 0) { - num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] - - acc_tile_num_inter_die_conn_[bb.xmax][bb.ymin-1]; - } else if (bb.ymin == 0) { - num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] - - acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymax]; - } else { - num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] - - acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymax] - - acc_tile_num_inter_die_conn_[bb.xmax][bb.ymin-1] + - acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymin-1]; - } + int num_inter_dir_conn = acc_tile_num_inter_die_conn_.get_sum(bb.xmin, + bb.ymin, + bb.xmax, + bb.ymax); float z_cost_factor; if (num_inter_dir_conn == 0) { diff --git a/vpr/src/place/net_cost_handler.h b/vpr/src/place/net_cost_handler.h index 8049cd42c64..ffc04f6f3c5 100644 --- a/vpr/src/place/net_cost_handler.h +++ b/vpr/src/place/net_cost_handler.h @@ -9,7 +9,7 @@ #include "place_delay_model.h" #include "move_transactions.h" #include "place_util.h" -#include "vtr_ndoffsetmatrix.h" +#include "vtr_prefix_sum.h" #include @@ -197,8 +197,8 @@ class NetCostHandler { * number of tracks in that direction; for other cost functions they * will never be used. */ - vtr::NdOffsetMatrix acc_chanx_width_; // [-1...device_ctx.grid.width()-1] - vtr::NdOffsetMatrix acc_chany_width_; // [-1...device_ctx.grid.height()-1] + vtr::PrefixSum1D acc_chanx_width_; // [0..device_ctx.grid.width()-1] + vtr::PrefixSum1D acc_chany_width_; // [0..device_ctx.grid.height()-1] /** * @brief The matrix below is used to calculate a chanz_place_cost_fac based on the average channel width in @@ -208,7 +208,7 @@ class NetCostHandler { * (x=0,y=0) to (x,y). Given this, we can compute the average number of inter-die connections over a (xlow,ylow) to (xhigh,yhigh) * region in O(1) (by adding and subtracting 4 entries) */ - vtr::NdMatrix acc_tile_num_inter_die_conn_; // [0..grid_width-1][0..grid_height-1] + vtr::PrefixSum2D acc_tile_num_inter_die_conn_; // [0..grid_width-1][0..grid_height-1] private: @@ -526,10 +526,10 @@ class NetCostHandler { */ template std::pair get_chanxy_cost_fac_(const BBT& bb) { - const int total_chanx_width = acc_chanx_width_[bb.ymax] - acc_chanx_width_[bb.ymin - 1]; + const int total_chanx_width = acc_chanx_width_.get_sum(bb.ymin, bb.ymax); const double inverse_average_chanx_width = (bb.ymax - bb.ymin + 1.0) / total_chanx_width; - const int total_chany_width = acc_chany_width_[bb.xmax] - acc_chany_width_[bb.xmin - 1]; + const int total_chany_width = acc_chany_width_.get_sum(bb.xmin, bb.xmax); const double inverse_average_chany_width = (bb.xmax - bb.xmin + 1.0) / total_chany_width; return {inverse_average_chanx_width, inverse_average_chany_width};