Skip to content

Commit f70f2cd

Browse files
authored
Enable hugepage for arrow host allocations (#13914)
This PR enables Transparent Huge Pages (THP) for large (>4MB) arrow allocations (host memory only).

### Performance results on a DGX-1 (`dgx14`)

| | 8MB | 80MB | 800MB | 8GB | Method |
|:--------------:|:-----:|:-----:|:-----:|:-----:|------------------------------------|
| cudf-native | 0.006 | 0.049 | 0.485 | 4.787 | `df.to_arrow()` (branch-23.10) |
| Dask-serialize | 0.004 | 0.032 | 0.310 | 3.122 | `distributed.protocol.serialize(df)` |
| cudf-hugepage | 0.004 | 0.030 | 0.299 | 3.046 | `df.to_arrow()` (this PR) |
| speedup | 1.5 | 1.63 | 1.62 | 1.57 | cudf-native vs. cudf-hugepage |

Note that Dask-serialize also uses THP, which is why its performance is on par with cudf-hugepage.

Authors:
- Mads R. B. Kristensen (https://github.com/madsbk)

Approvers:
- Mark Harris (https://github.com/harrism)
- Vukasin Milovanovic (https://github.com/vuule)

URL: #13914
1 parent 83f9cbf commit f70f2cd

File tree

1 file changed

+35
-4
lines changed

1 file changed

+35
-4
lines changed

cpp/src/interop/detail/arrow_allocator.cpp

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,40 @@
1616

1717
#include <cudf/detail/interop.hpp>
1818

19+
#include <memory>
20+
#include <sys/mman.h>
21+
#include <unistd.h>
22+
1923
namespace cudf {
2024
namespace detail {
2125

26+
/**
 * @brief Enable Transparent Huge Pages (THP) for large (4 MiB or more) allocations.
 *
 * This is a best-effort hint: the page-aligned part of the buffer is passed to
 * `madvise(..., MADV_HUGEPAGE)` and the result of that call is deliberately ignored.
 * On platforms where `MADV_HUGEPAGE` is not defined the function does nothing.
 * In every case `buf` is returned untouched.
 *
 * Enabling THP can improve performance of device-host memory transfers
 * significantly, see <https://github.com/rapidsai/cudf/pull/13914>.
 *
 * @tparam T Pointer-like owner type; `buf->size()` and `buf->data()` must be valid.
 * @param buf Owner of the buffer to advise on.
 * @return `buf`, forwarded back to the caller (moved for rvalues, the same reference
 * for lvalues).
 */
template <typename T>
T enable_hugepage(T&& buf)
{
  if (buf->size() < (1u << 22u)) {  // Smaller than 4 MiB
    return std::forward<T>(buf);
  }

#ifdef MADV_HUGEPAGE
  const auto pagesize = sysconf(_SC_PAGESIZE);
  // sysconf() reports failure as -1. Passing that on to std::align() would be
  // undefined behaviour (the alignment must be a power of two), so skip the hint.
  if (pagesize <= 0) { return std::forward<T>(buf); }

  void* addr = const_cast<uint8_t*>(buf->data());
  if (addr == nullptr) { return std::forward<T>(buf); }
  auto length{static_cast<std::size_t>(buf->size())};

  // std::align() moves `addr` up to the next page boundary and shrinks `length`
  // by the skipped bytes; it returns nullptr when not even one page fits.
  const auto page_bytes = static_cast<std::size_t>(pagesize);
  if (std::align(page_bytes, page_bytes, addr, length)) {
    // Intentionally not checking for errors that may be returned by older kernel versions;
    // optimistically tries enabling huge pages.
    madvise(addr, length, MADV_HUGEPAGE);
  }
#endif
  return std::forward<T>(buf);
}
52+
2253
std::unique_ptr<arrow::Buffer> allocate_arrow_buffer(int64_t const size, arrow::MemoryPool* ar_mr)
2354
{
2455
/*
@@ -28,9 +59,9 @@ std::unique_ptr<arrow::Buffer> allocate_arrow_buffer(int64_t const size, arrow::
2859
To work around this issue we compile an allocation shim in C++ and use
2960
that from our cuda sources
3061
*/
31-
auto result = arrow::AllocateBuffer(size, ar_mr);
62+
arrow::Result<std::unique_ptr<arrow::Buffer>> result = arrow::AllocateBuffer(size, ar_mr);
3263
CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer");
33-
return std::move(result).ValueOrDie();
64+
return enable_hugepage(std::move(result).ValueOrDie());
3465
}
3566

3667
std::shared_ptr<arrow::Buffer> allocate_arrow_bitmap(int64_t const size, arrow::MemoryPool* ar_mr)
@@ -42,9 +73,9 @@ std::shared_ptr<arrow::Buffer> allocate_arrow_bitmap(int64_t const size, arrow::
4273
To work around this issue we compile an allocation shim in C++ and use
4374
that from our cuda sources
4475
*/
45-
auto result = arrow::AllocateBitmap(size, ar_mr);
76+
arrow::Result<std::shared_ptr<arrow::Buffer>> result = arrow::AllocateBitmap(size, ar_mr);
4677
CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow bitmap");
47-
return std::move(result).ValueOrDie();
78+
return enable_hugepage(std::move(result).ValueOrDie());
4879
}
4980

5081
} // namespace detail

0 commit comments

Comments
 (0)