Enable hugepage for arrow host allocations (#13914)

madsbk · web-flow · commit f70f2cd415a1 · 2023-08-24T09:06:13.000Z
This PR enables Transparent Huge Pages (THP) for large (>4MB) arrow allocations (host memory only).

### Performance results on a DGX-1 (`dgx14`)

|                |  8MB  | 80MB  | 800MB |  8GB  | Method                               |
|:--------------:|:-----:|:-----:|:-----:|:-----:|--------------------------------------|
| cudf-native    | 0.006 | 0.049 | 0.485 | 4.787 | `df.to_arrow()` (branch-23.10)       |
| Dask-serialize | 0.004 | 0.032 | 0.310 | 3.122 | `distributed.protocol.serialize(df)` |
| cudf-hugepage  | 0.004 | 0.030 | 0.299 | 3.046 | `df.to_arrow()` (this PR)            |
| speedup        |  1.5  | 1.63  | 1.62  | 1.57  | cudf-native vs. cudf-hugepage        |

Note that Dask-serialize also uses THP, which is why its performance is on par with cudf-hugepage.

Authors:
- Mads R. B. Kristensen (https://github.com/madsbk)

Approvers:
- Mark Harris (https://github.com/harrism)
- Vukasin Milovanovic (https://github.com/vuule)

URL: #13914
diff --git a/cpp/src/interop/detail/arrow_allocator.cpp b/cpp/src/interop/detail/arrow_allocator.cpp
@@ -16,9 +16,40 @@
 
 #include <cudf/detail/interop.hpp>
 
+#include <memory>
+#include <sys/mman.h>
+#include <unistd.h>
+
 namespace cudf {
 namespace detail {
 
+/*
+  Advise the kernel to back large (>= 4 MiB) buffers with Transparent Huge Pages (THP).
+  `buf` is returned untouched; smaller or null-data buffers get no advice, and nothing is done
+  unless `MADV_HUGEPAGE` is defined. THP can improve performance of device-host memory transfers
+  significantly, see <https://github.com/rapidsai/cudf/pull/13914>.
+*/
+template <typename T>
+T enable_hugepage(T&& buf)
+{
+  if (buf->size() < (1u << 22u)) {  // Smaller than 4 MiB: hand the buffer back as is
+    return std::move(buf);
+  }
+
+#ifdef MADV_HUGEPAGE
+  const auto pagesize = sysconf(_SC_PAGESIZE);  // system page size in bytes
+  void* addr          = const_cast<uint8_t*>(buf->data());
+  if (addr == nullptr) { return std::move(buf); }  // no data pointer to advise on
+  auto length{static_cast<std::size_t>(buf->size())};
+  if (std::align(pagesize, pagesize, addr, length)) {
+    // `addr` now sits on a page boundary and `length` was reduced accordingly. Errors from
+    // older kernel versions are intentionally not checked; enabling huge pages is best-effort.
+    madvise(addr, length, MADV_HUGEPAGE);
+  }
+#endif
+  return std::move(buf);
+}
+
 std::unique_ptr<arrow::Buffer> allocate_arrow_buffer(int64_t const size, arrow::MemoryPool* ar_mr)
 {
   /*
@@ -28,9 +59,9 @@ std::unique_ptr<arrow::Buffer> allocate_arrow_buffer(int64_t const size, arrow::
   To work around this issue we compile an allocation shim in C++ and use
   that from our cuda sources
   */
-  auto result = arrow::AllocateBuffer(size, ar_mr);
+  arrow::Result<std::unique_ptr<arrow::Buffer>> result = arrow::AllocateBuffer(size, ar_mr);
   CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer");
-  return std::move(result).ValueOrDie();
+  return enable_hugepage(std::move(result).ValueOrDie());
 }
 
 std::shared_ptr<arrow::Buffer> allocate_arrow_bitmap(int64_t const size, arrow::MemoryPool* ar_mr)
@@ -42,9 +73,9 @@ std::shared_ptr<arrow::Buffer> allocate_arrow_bitmap(int64_t const size, arrow::
   To work around this issue we compile an allocation shim in C++ and use
   that from our cuda sources
   */
-  auto result = arrow::AllocateBitmap(size, ar_mr);
+  arrow::Result<std::shared_ptr<arrow::Buffer>> result = arrow::AllocateBitmap(size, ar_mr);
   CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow bitmap");
-  return std::move(result).ValueOrDie();
+  return enable_hugepage(std::move(result).ValueOrDie());
 }
 
 }  // namespace detail