Skip to content

Commit 565a075

Browse files
authored
[flang][cuda][rt] Track asynchronous allocation stream for deallocation (llvm#137073)
When an asynchronous allocation is made, we call `cudaMallocAsync` with a stream. For deallocation, we need to call `cudaFreeAsync` with the same stream. in order to achieve that, we need to track the allocation and their respective stream. This patch adds a simple sorted array of asynchronous allocations. A binary search is performed to retrieve the allocation when deallocation is needed.
1 parent e329b6c commit 565a075

File tree

2 files changed

+171
-1
lines changed

2 files changed

+171
-1
lines changed

flang-rt/lib/cuda/allocator.cpp

Lines changed: 112 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "flang-rt/runtime/derived.h"
1212
#include "flang-rt/runtime/descriptor.h"
1313
#include "flang-rt/runtime/environment.h"
14+
#include "flang-rt/runtime/lock.h"
1415
#include "flang-rt/runtime/stat.h"
1516
#include "flang-rt/runtime/terminator.h"
1617
#include "flang-rt/runtime/type-info.h"
@@ -21,6 +22,105 @@
2122
#include "cuda_runtime.h"
2223

2324
namespace Fortran::runtime::cuda {
25+
26+
struct DeviceAllocation {
27+
void *ptr;
28+
std::size_t size;
29+
cudaStream_t stream;
30+
};
31+
32+
// Compare address values. nullptr will be sorted at the end of the array.
33+
int compareDeviceAlloc(const void *a, const void *b) {
34+
const DeviceAllocation *deva = (const DeviceAllocation *)a;
35+
const DeviceAllocation *devb = (const DeviceAllocation *)b;
36+
if (deva->ptr == nullptr && devb->ptr == nullptr)
37+
return 0;
38+
if (deva->ptr == nullptr)
39+
return 1;
40+
if (devb->ptr == nullptr)
41+
return -1;
42+
return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
43+
}
44+
45+
// Dynamic array for tracking asynchronous allocations.
46+
static DeviceAllocation *deviceAllocations = nullptr;
47+
Lock lock;
48+
static int maxDeviceAllocations{512}; // Initial size
49+
static int numDeviceAllocations{0};
50+
static constexpr int allocNotFound{-1};
51+
52+
static void initAllocations() {
53+
if (!deviceAllocations) {
54+
deviceAllocations = static_cast<DeviceAllocation *>(
55+
malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
56+
if (!deviceAllocations) {
57+
Terminator terminator{__FILE__, __LINE__};
58+
terminator.Crash("Failed to allocate tracking array");
59+
}
60+
}
61+
}
62+
63+
static void doubleAllocationArray() {
64+
unsigned newSize = maxDeviceAllocations * 2;
65+
DeviceAllocation *newArray = static_cast<DeviceAllocation *>(
66+
realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
67+
if (!newArray) {
68+
Terminator terminator{__FILE__, __LINE__};
69+
terminator.Crash("Failed to reallocate tracking array");
70+
}
71+
deviceAllocations = newArray;
72+
maxDeviceAllocations = newSize;
73+
}
74+
75+
static unsigned findAllocation(void *ptr) {
76+
if (numDeviceAllocations == 0) {
77+
return allocNotFound;
78+
}
79+
80+
int left{0};
81+
int right{numDeviceAllocations - 1};
82+
83+
if (left == right) {
84+
return left;
85+
}
86+
87+
while (left <= right) {
88+
int mid = left + (right - left) / 2;
89+
if (deviceAllocations[mid].ptr == ptr) {
90+
return mid;
91+
}
92+
if (deviceAllocations[mid].ptr < ptr) {
93+
left = mid + 1;
94+
} else {
95+
right = mid - 1;
96+
}
97+
}
98+
return allocNotFound;
99+
}
100+
101+
static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
102+
CriticalSection critical{lock};
103+
initAllocations();
104+
if (numDeviceAllocations >= maxDeviceAllocations) {
105+
doubleAllocationArray();
106+
}
107+
deviceAllocations[numDeviceAllocations].ptr = ptr;
108+
deviceAllocations[numDeviceAllocations].size = size;
109+
deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
110+
++numDeviceAllocations;
111+
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
112+
compareDeviceAlloc);
113+
}
114+
115+
static void eraseAllocation(int pos) {
116+
deviceAllocations[pos].ptr = nullptr;
117+
deviceAllocations[pos].size = 0;
118+
deviceAllocations[pos].stream = (cudaStream_t)0;
119+
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
120+
compareDeviceAlloc);
121+
--numDeviceAllocations;
122+
}
123+
24124
extern "C" {
25125

26126
void RTDEF(CUFRegisterAllocator)() {
@@ -55,12 +155,23 @@ void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
55155
} else {
56156
CUDA_REPORT_IF_ERROR(
57157
cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
158+
insertAllocation(p, sizeInBytes, asyncId);
58159
}
59160
}
60161
return p;
61162
}
62163

63-
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
164+
void CUFFreeDevice(void *p) {
165+
CriticalSection critical{lock};
166+
int pos = findAllocation(p);
167+
if (pos >= 0) {
168+
cudaStream_t stream = deviceAllocations[pos].stream;
169+
eraseAllocation(pos);
170+
CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
171+
} else {
172+
CUDA_REPORT_IF_ERROR(cudaFree(p));
173+
}
174+
}
64175

65176
void *CUFAllocManaged(
66177
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {

flang-rt/unittests/Runtime/CUDA/Allocatable.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,62 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
5858

5959
EXPECT_EQ(cudaSuccess, cudaGetLastError());
6060
}
61+
62+
TEST(AllocatableCUFTest, StreamDeviceAllocatable) {
63+
using Fortran::common::TypeCategory;
64+
RTNAME(CUFRegisterAllocator)();
65+
// REAL(4), DEVICE, ALLOCATABLE :: a(:)
66+
auto a{createAllocatable(TypeCategory::Real, 4)};
67+
a->SetAllocIdx(kDeviceAllocatorPos);
68+
EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
69+
EXPECT_FALSE(a->HasAddendum());
70+
RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
71+
72+
auto b{createAllocatable(TypeCategory::Real, 4)};
73+
b->SetAllocIdx(kDeviceAllocatorPos);
74+
EXPECT_EQ((int)kDeviceAllocatorPos, b->GetAllocIdx());
75+
EXPECT_FALSE(b->HasAddendum());
76+
RTNAME(AllocatableSetBounds)(*b, 0, 1, 20);
77+
78+
auto c{createAllocatable(TypeCategory::Real, 4)};
79+
c->SetAllocIdx(kDeviceAllocatorPos);
80+
EXPECT_EQ((int)kDeviceAllocatorPos, c->GetAllocIdx());
81+
EXPECT_FALSE(b->HasAddendum());
82+
RTNAME(AllocatableSetBounds)(*c, 0, 1, 100);
83+
84+
RTNAME(AllocatableAllocate)
85+
(*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
86+
EXPECT_TRUE(a->IsAllocated());
87+
cudaDeviceSynchronize();
88+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
89+
90+
RTNAME(AllocatableAllocate)
91+
(*b, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
92+
EXPECT_TRUE(b->IsAllocated());
93+
cudaDeviceSynchronize();
94+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
95+
96+
RTNAME(AllocatableAllocate)
97+
(*c, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
98+
EXPECT_TRUE(c->IsAllocated());
99+
cudaDeviceSynchronize();
100+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
101+
102+
RTNAME(AllocatableDeallocate)
103+
(*b, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
104+
EXPECT_FALSE(b->IsAllocated());
105+
cudaDeviceSynchronize();
106+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
107+
108+
RTNAME(AllocatableDeallocate)
109+
(*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
110+
EXPECT_FALSE(a->IsAllocated());
111+
cudaDeviceSynchronize();
112+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
113+
114+
RTNAME(AllocatableDeallocate)
115+
(*c, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
116+
EXPECT_FALSE(c->IsAllocated());
117+
cudaDeviceSynchronize();
118+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
119+
}

0 commit comments

Comments
 (0)