Skip to content

Commit 160ff83

Browse files
committed
[OpenMP][AMDGCN] Support OpenMP offloading for AMDGCN architecture - Part 3
Provides AMDGCN- and NVPTX-specific specializations of the getGPUWarpSize, getGPUThreadID, and getGPUNumThreads methods. Adds tests for AMDGCN codegen for these methods in generic and SIMD modes. Also relaxes the precondition in InitTempAlloca slightly, which is useful for AMDGCN OpenMP codegen where allocas are created with a cast to an address space. Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D84260
1 parent 40da58a commit 160ff83

11 files changed

+242
-35
lines changed

clang/lib/CodeGen/CGExpr.cpp

+7-2
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,13 @@ Address CodeGenFunction::CreateDefaultAlignTempAlloca(llvm::Type *Ty,
125125
}
126126

127127
void CodeGenFunction::InitTempAlloca(Address Var, llvm::Value *Init) {
128-
assert(isa<llvm::AllocaInst>(Var.getPointer()));
129-
auto *Store = new llvm::StoreInst(Init, Var.getPointer(), /*volatile*/ false,
128+
auto *Alloca = Var.getPointer();
129+
assert(isa<llvm::AllocaInst>(Alloca) ||
130+
(isa<llvm::AddrSpaceCastInst>(Alloca) &&
131+
isa<llvm::AllocaInst>(
132+
cast<llvm::AddrSpaceCastInst>(Alloca)->getPointerOperand())));
133+
134+
auto *Store = new llvm::StoreInst(Init, Alloca, /*volatile*/ false,
130135
Var.getAlignment().getAsAlign());
131136
llvm::BasicBlock *Block = AllocaInsertPt->getParent();
132137
Block->getInstList().insertAfter(AllocaInsertPt->getIterator(), Store);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This provides a class for OpenMP runtime code generation specialized to
10+
// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "CGOpenMPRuntimeAMDGCN.h"
15+
#include "CGOpenMPRuntimeGPU.h"
16+
#include "CodeGenFunction.h"
17+
#include "clang/AST/Attr.h"
18+
#include "clang/AST/DeclOpenMP.h"
19+
#include "clang/AST/StmtOpenMP.h"
20+
#include "clang/AST/StmtVisitor.h"
21+
#include "clang/Basic/Cuda.h"
22+
#include "llvm/ADT/SmallPtrSet.h"
23+
#include "llvm/IR/IntrinsicsAMDGPU.h"
24+
25+
using namespace clang;
26+
using namespace CodeGen;
27+
using namespace llvm::omp;
28+
29+
/// Construct the AMDGCN-specialized OpenMP runtime. This runtime is only
/// valid for device-side compilation; instantiating it for host code is a
/// programming error.
CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
    : CGOpenMPRuntimeGPU(CGM) {
  // Guard against accidental host-side instantiation.
  const bool IsDevice = CGM.getLangOpts().OpenMPIsDevice;
  if (!IsDevice)
    llvm_unreachable("OpenMP AMDGCN can only handle device code.");
}
34+
35+
/// Get the GPU warp (wavefront) size.
///
/// On AMDGCN the wavefront size is a compile-time target constant, so it is
/// emitted as an immediate i32 rather than read at runtime.
llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
  const unsigned Size = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
  return CGF.Builder.getInt32(Size);
}
41+
42+
/// Get the id of the current thread on the GPU.
///
/// The workitem id along the x dimension is the thread id within the
/// workgroup. NOTE(review): the IR value name "nvptx_tid" appears to be kept
/// for compatibility with existing codegen tests — confirm before renaming.
llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
  llvm::Function *WorkitemId =
      CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x);
  return CGF.Builder.CreateCall(WorkitemId, llvm::None, "nvptx_tid");
}
48+
49+
/// Get the maximum number of threads in a block (workgroup) of the GPU.
///
/// Emits a call to the device-library helper __ockl_get_local_size(0), whose
/// definition is provided by the AMD device libs at link time. The helper
/// returns an i64, which is truncated to the i32 callers expect.
llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Module *M = &CGF.CGM.getModule();
  const char *LocSize = "__ockl_get_local_size";
  llvm::Function *F = M->getFunction(LocSize);
  if (!F) {
    // Declare the helper on first use. Use GlobalValue (the class that
    // declares the linkage enum) and reuse the module pointer fetched above.
    F = llvm::Function::Create(
        llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false),
        llvm::GlobalValue::ExternalLinkage, LocSize, M);
  }
  return Bld.CreateTrunc(
      Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
}
+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This provides a class for OpenMP runtime code generation specialized to
10+
// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
15+
#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
16+
17+
#include "CGOpenMPRuntime.h"
18+
#include "CGOpenMPRuntimeGPU.h"
19+
#include "CodeGenFunction.h"
20+
#include "clang/AST/StmtOpenMP.h"
21+
22+
namespace clang {
23+
namespace CodeGen {
24+
25+
class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
26+
27+
public:
28+
explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
29+
30+
/// Get the GPU warp size.
31+
llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
32+
33+
/// Get the id of the current thread on the GPU.
34+
llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
35+
36+
/// Get the maximum number of threads in a block of the GPU.
37+
llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
38+
};
39+
40+
} // namespace CodeGen
41+
} // namespace clang
42+
43+
#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

+21-29
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
//===----------------------------------------------------------------------===//
88
//
99
// This provides a generalized class for OpenMP runtime code generation
10-
// specialized by GPU target NVPTX.
10+
// specialized by GPU targets NVPTX and AMDGCN.
1111
//
1212
//===----------------------------------------------------------------------===//
1313

@@ -621,22 +621,15 @@ class CheckVarsEscapingDeclContext final
621621
};
622622
} // anonymous namespace
623623

624-
/// Get the id of the current thread on the GPU.
625-
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
626-
return CGF.EmitRuntimeCall(
627-
llvm::Intrinsic::getDeclaration(
628-
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
629-
"nvptx_tid");
630-
}
631-
632624
/// Get the id of the warp in the block.
633625
/// We assume that the warp size is 32, which is always the case
634626
/// on the NVPTX device, to generate more efficient code.
635627
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
636628
CGBuilderTy &Bld = CGF.Builder;
637629
unsigned LaneIDBits =
638630
CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
639-
return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
631+
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
632+
return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
640633
}
641634

642635
/// Get the id of the current lane in the Warp.
@@ -646,18 +639,11 @@ static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
646639
CGBuilderTy &Bld = CGF.Builder;
647640
unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
648641
llvm::omp::GV_Warp_Size_Log2_Mask);
649-
return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
642+
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
643+
return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
650644
"nvptx_lane_id");
651645
}
652646

653-
/// Get the maximum number of threads in a block of the GPU.
654-
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
655-
return CGF.EmitRuntimeCall(
656-
llvm::Intrinsic::getDeclaration(
657-
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
658-
"nvptx_num_threads");
659-
}
660-
661647
/// Get the value of the thread_limit clause in the teams directive.
662648
/// For the 'generic' execution mode, the runtime encodes thread_limit in
663649
/// the launch parameters, always starting thread_limit+warpSize threads per
@@ -668,9 +654,9 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
668654
CGBuilderTy &Bld = CGF.Builder;
669655
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
670656
return IsInSPMDExecutionMode
671-
? getNVPTXNumThreads(CGF)
672-
: Bld.CreateNUWSub(getNVPTXNumThreads(CGF), RT.getGPUWarpSize(CGF),
673-
"thread_limit");
657+
? RT.getGPUNumThreads(CGF)
658+
: Bld.CreateNUWSub(RT.getGPUNumThreads(CGF),
659+
RT.getGPUWarpSize(CGF), "thread_limit");
674660
}
675661

676662
/// Get the thread id of the OMP master thread.
@@ -682,8 +668,8 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
682668
/// If NumThreads is 1024, master id is 992.
683669
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
684670
CGBuilderTy &Bld = CGF.Builder;
685-
llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
686671
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
672+
llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
687673
// We assume that the warp size is a power of 2.
688674
llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
689675

@@ -1235,8 +1221,9 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
12351221
llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
12361222
EST.ExitBB = CGF.createBasicBlock(".exit");
12371223

1224+
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
12381225
llvm::Value *IsWorker =
1239-
Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
1226+
Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF));
12401227
Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
12411228

12421229
CGF.EmitBlock(WorkerBB);
@@ -1245,7 +1232,7 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
12451232

12461233
CGF.EmitBlock(MasterCheckBB);
12471234
llvm::Value *IsMaster =
1248-
Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
1235+
Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF));
12491236
Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
12501237

12511238
CGF.EmitBlock(MasterBB);
@@ -2780,14 +2767,16 @@ void CGOpenMPRuntimeGPU::emitCriticalRegion(
27802767
llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
27812768
llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
27822769

2770+
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2771+
27832772
// Get the mask of active threads in the warp.
27842773
llvm::Value *Mask = CGF.EmitRuntimeCall(
27852774
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
27862775
// Fetch team-local id of the thread.
2787-
llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2776+
llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
27882777

27892778
// Get the width of the team.
2790-
llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2779+
llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
27912780

27922781
// Initialize the counter variable for the loop.
27932782
QualType Int32Ty =
@@ -3250,8 +3239,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
32503239
CGM.addCompilerUsedGlobal(TransferMedium);
32513240
}
32523241

3242+
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
32533243
// Get the CUDA thread id of the current OpenMP thread on the GPU.
3254-
llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3244+
llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
32553245
// nvptx_lane_id = nvptx_id % warpsize
32563246
llvm::Value *LaneID = getNVPTXLaneID(CGF);
32573247
// nvptx_warp_id = nvptx_id / warpsize
@@ -4844,9 +4834,11 @@ void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
48444834
CodeGenFunction &CGF, const OMPLoopDirective &S,
48454835
OpenMPDistScheduleClauseKind &ScheduleKind,
48464836
llvm::Value *&Chunk) const {
4837+
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
48474838
if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
48484839
ScheduleKind = OMPC_DIST_SCHEDULE_static;
4849-
Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4840+
Chunk = CGF.EmitScalarConversion(
4841+
RT.getGPUNumThreads(CGF),
48504842
CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
48514843
S.getIterationVariable()->getType(), S.getBeginLoc());
48524844
return;

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

+11-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
//===----------------------------------------------------------------------===//
88
//
99
// This provides a generalized class for OpenMP runtime code generation
10-
// specialized by GPU target NVPTX.
10+
// specialized by GPU targets NVPTX and AMDGCN.
1111
//
1212
//===----------------------------------------------------------------------===//
1313

@@ -199,9 +199,18 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
199199
void clear() override;
200200

201201
/// Declare generalized virtual functions which need to be defined
202-
/// by all specializations of OpenMPGPURuntime Targets.
202+
/// by all specializations of OpenMPGPURuntime Targets like AMDGCN
203+
/// and NVPTX.
204+
205+
/// Get the GPU warp size.
203206
virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;
204207

208+
/// Get the id of the current thread on the GPU.
209+
virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0;
210+
211+
/// Get the maximum number of threads in a block of the GPU.
212+
virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
213+
205214
/// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
206215
/// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
207216
virtual void emitProcBindClause(CodeGenFunction &CGF,

clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

+16-1
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,25 @@ CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
3232
llvm_unreachable("OpenMP NVPTX can only handle device code.");
3333
}
3434

35-
/// Get the GPU warp size.
3635
/// Get the GPU warp size by reading the warpsize special register at runtime.
llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
  llvm::Function *WarpSize = llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize);
  return CGF.EmitRuntimeCall(WarpSize, "nvptx_warp_size");
}
41+
42+
/// Get the id of the current thread on the GPU by reading the tid.x special
/// register (the thread id within the CTA/block).
llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  // Initialize at declaration instead of declare-then-assign.
  llvm::Function *F = llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
}
49+
50+
/// Get the maximum number of threads in a block of the GPU by reading the
/// ntid.x special register (number of threads per CTA/block).
llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  // Initialize at declaration instead of declare-then-assign.
  llvm::Function *F = llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
  return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
}

clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h

+9-1
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,19 @@
2222
namespace clang {
2323
namespace CodeGen {
2424

25-
class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU {
25+
class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU {
2626

2727
public:
2828
explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
29+
30+
/// Get the GPU warp size.
2931
llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
32+
33+
/// Get the id of the current thread on the GPU.
34+
llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
35+
36+
/// Get the maximum number of threads in a block of the GPU.
37+
llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
3038
};
3139

3240
} // CodeGen namespace.

clang/lib/CodeGen/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ add_clang_library(clangCodeGen
5858
CGObjCRuntime.cpp
5959
CGOpenCLRuntime.cpp
6060
CGOpenMPRuntime.cpp
61+
CGOpenMPRuntimeAMDGCN.cpp
6162
CGOpenMPRuntimeGPU.cpp
6263
CGOpenMPRuntimeNVPTX.cpp
6364
CGRecordLayoutBuilder.cpp

clang/lib/CodeGen/CodeGenModule.cpp

+6
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "CGObjCRuntime.h"
2020
#include "CGOpenCLRuntime.h"
2121
#include "CGOpenMPRuntime.h"
22+
#include "CGOpenMPRuntimeAMDGCN.h"
2223
#include "CGOpenMPRuntimeNVPTX.h"
2324
#include "CodeGenFunction.h"
2425
#include "CodeGenPGO.h"
@@ -215,6 +216,11 @@ void CodeGenModule::createOpenMPRuntime() {
215216
"OpenMP NVPTX is only prepared to deal with device code.");
216217
OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
217218
break;
219+
case llvm::Triple::amdgcn:
220+
assert(getLangOpts().OpenMPIsDevice &&
221+
"OpenMP AMDGCN is only prepared to deal with device code.");
222+
OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
223+
break;
218224
default:
219225
if (LangOpts.OpenMPSimd)
220226
OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));

0 commit comments

Comments
 (0)