7
7
// ===----------------------------------------------------------------------===//
8
8
//
9
9
// This provides a generalized class for OpenMP runtime code generation
10
- // specialized by GPU target NVPTX.
10
+ // specialized by GPU targets NVPTX and AMDGCN.
11
11
//
12
12
// ===----------------------------------------------------------------------===//
13
13
@@ -621,22 +621,15 @@ class CheckVarsEscapingDeclContext final
621
621
};
622
622
} // anonymous namespace
623
623
624
- // / Get the id of the current thread on the GPU.
625
- static llvm::Value *getNVPTXThreadID (CodeGenFunction &CGF) {
626
- return CGF.EmitRuntimeCall (
627
- llvm::Intrinsic::getDeclaration (
628
- &CGF.CGM .getModule (), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
629
- " nvptx_tid" );
630
- }
631
-
632
624
// / Get the id of the warp in the block.
633
625
// / We assume that the warp size is 32, which is always the case
634
626
// / on the NVPTX device, to generate more efficient code.
635
627
static llvm::Value *getNVPTXWarpID (CodeGenFunction &CGF) {
636
628
CGBuilderTy &Bld = CGF.Builder ;
637
629
unsigned LaneIDBits =
638
630
CGF.getTarget ().getGridValue (llvm::omp::GV_Warp_Size_Log2);
639
- return Bld.CreateAShr (getNVPTXThreadID (CGF), LaneIDBits, " nvptx_warp_id" );
631
+ auto &RT = static_cast <CGOpenMPRuntimeGPU &>(CGF.CGM .getOpenMPRuntime ());
632
+ return Bld.CreateAShr (RT.getGPUThreadID (CGF), LaneIDBits, " nvptx_warp_id" );
640
633
}
641
634
642
635
// / Get the id of the current lane in the Warp.
@@ -646,18 +639,11 @@ static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
646
639
CGBuilderTy &Bld = CGF.Builder ;
647
640
unsigned LaneIDMask = CGF.getContext ().getTargetInfo ().getGridValue (
648
641
llvm::omp::GV_Warp_Size_Log2_Mask);
649
- return Bld.CreateAnd (getNVPTXThreadID (CGF), Bld.getInt32 (LaneIDMask),
642
+ auto &RT = static_cast <CGOpenMPRuntimeGPU &>(CGF.CGM .getOpenMPRuntime ());
643
+ return Bld.CreateAnd (RT.getGPUThreadID (CGF), Bld.getInt32 (LaneIDMask),
650
644
" nvptx_lane_id" );
651
645
}
652
646
653
- // / Get the maximum number of threads in a block of the GPU.
654
- static llvm::Value *getNVPTXNumThreads (CodeGenFunction &CGF) {
655
- return CGF.EmitRuntimeCall (
656
- llvm::Intrinsic::getDeclaration (
657
- &CGF.CGM .getModule (), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
658
- " nvptx_num_threads" );
659
- }
660
-
661
647
// / Get the value of the thread_limit clause in the teams directive.
662
648
// / For the 'generic' execution mode, the runtime encodes thread_limit in
663
649
// / the launch parameters, always starting thread_limit+warpSize threads per
@@ -668,9 +654,9 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
668
654
CGBuilderTy &Bld = CGF.Builder ;
669
655
auto &RT = static_cast <CGOpenMPRuntimeGPU &>(CGF.CGM .getOpenMPRuntime ());
670
656
return IsInSPMDExecutionMode
671
- ? getNVPTXNumThreads (CGF)
672
- : Bld.CreateNUWSub (getNVPTXNumThreads (CGF), RT.getGPUWarpSize (CGF),
673
- " thread_limit" );
657
+ ? RT. getGPUNumThreads (CGF)
658
+ : Bld.CreateNUWSub (RT.getGPUNumThreads (CGF),
659
+ RT. getGPUWarpSize (CGF), " thread_limit" );
674
660
}
675
661
676
662
// / Get the thread id of the OMP master thread.
@@ -682,8 +668,8 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
682
668
// / If NumThreads is 1024, master id is 992.
683
669
static llvm::Value *getMasterThreadID (CodeGenFunction &CGF) {
684
670
CGBuilderTy &Bld = CGF.Builder ;
685
- llvm::Value *NumThreads = getNVPTXNumThreads (CGF);
686
671
auto &RT = static_cast <CGOpenMPRuntimeGPU &>(CGF.CGM .getOpenMPRuntime ());
672
+ llvm::Value *NumThreads = RT.getGPUNumThreads (CGF);
687
673
// We assume that the warp size is a power of 2.
688
674
llvm::Value *Mask = Bld.CreateNUWSub (RT.getGPUWarpSize (CGF), Bld.getInt32 (1 ));
689
675
@@ -1235,8 +1221,9 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1235
1221
llvm::BasicBlock *MasterBB = CGF.createBasicBlock (" .master" );
1236
1222
EST.ExitBB = CGF.createBasicBlock (" .exit" );
1237
1223
1224
+ auto &RT = static_cast <CGOpenMPRuntimeGPU &>(CGF.CGM .getOpenMPRuntime ());
1238
1225
llvm::Value *IsWorker =
1239
- Bld.CreateICmpULT (getNVPTXThreadID (CGF), getThreadLimit (CGF));
1226
+ Bld.CreateICmpULT (RT. getGPUThreadID (CGF), getThreadLimit (CGF));
1240
1227
Bld.CreateCondBr (IsWorker, WorkerBB, MasterCheckBB);
1241
1228
1242
1229
CGF.EmitBlock (WorkerBB);
@@ -1245,7 +1232,7 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1245
1232
1246
1233
CGF.EmitBlock (MasterCheckBB);
1247
1234
llvm::Value *IsMaster =
1248
- Bld.CreateICmpEQ (getNVPTXThreadID (CGF), getMasterThreadID (CGF));
1235
+ Bld.CreateICmpEQ (RT. getGPUThreadID (CGF), getMasterThreadID (CGF));
1249
1236
Bld.CreateCondBr (IsMaster, MasterBB, EST.ExitBB );
1250
1237
1251
1238
CGF.EmitBlock (MasterBB);
@@ -2780,14 +2767,16 @@ void CGOpenMPRuntimeGPU::emitCriticalRegion(
2780
2767
llvm::BasicBlock *BodyBB = CGF.createBasicBlock (" omp.critical.body" );
2781
2768
llvm::BasicBlock *ExitBB = CGF.createBasicBlock (" omp.critical.exit" );
2782
2769
2770
+ auto &RT = static_cast <CGOpenMPRuntimeGPU &>(CGF.CGM .getOpenMPRuntime ());
2771
+
2783
2772
// Get the mask of active threads in the warp.
2784
2773
llvm::Value *Mask = CGF.EmitRuntimeCall (
2785
2774
createNVPTXRuntimeFunction (OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
2786
2775
// Fetch team-local id of the thread.
2787
- llvm::Value *ThreadID = getNVPTXThreadID (CGF);
2776
+ llvm::Value *ThreadID = RT. getGPUThreadID (CGF);
2788
2777
2789
2778
// Get the width of the team.
2790
- llvm::Value *TeamWidth = getNVPTXNumThreads (CGF);
2779
+ llvm::Value *TeamWidth = RT. getGPUNumThreads (CGF);
2791
2780
2792
2781
// Initialize the counter variable for the loop.
2793
2782
QualType Int32Ty =
@@ -3250,8 +3239,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3250
3239
CGM.addCompilerUsedGlobal (TransferMedium);
3251
3240
}
3252
3241
3242
+ auto &RT = static_cast <CGOpenMPRuntimeGPU &>(CGF.CGM .getOpenMPRuntime ());
3253
3243
// Get the CUDA thread id of the current OpenMP thread on the GPU.
3254
- llvm::Value *ThreadID = getNVPTXThreadID (CGF);
3244
+ llvm::Value *ThreadID = RT. getGPUThreadID (CGF);
3255
3245
// nvptx_lane_id = nvptx_id % warpsize
3256
3246
llvm::Value *LaneID = getNVPTXLaneID (CGF);
3257
3247
// nvptx_warp_id = nvptx_id / warpsize
@@ -4844,9 +4834,11 @@ void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
4844
4834
CodeGenFunction &CGF, const OMPLoopDirective &S,
4845
4835
OpenMPDistScheduleClauseKind &ScheduleKind,
4846
4836
llvm::Value *&Chunk) const {
4837
+ auto &RT = static_cast <CGOpenMPRuntimeGPU &>(CGF.CGM .getOpenMPRuntime ());
4847
4838
if (getExecutionMode () == CGOpenMPRuntimeGPU::EM_SPMD) {
4848
4839
ScheduleKind = OMPC_DIST_SCHEDULE_static;
4849
- Chunk = CGF.EmitScalarConversion (getNVPTXNumThreads (CGF),
4840
+ Chunk = CGF.EmitScalarConversion (
4841
+ RT.getGPUNumThreads (CGF),
4850
4842
CGF.getContext ().getIntTypeForBitwidth (32 , /* Signed=*/ 0 ),
4851
4843
S.getIterationVariable ()->getType (), S.getBeginLoc ());
4852
4844
return ;
0 commit comments