Skip to content

Commit 3c8efd7

Browse files
authored
[OpenMP] Ensure the actual kernel is annotated with launch bounds (llvm#99927)
In debug mode there is a wrapper (the kernel) around the function in which we generate the kernel code. We worked around this before to get the correct kernel name, but now we really distinguish both to attach the launch bounds to the kernel, not the inner function.
1 parent df4fa47 commit 3c8efd7

8 files changed

+1570
-1553
lines changed

clang/lib/CodeGen/CGStmtOpenMP.cpp

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -670,27 +670,42 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
670670
// Build the argument list.
671671
bool NeedWrapperFunction =
672672
getDebugInfo() && CGM.getCodeGenOpts().hasReducedDebugInfo();
673-
FunctionArgList Args;
674-
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs;
675-
llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes;
673+
FunctionArgList Args, WrapperArgs;
674+
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs,
675+
WrapperLocalAddrs;
676+
llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes,
677+
WrapperVLASizes;
676678
SmallString<256> Buffer;
677679
llvm::raw_svector_ostream Out(Buffer);
678680
Out << CapturedStmtInfo->getHelperName();
679-
if (NeedWrapperFunction)
681+
682+
CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
683+
llvm::Function *WrapperF = nullptr;
684+
if (NeedWrapperFunction) {
685+
// Emit the final kernel early to allow attributes to be added by the
686+
// OpenMPI-IR-Builder.
687+
FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
688+
/*RegisterCastedArgsOnly=*/true,
689+
CapturedStmtInfo->getHelperName(), Loc);
690+
WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
691+
WrapperF =
692+
emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
693+
WrapperCGF.CXXThisValue, WrapperFO);
680694
Out << "_debug__";
695+
}
681696
FunctionOptions FO(&S, !NeedWrapperFunction, /*RegisterCastedArgsOnly=*/false,
682697
Out.str(), Loc);
683-
llvm::Function *F = emitOutlinedFunctionPrologue(*this, Args, LocalAddrs,
684-
VLASizes, CXXThisValue, FO);
698+
llvm::Function *F = emitOutlinedFunctionPrologue(
699+
*this, WrapperArgs, WrapperLocalAddrs, WrapperVLASizes, CXXThisValue, FO);
685700
CodeGenFunction::OMPPrivateScope LocalScope(*this);
686-
for (const auto &LocalAddrPair : LocalAddrs) {
701+
for (const auto &LocalAddrPair : WrapperLocalAddrs) {
687702
if (LocalAddrPair.second.first) {
688703
LocalScope.addPrivate(LocalAddrPair.second.first,
689704
LocalAddrPair.second.second);
690705
}
691706
}
692707
(void)LocalScope.Privatize();
693-
for (const auto &VLASizePair : VLASizes)
708+
for (const auto &VLASizePair : WrapperVLASizes)
694709
VLASizeMap[VLASizePair.second.first] = VLASizePair.second.second;
695710
PGO.assignRegionCounters(GlobalDecl(CD), F);
696711
CapturedStmtInfo->EmitBody(*this, CD->getBody());
@@ -699,17 +714,10 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
699714
if (!NeedWrapperFunction)
700715
return F;
701716

702-
FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
703-
/*RegisterCastedArgsOnly=*/true,
704-
CapturedStmtInfo->getHelperName(), Loc);
705-
CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
706-
WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
707-
Args.clear();
708-
LocalAddrs.clear();
709-
VLASizes.clear();
710-
llvm::Function *WrapperF =
711-
emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
712-
WrapperCGF.CXXThisValue, WrapperFO);
717+
// Reverse the order.
718+
WrapperF->removeFromParent();
719+
F->getParent()->getFunctionList().insertAfter(F->getIterator(), WrapperF);
720+
713721
llvm::SmallVector<llvm::Value *, 4> CallArgs;
714722
auto *PI = F->arg_begin();
715723
for (const auto *Arg : Args) {

clang/test/OpenMP/ompx_attributes_codegen.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,17 @@
33
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
44
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
55
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
6+
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -dwarf-version=5 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
67
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
8+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -dwarf-version=5 -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
79
// expected-no-diagnostics
810

911

1012
// Check that the target attributes are set on the generated kernel
1113
void func() {
12-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l16(ptr {{[^,]+}}) #0
13-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}})
14-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}}) #4
14+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
15+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
16+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
1517

1618
#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
1719
{}
@@ -35,6 +37,6 @@ void func() {
3537
// NVIDIA: "omp_target_thread_limit"="20"
3638
// NVIDIA: "omp_target_thread_limit"="45"
3739
// NVIDIA: "omp_target_thread_limit"="17"
38-
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l16, !"maxntidx", i32 20}
39-
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l18, !"maxntidx", i32 45}
40-
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l20, !"maxntidx", i32 17}
40+
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
41+
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
42+
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}

0 commit comments

Comments
 (0)