Skip to content

Commit 25708b3

Browse files
committed
[NVPTX, CUDA] barrier intrinsics and builtins for sm_90
Differential Revision: https://reviews.llvm.org/D151363
1 parent 0a0bae1 commit 25708b3

File tree

6 files changed

+71
-3
lines changed

6 files changed

+71
-3
lines changed

Diff for: clang/include/clang/Basic/BuiltinsNVPTX.def

+5
Original file line number | Diff line number | Diff line change
@@ -582,6 +582,11 @@ TARGET_BUILTIN(__nvvm_bar_warp_sync, "vUi", "n", PTX60)
582582
TARGET_BUILTIN(__nvvm_barrier_sync, "vUi", "n", PTX60)
583583
TARGET_BUILTIN(__nvvm_barrier_sync_cnt, "vUiUi", "n", PTX60)
584584

585+
TARGET_BUILTIN(__nvvm_barrier_cluster_arrive, "v", "n", AND(SM_90,PTX78))
586+
TARGET_BUILTIN(__nvvm_barrier_cluster_arrive_relaxed, "v", "n", AND(SM_90,PTX80))
587+
TARGET_BUILTIN(__nvvm_barrier_cluster_wait, "v", "n", AND(SM_90,PTX78))
588+
TARGET_BUILTIN(__nvvm_fence_sc_cluster, "v", "n", AND(SM_90,PTX78))
589+
585590
// Shuffle
586591

587592
BUILTIN(__nvvm_shfl_down_i32, "iiii", "")

Diff for: clang/lib/CodeGen/CGBuiltin.cpp

+12
Original file line number | Diff line number | Diff line change
@@ -18962,6 +18962,18 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
1896218962
return Builder.CreateCall(
1896318963
CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
1896418964
EmitScalarExpr(E->getArg(0)));
18965+
case NVPTX::BI__nvvm_barrier_cluster_arrive:
18966+
return Builder.CreateCall(
18967+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
18968+
case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
18969+
return Builder.CreateCall(
18970+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
18971+
case NVPTX::BI__nvvm_barrier_cluster_wait:
18972+
return Builder.CreateCall(
18973+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
18974+
case NVPTX::BI__nvvm_fence_sc_cluster:
18975+
return Builder.CreateCall(
18976+
CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
1896518977
default:
1896618978
return nullptr;
1896718979
}

Diff for: clang/test/CodeGenCUDA/builtins-sm90.cu

+10 -1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx78" "-target-cpu" "sm_90" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
1+
// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx80" "-target-cpu" "sm_90" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
22

33
// CHECK: define{{.*}} void @_Z6kernelPlPvj(
44
__attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
@@ -57,5 +57,14 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
5757
// CHECK: call i32 @llvm.nvvm.getctarank.shared.cluster(ptr addrspace(3) {{.*}})
5858
out[i++] = __nvvm_getctarank_shared_cluster(sptr);
5959

60+
// CHECK: call void @llvm.nvvm.barrier.cluster.arrive()
61+
__nvvm_barrier_cluster_arrive();
62+
// CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
63+
__nvvm_barrier_cluster_arrive_relaxed();
64+
// CHECK: call void @llvm.nvvm.barrier.cluster.wait()
65+
__nvvm_barrier_cluster_wait();
66+
// CHECK: call void @llvm.nvvm.fence.sc.cluster()
67+
__nvvm_fence_sc_cluster();
68+
6069
// CHECK: ret void
6170
}

Diff for: llvm/include/llvm/IR/IntrinsicsNVVM.td

+10
Original file line number | Diff line number | Diff line change
@@ -1358,13 +1358,23 @@ let TargetPrefix = "nvvm" in {
13581358
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>,
13591359
ClangBuiltin<"__nvvm_barrier_sync_cnt">;
13601360

1361+
// barrier.cluster.[wait, arrive, arrive.relaxed]
1362+
def int_nvvm_barrier_cluster_arrive :
1363+
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
1364+
def int_nvvm_barrier_cluster_arrive_relaxed :
1365+
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
1366+
def int_nvvm_barrier_cluster_wait :
1367+
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
1368+
13611369
// Membar
13621370
def int_nvvm_membar_cta : ClangBuiltin<"__nvvm_membar_cta">,
13631371
Intrinsic<[], [], [IntrNoCallback]>;
13641372
def int_nvvm_membar_gl : ClangBuiltin<"__nvvm_membar_gl">,
13651373
Intrinsic<[], [], [IntrNoCallback]>;
13661374
def int_nvvm_membar_sys : ClangBuiltin<"__nvvm_membar_sys">,
13671375
Intrinsic<[], [], [IntrNoCallback]>;
1376+
def int_nvvm_fence_sc_cluster:
1377+
Intrinsic<[], [], [IntrNoCallback]>;
13681378

13691379
// Async Copy
13701380
def int_nvvm_cp_async_mbarrier_arrive :

Diff for: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

+15
Original file line number | Diff line number | Diff line change
@@ -132,6 +132,18 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
132132
"barrier.sync \t$id, $cnt;",
133133
[(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
134134
Requires<[hasPTX<60>, hasSM<30>]>;
135+
class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
136+
list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:
137+
NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>,
138+
Requires<Preds>;
139+
140+
def barrier_cluster_arrive:
141+
INT_BARRIER_CLUSTER<"arrive", int_nvvm_barrier_cluster_arrive>;
142+
def barrier_cluster_arrive_relaxed:
143+
INT_BARRIER_CLUSTER<"arrive.relaxed",
144+
int_nvvm_barrier_cluster_arrive_relaxed, [hasPTX<80>, hasSM<90>]>;
145+
def barrier_cluster_wait:
146+
INT_BARRIER_CLUSTER<"wait", int_nvvm_barrier_cluster_wait>;
135147

136148
class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
137149
bit offset_imm, bit mask_imm, bit threadmask_imm>
@@ -303,6 +315,9 @@ def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
303315
def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
304316
def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
305317

318+
def INT_FENCE_SC_CLUSTER:
319+
MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
320+
Requires<[hasPTX<78>, hasSM<90>]>;
306321

307322
//-----------------------------------
308323
// Async Copy Functions

Diff for: llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll

+19 -2
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
1-
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK %s
2-
; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %}
1+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s
2+
; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
33

44
; CHECK-LABEL: test_isspacep
55
define i1 @test_isspacep_shared_cluster(ptr %p) {
@@ -120,6 +120,19 @@ define i1 @test_is_explicit_cluster() {
120120
ret i1 %x
121121
}
122122

123+
; CHECK-LABEL: test_barrier_cluster(
124+
define void @test_barrier_cluster() {
125+
; CHECK: barrier.cluster.arrive;
126+
call void @llvm.nvvm.barrier.cluster.arrive()
127+
; CHECK: barrier.cluster.arrive.relaxed;
128+
call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
129+
; CHECK: barrier.cluster.wait;
130+
call void @llvm.nvvm.barrier.cluster.wait()
131+
; CHECK: fence.sc.cluster
132+
call void @llvm.nvvm.fence.sc.cluster()
133+
ret void
134+
}
135+
123136

124137
declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr %p);
125138
declare ptr @llvm.nvvm.mapa(ptr %p, i32 %r);
@@ -137,3 +150,7 @@ declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
137150
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank()
138151
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank()
139152
declare i1 @llvm.nvvm.is_explicit_cluster()
153+
declare void @llvm.nvvm.barrier.cluster.arrive()
154+
declare void @llvm.nvvm.barrier.cluster.arrive.relaxed()
155+
declare void @llvm.nvvm.barrier.cluster.wait()
156+
declare void @llvm.nvvm.fence.sc.cluster()

0 commit comments

Comments
 (0)