Skip to content

Commit 735209c

Browse files
authored
[NVPTX] Unify and extend barrier{.cta} intrinsic support (llvm#140615)
Our current intrinsic support for barrier intrinsics is confusing and incomplete, with multiple intrinsics mapping to the same instruction and intrinsic names not clearly conveying intrinsic semantics. Further, we lack support for some variants. This change unifies the IR representation to a single consistently named set of intrinsics. - llvm.nvvm.barrier.cta.sync.aligned.all(i32) - llvm.nvvm.barrier.cta.sync.aligned(i32, i32) - llvm.nvvm.barrier.cta.arrive.aligned(i32, i32) - llvm.nvvm.barrier.cta.sync.all(i32) - llvm.nvvm.barrier.cta.sync(i32, i32) - llvm.nvvm.barrier.cta.arrive(i32, i32) The following Auto-Upgrade rules are used to maintain compatibility with IR using the legacy intrinsics: * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0) * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x) * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x) * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned(x, y) * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x) * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync(x, y)
1 parent a28d753 commit 735209c

File tree

22 files changed

+389
-197
lines changed

22 files changed

+389
-197
lines changed

clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,6 +1160,22 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
11601160
case NVPTX::BI__nvvm_fence_sc_cluster:
11611161
return Builder.CreateCall(
11621162
CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
1163+
case NVPTX::BI__nvvm_bar_sync:
1164+
return Builder.CreateCall(
1165+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
1166+
EmitScalarExpr(E->getArg(0)));
1167+
case NVPTX::BI__syncthreads:
1168+
return Builder.CreateCall(
1169+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
1170+
Builder.getInt32(0));
1171+
case NVPTX::BI__nvvm_barrier_sync:
1172+
return Builder.CreateCall(
1173+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
1174+
EmitScalarExpr(E->getArg(0)));
1175+
case NVPTX::BI__nvvm_barrier_sync_cnt:
1176+
return Builder.CreateCall(
1177+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync),
1178+
{EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
11631179
default:
11641180
return nullptr;
11651181
}

clang/test/CodeGen/builtins-nvptx-ptx60.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ __device__ void nvvm_sync(unsigned mask, int i, float f, int a, int b,
3232
// CHECK: call void @llvm.nvvm.bar.warp.sync(i32
3333
// expected-error@+1 {{'__nvvm_bar_warp_sync' needs target feature ptx60}}
3434
__nvvm_bar_warp_sync(mask);
35-
// CHECK: call void @llvm.nvvm.barrier.sync(i32
35+
// CHECK: call void @llvm.nvvm.barrier.cta.sync.all(i32
3636
// expected-error@+1 {{'__nvvm_barrier_sync' needs target feature ptx60}}
3737
__nvvm_barrier_sync(mask);
38-
// CHECK: call void @llvm.nvvm.barrier.sync.cnt(i32
38+
// CHECK: call void @llvm.nvvm.barrier.cta.sync(i32
3939
// expected-error@+1 {{'__nvvm_barrier_sync_cnt' needs target feature ptx60}}
4040
__nvvm_barrier_sync_cnt(mask, i);
4141

clang/test/CodeGen/builtins-nvptx.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ __device__ int read_pms() {
198198

199199
__device__ void sync() {
200200

201-
// CHECK: call void @llvm.nvvm.bar.sync(i32 0)
201+
// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
202202

203203
__nvvm_bar_sync(0);
204204

@@ -259,7 +259,7 @@ __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
259259
__nvvm_membar_gl();
260260
// CHECK: call void @llvm.nvvm.membar.sys()
261261
__nvvm_membar_sys();
262-
// CHECK: call void @llvm.nvvm.barrier0()
262+
// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
263263
__syncthreads();
264264
}
265265

clang/test/Headers/gpuintrin.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -887,7 +887,7 @@ __gpu_kernel void foo() {
887887
// NVPTX-LABEL: define internal void @__gpu_sync_threads(
888888
// NVPTX-SAME: ) #[[ATTR0]] {
889889
// NVPTX-NEXT: [[ENTRY:.*:]]
890-
// NVPTX-NEXT: call void @llvm.nvvm.barrier0()
890+
// NVPTX-NEXT: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
891891
// NVPTX-NEXT: ret void
892892
//
893893
//

llvm/docs/NVPTXUsage.rst

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -199,21 +199,58 @@ map in the following way to CUDA builtins:
199199
Barriers
200200
--------
201201

202-
'``llvm.nvvm.barrier0``'
203-
^^^^^^^^^^^^^^^^^^^^^^^^^^^
202+
'``llvm.nvvm.barrier.cta.*``'
203+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
204204

205205
Syntax:
206206
"""""""
207207

208208
.. code-block:: llvm
209209
210-
declare void @llvm.nvvm.barrier0()
210+
declare void @llvm.nvvm.barrier.cta.sync(i32 %id, i32 %n)
211+
declare void @llvm.nvvm.barrier.cta.sync.all(i32 %id)
212+
declare void @llvm.nvvm.barrier.cta.arrive(i32 %id, i32 %n)
213+
214+
declare void @llvm.nvvm.barrier.cta.sync.aligned(i32 %id, i32 %n)
215+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
216+
declare void @llvm.nvvm.barrier.cta.arrive.aligned(i32 %id, i32 %n)
211217
212218
Overview:
213219
"""""""""
214220

215-
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
216-
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
221+
The '``@llvm.nvvm.barrier.cta.*``' family of intrinsics perform barrier
222+
synchronization and communication within a CTA. They can be used by the threads
223+
within the CTA for synchronization and communication.
224+
225+
Semantics:
226+
""""""""""
227+
228+
Operand %id specifies a logical barrier resource and must fall within the range
229+
0 through 15. When present, operand %n specifies the number of threads
230+
participating in the barrier. When specifying a thread count, the value must be
231+
a multiple of the warp size. With the '``@llvm.nvvm.barrier.cta.sync.*``'
232+
variants, the '``.all``' suffix indicates that all threads in the CTA should
233+
participate in the barrier and the %n operand is not present.
234+
235+
All forms of the '``@llvm.nvvm.barrier.cta.*``' intrinsic cause the executing
236+
thread to wait for all non-exited threads from its warp and then marks the
237+
warp's arrival at the barrier. In addition to signaling its arrival at the
238+
barrier, the '``@llvm.nvvm.barrier.cta.sync.*``' intrinsics cause the executing
239+
thread to wait for non-exited threads of all other warps participating in the
240+
barrier to arrive. On the other hand, the '``@llvm.nvvm.barrier.cta.arrive.*``'
241+
intrinsic does not cause the executing thread to wait for threads of other
242+
participating warps.
243+
244+
When a barrier completes, the waiting threads are restarted without delay,
245+
and the barrier is reinitialized so that it can be immediately reused.
246+
247+
The '``@llvm.nvvm.barrier.cta.*``' intrinsic has an optional '``.aligned``'
248+
modifier to indicate textual alignment of the barrier. When specified, it
249+
indicates that all threads in the CTA will execute the same
250+
'``@llvm.nvvm.barrier.cta.*``' instruction. In conditionally executed code, an
251+
aligned '``@llvm.nvvm.barrier.cta.*``' instruction should only be used if it is
252+
known that all threads in the CTA evaluate the condition identically, otherwise
253+
behavior is undefined.
217254

218255
Electing a thread
219256
-----------------

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,12 @@
128128
// * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32)
129129
// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
130130
// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap
131+
// * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0)
132+
// * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
133+
// * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
134+
// * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned(x, y)
135+
// * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x)
136+
// * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync(x, y)
131137

132138
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
133139
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
@@ -1263,35 +1269,28 @@ let TargetPrefix = "nvvm" in {
12631269
defm int_nvvm_atomic_cas_gen_i : PTXAtomicWithScope3<llvm_anyint_ty>;
12641270

12651271
// Bar.Sync
1266-
1267-
// The builtin for "bar.sync 0" is called __syncthreads. Unlike most of the
1268-
// intrinsics in this file, this one is a user-facing API.
1269-
def int_nvvm_barrier0 : ClangBuiltin<"__syncthreads">,
1270-
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
1271-
// Synchronize all threads in the CTA at barrier 'n'.
1272-
def int_nvvm_barrier_n : ClangBuiltin<"__nvvm_bar_n">,
1273-
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
1274-
// Synchronize 'm', a multiple of warp size, (arg 2) threads in
1275-
// the CTA at barrier 'n' (arg 1).
1276-
def int_nvvm_barrier : ClangBuiltin<"__nvvm_bar">,
1277-
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12781272
def int_nvvm_barrier0_popc : ClangBuiltin<"__nvvm_bar0_popc">,
12791273
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12801274
def int_nvvm_barrier0_and : ClangBuiltin<"__nvvm_bar0_and">,
12811275
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12821276
def int_nvvm_barrier0_or : ClangBuiltin<"__nvvm_bar0_or">,
12831277
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12841278

1285-
def int_nvvm_bar_sync : NVVMBuiltin,
1286-
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12871279
def int_nvvm_bar_warp_sync : NVVMBuiltin,
12881280
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12891281

1290-
// barrier.sync id[, cnt]
1291-
def int_nvvm_barrier_sync : NVVMBuiltin,
1292-
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
1293-
def int_nvvm_barrier_sync_cnt : NVVMBuiltin,
1294-
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
1282+
// barrier{.cta}.sync{.aligned} a{, b};
1283+
// barrier{.cta}.arrive{.aligned} a, b;
1284+
let IntrProperties = [IntrConvergent, IntrNoCallback] in {
1285+
foreach align = ["", "_aligned"] in {
1286+
def int_nvvm_barrier_cta_sync # align # _all :
1287+
Intrinsic<[], [llvm_i32_ty]>;
1288+
def int_nvvm_barrier_cta_sync # align :
1289+
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
1290+
def int_nvvm_barrier_cta_arrive # align :
1291+
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
1292+
}
1293+
}
12951294

12961295
// barrier.cluster.[wait, arrive, arrive.relaxed]
12971296
def int_nvvm_barrier_cluster_arrive :

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,12 +1343,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13431343
// nvvm.abs.{i,ii}
13441344
Expand =
13451345
Name == "i" || Name == "ll" || Name == "bf16" || Name == "bf16x2";
1346-
else if (Name == "fabs.f" || Name == "fabs.ftz.f" || Name == "fabs.d")
1346+
else if (Name.consume_front("fabs."))
13471347
// nvvm.fabs.{f,ftz.f,d}
1348-
Expand = true;
1349-
else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f" ||
1350-
Name == "swap.lo.hi.b64")
1351-
Expand = true;
1348+
Expand = Name == "f" || Name == "ftz.f" || Name == "d";
13521349
else if (Name.consume_front("max.") || Name.consume_front("min."))
13531350
// nvvm.{min,max}.{i,ii,ui,ull}
13541351
Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
@@ -1380,7 +1377,18 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13801377
Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
13811378
Name.starts_with("p."));
13821379
else
1383-
Expand = false;
1380+
Expand = StringSwitch<bool>(Name)
1381+
.Case("barrier0", true)
1382+
.Case("barrier.n", true)
1383+
.Case("barrier.sync.cnt", true)
1384+
.Case("barrier.sync", true)
1385+
.Case("barrier", true)
1386+
.Case("bar.sync", true)
1387+
.Case("clz.ll", true)
1388+
.Case("popc.ll", true)
1389+
.Case("h2f", true)
1390+
.Case("swap.lo.hi.b64", true)
1391+
.Default(false);
13841392

13851393
if (Expand) {
13861394
NewFn = nullptr;
@@ -2478,6 +2486,20 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
24782486
MDNode *MD = MDNode::get(Builder.getContext(), {});
24792487
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
24802488
return LD;
2489+
} else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
2490+
Value *Arg =
2491+
Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
2492+
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all,
2493+
{}, {Arg});
2494+
} else if (Name == "barrier") {
2495+
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned, {},
2496+
{CI->getArgOperand(0), CI->getArgOperand(1)});
2497+
} else if (Name == "barrier.sync") {
2498+
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all, {},
2499+
{CI->getArgOperand(0)});
2500+
} else if (Name == "barrier.sync.cnt") {
2501+
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync, {},
2502+
{CI->getArgOperand(0), CI->getArgOperand(1)});
24812503
} else {
24822504
Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
24832505
if (IID != Intrinsic::not_intrinsic &&

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,47 @@ def BF16RT : RegTyInfo<bf16, Int16Regs, bf16imm, fpimm, supports_imm = 0>;
237237
def F16X2RT : RegTyInfo<v2f16, Int32Regs, ?, ?, supports_imm = 0>;
238238
def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
239239

240+
// This class provides a basic wrapper around an NVPTXInst that abstracts the
241+
// specific syntax of most PTX instructions. It automatically handles the
242+
// construction of the asm string based on the provided dag arguments.
243+
// For example, the following asm-strings would be computed:
244+
//
245+
// * BasicFlagsNVPTXInst<(outs Int32Regs:$dst),
246+
// (ins Int32Regs:$a, Int32Regs:$b), (ins),
247+
// "add.s32">;
248+
// ---> "add.s32 \t$dst, $a, $b;"
249+
//
250+
// * BasicFlagsNVPTXInst<(outs Int32Regs:$d),
251+
// (ins Int32Regs:$a, Int32Regs:$b, Hexu32imm:$c),
252+
// (ins PrmtMode:$mode),
253+
// "prmt.b32${mode}">;
254+
// ---> "prmt.b32${mode} \t$d, $a, $b, $c;"
255+
//
256+
class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag, string asmstr,
257+
list<dag> pattern = []>
258+
: NVPTXInst<
259+
outs_dag,
260+
!con(ins_dag, flags_dag),
261+
!strconcat(
262+
asmstr,
263+
!if(!and(!empty(ins_dag), !empty(outs_dag)), "",
264+
!strconcat(
265+
" \t",
266+
!interleave(
267+
!foreach(i, !range(!size(outs_dag)),
268+
"$" # !getdagname(outs_dag, i)),
269+
"|"),
270+
!if(!or(!empty(ins_dag), !empty(outs_dag)), "", ", "),
271+
!interleave(
272+
!foreach(i, !range(!size(ins_dag)),
273+
"$" # !getdagname(ins_dag, i)),
274+
", "))),
275+
";"),
276+
pattern>;
277+
278+
class BasicNVPTXInst<dag outs, dag insv, string asmstr, list<dag> pattern = []>
279+
: BasicFlagsNVPTXInst<outs, insv, (ins), asmstr, pattern>;
280+
240281

241282
multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t,
242283
bit commutative, list<Predicate> requires = []> {

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 37 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -67,15 +67,6 @@ class THREADMASK_INFO<bit sync> {
6767
// Synchronization and shuffle functions
6868
//-----------------------------------
6969
let isConvergent = true in {
70-
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
71-
"bar.sync \t0;",
72-
[(int_nvvm_barrier0)]>;
73-
def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
74-
"bar.sync \t$src1;",
75-
[(int_nvvm_barrier_n i32:$src1)]>;
76-
def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
77-
"bar.sync \t$src1, $src2;",
78-
[(int_nvvm_barrier i32:$src1, i32:$src2)]>;
7970
def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
8071
!strconcat("{{ \n\t",
8172
".reg .pred \t%p1; \n\t",
@@ -102,39 +93,51 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
10293
"}}"),
10394
[(set i32:$dst, (int_nvvm_barrier0_or i32:$pred))]>;
10495

105-
def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync \t$i;",
106-
[(int_nvvm_bar_sync imm:$i)]>;
107-
10896
def INT_BAR_WARP_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync \t$i;",
10997
[(int_nvvm_bar_warp_sync imm:$i)]>,
11098
Requires<[hasPTX<60>, hasSM<30>]>;
11199
def INT_BAR_WARP_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "bar.warp.sync \t$i;",
112100
[(int_nvvm_bar_warp_sync i32:$i)]>,
113101
Requires<[hasPTX<60>, hasSM<30>]>;
114102

115-
def INT_BARRIER_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "barrier.sync \t$i;",
116-
[(int_nvvm_barrier_sync imm:$i)]>,
117-
Requires<[hasPTX<60>, hasSM<30>]>;
118-
def INT_BARRIER_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "barrier.sync \t$i;",
119-
[(int_nvvm_barrier_sync i32:$i)]>,
120-
Requires<[hasPTX<60>, hasSM<30>]>;
103+
multiclass BARRIER1<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
104+
def _i : BasicNVPTXInst<(outs), (ins i32imm:$i), asmstr,
105+
[(intrinsic imm:$i)]>,
106+
Requires<requires>;
121107

122-
def INT_BARRIER_SYNC_CNT_RR : NVPTXInst<(outs), (ins Int32Regs:$id, Int32Regs:$cnt),
123-
"barrier.sync \t$id, $cnt;",
124-
[(int_nvvm_barrier_sync_cnt i32:$id, i32:$cnt)]>,
125-
Requires<[hasPTX<60>, hasSM<30>]>;
126-
def INT_BARRIER_SYNC_CNT_RI : NVPTXInst<(outs), (ins Int32Regs:$id, i32imm:$cnt),
127-
"barrier.sync \t$id, $cnt;",
128-
[(int_nvvm_barrier_sync_cnt i32:$id, imm:$cnt)]>,
129-
Requires<[hasPTX<60>, hasSM<30>]>;
130-
def INT_BARRIER_SYNC_CNT_IR : NVPTXInst<(outs), (ins i32imm:$id, Int32Regs:$cnt),
131-
"barrier.sync \t$id, $cnt;",
132-
[(int_nvvm_barrier_sync_cnt imm:$id, i32:$cnt)]>,
133-
Requires<[hasPTX<60>, hasSM<30>]>;
134-
def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
135-
"barrier.sync \t$id, $cnt;",
136-
[(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
137-
Requires<[hasPTX<60>, hasSM<30>]>;
108+
def _r : BasicNVPTXInst<(outs), (ins Int32Regs:$i), asmstr,
109+
[(intrinsic i32:$i)]>,
110+
Requires<requires>;
111+
}
112+
113+
multiclass BARRIER2<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
114+
def _rr : BasicNVPTXInst<(outs), (ins Int32Regs:$i, Int32Regs:$j), asmstr,
115+
[(intrinsic i32:$i, i32:$j)]>,
116+
Requires<requires>;
117+
118+
def _ri : BasicNVPTXInst<(outs), (ins Int32Regs:$i, i32imm:$j), asmstr,
119+
[(intrinsic i32:$i, imm:$j)]>,
120+
Requires<requires>;
121+
122+
def _ir : BasicNVPTXInst<(outs), (ins i32imm:$i, Int32Regs:$j), asmstr,
123+
[(intrinsic imm:$i, i32:$j)]>,
124+
Requires<requires>;
125+
126+
def _ii : BasicNVPTXInst<(outs), (ins i32imm:$i, i32imm:$j), asmstr,
127+
[(intrinsic imm:$i, imm:$j)]>,
128+
Requires<requires>;
129+
}
130+
131+
// Note the "bar.sync" variants could be renamed to the equivalent corresponding
132+
// "barrier.*.aligned" variants. We use the older syntax for compatibility with
133+
// older versions of the PTX ISA.
134+
defm BARRIER_CTA_SYNC_ALIGNED_ALL : BARRIER1<"bar.sync", int_nvvm_barrier_cta_sync_aligned_all>;
135+
defm BARRIER_CTA_SYNC_ALIGNED : BARRIER2<"bar.sync", int_nvvm_barrier_cta_sync_aligned>;
136+
defm BARRIER_CTA_ARRIVE_ALIGNED : BARRIER2<"bar.arrive", int_nvvm_barrier_cta_arrive_aligned>;
137+
138+
defm BARRIER_CTA_SYNC_ALL : BARRIER1<"barrier.sync", int_nvvm_barrier_cta_sync_all, [hasPTX<60>]>;
139+
defm BARRIER_CTA_SYNC : BARRIER2<"barrier.sync", int_nvvm_barrier_cta_sync, [hasPTX<60>]>;
140+
defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive, [hasPTX<60>]>;
138141

139142
class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
140143
list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:

llvm/lib/Transforms/IPO/AttributorAttributes.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2150,7 +2150,8 @@ struct AANoUnwindCallSite final
21502150

21512151
bool AANoSync::isAlignedBarrier(const CallBase &CB, bool ExecutedAligned) {
21522152
switch (CB.getIntrinsicID()) {
2153-
case Intrinsic::nvvm_barrier0:
2153+
case Intrinsic::nvvm_barrier_cta_sync_aligned_all:
2154+
case Intrinsic::nvvm_barrier_cta_sync_aligned:
21542155
case Intrinsic::nvvm_barrier0_and:
21552156
case Intrinsic::nvvm_barrier0_or:
21562157
case Intrinsic::nvvm_barrier0_popc:

0 commit comments

Comments
 (0)