Skip to content

Commit e72d8b2

Browse files
committed
Revert "[NVPTX] Unify and extend barrier{.cta} intrinsic support (llvm#140615)"
This reverts commit 735209c.
1 parent 3eb9e77 commit e72d8b2

File tree

22 files changed

+197
-389
lines changed

22 files changed

+197
-389
lines changed

clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,22 +1160,6 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
11601160
case NVPTX::BI__nvvm_fence_sc_cluster:
11611161
return Builder.CreateCall(
11621162
CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
1163-
case NVPTX::BI__nvvm_bar_sync:
1164-
return Builder.CreateCall(
1165-
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
1166-
EmitScalarExpr(E->getArg(0)));
1167-
case NVPTX::BI__syncthreads:
1168-
return Builder.CreateCall(
1169-
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
1170-
Builder.getInt32(0));
1171-
case NVPTX::BI__nvvm_barrier_sync:
1172-
return Builder.CreateCall(
1173-
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
1174-
EmitScalarExpr(E->getArg(0)));
1175-
case NVPTX::BI__nvvm_barrier_sync_cnt:
1176-
return Builder.CreateCall(
1177-
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync),
1178-
{EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
11791163
default:
11801164
return nullptr;
11811165
}

clang/test/CodeGen/builtins-nvptx-ptx60.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ __device__ void nvvm_sync(unsigned mask, int i, float f, int a, int b,
3232
// CHECK: call void @llvm.nvvm.bar.warp.sync(i32
3333
// expected-error@+1 {{'__nvvm_bar_warp_sync' needs target feature ptx60}}
3434
__nvvm_bar_warp_sync(mask);
35-
// CHECK: call void @llvm.nvvm.barrier.cta.sync.all(i32
35+
// CHECK: call void @llvm.nvvm.barrier.sync(i32
3636
// expected-error@+1 {{'__nvvm_barrier_sync' needs target feature ptx60}}
3737
__nvvm_barrier_sync(mask);
38-
// CHECK: call void @llvm.nvvm.barrier.cta.sync(i32
38+
// CHECK: call void @llvm.nvvm.barrier.sync.cnt(i32
3939
// expected-error@+1 {{'__nvvm_barrier_sync_cnt' needs target feature ptx60}}
4040
__nvvm_barrier_sync_cnt(mask, i);
4141

clang/test/CodeGen/builtins-nvptx.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ __device__ int read_pms() {
198198

199199
__device__ void sync() {
200200

201-
// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
201+
// CHECK: call void @llvm.nvvm.bar.sync(i32 0)
202202

203203
__nvvm_bar_sync(0);
204204

@@ -259,7 +259,7 @@ __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
259259
__nvvm_membar_gl();
260260
// CHECK: call void @llvm.nvvm.membar.sys()
261261
__nvvm_membar_sys();
262-
// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
262+
// CHECK: call void @llvm.nvvm.barrier0()
263263
__syncthreads();
264264
}
265265

clang/test/Headers/gpuintrin.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -887,7 +887,7 @@ __gpu_kernel void foo() {
887887
// NVPTX-LABEL: define internal void @__gpu_sync_threads(
888888
// NVPTX-SAME: ) #[[ATTR0]] {
889889
// NVPTX-NEXT: [[ENTRY:.*:]]
890-
// NVPTX-NEXT: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
890+
// NVPTX-NEXT: call void @llvm.nvvm.barrier0()
891891
// NVPTX-NEXT: ret void
892892
//
893893
//

llvm/docs/NVPTXUsage.rst

Lines changed: 5 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -199,58 +199,21 @@ map in the following way to CUDA builtins:
199199
Barriers
200200
--------
201201

202-
'``llvm.nvvm.barrier.cta.*``'
203-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
202+
'``llvm.nvvm.barrier0``'
203+
^^^^^^^^^^^^^^^^^^^^^^^^^^^
204204

205205
Syntax:
206206
"""""""
207207

208208
.. code-block:: llvm
209209
210-
declare void @llvm.nvvm.barrier.cta.sync(i32 %id, i32 %n)
211-
declare void @llvm.nvvm.barrier.cta.sync.all(i32 %id)
212-
declare void @llvm.nvvm.barrier.cta.arrive(i32 %id, i32 %n)
213-
214-
declare void @llvm.nvvm.barrier.cta.sync.aligned(i32 %id, i32 %n)
215-
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
216-
declare void @llvm.nvvm.barrier.cta.arrive.aligned(i32 %id, i32 %n)
210+
declare void @llvm.nvvm.barrier0()
217211
218212
Overview:
219213
"""""""""
220214

221-
The '``@llvm.nvvm.barrier.cta.*``' family of intrinsics perform barrier
222-
synchronization and communication within a CTA. They can be used by the threads
223-
within the CTA for synchronization and communication.
224-
225-
Semantics:
226-
""""""""""
227-
228-
Operand %id specifies a logical barrier resource and must fall within the range
229-
0 through 15. When present, operand %n specifies the number of threads
230-
participating in the barrier. When specifying a thread count, the value must be
231-
a multiple of the warp size. With the '``@llvm.nvvm.barrier.cta.sync.*``'
232-
variants, the '``.all``' suffix indicates that all threads in the CTA should
233-
participate in the barrier and the %n operand is not present.
234-
235-
All forms of the '``@llvm.nvvm.barrier.cta.*``' intrinsic cause the executing
236-
thread to wait for all non-exited threads from its warp and then marks the
237-
warp's arrival at the barrier. In addition to signaling its arrival at the
238-
barrier, the '``@llvm.nvvm.barrier.cta.sync.*``' intrinsics cause the executing
239-
thread to wait for non-exited threads of all other warps participating in the
240-
barrier to arrive. On the other hand, the '``@llvm.nvvm.barrier.cta.arrive.*``'
241-
intrinsic does not cause the executing thread to wait for threads of other
242-
participating warps.
243-
244-
When a barrier completes, the waiting threads are restarted without delay,
245-
and the barrier is reinitialized so that it can be immediately reused.
246-
247-
The '``@llvm.nvvm.barrier.cta.*``' intrinsic has an optional '``.aligned``'
248-
modifier to indicate textual alignment of the barrier. When specified, it
249-
indicates that all threads in the CTA will execute the same
250-
'``@llvm.nvvm.barrier.cta.*``' instruction. In conditionally executed code, an
251-
aligned '``@llvm.nvvm.barrier.cta.*``' instruction should only be used if it is
252-
known that all threads in the CTA evaluate the condition identically, otherwise
253-
behavior is undefined.
215+
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
216+
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
254217

255218
Electing a thread
256219
-----------------

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -128,12 +128,6 @@
128128
// * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32)
129129
// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
130130
// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap
131-
// * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0)
132-
// * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
133-
// * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
134-
// * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned(x, y)
135-
// * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x)
136-
// * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync(x, y)
137131

138132
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
139133
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
@@ -1269,28 +1263,35 @@ let TargetPrefix = "nvvm" in {
12691263
defm int_nvvm_atomic_cas_gen_i : PTXAtomicWithScope3<llvm_anyint_ty>;
12701264

12711265
// Bar.Sync
1266+
1267+
// The builtin for "bar.sync 0" is called __syncthreads. Unlike most of the
1268+
// intrinsics in this file, this one is a user-facing API.
1269+
def int_nvvm_barrier0 : ClangBuiltin<"__syncthreads">,
1270+
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
1271+
// Synchronize all threads in the CTA at barrier 'n'.
1272+
def int_nvvm_barrier_n : ClangBuiltin<"__nvvm_bar_n">,
1273+
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
1274+
// Synchronize 'm', a multiple of warp size, (arg 2) threads in
1275+
// the CTA at barrier 'n' (arg 1).
1276+
def int_nvvm_barrier : ClangBuiltin<"__nvvm_bar">,
1277+
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12721278
def int_nvvm_barrier0_popc : ClangBuiltin<"__nvvm_bar0_popc">,
12731279
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12741280
def int_nvvm_barrier0_and : ClangBuiltin<"__nvvm_bar0_and">,
12751281
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12761282
def int_nvvm_barrier0_or : ClangBuiltin<"__nvvm_bar0_or">,
12771283
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12781284

1285+
def int_nvvm_bar_sync : NVVMBuiltin,
1286+
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12791287
def int_nvvm_bar_warp_sync : NVVMBuiltin,
12801288
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12811289

1282-
// barrier{.cta}.sync{.aligned} a{, b};
1283-
// barrier{.cta}.arrive{.aligned} a, b;
1284-
let IntrProperties = [IntrConvergent, IntrNoCallback] in {
1285-
foreach align = ["", "_aligned"] in {
1286-
def int_nvvm_barrier_cta_sync # align # _all :
1287-
Intrinsic<[], [llvm_i32_ty]>;
1288-
def int_nvvm_barrier_cta_sync # align :
1289-
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
1290-
def int_nvvm_barrier_cta_arrive # align :
1291-
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
1292-
}
1293-
}
1290+
// barrier.sync id[, cnt]
1291+
def int_nvvm_barrier_sync : NVVMBuiltin,
1292+
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
1293+
def int_nvvm_barrier_sync_cnt : NVVMBuiltin,
1294+
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12941295

12951296
// barrier.cluster.[wait, arrive, arrive.relaxed]
12961297
def int_nvvm_barrier_cluster_arrive :

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,9 +1343,12 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13431343
// nvvm.abs.{i,ii}
13441344
Expand =
13451345
Name == "i" || Name == "ll" || Name == "bf16" || Name == "bf16x2";
1346-
else if (Name.consume_front("fabs."))
1346+
else if (Name == "fabs.f" || Name == "fabs.ftz.f" || Name == "fabs.d")
13471347
// nvvm.fabs.{f,ftz.f,d}
1348-
Expand = Name == "f" || Name == "ftz.f" || Name == "d";
1348+
Expand = true;
1349+
else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f" ||
1350+
Name == "swap.lo.hi.b64")
1351+
Expand = true;
13491352
else if (Name.consume_front("max.") || Name.consume_front("min."))
13501353
// nvvm.{min,max}.{i,ii,ui,ull}
13511354
Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
@@ -1377,18 +1380,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13771380
Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
13781381
Name.starts_with("p."));
13791382
else
1380-
Expand = StringSwitch<bool>(Name)
1381-
.Case("barrier0", true)
1382-
.Case("barrier.n", true)
1383-
.Case("barrier.sync.cnt", true)
1384-
.Case("barrier.sync", true)
1385-
.Case("barrier", true)
1386-
.Case("bar.sync", true)
1387-
.Case("clz.ll", true)
1388-
.Case("popc.ll", true)
1389-
.Case("h2f", true)
1390-
.Case("swap.lo.hi.b64", true)
1391-
.Default(false);
1383+
Expand = false;
13921384

13931385
if (Expand) {
13941386
NewFn = nullptr;
@@ -2486,20 +2478,6 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
24862478
MDNode *MD = MDNode::get(Builder.getContext(), {});
24872479
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
24882480
return LD;
2489-
} else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
2490-
Value *Arg =
2491-
Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
2492-
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all,
2493-
{}, {Arg});
2494-
} else if (Name == "barrier") {
2495-
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned, {},
2496-
{CI->getArgOperand(0), CI->getArgOperand(1)});
2497-
} else if (Name == "barrier.sync") {
2498-
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all, {},
2499-
{CI->getArgOperand(0)});
2500-
} else if (Name == "barrier.sync.cnt") {
2501-
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync, {},
2502-
{CI->getArgOperand(0), CI->getArgOperand(1)});
25032481
} else {
25042482
Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
25052483
if (IID != Intrinsic::not_intrinsic &&

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -237,47 +237,6 @@ def BF16RT : RegTyInfo<bf16, Int16Regs, bf16imm, fpimm, supports_imm = 0>;
237237
def F16X2RT : RegTyInfo<v2f16, Int32Regs, ?, ?, supports_imm = 0>;
238238
def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
239239

240-
// This class provides a basic wrapper around an NVPTXInst that abstracts the
241-
// specific syntax of most PTX instructions. It automatically handles the
242-
// construction of the asm string based on the provided dag arguments.
243-
// For example, the following asm-strings would be computed:
244-
//
245-
// * BasicFlagsNVPTXInst<(outs Int32Regs:$dst),
246-
// (ins Int32Regs:$a, Int32Regs:$b), (ins),
247-
// "add.s32">;
248-
// ---> "add.s32 \t$dst, $a, $b;"
249-
//
250-
// * BasicFlagsNVPTXInst<(outs Int32Regs:$d),
251-
// (ins Int32Regs:$a, Int32Regs:$b, Hexu32imm:$c),
252-
// (ins PrmtMode:$mode),
253-
// "prmt.b32${mode}">;
254-
// ---> "prmt.b32${mode} \t$d, $a, $b, $c;"
255-
//
256-
class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag, string asmstr,
257-
list<dag> pattern = []>
258-
: NVPTXInst<
259-
outs_dag,
260-
!con(ins_dag, flags_dag),
261-
!strconcat(
262-
asmstr,
263-
!if(!and(!empty(ins_dag), !empty(outs_dag)), "",
264-
!strconcat(
265-
" \t",
266-
!interleave(
267-
!foreach(i, !range(!size(outs_dag)),
268-
"$" # !getdagname(outs_dag, i)),
269-
"|"),
270-
!if(!or(!empty(ins_dag), !empty(outs_dag)), "", ", "),
271-
!interleave(
272-
!foreach(i, !range(!size(ins_dag)),
273-
"$" # !getdagname(ins_dag, i)),
274-
", "))),
275-
";"),
276-
pattern>;
277-
278-
class BasicNVPTXInst<dag outs, dag insv, string asmstr, list<dag> pattern = []>
279-
: BasicFlagsNVPTXInst<outs, insv, (ins), asmstr, pattern>;
280-
281240

282241
multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t,
283242
bit commutative, list<Predicate> requires = []> {

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 34 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,15 @@ class THREADMASK_INFO<bit sync> {
6767
// Synchronization and shuffle functions
6868
//-----------------------------------
6969
let isConvergent = true in {
70+
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
71+
"bar.sync \t0;",
72+
[(int_nvvm_barrier0)]>;
73+
def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
74+
"bar.sync \t$src1;",
75+
[(int_nvvm_barrier_n i32:$src1)]>;
76+
def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
77+
"bar.sync \t$src1, $src2;",
78+
[(int_nvvm_barrier i32:$src1, i32:$src2)]>;
7079
def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
7180
!strconcat("{{ \n\t",
7281
".reg .pred \t%p1; \n\t",
@@ -93,51 +102,39 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
93102
"}}"),
94103
[(set i32:$dst, (int_nvvm_barrier0_or i32:$pred))]>;
95104

105+
def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync \t$i;",
106+
[(int_nvvm_bar_sync imm:$i)]>;
107+
96108
def INT_BAR_WARP_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync \t$i;",
97109
[(int_nvvm_bar_warp_sync imm:$i)]>,
98110
Requires<[hasPTX<60>, hasSM<30>]>;
99111
def INT_BAR_WARP_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "bar.warp.sync \t$i;",
100112
[(int_nvvm_bar_warp_sync i32:$i)]>,
101113
Requires<[hasPTX<60>, hasSM<30>]>;
102114

103-
multiclass BARRIER1<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
104-
def _i : BasicNVPTXInst<(outs), (ins i32imm:$i), asmstr,
105-
[(intrinsic imm:$i)]>,
106-
Requires<requires>;
107-
108-
def _r : BasicNVPTXInst<(outs), (ins Int32Regs:$i), asmstr,
109-
[(intrinsic i32:$i)]>,
110-
Requires<requires>;
111-
}
112-
113-
multiclass BARRIER2<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
114-
def _rr : BasicNVPTXInst<(outs), (ins Int32Regs:$i, Int32Regs:$j), asmstr,
115-
[(intrinsic i32:$i, i32:$j)]>,
116-
Requires<requires>;
117-
118-
def _ri : BasicNVPTXInst<(outs), (ins Int32Regs:$i, i32imm:$j), asmstr,
119-
[(intrinsic i32:$i, imm:$j)]>,
120-
Requires<requires>;
121-
122-
def _ir : BasicNVPTXInst<(outs), (ins i32imm:$i, Int32Regs:$j), asmstr,
123-
[(intrinsic imm:$i, i32:$j)]>,
124-
Requires<requires>;
125-
126-
def _ii : BasicNVPTXInst<(outs), (ins i32imm:$i, i32imm:$j), asmstr,
127-
[(intrinsic imm:$i, imm:$j)]>,
128-
Requires<requires>;
129-
}
130-
131-
// Note the "bar.sync" variants could be renamed to the equivalent corresponding
132-
// "barrier.*.aligned" variants. We use the older syntax for compatibility with
133-
// older versions of the PTX ISA.
134-
defm BARRIER_CTA_SYNC_ALIGNED_ALL : BARRIER1<"bar.sync", int_nvvm_barrier_cta_sync_aligned_all>;
135-
defm BARRIER_CTA_SYNC_ALIGNED : BARRIER2<"bar.sync", int_nvvm_barrier_cta_sync_aligned>;
136-
defm BARRIER_CTA_ARRIVE_ALIGNED : BARRIER2<"bar.arrive", int_nvvm_barrier_cta_arrive_aligned>;
115+
def INT_BARRIER_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "barrier.sync \t$i;",
116+
[(int_nvvm_barrier_sync imm:$i)]>,
117+
Requires<[hasPTX<60>, hasSM<30>]>;
118+
def INT_BARRIER_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "barrier.sync \t$i;",
119+
[(int_nvvm_barrier_sync i32:$i)]>,
120+
Requires<[hasPTX<60>, hasSM<30>]>;
137121

138-
defm BARRIER_CTA_SYNC_ALL : BARRIER1<"barrier.sync", int_nvvm_barrier_cta_sync_all, [hasPTX<60>]>;
139-
defm BARRIER_CTA_SYNC : BARRIER2<"barrier.sync", int_nvvm_barrier_cta_sync, [hasPTX<60>]>;
140-
defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive, [hasPTX<60>]>;
122+
def INT_BARRIER_SYNC_CNT_RR : NVPTXInst<(outs), (ins Int32Regs:$id, Int32Regs:$cnt),
123+
"barrier.sync \t$id, $cnt;",
124+
[(int_nvvm_barrier_sync_cnt i32:$id, i32:$cnt)]>,
125+
Requires<[hasPTX<60>, hasSM<30>]>;
126+
def INT_BARRIER_SYNC_CNT_RI : NVPTXInst<(outs), (ins Int32Regs:$id, i32imm:$cnt),
127+
"barrier.sync \t$id, $cnt;",
128+
[(int_nvvm_barrier_sync_cnt i32:$id, imm:$cnt)]>,
129+
Requires<[hasPTX<60>, hasSM<30>]>;
130+
def INT_BARRIER_SYNC_CNT_IR : NVPTXInst<(outs), (ins i32imm:$id, Int32Regs:$cnt),
131+
"barrier.sync \t$id, $cnt;",
132+
[(int_nvvm_barrier_sync_cnt imm:$id, i32:$cnt)]>,
133+
Requires<[hasPTX<60>, hasSM<30>]>;
134+
def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
135+
"barrier.sync \t$id, $cnt;",
136+
[(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
137+
Requires<[hasPTX<60>, hasSM<30>]>;
141138

142139
class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
143140
list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:

llvm/lib/Transforms/IPO/AttributorAttributes.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2150,8 +2150,7 @@ struct AANoUnwindCallSite final
21502150

21512151
bool AANoSync::isAlignedBarrier(const CallBase &CB, bool ExecutedAligned) {
21522152
switch (CB.getIntrinsicID()) {
2153-
case Intrinsic::nvvm_barrier_cta_sync_aligned_all:
2154-
case Intrinsic::nvvm_barrier_cta_sync_aligned:
2153+
case Intrinsic::nvvm_barrier0:
21552154
case Intrinsic::nvvm_barrier0_and:
21562155
case Intrinsic::nvvm_barrier0_or:
21572156
case Intrinsic::nvvm_barrier0_popc:

0 commit comments

Comments
 (0)