Skip to content

Commit 89b7b3b

Browse files
authored
[NVPTX] support dynamic allocas with PTX alloca instruction (llvm#84585)
Add support for dynamically sized alloca instructions with the PTX alloca instruction introduced in PTX 7.3 ([9.7.15.3. Stack Manipulation Instructions: alloca](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca)).
1 parent 1b00727 commit 89b7b3b

File tree

6 files changed

+105
-22
lines changed

6 files changed

+105
-22
lines changed

llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,12 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
6060
NRI->getFrameRegister(MF))
6161
.addReg(NRI->getFrameLocalRegister(MF));
6262
}
63-
BuildMI(MBB, MBBI, dl,
64-
MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
65-
NRI->getFrameLocalRegister(MF))
66-
.addImm(MF.getFunctionNumber());
63+
if (!MR.use_empty(NRI->getFrameLocalRegister(MF))) {
64+
BuildMI(MBB, MBBI, dl,
65+
MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
66+
NRI->getFrameLocalRegister(MF))
67+
.addImm(MF.getFunctionNumber());
68+
}
6769
}
6870
}
6971

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -645,8 +645,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
645645
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
646646
setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
647647

648-
// Lowering of DYNAMIC_STACKALLOC is unsupported.
649-
// Custom lower to produce an error.
650648
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
651649
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
652650

@@ -937,6 +935,7 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
937935
MAKE_CASE(NVPTXISD::BFE)
938936
MAKE_CASE(NVPTXISD::BFI)
939937
MAKE_CASE(NVPTXISD::PRMT)
938+
MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
940939
MAKE_CASE(NVPTXISD::SETP_F16X2)
941940
MAKE_CASE(NVPTXISD::SETP_BF16X2)
942941
MAKE_CASE(NVPTXISD::Dummy)
@@ -2211,14 +2210,39 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
22112210

22122211
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
22132212
SelectionDAG &DAG) const {
2214-
const Function &Fn = DAG.getMachineFunction().getFunction();
2215-
2216-
DiagnosticInfoUnsupported NoDynamicAlloca(
2217-
Fn, "dynamic alloca unsupported by NVPTX backend",
2218-
SDLoc(Op).getDebugLoc());
2219-
DAG.getContext()->diagnose(NoDynamicAlloca);
2220-
auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
2221-
return DAG.getMergeValues(Ops, SDLoc());
2213+
2214+
if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2215+
const Function &Fn = DAG.getMachineFunction().getFunction();
2216+
2217+
DiagnosticInfoUnsupported NoDynamicAlloca(
2218+
Fn,
2219+
"Support for dynamic alloca introduced in PTX ISA version 7.3 and "
2220+
"requires target sm_52.",
2221+
SDLoc(Op).getDebugLoc());
2222+
DAG.getContext()->diagnose(NoDynamicAlloca);
2223+
auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
2224+
Op.getOperand(0)};
2225+
return DAG.getMergeValues(Ops, SDLoc());
2226+
}
2227+
2228+
SDValue Chain = Op.getOperand(0);
2229+
SDValue Size = Op.getOperand(1);
2230+
uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2231+
SDLoc DL(Op.getNode());
2232+
2233+
// The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
2234+
if (nvTM->is64Bit())
2235+
Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
2236+
else
2237+
Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
2238+
2239+
SDValue AllocOps[] = {Chain, Size,
2240+
DAG.getTargetConstant(Align, DL, MVT::i32)};
2241+
SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
2242+
nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
2243+
2244+
SDValue MergeOps[] = {Alloca, Chain};
2245+
return DAG.getMergeValues(MergeOps, DL);
22222246
}
22232247

22242248
// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()

llvm/lib/Target/NVPTX/NVPTXISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ enum NodeType : unsigned {
6161
BFE,
6262
BFI,
6363
PRMT,
64+
DYNAMIC_STACKALLOC,
6465
Dummy,
6566

6667
LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3805,6 +3805,28 @@ def CALL_PROTOTYPE :
38053805
NVPTXInst<(outs), (ins ProtoIdent:$ident),
38063806
"$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
38073807

3808+
def SDTDynAllocaOp :
3809+
SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
3810+
3811+
def dyn_alloca :
3812+
SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
3813+
[SDNPHasChain, SDNPSideEffect]>;
3814+
3815+
def DYNAMIC_STACKALLOC32 :
3816+
NVPTXInst<(outs Int32Regs:$ptr),
3817+
(ins Int32Regs:$size, i32imm:$align),
3818+
"alloca.u32 \t$ptr, $size, $align;\n\t"
3819+
"cvta.local.u32 \t$ptr, $ptr;",
3820+
[(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>,
3821+
Requires<[hasPTX<73>, hasSM<52>]>;
3822+
3823+
def DYNAMIC_STACKALLOC64 :
3824+
NVPTXInst<(outs Int64Regs:$ptr),
3825+
(ins Int64Regs:$size, i32imm:$align),
3826+
"alloca.u64 \t$ptr, $size, $align;\n\t"
3827+
"cvta.local.u64 \t$ptr, $ptr;",
3828+
[(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>,
3829+
Requires<[hasPTX<73>, hasSM<52>]>;
38083830

38093831
include "NVPTXIntrinsics.td"
38103832

llvm/test/CodeGen/Generic/ForceStackAlign.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
; Stack realignment not supported.
99
; XFAIL: target=sparc{{.*}}
1010

11-
; NVPTX cannot select dynamic_stackalloc
11+
; NVPTX can only select dynamic_stackalloc on sm_52+ and with ptx73+
1212
; XFAIL: target=nvptx{{.*}}
1313

1414
define i32 @f(ptr %p) nounwind {
Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,44 @@
1-
; RUN: not llc -march=nvptx < %s 2>&1 | FileCheck %s
2-
; RUN: not llc -march=nvptx64 < %s 2>&1 | FileCheck %s
1+
; RUN: not llc < %s -march=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
2+
; RUN: not llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
33

4-
; CHECK: in function test_dynamic_stackalloc{{.*}}: dynamic alloca unsupported by NVPTX backend
4+
; RUN: llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-32
5+
; RUN: llc < %s -march=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-64
6+
; RUN: %if ptxas %{ llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
7+
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
58

6-
define void @test_dynamic_stackalloc(i64 %n) {
7-
%alloca = alloca i32, i64 %n
8-
store volatile i32 0, ptr %alloca
9-
ret void
9+
; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52.
10+
11+
; CHECK-LABEL: .visible .func (.param .b32 func_retval0) test_dynamic_stackalloc(
12+
; CHECK-NOT: __local_depot
13+
14+
; CHECK-32: ld.param.u32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
15+
; CHECK-32-NEXT: mad.lo.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 1, 7;
16+
; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
17+
; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
18+
; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]];
19+
; CHECK-32-NEXT: { // callseq 0, 0
20+
; CHECK-32-NEXT: .reg .b32 temp_param_reg;
21+
; CHECK-32-NEXT: .param .b32 param0;
22+
; CHECK-32-NEXT: st.param.b32 [param0+0], %r[[ALLOCA]];
23+
24+
; CHECK-64: ld.param.u64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
25+
; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
26+
; CHECK-64-NEXT: and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
27+
; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
28+
; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]];
29+
; CHECK-64-NEXT: { // callseq 0, 0
30+
; CHECK-64-NEXT: .reg .b32 temp_param_reg;
31+
; CHECK-64-NEXT: .param .b64 param0;
32+
; CHECK-64-NEXT: st.param.b64 [param0+0], %rd[[ALLOCA]];
33+
34+
; CHECK-NEXT: .param .b32 retval0;
35+
; CHECK-NEXT: call.uni (retval0),
36+
; CHECK-NEXT: bar,
37+
38+
define i32 @test_dynamic_stackalloc(i64 %n) {
39+
%alloca = alloca i8, i64 %n, align 16
40+
%call = call i32 @bar(ptr %alloca)
41+
ret i32 %call
1042
}
43+
44+
declare i32 @bar(ptr)

0 commit comments

Comments
 (0)