Skip to content

Commit c760671

Browse files
authored
[AMDGPU] Update base addr of dyn alloca considering GrowingUp stack (llvm#119822)
Currently, the compiler calculates the base address of a dynamically sized stack object (alloca) as follows: 1. `NewSP = Align(CurrSP + Size)` _where_ `Size = # of elements * wave size * alloca type` 2. `BaseAddr = NewSP` 3. The alignment is computed as: `AlignedAddr = Addr & ~(Alignment - 1)` 4. Return the `BaseAddr`. This makes sense when the stack grows downwards. The AMDGPU stack grows upwards, so the base address needs to be aligned first and the SP bumped by the required size afterwards: 1. `BaseAddr = Align(CurrSP)` 2. `NewSP = BaseAddr + Size` 3. The alignment is computed as: `AlignedAddr = (Addr + (Alignment - 1)) & ~(Alignment - 1)` 4. Return the `BaseAddr`.
1 parent a73ca29 commit c760671

File tree

7 files changed

+312
-215
lines changed

7 files changed

+312
-215
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,15 +1204,18 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
12041204
auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
12051205
auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
12061206

1207-
auto SPCopy = B.buildCopy(PtrTy, SPReg);
1207+
auto OldSP = B.buildCopy(PtrTy, SPReg);
12081208
if (Alignment > TFI.getStackAlign()) {
1209-
auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1210-
B.buildMaskLowPtrBits(Dst, PtrAdd,
1209+
auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;
1210+
auto Tmp1 = B.buildPtrAdd(PtrTy, OldSP,
1211+
B.buildConstant(LLT::scalar(32), StackAlignMask));
1212+
B.buildMaskLowPtrBits(Dst, Tmp1,
12111213
Log2(Alignment) + ST.getWavefrontSizeLog2());
12121214
} else {
1213-
B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1215+
B.buildCopy(Dst, OldSP);
12141216
}
1215-
1217+
auto PtrAdd = B.buildPtrAdd(PtrTy, Dst, ScaledSize);
1218+
B.buildCopy(SPReg, PtrAdd);
12161219
MI.eraseFromParent();
12171220
return true;
12181221
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4016,8 +4016,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
40164016
InVals, /*IsThisReturn=*/false, SDValue());
40174017
}
40184018

4019-
// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
4020-
// except for applying the wave size scale to the increment amount.
4019+
// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4020+
// except for stack growth direction (default: downwards, AMDGPU: upwards) and
4021+
// applying the wave size scale to the increment amount.
40214022
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
40224023
SelectionDAG &DAG) const {
40234024
const MachineFunction &MF = DAG.getMachineFunction();
@@ -4037,31 +4038,35 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
40374038
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
40384039

40394040
SDValue Size = Tmp2.getOperand(1);
4040-
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4041-
Chain = SP.getValue(1);
4042-
MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
4041+
SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4042+
Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
4043+
40434044
const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
40444045
assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
40454046
"Stack grows upwards for AMDGPU");
40464047

4048+
Chain = BaseAddr.getValue(1);
4049+
Align StackAlign = TFL->getStackAlign();
4050+
if (Alignment > StackAlign) {
4051+
uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4052+
<< Subtarget->getWavefrontSizeLog2();
4053+
uint64_t StackAlignMask = ScaledAlignment - 1;
4054+
SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4055+
DAG.getConstant(StackAlignMask, dl, VT));
4056+
BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4057+
DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4058+
}
4059+
40474060
SDValue ScaledSize = DAG.getNode(
40484061
ISD::SHL, dl, VT, Size,
40494062
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
40504063

4051-
Align StackAlign = TFL->getStackAlign();
4052-
Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SP, ScaledSize); // Value
4053-
if (Alignment && *Alignment > StackAlign) {
4054-
Tmp1 = DAG.getNode(
4055-
ISD::AND, dl, VT, Tmp1,
4056-
DAG.getSignedConstant(-(uint64_t)Alignment->value()
4057-
<< Subtarget->getWavefrontSizeLog2(),
4058-
dl, VT));
4059-
}
4064+
SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
40604065

4061-
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
4066+
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
40624067
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
40634068

4064-
return DAG.getMergeValues({Tmp1, Tmp2}, dl);
4069+
return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
40654070
}
40664071

40674072
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,

0 commit comments

Comments
 (0)