Skip to content

Commit 7a98f08

Browse files
cdevadasYashwant Singh
authored and
Yashwant Singh
committed
[AMDGPU][SILowerSGPRSpills] Spill SGPRs to virtual VGPRs
Currently, the custom SGPR spill lowering pass spills SGPRs into physical VGPR lanes and the remaining VGPRs are used by regalloc for vector regclass allocation. This imposes many restrictions that we ended up with unsuccessful SGPR spilling when there won't be enough VGPRs and we are forced to spill the leftover into memory during PEI. The custom spill handling during PEI has many edge cases and often breaks the compiler time to time. This patch implements spilling SGPRs into virtual VGPR lanes. Since we now split the register allocation for SGPRs and VGPRs, the virtual registers introduced for the spill lanes would get allocated automatically in the subsequent regalloc invocation for VGPRs. Spill to virtual registers will always be successful, even in the high-pressure situations, and hence it avoids most of the edge cases during PEI. We are now left with only the custom SGPR spills during PEI for special registers like the frame pointer which is an unproblematic case. Differential Revision: https://reviews.llvm.org/D124196
1 parent 691dc2d commit 7a98f08

File tree

83 files changed

+7413
-5688
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+7413
-5688
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,10 @@ static void getVGPRSpillLaneOrTempRegister(
9999
SGPR, PrologEpilogSGPRSaveRestoreInfo(
100100
SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
101101

102-
LLVM_DEBUG(
103-
auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
104-
dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
105-
<< printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
102+
LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
103+
dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
104+
<< printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
105+
<< '\n';);
106106
} else {
107107
// Remove dead <FI> index
108108
MF.getFrameInfo().RemoveStackObject(FI);
@@ -264,7 +264,7 @@ class PrologEpilogSGPRSpillBuilder {
264264

265265
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
266266
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
267-
FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
267+
FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
268268
assert(Spill.size() == NumSubRegs);
269269

270270
for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -309,7 +309,7 @@ class PrologEpilogSGPRSpillBuilder {
309309
void restoreFromVGPRLane(const int FI) {
310310
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
311311
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
312-
FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
312+
FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
313313
assert(Spill.size() == NumSubRegs);
314314

315315
for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -1353,8 +1353,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
13531353
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
13541354
TRI->isAGPR(MRI, VReg))) {
13551355
assert(RS != nullptr);
1356-
// FIXME: change to enterBasicBlockEnd()
1357-
RS->enterBasicBlock(MBB);
1356+
RS->enterBasicBlockEnd(MBB);
1357+
RS->backward(MI);
13581358
TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
13591359
SpillFIs.set(FI);
13601360
continue;

llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp

Lines changed: 114 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,23 @@ class SILowerSGPRSpills : public MachineFunctionPass {
5050
SILowerSGPRSpills() : MachineFunctionPass(ID) {}
5151

5252
void calculateSaveRestoreBlocks(MachineFunction &MF);
53-
bool spillCalleeSavedRegs(MachineFunction &MF);
53+
bool spillCalleeSavedRegs(MachineFunction &MF,
54+
SmallVectorImpl<int> &CalleeSavedFIs);
55+
void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);
5456

5557
bool runOnMachineFunction(MachineFunction &MF) override;
5658

5759
void getAnalysisUsage(AnalysisUsage &AU) const override {
5860
AU.setPreservesAll();
5961
MachineFunctionPass::getAnalysisUsage(AU);
6062
}
63+
64+
MachineFunctionProperties getClearedProperties() const override {
65+
// SILowerSGPRSpills introduces new Virtual VGPRs for spilling SGPRs.
66+
return MachineFunctionProperties()
67+
.set(MachineFunctionProperties::Property::IsSSA)
68+
.set(MachineFunctionProperties::Property::NoVRegs);
69+
}
6170
};
6271

6372
} // end anonymous namespace
@@ -197,7 +206,8 @@ static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
197206
EntryBB.sortUniqueLiveIns();
198207
}
199208

200-
bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
209+
bool SILowerSGPRSpills::spillCalleeSavedRegs(
210+
MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) {
201211
MachineRegisterInfo &MRI = MF.getRegInfo();
202212
const Function &F = MF.getFunction();
203213
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -228,6 +238,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
228238
TRI->getSpillAlign(*RC), true);
229239

230240
CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
241+
CalleeSavedFIs.push_back(JunkFI);
231242
}
232243
}
233244

@@ -248,6 +259,50 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
248259
return false;
249260
}
250261

262+
void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
263+
LiveIntervals *LIS) {
264+
// TODO: This is a workaround to avoid the unmodelled liveness computed with
265+
// whole-wave virtual registers when allocated together with the regular VGPR
266+
// virtual registers. Presently, the liveness computed during the regalloc is
267+
// only uniform (or single lane aware) and it doesn't take account of the
268+
// divergent control flow that exists for our GPUs. Since the WWM registers
269+
// can modify inactive lanes, the wave-aware liveness should be computed for
270+
// the virtual registers to accurately plot their interferences. Without
271+
// having the divergent CFG for the function, it is difficult to implement the
272+
// wave-aware liveness info. Until then, we conservatively extend the liveness
273+
// of the wwm registers into the entire function so that they won't be reused
274+
// without first spilling/splitting their liveranges.
275+
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
276+
277+
// Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks.
278+
for (auto Reg : MFI->getSGPRSpillVGPRs()) {
279+
for (MachineBasicBlock *SaveBlock : SaveBlocks) {
280+
MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
281+
auto MIB = BuildMI(*SaveBlock, *InsertBefore, InsertBefore->getDebugLoc(),
282+
TII->get(AMDGPU::IMPLICIT_DEF), Reg);
283+
MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
284+
if (LIS) {
285+
LIS->InsertMachineInstrInMaps(*MIB);
286+
}
287+
}
288+
}
289+
290+
// Insert the KILL in the return blocks to extend their liveness untill the
291+
// end of function. Insert a separate KILL for each VGPR.
292+
for (MachineBasicBlock *RestoreBlock : RestoreBlocks) {
293+
MachineBasicBlock::iterator InsertBefore =
294+
RestoreBlock->getFirstTerminator();
295+
for (auto Reg : MFI->getSGPRSpillVGPRs()) {
296+
auto MIB =
297+
BuildMI(*RestoreBlock, *InsertBefore, InsertBefore->getDebugLoc(),
298+
TII->get(TargetOpcode::KILL));
299+
MIB.addReg(Reg);
300+
if (LIS)
301+
LIS->InsertMachineInstrInMaps(*MIB);
302+
}
303+
}
304+
}
305+
251306
bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
252307
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
253308
TII = ST.getInstrInfo();
@@ -261,7 +316,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
261316
// First, expose any CSR SGPR spills. This is mostly the same as what PEI
262317
// does, but somewhat simpler.
263318
calculateSaveRestoreBlocks(MF);
264-
bool HasCSRs = spillCalleeSavedRegs(MF);
319+
SmallVector<int> CalleeSavedFIs;
320+
bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs);
265321

266322
MachineFrameInfo &MFI = MF.getFrameInfo();
267323
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -275,6 +331,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
275331

276332
bool MadeChange = false;
277333
bool NewReservedRegs = false;
334+
bool SpilledToVirtVGPRLanes = false;
278335

279336
// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
280337
// handled as SpilledToReg in regular PrologEpilogInserter.
@@ -297,23 +354,53 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
297354

298355
int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
299356
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
300-
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
301-
NewReservedRegs = true;
302-
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
303-
MI, FI, nullptr, Indexes, LIS);
304-
(void)Spilled;
305-
assert(Spilled && "failed to spill SGPR to VGPR when allocated");
306-
SpillFIs.set(FI);
357+
358+
bool IsCalleeSaveSGPRSpill =
359+
std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) !=
360+
CalleeSavedFIs.end();
361+
if (IsCalleeSaveSGPRSpill) {
362+
// Spill callee-saved SGPRs into physical VGPR lanes.
363+
364+
// TODO: This is to ensure the CFIs are static for efficient frame
365+
// unwinding in the debugger. Spilling them into virtual VGPR lanes
366+
// involve regalloc to allocate the physical VGPRs and that might
367+
// cause intermediate spill/split of such liveranges for successful
368+
// allocation. This would result in broken CFI encoding unless the
369+
// regalloc aware CFI generation to insert new CFIs along with the
370+
// intermediate spills is implemented. There is no such support
371+
// currently exist in the LLVM compiler.
372+
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
373+
NewReservedRegs = true;
374+
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
375+
MI, FI, nullptr, Indexes, LIS, true);
376+
if (!Spilled)
377+
llvm_unreachable(
378+
"failed to spill SGPR to physical VGPR lane when allocated");
379+
}
380+
} else {
381+
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
382+
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
383+
MI, FI, nullptr, Indexes, LIS);
384+
if (!Spilled)
385+
llvm_unreachable(
386+
"failed to spill SGPR to virtual VGPR lane when allocated");
387+
SpillFIs.set(FI);
388+
SpilledToVirtVGPRLanes = true;
389+
}
307390
}
308391
}
309392
}
310393

311-
// FIXME: Adding to live-ins redundant with reserving registers.
312-
for (MachineBasicBlock &MBB : MF) {
313-
for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
314-
MBB.addLiveIn(Reg);
315-
MBB.sortUniqueLiveIns();
394+
if (SpilledToVirtVGPRLanes) {
395+
extendWWMVirtRegLiveness(MF, LIS);
396+
if (LIS) {
397+
// Compute the LiveInterval for the newly created virtual registers.
398+
for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
399+
LIS->createAndComputeVirtRegInterval(Reg);
400+
}
401+
}
316402

403+
for (MachineBasicBlock &MBB : MF) {
317404
// FIXME: The dead frame indices are replaced with a null register from
318405
// the debug value instructions. We should instead, update it with the
319406
// correct register value. But not sure the register value alone is
@@ -334,6 +421,10 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
334421
// lane".
335422
FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false);
336423

424+
MadeChange = true;
425+
}
426+
427+
if (SpilledToVirtVGPRLanes) {
337428
const TargetRegisterClass *RC = TRI->getWaveMaskRegClass();
338429
// Shift back the reserved SGPR for EXEC copy into the lowest range.
339430
// This SGPR is reserved to handle the whole-wave spill/copy operations
@@ -342,20 +433,21 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
342433
if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) <
343434
TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy()))
344435
FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR);
345-
346-
MadeChange = true;
347436
} else {
348-
// No SGPR spills and hence there won't be any WWM spills/copies. Reset the
349-
// SGPR reserved for EXEC copy.
437+
// No SGPR spills to virtual VGPR lanes and hence there won't be any WWM
438+
// spills/copies. Reset the SGPR reserved for EXEC copy.
350439
FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister);
351440
}
352441

353442
SaveBlocks.clear();
354443
RestoreBlocks.clear();
355444

356-
// Updated the reserved registers with any VGPRs added for SGPR spills.
357-
if (NewReservedRegs)
358-
MRI.freezeReservedRegs(MF);
445+
// Updated the reserved registers with any physical VGPRs added for SGPR
446+
// spills.
447+
if (NewReservedRegs) {
448+
for (Register Reg : FuncInfo->getWWMReservedRegs())
449+
MRI.reserveReg(Reg, TRI);
450+
}
359451

360452
return MadeChange;
361453
}

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 34 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -314,37 +314,23 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
314314
return false;
315315
}
316316

317-
bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF,
318-
int FI,
319-
unsigned LaneIndex) {
320-
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
321-
const SIRegisterInfo *TRI = ST.getRegisterInfo();
317+
bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
318+
MachineFunction &MF, int FI, unsigned LaneIndex) {
322319
MachineRegisterInfo &MRI = MF.getRegInfo();
323320
Register LaneVGPR;
324321
if (!LaneIndex) {
325-
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
326-
if (LaneVGPR == AMDGPU::NoRegister) {
327-
// We have no VGPRs left for spilling SGPRs. Reset because we will not
328-
// partially spill the SGPR to VGPRs.
329-
SGPRSpillToVGPRLanes.erase(FI);
330-
return false;
331-
}
332-
322+
LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
333323
SpillVGPRs.push_back(LaneVGPR);
334-
// Add this register as live-in to all blocks to avoid machine verifier
335-
// complaining about use of an undefined physical register.
336-
for (MachineBasicBlock &BB : MF)
337-
BB.addLiveIn(LaneVGPR);
338324
} else {
339325
LaneVGPR = SpillVGPRs.back();
340326
}
341327

342-
SGPRSpillToVGPRLanes[FI].push_back(
328+
SGPRSpillsToVirtualVGPRLanes[FI].push_back(
343329
SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
344330
return true;
345331
}
346332

347-
bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
333+
bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
348334
MachineFunction &MF, int FI, unsigned LaneIndex) {
349335
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
350336
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -355,16 +341,21 @@ bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
355341
if (LaneVGPR == AMDGPU::NoRegister) {
356342
// We have no VGPRs left for spilling SGPRs. Reset because we will not
357343
// partially spill the SGPR to VGPRs.
358-
PrologEpilogSGPRSpillToVGPRLanes.erase(FI);
344+
SGPRSpillsToPhysicalVGPRLanes.erase(FI);
359345
return false;
360346
}
361347

362348
allocateWWMSpill(MF, LaneVGPR);
349+
reserveWWMRegister(LaneVGPR);
350+
for (MachineBasicBlock &MBB : MF) {
351+
MBB.addLiveIn(LaneVGPR);
352+
MBB.sortUniqueLiveIns();
353+
}
363354
} else {
364-
LaneVGPR = WWMSpills.back().first;
355+
LaneVGPR = WWMReservedRegs.back();
365356
}
366357

367-
PrologEpilogSGPRSpillToVGPRLanes[FI].push_back(
358+
SGPRSpillsToPhysicalVGPRLanes[FI].push_back(
368359
SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
369360
return true;
370361
}
@@ -373,8 +364,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
373364
int FI,
374365
bool IsPrologEpilog) {
375366
std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
376-
IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI]
377-
: SGPRSpillToVGPRLanes[FI];
367+
IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
368+
: SGPRSpillsToVirtualVGPRLanes[FI];
378369

379370
// This has already been allocated.
380371
if (!SpillLanes.empty())
@@ -395,15 +386,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
395386
"not spilling SGPRs to VGPRs");
396387

397388
unsigned &NumSpillLanes =
398-
IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes;
389+
IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
399390

400391
for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
401392
unsigned LaneIndex = (NumSpillLanes % WaveSize);
402393

403-
bool Allocated =
404-
IsPrologEpilog
405-
? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex)
406-
: allocateVGPRForSGPRSpills(MF, FI, LaneIndex);
394+
bool Allocated = IsPrologEpilog
395+
? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
396+
: allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
407397
if (!Allocated) {
408398
NumSpillLanes -= I;
409399
return false;
@@ -484,16 +474,25 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
484474

485475
bool SIMachineFunctionInfo::removeDeadFrameIndices(
486476
MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
487-
// Remove dead frame indices from function frame. And also make sure to remove
488-
// the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it
489-
// could result in an unexpected side effect and bug, in case of any
490-
// re-mapping of freed frame indices by later pass(es) like "stack slot
477+
// Remove dead frame indices from function frame, however keep FP & BP since
478+
// spills for them haven't been inserted yet. And also make sure to remove the
479+
// frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
480+
// otherwise, it could result in an unexpected side effect and bug, in case of
481+
// any re-mapping of freed frame indices by later pass(es) like "stack slot
491482
// coloring".
492-
for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) {
483+
for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
493484
MFI.RemoveStackObject(R.first);
494-
SGPRSpillToVGPRLanes.erase(R.first);
485+
SGPRSpillsToVirtualVGPRLanes.erase(R.first);
495486
}
496487

488+
// Remove the dead frame indices of CSR SGPRs which are spilled to physical
489+
// VGPR lanes during SILowerSGPRSpills pass.
490+
if (!ResetSGPRSpillStackIDs) {
491+
for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
492+
MFI.RemoveStackObject(R.first);
493+
SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
494+
}
495+
}
497496
bool HaveSGPRToMemory = false;
498497

499498
if (ResetSGPRSpillStackIDs) {

0 commit comments

Comments
 (0)