Commit b4a62b1

cdevadas authored and Yashwant Singh committed
[AMDGPU] Enable whole wave register copy

So far, we haven't exposed the allocation of whole-wave registers to regalloc;
we hand-picked them for the various whole wave mode operations. With a future
patch, we want the allocator to allocate them efficiently rather than relying
on the custom pre-allocation pass.

Any live-range split of a virtual register involved in whole-wave operations
requires the COPY introduced by the split to be performed for all lanes, which
the compiler does not implement yet. This patch identifies all such copies and
manipulates the exec mask around them so that all lanes are enabled, without
affecting the value of the exec mask elsewhere.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D143762
1 parent 1ff3a5d commit b4a62b1
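
The shape of the transformation is easy to state: each WWM_COPY is bracketed by an exec save/restore so the copy runs with every lane enabled. Below is a minimal C++ sketch of the wave64, SCC-dead case; the helper name lowerOneWWMCopy and the caller-provided ScratchSGPR are illustrative only, while the in-tree lowering lives in SILowerWWMCopies and in SIInstrInfo::insertScratchExecCopy()/restoreExec() as changed further down.

#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <iterator>

using namespace llvm;

// Bracket one WWM_COPY with an exec save/restore (wave64, SCC assumed dead).
// ScratchSGPR stands in for SIMachineFunctionInfo::getSGPRForEXECCopy().
static void lowerOneWWMCopy(MachineInstr &MI, const SIInstrInfo &TII,
                            Register ScratchSGPR) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // s_or_saveexec_b64 scratch, -1 : save EXEC and turn on every lane.
  auto SaveExec = BuildMI(MBB, MI.getIterator(), DL,
                          TII.get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchSGPR)
                      .addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // The SCC result is unused here.

  // The copy itself now executes for the whole wave; as the in-tree pass
  // does, rewrite the pseudo back into an ordinary COPY.
  MI.setDesc(TII.get(AMDGPU::COPY));

  // s_mov_b64 exec, scratch : restore the original exec mask after the copy.
  BuildMI(MBB, std::next(MI.getIterator()), DL, TII.get(AMDGPU::S_MOV_B64),
          AMDGPU::EXEC)
      .addReg(ScratchSGPR, RegState::Kill);
}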

File tree

10 files changed: +221 -12 lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 4 additions & 0 deletions
@@ -41,6 +41,7 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIOptimizeExecMaskingPreRAPass();
 FunctionPass *createSIOptimizeVGPRLiveRangePass();
 FunctionPass *createSIFixSGPRCopiesPass();
+FunctionPass *createLowerWWMCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsPass();
@@ -144,6 +145,9 @@ extern char &SIFixSGPRCopiesID;
 void initializeSIFixVGPRCopiesPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;

+void initializeSILowerWWMCopiesPass(PassRegistry &);
+extern char &SILowerWWMCopiesID;
+
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 4 additions & 0 deletions
@@ -364,6 +364,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUDAGToDAGISelPass(*PR);
   initializeGCNDPPCombinePass(*PR);
   initializeSILowerI1CopiesPass(*PR);
+  initializeSILowerWWMCopiesPass(*PR);
   initializeSILowerSGPRSpillsPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
@@ -1296,6 +1297,7 @@ void GCNPassConfig::addOptimizedRegAlloc() {
 }

 bool GCNPassConfig::addPreRewrite() {
+  addPass(&SILowerWWMCopiesID);
   if (EnableRegReassign)
     addPass(&GCNNSAReassignID);
   return true;
@@ -1350,6 +1352,8 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
   addPass(&SILowerSGPRSpillsID);

   addPass(createVGPRAllocPass(false));
+
+  addPass(&SILowerWWMCopiesID);
   return true;
 }

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -145,6 +145,7 @@ add_llvm_target(AMDGPUCodeGen
   SILoadStoreOptimizer.cpp
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
+  SILowerWWMCopies.cpp
   SILowerSGPRSpills.cpp
   SIMachineFunctionInfo.cpp
   SIMachineScheduler.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 38 additions & 7 deletions
@@ -2414,6 +2414,14 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
   return std::pair(Split[0], Split[1]);
 }

+std::optional<DestSourcePair>
+SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
+  if (MI.getOpcode() == AMDGPU::WWM_COPY)
+    return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
+
+  return std::nullopt;
+}
+
 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                       MachineOperand &Src0,
                                       unsigned Src0OpName,
@@ -3080,6 +3088,7 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
   case AMDGPU::S_MOV_B32:
   case AMDGPU::S_MOV_B64:
   case AMDGPU::COPY:
+  case AMDGPU::WWM_COPY:
   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
   case AMDGPU::V_ACCVGPR_READ_B32_e64:
   case AMDGPU::V_ACCVGPR_MOV_B32:
@@ -4969,7 +4978,8 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
                                         const DebugLoc &DL, Register Reg,
-                                        bool IsSCCLive) const {
+                                        bool IsSCCLive,
+                                        SlotIndexes *Indexes) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   bool IsWave32 = ST.isWave32();
@@ -4979,23 +4989,34 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
     // the single instruction S_OR_SAVEEXEC that clobbers SCC.
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill);
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
+                           .addReg(Exec, RegState::Kill);
+    auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    if (Indexes) {
+      Indexes->insertMachineInstrInMaps(*StoreExecMI);
+      Indexes->insertMachineInstrInMaps(*FlipExecMI);
+    }
   } else {
     const unsigned OrSaveExec =
         IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
     auto SaveExec =
         BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
     SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+    if (Indexes)
+      Indexes->insertMachineInstrInMaps(*SaveExec);
   }
 }

 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              const DebugLoc &DL, Register Reg) const {
+                              const DebugLoc &DL, Register Reg,
+                              SlotIndexes *Indexes) const {
   unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+  auto ExecRestoreMI =
+      BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+  if (Indexes)
+    Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
 }

 static const TargetRegisterClass *
@@ -7980,6 +8001,16 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
   return ArrayRef(TargetFlags);
 }

+unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
+                                              const MachineFunction &MF) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  assert(SrcReg.isVirtual());
+  if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
+    return AMDGPU::WWM_COPY;
+
+  return AMDGPU::COPY;
+}
+
 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
          MI.modifiesRegister(AMDGPU::EXEC, &RI);
@@ -8547,7 +8578,7 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
   // A similar issue also exists with spilling and reloading $exec registers.
   //
   // To prevent that, constrain the %0 register class here.
-  if (MI.isFullCopy()) {
+  if (isFullCopyInstr(MI)) {
     Register DstReg = MI.getOperand(0).getReg();
     Register SrcReg = MI.getOperand(1).getReg();
     if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
@@ -8644,7 +8675,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
   if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32)
     return InstructionUniformity::AlwaysUniform;

-  if (MI.isCopy()) {
+  if (isCopyInstr(MI)) {
     const MachineOperand &srcOp = MI.getOperand(1);
     if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
       const TargetRegisterClass *regClass =
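
A consequence of overriding isCopyInstrImpl() is that code which asks TargetInstrInfo::isCopyInstr() now sees through WWM_COPY as well, which is consistent with the isCopy()/isFullCopy() call sites above being switched to the copy-instruction queries. A small sketch of such a query; getCopiedRegs is a hypothetical helper, not part of this patch.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include <optional>
#include <utility>

using namespace llvm;

// Return the (destination, source) registers of any copy-like instruction,
// including target pseudos such as AMDGPU's WWM_COPY now that
// SIInstrInfo::isCopyInstrImpl() reports it.
static std::optional<std::pair<Register, Register>>
getCopiedRegs(const MachineInstr &MI, const TargetInstrInfo &TII) {
  if (std::optional<DestSourcePair> Copy = TII.isCopyInstr(MI))
    return std::make_pair(Copy->Destination->getReg(), Copy->Source->getReg());
  return std::nullopt;
}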

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 14 additions & 5 deletions
@@ -170,6 +170,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;

 protected:
+  /// If the specific machine instruction is a instruction that moves/copies
+  /// value from one register to another register return destination and source
+  /// registers as machine operands.
+  std::optional<DestSourcePair>
+  isCopyInstrImpl(const MachineInstr &MI) const override;
+
   bool swapSourceModifiers(MachineInstr &MI,
                            MachineOperand &Src0, unsigned Src0OpName,
                            MachineOperand &Src1, unsigned Src1OpName) const;
@@ -827,7 +833,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   }

   bool isVGPRCopy(const MachineInstr &MI) const {
-    assert(MI.isCopy());
+    assert(isCopyInstr(MI));
     Register Dest = MI.getOperand(0).getReg();
     const MachineFunction &MF = *MI.getParent()->getParent();
     const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -897,7 +903,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     if (OpIdx >= MI.getDesc().NumOperands)
       return false;

-    if (MI.isCopy()) {
+    if (isCopyInstr(MI)) {
       unsigned Size = getOpSize(MI, OpIdx);
       assert(Size == 8 || Size == 4);

@@ -946,12 +952,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {

   void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
-                             const DebugLoc &DL, Register Reg,
-                             bool IsSCCLive) const;
+                             const DebugLoc &DL, Register Reg, bool IsSCCLive,
+                             SlotIndexes *Indexes = nullptr) const;

   void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-                   Register Reg) const;
+                   Register Reg, SlotIndexes *Indexes = nullptr) const;

   /// Return the correct register class for \p OpNo. For target-specific
   /// instructions, this will return the register class that has been defined
@@ -1143,6 +1149,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
                                  const ScheduleDAGMI *DAG) const override;

+  unsigned getLiveRangeSplitOpcode(Register Reg,
+                                   const MachineFunction &MF) const override;
+
   bool isBasicBlockPrologue(const MachineInstr &MI) const override;

   MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 7 additions & 0 deletions
@@ -172,6 +172,13 @@ def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

+def WWM_COPY : SPseudoInstSI <
+  (outs unknown:$dst), (ins unknown:$src)> {
+  let hasSideEffects = 0;
+  let isAsCheapAsAMove = 1;
+  let isConvergent = 1;
+}
+
 def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
   let Uses = [EXEC];
   let Defs = [EXEC, SCC];

llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
+//===-- SILowerWWMCopies.cpp - Lower Copies after regalloc ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lowering the WWM_COPY instructions for various register classes.
+/// AMDGPU target generates WWM_COPY instruction to differentiate WWM
+/// copy from COPY. This pass generates the necessary exec mask manipulation
+/// instructions to replicate 'Whole Wave Mode' and lowers WWM_COPY back to
+/// COPY.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-lower-wwm-copies"
+
+namespace {
+
+class SILowerWWMCopies : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SILowerWWMCopies() : MachineFunctionPass(ID) {
+    initializeSILowerWWMCopiesPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Lower WWM Copies"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool isSCCLiveAtMI(const MachineInstr &MI);
+  void addToWWMSpills(MachineFunction &MF, Register Reg);
+
+  LiveIntervals *LIS;
+  SlotIndexes *Indexes;
+  VirtRegMap *VRM;
+  const SIRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+  SIMachineFunctionInfo *MFI;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", false,
+                    false)
+
+char SILowerWWMCopies::ID = 0;
+
+char &llvm::SILowerWWMCopiesID = SILowerWWMCopies::ID;
+
+bool SILowerWWMCopies::isSCCLiveAtMI(const MachineInstr &MI) {
+  // We can't determine the liveness info if LIS isn't available. Early return
+  // in that case and always assume SCC is live.
+  if (!LIS)
+    return true;
+
+  LiveRange &LR =
+      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+  return LR.liveAt(Idx);
+}
+
+// If \p Reg is assigned with a physical VGPR, add the latter into wwm-spills
+// for preserving its entire lanes at function prolog/epilog.
+void SILowerWWMCopies::addToWWMSpills(MachineFunction &MF, Register Reg) {
+  if (Reg.isPhysical())
+    return;
+
+  Register PhysReg = VRM->getPhys(Reg);
+  assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
+         "should have allocated a physical register");
+
+  MFI->allocateWWMSpill(MF, PhysReg);
+}
+
+bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  MFI = MF.getInfo<SIMachineFunctionInfo>();
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+  Indexes = getAnalysisIfAvailable<SlotIndexes>();
+  VRM = getAnalysisIfAvailable<VirtRegMap>();
+  TRI = ST.getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  if (!MFI->hasVRegFlags())
+    return false;
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != AMDGPU::WWM_COPY)
+        continue;
+
+      // TODO: Club adjacent WWM ops between same exec save/restore
+      assert(TII->isVGPRCopy(MI));
+
+      // For WWM vector copies, manipulate the exec mask around the copy
+      // instruction.
+      const DebugLoc &DL = MI.getDebugLoc();
+      MachineBasicBlock::iterator InsertPt = MI.getIterator();
+      Register RegForExecCopy = MFI->getSGPRForEXECCopy();
+      TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy,
+                                 isSCCLiveAtMI(MI), Indexes);
+      TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes);
+      addToWWMSpills(MF, MI.getOperand(0).getReg());
+      LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI);
+
+      // Lower WWM_COPY back to COPY
+      MI.setDesc(TII->get(AMDGPU::COPY));
+      Changed |= true;
+    }
+  }
+
+  return Changed;
+}

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 2 additions & 0 deletions
@@ -667,6 +667,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & Flag;
   }

+  bool hasVRegFlags() { return VRegFlags.size(); }
+
   void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4,
                         Align Alignment = Align(4));
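
The WWM_REG virtual-register flag is what ties these pieces together: flagged vregs get WWM_COPY on live-range splits, and hasVRegFlags() lets SILowerWWMCopies exit early when no flags were ever recorded. A sketch of the flag check follows; needsWholeWaveCopy is a hypothetical helper mirroring SIInstrInfo::getLiveRangeSplitOpcode(), and the include list is abbreviated.

#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

// True if live-range splitting of Reg must use WWM_COPY rather than COPY,
// i.e. the register was flagged as a whole-wave register.
static bool needsWholeWaveCopy(Register Reg, const MachineFunction &MF) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return Reg.isVirtual() && MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
}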
