Skip to content

Commit f8a56df

Browse files
AMDGPU/GlobalISel: AMDGPURegBankSelect (llvm#112863)
Assign register banks to virtual registers. Does not use the generic RegBankSelect. After register bank selection, all register operands of G_ instructions have LLTs and register banks exclusively. If they had a register class, reassign the appropriate register bank. Assign register banks using machine uniformity analysis: Sgpr - uniform values and some lane masks; Vgpr - divergent, non-S1 values; Vcc - divergent S1 values (lane masks). AMDGPURegBankSelect does not consider available instructions and, in some cases, G_ instructions with a given register bank assignment can't be inst-selected. This is solved in RegBankLegalize. Exceptions where uniformity analysis does not work: S32/S64 lane masks: - need to end up with an sgpr register class after instruction selection - In most cases uniformity analysis declares them as uniform (forced by tablegen), resulting in the sgpr S32/S64 reg bank - When uniformity analysis declares them as divergent (some phis), use the intrinsic lane mask analyzer to still assign the sgpr register bank. Temporal divergence copy: - COPY to vgpr with an implicit use of $exec inside the cycle - this copy is declared as uniform by uniformity analysis - make sure that the assigned bank is vgpr. Note: uniformity analysis does not consider registers with a vgpr def to be divergent (you can have a uniform value in a vgpr). - TODO: an implicit use of $exec could be implemented as an indicator that an instruction is divergent
1 parent 3208801 commit f8a56df

File tree

5 files changed

+977
-686
lines changed

5 files changed

+977
-686
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,16 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "AMDGPUGlobalISelUtils.h"
10+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1011
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
12+
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
1113
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
1214
#include "llvm/CodeGenTypes/LowLevelType.h"
1315
#include "llvm/IR/Constants.h"
16+
#include "llvm/IR/IntrinsicsAMDGPU.h"
1417

1518
using namespace llvm;
19+
using namespace AMDGPU;
1620
using namespace MIPatternMatch;
1721

1822
std::pair<Register, unsigned>
@@ -68,3 +72,37 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
6872

6973
return std::pair(Reg, 0);
7074
}
75+
76+
// Walk the whole function once up front so that later isS32S64LaneMask
// queries are plain set lookups.
IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF)
    : MRI(MF.getRegInfo()) {
  initLaneMaskIntrinsics(MF);
}
80+
81+
// True if Reg was recorded as an S32/S64 lane mask during the initial scan.
bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) const {
  return S32S64LaneMask.count(Reg) != 0;
}
84+
85+
// Collect S32/S64 lane-mask vregs: the mask operands of amdgcn.if.break and
// the lane-mask defs of SI_IF/SI_ELSE, plus any LCSSA phis fed by them.
void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
        if (GI->is(Intrinsic::amdgcn_if_break)) {
          // Operand 3 is the incoming mask; operand 0 is the updated mask.
          S32S64LaneMask.insert(MI.getOperand(3).getReg());
          findLCSSAPhi(MI.getOperand(0).getReg());
        }
      }

      unsigned Opc = MI.getOpcode();
      if (Opc == AMDGPU::SI_IF || Opc == AMDGPU::SI_ELSE)
        findLCSSAPhi(MI.getOperand(0).getReg());
    }
  }
}
101+
102+
// Record Reg as a lane mask, and also record the defs of any phis that use
// it directly (LCSSA phis simply forward the lane-mask value).
void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
  S32S64LaneMask.insert(Reg);
  for (const MachineInstr &UseMI : MRI.use_instructions(Reg))
    if (UseMI.isPHI())
      S32S64LaneMask.insert(UseMI.getOperand(0).getReg());
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1010
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1111

12+
#include "llvm/ADT/DenseSet.h"
1213
#include "llvm/CodeGen/Register.h"
1314
#include <utility>
1415

@@ -18,6 +19,7 @@ class MachineRegisterInfo;
1819
class GCNSubtarget;
1920
class GISelKnownBits;
2021
class LLT;
22+
class MachineFunction;
2123

2224
namespace AMDGPU {
2325

@@ -26,6 +28,26 @@ std::pair<Register, unsigned>
2628
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
2729
GISelKnownBits *KnownBits = nullptr,
2830
bool CheckNUW = false);
31+
32+
// Currently finds S32/S64 lane masks that can be declared as divergent by
33+
// uniformity analysis (all are phis at the moment).
34+
// These are defined as i32/i64 in some IR intrinsics (not as i1).
35+
// Tablegen forces(via telling that lane mask IR intrinsics are uniform) most of
36+
// S32/S64 lane masks to be uniform, as this results in them ending up with sgpr
37+
// reg class after instruction-select, don't search for all of them.
38+
class IntrinsicLaneMaskAnalyzer {
39+
SmallDenseSet<Register, 8> S32S64LaneMask;
40+
MachineRegisterInfo &MRI;
41+
42+
public:
43+
IntrinsicLaneMaskAnalyzer(MachineFunction &MF);
44+
bool isS32S64LaneMask(Register Reg) const;
45+
46+
private:
47+
void initLaneMaskIntrinsics(MachineFunction &MF);
48+
// This will not be needed when we turn off LCSSA for global-isel.
49+
void findLCSSAPhi(Register Reg);
50+
};
2951
}
3052
}
3153

llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp

Lines changed: 205 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,18 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19-
#include "llvm/CodeGen/MachineFunctionPass.h"
19+
#include "AMDGPUGlobalISelUtils.h"
20+
#include "GCNSubtarget.h"
21+
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
22+
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
23+
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
24+
#include "llvm/CodeGen/TargetPassConfig.h"
2025
#include "llvm/InitializePasses.h"
2126

2227
#define DEBUG_TYPE "amdgpu-regbankselect"
2328

2429
using namespace llvm;
30+
using namespace AMDGPU;
2531

2632
namespace {
2733

@@ -40,6 +46,9 @@ class AMDGPURegBankSelect : public MachineFunctionPass {
4046
}
4147

4248
void getAnalysisUsage(AnalysisUsage &AU) const override {
49+
AU.addRequired<TargetPassConfig>();
50+
AU.addRequired<GISelCSEAnalysisWrapperPass>();
51+
AU.addRequired<MachineUniformityAnalysisPass>();
4352
MachineFunctionPass::getAnalysisUsage(AU);
4453
}
4554

@@ -55,6 +64,9 @@ class AMDGPURegBankSelect : public MachineFunctionPass {
5564

5665
INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, DEBUG_TYPE,
5766
"AMDGPU Register Bank Select", false, false)
67+
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
68+
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
69+
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
5870
INITIALIZE_PASS_END(AMDGPURegBankSelect, DEBUG_TYPE,
5971
"AMDGPU Register Bank Select", false, false)
6072

@@ -66,9 +78,201 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() {
6678
return new AMDGPURegBankSelect();
6779
}
6880

81+
class RegBankSelectHelper {
82+
MachineIRBuilder &B;
83+
MachineRegisterInfo &MRI;
84+
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
85+
const MachineUniformityInfo &MUI;
86+
const RegisterBank *SgprRB;
87+
const RegisterBank *VgprRB;
88+
const RegisterBank *VccRB;
89+
90+
public:
91+
RegBankSelectHelper(MachineIRBuilder &B,
92+
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
93+
const MachineUniformityInfo &MUI,
94+
const RegisterBankInfo &RBI)
95+
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI),
96+
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
97+
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
98+
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
99+
100+
const RegisterBank *getRegBankToAssign(Register Reg) {
101+
if (MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg))
102+
return SgprRB;
103+
if (MRI.getType(Reg) == LLT::scalar(1))
104+
return VccRB;
105+
return VgprRB;
106+
}
107+
108+
// %rc:RegClass(s32) = G_ ...
109+
// ...
110+
// %a = G_ ..., %rc
111+
// ->
112+
// %rb:RegBank(s32) = G_ ...
113+
// %rc:RegClass(s32) = COPY %rb
114+
// ...
115+
// %a = G_ ..., %rb
116+
void reAssignRegBankOnDef(MachineInstr &MI, MachineOperand &DefOP,
117+
const RegisterBank *RB) {
118+
// Register that already has Register class got it during pre-inst selection
119+
// of another instruction. Maybe cross bank copy was required so we insert a
120+
// copy that can be removed later. This simplifies post regbanklegalize
121+
// combiner and avoids need to special case some patterns.
122+
Register Reg = DefOP.getReg();
123+
LLT Ty = MRI.getType(Reg);
124+
Register NewReg = MRI.createVirtualRegister({RB, Ty});
125+
DefOP.setReg(NewReg);
126+
127+
auto &MBB = *MI.getParent();
128+
B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator())));
129+
B.buildCopy(Reg, NewReg);
130+
131+
// The problem was discovered for uniform S1 that was used as both
132+
// lane mask(vcc) and regular sgpr S1.
133+
// - lane-mask(vcc) use was by si_if, this use is divergent and requires
134+
// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets
135+
// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask.
136+
// - the regular sgpr S1(uniform) instruction is now broken since
137+
// it uses sreg_64_xexec(S1) which is divergent.
138+
139+
// Replace virtual registers with register class on generic instructions
140+
// uses with virtual registers with register bank.
141+
for (auto &UseMI : make_early_inc_range(MRI.use_instructions(Reg))) {
142+
if (UseMI.isPreISelOpcode()) {
143+
for (MachineOperand &Op : UseMI.operands()) {
144+
if (Op.isReg() && Op.getReg() == Reg)
145+
Op.setReg(NewReg);
146+
}
147+
}
148+
}
149+
}
150+
151+
// %a = G_ ..., %rc
152+
// ->
153+
// %rb:RegBank(s32) = COPY %rc
154+
// %a = G_ ..., %rb
155+
void constrainRegBankUse(MachineInstr &MI, MachineOperand &UseOP,
156+
const RegisterBank *RB) {
157+
Register Reg = UseOP.getReg();
158+
159+
LLT Ty = MRI.getType(Reg);
160+
Register NewReg = MRI.createVirtualRegister({RB, Ty});
161+
UseOP.setReg(NewReg);
162+
163+
if (MI.isPHI()) {
164+
auto DefMI = MRI.getVRegDef(Reg)->getIterator();
165+
MachineBasicBlock *DefMBB = DefMI->getParent();
166+
B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
167+
} else {
168+
B.setInstr(MI);
169+
}
170+
171+
B.buildCopy(NewReg, Reg);
172+
}
173+
};
174+
175+
// Returns the operand's virtual register, or an invalid Register when the
// operand is not a register or is physical (COPY and G_SI_CALL operands can
// be physical registers).
static Register getVReg(MachineOperand &Op) {
  if (Op.isReg()) {
    Register R = Op.getReg();
    if (R.isVirtual())
      return R;
  }
  return Register();
}
186+
69187
bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;

  // Set up an instruction builder with CSE so the copies inserted below get
  // deduplicated.
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  GISelCSEAnalysisWrapper &CSEWrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  GISelCSEInfo &CSEInfo = CSEWrapper.get(TPC.getCSEConfig());
  GISelObserverWrapper Observer;
  Observer.addObserver(&CSEInfo);

  CSEMIRBuilder B(MF);
  B.setCSEInfo(&CSEInfo);
  B.setChangeObserver(Observer);

  RAIIDelegateInstaller DelegateInstaller(MF, &Observer);
  RAIIMFObserverInstaller MFObserverInstaller(MF, Observer);

  IntrinsicLaneMaskAnalyzer ILMA(MF);
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
  MachineRegisterInfo &MRI = *B.getMRI();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegBankInfo());

  // At this point virtual registers don't have register banks yet; vregs in
  // def and use operands of already inst-selected instructions carry a
  // register class instead.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // Vregs in def and use operands of a COPY can have either a register
      // class or a bank. Only when the def vreg has neither, assign a bank.
      if (MI.isCopy()) {
        Register DefReg = getVReg(MI.getOperand(0));
        if (DefReg.isValid() && !MRI.getRegClassOrNull(DefReg)) {
          assert(!MRI.getRegBankOrNull(DefReg));
          MRI.setRegBank(DefReg, *RBSHelper.getRegBankToAssign(DefReg));
        }
        continue;
      }

      if (!MI.isPreISelOpcode())
        continue;

      // Vregs in def and use operands of G_ instructions need register
      // banks. Before this point the possible cases are:
      // - (1) vreg without register class or bank in a def or use operand
      // - (2) vreg with register class in a def operand
      // - (3) vreg, defined by a G_ instruction, in a use operand
      // - (4) vreg, defined by a pre-inst-selected instruction, in a use
      //       operand

      // Cases (1)-(3) are handled through the def operands: (1) simply gets
      // setRegBank; (2) and (3) go through reAssignRegBankOnDef, which also
      // rewrites the generic uses of the def.
      for (MachineOperand &DefOP : MI.defs()) {
        Register DefReg = getVReg(DefOP);
        if (!DefReg.isValid())
          continue;

        const RegisterBank *RB = RBSHelper.getRegBankToAssign(DefReg);
        if (MRI.getRegClassOrNull(DefReg)) {
          RBSHelper.reAssignRegBankOnDef(MI, DefOP, RB);
        } else {
          assert(!MRI.getRegBankOrNull(DefReg));
          MRI.setRegBank(DefReg, *RB);
        }
      }

      // Register bank select doesn't modify pre-inst-selected instructions,
      // so case (4) needs an inserted copy, done by constrainRegBankUse.
      for (MachineOperand &UseOP : MI.uses()) {
        Register UseReg = getVReg(UseOP);
        if (!UseReg.isValid())
          continue;

        // Skip case (3): the use is defined by a G_ instruction.
        if (!MRI.getRegClassOrNull(UseReg) ||
            MRI.getVRegDef(UseReg)->isPreISelOpcode())
          continue;

        // Use with a register class set by a pre-inst-selected instruction.
        const RegisterBank *RB = RBSHelper.getRegBankToAssign(UseReg);
        RBSHelper.constrainRegBankUse(MI, UseOP, RB);
      }
    }
  }

  return true;
}

0 commit comments

Comments
 (0)