Skip to content

Commit e7081d1

Browse files
author
Changpeng Fang
committed
AMDGPU: Implement waterfall loop for MIMG instructions with 256-bit SRsrc
Summary: When the resource descriptor is a VGPR, we need a waterfall loop to read it into an SGPR. In this patch, we generalized the implementation to work for any register class size, and extended the work to MIMG instructions. Fixes: SWDEV-223405 Reviewers: arsenm, nhaehnle Differential Revision: https://reviews.llvm.org/D82603
1 parent d14cf45 commit e7081d1

File tree

5 files changed

+321
-223
lines changed

5 files changed

+321
-223
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 67 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -4759,59 +4759,78 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
47594759

47604760
MachineBasicBlock::iterator I = LoopBB.begin();
47614761

4762+
SmallVector<Register, 8> ReadlanePieces;
4763+
Register CondReg = AMDGPU::NoRegister;
4764+
47624765
Register VRsrc = Rsrc.getReg();
47634766
unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
47644767

4765-
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4766-
Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
4767-
Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
4768-
Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
4769-
Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4770-
Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4771-
Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4772-
Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4773-
Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
4774-
4775-
// Beginning of the loop, read the next Rsrc variant.
4776-
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
4777-
.addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
4778-
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
4779-
.addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
4780-
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
4781-
.addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
4782-
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
4783-
.addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
4784-
4785-
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
4786-
.addReg(SRsrcSub0)
4787-
.addImm(AMDGPU::sub0)
4788-
.addReg(SRsrcSub1)
4789-
.addImm(AMDGPU::sub1)
4790-
.addReg(SRsrcSub2)
4791-
.addImm(AMDGPU::sub2)
4792-
.addReg(SRsrcSub3)
4793-
.addImm(AMDGPU::sub3);
4768+
unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI);
4769+
unsigned NumSubRegs = RegSize / 32;
4770+
assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size");
4771+
4772+
for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
4773+
4774+
Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4775+
Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4776+
4777+
// Read the next variant <- also loop target.
4778+
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
4779+
.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx));
4780+
4781+
// Read the next variant <- also loop target.
4782+
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
4783+
.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1));
4784+
4785+
ReadlanePieces.push_back(CurRegLo);
4786+
ReadlanePieces.push_back(CurRegHi);
4787+
4788+
// Comparison is to be done as 64-bit.
4789+
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
4790+
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
4791+
.addReg(CurRegLo)
4792+
.addImm(AMDGPU::sub0)
4793+
.addReg(CurRegHi)
4794+
.addImm(AMDGPU::sub1);
4795+
4796+
Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
4797+
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
4798+
.addReg(CurReg)
4799+
.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
4800+
4801+
// Combine the comparison results with AND.
4802+
if (CondReg == AMDGPU::NoRegister) // First.
4803+
CondReg = NewCondReg;
4804+
else { // If not the first, we create an AND.
4805+
Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
4806+
BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
4807+
.addReg(CondReg)
4808+
.addReg(NewCondReg);
4809+
CondReg = AndReg;
4810+
}
4811+
} // End for loop.
4812+
4813+
auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc));
4814+
Register SRsrc = MRI.createVirtualRegister(SRsrcRC);
4815+
4816+
// Build scalar Rsrc.
4817+
auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc);
4818+
unsigned Channel = 0;
4819+
for (Register Piece : ReadlanePieces) {
4820+
Merge.addReg(Piece)
4821+
.addImm(TRI->getSubRegFromChannel(Channel++));
4822+
}
47944823

47954824
// Update Rsrc operand to use the SGPR Rsrc.
47964825
Rsrc.setReg(SRsrc);
47974826
Rsrc.setIsKill(true);
47984827

4799-
// Identify all lanes with identical Rsrc operands in their VGPRs.
4800-
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
4801-
.addReg(SRsrc, 0, AMDGPU::sub0_sub1)
4802-
.addReg(VRsrc, 0, AMDGPU::sub0_sub1);
4803-
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
4804-
.addReg(SRsrc, 0, AMDGPU::sub2_sub3)
4805-
.addReg(VRsrc, 0, AMDGPU::sub2_sub3);
4806-
BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
4807-
.addReg(CondReg0)
4808-
.addReg(CondReg1);
4809-
4810-
MRI.setSimpleHint(SaveExec, AndCond);
4828+
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4829+
MRI.setSimpleHint(SaveExec, CondReg);
48114830

48124831
// Update EXEC to matching lanes, saving original to SaveExec.
48134832
BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
4814-
.addReg(AndCond, RegState::Kill);
4833+
.addReg(CondReg, RegState::Kill);
48154834

48164835
// The original instruction is here; we insert the terminators after it.
48174836
I = LoopBB.end();
@@ -4820,6 +4839,7 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
48204839
BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
48214840
.addReg(Exec)
48224841
.addReg(SaveExec);
4842+
48234843
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
48244844
}
48254845

@@ -5081,16 +5101,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
50815101
(AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
50825102
(isMUBUF(MI) || isMTBUF(MI)))) {
50835103
MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
5084-
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
5085-
unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
5086-
SRsrc->setReg(SGPR);
5087-
}
5104+
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
5105+
loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
50885106

50895107
MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
5090-
if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
5091-
unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
5092-
SSamp->setReg(SGPR);
5093-
}
5108+
if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
5109+
loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
5110+
50945111
return;
50955112
}
50965113

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
2+
3+
4+
declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
5+
6+
; GCN-LABEL: {{^}}water_loop_rsrc:
7+
8+
; GCN: [[RSRC_LOOP:[a-zA-Z0-9_]+]]: ; =>This Inner Loop Header: Depth=1
9+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG0:[0-9]+]], v[[VREG0:[0-9]+]]
10+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG1:[0-9]+]], v[[VREG1:[0-9]+]]
11+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG2:[0-9]+]], v[[VREG2:[0-9]+]]
12+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG3:[0-9]+]], v[[VREG3:[0-9]+]]
13+
; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP0:vcc]], s{{\[}}[[SREG0]]:[[SREG1]]{{\]}}, v{{\[}}[[VREG0]]:[[VREG1]]{{\]}}
14+
; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SREG2]]:[[SREG3]]{{\]}}, v{{\[}}[[VREG2]]:[[VREG3]]{{\]}}
15+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG4:[0-9]+]], v[[VREG4:[0-9]+]]
16+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG5:[0-9]+]], v[[VREG5:[0-9]+]]
17+
; GCN-NEXT: s_and_b64 [[AND0:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
18+
; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP2:vcc]], s{{\[}}[[SREG4]]:[[SREG5]]{{\]}}, v{{\[}}[[VREG4]]:[[VREG5]]{{\]}}
19+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG6:[0-9]+]], v[[VREG6:[0-9]+]]
20+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG7:[0-9]+]], v[[VREG7:[0-9]+]]
21+
; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP3:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SREG6]]:[[SREG7]]{{\]}}, v{{\[}}[[VREG6]]:[[VREG7]]{{\]}}
22+
; GCN-NEXT: s_and_b64 [[AND1:s\[[0-9]+:[0-9]+\]]], [[AND0]], [[CMP2]]
23+
; GCN-NEXT: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[AND1]], [[CMP3]]
24+
; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
25+
; GCN-NEXT: s_nop 0
26+
; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG7]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1
27+
; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]]
28+
; GCN-NEXT: s_cbranch_execnz [[RSRC_LOOP]]
29+
define amdgpu_ps <4 x float> @water_loop_rsrc(<8 x i32> %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
30+
main_body:
31+
%v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
32+
ret <4 x float> %v
33+
}
34+
35+
36+
; GCN-LABEL: {{^}}water_loop_samp:
37+
38+
; GCN: [[SAMP_LOOP:[a-zA-Z0-9_]+]]: ; =>This Inner Loop Header: Depth=1
39+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG0:[0-9]+]], v[[VREG0:[0-9]+]]
40+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG1:[0-9]+]], v[[VREG1:[0-9]+]]
41+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG2:[0-9]+]], v[[VREG2:[0-9]+]]
42+
; GCN-NEXT: v_readfirstlane_b32 s[[SREG3:[0-9]+]], v[[VREG3:[0-9]+]]
43+
44+
; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP0:vcc]], s{{\[}}[[SREG0]]:[[SREG1]]{{\]}}, v{{\[}}[[VREG0]]:[[VREG1]]{{\]}}
45+
; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SREG2]]:[[SREG3]]{{\]}}, v{{\[}}[[VREG2]]:[[VREG3]]{{\]}}
46+
; GCN-NEXT: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
47+
; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
48+
; GCN-NEXT: s_nop 0
49+
50+
; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG3]]{{\]}} dmask:0x1
51+
; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]]
52+
; GCN-NEXT: s_cbranch_execnz [[SAMP_LOOP]]
53+
define amdgpu_ps <4 x float> @water_loop_samp(<8 x i32> inreg %rsrc, <4 x i32> %samp, float %s, float %t) {
54+
main_body:
55+
%v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
56+
ret <4 x float> %v
57+
}

0 commit comments

Comments
 (0)