@@ -4759,59 +4759,78 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
   MachineBasicBlock::iterator I = LoopBB.begin();
 
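+  // ReadlanePieces collects the 32-bit SGPR copies of the descriptor;
+  // CondReg accumulates the per-lane "equal to the first active lane" mask.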
+  SmallVector<Register, 8> ReadlanePieces;
+  Register CondReg = AMDGPU::NoRegister;
+
   Register VRsrc = Rsrc.getReg();
   unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
 
-  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
-  Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
-  Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
-  Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
-  Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-  Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-  Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-  Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-  Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
-
-  // Beginning of the loop, read the next Rsrc variant.
-  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
-      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
-  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
-      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
-  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
-      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
-  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
-      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
-
-  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
-      .addReg(SRsrcSub0)
-      .addImm(AMDGPU::sub0)
-      .addReg(SRsrcSub1)
-      .addImm(AMDGPU::sub1)
-      .addReg(SRsrcSub2)
-      .addImm(AMDGPU::sub2)
-      .addReg(SRsrcSub3)
-      .addImm(AMDGPU::sub3);
+  unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI);
+  unsigned NumSubRegs = RegSize / 32;
+  assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size");
+
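+  // Walk the VGPR descriptor in 64-bit chunks: readfirstlane two 32-bit
+  // pieces, then compare them as one 64-bit value against every lane's
+  // copy, AND-accumulating the per-lane results into CondReg.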
+  for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
+
+    Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+    // Read the next variant <- also loop target.
+    BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
+        .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx));
+
+    // Read the next variant <- also loop target.
+    BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
+        .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1));
+
+    ReadlanePieces.push_back(CurRegLo);
+    ReadlanePieces.push_back(CurRegHi);
+
+    // Comparison is to be done as 64-bit.
+    Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
+    BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
+        .addReg(CurRegLo)
+        .addImm(AMDGPU::sub0)
+        .addReg(CurRegHi)
+        .addImm(AMDGPU::sub1);
+
+    Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
+    BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
+        .addReg(CurReg)
+        .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
+
+    // Combine the comparison results with AND.
+    if (CondReg == AMDGPU::NoRegister) // First.
+      CondReg = NewCondReg;
+    else { // If not the first, we create an AND.
+      Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
+      BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
+          .addReg(CondReg)
+          .addReg(NewCondReg);
+      CondReg = AndReg;
+    }
+  } // End for loop.
+
+  auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc));
+  Register SRsrc = MRI.createVirtualRegister(SRsrcRC);
+
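+  // The SGPR result class mirrors the VGPR source's size, e.g. SGPR_128
+  // for a 128-bit buffer descriptor.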
+  // Build scalar Rsrc.
+  auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc);
+  unsigned Channel = 0;
+  for (Register Piece : ReadlanePieces) {
+    Merge.addReg(Piece)
+         .addImm(TRI->getSubRegFromChannel(Channel++));
+  }
 
   // Update Rsrc operand to use the SGPR Rsrc.
   Rsrc.setReg(SRsrc);
   Rsrc.setIsKill(true);
 
-  // Identify all lanes with identical Rsrc operands in their VGPRs.
-  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
-      .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
-      .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
-  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
-      .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
-      .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
-  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
-      .addReg(CondReg0)
-      .addReg(CondReg1);
-
-  MRI.setSimpleHint(SaveExec, AndCond);
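+  // Hint the allocator to reuse CondReg's register for SaveExec; CondReg
+  // is killed by the SaveExecOpc instruction below.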
+  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+  MRI.setSimpleHint(SaveExec, CondReg);
 
   // Update EXEC to matching lanes, saving original to SaveExec.
   BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
-      .addReg(AndCond, RegState::Kill);
+      .addReg(CondReg, RegState::Kill);
 
   // The original instruction is here; we insert the terminators after it.
   I = LoopBB.end();
@@ -4820,6 +4839,7 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
   BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
       .addReg(Exec)
       .addReg(SaveExec);
+
   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
 }
 
@@ -5081,16 +5101,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
       (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
        (isMUBUF(MI) || isMTBUF(MI)))) {
     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
-    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
-      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
-      SRsrc->setReg(SGPR);
-    }
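+    // Unlike the readlaneVGPRToSGPR it replaces, loadSRsrcFromVGPR wraps
+    // MI in a waterfall loop, so a divergent rsrc operand is handled too.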
+    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
+      loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
 
     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
-    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
-      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
-      SSamp->setReg(SGPR);
-    }
+    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
+      loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
+
     return;
   }