@@ -3544,6 +3544,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
       Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
       Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+      Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
       Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
     // Don't fold if we are using source or output modifiers. The new VOP2
     // instructions don't have them.
@@ -3564,6 +3565,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     bool IsFMA =
         Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
         Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+        Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
         Opc == AMDGPU::V_FMAC_F16_fake16_e64;
     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3597,16 +3599,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 
       unsigned NewOpc =
           IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
-                   : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
+                   : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                                  ? AMDGPU::V_FMAMK_F16_t16
+                                                  : AMDGPU::V_FMAMK_F16_fake16
                                             : AMDGPU::V_FMAMK_F16)
                 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
       if (pseudoToMCOpcode(NewOpc) == -1)
         return false;
 
-      // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
-      // would also require restricting their register classes. For now
-      // just bail out.
-      if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
+      // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
+      // takes VGPR_32_Lo128 operands, so the rewrite would also require
+      // restricting their register classes. For now just bail out.
+      if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
+          NewOpc == AMDGPU::V_FMAMK_F16_fake16)
         return false;
 
       const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
@@ -3621,7 +3626,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       Src0->setIsKill(RegSrc->isKill());
 
       if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-          Opc == AMDGPU::V_FMAC_F32_e64 ||
+          Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
           Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
         UseMI.untieRegOperand(
             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3676,23 +3681,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 
     unsigned NewOpc =
         IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
-                 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
+                 : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                                ? AMDGPU::V_FMAAK_F16_t16
+                                                : AMDGPU::V_FMAAK_F16_fake16
                                           : AMDGPU::V_FMAAK_F16)
               : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
     if (pseudoToMCOpcode(NewOpc) == -1)
       return false;
 
-    // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
-    // would also require restricting their register classes. For now
-    // just bail out.
-    if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
+    // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
+    // takes VGPR_32_Lo128 operands, so the rewrite would also require
+    // restricting their register classes. For now just bail out.
+    if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
+        NewOpc == AMDGPU::V_FMAAK_F16_fake16)
       return false;
 
     // FIXME: This would be a lot easier if we could return a new instruction
     // instead of having to modify in place.
 
     if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-        Opc == AMDGPU::V_FMAC_F32_e64 ||
+        Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
         Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
       UseMI.untieRegOperand(
           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3879,8 +3887,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
     return AMDGPU::V_FMA_LEGACY_F32_e64;
   case AMDGPU::V_FMAC_F16_e32:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
-    return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+    return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                        ? AMDGPU::V_FMA_F16_gfx9_t16_e64
+                                        : AMDGPU::V_FMA_F16_gfx9_fake16_e64
                                   : AMDGPU::V_FMA_F16_gfx9_e64;
   case AMDGPU::V_FMAC_F32_e32:
   case AMDGPU::V_FMAC_F32_e64:
@@ -3946,19 +3957,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
     return MIB;
   }
 
-  assert(
-      Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
-      "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
-      "pre-RA");
+  assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
+         Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
+         "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
+         "present "
+         "pre-RA");
 
   // Handle MAC/FMAC.
   bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+               Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_fake16_e64;
   bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
                Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
                Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
                Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+               Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
                Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
   bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3973,6 +3987,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
     return nullptr;
   case AMDGPU::V_MAC_F16_e64:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4058,8 +4073,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
   int64_t Imm;
   if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
     unsigned NewOpc =
-        IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
-                                                 : AMDGPU::V_FMAAK_F16)
+        IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
+                              ? ST.useRealTrue16Insts()
+                                    ? AMDGPU::V_FMAAK_F16_t16
+                                    : AMDGPU::V_FMAAK_F16_fake16
+                              : AMDGPU::V_FMAAK_F16)
                        : AMDGPU::V_FMAAK_F32)
               : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
     if (pseudoToMCOpcode(NewOpc) != -1) {
@@ -4076,11 +4094,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
       return MIB;
     }
   }
-  unsigned NewOpc =
-      IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
-                                               : AMDGPU::V_FMAMK_F16)
-                     : AMDGPU::V_FMAMK_F32)
-            : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
+  unsigned NewOpc = IsFMA
+                        ? (IsF16 ? (ST.hasTrue16BitInsts()
+                                        ? ST.useRealTrue16Insts()
+                                              ? AMDGPU::V_FMAMK_F16_t16
+                                              : AMDGPU::V_FMAMK_F16_fake16
+                                        : AMDGPU::V_FMAMK_F16)
+                                 : AMDGPU::V_FMAMK_F32)
+                        : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
   if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
     if (pseudoToMCOpcode(NewOpc) != -1) {
       MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4526,6 +4547,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_LEGACY_F32_e64:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
   case AMDGPU::V_FMAC_F32_e64:
   case AMDGPU::V_FMAC_F64_e64:
@@ -5582,7 +5604,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
   case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
   case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
-  case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
+  case AMDGPU::S_FMAC_F16:
+    return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
+                                   : AMDGPU::V_FMAC_F16_fake16_e64;
   case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
   case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
   case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
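
Every hunk in this change repeats the same three-way opcode choice: targets without true16 keep the legacy F16 opcode, and true16 targets pick the `_t16` form when `useRealTrue16Insts()` is set, otherwise the `_fake16` form. Below is a minimal standalone C++ sketch of that selection logic. The `Opcode` enum, `Subtarget` struct, and `selectFMAMKF16` helper are mock-ups for illustration only; the real code queries `GCNSubtarget::hasTrue16BitInsts()` / `useRealTrue16Insts()` and writes the nested ternary inline at each site rather than through a helper.

```cpp
#include <cassert>
#include <cstdio>

// Mocked-up opcode set; the real values are AMDGPU::* enumerators.
enum Opcode {
  V_FMAMK_F16,        // pre-true16 targets
  V_FMAMK_F16_fake16, // true16 target, 16-bit values held in 32-bit VGPRs
  V_FMAMK_F16_t16,    // true16 target, real 16-bit VGPR halves
};

// Mocked-up subtarget; stands in for GCNSubtarget in this sketch.
struct Subtarget {
  bool HasTrue16BitInsts; // target implements the true16 ISA feature
  bool EnableRealTrue16;  // codegen emits real t16 (not fake16) encodings
  bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
  bool useRealTrue16Insts() const {
    return HasTrue16BitInsts && EnableRealTrue16;
  }
};

// The nested ternary used throughout the diff, factored into one place.
static Opcode selectFMAMKF16(const Subtarget &ST) {
  return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
                                      ? V_FMAMK_F16_t16
                                      : V_FMAMK_F16_fake16
                                : V_FMAMK_F16;
}

int main() {
  assert(selectFMAMKF16({false, false}) == V_FMAMK_F16);       // no true16
  assert(selectFMAMKF16({true, false}) == V_FMAMK_F16_fake16); // fake16 mode
  assert(selectFMAMKF16({true, true}) == V_FMAMK_F16_t16);     // real t16
  std::puts("opcode selection matches the diff's pattern");
  return 0;
}
```

Note that even after the t16 opcode is selected, the FMAMK/FMAAK folds above still bail out: per the updated comments, `V_FMAMK_F16_t16`/`V_FMAAK_F16_t16` take VGPR_16_Lo128 operands and the fake16 forms take VGPR_32_Lo128 operands, so completing the rewrite would additionally require restricting the operands' register classes.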