Skip to content

Commit 2a7487c

Browse files
authored
[AMDGPU][True16][CodeGen] true16 codegen pattern for fma (llvm#122950)
Add a true16 codegen pattern for f16 fma. Created a duplicate shrink-mad-fma-gfx10.mir from shrink-mad-fma to separate the pre-GFX11 and GFX11 MIR tests.
1 parent 29b7295 commit 2a7487c

10 files changed

+871
-244
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ static unsigned macToMad(unsigned Opc) {
198198
return AMDGPU::V_FMA_F32_e64;
199199
case AMDGPU::V_FMAC_F16_e64:
200200
return AMDGPU::V_FMA_F16_gfx9_e64;
201+
case AMDGPU::V_FMAC_F16_t16_e64:
202+
return AMDGPU::V_FMA_F16_gfx9_t16_e64;
201203
case AMDGPU::V_FMAC_F16_fake16_e64:
202204
return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
203205
case AMDGPU::V_FMAC_LEGACY_F32_e64:

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+49-25
Original file line numberDiff line numberDiff line change
@@ -3544,6 +3544,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35443544
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
35453545
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35463546
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3547+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35473548
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
35483549
// Don't fold if we are using source or output modifiers. The new VOP2
35493550
// instructions don't have them.
@@ -3564,6 +3565,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35643565
bool IsFMA =
35653566
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35663567
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3568+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35673569
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
35683570
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
35693571
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3597,16 +3599,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35973599

35983600
unsigned NewOpc =
35993601
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3600-
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3602+
: ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3603+
? AMDGPU::V_FMAMK_F16_t16
3604+
: AMDGPU::V_FMAMK_F16_fake16
36013605
: AMDGPU::V_FMAMK_F16)
36023606
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
36033607
if (pseudoToMCOpcode(NewOpc) == -1)
36043608
return false;
36053609

3606-
// V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3607-
// would also require restricting their register classes. For now
3608-
// just bail out.
3609-
if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3610+
// V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3611+
// takes VGPR_32_Lo128 operands, so the rewrite would also require
3612+
// restricting their register classes. For now just bail out.
3613+
if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3614+
NewOpc == AMDGPU::V_FMAMK_F16_fake16)
36103615
return false;
36113616

36123617
const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
@@ -3621,7 +3626,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36213626
Src0->setIsKill(RegSrc->isKill());
36223627

36233628
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3624-
Opc == AMDGPU::V_FMAC_F32_e64 ||
3629+
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36253630
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36263631
UseMI.untieRegOperand(
36273632
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3676,23 +3681,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36763681

36773682
unsigned NewOpc =
36783683
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3679-
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3684+
: ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3685+
? AMDGPU::V_FMAAK_F16_t16
3686+
: AMDGPU::V_FMAAK_F16_fake16
36803687
: AMDGPU::V_FMAAK_F16)
36813688
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
36823689
if (pseudoToMCOpcode(NewOpc) == -1)
36833690
return false;
36843691

3685-
// V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3686-
// would also require restricting their register classes. For now
3687-
// just bail out.
3688-
if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3692+
// V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3693+
// takes VGPR_32_Lo128 operands, so the rewrite would also require
3694+
// restricting their register classes. For now just bail out.
3695+
if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3696+
NewOpc == AMDGPU::V_FMAAK_F16_fake16)
36893697
return false;
36903698

36913699
// FIXME: This would be a lot easier if we could return a new instruction
36923700
// instead of having to modify in place.
36933701

36943702
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3695-
Opc == AMDGPU::V_FMAC_F32_e64 ||
3703+
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36963704
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36973705
UseMI.untieRegOperand(
36983706
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3879,8 +3887,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
38793887
return AMDGPU::V_FMA_LEGACY_F32_e64;
38803888
case AMDGPU::V_FMAC_F16_e32:
38813889
case AMDGPU::V_FMAC_F16_e64:
3890+
case AMDGPU::V_FMAC_F16_t16_e64:
38823891
case AMDGPU::V_FMAC_F16_fake16_e64:
3883-
return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3892+
return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3893+
? AMDGPU::V_FMA_F16_gfx9_t16_e64
3894+
: AMDGPU::V_FMA_F16_gfx9_fake16_e64
38843895
: AMDGPU::V_FMA_F16_gfx9_e64;
38853896
case AMDGPU::V_FMAC_F32_e32:
38863897
case AMDGPU::V_FMAC_F32_e64:
@@ -3946,19 +3957,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39463957
return MIB;
39473958
}
39483959

3949-
assert(
3950-
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3951-
"V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3952-
"pre-RA");
3960+
assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3961+
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3962+
"V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
3963+
"present "
3964+
"pre-RA");
39533965

39543966
// Handle MAC/FMAC.
39553967
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
39563968
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3969+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39573970
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
39583971
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
39593972
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
39603973
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
39613974
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3975+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39623976
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
39633977
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
39643978
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3973,6 +3987,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39733987
return nullptr;
39743988
case AMDGPU::V_MAC_F16_e64:
39753989
case AMDGPU::V_FMAC_F16_e64:
3990+
case AMDGPU::V_FMAC_F16_t16_e64:
39763991
case AMDGPU::V_FMAC_F16_fake16_e64:
39773992
case AMDGPU::V_MAC_F32_e64:
39783993
case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4058,8 +4073,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40584073
int64_t Imm;
40594074
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
40604075
unsigned NewOpc =
4061-
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
4062-
: AMDGPU::V_FMAAK_F16)
4076+
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
4077+
? ST.useRealTrue16Insts()
4078+
? AMDGPU::V_FMAAK_F16_t16
4079+
: AMDGPU::V_FMAAK_F16_fake16
4080+
: AMDGPU::V_FMAAK_F16)
40634081
: AMDGPU::V_FMAAK_F32)
40644082
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
40654083
if (pseudoToMCOpcode(NewOpc) != -1) {
@@ -4076,11 +4094,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40764094
return MIB;
40774095
}
40784096
}
4079-
unsigned NewOpc =
4080-
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
4081-
: AMDGPU::V_FMAMK_F16)
4082-
: AMDGPU::V_FMAMK_F32)
4083-
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4097+
unsigned NewOpc = IsFMA
4098+
? (IsF16 ? (ST.hasTrue16BitInsts()
4099+
? ST.useRealTrue16Insts()
4100+
? AMDGPU::V_FMAMK_F16_t16
4101+
: AMDGPU::V_FMAMK_F16_fake16
4102+
: AMDGPU::V_FMAMK_F16)
4103+
: AMDGPU::V_FMAMK_F32)
4104+
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
40844105
if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
40854106
if (pseudoToMCOpcode(NewOpc) != -1) {
40864107
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4526,6 +4547,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
45264547
case AMDGPU::V_MAC_F32_e64:
45274548
case AMDGPU::V_MAC_LEGACY_F32_e64:
45284549
case AMDGPU::V_FMAC_F16_e64:
4550+
case AMDGPU::V_FMAC_F16_t16_e64:
45294551
case AMDGPU::V_FMAC_F16_fake16_e64:
45304552
case AMDGPU::V_FMAC_F32_e64:
45314553
case AMDGPU::V_FMAC_F64_e64:
@@ -5582,7 +5604,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
55825604
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
55835605
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
55845606
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5585-
case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5607+
case AMDGPU::S_FMAC_F16:
5608+
return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5609+
: AMDGPU::V_FMAC_F16_fake16_e64;
55865610
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
55875611
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
55885612
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;

llvm/lib/Target/AMDGPU/SIInstructions.td

+8
Original file line numberDiff line numberDiff line change
@@ -3287,6 +3287,14 @@ def : GCNPat <
32873287
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
32883288
SRCMODS.NONE, $src2)
32893289
>;
3290+
let True16Predicate = UseRealTrue16Insts in
3291+
def : GCNPat <
3292+
(fma (f16 (VOP3NoMods f16:$src0)),
3293+
(f16 (VOP3NoMods f16:$src1)),
3294+
(f16 (VOP3NoMods f16:$src2))),
3295+
(V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
3296+
SRCMODS.NONE, $src2)
3297+
>;
32903298
let True16Predicate = UseFakeTrue16Insts in
32913299
def : GCNPat <
32923300
(fma (f16 (VOP3NoMods f16:$src0)),

llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

+13-4
Original file line numberDiff line numberDiff line change
@@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
455455
break;
456456
case AMDGPU::V_FMA_F16_e64:
457457
case AMDGPU::V_FMA_F16_gfx9_e64:
458+
NewOpcode = AMDGPU::V_FMAAK_F16;
459+
break;
460+
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
461+
NewOpcode = AMDGPU::V_FMAAK_F16_t16;
462+
break;
458463
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
459-
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
460-
: AMDGPU::V_FMAAK_F16;
464+
NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
461465
break;
462466
}
463467
}
@@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
485489
break;
486490
case AMDGPU::V_FMA_F16_e64:
487491
case AMDGPU::V_FMA_F16_gfx9_e64:
492+
NewOpcode = AMDGPU::V_FMAMK_F16;
493+
break;
494+
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
495+
NewOpcode = AMDGPU::V_FMAMK_F16_t16;
496+
break;
488497
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
489-
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
490-
: AMDGPU::V_FMAMK_F16;
498+
NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
491499
break;
492500
}
493501
}
@@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
959967
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
960968
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
961969
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
970+
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
962971
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
963972
shrinkMadFma(MI);
964973
continue;

llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll

+47-21
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
44
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
55
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6-
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
6+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
7+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
78

89
define float @v_fma_f32(float %x, float %y, float %z) {
910
; GFX6-LABEL: v_fma_f32:
@@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) {
107108
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
108109
; GFX10-NEXT: s_setpc_b64 s[30:31]
109110
;
110-
; GFX11-LABEL: v_fma_f16:
111-
; GFX11: ; %bb.0:
112-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113-
; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
114-
; GFX11-NEXT: s_setpc_b64 s[30:31]
111+
; GFX11-TRUE16-LABEL: v_fma_f16:
112+
; GFX11-TRUE16: ; %bb.0:
113+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114+
; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
115+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
116+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
117+
;
118+
; GFX11-FAKE16-LABEL: v_fma_f16:
119+
; GFX11-FAKE16: ; %bb.0:
120+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
122+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
115123
%fma = call half @llvm.fma.f16(half %x, half %y, half %z)
116124
ret half %fma
117125
}
@@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
145153
; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2
146154
; GFX10-NEXT: s_setpc_b64 s[30:31]
147155
;
148-
; GFX11-LABEL: v_fma_f16_fneg_lhs:
149-
; GFX11: ; %bb.0:
150-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151-
; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2
152-
; GFX11-NEXT: s_setpc_b64 s[30:31]
156+
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs:
157+
; GFX11-TRUE16: ; %bb.0:
158+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159+
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l
160+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
161+
;
162+
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs:
163+
; GFX11-FAKE16: ; %bb.0:
164+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2
166+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
153167
%neg.x = fneg half %x
154168
%fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z)
155169
ret half %fma
@@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
184198
; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2
185199
; GFX10-NEXT: s_setpc_b64 s[30:31]
186200
;
187-
; GFX11-LABEL: v_fma_f16_fneg_rhs:
188-
; GFX11: ; %bb.0:
189-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190-
; GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2
191-
; GFX11-NEXT: s_setpc_b64 s[30:31]
201+
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs:
202+
; GFX11-TRUE16: ; %bb.0:
203+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204+
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l
205+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
206+
;
207+
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs:
208+
; GFX11-FAKE16: ; %bb.0:
209+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
211+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
192212
%neg.y = fneg half %y
193213
%fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z)
194214
ret half %fma
@@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
223243
; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2
224244
; GFX10-NEXT: s_setpc_b64 s[30:31]
225245
;
226-
; GFX11-LABEL: v_fma_f16_fneg_add:
227-
; GFX11: ; %bb.0:
228-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229-
; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2
230-
; GFX11-NEXT: s_setpc_b64 s[30:31]
246+
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add:
247+
; GFX11-TRUE16: ; %bb.0:
248+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249+
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
250+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
251+
;
252+
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add:
253+
; GFX11-FAKE16: ; %bb.0:
254+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
255+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
256+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
231257
%neg.z = fneg half %z
232258
%fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z)
233259
ret half %fma

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
2-
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
2+
# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow
3+
# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
34
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
45

56
---

0 commit comments

Comments
 (0)