Skip to content

Commit 3601849

Browse files
authored
AMDGPU: Check for subreg match when folding through reg_sequence (llvm#140582)
We need to consider the use instruction's interpretation of the bits, not the defined immediate without use context. This will regress some cases where we previously could match f64 inline constants. We can restore them by either using pseudo instructions to materialize f64 constants, or recognizing reg_sequence decomposed into 32-bit pieces for them (which essentially means recognizing every other input is a 0). Fixes llvm#139908
1 parent 2b7cc2b commit 3601849

File tree

7 files changed

+242
-35
lines changed

7 files changed

+242
-35
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,8 @@ SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
895895
if (!SrcRC)
896896
return {};
897897

898+
// TODO: Recognize 64-bit splats broken into 32-bit pieces (i.e. recognize
899+
// every other element is 0 for 64-bit immediates)
898900
int64_t Imm;
899901
for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
900902
const MachineOperand *Op = Defs[I].first;
@@ -924,10 +926,41 @@ MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
924926
if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
925927
return nullptr;
926928

927-
// FIXME: Verify SplatRC is compatible with the use operand
928-
uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
929-
if (!TII->isInlineConstant(*SplatVal, OpTy) ||
930-
!TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
929+
int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
930+
if (RCID == -1)
931+
return nullptr;
932+
933+
// Special case 0/-1, since when interpreted as a 64-bit element both halves
934+
// have the same bits. Effectively this code does not handle 64-bit element
935+
// operands correctly, as the incoming 64-bit constants are already split into
936+
// 32-bit sequence elements.
937+
//
938+
// TODO: We should try to figure out how to interpret the reg_sequence as a
939+
// split 64-bit splat constant, or use 64-bit pseudos for materializing f64
940+
// constants.
941+
if (SplatVal->getImm() != 0 && SplatVal->getImm() != -1) {
942+
const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
943+
// We need to figure out the scalar type read by the operand. e.g. the MFMA
944+
// operand will be AReg_128, and we want to check if it's compatible with an
945+
// AReg_32 constant.
946+
uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
947+
switch (OpTy) {
948+
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
949+
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
950+
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
951+
break;
952+
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
953+
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
954+
break;
955+
default:
956+
return nullptr;
957+
}
958+
959+
if (!TRI->getCommonSubClass(OpRC, SplatRC))
960+
return nullptr;
961+
}
962+
963+
if (!TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
931964
return nullptr;
932965

933966
return SplatVal;
@@ -1039,14 +1072,13 @@ void SIFoldOperandsImpl::foldOperand(
10391072
}
10401073
}
10411074

1042-
if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
1075+
if (RSUse->getSubReg() != RegSeqDstSubReg)
10431076
continue;
10441077

1045-
if (RSUse->getSubReg() != RegSeqDstSubReg)
1078+
if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
10461079
continue;
10471080

1048-
foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1049-
CopiesToReplace);
1081+
foldOperand(OpToFold, RSUseMI, OpNo, FoldList, CopiesToReplace);
10501082
}
10511083

10521084
return;

llvm/test/CodeGen/AMDGPU/constrained-shift.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,10 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b)
192192
;
193193
; GISEL-LABEL: s_csh_v4i32:
194194
; GISEL: ; %bb.0:
195-
; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], 31
196-
; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], 31
195+
; GISEL-NEXT: s_mov_b32 s8, 31
196+
; GISEL-NEXT: s_mov_b32 s9, s8
197+
; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
198+
; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
197199
; GISEL-NEXT: s_lshl_b32 s8, s0, s4
198200
; GISEL-NEXT: s_lshl_b32 s9, s1, s5
199201
; GISEL-NEXT: s_lshl_b32 s10, s2, s6

llvm/test/CodeGen/AMDGPU/global-saddr-load.ll

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -745,7 +745,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1)
745745
;
746746
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001:
747747
; GFX12-SDAG: ; %bb.0:
748-
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], 1
748+
; GFX12-SDAG-NEXT: s_mov_b32 s0, 1
749+
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
750+
; GFX12-SDAG-NEXT: s_mov_b32 s1, s0
751+
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
749752
; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
750753
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
751754
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -262,11 +262,19 @@ bb:
262262
ret void
263263
}
264264

265-
; FIXME: This should not be foldable as an inline immediate
266265
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
267-
; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
266+
; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}}
267+
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
268+
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
269+
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
270+
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
271+
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
272+
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
273+
; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
274+
275+
; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
268276
; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
269-
; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
277+
; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
270278
; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
271279
; GCN: global_store_dwordx4
272280
; GCN: global_store_dwordx4

llvm/test/CodeGen/AMDGPU/operand-folding.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,9 @@ define i32 @issue139908(i64 %in) {
155155
; CHECK-LABEL: issue139908:
156156
; CHECK: ; %bb.0:
157157
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158-
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 42, v[0:1]
158+
; CHECK-NEXT: s_mov_b32 s4, 42
159+
; CHECK-NEXT: s_mov_b32 s5, s4
160+
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
159161
; CHECK-NEXT: v_cndmask_b32_e64 v0, 2, 1, vcc
160162
; CHECK-NEXT: s_setpc_b64 s[30:31]
161163
%eq = icmp eq i64 %in, 180388626474

llvm/test/CodeGen/AMDGPU/packed-fp32.ll

Lines changed: 179 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
22
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
3-
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
4-
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
5-
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
6-
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
3+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX90A-SDAG %s
4+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s
5+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s
6+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s
77

88
define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
99
; GFX900-LABEL: fadd_v2_vv:
@@ -411,10 +411,12 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
411411
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
412412
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
413413
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
414+
; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0
415+
; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
414416
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
415417
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
416418
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
417-
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0
419+
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
418420
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
419421
; PACKED-GISEL-NEXT: s_endpgm
420422
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1186,10 +1188,12 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
11861188
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
11871189
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
11881190
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1191+
; PACKED-GISEL-NEXT: s_mov_b32 s2, 4.0
1192+
; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
11891193
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
11901194
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
11911195
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
1192-
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0
1196+
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
11931197
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
11941198
; PACKED-GISEL-NEXT: s_endpgm
11951199
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1594,6 +1598,40 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
15941598
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0]
15951599
; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
15961600
; PACKED-SDAG-NEXT: s_endpgm
1601+
;
1602+
; GFX90A-GISEL-LABEL: fma_v2_v_imm:
1603+
; GFX90A-GISEL: ; %bb.0:
1604+
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1605+
; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1606+
; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1607+
; GFX90A-GISEL-NEXT: s_mov_b32 s4, 0x43480000
1608+
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
1609+
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1610+
; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1611+
; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4
1612+
; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
1613+
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
1614+
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
1615+
; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
1616+
; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
1617+
; GFX90A-GISEL-NEXT: s_endpgm
1618+
;
1619+
; GFX942-GISEL-LABEL: fma_v2_v_imm:
1620+
; GFX942-GISEL: ; %bb.0:
1621+
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1622+
; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1623+
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1624+
; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x43480000
1625+
; GFX942-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
1626+
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1627+
; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1628+
; GFX942-GISEL-NEXT: s_mov_b32 s5, s4
1629+
; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
1630+
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
1631+
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
1632+
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
1633+
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
1634+
; GFX942-GISEL-NEXT: s_endpgm
15971635
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
15981636
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
15991637
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1675,19 +1713,39 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
16751713
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
16761714
; PACKED-SDAG-NEXT: s_endpgm
16771715
;
1678-
; PACKED-GISEL-LABEL: fma_v2_v_lit_splat:
1679-
; PACKED-GISEL: ; %bb.0:
1680-
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1681-
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1682-
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1683-
; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0
1684-
; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
1685-
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1686-
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
1687-
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
1688-
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, s[2:3]
1689-
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1690-
; PACKED-GISEL-NEXT: s_endpgm
1716+
; GFX90A-GISEL-LABEL: fma_v2_v_lit_splat:
1717+
; GFX90A-GISEL: ; %bb.0:
1718+
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1719+
; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1720+
; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1721+
; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0
1722+
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0
1723+
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1724+
; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1725+
; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4
1726+
; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
1727+
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
1728+
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
1729+
; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
1730+
; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
1731+
; GFX90A-GISEL-NEXT: s_endpgm
1732+
;
1733+
; GFX942-GISEL-LABEL: fma_v2_v_lit_splat:
1734+
; GFX942-GISEL: ; %bb.0:
1735+
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1736+
; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1737+
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1738+
; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
1739+
; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0
1740+
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1741+
; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1742+
; GFX942-GISEL-NEXT: s_mov_b32 s5, s4
1743+
; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
1744+
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
1745+
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
1746+
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
1747+
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
1748+
; GFX942-GISEL-NEXT: s_endpgm
16911749
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
16921750
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
16931751
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1725,6 +1783,40 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
17251783
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
17261784
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
17271785
; PACKED-SDAG-NEXT: s_endpgm
1786+
;
1787+
; GFX90A-GISEL-LABEL: fma_v2_v_unfoldable_lit:
1788+
; GFX90A-GISEL: ; %bb.0:
1789+
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1790+
; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1791+
; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1792+
; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0
1793+
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0
1794+
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1795+
; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1796+
; GFX90A-GISEL-NEXT: s_mov_b32 s5, 2.0
1797+
; GFX90A-GISEL-NEXT: s_mov_b32 s3, 0x40400000
1798+
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
1799+
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
1800+
; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
1801+
; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
1802+
; GFX90A-GISEL-NEXT: s_endpgm
1803+
;
1804+
; GFX942-GISEL-LABEL: fma_v2_v_unfoldable_lit:
1805+
; GFX942-GISEL: ; %bb.0:
1806+
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1807+
; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1808+
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1809+
; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
1810+
; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0
1811+
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1812+
; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1813+
; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0
1814+
; GFX942-GISEL-NEXT: s_mov_b32 s3, 0x40400000
1815+
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
1816+
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
1817+
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
1818+
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
1819+
; GFX942-GISEL-NEXT: s_endpgm
17281820
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
17291821
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
17301822
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -2059,6 +2151,37 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
20592151
; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s0
20602152
; PACKED-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
20612153
; PACKED-SDAG-NEXT: s_endpgm
2154+
;
2155+
; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0:
2156+
; GFX90A-GISEL: ; %bb.0: ; %bb
2157+
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2158+
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0
2159+
; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
2160+
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2161+
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
2162+
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
2163+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v1
2164+
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
2165+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s0
2166+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v3, v0
2167+
; GFX90A-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
2168+
; GFX90A-GISEL-NEXT: s_endpgm
2169+
;
2170+
; GFX942-GISEL-LABEL: fadd_fadd_fsub_0:
2171+
; GFX942-GISEL: ; %bb.0: ; %bb
2172+
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2173+
; GFX942-GISEL-NEXT: s_mov_b32 s2, 0
2174+
; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
2175+
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
2176+
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
2177+
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
2178+
; GFX942-GISEL-NEXT: s_nop 0
2179+
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v1
2180+
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
2181+
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s0
2182+
; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v0
2183+
; GFX942-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
2184+
; GFX942-GISEL-NEXT: s_endpgm
20622185
bb:
20632186
%i12 = fadd <2 x float> zeroinitializer, %arg
20642187
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@@ -2099,6 +2222,40 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
20992222
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
21002223
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
21012224
; PACKED-SDAG-NEXT: s_endpgm
2225+
;
2226+
; GFX90A-GISEL-LABEL: fadd_fadd_fsub:
2227+
; GFX90A-GISEL: ; %bb.0: ; %bb
2228+
; GFX90A-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2229+
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2230+
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
2231+
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2232+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s2
2233+
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
2234+
; GFX90A-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2
2235+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, v1
2236+
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3]
2237+
; GFX90A-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2
2238+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, 0
2239+
; GFX90A-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
2240+
; GFX90A-GISEL-NEXT: s_endpgm
2241+
;
2242+
; GFX942-GISEL-LABEL: fadd_fadd_fsub:
2243+
; GFX942-GISEL: ; %bb.0: ; %bb
2244+
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2245+
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2246+
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
2247+
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
2248+
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s2
2249+
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
2250+
; GFX942-GISEL-NEXT: s_nop 0
2251+
; GFX942-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2
2252+
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, v1
2253+
; GFX942-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3]
2254+
; GFX942-GISEL-NEXT: s_nop 0
2255+
; GFX942-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2
2256+
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0
2257+
; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
2258+
; GFX942-GISEL-NEXT: s_endpgm
21022259
bb:
21032260
%i12 = fadd <2 x float> %arg, %arg1
21042261
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@@ -2251,3 +2408,6 @@ declare i32 @llvm.amdgcn.workitem.id.x()
22512408
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
22522409
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
22532410
declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
2411+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2412+
; GFX90A-SDAG: {{.*}}
2413+
; GFX942-SDAG: {{.*}}

0 commit comments

Comments
 (0)