Skip to content

Commit 6c9a9d9

Browse files
authored
[AMDGPU] Set inst_pref_size to maximum (#126981)
On gfx11 and gfx12, set the initial instruction prefetch size to the minimum of the kernel size and the maximum allowed value. Fixes: SWDEV-513122
1 parent 4dd29eb commit 6c9a9d9

File tree

4 files changed

+77
-15
lines changed

4 files changed

+77
-15
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1230,18 +1230,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
12301230
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
12311231
ProgInfo.EXCPEnable = 0;
12321232

1233-
if (STM.hasGFX90AInsts()) {
1234-
// return ((Dst & ~Mask) | (Value << Shift))
1235-
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1236-
uint32_t Shift) {
1237-
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1238-
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1239-
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1240-
Dst = MCBinaryExpr::createOr(
1241-
Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1242-
return Dst;
1243-
};
1233+
// return ((Dst & ~Mask) | (Value << Shift))
1234+
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1235+
uint32_t Shift) {
1236+
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1237+
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1238+
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1239+
Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
1240+
Ctx);
1241+
return Dst;
1242+
};
12441243

1244+
if (STM.hasGFX90AInsts()) {
12451245
ProgInfo.ComputePGMRSrc3 =
12461246
SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
12471247
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
@@ -1268,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
12681268
", final occupancy is " + Twine(Occupancy));
12691269
F.getContext().diagnose(Diag);
12701270
}
1271+
1272+
if (isGFX11Plus(STM)) {
1273+
uint32_t CodeSizeInBytes = (uint32_t)std::min(
1274+
ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
1275+
(uint64_t)std::numeric_limits<uint32_t>::max());
1276+
uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
1277+
uint32_t Field, Shift, Width;
1278+
if (isGFX11(STM)) {
1279+
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1280+
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1281+
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1282+
} else {
1283+
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1284+
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1285+
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1286+
}
1287+
uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
1288+
ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
1289+
CreateExpr(InstPrefSize), Field, Shift);
1290+
}
12711291
}
12721292

12731293
static unsigned getRsrcReg(CallingConv::ID CallConv) {

llvm/lib/Target/AMDGPU/SIProgramInfo.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,9 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
202202
return MCConstantExpr::create(0, Ctx);
203203
}
204204

205-
uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
206-
if (CodeSizeInBytes.has_value())
205+
uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF,
206+
bool IsLowerBound) {
207+
if (!IsLowerBound && CodeSizeInBytes.has_value())
207208
return *CodeSizeInBytes;
208209

209210
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
@@ -216,14 +217,20 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
216217
// overestimated. In case of inline asm used getInstSizeInBytes() will
217218
// return a maximum size of a single instruction, where the real size may
218219
// differ. At this point CodeSize may be already off.
219-
CodeSize = alignTo(CodeSize, MBB.getAlignment());
220+
if (!IsLowerBound)
221+
CodeSize = alignTo(CodeSize, MBB.getAlignment());
220222

221223
for (const MachineInstr &MI : MBB) {
222224
// TODO: CodeSize should account for multiple functions.
223225

224226
if (MI.isMetaInstruction())
225227
continue;
226228

229+
// We cannot properly estimate inline asm size. It can be as small as zero
230+
// if that is just a comment.
231+
if (IsLowerBound && MI.isInlineAsm())
232+
continue;
233+
227234
CodeSize += TII->getInstSizeInBytes(MI);
228235
}
229236
}

llvm/lib/Target/AMDGPU/SIProgramInfo.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
101101
void reset(const MachineFunction &MF);
102102

103103
// Get function code size and cache the value.
104-
uint64_t getFunctionCodeSize(const MachineFunction &MF);
104+
// If \p IsLowerBound is set it returns a minimal code size which is safe
105+
// to address.
106+
uint64_t getFunctionCodeSize(const MachineFunction &MF,
107+
bool IsLowerBound = false);
105108

106109
/// Compute the value of the ComputePGMRsrc1 register.
107110
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s
3+
4+
; GCN-LABEL: .amdhsa_kernel large
5+
; GFX11: .amdhsa_inst_pref_size 3
6+
; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
7+
; GFX12: .amdhsa_inst_pref_size 4
8+
; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
9+
define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
10+
bb:
11+
call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
12+
ret void
13+
}
14+
15+
; GCN-LABEL: .amdhsa_kernel small
16+
; GCN: .amdhsa_inst_pref_size 1
17+
; GCN: codeLenInByte = {{[0-9]$}}
18+
define amdgpu_kernel void @small() {
19+
bb:
20+
ret void
21+
}
22+
23+
; Ignore inline asm in size calculation
24+
25+
; GCN-LABEL: .amdhsa_kernel inline_asm
26+
; GCN: .amdhsa_inst_pref_size 1
27+
; GCN: codeLenInByte = {{[0-9]$}}
28+
define amdgpu_kernel void @inline_asm() {
29+
bb:
30+
call void asm sideeffect ".fill 256, 4, 0", ""()
31+
ret void
32+
}

0 commit comments

Comments
 (0)