Skip to content

Commit 8df7596

Browse files
committed
[CodeGen] Fine tune MachineFunctionSplitPass (MFS) for FSAFDO.
The original MFS work D85368 shows good performance improvement with Instrumented FDO. However, AutoFDO or Flow-Sensitive AutoFDO (FSAFDO) does not show performance gain. This is mainly caused by a less accurate profile compared to the iFDO profile. For the past few months, we have been working to improve FSAFDO quality, like in D145171. Taking advantage of this improvement, MFS now shows performance improvements over FSAFDO profiles. That being said, 2 minor changes need to be made, 1) An FS-AutoFDO profile generation pass needs to be added right before MFS pass and an FSAFDO profile load pass is needed when FS-AutoFDO is enabled and the MFS flag is present. 2) MFS only applies to hot functions, because we believe (and experiment also shows) FS-AutoFDO is more accurate about functions that have plenty of samples than those with no or very few samples. With this improvement, we see a 1.2% performance improvement in clang benchmark, 0.9% QPS improvement in our internal search benchmark, and 3%-5% improvement in internal storage benchmark. This is #1 of the two patches that enables the improvement. Reviewed By: wenlei, snehasish, xur Differential Revision: https://reviews.llvm.org/D152399
1 parent 7c7b191 commit 8df7596

File tree

2 files changed

+109
-11
lines changed

2 files changed

+109
-11
lines changed

llvm/lib/CodeGen/MachineFunctionSplitter.cpp

+41-11
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
//===----------------------------------------------------------------------===//
2525

2626
#include "llvm/ADT/SmallVector.h"
27+
#include "llvm/Analysis/BlockFrequencyInfo.h"
28+
#include "llvm/Analysis/BranchProbabilityInfo.h"
2729
#include "llvm/Analysis/EHUtils.h"
2830
#include "llvm/Analysis/ProfileSummaryInfo.h"
2931
#include "llvm/CodeGen/BasicBlockSectionUtils.h"
@@ -94,16 +96,26 @@ static void setDescendantEHBlocksCold(MachineFunction &MF) {
9496
}
9597
}
9698

99+
static void finishAdjustingBasicBlocksAndLandingPads(MachineFunction &MF) {
100+
auto Comparator = [](const MachineBasicBlock &X, const MachineBasicBlock &Y) {
101+
return X.getSectionID().Type < Y.getSectionID().Type;
102+
};
103+
llvm::sortBasicBlocksAndUpdateBranches(MF, Comparator);
104+
llvm::avoidZeroOffsetLandingPad(MF);
105+
}
106+
97107
static bool isColdBlock(const MachineBasicBlock &MBB,
98108
const MachineBlockFrequencyInfo *MBFI,
99-
ProfileSummaryInfo *PSI) {
109+
ProfileSummaryInfo *PSI, bool HasAccurateProfile) {
100110
std::optional<uint64_t> Count = MBFI->getBlockProfileCount(&MBB);
111+
// If using accurate profile, no count means cold.
112+
// If no accurate profile, no count means "do not judge
113+
// coldness".
101114
if (!Count)
102-
return true;
115+
return HasAccurateProfile;
103116

104-
if (PercentileCutoff > 0) {
117+
if (PercentileCutoff > 0)
105118
return PSI->isColdCountNthPercentile(PercentileCutoff, *Count);
106-
}
107119
return (*Count < ColdCountThreshold);
108120
}
109121

@@ -140,9 +152,28 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
140152

141153
MachineBlockFrequencyInfo *MBFI = nullptr;
142154
ProfileSummaryInfo *PSI = nullptr;
155+
// Whether this pass is using FSAFDO profile (not accurate) or IRPGO
156+
// (accurate). HasAccurateProfile is only used when UseProfileData is true,
157+
// but giving it a default value to silent any possible warning.
158+
bool HasAccurateProfile = false;
143159
if (UseProfileData) {
144160
MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
145161
PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
162+
// "HasAccurateProfile" is false for FSAFDO, true when using IRPGO
163+
// (traditional instrumented FDO) or CSPGO profiles.
164+
HasAccurateProfile =
165+
PSI->hasInstrumentationProfile() || PSI->hasCSInstrumentationProfile();
166+
// If HasAccurateProfile is false, we only trust hot functions,
167+
// which have many samples, and consider them as split
168+
// candidates. On the other hand, if HasAccurateProfile (likeIRPGO), we
169+
// trust both cold and hot functions.
170+
if (!HasAccurateProfile && !PSI->isFunctionHotInCallGraph(&MF, *MBFI)) {
171+
// Split all EH code and it's descendant statically by default.
172+
if (SplitAllEHCode)
173+
setDescendantEHBlocksCold(MF);
174+
finishAdjustingBasicBlocksAndLandingPads(MF);
175+
return true;
176+
}
146177
}
147178

148179
SmallVector<MachineBasicBlock *, 2> LandingPads;
@@ -152,7 +183,8 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
152183

153184
if (MBB.isEHPad())
154185
LandingPads.push_back(&MBB);
155-
else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && !SplitAllEHCode)
186+
else if (UseProfileData &&
187+
isColdBlock(MBB, MBFI, PSI, HasAccurateProfile) && !SplitAllEHCode)
156188
MBB.setSectionID(MBBSectionID::ColdSectionID);
157189
}
158190

@@ -161,21 +193,19 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
161193
setDescendantEHBlocksCold(MF);
162194
// We only split out eh pads if all of them are cold.
163195
else {
196+
// Here we have UseProfileData == true.
164197
bool HasHotLandingPads = false;
165198
for (const MachineBasicBlock *LP : LandingPads) {
166-
if (!isColdBlock(*LP, MBFI, PSI))
199+
if (!isColdBlock(*LP, MBFI, PSI, HasAccurateProfile))
167200
HasHotLandingPads = true;
168201
}
169202
if (!HasHotLandingPads) {
170203
for (MachineBasicBlock *LP : LandingPads)
171204
LP->setSectionID(MBBSectionID::ColdSectionID);
172205
}
173206
}
174-
auto Comparator = [](const MachineBasicBlock &X, const MachineBasicBlock &Y) {
175-
return X.getSectionID().Type < Y.getSectionID().Type;
176-
};
177-
llvm::sortBasicBlocksAndUpdateBranches(MF, Comparator);
178-
llvm::avoidZeroOffsetLandingPad(MF);
207+
208+
finishAdjustingBasicBlocksAndLandingPads(MF);
179209
return true;
180210
}
181211

llvm/test/CodeGen/X86/machine-function-splitter.ll

+68
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -mfs-psi-cutoff=0 -mfs-count-threshold=2000 | FileCheck %s --dump-input=always -check-prefix=MFS-OPTS1
33
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -mfs-psi-cutoff=950000 | FileCheck %s -check-prefix=MFS-OPTS2
44
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -mfs-split-ehcode | FileCheck %s -check-prefix=MFS-EH-SPLIT
5+
; RUN: sed 's/InstrProf/SampleProfile/g' %s > %t.ll
6+
; RUN: llc < %t.ll -mtriple=x86_64-unknown-linux-gnu -split-machine-functions | FileCheck %s --check-prefix=FSAFDO-MFS
7+
; RUN: llc < %t.ll -mtriple=x86_64-unknown-linux-gnu -split-machine-functions | FileCheck %s --check-prefix=FSAFDO-MFS2
8+
59
define void @foo1(i1 zeroext %0) nounwind !prof !14 !section_prefix !15 {
610
;; Check that cold block is moved to .text.split.
711
; MFS-DEFAULTS-LABEL: foo1
@@ -368,6 +372,66 @@ try.cont: ; preds = %entry
368372
ret i32 %8
369373
}
370374

375+
define void @foo14(i1 zeroext %0, i1 zeroext %1) nounwind !prof !24 {
376+
; FSAFDO-MFS: .section .text.split.foo14,"ax"
377+
; FSAFDO-MFS: foo14.cold:
378+
br i1 %0, label %3, label %7, !prof !25
379+
380+
3:
381+
%4 = call i32 @bar()
382+
br label %7
383+
384+
5:
385+
%6 = call i32 @baz()
386+
br label %7
387+
388+
7:
389+
br i1 %1, label %8, label %10, !prof !26
390+
391+
8:
392+
%9 = call i32 @bam()
393+
br label %12
394+
395+
10:
396+
%11 = call i32 @baz()
397+
br label %12
398+
399+
12:
400+
%13 = tail call i32 @qux()
401+
ret void
402+
}
403+
404+
define void @foo15(i1 zeroext %0, i1 zeroext %1) nounwind !prof !27 {
405+
;; HasAccurateProfile is false, foo15 is hot, but no profile data for
406+
;; blocks, no split should happen.
407+
; FSAFDO-MFS2-NOT: .section .text.split.foo15,"ax"
408+
; FSAFDO-MFS2-NOT: foo15.cold:
409+
br i1 %0, label %3, label %7
410+
411+
3:
412+
%4 = call i32 @bar()
413+
br label %7
414+
415+
5:
416+
%6 = call i32 @baz()
417+
br label %7
418+
419+
7:
420+
br i1 %1, label %8, label %10
421+
422+
8:
423+
%9 = call i32 @bam()
424+
br label %12
425+
426+
10:
427+
%11 = call i32 @baz()
428+
br label %12
429+
430+
12:
431+
%13 = tail call i32 @qux()
432+
ret void
433+
}
434+
371435
declare i32 @bar()
372436
declare i32 @baz()
373437
declare i32 @bam()
@@ -404,3 +468,7 @@ attributes #0 = { "implicit-section-name"="nosplit" }
404468
!21 = !{!"branch_weights", i32 6000, i32 4000}
405469
!22 = !{!"branch_weights", i32 80, i32 9920}
406470
!23 = !{!"function_entry_count", i64 7}
471+
!24 = !{!"function_entry_count", i64 10000}
472+
!25 = !{!"branch_weights", i32 0, i32 7000}
473+
!26 = !{!"branch_weights", i32 1000, i32 6000}
474+
!27 = !{!"function_entry_count", i64 10000}

0 commit comments

Comments
 (0)