Skip to content

Commit 3dcb60d

Browse files
committed
[CSSPGO][llvm-profgen] Fix external address issues of perf reader (leading external LBR part)
We can have the sampling just hit into the external addresses, in that case, both the top stack frame and the latest LBR target are external addresses. For example: ``` ffffffff 0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0 ffffffff 40067e 0xffffffff/0xffffffff/P/-/-/0 0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0 ``` Before we will ignore the entire samples. However, we found there exists some internal LBRs in the remaining part of sample, the range between them is still a valid range, we will lose some valid LBRs. Those LBRs will be unwinded based on a empty(context-less) call stack. This change tries to fix it, instead of ignoring the entire sample, we only ignore the leading external addresses. Note that the first outgoing LBR is useful since there is a valid range between it's source and next LBR's target. Reviewed By: hoy, wenlei Differential Revision: https://reviews.llvm.org/D115538
1 parent 3220571 commit 3dcb60d

File tree

7 files changed

+165
-44
lines changed

7 files changed

+165
-44
lines changed
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
PERF_RECORD_MMAP2 2854748/2854748: [0x400000(0x1000) @ 0 00:1d 123291722 526021]: r-xp /home/inline-cs-noprobe.perfbin
2+
3+
; Test for an external top address, should only ignore the call stack and keep unwinding the LBR
4+
5+
; Valid LBR. The first 4006d7 will be kept for unwinding, the second will be truncated.
6+
7+
ffffffff
8+
ffffffff
9+
4006d7
10+
ffffffff
11+
4006d7
12+
ffffffff
13+
0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0
14+
15+
; Valid LBR
16+
ffffffff
17+
0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0
18+
19+
; Valid LBR
20+
ffffffff
21+
0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0
22+
23+
; Valid LBR
24+
40067e
25+
0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0
26+
27+
; Valid LBR
28+
ffffffff
29+
5541f689495641d7
30+
0xffffffff/0xffffffff/P/-/-/0 0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0
31+
32+
; Empty sample
33+
ffffffff
34+
5541f689495641d7
35+
0xffffffff/0xffffffff/P/-/-/0 0xffffffff/0xffffffff/P/-/-/0
36+
37+
; Invalid LBR
38+
ffffffff
39+
0xffffffff/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0
Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,5 @@
11
PERF_RECORD_MMAP2 2854748/2854748: [0x400000(0x1000) @ 0 00:1d 123291722 526021]: r-xp /home/inline-cs-noprobe.perfbin
22

3-
; test for an external or invalid top address, should skip the whole sample
4-
5-
ffffffff
6-
40067e
7-
5541f689495641d7
8-
0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0
9-
103
40067e
114
5541f689495641d7
125
0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/external-address.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --skip-symbolization --profile-summary-hot-count=0 --compress-recursion=0
2+
; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER
3+
4+
5+
; CHECK-UNWINDER: [main:1 @ foo]
6+
; CHECK-UNWINDER: 2
7+
; CHECK-UNWINDER: 670-6ad:4
8+
; CHECK-UNWINDER: 6bd-6c8:4
9+
; CHECK-UNWINDER: 2
10+
; CHECK-UNWINDER: 69b->670:5
11+
; CHECK-UNWINDER: 6c8->67e:1
12+
; CHECK-UNWINDER: [main:1 @ foo:3.1 @ bar]
13+
; CHECK-UNWINDER: 1
14+
; CHECK-UNWINDER: 6af-6bb:4
15+
; CHECK-UNWINDER: 0
16+
17+
; Manually created to test if remaining call stack can be correctly unwinded.
18+
; CHECK-UNWINDER: [main:1 @ foo:4 @ main:1 @ foo]
19+
; CHECK-UNWINDER: 2
20+
; CHECK-UNWINDER: 670-6ad:1
21+
; CHECK-UNWINDER: 6bd-6c8:1
22+
; CHECK-UNWINDER: 2
23+
; CHECK-UNWINDER: 69b->670:1
24+
; CHECK-UNWINDER: 6c8->67e:1
25+
; CHECK-UNWINDER: [main:1 @ foo:4 @ main:1 @ foo:3.1 @ bar]
26+
; CHECK-UNWINDER: 1
27+
; CHECK-UNWINDER: 6af-6bb:1
28+
; CHECK-UNWINDER: 0

llvm/test/tools/llvm-profgen/cs-interrupt.test

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cs-interrupt.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --skip-symbolization --profile-summary-cold-count=0
44
; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER
55
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cs-interrupt.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --profile-summary-cold-count=0
6-
>>>>>>> 02ea7084c370 ([llvm-profgen] Support LBR only perf script)
76
; RUN: FileCheck %s --input-file %t
87

98
; CHECK:[main:1 @ foo]:88:0

llvm/test/tools/llvm-profgen/inline-noprobe2.test

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,22 +73,30 @@
7373
;CHECK: 1: 5
7474
;CHECK: 2: 5
7575
;CHECK: 3: 5
76-
;CHECK: main:486:0
76+
;CHECK: main:906:0
7777
;CHECK: 0: 0
7878
;CHECK: 3: 0
7979
;CHECK: 4.1: 0
8080
;CHECK: 4.3: 0
81-
;CHECK: 5.1: 6
82-
;CHECK: 5.3: 6
83-
;CHECK: 6: 6
84-
;CHECK: 6.1: 6
85-
;CHECK: 6.3: 6
81+
;CHECK: 5.1: 11
82+
;CHECK: 5.3: 11
83+
;CHECK: 6: 11
84+
;CHECK: 6.1: 14
85+
;CHECK: 6.3: 11
8686
;CHECK: 7: 0
8787
;CHECK: 8: 0 quick_sort:1
8888
;CHECK: 9: 0
8989
;CHECK: 11: 0
9090
;CHECK: 14: 0
9191
;CHECK: 65499: 0
92+
;CHECK: quick_sort:903:25
93+
;CHECK: 1: 24
94+
;CHECK: 2: 12 partition_pivot_last:7 partition_pivot_first:5
95+
;CHECK: 3: 11 quick_sort:12
96+
;CHECK: 4: 12 quick_sort:12
97+
;CHECK: 6: 24
98+
;CHECK: 65507: 12
99+
92100

93101

94102
; original code:

llvm/tools/llvm-profgen/PerfReader.cpp

Lines changed: 54 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ void VirtualUnwinder::unwindCall(UnwindState &State) {
5959
// pro/epi tracker(Dwarf CFI) for the precise check.
6060
uint64_t Source = State.getCurrentLBRSource();
6161
auto *ParentFrame = State.getParentFrame();
62+
6263
if (ParentFrame == State.getDummyRootPtr() ||
6364
ParentFrame->Address != Source) {
6465
State.switchToFrame(Source);
@@ -121,7 +122,7 @@ void VirtualUnwinder::unwindReturn(UnwindState &State) {
121122
State.InstPtr.update(LBR.Source);
122123
}
123124

124-
void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) {
125+
void VirtualUnwinder::unwindBranch(UnwindState &State) {
125126
// TODO: Tolerate tail call for now, as we may see tail call from libraries.
126127
// This is only for intra function branches, excluding tail calls.
127128
uint64_t Source = State.getCurrentLBRSource();
@@ -219,7 +220,7 @@ void VirtualUnwinder::collectSamplesFromFrameTrie(
219220

220221
void VirtualUnwinder::recordBranchCount(const LBREntry &Branch,
221222
UnwindState &State, uint64_t Repeat) {
222-
if (Branch.IsArtificial)
223+
if (Branch.IsArtificial || Branch.Target == ExternalAddr)
223224
return;
224225

225226
if (Binary->usePseudoProbes()) {
@@ -242,21 +243,18 @@ bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) {
242243
if (!State.validateInitialState())
243244
return false;
244245

245-
// Also do not attempt linear unwind for the leaf range as it's incomplete.
246-
bool IsLeaf = true;
247-
248246
// Now process the LBR samples in parrallel with stack sample
249247
// Note that we do not reverse the LBR entry order so we can
250248
// unwind the sample stack as we walk through LBR entries.
251249
while (State.hasNextLBR()) {
252250
State.checkStateConsistency();
253251

254-
// Unwind implicit calls/returns from inlining, along the linear path,
255-
// break into smaller sub section each with its own calling context.
256-
if (!IsLeaf) {
252+
// Do not attempt linear unwind for the leaf range as it's incomplete.
253+
if (!State.IsLastLBR()) {
254+
// Unwind implicit calls/returns from inlining, along the linear path,
255+
// break into smaller sub section each with its own calling context.
257256
unwindLinear(State, Repeat);
258257
}
259-
IsLeaf = false;
260258

261259
// Save the LBR branch before it gets unwound.
262260
const LBREntry &Branch = State.getCurrentLBR();
@@ -271,9 +269,15 @@ bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) {
271269
// Unwind returns - check whether the IP is indeed at a return instruction
272270
unwindReturn(State);
273271
} else {
274-
// Unwind branches - for regular intra function branches, we only
275-
// need to record branch with context.
276-
unwindBranchWithinFrame(State);
272+
// Unwind branches
273+
// For regular intra function branches, we only need to record branch with
274+
// context. For an artificial branch cross function boundaries, we got an
275+
// issue with returning to external code. Take the two LBR enties for
276+
// example: [foo:8(RETURN), ext:1] [ext:3(CALL), bar:1] After perf reader,
277+
// we only get[foo:8(RETURN), bar:1], unwinder will be confused like foo
278+
// return to bar. Here we detect and treat this case as BRANCH instead of
279+
// RETURN which only update the source address.
280+
unwindBranch(State);
277281
}
278282
State.advanceLBR();
279283
// Record `branch` with calling context after unwinding.
@@ -432,9 +436,9 @@ void HybridPerfReader::unwindSamples() {
432436
if (Binary->useFSDiscriminator())
433437
exitWithError("FS discriminator is not supported in CS profile.");
434438
std::set<uint64_t> AllUntrackedCallsites;
439+
VirtualUnwinder Unwinder(&SampleCounters, Binary);
435440
for (const auto &Item : AggregatedSamples) {
436441
const PerfSample *Sample = Item.first.getPtr();
437-
VirtualUnwinder Unwinder(&SampleCounters, Binary);
438442
Unwinder.unwind(Sample, Item.second);
439443
auto &CurrUntrackedCallsites = Unwinder.getUntrackedCallsites();
440444
AllUntrackedCallsites.insert(CurrUntrackedCallsites.begin(),
@@ -508,26 +512,32 @@ bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt,
508512
bool IsOutgoing = SrcIsInternal && !DstIsInternal;
509513
bool IsArtificial = false;
510514

511-
// Ignore branches outside the current binary. Ignore all remaining branches
512-
// if there's no incoming branch before the external branch in reverse
513-
// order.
515+
// Ignore branches outside the current binary.
514516
if (IsExternal) {
515-
if (PrevTrDst)
516-
continue;
517-
if (!LBRStack.empty()) {
517+
if (!PrevTrDst && !LBRStack.empty()) {
518518
WithColor::warning()
519519
<< "Invalid transfer to external code in LBR record at line "
520520
<< TraceIt.getLineNumber() << ": " << TraceIt.getCurrentLine()
521521
<< "\n";
522522
}
523-
break;
523+
// Do not ignore the entire samples, the remaining LBR can still be
524+
// unwound using a context-less stack.
525+
continue;
524526
}
525527

526528
if (IsOutgoing) {
527529
if (!PrevTrDst) {
528-
// This is unpaired outgoing jump which is likely due to interrupt or
529-
// incomplete LBR trace. Ignore current and subsequent entries since
530-
// they are likely in different contexts.
530+
// This is a leading outgoing LBR, we should keep processing the LBRs.
531+
if (LBRStack.empty()) {
532+
NumLeadingOutgoingLBR++;
533+
// Record this LBR since current source and next LBR' target is still
534+
// a valid range.
535+
LBRStack.emplace_back(LBREntry(Src, ExternalAddr, false));
536+
continue;
537+
}
538+
// This is middle unpaired outgoing jump which is likely due to
539+
// interrupt or incomplete LBR trace. Ignore current and subsequent
540+
// entries since they are likely in different contexts.
531541
break;
532542
}
533543

@@ -593,9 +603,17 @@ bool PerfScriptReader::extractCallstack(TraceStream &TraceIt,
593603
}
594604
TraceIt.advance();
595605
// Currently intermixed frame from different binaries is not supported.
596-
// Ignore caller frames not from binary of interest.
597-
if (!Binary->addressIsCode(FrameAddr))
598-
break;
606+
if (!Binary->addressIsCode(FrameAddr)) {
607+
if (CallStack.empty())
608+
NumLeafExternalFrame++;
609+
// Push a special value(ExternalAddr) for the external frames so that
610+
// unwinder can still work on this with artificial Call/Return branch.
611+
// After unwinding, the context will be truncated for external frame.
612+
// Also deduplicate the consecutive external addresses.
613+
if (CallStack.empty() || CallStack.back() != ExternalAddr)
614+
CallStack.emplace_back(ExternalAddr);
615+
continue;
616+
}
599617

600618
// We need to translate return address to call address for non-leaf frames.
601619
if (!CallStack.empty()) {
@@ -613,6 +631,10 @@ bool PerfScriptReader::extractCallstack(TraceStream &TraceIt,
613631
CallStack.emplace_back(FrameAddr);
614632
}
615633

634+
// Strip out the bottom external addr.
635+
if (CallStack.size() > 1 && CallStack.back() == ExternalAddr)
636+
CallStack.pop_back();
637+
616638
// Skip other unrelated line, find the next valid LBR line
617639
// Note that even for empty call stack, we should skip the address at the
618640
// bottom, otherwise the following pass may generate a truncated callstack
@@ -885,6 +907,7 @@ uint64_t PerfScriptReader::parseAggregatedCount(TraceStream &TraceIt) {
885907
}
886908

887909
void PerfScriptReader::parseSample(TraceStream &TraceIt) {
910+
NumTotalSample++;
888911
uint64_t Count = parseAggregatedCount(TraceIt);
889912
assert(Count >= 1 && "Aggregated count should be >= 1!");
890913
parseSample(TraceIt, Count);
@@ -1131,6 +1154,11 @@ void PerfScriptReader::parsePerfTraces() {
11311154
// Parse perf traces and do aggregation.
11321155
parseAndAggregateTrace();
11331156

1157+
emitWarningSummary(NumLeafExternalFrame, NumTotalSample,
1158+
"of samples have leaf external frame in call stack.");
1159+
emitWarningSummary(NumLeadingOutgoingLBR, NumTotalSample,
1160+
"of samples have leading external LBR.");
1161+
11341162
// Generate unsymbolized profile.
11351163
warnTruncatedStack();
11361164
warnInvalidRange();

0 commit comments

Comments
 (0)