Skip to content

Commit 2316a10

Browse files
author
spupyrev
committed
[BOLT] stale profile matching [part 2 out of 2]
This is the first "serious" version of stale profile matching in BOLT. This diff extends the hash computation for basic blocks so that we can apply fuzzy hash-based matching. The idea is to compute several "versions" of a hash value for a basic block. A loose version of the hash (computed by ignoring instruction operands) allows matching blocks in functions whose content has changed, while stricter hash values (considering instruction opcodes with operands, and even hashes of the block's successors/predecessors) allow resolving collisions. In order to save space and build time, the individual hash components are blended into a single uint64_t. There are likely numerous ways of improving the hash computation, but already this simple variant provides significant perf benefits.

**Perf testing** on the clang binary: collecting data on clang-10 and using it to optimize clang-11 (with ~1 year of commits in between). Next, we compare
- //stale_clang// (clang-11 optimized with profile collected on clang-10 with **infer-stale-profile=0**)
- //opt_clang// (clang-11 optimized with profile collected on clang-11)
- //infer_clang// (clang-11 optimized with profile collected on clang-10 with **infer-stale-profile=1**)

`LTO-only` mode:
//stale_clang// vs //opt_clang//: task-clock [delta(%): 9.4252 ± 1.6582, p-value: 0.000002] (That is, there is a ~9.5% perf regression)
//infer_clang// vs //opt_clang//: task-clock [delta(%): 2.1834 ± 1.8158, p-value: 0.040702] (That is, the regression is reduced to ~2%)
Related BOLT logs:
```
BOLT-INFO: identified 2114 (18.61%) stale functions responsible for 30.96% samples
BOLT-INFO: inferred profile for 2101 (18.52% of all profiled) functions responsible for 30.95% samples
```

`LTO+AutoFDO` mode:
//stale_clang// vs //opt_clang//: task-clock [delta(%): 19.1293 ± 1.4131, p-value: 0.000002]
//infer_clang// vs //opt_clang//: task-clock [delta(%): 7.4364 ± 1.3343, p-value: 0.000002]
Related BOLT logs:
```
BOLT-INFO: identified 5452 (50.27%) stale functions responsible for 85.34% samples
BOLT-INFO: inferred profile for 5442 (50.23% of all profiled) functions responsible for 85.33% samples
```

Reviewed By: Amir

Differential Revision: https://reviews.llvm.org/D146661
1 parent 100c756 commit 2316a10

File tree

3 files changed

+197
-17
lines changed

3 files changed

+197
-17
lines changed

bolt/lib/Core/BinaryFunction.cpp

-8
Original file line numberDiff line numberDiff line change
@@ -3611,14 +3611,6 @@ size_t BinaryFunction::computeHash(bool UseDFS,
36113611
return Hash = std::hash<std::string>{}(HashString);
36123612
}
36133613

3614-
// Pre-change implementation, shown as removed lines in this diff.
// For every basic block of the function, builds a string via hashBlock() and
// stores std::hash of that string on the block with setHash().
void BinaryFunction::computeBlockHashes() const {
3615-
for (const BinaryBasicBlock *BB : BasicBlocks) {
3616-
std::string Hash =
3617-
// The operand callback returns an empty string for every operand, so
// operands presumably contribute nothing to the hashed string (confirm
// against hashBlock, which is not visible here).
hashBlock(BC, *BB, [](const MCOperand &Op) { return std::string(); });
3618-
BB->setHash(std::hash<std::string>{}(Hash));
3619-
}
3620-
}
3621-
36223614
void BinaryFunction::insertBasicBlocks(
36233615
BinaryBasicBlock *Start,
36243616
std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,

bolt/lib/Profile/StaleProfileMatching.cpp

+194-6
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333

3434
#include <queue>
3535

36+
#undef DEBUG_TYPE
37+
#define DEBUG_TYPE "bolt-prof"
38+
3639
using namespace llvm;
3740

3841
namespace opts {
@@ -133,6 +136,176 @@ cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
133136
namespace llvm {
134137
namespace bolt {
135138

139+
/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (uint16_t each).
///
/// Layout of the blended 64-bit value, from least to most significant bits:
///   bits  0..15  Offset
///   bits 16..31  OpcodeHash
///   bits 32..47  InstrHash
///   bits 48..63  NeighborHash
struct BlendedBlockHash {
private:
  /// Width in bits of every component packed into the blended value.
  static constexpr unsigned ComponentBits = 16;
  /// Mask selecting the lowest component of a blended value.
  static constexpr uint64_t ComponentMask = 0xffff;

  /// Pack four 16-bit components into one 64-bit value; \p Hash1 occupies the
  /// least significant bits and \p Hash4 the most significant ones.
  static uint64_t combineHashes(uint16_t Hash1, uint16_t Hash2, uint16_t Hash3,
                                uint16_t Hash4) {
    uint64_t Hash = Hash4;
    Hash = (Hash << ComponentBits) | Hash3;
    Hash = (Hash << ComponentBits) | Hash2;
    Hash = (Hash << ComponentBits) | Hash1;
    return Hash;
  }

  /// Inverse of combineHashes(): split \p Hash into its four 16-bit
  /// components, \p Hash1 being taken from the least significant bits.
  static void parseHashes(uint64_t Hash, uint16_t &Hash1, uint16_t &Hash2,
                          uint16_t &Hash3, uint16_t &Hash4) {
    Hash1 = Hash & ComponentMask;
    Hash >>= ComponentBits;

    Hash2 = Hash & ComponentMask;
    Hash >>= ComponentBits;

    Hash3 = Hash & ComponentMask;
    Hash >>= ComponentBits;

    // Only the top component is left at this point; no further shift needed.
    Hash4 = Hash & ComponentMask;
  }

public:
  /// Create a hash with all components set to zero.
  explicit BlendedBlockHash() = default;

  /// Split a previously blended value back into its components.
  explicit BlendedBlockHash(uint64_t CombinedHash) {
    parseHashes(CombinedHash, Offset, OpcodeHash, InstrHash, NeighborHash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    return combineHashes(Offset, OpcodeHash, InstrHash, NeighborHash);
  }

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
  ///
  /// Both hashes must have the same OpcodeHash (asserted). The result orders
  /// candidates lexicographically: a NeighborHash mismatch (bit 32) outweighs
  /// an InstrHash mismatch (bit 16), which outweighs any difference between
  /// the offsets (low 16 bits).
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash
    Dist += NeighborHash == BBH.NeighborHash ? 0 : 1;
    Dist <<= ComponentBits;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= ComponentBits;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// Hash of the (loose) basic block together with (loose) hashes of its
  /// successors and predecessors.
  uint16_t NeighborHash{0};
};
218+
219+
/// The object is used to identify and match basic blocks in a BinaryFunction
220+
/// given their hashes computed on a binary built from several revisions behind
221+
/// release.
222+
class StaleMatcher {
223+
public:
224+
/// Initialize stale matcher.
225+
void init(const std::vector<FlowBlock *> &Blocks,
226+
const std::vector<BlendedBlockHash> &Hashes) {
227+
assert(Blocks.size() == Hashes.size() &&
228+
"incorrect matcher initialization");
229+
for (size_t I = 0; I < Blocks.size(); I++) {
230+
FlowBlock *Block = Blocks[I];
231+
uint16_t OpHash = Hashes[I].OpcodeHash;
232+
OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
233+
}
234+
}
235+
236+
/// Find the most similar block for a given hash.
237+
const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const {
238+
auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
239+
if (BlockIt == OpHashToBlocks.end()) {
240+
return nullptr;
241+
}
242+
FlowBlock *BestBlock = nullptr;
243+
uint64_t BestDist = std::numeric_limits<uint64_t>::max();
244+
for (auto It : BlockIt->second) {
245+
FlowBlock *Block = It.second;
246+
BlendedBlockHash Hash = It.first;
247+
uint64_t Dist = Hash.distance(BlendedHash);
248+
if (BestBlock == nullptr || Dist < BestDist) {
249+
BestDist = Dist;
250+
BestBlock = Block;
251+
}
252+
}
253+
return BestBlock;
254+
}
255+
256+
private:
257+
using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
258+
std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
259+
};
260+
261+
void BinaryFunction::computeBlockHashes() const {
262+
if (size() == 0)
263+
return;
264+
265+
assert(hasCFG() && "the function is expected to have CFG");
266+
267+
std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
268+
std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
269+
// Initialize hash components
270+
for (size_t I = 0; I < BasicBlocks.size(); I++) {
271+
const BinaryBasicBlock *BB = BasicBlocks[I];
272+
assert(BB->getIndex() == I && "incorrect block index");
273+
BlendedHashes[I].Offset = BB->getOffset();
274+
// Hashing complete instructions
275+
std::string InstrHashStr = hashBlock(
276+
BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
277+
uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
278+
BlendedHashes[I].InstrHash = hash_64_to_16(InstrHash);
279+
// Hashing opcodes
280+
std::string OpcodeHashStr =
281+
hashBlock(BC, *BB, [](const MCOperand &Op) { return std::string(); });
282+
OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
283+
BlendedHashes[I].OpcodeHash = hash_64_to_16(OpcodeHashes[I]);
284+
}
285+
286+
// Initialize neighbor hash
287+
for (size_t I = 0; I < BasicBlocks.size(); I++) {
288+
const BinaryBasicBlock *BB = BasicBlocks[I];
289+
uint64_t Hash = OpcodeHashes[I];
290+
// Append hashes of successors
291+
for (BinaryBasicBlock *SuccBB : BB->successors()) {
292+
uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
293+
Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
294+
}
295+
// Append hashes of predecessors
296+
for (BinaryBasicBlock *PredBB : BB->predecessors()) {
297+
uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
298+
Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
299+
}
300+
BlendedHashes[I].NeighborHash = hash_64_to_16(Hash);
301+
}
302+
303+
// Assign hashes
304+
for (size_t I = 0; I < BasicBlocks.size(); I++) {
305+
const BinaryBasicBlock *BB = BasicBlocks[I];
306+
BB->setHash(BlendedHashes[I].combine());
307+
}
308+
}
136309
/// Create a wrapper flow function to use with the profile inference algorithm,
137310
/// and initialize its jumps and metadata.
138311
FlowFunction
@@ -224,23 +397,38 @@ void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder,
224397
const yaml::bolt::BinaryFunctionProfile &YamlBF,
225398
FlowFunction &Func) {
226399
assert(Func.Blocks.size() == BlockOrder.size() + 1);
227-
// Initialize stale matcher
228-
DenseMap<uint64_t, std::vector<FlowBlock *>> HashToBlocks;
400+
401+
std::vector<FlowBlock *> Blocks;
402+
std::vector<BlendedBlockHash> BlendedHashes;
229403
for (uint64_t I = 0; I < BlockOrder.size(); I++) {
230404
const BinaryBasicBlock *BB = BlockOrder[I];
231405
assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
232-
HashToBlocks[BB->getHash()].push_back(&Func.Blocks[I + 1]);
406+
Blocks.push_back(&Func.Blocks[I + 1]);
407+
BlendedBlockHash BlendedHash(BB->getHash());
408+
BlendedHashes.push_back(BlendedHash);
409+
LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
410+
<< Twine::utohexstr(BB->getHash()) << "\n");
233411
}
412+
StaleMatcher Matcher;
413+
Matcher.init(Blocks, BlendedHashes);
234414

235415
// Index in yaml profile => corresponding (matched) block
236416
DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
237417
// Match blocks from the profile to the blocks in CFG
238418
for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
239419
assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
240-
auto It = HashToBlocks.find(YamlBB.Hash);
241-
if (It != HashToBlocks.end()) {
242-
const FlowBlock *MatchedBlock = It->second.front();
420+
BlendedBlockHash BlendedHash(YamlBB.Hash);
421+
const FlowBlock *MatchedBlock = Matcher.matchBlock(BlendedHash);
422+
if (MatchedBlock != nullptr) {
243423
MatchedBlocks[YamlBB.Index] = MatchedBlock;
424+
LLVM_DEBUG(dbgs() << "Matched yaml block with bid = " << YamlBB.Index
425+
<< " and hash = " << Twine::utohexstr(YamlBB.Hash)
426+
<< " to BB with index = " << MatchedBlock->Index - 1
427+
<< "\n");
428+
} else {
429+
LLVM_DEBUG(
430+
dbgs() << "Couldn't match yaml block with bid = " << YamlBB.Index
431+
<< " and hash = " << Twine::utohexstr(YamlBB.Hash) << "\n");
244432
}
245433
}
246434

bolt/test/X86/Inputs/blarge_profile_stale.yaml

+3-3
Original file line numberDiff line numberDiff line change
@@ -37,15 +37,15 @@ functions:
3737
blocks:
3838
- bid: 0
3939
insns: 4
40-
hash: 0xE3FEB842A6548CCF
40+
hash: 0xb1e5b76571270000
4141
exec: 20
4242
succ: [ { bid: 1, cnt: 0 } ]
4343
- bid: 1
4444
insns: 9
45-
hash: 0x85948FF2924613B7
45+
hash: 0x587e93788b970010
4646
succ: [ { bid: 3, cnt: 320, mis: 171 }, { bid: 2, cnt: 0 } ]
4747
- bid: 3
4848
insns: 2
49-
hash: 0x41D8DB2D2B01F411
49+
hash: 0x20e605d745e50039
5050
succ: [ { bid: 1, cnt: 300, mis: 33 }, { bid: 4, cnt: 20 } ]
5151
...

0 commit comments

Comments
 (0)