Skip to content

Commit 242cc20

Browse files
committed
Recommit "[VPlan] First step towards VPlan cost modeling. (llvm#92555)"
This reverts commit 6f538f6. Extra tests for the crashes discovered when building Chromium have been added in fb86cb7 and 3be7312. Original message: This adds a new interface to compute the cost of recipes, VPBasicBlocks, VPRegionBlocks and VPlan, initially falling back to the legacy cost model for all recipes. Follow-up patches will gradually migrate recipes to compute their own costs, step by step. It also adds a getBestPlan function to LVP (LoopVectorizationPlanner), which computes the cost of all VPlans and picks the most profitable one together with the most profitable VF. The VPlan selected by the VPlan cost model is executed, and there is an assert to catch cases where the VPlan cost model and the legacy cost model disagree. Even though I checked a number of different build configurations on AArch64 and X86, there may be some differences that have been missed. Additional discussion and context can be found in @arcbbb's llvm#67647 and in llvm#67934, which is an earlier version of the current PR. PR: llvm#92555
1 parent c07be08 commit 242cc20

File tree

8 files changed

+421
-28
lines changed

8 files changed

+421
-28
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,16 @@ class LoopVectorizationPlanner {
344344
/// A builder used to construct the current plan.
345345
VPBuilder Builder;
346346

347+
/// Computes the cost of \p Plan for vectorization factor \p VF.
348+
///
349+
/// The current implementation requires access to the
350+
/// LoopVectorizationLegality to handle inductions and reductions, which is
351+
/// why it is kept separate from the VPlan-only cost infrastructure.
352+
///
353+
/// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
354+
/// been retired.
355+
InstructionCost cost(VPlan &Plan, ElementCount VF) const;
356+
347357
public:
348358
LoopVectorizationPlanner(
349359
Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
@@ -365,6 +375,9 @@ class LoopVectorizationPlanner {
365375
/// Return the best VPlan for \p VF.
366376
VPlan &getBestPlanFor(ElementCount VF) const;
367377

378+
/// Return the most profitable plan and fix its VF to the most profitable one.
379+
VPlan &getBestPlan() const;
380+
368381
/// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
369382
/// according to the best selected \p VF and \p UF.
370383
///
@@ -443,7 +456,9 @@ class LoopVectorizationPlanner {
443456
ElementCount MinVF);
444457

445458
/// \return The most profitable vectorization factor and the cost of that VF.
446-
/// This method checks every VF in \p CandidateVFs.
459+
/// This method checks every VF in \p CandidateVFs. This is now only used to
460+
/// verify the decisions by the new VPlan-based cost-model and will be retired
461+
/// once the VPlan-based cost-model is stabilized.
447462
VectorizationFactor
448463
selectVectorizationFactor(const ElementCountSet &CandidateVFs);
449464

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 214 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
290290
cl::desc("A flag that overrides the target's max interleave factor for "
291291
"vectorized loops."));
292292

293-
static cl::opt<unsigned> ForceTargetInstructionCost(
293+
cl::opt<unsigned> ForceTargetInstructionCost(
294294
"force-target-instruction-cost", cl::init(0), cl::Hidden,
295295
cl::desc("A flag that overrides the target's expected cost for "
296296
"an instruction to a single constant value. Mostly "
@@ -412,14 +412,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
412412
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
413413
}
414414

415-
/// A helper function that returns the reciprocal of the block probability of
416-
/// predicated blocks. If we return X, we are assuming the predicated block
417-
/// will execute once for every X iterations of the loop header.
418-
///
419-
/// TODO: We should use actual block probability here, if available. Currently,
420-
/// we always assume predicated blocks have a 50% chance of executing.
421-
static unsigned getReciprocalPredBlockProb() { return 2; }
422-
423415
/// Returns "best known" trip count for the specified loop \p L as defined by
424416
/// the following procedure:
425417
/// 1) Returns exact trip count if it is known.
@@ -1621,6 +1613,16 @@ class LoopVectorizationCostModel {
16211613
/// \p VF is the vectorization factor chosen for the original loop.
16221614
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
16231615

1616+
/// Return the cost of instructions in an inloop reduction pattern, if I is
1617+
/// part of that pattern.
1618+
std::optional<InstructionCost>
1619+
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1620+
TTI::TargetCostKind CostKind) const;
1621+
1622+
/// Returns the execution time cost of an instruction for a given vector
1623+
/// width. Vector width of one means scalar.
1624+
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1625+
16241626
private:
16251627
unsigned NumPredStores = 0;
16261628

@@ -1646,21 +1648,11 @@ class LoopVectorizationCostModel {
16461648
/// of elements.
16471649
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
16481650

1649-
/// Returns the execution time cost of an instruction for a given vector
1650-
/// width. Vector width of one means scalar.
1651-
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1652-
16531651
/// The cost-computation logic from getInstructionCost which provides
16541652
/// the vector type as an output parameter.
16551653
InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
16561654
Type *&VectorTy);
16571655

1658-
/// Return the cost of instructions in an inloop reduction pattern, if I is
1659-
/// part of that pattern.
1660-
std::optional<InstructionCost>
1661-
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1662-
TTI::TargetCostKind CostKind) const;
1663-
16641656
/// Calculate vectorization cost of memory instruction \p I.
16651657
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
16661658

@@ -7297,7 +7289,10 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
72977289
if (!MaxFactors.hasVector())
72987290
return VectorizationFactor::Disabled();
72997291

7300-
// Select the optimal vectorization factor.
7292+
// Select the optimal vectorization factor according to the legacy cost-model.
7293+
// This is now only used to verify the decisions by the new VPlan-based
7294+
// cost-model and will be retired once the VPlan-based cost-model is
7295+
// stabilized.
73017296
VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
73027297
assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
73037298
if (!hasPlanWithVF(VF.Width)) {
@@ -7308,6 +7303,196 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
73087303
return VF;
73097304
}
73107305

7306+
/// Returns the cost of \p UI for vectorization factor \p VF as computed by the
/// legacy cost model: forwards to CM.getInstructionCost and returns the
/// `.first` member of its result.
InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7307+
ElementCount VF) const {
7308+
return CM.getInstructionCost(UI, VF).first;
7309+
}
7310+
7311+
/// Returns true if cost computation should be skipped for \p UI: either \p UI
/// is in SkipCostComputation, or \p IsVector is set and \p UI is in
/// CM.VecValuesToIgnore.
bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7312+
return (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7313+
SkipCostComputation.contains(UI);
7314+
}
7315+
7316+
/// Computes the cost of \p Plan for vectorization factor \p VF.
///
/// The result is the sum of two parts:
///  1. Costs pre-computed with the legacy cost model (CostCtx.getLegacyCost and
///     CM.getReductionPatternCost) for induction increments, optimizable
///     induction truncates, loop exit conditions plus the instructions feeding
///     only them, AnyOf-reduction selects and in-loop reduction patterns. Each
///     such instruction is recorded in CostCtx.SkipCostComputation.
///  2. The VPlan-based cost returned by Plan.cost(VF, CostCtx).
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7317+
ElementCount VF) const {
7318+
InstructionCost Cost = 0;
7319+
LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
7320+
VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
7321+
7322+
// Cost modeling for inductions is inaccurate in the legacy cost model
7323+
// compared to the recipes that are generated. To match here initially during
7324+
// VPlan cost model bring up directly use the induction costs from the legacy
7325+
// cost model. Note that we do this as pre-processing; the VPlan may not have
7326+
// any recipes associated with the original induction increment instruction
7327+
// and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7328+
// the cost of both induction increment instructions that are represented by
7329+
// recipes and those that are not, to avoid distinguishing between them here,
7330+
// and skip all recipes that represent induction increments (the former case)
7331+
// later on, if they exist, to avoid counting them twice. Similarly we
7332+
// pre-compute the cost of any optimized truncates.
7333+
// TODO: Switch to more accurate costing based on VPlan.
7334+
for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7335+
Instruction *IVInc = cast<Instruction>(
7336+
IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7337+
// Only cost the increment if it was not already in SkipCostComputation.
if (CostCtx.SkipCostComputation.insert(IVInc).second) {
7338+
InstructionCost InductionCost = CostCtx.getLegacyCost(IVInc, VF);
7339+
LLVM_DEBUG({
7340+
dbgs() << "Cost of " << InductionCost << " for VF " << VF
7341+
<< ":\n induction increment " << *IVInc << "\n";
7342+
IVInc->dump();
7343+
});
7344+
Cost += InductionCost;
7345+
}
7346+
// Pre-compute the cost of users of the induction that the legacy cost model
// reports as optimizable IV truncates for this VF.
for (User *U : IV->users()) {
7347+
auto *CI = cast<Instruction>(U);
7348+
if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7349+
continue;
7350+
assert(!CostCtx.SkipCostComputation.contains(CI) &&
7351+
"Same cast for multiple inductions?");
7352+
CostCtx.SkipCostComputation.insert(CI);
7353+
InstructionCost CastCost = CostCtx.getLegacyCost(CI, VF);
7354+
LLVM_DEBUG({
7355+
dbgs() << "Cost of " << CastCost << " for VF " << VF
7356+
<< ":\n induction cast " << *CI << "\n";
7357+
CI->dump();
7358+
});
7359+
Cost += CastCost;
7360+
}
7361+
}
7362+
7363+
// Compute the cost of all exiting conditions of the loop using the legacy
7364+
// cost model. This is to match the legacy behavior, which adds the cost of
7365+
// all exit conditions. Note that this over-estimates the cost, as there will
7366+
// be a single condition to control the vector loop.
7367+
SmallVector<BasicBlock *> Exiting;
7368+
CM.TheLoop->getExitingBlocks(Exiting);
7369+
SetVector<Instruction *> ExitInstrs;
7370+
// Collect all exit conditions.
7371+
for (BasicBlock *EB : Exiting) {
7372+
auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7373+
if (!Term)
7374+
continue;
7375+
if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7376+
ExitInstrs.insert(CondI);
7377+
}
7378+
}
7379+
// Compute the cost of all instructions only feeding the exit conditions.
// ExitInstrs grows while it is traversed (see the insert at the end of the
// loop body), hence the index-based loop that re-reads size() every iteration.
7380+
for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7381+
Instruction *CondI = ExitInstrs[I];
7382+
// Skip instructions outside the loop and ones whose cost was already
// pre-computed (i.e. already present in SkipCostComputation).
if (!OrigLoop->contains(CondI) ||
7383+
!CostCtx.SkipCostComputation.insert(CondI).second)
7384+
continue;
7385+
Cost += CostCtx.getLegacyCost(CondI, VF);
7386+
for (Value *Op : CondI->operands()) {
7387+
auto *OpI = dyn_cast<Instruction>(Op);
7388+
// Only follow instruction operands all of whose users are already in
// ExitInstrs.
if (!OpI || any_of(OpI->users(), [&ExitInstrs](User *U) {
7389+
return !ExitInstrs.contains(cast<Instruction>(U));
7390+
}))
7391+
continue;
7392+
ExitInstrs.insert(OpI);
7393+
}
7394+
}
7395+
7396+
// The legacy cost model has special logic to compute the cost of in-loop
7397+
// reductions, which may be smaller than the sum of all instructions involved
7398+
// in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7399+
// which the legacy cost model uses to assign cost. Pre-compute their costs
7400+
// for now.
7401+
// TODO: Switch to costing based on VPlan once the logic has been ported.
7402+
for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7403+
if (!CM.isInLoopReduction(RedPhi) &&
7404+
!RecurrenceDescriptor::isAnyOfRecurrenceKind(
7405+
RdxDesc.getRecurrenceKind()))
7406+
continue;
7407+
7408+
// AnyOf reduction codegen may remove the select. To match the legacy cost
7409+
// model, pre-compute the cost for AnyOf reductions here.
7410+
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7411+
RdxDesc.getRecurrenceKind())) {
7412+
// NOTE(review): the find_if result is dereferenced without an end() check,
// so this relies on the reduction phi having a SelectInst user.
auto *Select = cast<SelectInst>(*find_if(
7413+
RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
7414+
assert(!CostCtx.SkipCostComputation.contains(Select) &&
7415+
"reduction op visited multiple times");
7416+
CostCtx.SkipCostComputation.insert(Select);
7417+
auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
7418+
LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7419+
<< ":\n any-of reduction " << *Select << "\n");
7420+
Cost += ReductionCost;
7421+
continue;
7422+
}
7423+
7424+
const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7425+
SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7426+
ChainOps.end());
7427+
// Also include the operands of instructions in the chain, as the cost-model
7428+
// may mark extends as free.
7429+
for (auto *ChainOp : ChainOps) {
7430+
for (Value *Op : ChainOp->operands()) {
7431+
if (auto *I = dyn_cast<Instruction>(Op))
7432+
ChainOpsAndOperands.insert(I);
7433+
}
7434+
}
7435+
7436+
// Pre-compute the cost for I, if it has a reduction pattern cost.
7437+
for (Instruction *I : ChainOpsAndOperands) {
7438+
auto ReductionCost = CM.getReductionPatternCost(
7439+
I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7440+
if (!ReductionCost)
7441+
continue;
7442+
7443+
assert(!CostCtx.SkipCostComputation.contains(I) &&
7444+
"reduction op visited multiple times");
7445+
CostCtx.SkipCostComputation.insert(I);
7446+
LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7447+
<< ":\n in-loop reduction " << *I << "\n");
7448+
Cost += *ReductionCost;
7449+
}
7450+
}
7451+
7452+
// Now compute and add the VPlan-based cost.
7453+
Cost += Plan.cost(VF, CostCtx);
7454+
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7455+
return Cost;
7456+
}
7457+
7458+
/// Returns the most profitable VPlan and restricts it (via setVF) to the most
/// profitable VF.
///
/// If there is exactly one plan holding exactly one VF, that plan is returned
/// as is. Otherwise the cost of the scalar VF serves as the baseline and every
/// non-scalar VF of every plan is compared against the current best using
/// isMoreProfitable. When the loop hints force vectorization, the baseline cost
/// is replaced by InstructionCost::getMax().
///
/// Note: VPlans[0] is accessed unconditionally, so VPlans must be non-empty.
VPlan &LoopVectorizationPlanner::getBestPlan() const {
7459+
// If there is a single VPlan with a single VF, return it directly.
7460+
VPlan &FirstPlan = *VPlans[0];
7461+
if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7462+
return FirstPlan;
7463+
7464+
VPlan *BestPlan = &FirstPlan;
7465+
ElementCount ScalarVF = ElementCount::getFixed(1);
7466+
assert(hasPlanWithVF(ScalarVF) &&
7467+
"More than a single plan/VF w/o any plan having scalar VF");
7468+
7469+
// The scalar cost is both the initial best cost and the ScalarCost recorded in
// every candidate VectorizationFactor below.
InstructionCost ScalarCost = cost(getBestPlanFor(ScalarVF), ScalarVF);
7470+
VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
7471+
7472+
bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7473+
if (ForceVectorization) {
7474+
// Ignore scalar width, because the user explicitly wants vectorization.
7475+
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
7476+
// evaluation.
7477+
BestFactor.Cost = InstructionCost::getMax();
7478+
}
7479+
7480+
// Cost every non-scalar VF of every plan and keep the most profitable one.
for (auto &P : VPlans) {
7481+
for (ElementCount VF : P->vectorFactors()) {
7482+
if (VF.isScalar())
7483+
continue;
7484+
InstructionCost Cost = cost(*P, VF);
7485+
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7486+
if (isMoreProfitable(CurrentFactor, BestFactor)) {
7487+
BestFactor = CurrentFactor;
7488+
BestPlan = &*P;
7489+
}
7490+
}
7491+
}
7492+
// Fix the selected plan to the winning VF before handing it out.
BestPlan->setVF(BestFactor.Width);
7493+
return *BestPlan;
7494+
}
7495+
73117496
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
73127497
assert(count_if(VPlans,
73137498
[VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10166,8 +10351,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1016610351
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
1016710352
PSI, Checks);
1016810353

10169-
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10170-
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10354+
VPlan &BestPlan = LVP.getBestPlan();
10355+
assert(size(BestPlan.vectorFactors()) == 1 &&
10356+
"Plan should have a single VF");
10357+
ElementCount Width = *BestPlan.vectorFactors().begin();
10358+
LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
10359+
<< "\n");
10360+
assert(VF.Width == Width &&
10361+
"VPlan cost model and legacy cost model disagreed");
10362+
LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
1017110363
++LoopsVectorized;
1017210364

1017310365
// Add metadata to disable runtime unrolling a scalar loop when there

0 commit comments

Comments
 (0)