Skip to content

Commit 72f339d

Browse files
authored
[LoopVectorize] Use predicated version of getSmallConstantMaxTripCount (#109928)
There are a number of places where we call getSmallConstantMaxTripCount without passing a vector of predicates:

- getSmallBestKnownTC
- isIndvarOverflowCheckKnownFalse
- computeMaxVF
- isMoreProfitable

I've changed all of these to now pass in a predicate vector so that we get the benefit of making better vectorisation choices when we know the max trip count for loops that require SCEV predicate checks.

I've tried to add tests that cover all the cases affected by these changes.
1 parent b222f31 commit 72f339d

File tree

5 files changed

+442
-22
lines changed

5 files changed

+442
-22
lines changed

llvm/include/llvm/Analysis/ScalarEvolution.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2376,6 +2376,10 @@ class PredicatedScalarEvolution {
23762376
/// Get the (predicated) symbolic max backedge count for the analyzed loop.
23772377
const SCEV *getSymbolicMaxBackedgeTakenCount();
23782378

2379+
/// Returns the upper bound of the loop trip count as a normal unsigned
2380+
/// value, or 0 if the trip count is unknown.
2381+
unsigned getSmallConstantMaxTripCount();
2382+
23792383
/// Adds a new predicate.
23802384
void addPredicate(const SCEVPredicate &Pred);
23812385

@@ -2447,6 +2451,9 @@ class PredicatedScalarEvolution {
24472451

24482452
/// The symbolic backedge taken count.
24492453
const SCEV *SymbolicMaxBackedgeCount = nullptr;
2454+
2455+
/// The constant max trip count for the loop.
2456+
std::optional<unsigned> SmallConstantMaxTripCount;
24502457
};
24512458

24522459
template <> struct DenseMapInfo<ScalarEvolution::FoldID> {

llvm/lib/Analysis/ScalarEvolution.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15050,6 +15050,16 @@ const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() {
1505015050
return SymbolicMaxBackedgeCount;
1505115051
}
1505215052

15053+
unsigned PredicatedScalarEvolution::getSmallConstantMaxTripCount() {
15054+
if (!SmallConstantMaxTripCount) {
15055+
SmallVector<const SCEVPredicate *, 4> Preds;
15056+
SmallConstantMaxTripCount = SE.getSmallConstantMaxTripCount(&L, &Preds);
15057+
for (const auto *P : Preds)
15058+
addPredicate(*P);
15059+
}
15060+
return *SmallConstantMaxTripCount;
15061+
}
15062+
1505315063
void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) {
1505415064
if (Preds->implies(&Pred))
1505515065
return;

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 26 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -411,10 +411,10 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
411411
/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
412412
/// 4) Returns std::nullopt if all of the above failed.
413413
static std::optional<unsigned>
414-
getSmallBestKnownTC(ScalarEvolution &SE, Loop *L,
414+
getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
415415
bool CanUseConstantMax = true) {
416416
// Check if exact trip count is known.
417-
if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
417+
if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
418418
return ExpectedTC;
419419

420420
// Check if there is an expected trip count available from profile data.
@@ -426,7 +426,7 @@ getSmallBestKnownTC(ScalarEvolution &SE, Loop *L,
426426
return std::nullopt;
427427

428428
// Check if upper bound estimate is known.
429-
if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
429+
if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
430430
return ExpectedTC;
431431

432432
return std::nullopt;
@@ -1789,12 +1789,15 @@ class GeneratedRTChecks {
17891789

17901790
Loop *OuterLoop = nullptr;
17911791

1792+
PredicatedScalarEvolution &PSE;
1793+
17921794
public:
1793-
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1794-
TargetTransformInfo *TTI, const DataLayout &DL,
1795-
bool AddBranchWeights)
1796-
: DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1797-
MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1795+
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1796+
LoopInfo *LI, TargetTransformInfo *TTI,
1797+
const DataLayout &DL, bool AddBranchWeights)
1798+
: DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1799+
MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1800+
AddBranchWeights(AddBranchWeights), PSE(PSE) {}
17981801

17991802
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
18001803
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1941,7 +1944,7 @@ class GeneratedRTChecks {
19411944

19421945
// Get the best known TC estimate.
19431946
if (auto EstimatedTC = getSmallBestKnownTC(
1944-
*SE, OuterLoop, /* CanUseConstantMax = */ false))
1947+
PSE, OuterLoop, /* CanUseConstantMax = */ false))
19451948
BestTripCount = *EstimatedTC;
19461949

19471950
BestTripCount = std::max(BestTripCount, 1U);
@@ -2272,8 +2275,7 @@ static bool isIndvarOverflowCheckKnownFalse(
22722275
// We know the runtime overflow check is known false iff the (max) trip-count
22732276
// is known and (max) trip-count + (VF * UF) does not overflow in the type of
22742277
// the vector loop induction variable.
2275-
if (unsigned TC =
2276-
Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2278+
if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
22772279
uint64_t MaxVF = VF.getKnownMinValue();
22782280
if (VF.isScalable()) {
22792281
std::optional<unsigned> MaxVScale =
@@ -3962,8 +3964,10 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
39623964
}
39633965

39643966
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
3965-
unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
3967+
unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
39663968
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3969+
if (TC != MaxTC)
3970+
LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
39673971
if (TC == 1) {
39683972
reportVectorizationFailure("Single iteration (non) loop",
39693973
"loop trip count is one, irrelevant for vectorization",
@@ -4257,7 +4261,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(
42574261
InstructionCost CostA = A.Cost;
42584262
InstructionCost CostB = B.Cost;
42594263

4260-
unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4264+
unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
42614265

42624266
// Improve estimate for the vector width if it is scalable.
42634267
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
@@ -4852,7 +4856,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
48524856
if (!Legal->isSafeForAnyVectorWidth())
48534857
return 1;
48544858

4855-
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
4859+
auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
48564860
const bool HasReductions = !Legal->getReductionVars().empty();
48574861

48584862
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -9618,8 +9622,8 @@ static bool processLoopInVPlanNativePath(
96189622
{
96199623
bool AddBranchWeights =
96209624
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9621-
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9622-
F->getDataLayout(), AddBranchWeights);
9625+
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
9626+
AddBranchWeights);
96239627
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
96249628
VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
96259629
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -9683,7 +9687,7 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
96839687
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
96849688
VectorizationFactor &VF,
96859689
std::optional<unsigned> VScale, Loop *L,
9686-
ScalarEvolution &SE,
9690+
PredicatedScalarEvolution &PSE,
96879691
ScalarEpilogueLowering SEL) {
96889692
InstructionCost CheckCost = Checks.getCost();
96899693
if (!CheckCost.isValid())
@@ -9768,7 +9772,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
97689772

97699773
// Skip vectorization if the expected trip count is less than the minimum
97709774
// required trip count.
9771-
if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9775+
if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
97729776
if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
97739777
VF.MinProfitableTripCount)) {
97749778
LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
@@ -9875,7 +9879,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
98759879

98769880
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
98779881
// count by optimizing for size, to minimize overheads.
9878-
auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9882+
auto ExpectedTC = getSmallBestKnownTC(PSE, L);
98799883
if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
98809884
LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
98819885
<< "This loop is worth vectorizing only if no scalar "
@@ -9973,8 +9977,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
99739977

99749978
bool AddBranchWeights =
99759979
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9976-
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9977-
F->getDataLayout(), AddBranchWeights);
9980+
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
9981+
AddBranchWeights);
99789982
if (LVP.hasPlanWithVF(VF.Width)) {
99799983
// Select the interleave count.
99809984
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
@@ -9990,7 +9994,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
99909994
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
99919995
if (!ForceVectorization &&
99929996
!areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
9993-
*PSE.getSE(), SEL)) {
9997+
PSE, SEL)) {
99949998
ORE->emit([&]() {
99959999
return OptimizationRemarkAnalysisAliasing(
999610000
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),

0 commit comments

Comments
 (0)