@@ -498,7 +498,7 @@ class InnerLoopVectorizer {
498
498
virtual std::pair<BasicBlock *, Value *>
499
499
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
500
500
501
- /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
501
+ /// Fix the vectorized code, taking care of header phis and more.
502
502
void fixVectorizedLoop(VPTransformState &State);
503
503
504
504
// Return true if any runtime check is added.
@@ -2713,7 +2713,8 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
2713
2713
| |
2714
2714
(opt) v <-- edge from middle to exit iff epilogue is not required.
2715
2715
| [ ] \
2716
- | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
2716
+ | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
2717
+ | | wrapped in VPIRBasicBlock).
2717
2718
\ |
2718
2719
\ v
2719
2720
>[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
@@ -2956,7 +2957,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2956
2957
// and there is nothing to fix from vector loop; phis should have incoming
2957
2958
// from scalar loop only.
2958
2959
} else {
2959
- // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
2960
+ // TODO: Check in VPlan to see if IV users need fixing instead of checking
2960
2961
// the cost model.
2961
2962
2962
2963
// If we inserted an edge from the middle block to the unique exit block,
@@ -2970,10 +2971,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2970
2971
IVEndValues[Entry.first], LoopMiddleBlock, State);
2971
2972
}
2972
2973
2973
- // Fix live-out phis not already fixed earlier.
2974
- for (const auto &KV : Plan.getLiveOuts())
2975
- KV.second->fixPhi(Plan, State);
2976
-
2977
2974
for (Instruction *PI : PredicatedInstructions)
2978
2975
sinkScalarOperands(&*PI);
2979
2976
@@ -8790,6 +8787,41 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8790
8787
{CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8791
8788
}
8792
8789
8790
+ /// Create resume phis in the scalar preheader for first-order recurrences and
8791
+ /// reductions and update the VPIRInstructions wrapping the original phis in the
8792
+ /// scalar header.
8793
+ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
8794
+ auto *ScalarPH = Plan.getScalarPreheader();
8795
+ auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
8796
+ VPBuilder ScalarPHBuilder(ScalarPH);
8797
+ VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8798
+ VPValue *OneVPV = Plan.getOrAddLiveIn(
8799
+ ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8800
+ for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
8801
+ auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
8802
+ auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
8803
+ if (!ScalarPhiI)
8804
+ break;
8805
+ auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
8806
+ if (!isa<VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe>(VectorPhiR))
8807
+ continue;
8808
+ // The backedge value provides the value to resume coming out of a loop,
8809
+ // which for FORs is a vector whose last element needs to be extracted. The
8810
+ // start value provides the value if the loop is bypassed.
8811
+ bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
8812
+ auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
8813
+ if (IsFOR)
8814
+ ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
8815
+ VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
8816
+ "vector.recur.extract");
8817
+ StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
8818
+ auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
8819
+ VPInstruction::ResumePhi,
8820
+ {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
8821
+ ScalarPhiIRI->addOperand(ResumePhiR);
8822
+ }
8823
+ }
8824
+
8793
8825
// Collect VPIRInstructions for phis in the original exit block that are modeled
8794
8826
// in VPlan and add the exiting VPValue as operand. Some exiting values are not
8795
8827
// modeled explicitly yet and won't be included. Those are un-truncated
@@ -8819,8 +8851,7 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
8819
8851
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
8820
8852
// Exit values for inductions are computed and updated outside of VPlan and
8821
8853
// independent of induction recipes.
8822
- // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8823
- // live-outs.
8854
+ // TODO: Compute induction exit values in VPlan.
8824
8855
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8825
8856
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8826
8857
isa<VPWidenPointerInductionRecipe>(V) ||
@@ -8853,7 +8884,8 @@ addUsersInExitBlock(VPlan &Plan,
8853
8884
// modeling the corresponding LCSSA phis.
8854
8885
for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
8855
8886
VPValue *V = ExitIRI->getOperand(0);
8856
- // Pass live-in values used by exit phis directly through to the live-out.
8887
+ // Pass live-in values used by exit phis directly through to their users in
8888
+ // the exit block.
8857
8889
if (V->isLiveIn())
8858
8890
continue;
8859
8891
@@ -8865,39 +8897,17 @@ addUsersInExitBlock(VPlan &Plan,
8865
8897
}
8866
8898
}
8867
8899
8868
- /// Handle live-outs for first order reductions, both in the scalar preheader
8869
- /// and the original exit block:
8870
- /// 1. Feed a resume value for every FOR from the vector loop to the scalar
8871
- /// loop, if middle block branches to scalar preheader, by introducing
8872
- /// ExtractFromEnd and ResumePhi recipes in each, respectively, and a
8873
- /// VPLiveOut which uses the latter and corresponds to the scalar header.
8874
- /// 2. Feed the penultimate value of recurrences to their LCSSA phi users in
8875
- /// the original exit block using a VPLiveOut.
8876
- static void addLiveOutsForFirstOrderRecurrences(
8900
+ /// Handle users of first-order recurrences in the original
8901
+ /// exit block. The penultimate value of recurrences is fed to their LCSSA phi
8902
+ /// users in the original exit block using the VPIRInstruction wrapping the
8903
+ /// LCSSA phi.
8904
+ static void addExitUsersForFirstOrderRecurrences(
8877
8905
VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
8878
8906
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8879
-
8880
- // Start by finding out if middle block branches to scalar preheader, which is
8881
- // not a VPIRBasicBlock, unlike Exit block - the other possible successor of
8882
- // middle block.
8883
- // TODO: Should be replaced by
8884
- // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
8885
- // scalar region is modeled as well.
8907
+ auto *ScalarPHVPBB = Plan.getScalarPreheader();
8886
8908
auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
8887
- VPBasicBlock *ScalarPHVPBB = nullptr;
8888
- if (MiddleVPBB->getNumSuccessors() == 2) {
8889
- // Order is strict: first is the exit block, second is the scalar preheader.
8890
- ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
8891
- } else if (ExitUsersToFix.empty()) {
8892
- ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
8893
- } else {
8894
- llvm_unreachable("unsupported CFG in VPlan");
8895
- }
8896
-
8897
8909
VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8898
8910
VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8899
- VPValue *OneVPV = Plan.getOrAddLiveIn(
8900
- ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8901
8911
VPValue *TwoVPV = Plan.getOrAddLiveIn(
8902
8912
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
8903
8913
@@ -8973,26 +8983,16 @@ static void addLiveOutsForFirstOrderRecurrences(
8973
8983
// lo = lcssa.phi [s1, scalar.body],
8974
8984
// [vector.recur.extract.for.phi, middle.block]
8975
8985
//
8976
- // Extract the resume value and create a new VPLiveOut for it.
8977
- auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
8978
- {FOR->getBackedgeValue(), OneVPV},
8979
- {}, "vector.recur.extract");
8980
- auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
8981
- VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
8982
- "scalar.recur.init");
8983
- auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr());
8984
- Plan.addLiveOut(FORPhi, ResumePhiRecipe);
8985
-
8986
8986
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
8987
8987
// Extract the penultimate value of the recurrence and use it as operand for
8988
8988
// the VPIRInstruction modeling the phi.
8989
8989
for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
8990
8990
if (ExitIRI->getOperand(0) != FOR)
8991
8991
continue;
8992
- VPValue *Ext = MiddleBuilder.createNaryOp(
8992
+ VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
8993
8993
VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
8994
8994
"vector.recur.extract.for.phi");
8995
- ExitIRI->setOperand(0, Ext );
8995
+ ExitIRI->setOperand(0, PenultimateElement );
8996
8996
ExitUsersToFix.remove(ExitIRI);
8997
8997
}
8998
8998
}
@@ -9166,11 +9166,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9166
9166
"VPBasicBlock");
9167
9167
RecipeBuilder.fixHeaderPhis();
9168
9168
9169
+ addScalarResumePhis(RecipeBuilder, *Plan);
9169
9170
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
9170
9171
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
9171
- addLiveOutsForFirstOrderRecurrences (*Plan, ExitUsersToFix);
9172
+ addExitUsersForFirstOrderRecurrences (*Plan, ExitUsersToFix);
9172
9173
addUsersInExitBlock(*Plan, ExitUsersToFix);
9173
-
9174
9174
// ---------------------------------------------------------------------------
9175
9175
// Transform initial VPlan: Apply previously taken decisions, in order, to
9176
9176
// bring the VPlan to its final state.
@@ -9192,9 +9192,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9192
9192
// Replace VPValues for known constant strides guaranteed by predicate scalar
9193
9193
// evolution.
9194
9194
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9195
- auto *R = dyn_cast<VPRecipeBase>(&U);
9196
- if (!R)
9197
- return false;
9195
+ auto *R = cast<VPRecipeBase>(&U);
9198
9196
return R->getParent()->getParent() ||
9199
9197
R->getParent() ==
9200
9198
Plan->getVectorLoopRegion()->getSinglePredecessor();
@@ -9291,7 +9289,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9291
9289
// instructions leading from the loop exit instr to the phi need to be converted
9292
9290
// to reductions, with one operand being vector and the other being the scalar
9293
9291
// reduction chain. For other reductions, a select is introduced between the phi
9294
- // and live-out recipes when folding the tail.
9292
+ // and users outside the vector region when folding the tail.
9295
9293
//
9296
9294
// A ComputeReductionResult recipe is added to the middle block, also for
9297
9295
// in-loop reductions which compute their result in-loop, because generating
@@ -9325,8 +9323,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
9325
9323
for (VPUser *U : Cur->users()) {
9326
9324
auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9327
9325
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9328
- assert(UserRecipe->getParent() == MiddleVPBB &&
9329
- "U must be either in the loop region or the middle block.");
9326
+ assert((UserRecipe->getParent() == MiddleVPBB ||
9327
+ UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9328
+ "U must be either in the loop region, the middle block or the "
9329
+ "scalar preheader.");
9330
9330
continue;
9331
9331
}
9332
9332
Worklist.insert(UserRecipe);
@@ -9440,8 +9440,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
9440
9440
9441
9441
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9442
9442
// If tail is folded by masking, introduce selects between the phi
9443
- // and the live-out instruction of each reduction, at the beginning of the
9444
- // dedicated latch block.
9443
+ // and the users outside the vector region of each reduction, at the
9444
+ // beginning of the dedicated latch block.
9445
9445
auto *OrigExitingVPV = PhiR->getBackedgeValue();
9446
9446
auto *NewExitingVPV = PhiR->getBackedgeValue();
9447
9447
if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
@@ -9513,17 +9513,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
9513
9513
});
9514
9514
FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9515
9515
9516
- // Order is strict: if there are multiple successors, the first is the exit
9517
- // block, second is the scalar preheader.
9518
- VPBasicBlock *ScalarPHVPBB =
9519
- cast<VPBasicBlock>(MiddleVPBB->getSuccessors().back());
9520
- VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9521
- auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
9522
- VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()},
9523
- {}, "bc.merge.rdx");
9524
- auto *RedPhi = cast<PHINode>(PhiR->getUnderlyingInstr());
9525
- Plan->addLiveOut(RedPhi, ResumePhiRecipe);
9526
-
9527
9516
// Adjust AnyOf reductions; replace the reduction phi for the selected value
9528
9517
// with a boolean reduction phi node to check if the condition is true in
9529
9518
// any iteration. The final value is selected by the final
0 commit comments