@@ -8008,6 +8008,47 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8008
8008
return EdgeMaskCache[Edge] = EdgeMask;
8009
8009
}
8010
8010
8011
+ void VPRecipeBuilder::createHeaderMask (VPlan &Plan) {
8012
+ BasicBlock *Header = OrigLoop->getHeader ();
8013
+
8014
+ // When not folding the tail, use nullptr to model all-true mask.
8015
+ if (!CM.foldTailByMasking ()) {
8016
+ BlockMaskCache[Header] = nullptr ;
8017
+ return ;
8018
+ }
8019
+
8020
+ // If we're using the active lane mask for control flow, then we get the
8021
+ // mask from the active lane mask PHI that is cached in the VPlan.
8022
+ TailFoldingStyle TFStyle = CM.getTailFoldingStyle ();
8023
+ if (useActiveLaneMaskForControlFlow (TFStyle)) {
8024
+ BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi ();
8025
+ return ;
8026
+ }
8027
+
8028
+ // Introduce the early-exit compare IV <= BTC to form header block mask.
8029
+ // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8030
+ // constructing the desired canonical IV in the header block as its first
8031
+ // non-phi instructions.
8032
+
8033
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion ()->getEntryBasicBlock ();
8034
+ auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi ();
8035
+ auto *IV = new VPWidenCanonicalIVRecipe (Plan.getCanonicalIV ());
8036
+ HeaderVPBB->insert (IV, NewInsertionPoint);
8037
+
8038
+ VPBuilder::InsertPointGuard Guard (Builder);
8039
+ Builder.setInsertPoint (HeaderVPBB, NewInsertionPoint);
8040
+ VPValue *BlockMask = nullptr ;
8041
+ if (useActiveLaneMask (TFStyle)) {
8042
+ VPValue *TC = Plan.getTripCount ();
8043
+ BlockMask = Builder.createNaryOp (VPInstruction::ActiveLaneMask, {IV, TC},
8044
+ nullptr , " active.lane.mask" );
8045
+ } else {
8046
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount ();
8047
+ BlockMask = Builder.createNaryOp (VPInstruction::ICmpULE, {IV, BTC});
8048
+ }
8049
+ BlockMaskCache[Header] = BlockMask;
8050
+ }
8051
+
8011
8052
VPValue *VPRecipeBuilder::createBlockInMask (BasicBlock *BB, VPlan &Plan) {
8012
8053
assert (OrigLoop->contains (BB) && " Block is not a part of a loop" );
8013
8054
@@ -8016,45 +8057,12 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8016
8057
if (BCEntryIt != BlockMaskCache.end ())
8017
8058
return BCEntryIt->second ;
8018
8059
8060
+ assert (OrigLoop->getHeader () != BB &&
8061
+ " Loop header must have cached block mask" );
8062
+
8019
8063
// All-one mask is modelled as no-mask following the convention for masked
8020
8064
// load/store/gather/scatter. Initialize BlockMask to no-mask.
8021
8065
VPValue *BlockMask = nullptr ;
8022
-
8023
- if (OrigLoop->getHeader () == BB) {
8024
- if (!CM.blockNeedsPredicationForAnyReason (BB))
8025
- return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8026
-
8027
- assert (CM.foldTailByMasking () && " must fold the tail" );
8028
-
8029
- // If we're using the active lane mask for control flow, then we get the
8030
- // mask from the active lane mask PHI that is cached in the VPlan.
8031
- TailFoldingStyle TFStyle = CM.getTailFoldingStyle ();
8032
- if (useActiveLaneMaskForControlFlow (TFStyle))
8033
- return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi ();
8034
-
8035
- // Introduce the early-exit compare IV <= BTC to form header block mask.
8036
- // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8037
- // constructing the desired canonical IV in the header block as its first
8038
- // non-phi instructions.
8039
-
8040
- VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion ()->getEntryBasicBlock ();
8041
- auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi ();
8042
- auto *IV = new VPWidenCanonicalIVRecipe (Plan.getCanonicalIV ());
8043
- HeaderVPBB->insert (IV, NewInsertionPoint);
8044
-
8045
- VPBuilder::InsertPointGuard Guard (Builder);
8046
- Builder.setInsertPoint (HeaderVPBB, NewInsertionPoint);
8047
- if (useActiveLaneMask (TFStyle)) {
8048
- VPValue *TC = Plan.getTripCount ();
8049
- BlockMask = Builder.createNaryOp (VPInstruction::ActiveLaneMask, {IV, TC},
8050
- nullptr , " active.lane.mask" );
8051
- } else {
8052
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount ();
8053
- BlockMask = Builder.createNaryOp (VPInstruction::ICmpULE, {IV, BTC});
8054
- }
8055
- return BlockMaskCache[BB] = BlockMask;
8056
- }
8057
-
8058
8066
// This is the block mask. We OR all incoming edges.
8059
8067
for (auto *Predecessor : predecessors (BB)) {
8060
8068
VPValue *EdgeMask = createEdgeMask (Predecessor, BB, Plan);
@@ -8766,6 +8774,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8766
8774
DLInst ? DLInst->getDebugLoc () : DebugLoc (),
8767
8775
CM.getTailFoldingStyle (IVUpdateMayOverflow));
8768
8776
8777
+ // Proactively create header mask. Masks for other blocks are created on
8778
+ // demand.
8779
+ RecipeBuilder.createHeaderMask (*Plan);
8780
+
8769
8781
// Scan the body of the loop in a topological order to visit each basic block
8770
8782
// after having visited its predecessor basic blocks.
8771
8783
LoopBlocksDFS DFS (OrigLoop);
@@ -8822,13 +8834,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8822
8834
}
8823
8835
8824
8836
RecipeBuilder.setRecipe (Instr, Recipe);
8825
- if (isa<VPWidenIntOrFpInductionRecipe>(Recipe)) {
8826
- // VPWidenIntOrFpInductionRecipes must be kept in the phi section of
8827
- // HeaderVPBB. VPWidenIntOrFpInductionRecipes for optimized truncates
8828
- // may be generated after non-phi recipes and need to be moved to the
8829
- // phi section of HeaderVPBB.
8837
+ if (isa<VPHeaderPHIRecipe>(Recipe)) {
8838
+ // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8839
+ // the following cases, VPHeaderPHIRecipes may be created after non-phi
8840
+ // recipes and need to be moved to the phi section of HeaderVPBB:
8841
+ // * tail-folding (non-phi recipes computing the header mask are
8842
+ // introduced earlier than regular header phi recipes, and should appear
8843
+ // after them)
8844
+ // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8845
+
8830
8846
assert ((HeaderVPBB->getFirstNonPhi () == VPBB->end () ||
8831
- isa<TruncInst>(Instr)) &&
8847
+ CM. foldTailByMasking () || isa<TruncInst>(Instr)) &&
8832
8848
" unexpected recipe needs moving" );
8833
8849
Recipe->insertBefore (*HeaderVPBB, HeaderVPBB->getFirstNonPhi ());
8834
8850
} else
0 commit comments