Skip to content

Commit 26bb2da

Browse files
committed
[VPlan] Proactively create mask for tail-folding up-front (NFCI).
Split off mask creation for tail folding and proactively create the mask for the header block. This simplifies createBlockInMask. Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D157037
1 parent fa9c93e commit 26bb2da

File tree

2 files changed

+63
-44
lines changed

2 files changed

+63
-44
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 58 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8008,6 +8008,47 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
80088008
return EdgeMaskCache[Edge] = EdgeMask;
80098009
}
80108010

8011+
// Compute the mask for the original loop's header block and store it in
// BlockMaskCache, keyed by the header BasicBlock. Three cases are handled:
//  * Tail is not folded by masking: the cached mask is nullptr, which models
//    an all-true mask.
//  * The active lane mask is used for control flow: the cached mask is the
//    active-lane-mask phi obtained from \p Plan.
//  * Otherwise: a VPWidenCanonicalIVRecipe is inserted at the first non-phi
//    position of the vector loop region's entry block, and the mask is either
//    ActiveLaneMask(IV, trip count) or ICmpULE(IV, backedge-taken count),
//    created at that same insertion point.
// createBlockInMask asserts that the header already has a cached mask, so this
// has to run before block masks are requested for the header.
void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
8012+
BasicBlock *Header = OrigLoop->getHeader();
8013+
8014+
// When not folding the tail, use nullptr to model all-true mask.
8015+
if (!CM.foldTailByMasking()) {
8016+
BlockMaskCache[Header] = nullptr;
8017+
return;
8018+
}
8019+
8020+
// If we're using the active lane mask for control flow, then we get the
8021+
// mask from the active lane mask PHI that is cached in the VPlan.
8022+
TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
8023+
if (useActiveLaneMaskForControlFlow(TFStyle)) {
8024+
BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi();
8025+
return;
8026+
}
8027+
8028+
// Introduce the early-exit compare IV <= BTC to form header block mask.
8029+
// This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8030+
// constructing the desired canonical IV in the header block as its first
8031+
// non-phi instructions.
8032+
8033+
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8034+
auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8035+
auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8036+
HeaderVPBB->insert(IV, NewInsertionPoint);
8037+
8038+
// NOTE(review): the guard presumably restores Builder's previous insert point
// when this function returns - confirm against VPBuilder::InsertPointGuard.
VPBuilder::InsertPointGuard Guard(Builder);
8039+
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8040+
VPValue *BlockMask = nullptr;
8041+
// Pick the mask form from the tail-folding style: an ActiveLaneMask
// VPInstruction over (IV, TC), or the compare IV <= BTC described above.
if (useActiveLaneMask(TFStyle)) {
8042+
VPValue *TC = Plan.getTripCount();
8043+
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8044+
nullptr, "active.lane.mask");
8045+
} else {
8046+
VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8047+
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8048+
}
8049+
// Cache the header mask; createBlockInMask returns cached entries directly.
BlockMaskCache[Header] = BlockMask;
8050+
}
8051+
80118052
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
80128053
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
80138054

@@ -8016,45 +8057,12 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
80168057
if (BCEntryIt != BlockMaskCache.end())
80178058
return BCEntryIt->second;
80188059

8060+
assert(OrigLoop->getHeader() != BB &&
8061+
"Loop header must have cached block mask");
8062+
80198063
// All-one mask is modelled as no-mask following the convention for masked
80208064
// load/store/gather/scatter. Initialize BlockMask to no-mask.
80218065
VPValue *BlockMask = nullptr;
8022-
8023-
if (OrigLoop->getHeader() == BB) {
8024-
if (!CM.blockNeedsPredicationForAnyReason(BB))
8025-
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8026-
8027-
assert(CM.foldTailByMasking() && "must fold the tail");
8028-
8029-
// If we're using the active lane mask for control flow, then we get the
8030-
// mask from the active lane mask PHI that is cached in the VPlan.
8031-
TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
8032-
if (useActiveLaneMaskForControlFlow(TFStyle))
8033-
return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi();
8034-
8035-
// Introduce the early-exit compare IV <= BTC to form header block mask.
8036-
// This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8037-
// constructing the desired canonical IV in the header block as its first
8038-
// non-phi instructions.
8039-
8040-
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8041-
auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8042-
auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8043-
HeaderVPBB->insert(IV, NewInsertionPoint);
8044-
8045-
VPBuilder::InsertPointGuard Guard(Builder);
8046-
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8047-
if (useActiveLaneMask(TFStyle)) {
8048-
VPValue *TC = Plan.getTripCount();
8049-
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8050-
nullptr, "active.lane.mask");
8051-
} else {
8052-
VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8053-
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8054-
}
8055-
return BlockMaskCache[BB] = BlockMask;
8056-
}
8057-
80588066
// This is the block mask. We OR all incoming edges.
80598067
for (auto *Predecessor : predecessors(BB)) {
80608068
VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
@@ -8766,6 +8774,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
87668774
DLInst ? DLInst->getDebugLoc() : DebugLoc(),
87678775
CM.getTailFoldingStyle(IVUpdateMayOverflow));
87688776

8777+
// Proactively create header mask. Masks for other blocks are created on
8778+
// demand.
8779+
RecipeBuilder.createHeaderMask(*Plan);
8780+
87698781
// Scan the body of the loop in a topological order to visit each basic block
87708782
// after having visited its predecessor basic blocks.
87718783
LoopBlocksDFS DFS(OrigLoop);
@@ -8822,13 +8834,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
88228834
}
88238835

88248836
RecipeBuilder.setRecipe(Instr, Recipe);
8825-
if (isa<VPWidenIntOrFpInductionRecipe>(Recipe)) {
8826-
// VPWidenIntOrFpInductionRecipes must be kept in the phi section of
8827-
// HeaderVPBB. VPWidenIntOrFpInductionRecipes for optimized truncates
8828-
// may be generated after non-phi recipes and need to be moved to the
8829-
// phi section of HeaderVPBB.
8837+
if (isa<VPHeaderPHIRecipe>(Recipe)) {
8838+
// VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8839+
// the following cases, VPHeaderPHIRecipes may be created after non-phi
8840+
// recipes and need to be moved to the phi section of HeaderVPBB:
8841+
// * tail-folding (non-phi recipes computing the header mask are
8842+
// introduced earlier than regular header phi recipes, and should appear
8843+
// after them)
8844+
// * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8845+
88308846
assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8831-
isa<TruncInst>(Instr)) &&
8847+
CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
88328848
"unexpected recipe needs moving");
88338849
Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
88348850
} else

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,12 @@ class VPRecipeBuilder {
133133
Ingredient2Recipe[I] = R;
134134
}
135135

136+
/// Create the mask for the vector loop header block and cache it as the
/// header's block mask. The cached value is nullptr (all-true) when the tail
/// is not folded by masking. createBlockInMask asserts that the header's mask
/// is already cached, so call this before requesting block masks.
137+
void createHeaderMask(VPlan &Plan);
138+
136139
/// A helper function that computes the predicate of the block BB, assuming
137-
/// that the header block of the loop is set to True. It returns the *entry*
138-
/// mask for the block BB.
140+
/// that the header block of the loop is set to True or the loop mask when
141+
/// tail folding. It returns the *entry* mask for the block BB.
139142
VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
140143

141144
/// A helper function that computes the predicate of the edge between SRC

0 commit comments

Comments
 (0)