Skip to content

Commit 06bb8c9

Browse files
authored
[VPlan] Explicitly handle scalar pointer inductions. (#83068)
Add a new PtrAdd opcode to VPInstruction that corresponds to IRBuilder::CreatePtrAdd, which creates a GEP with source element type i8. This is then used to model scalarizing VPWidenPointerInductionRecipe by introducing scalar-steps to model the index increment followed by a PtrAdd. Note that PtrAdd needs to be able to generate code for only the first lane or for all lanes. This may warrant introducing a separate recipe for scalarizing that can be created without relying on the underlying IR. Depends on #80271. PR: #83068.
1 parent 26d896f commit 06bb8c9

23 files changed

+782
-733
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+2-33
Original file line numberDiff line numberDiff line change
@@ -9111,42 +9111,11 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
91119111
"Not a pointer induction according to InductionDescriptor!");
91129112
assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
91139113
"Unexpected type.");
9114+
assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
9115+
"Recipe should have been replaced");
91149116

91159117
auto *IVR = getParent()->getPlan()->getCanonicalIV();
91169118
PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9117-
9118-
if (onlyScalarsGenerated(State.VF.isScalable())) {
9119-
// This is the normalized GEP that starts counting at zero.
9120-
Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9121-
CanonicalIV, IndDesc.getStep()->getType());
9122-
// Determine the number of scalars we need to generate for each unroll
9123-
// iteration. If the instruction is uniform, we only need to generate the
9124-
// first lane. Otherwise, we generate all VF values.
9125-
bool IsUniform = vputils::onlyFirstLaneUsed(this);
9126-
assert((IsUniform || !State.VF.isScalable()) &&
9127-
"Cannot scalarize a scalable VF");
9128-
unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9129-
9130-
for (unsigned Part = 0; Part < State.UF; ++Part) {
9131-
Value *PartStart =
9132-
createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9133-
9134-
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9135-
Value *Idx = State.Builder.CreateAdd(
9136-
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9137-
Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9138-
9139-
Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
9140-
Value *SclrGep = emitTransformedIndex(
9141-
State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
9142-
IndDesc.getKind(), IndDesc.getInductionBinOp());
9143-
SclrGep->setName("next.gep");
9144-
State.set(this, SclrGep, VPIteration(Part, Lane));
9145-
}
9146-
}
9147-
return;
9148-
}
9149-
91509119
Type *PhiType = IndDesc.getStep()->getType();
91519120

91529121
// Build a pointer phi

llvm/lib/Transforms/Vectorize/VPlan.cpp

+2-5
Original file line numberDiff line numberDiff line change
@@ -860,11 +860,8 @@ void VPlan::execute(VPTransformState *State) {
860860
Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
861861
} else {
862862
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
863-
// TODO: Split off the case that all users of a pointer phi are scalar
864-
// from the VPWidenPointerInductionRecipe.
865-
if (WidenPhi->onlyScalarsGenerated(State->VF.isScalable()))
866-
continue;
867-
863+
assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
864+
"recipe generating only scalars should have been replaced");
868865
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
869866
Phi = cast<PHINode>(GEP->getPointerOperand());
870867
}

llvm/lib/Transforms/Vectorize/VPlan.h

+31-10
Original file line numberDiff line numberDiff line change
@@ -1155,6 +1155,10 @@ class VPInstruction : public VPRecipeWithIRFlags {
11551155
BranchOnCount,
11561156
BranchOnCond,
11571157
ComputeReductionResult,
1158+
// Add an offset in bytes (second operand) to a base pointer (first
1159+
// operand). Only generates scalar values (either for the first lane only or
1160+
// for all lanes, depending on its uses).
1161+
PtrAdd,
11581162
};
11591163

11601164
private:
@@ -1164,11 +1168,28 @@ class VPInstruction : public VPRecipeWithIRFlags {
11641168
/// An optional name that can be used for the generated IR instruction.
11651169
const std::string Name;
11661170

1167-
/// Utility method serving execute(): generates a single instance of the
1168-
/// modeled instruction. \returns the generated value for \p Part.
1169-
/// In some cases an existing value is returned rather than a generated
1171+
/// Returns true if this VPInstruction generates scalar values for all lanes.
1172+
/// Most VPInstructions generate a single value per part, either vector or
1173+
/// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
1174+
/// values per all lanes, stemming from an original ingredient. This method
1175+
/// identifies the (rare) cases of VPInstructions that do so as well, w/o an
1176+
/// underlying ingredient.
1177+
bool doesGeneratePerAllLanes() const;
1178+
1179+
/// Returns true if we can generate a scalar for the first lane only if
1180+
/// needed.
1181+
bool canGenerateScalarForFirstLane() const;
1182+
1183+
/// Utility methods serving execute(): generates a single instance of the
1184+
/// modeled instruction for a given part. \returns the generated value for \p
1185+
/// Part. In some cases an existing value is returned rather than a generated
11701186
/// one.
1171-
Value *generateInstruction(VPTransformState &State, unsigned Part);
1187+
Value *generatePerPart(VPTransformState &State, unsigned Part);
1188+
1189+
/// Utility methods serving execute(): generates a scalar single instance of
1190+
/// the modeled instruction for a given lane. \returns the scalar generated
1191+
/// value for lane \p Lane.
1192+
Value *generatePerLane(VPTransformState &State, const VPIteration &Lane);
11721193

11731194
#if !defined(NDEBUG)
11741195
/// Return true if the VPInstruction is a floating point math operation, i.e.
@@ -2491,12 +2512,6 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
24912512
/// for floating point inductions.
24922513
const FPMathOperator *FPBinOp;
24932514

2494-
VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
2495-
const FPMathOperator *FPBinOp, VPValue *Start,
2496-
VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
2497-
: VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
2498-
Kind(Kind), FPBinOp(FPBinOp) {}
2499-
25002515
public:
25012516
VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
25022517
VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
@@ -2505,6 +2520,12 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
25052520
dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
25062521
Start, CanonicalIV, Step) {}
25072522

2523+
VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
2524+
const FPMathOperator *FPBinOp, VPValue *Start,
2525+
VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
2526+
: VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
2527+
Kind(Kind), FPBinOp(FPBinOp) {}
2528+
25082529
~VPDerivedIVRecipe() override = default;
25092530

25102531
VPRecipeBase *clone() override {

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
4444
CachedTypes[OtherV] = ResTy;
4545
return ResTy;
4646
}
47+
case VPInstruction::PtrAdd:
48+
// Return the type based on the pointer argument (i.e. first operand).
49+
return inferScalarType(R->getOperand(0));
4750
default:
4851
break;
4952
}

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

+68-11
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
127127
case VPInstruction::Not:
128128
case VPInstruction::CalculateTripCountMinusVF:
129129
case VPInstruction::CanonicalIVIncrementForPart:
130+
case VPInstruction::PtrAdd:
130131
return false;
131132
default:
132133
return true;
@@ -270,10 +271,39 @@ VPInstruction::VPInstruction(unsigned Opcode,
270271
assert(isFPMathOp() && "this op can't take fast-math flags");
271272
}
272273

273-
Value *VPInstruction::generateInstruction(VPTransformState &State,
274-
unsigned Part) {
274+
bool VPInstruction::doesGeneratePerAllLanes() const {
275+
return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
276+
}
277+
278+
bool VPInstruction::canGenerateScalarForFirstLane() const {
279+
if (Instruction::isBinaryOp(getOpcode()))
280+
return true;
281+
282+
switch (Opcode) {
283+
case VPInstruction::BranchOnCond:
284+
case VPInstruction::BranchOnCount:
285+
case VPInstruction::CalculateTripCountMinusVF:
286+
case VPInstruction::CanonicalIVIncrementForPart:
287+
case VPInstruction::ComputeReductionResult:
288+
case VPInstruction::PtrAdd:
289+
return true;
290+
default:
291+
return false;
292+
}
293+
}
294+
295+
Value *VPInstruction::generatePerLane(VPTransformState &State,
296+
const VPIteration &Lane) {
297+
IRBuilderBase &Builder = State.Builder;
298+
299+
assert(getOpcode() == VPInstruction::PtrAdd &&
300+
"only PtrAdd opcodes are supported for now");
301+
return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
302+
State.get(getOperand(1), Lane), Name);
303+
}
304+
305+
Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
275306
IRBuilderBase &Builder = State.Builder;
276-
Builder.SetCurrentDebugLocation(getDebugLoc());
277307

278308
if (Instruction::isBinaryOp(getOpcode())) {
279309
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
@@ -490,6 +520,13 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
490520

491521
return ReducedPartRdx;
492522
}
523+
case VPInstruction::PtrAdd: {
524+
assert(vputils::onlyFirstLaneUsed(this) &&
525+
"can only generate first lane for PtrAdd");
526+
Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
527+
Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
528+
return Builder.CreatePtrAdd(Ptr, Addend, Name);
529+
}
493530
default:
494531
llvm_unreachable("Unsupported opcode for instruction");
495532
}
@@ -514,17 +551,33 @@ void VPInstruction::execute(VPTransformState &State) {
514551
"Recipe not a FPMathOp but has fast-math flags?");
515552
if (hasFastMathFlags())
516553
State.Builder.setFastMathFlags(getFastMathFlags());
554+
State.Builder.SetCurrentDebugLocation(getDebugLoc());
555+
bool GeneratesPerFirstLaneOnly =
556+
canGenerateScalarForFirstLane() &&
557+
(vputils::onlyFirstLaneUsed(this) ||
558+
getOpcode() == VPInstruction::ComputeReductionResult);
559+
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
517560
for (unsigned Part = 0; Part < State.UF; ++Part) {
518-
Value *GeneratedValue = generateInstruction(State, Part);
519-
if (!hasResult())
561+
if (GeneratesPerAllLanes) {
562+
for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
563+
Lane != NumLanes; ++Lane) {
564+
Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
565+
assert(GeneratedValue && "generatePerLane must produce a value");
566+
State.set(this, GeneratedValue, VPIteration(Part, Lane));
567+
}
520568
continue;
521-
assert(GeneratedValue && "generateInstruction must produce a value");
569+
}
522570

523-
bool IsVector = GeneratedValue->getType()->isVectorTy();
524-
State.set(this, GeneratedValue, Part, !IsVector);
525-
assert((IsVector || getOpcode() == VPInstruction::ComputeReductionResult ||
526-
State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
527-
"scalar value but not only first lane used");
571+
Value *GeneratedValue = generatePerPart(State, Part);
572+
if (!hasResult())
573+
continue;
574+
assert(GeneratedValue && "generatePerPart must produce a value");
575+
assert((GeneratedValue->getType()->isVectorTy() ==
576+
!GeneratesPerFirstLaneOnly ||
577+
State.VF.isScalar()) &&
578+
"scalar value but not only first lane defined");
579+
State.set(this, GeneratedValue, Part,
580+
/*IsScalar*/ GeneratesPerFirstLaneOnly);
528581
}
529582
}
530583

@@ -537,6 +590,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
537590
default:
538591
return false;
539592
case Instruction::ICmp:
593+
case VPInstruction::PtrAdd:
540594
// TODO: Cover additional opcodes.
541595
return vputils::onlyFirstLaneUsed(this);
542596
case VPInstruction::ActiveLaneMask:
@@ -594,6 +648,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
594648
case VPInstruction::ComputeReductionResult:
595649
O << "compute-reduction-result";
596650
break;
651+
case VPInstruction::PtrAdd:
652+
O << "ptradd";
653+
break;
597654
default:
598655
O << Instruction::getOpcodeName(getOpcode());
599656
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+53-13
Original file line numberDiff line numberDiff line change
@@ -498,15 +498,18 @@ static void removeDeadRecipes(VPlan &Plan) {
498498
}
499499
}
500500

501-
static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
501+
static VPValue *createScalarIVSteps(VPlan &Plan,
502+
InductionDescriptor::InductionKind Kind,
503+
Instruction::BinaryOps InductionOpcode,
504+
FPMathOperator *FPBinOp,
502505
ScalarEvolution &SE, Instruction *TruncI,
503506
VPValue *StartV, VPValue *Step,
504507
VPBasicBlock::iterator IP) {
505508
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
506509
VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
507510
VPSingleDefRecipe *BaseIV = CanonicalIV;
508-
if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step)) {
509-
BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step);
511+
if (!CanonicalIV->isCanonical(Kind, StartV, Step)) {
512+
BaseIV = new VPDerivedIVRecipe(Kind, FPBinOp, StartV, CanonicalIV, Step);
510513
HeaderVPBB->insert(BaseIV, IP);
511514
}
512515

@@ -536,21 +539,56 @@ static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
536539
VecPreheader->appendRecipe(Step->getDefiningRecipe());
537540
}
538541

539-
VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
542+
VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(
543+
BaseIV, Step, InductionOpcode,
544+
FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags());
540545
HeaderVPBB->insert(Steps, IP);
541546
return Steps;
542547
}
543548

544-
/// If any user of a VPWidenIntOrFpInductionRecipe needs scalar values,
545-
/// provide them by building scalar steps off of the canonical scalar IV and
546-
/// update the original IV's users. This is an optional optimization to reduce
547-
/// the needs of vector extracts.
548-
static void optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
549+
/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
550+
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
551+
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
552+
/// require vectors while other require scalars, the scalar uses need to extract
553+
/// the scalars from the generated vectors (Note that this is different to how
554+
/// int/fp inductions are handled). Also optimize VPWidenIntOrFpInductionRecipe,
555+
/// if any of its users needs scalar values, by providing them scalar steps
556+
/// built on the canonical scalar IV and update the original IV's users. This is
557+
/// an optional optimization to reduce the needs of vector extracts.
558+
static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
549559
SmallVector<VPRecipeBase *> ToRemove;
550560
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
551561
bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
552562
VPBasicBlock::iterator InsertPt = HeaderVPBB->getFirstNonPhi();
553563
for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
564+
// Replace wide pointer inductions which have only their scalars used by
565+
// PtrAdd(IndStart, ScalarIVSteps (0, Step)).
566+
if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
567+
if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
568+
continue;
569+
570+
const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
571+
VPValue *StartV = Plan.getVPValueOrAddLiveIn(
572+
ConstantInt::get(ID.getStep()->getType(), 0));
573+
VPValue *StepV = PtrIV->getOperand(1);
574+
VPRecipeBase *Steps =
575+
createScalarIVSteps(Plan, InductionDescriptor::IK_IntInduction,
576+
Instruction::Add, nullptr, SE, nullptr, StartV,
577+
StepV, InsertPt)
578+
->getDefiningRecipe();
579+
580+
auto *Recipe =
581+
new VPInstruction(VPInstruction::PtrAdd,
582+
{PtrIV->getStartValue(), Steps->getVPSingleValue()},
583+
PtrIV->getDebugLoc(), "next.gep");
584+
585+
Recipe->insertAfter(Steps);
586+
PtrIV->replaceAllUsesWith(Recipe);
587+
continue;
588+
}
589+
590+
// Replace widened induction with scalar steps for users that only use
591+
// scalars.
554592
auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
555593
if (!WideIV)
556594
continue;
@@ -560,9 +598,11 @@ static void optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
560598
continue;
561599

562600
const InductionDescriptor &ID = WideIV->getInductionDescriptor();
563-
VPValue *Steps = createScalarIVSteps(Plan, ID, SE, WideIV->getTruncInst(),
564-
WideIV->getStartValue(),
565-
WideIV->getStepValue(), InsertPt);
601+
VPValue *Steps = createScalarIVSteps(
602+
Plan, ID.getKind(), ID.getInductionOpcode(),
603+
dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), SE,
604+
WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
605+
InsertPt);
566606

567607
// Update scalar users of IV to use Step instead.
568608
if (!HasOnlyVectorVFs)
@@ -1025,7 +1065,7 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
10251065
removeRedundantInductionCasts(Plan);
10261066

10271067
simplifyRecipes(Plan, SE.getContext());
1028-
optimizeInductions(Plan, SE);
1068+
legalizeAndOptimizeInductions(Plan, SE);
10291069
removeDeadRecipes(Plan);
10301070

10311071
createAndOptimizeReplicateRegions(Plan);

llvm/test/Transforms/LoopLoadElim/versioning-scev-invalidation.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ define void @g(ptr %dst.1, ptr %start, i64 %N) {
6363
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
6464
; CHECK: vector.body:
6565
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
66-
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
67-
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
66+
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
67+
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
6868
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[LCSSA_PTR_IV_1]], i64 [[TMP4]]
6969
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
7070
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP5]], align 8

0 commit comments

Comments
 (0)