Skip to content

Commit 5698921

Browse files
committed
[SLP] Look-ahead operand reordering heuristic.
This patch introduces a new heuristic for guiding operand reordering. The new "look-ahead" heuristic can look beyond the immediate predecessors. This helps break ties when the immediate predecessors have identical opcodes (see lit test for an example). Committed on behalf of @vporpo (Vasileios Porpodas) Differential Revision: https://reviews.llvm.org/D60897 llvm-svn: 364084
1 parent 2441a40 commit 5698921

File tree

2 files changed

+276
-93
lines changed

2 files changed

+276
-93
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 232 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,12 @@ static cl::opt<unsigned> MinTreeSize(
147147
"slp-min-tree-size", cl::init(3), cl::Hidden,
148148
cl::desc("Only vectorize small trees if they are fully vectorizable"));
149149

150+
// The maximum depth that the look-ahead score heuristic will explore.
151+
// The higher this value, the higher the compilation time overhead.
152+
static cl::opt<int> LookAheadMaxDepth(
153+
"slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
154+
cl::desc("The maximum look-ahead depth for operand reordering scores"));
155+
150156
static cl::opt<bool>
151157
ViewSLPTree("view-slp-tree", cl::Hidden,
152158
cl::desc("Display the SLP trees with Graphviz"));
@@ -708,6 +714,7 @@ class BoUpSLP {
708714

709715
const DataLayout &DL;
710716
ScalarEvolution &SE;
717+
const BoUpSLP &R;
711718

712719
/// \returns the operand data at \p OpIdx and \p Lane.
713720
OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -733,6 +740,207 @@ class BoUpSLP {
733740
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
734741
}
735742

743+
// The hard-coded scores listed here are not very important. When computing
744+
// the scores of matching one sub-tree with another, we are basically
745+
// counting the number of values that are matching. So even if all scores
746+
// are set to 1, we would still get a decent matching result.
747+
// However, sometimes we have to break ties. For example we may have to
748+
// choose between matching loads vs matching opcodes. This is what these
749+
// scores are helping us with: they provide the order of preference.
750+
751+
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
752+
static const int ScoreConsecutiveLoads = 3;
753+
/// Constants.
754+
static const int ScoreConstants = 2;
755+
/// Instructions with the same opcode.
756+
static const int ScoreSameOpcode = 2;
757+
/// Instructions with alt opcodes (e.g, add + sub).
758+
static const int ScoreAltOpcodes = 1;
759+
/// Identical instructions (a.k.a. splat or broadcast).
760+
static const int ScoreSplat = 1;
761+
/// Matching with an undef is preferable to failing.
762+
static const int ScoreUndef = 1;
763+
/// Score for failing to find a decent match.
764+
static const int ScoreFail = 0;
765+
/// User external to the vectorized code.
766+
static const int ExternalUseCost = 1;
767+
/// The user is internal but in a different lane.
768+
static const int UserInDiffLaneCost = ExternalUseCost;
769+
770+
/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
771+
// Scores a single pair (V1, V2) without looking at their operands. The checks
// run in a fixed order and the first one that applies decides the result:
// loads, then constants, then instructions, then undef, otherwise ScoreFail.
static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
772+
ScalarEvolution &SE) {
773+
auto *LI1 = dyn_cast<LoadInst>(V1);
774+
auto *LI2 = dyn_cast<LoadInst>(V2);
// Two loads are decided right here: a non-consecutive pair returns ScoreFail
// and never reaches the generic instruction check further down.
775+
if (LI1 && LI2)
776+
return isConsecutiveAccess(LI1, LI2, DL, SE)
777+
? VLOperands::ScoreConsecutiveLoads
778+
: VLOperands::ScoreFail;
779+
780+
auto *C1 = dyn_cast<Constant>(V1);
781+
auto *C2 = dyn_cast<Constant>(V2);
// Any two constants match; their values are not compared.
782+
if (C1 && C2)
783+
return VLOperands::ScoreConstants;
784+
785+
auto *I1 = dyn_cast<Instruction>(V1);
786+
auto *I2 = dyn_cast<Instruction>(V2);
787+
if (I1 && I2) {
// The very same instruction on both sides is a splat/broadcast.
788+
if (I1 == I2)
789+
return VLOperands::ScoreSplat;
// NOTE(review): presumably S.getOpcode() is non-zero only when I1 and I2
// share a main or alternate opcode -- confirm against getSameOpcode().
790+
InstructionsState S = getSameOpcode({I1, I2});
791+
// Note: Only consider instructions with <= 2 operands to avoid
792+
// complexity explosion.
793+
if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
794+
return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
795+
: VLOperands::ScoreSameOpcode;
796+
}
797+
// Only V2 is tested for undef, so the result depends on argument order.
// This point is reached only when the constant/constant check above did not
// fire.
798+
if (isa<UndefValue>(V2))
799+
return VLOperands::ScoreUndef;
800+
801+
return VLOperands::ScoreFail;
802+
}
803+
804+
/// Holds the values and their lane that are taking part in the look-ahead
805+
/// score calculation. This is used in the external uses cost calculation.
806+
SmallDenseMap<Value *, int> InLookAheadValues;
807+
808+
/// \Returns the additional cost due to uses of \p LHS and \p RHS that are
809+
/// either external to the vectorized code, or require shuffling.
810+
// Walks every user of the two candidate values and adds a penalty for each
// user that either sits in a different lane than the value would occupy, or
// is found neither in the tree reachable through R nor in InLookAheadValues.
// Each argument is a (value, lane) pair. The function only reads
// InLookAheadValues; it does not modify it.
int getExternalUsesCost(const std::pair<Value *, int> &LHS,
811+
const std::pair<Value *, int> &RHS) {
812+
int Cost = 0;
813+
SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
814+
for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
815+
Value *V = Values[Idx].first;
816+
// Calculate the absolute lane, using the minimum relative lane of LHS
817+
// and RHS as base and Idx as the offset.
818+
int Ln = std::min(LHS.second, RHS.second) + Idx;
819+
assert(Ln >= 0 && "Bad lane calculation");
820+
for (User *U : V->users()) {
821+
if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
822+
// The user is in the VectorizableTree. Check if we need to insert.
// The user's lane is its position inside the entry's Scalars list.
823+
auto It = llvm::find(UserTE->Scalars, U);
824+
assert(It != UserTE->Scalars.end() && "U is in UserTE");
825+
int UserLn = std::distance(UserTE->Scalars.begin(), It);
826+
assert(UserLn >= 0 && "Bad lane");
827+
if (UserLn != Ln)
828+
Cost += UserInDiffLaneCost;
829+
} else {
830+
// Check if the user is in the look-ahead code.
831+
auto It2 = InLookAheadValues.find(U);
832+
if (It2 != InLookAheadValues.end()) {
833+
// The user is in the look-ahead code. Check the lane.
834+
if (It2->second != Ln)
835+
Cost += UserInDiffLaneCost;
836+
} else {
837+
// The user is neither in SLP tree nor in the look-ahead code.
838+
Cost += ExternalUseCost;
839+
}
840+
}
841+
}
842+
}
843+
return Cost;
844+
}
845+
846+
/// Go through the operands of \p LHS and \p RHS recursively until \p
847+
/// MaxLevel, and return the cumulative score. For example:
848+
/// \verbatim
849+
/// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
850+
/// \ / \ / \ / \ /
851+
/// + + + +
852+
/// G1 G2 G3 G4
853+
/// \endverbatim
854+
/// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
855+
/// each level recursively, accumulating the score. It starts from matching
856+
/// the additions at level 0, then moves on to the loads (level 1). The
857+
/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
858+
/// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
859+
/// {A[0],C[0]} has a score of VLOperands::ScoreFail.
860+
/// Please note that the order of the operands does not matter, as we
861+
/// evaluate the score of all profitable combinations of operands. In
862+
/// other words the score of G1 and G4 is the same as G1 and G2. This
863+
/// heuristic is based on ideas described in:
864+
/// Look-ahead SLP: Auto-vectorization in the presence of commutative
865+
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
866+
/// Luís F. W. Góes
867+
// Recursive worker for the look-ahead score. LHS and RHS are (value, lane)
// pairs; CurrLevel is the depth of this call and MaxLevel the depth at which
// recursion stops. The result is the score of this pair plus the best scores
// found for its operand pairs. Side effect: records V1 and V2 with their
// lanes in InLookAheadValues before recursing.
int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
868+
const std::pair<Value *, int> &RHS, int CurrLevel,
869+
int MaxLevel) {
870+
871+
Value *V1 = LHS.first;
872+
Value *V2 = RHS.first;
873+
// Get the shallow score of V1 and V2.
// The external-use cost is subtracted and the result clamped at ScoreFail,
// so the score at this level is never negative.
874+
int ShallowScoreAtThisLevel =
875+
std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
876+
getExternalUsesCost(LHS, RHS));
877+
int Lane1 = LHS.second;
878+
int Lane2 = RHS.second;
879+
880+
// If reached MaxLevel,
881+
// or if V1 and V2 are not instructions,
882+
// or if they are SPLAT,
883+
// or if they are not consecutive, early return the current cost.
884+
auto *I1 = dyn_cast<Instruction>(V1);
885+
auto *I2 = dyn_cast<Instruction>(V2);
// The last clause stops at a pair of loads that already scored above zero,
// so the recursion does not descend into their address operands.
886+
if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
887+
ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
888+
(isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
889+
return ShallowScoreAtThisLevel;
890+
assert(I1 && I2 && "Should have early exited.");
891+
892+
// Keep track of in-tree values for determining the external-use cost.
// This happens after the cost for this pair was computed above, so the
// entries only influence the recursive calls below.
893+
InLookAheadValues[V1] = Lane1;
894+
InLookAheadValues[V2] = Lane2;
895+
896+
// Contains the I2 operand indexes that got matched with I1 operands.
897+
SmallSet<int, 4> Op2Used;
898+
899+
// Recursion towards the operands of I1 and I2. We are trying all possible
900+
// operand pairs, and keeping track of the best score.
// The matching is greedy: each operand of I1, in index order, takes the best
// still-unused operand of I2.
901+
for (int OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
902+
OpIdx1 != NumOperands1; ++OpIdx1) {
903+
// Try to pair operand OpIdx1 of I1 with the best operand of I2.
904+
int MaxTmpScore = 0;
905+
int MaxOpIdx2 = -1;
906+
// If I2 is commutative try all combinations.
// NOTE(review): in the non-commutative case I2 is indexed with OpIdx1, which
// is bounded by I1's operand count only -- confirm that I1 and I2 always
// have the same number of operands when this path is reached.
907+
int FromIdx = isCommutative(I2) ? 0 : OpIdx1;
908+
int ToIdx = isCommutative(I2) ? I2->getNumOperands() : OpIdx1 + 1;
909+
assert(FromIdx < ToIdx && "Bad index");
910+
for (int OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
911+
// Skip operands already paired with OpIdx1.
912+
if (Op2Used.count(OpIdx2))
913+
continue;
914+
// Recursively calculate the cost at each level
// The operands inherit the lanes of their parents (Lane1 / Lane2).
915+
int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
916+
{I2->getOperand(OpIdx2), Lane2},
917+
CurrLevel + 1, MaxLevel);
918+
// Look for the best score.
// Strict '>' means that on a tie the lower operand index of I2 wins.
919+
if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
920+
MaxTmpScore = TmpScore;
921+
MaxOpIdx2 = OpIdx2;
922+
}
923+
}
// MaxOpIdx2 stays -1 when no operand of I2 scored above ScoreFail; in that
// case this operand of I1 contributes nothing to the total.
924+
if (MaxOpIdx2 >= 0) {
925+
// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
926+
Op2Used.insert(MaxOpIdx2);
927+
ShallowScoreAtThisLevel += MaxTmpScore;
928+
}
929+
}
930+
return ShallowScoreAtThisLevel;
931+
}
932+
933+
/// \Returns the look-ahead score, which tells us how much the sub-trees
934+
/// rooted at \p LHS and \p RHS match, the more they match the higher the
935+
/// score. This helps break ties in an informed way when we cannot decide on
936+
/// the order of the operands by just considering the immediate
937+
/// predecessors.
938+
// Entry point of the look-ahead scoring: resets the per-query bookkeeping and
// starts the recursion at level 1, bounded by the LookAheadMaxDepth option.
int getLookAheadScore(const std::pair<Value *, int> &LHS,
939+
const std::pair<Value *, int> &RHS) {
// Entries recorded by a previous query must not affect the external-use cost
// of this one.
940+
InLookAheadValues.clear();
941+
return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
942+
}
943+
736944
// Search all operands in Ops[*][Lane] for the one that matches best
737945
// Ops[OpIdx][LastLane] and return its operand index.
738946
// If no good match can be found, return None.
@@ -750,9 +958,6 @@ class BoUpSLP {
750958
// The linearized opcode of the operand at OpIdx, Lane.
751959
bool OpIdxAPO = getData(OpIdx, Lane).APO;
752960

753-
const unsigned BestScore = 2;
754-
const unsigned GoodScore = 1;
755-
756961
// The best operand index and its score.
757962
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
758963
// are using the score to differentiate between the two.
@@ -781,41 +986,19 @@ class BoUpSLP {
781986
// Look for an operand that matches the current mode.
782987
switch (RMode) {
783988
case ReorderingMode::Load:
784-
if (isa<LoadInst>(Op)) {
785-
// Figure out which is left and right, so that we can check for
786-
// consecutive loads
787-
bool LeftToRight = Lane > LastLane;
788-
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
789-
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
790-
if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
791-
cast<LoadInst>(OpRight), DL, SE))
792-
BestOp.Idx = Idx;
793-
}
794-
break;
795-
case ReorderingMode::Opcode:
796-
// We accept both Instructions and Undefs, but with different scores.
797-
if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
798-
cast<Instruction>(Op)->getOpcode() ==
799-
cast<Instruction>(OpLastLane)->getOpcode()) ||
800-
(isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
801-
isa<UndefValue>(Op)) {
802-
// An instruction has a higher score than an undef.
803-
unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
804-
if (Score > BestOp.Score) {
805-
BestOp.Idx = Idx;
806-
BestOp.Score = Score;
807-
}
808-
}
809-
break;
810989
case ReorderingMode::Constant:
811-
if (isa<Constant>(Op)) {
812-
unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
813-
if (Score > BestOp.Score) {
814-
BestOp.Idx = Idx;
815-
BestOp.Score = Score;
816-
}
990+
case ReorderingMode::Opcode: {
991+
bool LeftToRight = Lane > LastLane;
992+
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
993+
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
994+
unsigned Score =
995+
getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
996+
if (Score > BestOp.Score) {
997+
BestOp.Idx = Idx;
998+
BestOp.Score = Score;
817999
}
8181000
break;
1001+
}
8191002
case ReorderingMode::Splat:
8201003
if (Op == OpLastLane)
8211004
BestOp.Idx = Idx;
@@ -946,8 +1129,8 @@ class BoUpSLP {
9461129
public:
9471130
/// Initialize with all the operands of the instruction vector \p RootVL.
9481131
VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
949-
ScalarEvolution &SE)
950-
: DL(DL), SE(SE) {
1132+
ScalarEvolution &SE, const BoUpSLP &R)
1133+
: DL(DL), SE(SE), R(R) {
9511134
// Append all the operands of RootVL.
9521135
appendOperandsOfVL(RootVL);
9531136
}
@@ -1169,7 +1352,8 @@ class BoUpSLP {
11691352
SmallVectorImpl<Value *> &Left,
11701353
SmallVectorImpl<Value *> &Right,
11711354
const DataLayout &DL,
1172-
ScalarEvolution &SE);
1355+
ScalarEvolution &SE,
1356+
const BoUpSLP &R);
11731357
struct TreeEntry {
11741358
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
11751359
TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -2371,7 +2555,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
23712555
// Commutative predicate - collect + sort operands of the instructions
23722556
// so that each side is more likely to have the same opcode.
23732557
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
2374-
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
2558+
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
23752559
} else {
23762560
// Collect operands - commute if it uses the swapped predicate.
23772561
for (Value *V : VL) {
@@ -2415,7 +2599,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
24152599
// have the same opcode.
24162600
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
24172601
ValueList Left, Right;
2418-
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
2602+
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
24192603
buildTree_rec(Left, Depth + 1, {TE, 0});
24202604
buildTree_rec(Right, Depth + 1, {TE, 1});
24212605
return;
@@ -2584,7 +2768,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
25842768
// Reorder operands if reordering would enable vectorization.
25852769
if (isa<BinaryOperator>(VL0)) {
25862770
ValueList Left, Right;
2587-
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
2771+
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
25882772
buildTree_rec(Left, Depth + 1, {TE, 0});
25892773
buildTree_rec(Right, Depth + 1, {TE, 1});
25902774
return;
@@ -3299,13 +3483,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
32993483

33003484
// Perform operand reordering on the instructions in VL and return the reordered
33013485
// operands in Left and Right.
3302-
void BoUpSLP::reorderInputsAccordingToOpcode(
3303-
ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
3304-
SmallVectorImpl<Value *> &Right, const DataLayout &DL,
3305-
ScalarEvolution &SE) {
3486+
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
3487+
SmallVectorImpl<Value *> &Left,
3488+
SmallVectorImpl<Value *> &Right,
3489+
const DataLayout &DL,
3490+
ScalarEvolution &SE,
3491+
const BoUpSLP &R) {
33063492
if (VL.empty())
33073493
return;
3308-
VLOperands Ops(VL, DL, SE);
3494+
VLOperands Ops(VL, DL, SE, R);
33093495
// Reorder the operands in place.
33103496
Ops.reorder();
33113497
Left = Ops.getVL(0);

0 commit comments

Comments
 (0)