@@ -147,6 +147,12 @@ static cl::opt<unsigned> MinTreeSize(
147
147
" slp-min-tree-size" , cl::init(3 ), cl::Hidden,
148
148
cl::desc(" Only vectorize small trees if they are fully vectorizable" ));
149
149
150
+ // The maximum depth that the look-ahead score heuristic will explore.
151
+ // The higher this value, the higher the compilation time overhead.
152
+ static cl::opt<int > LookAheadMaxDepth (
153
+ " slp-max-look-ahead-depth" , cl::init(2 ), cl::Hidden,
154
+ cl::desc(" The maximum look-ahead depth for operand reordering scores" ));
155
+
150
156
// Hidden boolean option; per its description it displays the SLP trees with
// Graphviz.
static cl::opt<bool>
    ViewSLPTree(" view-slp-tree", cl::Hidden,
                cl::desc(" Display the SLP trees with Graphviz"));
@@ -708,6 +714,7 @@ class BoUpSLP {
708
714
709
715
const DataLayout &DL;
710
716
ScalarEvolution &SE;
717
+ const BoUpSLP &R;
711
718
712
719
// / \returns the operand data at \p OpIdx and \p Lane.
713
720
OperandData &getData (unsigned OpIdx, unsigned Lane) {
@@ -733,6 +740,207 @@ class BoUpSLP {
733
740
std::swap (OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
734
741
}
735
742
743
// The exact values below matter little. Scoring one sub-tree against another
// essentially counts how many values match, so a decent matching would still
// result if every score were 1. The distinct values only serve as
// tie-breakers: when we must choose, for example, between matching loads and
// matching opcodes, they encode the order of preference.

/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreConsecutiveLoads = 3;
/// A pair of constants.
static const int ScoreConstants = 2;
/// Instructions that share the same opcode.
static const int ScoreSameOpcode = 2;
/// Instructions with alternate opcodes (e.g., add + sub).
static const int ScoreAltOpcodes = 1;
/// The very same instruction on both sides (a.k.a. splat or broadcast).
static const int ScoreSplat = 1;
/// Matching against an undef is still better than not matching at all.
static const int ScoreUndef = 1;
/// No decent match could be found.
static const int ScoreFail = 0;
/// Penalty for a user that lies outside the vectorized code.
static const int ExternalUseCost = 1;
/// Penalty for an internal user that sits in a different lane; currently the
/// same as an external use.
static const int UserInDiffLaneCost = ExternalUseCost;
769
+
770
+ // / \returns the score of placing \p V1 and \p V2 in consecutive lanes.
771
+ static int getShallowScore (Value *V1, Value *V2, const DataLayout &DL,
772
+ ScalarEvolution &SE) {
773
+ auto *LI1 = dyn_cast<LoadInst>(V1);
774
+ auto *LI2 = dyn_cast<LoadInst>(V2);
775
+ if (LI1 && LI2)
776
+ return isConsecutiveAccess (LI1, LI2, DL, SE)
777
+ ? VLOperands::ScoreConsecutiveLoads
778
+ : VLOperands::ScoreFail;
779
+
780
+ auto *C1 = dyn_cast<Constant>(V1);
781
+ auto *C2 = dyn_cast<Constant>(V2);
782
+ if (C1 && C2)
783
+ return VLOperands::ScoreConstants;
784
+
785
+ auto *I1 = dyn_cast<Instruction>(V1);
786
+ auto *I2 = dyn_cast<Instruction>(V2);
787
+ if (I1 && I2) {
788
+ if (I1 == I2)
789
+ return VLOperands::ScoreSplat;
790
+ InstructionsState S = getSameOpcode ({I1, I2});
791
+ // Note: Only consider instructions with <= 2 operands to avoid
792
+ // complexity explosion.
793
+ if (S.getOpcode () && S.MainOp ->getNumOperands () <= 2 )
794
+ return S.isAltShuffle () ? VLOperands::ScoreAltOpcodes
795
+ : VLOperands::ScoreSameOpcode;
796
+ }
797
+
798
+ if (isa<UndefValue>(V2))
799
+ return VLOperands::ScoreUndef;
800
+
801
+ return VLOperands::ScoreFail;
802
+ }
803
+
804
+ // / Holds the values and their lane that are taking part in the look-ahead
805
+ // / score calculation. This is used in the external uses cost calculation.
806
+ SmallDenseMap<Value *, int > InLookAheadValues;
807
+
808
+ // / \Returns the additinal cost due to uses of \p LHS and \p RHS that are
809
+ // / either external to the vectorized code, or require shuffling.
810
+ int getExternalUsesCost (const std::pair<Value *, int > &LHS,
811
+ const std::pair<Value *, int > &RHS) {
812
+ int Cost = 0 ;
813
+ SmallVector<std::pair<Value *, int >, 2 > Values = {LHS, RHS};
814
+ for (int Idx = 0 , IdxE = Values.size (); Idx != IdxE; ++Idx) {
815
+ Value *V = Values[Idx].first ;
816
+ // Calculate the absolute lane, using the minimum relative lane of LHS
817
+ // and RHS as base and Idx as the offset.
818
+ int Ln = std::min (LHS.second , RHS.second ) + Idx;
819
+ assert (Ln >= 0 && " Bad lane calculation" );
820
+ for (User *U : V->users ()) {
821
+ if (const TreeEntry *UserTE = R.getTreeEntry (U)) {
822
+ // The user is in the VectorizableTree. Check if we need to insert.
823
+ auto It = llvm::find (UserTE->Scalars , U);
824
+ assert (It != UserTE->Scalars .end () && " U is in UserTE" );
825
+ int UserLn = std::distance (UserTE->Scalars .begin (), It);
826
+ assert (UserLn >= 0 && " Bad lane" );
827
+ if (UserLn != Ln)
828
+ Cost += UserInDiffLaneCost;
829
+ } else {
830
+ // Check if the user is in the look-ahead code.
831
+ auto It2 = InLookAheadValues.find (U);
832
+ if (It2 != InLookAheadValues.end ()) {
833
+ // The user is in the look-ahead code. Check the lane.
834
+ if (It2->second != Ln)
835
+ Cost += UserInDiffLaneCost;
836
+ } else {
837
+ // The user is neither in SLP tree nor in the look-ahead code.
838
+ Cost += ExternalUseCost;
839
+ }
840
+ }
841
+ }
842
+ }
843
+ return Cost;
844
+ }
845
+
846
+ // / Go through the operands of \p LHS and \p RHS recursively until \p
847
+ // / MaxLevel, and return the cummulative score. For example:
848
+ // / \verbatim
849
+ // / A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
850
+ // / \ / \ / \ / \ /
851
+ // / + + + +
852
+ // / G1 G2 G3 G4
853
+ // / \endverbatim
854
+ // / The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
855
+ // / each level recursively, accumulating the score. It starts from matching
856
+ // / the additions at level 0, then moves on to the loads (level 1). The
857
+ // / score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
858
+ // / {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
859
+ // / {A[0],C[0]} has a score of VLOperands::ScoreFail.
860
+ // / Please note that the order of the operands does not matter, as we
861
+ // / evaluate the score of all profitable combinations of operands. In
862
+ // / other words the score of G1 and G4 is the same as G1 and G2. This
863
+ // / heuristic is based on ideas described in:
864
+ // / Look-ahead SLP: Auto-vectorization in the presence of commutative
865
+ // / operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
866
+ // / Luís F. W. Góes
867
+ int getScoreAtLevelRec (const std::pair<Value *, int > &LHS,
868
+ const std::pair<Value *, int > &RHS, int CurrLevel,
869
+ int MaxLevel) {
870
+
871
+ Value *V1 = LHS.first ;
872
+ Value *V2 = RHS.first ;
873
+ // Get the shallow score of V1 and V2.
874
+ int ShallowScoreAtThisLevel =
875
+ std::max ((int )ScoreFail, getShallowScore (V1, V2, DL, SE) -
876
+ getExternalUsesCost (LHS, RHS));
877
+ int Lane1 = LHS.second ;
878
+ int Lane2 = RHS.second ;
879
+
880
+ // If reached MaxLevel,
881
+ // or if V1 and V2 are not instructions,
882
+ // or if they are SPLAT,
883
+ // or if they are not consecutive, early return the current cost.
884
+ auto *I1 = dyn_cast<Instruction>(V1);
885
+ auto *I2 = dyn_cast<Instruction>(V2);
886
+ if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
887
+ ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
888
+ (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
889
+ return ShallowScoreAtThisLevel;
890
+ assert (I1 && I2 && " Should have early exited." );
891
+
892
+ // Keep track of in-tree values for determining the external-use cost.
893
+ InLookAheadValues[V1] = Lane1;
894
+ InLookAheadValues[V2] = Lane2;
895
+
896
+ // Contains the I2 operand indexes that got matched with I1 operands.
897
+ SmallSet<int , 4 > Op2Used;
898
+
899
+ // Recursion towards the operands of I1 and I2. We are trying all possbile
900
+ // operand pairs, and keeping track of the best score.
901
+ for (int OpIdx1 = 0 , NumOperands1 = I1->getNumOperands ();
902
+ OpIdx1 != NumOperands1; ++OpIdx1) {
903
+ // Try to pair op1I with the best operand of I2.
904
+ int MaxTmpScore = 0 ;
905
+ int MaxOpIdx2 = -1 ;
906
+ // If I2 is commutative try all combinations.
907
+ int FromIdx = isCommutative (I2) ? 0 : OpIdx1;
908
+ int ToIdx = isCommutative (I2) ? I2->getNumOperands () : OpIdx1 + 1 ;
909
+ assert (FromIdx < ToIdx && " Bad index" );
910
+ for (int OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
911
+ // Skip operands already paired with OpIdx1.
912
+ if (Op2Used.count (OpIdx2))
913
+ continue ;
914
+ // Recursively calculate the cost at each level
915
+ int TmpScore = getScoreAtLevelRec ({I1->getOperand (OpIdx1), Lane1},
916
+ {I2->getOperand (OpIdx2), Lane2},
917
+ CurrLevel + 1 , MaxLevel);
918
+ // Look for the best score.
919
+ if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
920
+ MaxTmpScore = TmpScore;
921
+ MaxOpIdx2 = OpIdx2;
922
+ }
923
+ }
924
+ if (MaxOpIdx2 >= 0 ) {
925
+ // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
926
+ Op2Used.insert (MaxOpIdx2);
927
+ ShallowScoreAtThisLevel += MaxTmpScore;
928
+ }
929
+ }
930
+ return ShallowScoreAtThisLevel;
931
+ }
932
+
933
+ // / \Returns the look-ahead score, which tells us how much the sub-trees
934
+ // / rooted at \p LHS and \p RHS match, the more they match the higher the
935
+ // / score. This helps break ties in an informed way when we cannot decide on
936
+ // / the order of the operands by just considering the immediate
937
+ // / predecessors.
938
+ int getLookAheadScore (const std::pair<Value *, int > &LHS,
939
+ const std::pair<Value *, int > &RHS) {
940
+ InLookAheadValues.clear ();
941
+ return getScoreAtLevelRec (LHS, RHS, 1 , LookAheadMaxDepth);
942
+ }
943
+
736
944
// Search all operands in Ops[*][Lane] for the one that matches best
737
945
// Ops[OpIdx][LastLane] and return its operand index.
738
946
// If no good match can be found, return None.
@@ -750,9 +958,6 @@ class BoUpSLP {
750
958
// The linearized opcode of the operand at OpIdx, Lane.
751
959
bool OpIdxAPO = getData (OpIdx, Lane).APO ;
752
960
753
- const unsigned BestScore = 2 ;
754
- const unsigned GoodScore = 1 ;
755
-
756
961
// The best operand index and its score.
757
962
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
758
963
// are using the score to differentiate between the two.
@@ -781,41 +986,19 @@ class BoUpSLP {
781
986
// Look for an operand that matches the current mode.
782
987
switch (RMode) {
783
988
case ReorderingMode::Load:
784
- if (isa<LoadInst>(Op)) {
785
- // Figure out which is left and right, so that we can check for
786
- // consecutive loads
787
- bool LeftToRight = Lane > LastLane;
788
- Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
789
- Value *OpRight = (LeftToRight) ? Op : OpLastLane;
790
- if (isConsecutiveAccess (cast<LoadInst>(OpLeft),
791
- cast<LoadInst>(OpRight), DL, SE))
792
- BestOp.Idx = Idx;
793
- }
794
- break ;
795
- case ReorderingMode::Opcode:
796
- // We accept both Instructions and Undefs, but with different scores.
797
- if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
798
- cast<Instruction>(Op)->getOpcode () ==
799
- cast<Instruction>(OpLastLane)->getOpcode ()) ||
800
- (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
801
- isa<UndefValue>(Op)) {
802
- // An instruction has a higher score than an undef.
803
- unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
804
- if (Score > BestOp.Score ) {
805
- BestOp.Idx = Idx;
806
- BestOp.Score = Score;
807
- }
808
- }
809
- break ;
810
989
case ReorderingMode::Constant:
811
- if (isa<Constant>(Op)) {
812
- unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
813
- if (Score > BestOp.Score ) {
814
- BestOp.Idx = Idx;
815
- BestOp.Score = Score;
816
- }
990
+ case ReorderingMode::Opcode: {
991
+ bool LeftToRight = Lane > LastLane;
992
+ Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
993
+ Value *OpRight = (LeftToRight) ? Op : OpLastLane;
994
+ unsigned Score =
995
+ getLookAheadScore ({OpLeft, LastLane}, {OpRight, Lane});
996
+ if (Score > BestOp.Score ) {
997
+ BestOp.Idx = Idx;
998
+ BestOp.Score = Score;
817
999
}
818
1000
break ;
1001
+ }
819
1002
case ReorderingMode::Splat:
820
1003
if (Op == OpLastLane)
821
1004
BestOp.Idx = Idx;
@@ -946,8 +1129,8 @@ class BoUpSLP {
946
1129
public:
947
1130
// / Initialize with all the operands of the instruction vector \p RootVL.
948
1131
VLOperands (ArrayRef<Value *> RootVL, const DataLayout &DL,
949
- ScalarEvolution &SE)
950
- : DL(DL), SE(SE) {
1132
+ ScalarEvolution &SE, const BoUpSLP &R )
1133
+ : DL(DL), SE(SE), R(R) {
951
1134
// Append all the operands of RootVL.
952
1135
appendOperandsOfVL (RootVL);
953
1136
}
@@ -1169,7 +1352,8 @@ class BoUpSLP {
1169
1352
SmallVectorImpl<Value *> &Left,
1170
1353
SmallVectorImpl<Value *> &Right,
1171
1354
const DataLayout &DL,
1172
- ScalarEvolution &SE);
1355
+ ScalarEvolution &SE,
1356
+ const BoUpSLP &R);
1173
1357
struct TreeEntry {
1174
1358
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8 >;
1175
1359
TreeEntry (VecTreeTy &Container) : Container(Container) {}
@@ -2371,7 +2555,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
2371
2555
// Commutative predicate - collect + sort operands of the instructions
2372
2556
// so that each side is more likely to have the same opcode.
2373
2557
assert (P0 == SwapP0 && " Commutative Predicate mismatch" );
2374
- reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE);
2558
+ reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE, * this );
2375
2559
} else {
2376
2560
// Collect operands - commute if it uses the swapped predicate.
2377
2561
for (Value *V : VL) {
@@ -2415,7 +2599,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
2415
2599
// have the same opcode.
2416
2600
if (isa<BinaryOperator>(VL0) && VL0->isCommutative ()) {
2417
2601
ValueList Left, Right;
2418
- reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE);
2602
+ reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE, * this );
2419
2603
buildTree_rec (Left, Depth + 1 , {TE, 0 });
2420
2604
buildTree_rec (Right, Depth + 1 , {TE, 1 });
2421
2605
return ;
@@ -2584,7 +2768,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
2584
2768
// Reorder operands if reordering would enable vectorization.
2585
2769
if (isa<BinaryOperator>(VL0)) {
2586
2770
ValueList Left, Right;
2587
- reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE);
2771
+ reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE, * this );
2588
2772
buildTree_rec (Left, Depth + 1 , {TE, 0 });
2589
2773
buildTree_rec (Right, Depth + 1 , {TE, 1 });
2590
2774
return ;
@@ -3299,13 +3483,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
3299
3483
3300
3484
// Perform operand reordering on the instructions in VL and return the reordered
3301
3485
// operands in Left and Right.
3302
- void BoUpSLP::reorderInputsAccordingToOpcode (
3303
- ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
3304
- SmallVectorImpl<Value *> &Right, const DataLayout &DL,
3305
- ScalarEvolution &SE) {
3486
+ void BoUpSLP::reorderInputsAccordingToOpcode (ArrayRef<Value *> VL,
3487
+ SmallVectorImpl<Value *> &Left,
3488
+ SmallVectorImpl<Value *> &Right,
3489
+ const DataLayout &DL,
3490
+ ScalarEvolution &SE,
3491
+ const BoUpSLP &R) {
3306
3492
if (VL.empty ())
3307
3493
return ;
3308
- VLOperands Ops (VL, DL, SE);
3494
+ VLOperands Ops (VL, DL, SE, R );
3309
3495
// Reorder the operands in place.
3310
3496
Ops.reorder ();
3311
3497
Left = Ops.getVL (0 );
0 commit comments