@@ -6947,20 +6947,36 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
6947
6947
// Improve gather cost for gather of loads, if we can group some of the
6948
6948
// loads into vector loads.
6949
6949
InstructionsState S = getSameOpcode(VL, *R.TLI);
6950
- if (VL.size() > 2 && S.getOpcode() == Instruction::Load &&
6951
- !S.isAltShuffle() &&
6950
+ const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
6951
+ unsigned MinVF = R.getMinVF(2 * Sz);
6952
+ if (VL.size() > 2 &&
6953
+ ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
6954
+ (InVectors.empty() &&
6955
+ any_of(seq<unsigned>(0, VL.size() / MinVF),
6956
+ [&](unsigned Idx) {
6957
+ ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
6958
+ InstructionsState S = getSameOpcode(SubVL, *R.TLI);
6959
+ return S.getOpcode() == Instruction::Load &&
6960
+ !S.isAltShuffle();
6961
+ }))) &&
6952
6962
!all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
6953
6963
!isSplat(Gathers)) {
6954
6964
BoUpSLP::ValueSet VectorizedLoads;
6965
+ SmallVector<LoadInst *> VectorizedStarts;
6966
+ SmallVector<std::pair<unsigned, unsigned>> ScatterVectorized;
6955
6967
unsigned StartIdx = 0;
6956
6968
unsigned VF = VL.size() / 2;
6957
- unsigned VectorizedCnt = 0;
6958
- unsigned ScatterVectorizeCnt = 0;
6959
- const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType());
6969
+ const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
6960
6970
for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
6961
6971
for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
6962
6972
Cnt += VF) {
6963
6973
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
6974
+ if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
6975
+ InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
6976
+ if (SliceS.getOpcode() != Instruction::Load ||
6977
+ SliceS.isAltShuffle())
6978
+ continue;
6979
+ }
6964
6980
if (!VectorizedLoads.count(Slice.front()) &&
6965
6981
!VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
6966
6982
SmallVector<Value *> PointerOps;
@@ -6974,10 +6990,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
6974
6990
case LoadsState::PossibleStridedVectorize:
6975
6991
// Mark the vectorized loads so that we don't vectorize them
6976
6992
// again.
6977
- if (LS == LoadsState::Vectorize)
6978
- ++VectorizedCnt ;
6993
+ if (LS == LoadsState::Vectorize && CurrentOrder.empty() )
6994
+ VectorizedStarts.push_back(cast<LoadInst>(Slice.front())) ;
6979
6995
else
6980
- ++ScatterVectorizeCnt ;
6996
+ ScatterVectorized.emplace_back(Cnt, VF) ;
6981
6997
VectorizedLoads.insert(Slice.begin(), Slice.end());
6982
6998
// If we vectorized initial block, no need to try to vectorize
6983
6999
// it again.
@@ -7008,8 +7024,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7008
7024
}
7009
7025
// Exclude potentially vectorized loads from the list of gathered
7010
7026
// scalars.
7011
- auto *LI = cast<LoadInst>(S.MainOp);
7012
- Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType()));
7027
+ Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
7013
7028
// The cost for vectorized loads.
7014
7029
InstructionCost ScalarsCost = 0;
7015
7030
for (Value *V : VectorizedLoads) {
@@ -7019,17 +7034,24 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7019
7034
LI->getAlign(), LI->getPointerAddressSpace(),
7020
7035
CostKind, TTI::OperandValueInfo(), LI);
7021
7036
}
7022
- auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
7023
- Align Alignment = LI->getAlign();
7024
- GatherCost +=
7025
- VectorizedCnt *
7026
- TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
7027
- LI->getPointerAddressSpace(), CostKind,
7028
- TTI::OperandValueInfo(), LI);
7029
- GatherCost += ScatterVectorizeCnt *
7030
- TTI.getGatherScatterOpCost(
7031
- Instruction::Load, LoadTy, LI->getPointerOperand(),
7032
- /*VariableMask=*/false, Alignment, CostKind, LI);
7037
+ auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
7038
+ for (LoadInst *LI : VectorizedStarts) {
7039
+ Align Alignment = LI->getAlign();
7040
+ GatherCost +=
7041
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
7042
+ LI->getPointerAddressSpace(), CostKind,
7043
+ TTI::OperandValueInfo(), LI);
7044
+ }
7045
+ for (std::pair<unsigned, unsigned> P : ScatterVectorized) {
7046
+ auto *LI0 = cast<LoadInst>(VL[P.first]);
7047
+ Align CommonAlignment = LI0->getAlign();
7048
+ for (Value *V : VL.slice(P.first + 1, VF - 1))
7049
+ CommonAlignment =
7050
+ std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
7051
+ GatherCost += TTI.getGatherScatterOpCost(
7052
+ Instruction::Load, LoadTy, LI0->getPointerOperand(),
7053
+ /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
7054
+ }
7033
7055
if (NeedInsertSubvectorAnalysis) {
7034
7056
// Add the cost for the subvectors insert.
7035
7057
for (int I = VF, E = VL.size(); I < E; I += VF)
0 commit comments