Skip to content

Commit 12bcd63

Browse files
committed
[SLP]Improve detection of gathered loads, if no other deps are detected.
If the gather node only partially consists of ordered loads (i.e. not the whole node is made up of loads), the other gathered scalars are not loads, and no dependency on other nodes is found, we can still improve the gather cost by taking into account the fact that these loads can themselves be vectorized.
1 parent 61ab43a commit 12bcd63

File tree

2 files changed

+44
-22
lines changed

2 files changed

+44
-22
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6947,20 +6947,36 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
69476947
// Improve gather cost for gather of loads, if we can group some of the
69486948
// loads into vector loads.
69496949
InstructionsState S = getSameOpcode(VL, *R.TLI);
6950-
if (VL.size() > 2 && S.getOpcode() == Instruction::Load &&
6951-
!S.isAltShuffle() &&
6950+
const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
6951+
unsigned MinVF = R.getMinVF(2 * Sz);
6952+
if (VL.size() > 2 &&
6953+
((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
6954+
(InVectors.empty() &&
6955+
any_of(seq<unsigned>(0, VL.size() / MinVF),
6956+
[&](unsigned Idx) {
6957+
ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
6958+
InstructionsState S = getSameOpcode(SubVL, *R.TLI);
6959+
return S.getOpcode() == Instruction::Load &&
6960+
!S.isAltShuffle();
6961+
}))) &&
69526962
!all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
69536963
!isSplat(Gathers)) {
69546964
BoUpSLP::ValueSet VectorizedLoads;
6965+
SmallVector<LoadInst *> VectorizedStarts;
6966+
SmallVector<std::pair<unsigned, unsigned>> ScatterVectorized;
69556967
unsigned StartIdx = 0;
69566968
unsigned VF = VL.size() / 2;
6957-
unsigned VectorizedCnt = 0;
6958-
unsigned ScatterVectorizeCnt = 0;
6959-
const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType());
6969+
const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
69606970
for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
69616971
for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
69626972
Cnt += VF) {
69636973
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
6974+
if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
6975+
InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
6976+
if (SliceS.getOpcode() != Instruction::Load ||
6977+
SliceS.isAltShuffle())
6978+
continue;
6979+
}
69646980
if (!VectorizedLoads.count(Slice.front()) &&
69656981
!VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
69666982
SmallVector<Value *> PointerOps;
@@ -6974,10 +6990,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
69746990
case LoadsState::PossibleStridedVectorize:
69756991
// Mark the vectorized loads so that we don't vectorize them
69766992
// again.
6977-
if (LS == LoadsState::Vectorize)
6978-
++VectorizedCnt;
6993+
if (LS == LoadsState::Vectorize && CurrentOrder.empty())
6994+
VectorizedStarts.push_back(cast<LoadInst>(Slice.front()));
69796995
else
6980-
++ScatterVectorizeCnt;
6996+
ScatterVectorized.emplace_back(Cnt, VF);
69816997
VectorizedLoads.insert(Slice.begin(), Slice.end());
69826998
// If we vectorized initial block, no need to try to vectorize
69836999
// it again.
@@ -7008,8 +7024,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
70087024
}
70097025
// Exclude potentially vectorized loads from list of gathered
70107026
// scalars.
7011-
auto *LI = cast<LoadInst>(S.MainOp);
7012-
Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType()));
7027+
Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
70137028
// The cost for vectorized loads.
70147029
InstructionCost ScalarsCost = 0;
70157030
for (Value *V : VectorizedLoads) {
@@ -7019,17 +7034,24 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
70197034
LI->getAlign(), LI->getPointerAddressSpace(),
70207035
CostKind, TTI::OperandValueInfo(), LI);
70217036
}
7022-
auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
7023-
Align Alignment = LI->getAlign();
7024-
GatherCost +=
7025-
VectorizedCnt *
7026-
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
7027-
LI->getPointerAddressSpace(), CostKind,
7028-
TTI::OperandValueInfo(), LI);
7029-
GatherCost += ScatterVectorizeCnt *
7030-
TTI.getGatherScatterOpCost(
7031-
Instruction::Load, LoadTy, LI->getPointerOperand(),
7032-
/*VariableMask=*/false, Alignment, CostKind, LI);
7037+
auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
7038+
for (LoadInst *LI : VectorizedStarts) {
7039+
Align Alignment = LI->getAlign();
7040+
GatherCost +=
7041+
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
7042+
LI->getPointerAddressSpace(), CostKind,
7043+
TTI::OperandValueInfo(), LI);
7044+
}
7045+
for (std::pair<unsigned, unsigned> P : ScatterVectorized) {
7046+
auto *LI0 = cast<LoadInst>(VL[P.first]);
7047+
Align CommonAlignment = LI0->getAlign();
7048+
for (Value *V : VL.slice(P.first + 1, VF - 1))
7049+
CommonAlignment =
7050+
std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
7051+
GatherCost += TTI.getGatherScatterOpCost(
7052+
Instruction::Load, LoadTy, LI0->getPointerOperand(),
7053+
/*VariableMask=*/false, CommonAlignment, CostKind, LI0);
7054+
}
70337055
if (NeedInsertSubvectorAnalysis) {
70347056
// Add the cost for the subvectors insert.
70357057
for (int I = VF, E = VL.size(); I < E; I += VF)

llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
; YAML-NEXT: Function: test
99
; YAML-NEXT: Args:
1010
; YAML-NEXT: - String: 'SLP vectorized with cost '
11-
; YAML-NEXT: - Cost: '-2'
11+
; YAML-NEXT: - Cost: '-4'
1212
; YAML-NEXT: - String: ' and with tree size '
1313
; YAML-NEXT: - TreeSize: '4'
1414
; YAML-LABEL: --- !Passed

0 commit comments

Comments
 (0)