
Commit b28f407
[SLP]Improve reduction cost model for scalars.
Instead of using an abstract cost for the scalar reduction ops, try to use the cost of the actual reduction operation instructions, where possible. Also, remove the estimation of the vectorized GEP pointers for reduced loads, since it is already handled in the tree.

Differential Revision: https://reviews.llvm.org/D148036
Parent: caea93c
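For orientation, here is a hedged sketch of the modeling change (illustration only, not code from this commit: `oldScalarCost` and `newScalarCost` are made-up names, and the new-model helper is a simplification of the patch's `EvaluateScalarCost` lambda that omits its per-user validity fallback):

```cpp
// Sketch only -- contrasts the two ways of charging the scalar side of a
// horizontal reduction, under simplified standalone helpers.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Old model: one abstract per-opcode cost, repeated for the N-1 scalar
// reduction steps (the former `ScalarCost *= (ReduxWidth - 1)`).
static InstructionCost oldScalarCost(InstructionCost PerOpCost,
                                     unsigned ReduxWidth) {
  PerOpCost *= ReduxWidth - 1;
  return PerOpCost;
}

// New model (simplified): walk the reduced values and charge the TTI cost of
// the real instructions that consume each of them, falling back to the
// abstract per-opcode estimate when a value has extra users outside the
// reduction (a cmp+select min/max value has two in-reduction users, a plain
// arithmetic reduction value has one).
static InstructionCost newScalarCost(ArrayRef<Value *> ReducedVals,
                                     bool IsCmpSelMinMax,
                                     const TargetTransformInfo &TTI,
                                     function_ref<InstructionCost()> GenCostFn) {
  constexpr auto Kind = TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  // One of the reduced values needs no further scalar reduction step.
  for (Value *RdxVal : drop_end(ReducedVals)) {
    if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
      Cost += GenCostFn();
      continue;
    }
    for (User *U : RdxVal->users())
      Cost += TTI.getInstructionCost(cast<Instruction>(U), Kind);
  }
  return Cost;
}
```

The effect is that a reduced value whose real reduction users can be costed directly is charged exactly that amount, instead of a uniform per-opcode estimate scaled by `ReduxWidth - 1`.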

File tree: 3 files changed, +84 −92 lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 47 additions & 40 deletions
@@ -1148,18 +1148,6 @@ class BoUpSLP {
   /// Construct a vectorizable tree that starts at \p Roots.
   void buildTree(ArrayRef<Value *> Roots);
 
-  /// Checks if the very first tree node is going to be vectorized.
-  bool isVectorizedFirstNode() const {
-    return !VectorizableTree.empty() &&
-           VectorizableTree.front()->State == TreeEntry::Vectorize;
-  }
-
-  /// Returns the main instruction for the very first node.
-  Instruction *getFirstNodeMainOp() const {
-    assert(!VectorizableTree.empty() && "No tree to get the first node from");
-    return VectorizableTree.front()->getMainOp();
-  }
-
   /// Returns whether the root node has in-tree uses.
   bool doesRootHaveInTreeUses() const {
     return !VectorizableTree.empty() &&
@@ -13340,22 +13328,7 @@ class HorizontalReduction {
         // Estimate cost.
         InstructionCost TreeCost = V.getTreeCost(VL);
         InstructionCost ReductionCost =
-            getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
-        if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
-          Instruction *MainOp = V.getFirstNodeMainOp();
-          for (Value *V : VL) {
-            auto *VI = dyn_cast<LoadInst>(V);
-            // Add the costs of scalar GEP pointers, to be removed from the
-            // code.
-            if (!VI || VI == MainOp)
-              continue;
-            auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
-            if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
-              continue;
-            TreeCost -= TTI->getArithmeticInstrCost(
-                Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
-          }
-        }
+            getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
         InstructionCost Cost = TreeCost + ReductionCost;
         LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
         if (!Cost.isValid())
@@ -13591,7 +13564,8 @@ class HorizontalReduction {
   /// Calculate the cost of a reduction.
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
-                                   unsigned ReduxWidth, FastMathFlags FMF) {
+                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
+                                   FastMathFlags FMF) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Value *FirstReducedVal = ReducedVals.front();
     Type *ScalarTy = FirstReducedVal->getType();
@@ -13600,6 +13574,35 @@
     // If all of the reduced values are constant, the vector cost is 0, since
     // the reduction value can be calculated at the compile time.
     bool AllConsts = allConstant(ReducedVals);
+    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
+      InstructionCost Cost = 0;
+      // Scalar cost is repeated for N-1 elements.
+      int Cnt = ReducedVals.size();
+      for (Value *RdxVal : ReducedVals) {
+        if (Cnt == 1)
+          break;
+        --Cnt;
+        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
+          Cost += GenCostFn();
+          continue;
+        }
+        InstructionCost ScalarCost = 0;
+        for (User *U : RdxVal->users()) {
+          auto *RdxOp = cast<Instruction>(U);
+          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
+            continue;
+          }
+          ScalarCost = InstructionCost::getInvalid();
+          break;
+        }
+        if (ScalarCost.isValid())
+          Cost += ScalarCost;
+        else
+          Cost += GenCostFn();
+      }
+      return Cost;
+    };
     switch (RdxKind) {
     case RecurKind::Add:
     case RecurKind::Mul:
@@ -13612,7 +13615,9 @@
       if (!AllConsts)
         VectorCost =
            TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
-      ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+      });
       break;
     }
     case RecurKind::FMax:
@@ -13626,10 +13631,12 @@
                                              /*IsUnsigned=*/false, CostKind);
       }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
-      ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind) +
-                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind) +
+               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind);
+      });
       break;
     }
     case RecurKind::SMax:
@@ -13646,18 +13653,18 @@
                                              IsUnsigned, CostKind);
       }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
-      ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind) +
-                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind) +
+               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind);
+      });
       break;
     }
     default:
       llvm_unreachable("Expected arithmetic or min/max reduction operation");
     }
 
-    // Scalar cost is repeated for N-1 elements.
-    ScalarCost *= (ReduxWidth - 1);
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                       << " for reduction that starts with " << *FirstReducedVal
                       << " (It is a splitting reduction)\n");

llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

Lines changed: 16 additions & 41 deletions
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE,SSE4
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -passes=slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,THRESH
 
 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
@@ -1113,41 +1113,18 @@ define i16 @smin_intrinsic_rdx_v8i16(ptr %p0) {
 }
 
 define i64 @umax_intrinsic_rdx_v4i64(ptr %p0) {
-; SSE2-LABEL: @umax_intrinsic_rdx_v4i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
-; SSE2-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
-; SSE2-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
-; SSE2-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
-; SSE2-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
-; SSE2-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
-; SSE2-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
-; SSE2-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
-; SSE2-NEXT:    ret i64 [[M]]
-;
-; SSE4-LABEL: @umax_intrinsic_rdx_v4i64(
-; SSE4-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
-; SSE4-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
-; SSE4-NEXT:    ret i64 [[TMP2]]
-;
-; AVX-LABEL: @umax_intrinsic_rdx_v4i64(
-; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
-; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
-; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
-; AVX-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
-; AVX-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
-; AVX-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
-; AVX-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
-; AVX-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
-; AVX-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
-; AVX-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
-; AVX-NEXT:    ret i64 [[M]]
-;
-; AVX2-LABEL: @umax_intrinsic_rdx_v4i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
-; AVX2-NEXT:    ret i64 [[TMP2]]
+; DEFAULT-LABEL: @umax_intrinsic_rdx_v4i64(
+; DEFAULT-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
+; DEFAULT-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
+; DEFAULT-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
+; DEFAULT-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
+; DEFAULT-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
+; DEFAULT-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
+; DEFAULT-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
+; DEFAULT-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
+; DEFAULT-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
+; DEFAULT-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
+; DEFAULT-NEXT:    ret i64 [[M]]
 ;
 ; THRESH-LABEL: @umax_intrinsic_rdx_v4i64(
 ; THRESH-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
@@ -1252,5 +1229,3 @@ define void @PR49730() {
   %t14 = call i32 @llvm.umin.i32(i32 %t13, i32 93)
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SSE: {{.*}}

llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll

Lines changed: 21 additions & 11 deletions
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 
@@ -22,10 +22,25 @@ define i32 @smax_v2i32(i32) {
 }
 
 define i32 @smax_v4i32(i32) {
-; CHECK-LABEL: @smax_v4i32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; SSE2-LABEL: @smax_v4i32(
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE2-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
+; SSE2-NEXT:    [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
+; SSE2-NEXT:    [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
+; SSE2-NEXT:    ret i32 [[TMP8]]
+;
+; SSE4-LABEL: @smax_v4i32(
+; SSE4-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
+; SSE4-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; SSE4-NEXT:    ret i32 [[TMP3]]
+;
+; AVX-LABEL: @smax_v4i32(
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; AVX-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, ptr @arr, align 16
   %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
@@ -100,8 +115,3 @@ define i32 @smax_v16i32(i32) {
   %32 = call i32 @llvm.smax.i32(i32 %31, i32 %17)
   ret i32 %32
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
-; SSE2: {{.*}}
-; SSE4: {{.*}}
