
Commit b28f407
[SLP]Improve reduction cost model for scalars.
Instead of using an abstract cost for the scalar reduction ops, try to use the cost of the actual reduction operation instructions, where possible. Also, remove the estimation of the vectorized GEP pointers for reduced loads, since it is already handled in the tree.

Differential Revision: https://reviews.llvm.org/D148036
Parent: caea93c
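For orientation, here is a hedged sketch of the modeling change (illustration only, not code from this commit: `oldScalarCost` and `newScalarCost` are made-up names, and the new-model helper is a simplification of the patch's `EvaluateScalarCost` lambda that omits its per-user validity fallback):

```cpp
// Sketch only -- contrasts the two ways of charging the scalar side of a
// horizontal reduction, under simplified standalone helpers.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Old model: one abstract per-opcode cost, repeated for the N-1 scalar
// reduction steps (the former `ScalarCost *= (ReduxWidth - 1)`).
static InstructionCost oldScalarCost(InstructionCost PerOpCost,
                                     unsigned ReduxWidth) {
  PerOpCost *= ReduxWidth - 1;
  return PerOpCost;
}

// New model (simplified): walk the reduced values and charge the TTI cost of
// the real instructions that consume each of them, falling back to the
// abstract per-opcode estimate when a value has extra users outside the
// reduction (a cmp+select min/max value has two in-reduction users, a plain
// arithmetic reduction value has one).
static InstructionCost newScalarCost(ArrayRef<Value *> ReducedVals,
                                     bool IsCmpSelMinMax,
                                     const TargetTransformInfo &TTI,
                                     function_ref<InstructionCost()> GenCostFn) {
  constexpr auto Kind = TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  // One of the reduced values needs no further scalar reduction step.
  for (Value *RdxVal : drop_end(ReducedVals)) {
    if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
      Cost += GenCostFn();
      continue;
    }
    for (User *U : RdxVal->users())
      Cost += TTI.getInstructionCost(cast<Instruction>(U), Kind);
  }
  return Cost;
}
```

The effect is that a reduced value whose real reduction users can be costed directly is charged exactly that amount, instead of a uniform per-opcode estimate scaled by `ReduxWidth - 1`.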

File tree: 3 files changed, +84 −92 lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 47 additions & 40 deletions
@@ -1148,18 +1148,6 @@ class BoUpSLP {
   /// Construct a vectorizable tree that starts at \p Roots.
   void buildTree(ArrayRef<Value *> Roots);
 
-  /// Checks if the very first tree node is going to be vectorized.
-  bool isVectorizedFirstNode() const {
-    return !VectorizableTree.empty() &&
-           VectorizableTree.front()->State == TreeEntry::Vectorize;
-  }
-
-  /// Returns the main instruction for the very first node.
-  Instruction *getFirstNodeMainOp() const {
-    assert(!VectorizableTree.empty() && "No tree to get the first node from");
-    return VectorizableTree.front()->getMainOp();
-  }
-
   /// Returns whether the root node has in-tree uses.
   bool doesRootHaveInTreeUses() const {
     return !VectorizableTree.empty() &&
@@ -13340,22 +13328,7 @@ class HorizontalReduction {
         // Estimate cost.
         InstructionCost TreeCost = V.getTreeCost(VL);
         InstructionCost ReductionCost =
-            getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
-        if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
-          Instruction *MainOp = V.getFirstNodeMainOp();
-          for (Value *V : VL) {
-            auto *VI = dyn_cast<LoadInst>(V);
-            // Add the costs of scalar GEP pointers, to be removed from the
-            // code.
-            if (!VI || VI == MainOp)
-              continue;
-            auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
-            if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
-              continue;
-            TreeCost -= TTI->getArithmeticInstrCost(
-                Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
-          }
-        }
+            getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
         InstructionCost Cost = TreeCost + ReductionCost;
         LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
         if (!Cost.isValid())
@@ -13591,7 +13564,8 @@ class HorizontalReduction {
   /// Calculate the cost of a reduction.
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
-                                   unsigned ReduxWidth, FastMathFlags FMF) {
+                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
+                                   FastMathFlags FMF) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Value *FirstReducedVal = ReducedVals.front();
     Type *ScalarTy = FirstReducedVal->getType();
@@ -13600,6 +13574,35 @@
     // If all of the reduced values are constant, the vector cost is 0, since
     // the reduction value can be calculated at the compile time.
     bool AllConsts = allConstant(ReducedVals);
+    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
+      InstructionCost Cost = 0;
+      // Scalar cost is repeated for N-1 elements.
+      int Cnt = ReducedVals.size();
+      for (Value *RdxVal : ReducedVals) {
+        if (Cnt == 1)
+          break;
+        --Cnt;
+        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
+          Cost += GenCostFn();
+          continue;
+        }
+        InstructionCost ScalarCost = 0;
+        for (User *U : RdxVal->users()) {
+          auto *RdxOp = cast<Instruction>(U);
+          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
+            continue;
+          }
+          ScalarCost = InstructionCost::getInvalid();
+          break;
+        }
+        if (ScalarCost.isValid())
+          Cost += ScalarCost;
+        else
+          Cost += GenCostFn();
+      }
+      return Cost;
+    };
     switch (RdxKind) {
     case RecurKind::Add:
     case RecurKind::Mul:
@@ -13612,7 +13615,9 @@
       if (!AllConsts)
         VectorCost =
            TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
-      ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+      });
       break;
     }
     case RecurKind::FMax:
@@ -13626,10 +13631,12 @@
                                              /*IsUnsigned=*/false, CostKind);
       }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
-      ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind) +
-                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind) +
+               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind);
+      });
       break;
     }
     case RecurKind::SMax:
@@ -13646,18 +13653,18 @@
                                              IsUnsigned, CostKind);
       }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
-      ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind) +
-                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind) +
+               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind);
+      });
       break;
     }
     default:
       llvm_unreachable("Expected arithmetic or min/max reduction operation");
     }
 
-    // Scalar cost is repeated for N-1 elements.
-    ScalarCost *= (ReduxWidth - 1);
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                       << " for reduction that starts with " << *FirstReducedVal
                       << " (It is a splitting reduction)\n");

llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

Lines changed: 16 additions & 41 deletions
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE,SSE4
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -passes=slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,THRESH
 
 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
@@ -1113,41 +1113,18 @@ define i16 @smin_intrinsic_rdx_v8i16(ptr %p0) {
 }
 
 define i64 @umax_intrinsic_rdx_v4i64(ptr %p0) {
-; SSE2-LABEL: @umax_intrinsic_rdx_v4i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
-; SSE2-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
-; SSE2-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
-; SSE2-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
-; SSE2-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
-; SSE2-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
-; SSE2-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
-; SSE2-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
-; SSE2-NEXT:    ret i64 [[M]]
-;
-; SSE4-LABEL: @umax_intrinsic_rdx_v4i64(
-; SSE4-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
-; SSE4-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
-; SSE4-NEXT:    ret i64 [[TMP2]]
-;
-; AVX-LABEL: @umax_intrinsic_rdx_v4i64(
-; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
-; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
-; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
-; AVX-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
-; AVX-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
-; AVX-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
-; AVX-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
-; AVX-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
-; AVX-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
-; AVX-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
-; AVX-NEXT:    ret i64 [[M]]
-;
-; AVX2-LABEL: @umax_intrinsic_rdx_v4i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
-; AVX2-NEXT:    ret i64 [[TMP2]]
+; DEFAULT-LABEL: @umax_intrinsic_rdx_v4i64(
+; DEFAULT-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
+; DEFAULT-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
+; DEFAULT-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
+; DEFAULT-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
+; DEFAULT-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
+; DEFAULT-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
+; DEFAULT-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
+; DEFAULT-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
+; DEFAULT-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
+; DEFAULT-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
+; DEFAULT-NEXT:    ret i64 [[M]]
 ;
 ; THRESH-LABEL: @umax_intrinsic_rdx_v4i64(
 ; THRESH-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
@@ -1252,5 +1229,3 @@ define void @PR49730() {
   %t14 = call i32 @llvm.umin.i32(i32 %t13, i32 93)
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SSE: {{.*}}

llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll

Lines changed: 21 additions & 11 deletions
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 
@@ -22,10 +22,25 @@ define i32 @smax_v2i32(i32) {
 }
 
 define i32 @smax_v4i32(i32) {
-; CHECK-LABEL: @smax_v4i32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; SSE2-LABEL: @smax_v4i32(
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE2-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
+; SSE2-NEXT:    [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
+; SSE2-NEXT:    [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
+; SSE2-NEXT:    ret i32 [[TMP8]]
+;
+; SSE4-LABEL: @smax_v4i32(
+; SSE4-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
+; SSE4-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; SSE4-NEXT:    ret i32 [[TMP3]]
+;
+; AVX-LABEL: @smax_v4i32(
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; AVX-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, ptr @arr, align 16
   %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
@@ -100,8 +115,3 @@ define i32 @smax_v16i32(i32) {
   %32 = call i32 @llvm.smax.i32(i32 %31, i32 %17)
   ret i32 %32
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
-; SSE2: {{.*}}
-; SSE4: {{.*}}
