Skip to content

Commit 3097c60

Browse files
authored
[LoopVectorize][NFC] Rewrite tests to check output of vplan cost model (llvm#113697)
Currently it is very difficult to improve the cost model for tail-folded loops: as soon as you add a VPInstruction::computeCost function that accounts for the costs of instructions such as VPInstruction::ActiveLaneMask and VPInstruction::ExplicitVectorLength, the assert in LoopVectorizationPlanner::computeBestVF fails for some tests. This happens because the VF chosen by the legacy cost model no longer matches the VF chosen by the VPlan cost model (see PR llvm#90191). That assert is what currently makes it difficult to improve the cost model. Hopefully we will be in a position to remove the assert soon; however, in order to do that, we first have to fix up a whole bunch of tests that rely upon the legacy cost model output. I've tried my best to update these tests to use the VPlan output instead. More work is still needed for the VF=1 case, because the VPlan cost model is not printed out in that case; I have not attempted to fix those tests in this patch.
1 parent 3093b29 commit 3097c60

26 files changed

+379
-277
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+15-1
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@
131131
#include "llvm/Support/ErrorHandling.h"
132132
#include "llvm/Support/InstructionCost.h"
133133
#include "llvm/Support/MathExtras.h"
134+
#include "llvm/Support/NativeFormatting.h"
134135
#include "llvm/Support/raw_ostream.h"
135136
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
136137
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
@@ -7424,7 +7425,20 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
74247425

74257426
// Now compute and add the VPlan-based cost.
74267427
Cost += Plan.cost(VF, CostCtx);
7427-
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7428+
#ifndef NDEBUG
7429+
unsigned EstimatedWidth = VF.getKnownMinValue();
7430+
if (VF.isScalable())
7431+
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
7432+
EstimatedWidth *= *VScale;
7433+
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7434+
<< " (Estimated cost per lane: ");
7435+
if (Cost.isValid()) {
7436+
double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7437+
LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7438+
} else /* No point dividing an invalid cost - it will still be invalid */
7439+
LLVM_DEBUG(dbgs() << "Invalid");
7440+
LLVM_DEBUG(dbgs() << ")\n");
7441+
#endif
74287442
return Cost;
74297443
}
74307444

llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
1313
; %var4 a lower scalarization overhead.
1414
;
1515
; COST-LABEL: predicated_udiv_scalarized_operand
16-
; COST: LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
16+
; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3
1717
;
1818
;
1919
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {

llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll

+13-6
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,14 @@
1111
; CM: LV: Found uniform instruction: %a = extractvalue { i64, i64 } %sv, 0
1212
; CM: LV: Found uniform instruction: %b = extractvalue { i64, i64 } %sv, 1
1313

14+
; Ensure the extractvalue + add instructions are hoisted out
15+
; CM: vector.ph:
16+
; CM: CLONE ir<%a> = extractvalue ir<%sv>
17+
; CM: CLONE ir<%b> = extractvalue ir<%sv>
18+
; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
19+
; CM: Successor(s): vector loop
20+
1421
; CM: LV: Scalar loop costs: 5.
15-
; CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
16-
; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1
1722

1823
; Check that the extractvalue operands are actually free in vector code.
1924

@@ -58,12 +63,14 @@ exit:
5863
; Similar to the test case above, but checks getVectorCallCost as well.
5964
declare float @powf(float, float) readnone nounwind
6065

61-
; CM: LV: Found uniform instruction: %a = extractvalue { float, float } %sv, 0
62-
; CM: LV: Found uniform instruction: %b = extractvalue { float, float } %sv, 1
66+
; Ensure the extractvalue + add instructions are hoisted out
67+
; CM: vector.ph:
68+
; CM: CLONE ir<%a> = extractvalue ir<%sv>
69+
; CM: CLONE ir<%b> = extractvalue ir<%sv>
70+
; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
71+
; CM: Successor(s): vector loop
6372

6473
; CM: LV: Scalar loop costs: 14.
65-
; CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0
66-
; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1
6774

6875
; FORCED-LABEL: define void @test_getVectorCallCost
6976

llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll

+7-7
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"
88

99
; CHECK-COST-LABEL: sadd
1010
; CHECK-COST: Found an estimated cost of 6 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
11-
; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
12-
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
13-
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
11+
; CHECK-COST: Cost of 4 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
12+
; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
13+
; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
1414

1515
define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
1616
; CHECK-LABEL: @saddsat(
@@ -129,10 +129,10 @@ while.end: ; preds = %while.body, %entry
129129

130130
; CHECK-COST-LABEL: umin
131131
; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
132-
; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
133-
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
134-
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
135-
; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
132+
; CHECK-COST: Cost of 1 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
133+
; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
134+
; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
135+
; CHECK-COST: Cost of 1 for VF 16: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
136136

137137
define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
138138
; CHECK-LABEL: @umin(

llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
target triple = "aarch64-unknown-linux-gnu"
66

77
; CHECK-COST: Checking a loop in 'fixed_width'
8-
; CHECK-COST: Found an estimated cost of 10 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4
9-
; CHECK-COST: Found an estimated cost of 20 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4
8+
; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<%6>, ir<2>, vp<%5>
9+
; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<%6>, ir<2>, vp<%5>
1010
; CHECK-COST: Selecting VF: 1.
1111

1212
; We should decide this loop is not worth vectorising using fixed width vectors

llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ target triple = "aarch64"
1010
; due to invalid cost decisions. The loop below has a low maximum trip count,
1111
; so will be masked.
1212

13-
; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load
14-
; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load
15-
; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load
16-
; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load
13+
; COST: Cost of 3000000 for VF 2: REPLICATE ir<%0> = load
14+
; COST: Cost of 3000000 for VF 4: REPLICATE ir<%0> = load
15+
; COST: Cost of 3000000 for VF 8: REPLICATE ir<%0> = load
16+
; COST: Cost of 3000000 for VF 16: REPLICATE ir<%0> = load
1717
; COST: LV: Selecting VF: 1.
1818

1919
define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) {

llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll

-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ target triple = "aarch64--linux-gnu"
66

77
; CHECK-LABEL: all_scalar
88
; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
9-
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
109
; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
1110
;
1211
define void @all_scalar(ptr %a, i64 %n) {
@@ -27,7 +26,6 @@ for.end:
2726

2827
; CHECK-LABEL: PR33193
2928
; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
30-
; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
3129
; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions
3230
%struct.a = type { i32, i8 }
3331
define void @PR33193(ptr %a, i64 %n) {

llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
99
;; registers required for a <vscale x 4 x fp128> when trying to maximize
1010
;; vector bandwidth with SVE.
1111

12-
; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %load.ext = fpext double %load.in to fp128
12+
; CHECK: Cost of Invalid for VF vscale x 2: WIDEN-CAST ir<%load.ext> = fpext ir<%load.in> to fp128
1313

1414
define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) {
1515
; CHECK-LABEL: define void @load_ext_trunc_store(

llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll

+26-21
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,59 @@
11
; REQUIRES: asserts
22
; RUN: opt -mtriple=aarch64 -mattr=+sve \
33
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
4-
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
4+
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
55

66
; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
77
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
8-
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
8+
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
99

1010
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
1111
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
12-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
12+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16
1313

1414
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
1515
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
16-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
16+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
1717

18-
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
18+
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v2 \
1919
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
20-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
20+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V2,VF-16
21+
22+
; GENERIC: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.8)
23+
; GENERIC: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.4)
24+
; GENERIC: LV: Selecting VF: vscale x 16
2125

22-
; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
23-
; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
26+
; NEOVERSE-V1: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.8)
27+
; NEOVERSE-V1: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.4)
28+
; NEOVERSE-V1: LV: Selecting VF: vscale x 16
2429

25-
; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
26-
; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
30+
; NEOVERSE-N2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.5)
31+
; NEOVERSE-N2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.8)
32+
; NEOVERSE-N2: LV: Selecting VF: vscale x 16
2733

28-
; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
29-
; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
34+
; NEOVERSE-V2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.5)
35+
; NEOVERSE-V2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.8)
36+
; NEOVERSE-V2: LV: Selecting VF: 16
3037

31-
; VF-4: <4 x i32>
32-
; VF-VSCALE4: <16 x i32>
38+
; VF-16: <16 x i8>
39+
; VF-VSCALE16: <vscale x 16 x i8>
3340
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
3441
entry:
3542
br label %loop
3643

3744
loop:
3845
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
39-
%arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv
40-
%0 = load i32, ptr %arrayidx, align 4
46+
%arrayidx = getelementptr inbounds i8, ptr %c, i64 %iv
47+
%0 = load i8, ptr %arrayidx, align 4
4148
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
4249
%1 = load i8, ptr %arrayidx2, align 4
43-
%zext = zext i8 %1 to i32
44-
%add = add nsw i32 %zext, %0
45-
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv
46-
store i32 %add, ptr %arrayidx5, align 4
50+
%add = add nsw i8 %0, %1
51+
%arrayidx5 = getelementptr inbounds i8, ptr %a, i64 %iv
52+
store i8 %add, ptr %arrayidx5, align 4
4753
%iv.next = add nuw nsw i64 %iv, 1
4854
%exitcond.not = icmp eq i64 %iv.next, 1024
4955
br i1 %exitcond.not, label %exit, label %loop
5056

5157
exit:
5258
ret void
5359
}
54-

llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll

+13-9
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@ target triple = "arm64-apple-ios5.0.0"
66

77
define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
88
; CHECK: LV: Checking a loop in 'selects_1'
9-
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
10-
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
11-
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
129

13-
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
14-
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
15-
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
10+
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
11+
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
12+
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
13+
14+
; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
15+
; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
16+
; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
17+
1618
; CHECK: LV: Selecting VF: 4
1719

1820
entry:
@@ -48,9 +50,11 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
4850

4951
define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) {
5052
; CHECK: LV: Checking a loop in 'multi_user_cmp'
51-
; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %cmp1 = fcmp olt float %load1, 0.000000e+00
52-
; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
53-
; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
53+
; CHECK: Cost of 1 for VF 16:
54+
; CHECK: any-of reduction %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
55+
; CHECK: Cost of 1 for VF 16:
56+
; CHECK: any-of reduction %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
57+
; CHECK: Cost of 4 for VF 16: WIDEN ir<%cmp1> = fcmp olt ir<%load1>, ir<0.000000e+00>
5458
; CHECK: LV: Selecting VF: 16.
5559
entry:
5660
br label %for.body

llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll

+6-6
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
target triple = "aarch64-unknown-linux-gnu"
66

7-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
8-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %addi7 = add i7 %indvars.iv1294, 0
9-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
7+
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
8+
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
9+
; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN ir<%addi7> = add ir<%indvars.iv1294>, ir<0>
1010

1111
define void @induction_i7(ptr %dst) #0 {
1212
; CHECK-LABEL: define void @induction_i7(
@@ -71,9 +71,9 @@ for.end: ; preds = %for.body
7171
}
7272

7373

74-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
75-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %zexti3 = zext i3 %indvars.iv1294 to i64
76-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
74+
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
75+
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
76+
; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext ir<%indvars.iv1294> to i64
7777

7878
define void @induction_i3_zext(ptr %dst) #0 {
7979
; CHECK-LABEL: define void @induction_i3_zext(

llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll

+14-6
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,15 @@
77

88
target triple="aarch64-unknown-linux-gnu"
99

10-
; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
11-
; CHECK: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
12-
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
13-
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
10+
; CHECK-LABEL: LV: Checking a loop in 'fadd_strict32'
11+
; CHECK: Cost of 4 for VF vscale x 2:
12+
; CHECK: in-loop reduction %add = fadd float %0, %sum.07
13+
; CHECK: Cost of 8 for VF vscale x 4:
14+
; CHECK: in-loop reduction %add = fadd float %0, %sum.07
15+
; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
16+
; CHECK-CPU-NEOVERSE-N2: in-loop reduction %add = fadd float %0, %sum.07
17+
; CHECK-CPU-NEOVERSE-N2: Cost of 4 for VF vscale x 4:
18+
; CHECK-CPU-NEOVERSE-N2: in-loop reduction %add = fadd float %0, %sum.07
1419

1520
define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
1621
entry:
@@ -31,8 +36,11 @@ for.end:
3136
}
3237

3338

34-
; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
35-
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
39+
; CHECK-LABEL: LV: Checking a loop in 'fadd_strict64'
40+
; CHECK: Cost of 4 for VF vscale x 2:
41+
; CHECK: in-loop reduction %add = fadd double %0, %sum.07
42+
; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
43+
; CHECK-CPU-NEOVERSE-N2: in-loop reduction %add = fadd double %0, %sum.07
3644

3745
define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 {
3846
entry:

0 commit comments

Comments
 (0)