|
1 | 1 | ; REQUIRES: asserts
|
2 | 2 | ; RUN: opt -mtriple=aarch64 -mattr=+sve \
|
3 | 3 | ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
|
4 |
| -; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4 |
| 4 | +; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16 |
5 | 5 |
|
6 | 6 | ; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
|
7 | 7 | ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
|
8 |
| -; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4 |
| 8 | +; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16 |
9 | 9 |
|
10 | 10 | ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
|
11 | 11 | ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
|
12 |
| -; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4 |
| 12 | +; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16 |
13 | 13 |
|
14 | 14 | ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
|
15 | 15 | ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
|
16 |
| -; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4 |
| 16 | +; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16 |
17 | 17 |
|
18 |
| -; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \ |
| 18 | +; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v2 \ |
19 | 19 | ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
|
20 |
| -; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4 |
| 20 | +; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V2,VF-16 |
| 21 | + |
| 22 | +; GENERIC: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.8) |
| 23 | +; GENERIC: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.4) |
| 24 | +; GENERIC: LV: Selecting VF: vscale x 16 |
21 | 25 |
|
22 |
| -; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2). |
23 |
| -; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2). |
| 26 | +; NEOVERSE-V1: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.8) |
| 27 | +; NEOVERSE-V1: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.4) |
| 28 | +; NEOVERSE-V1: LV: Selecting VF: vscale x 16 |
24 | 29 |
|
25 |
| -; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2). |
26 |
| -; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2). |
| 30 | +; NEOVERSE-N2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.5) |
| 31 | +; NEOVERSE-N2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.8) |
| 32 | +; NEOVERSE-N2: LV: Selecting VF: vscale x 16 |
27 | 33 |
|
28 |
| -; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1). |
29 |
| -; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1). |
| 34 | +; NEOVERSE-V2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.5) |
| 35 | +; NEOVERSE-V2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.8) |
| 36 | +; NEOVERSE-V2: LV: Selecting VF: 16 |
30 | 37 |
|
31 |
| -; VF-4: <4 x i32> |
32 |
| -; VF-VSCALE4: <16 x i32> |
| 38 | +; VF-16: <16 x i8> |
| 39 | +; VF-VSCALE16: <vscale x 16 x i8> |
33 | 40 | define void @test0(ptr %a, ptr %b, ptr %c) #0 {
|
34 | 41 | entry:
|
35 | 42 | br label %loop
|
36 | 43 |
|
37 | 44 | loop:
|
38 | 45 | %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
|
39 |
| - %arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv |
40 |
| - %0 = load i32, ptr %arrayidx, align 4 |
| 46 | + %arrayidx = getelementptr inbounds i8, ptr %c, i64 %iv |
| 47 | + %0 = load i8, ptr %arrayidx, align 4 |
41 | 48 | %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
|
42 | 49 | %1 = load i8, ptr %arrayidx2, align 4
|
43 |
| - %zext = zext i8 %1 to i32 |
44 |
| - %add = add nsw i32 %zext, %0 |
45 |
| - %arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv |
46 |
| - store i32 %add, ptr %arrayidx5, align 4 |
| 50 | + %add = add nsw i8 %0, %1 |
| 51 | + %arrayidx5 = getelementptr inbounds i8, ptr %a, i64 %iv |
| 52 | + store i8 %add, ptr %arrayidx5, align 4 |
47 | 53 | %iv.next = add nuw nsw i64 %iv, 1
|
48 | 54 | %exitcond.not = icmp eq i64 %iv.next, 1024
|
49 | 55 | br i1 %exitcond.not, label %exit, label %loop
|
50 | 56 |
|
51 | 57 | exit:
|
52 | 58 | ret void
|
53 | 59 | }
|
54 |
| - |
|
0 commit comments