Skip to content

Commit a730ed7

Browse files
[SLP]Improve minbitwidth analysis.
This improves the overall minbitwidth analysis in SLP. It allows analyzing trees with store/insertelement root nodes. Also, instead of using a single minbitwidth detected at the very first analysis stage, it tries to detect the best one for each trunc/ext subtree in the graph and uses it for that subtree. This results in better code and less vector register pressure.

Metric: size..text

| Program | size..text results | size..text results0 | diff |
|---|---|---|---|
| test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test | 92549.00 | 92609.00 | 0.1% |
| test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test | 663381.00 | 663493.00 | 0.0% |
| test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test | 663381.00 | 663493.00 | 0.0% |
| test-suite :: MultiSource/Benchmarks/Bullet/bullet.test | 307182.00 | 307214.00 | 0.0% |
| test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test | 1394420.00 | 1394484.00 | 0.0% |
| test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test | 1394420.00 | 1394484.00 | 0.0% |
| test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test | 2040257.00 | 2040273.00 | 0.0% |
| test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test | 12396098.00 | 12395858.00 | -0.0% |
| test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test | 909944.00 | 909768.00 | -0.0% |

SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar instructions remain scalar (good).
Spec2017/x264 - the whole function idct4x4dc is vectorized using <16 x i16> instead of <16 x i32>, and the zext/trunc instructions are removed. In other places the last vector zext/sext is removed and replaced by an extractelement + scalar zext/sext pair.
MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> is replaced by reduce or <4 x i8>.
Spec2017/imagick - removed an extra zext from 2 packs of the operations.
Spec2017/parest - removed an extra zext, replaced by extractelement + scalar zext.
Spec2017/blender - a whole bunch of vector zext/sext instructions are replaced by extractelement + scalar zext/sext, and some extra code is vectorized in smaller types.
Spec2006/gobmk - fixed cost estimation; some small code remains scalar.

Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: llvm#78976
1 parent 1b1aea7 commit a730ed7

15 files changed

+451
-295
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 338 additions & 184 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
1717
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
1818
; CHECK-NEXT: [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
1919
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
20-
; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64>
21-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
20+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
21+
; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
2222
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
2323
; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
24-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
25-
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]]
24+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
25+
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
26+
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
2627
; CHECK-NEXT: [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
2728
; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
2829
; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64

llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
22
;test_i16_extend NOTE: Assertions have been autogenerated by utils/update_test_checks.py
3-
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
3+
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
44
; RUN: cat %t | FileCheck -check-prefix=YAML %s
5-
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
5+
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
66
; RUN: cat %t | FileCheck -check-prefix=YAML %s
77

88

llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,21 +28,11 @@ entry:
2828
define i64 @red_zext_ld_4xi64(ptr %ptr) {
2929
; CHECK-LABEL: @red_zext_ld_4xi64(
3030
; CHECK-NEXT: entry:
31-
; CHECK-NEXT: [[LD0:%.*]] = load i8, ptr [[PTR:%.*]], align 1
32-
; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LD0]] to i64
33-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1
34-
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP]], align 1
35-
; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i8 [[LD1]] to i64
36-
; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[ZEXT]], [[ZEXT_1]]
37-
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 2
38-
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[GEP_1]], align 1
39-
; CHECK-NEXT: [[ZEXT_2:%.*]] = zext i8 [[LD2]] to i64
40-
; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[ADD_1]], [[ZEXT_2]]
41-
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 3
42-
; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[GEP_2]], align 1
43-
; CHECK-NEXT: [[ZEXT_3:%.*]] = zext i8 [[LD3]] to i64
44-
; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[ADD_2]], [[ZEXT_3]]
45-
; CHECK-NEXT: ret i64 [[ADD_3]]
31+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
32+
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
33+
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
34+
; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
35+
; CHECK-NEXT: ret i64 [[TMP3]]
4636
;
4737
entry:
4838
%ld0 = load i8, ptr %ptr

llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -802,9 +802,10 @@ define i64 @red_zext_ld_4xi64(ptr %ptr) {
802802
; CHECK-LABEL: @red_zext_ld_4xi64(
803803
; CHECK-NEXT: entry:
804804
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
805-
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i64>
806-
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
807-
; CHECK-NEXT: ret i64 [[TMP2]]
805+
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
806+
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
807+
; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
808+
; CHECK-NEXT: ret i64 [[TMP3]]
808809
;
809810
entry:
810811
%ld0 = load i8, ptr %ptr

llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,12 @@ define { i64, i64 } @patatino(double %arg) {
1515
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 4), align 16
1616
; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
1717
; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
18-
; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
19-
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
18+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
19+
; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
2020
; CHECK-NEXT: [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
21-
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
22-
; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP11]], 1
21+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
22+
; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
23+
; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP12]], 1
2324
; CHECK-NEXT: ret { i64, i64 } [[T17]]
2425
;
2526
bb:

llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-3 < %s | FileCheck %s
2+
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-6 < %s | FileCheck %s
33

44
define void @t(i64 %v) {
55
; CHECK-LABEL: define void @t(

llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,17 @@ define void @test(i8 %0) {
66
; CHECK-SAME: i8 [[TMP0:%.*]]) {
77
; CHECK-NEXT: entry:
88
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> <i8 0, i8 poison>, i8 [[TMP0]], i32 1
9-
; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16>
10-
; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i32>
11-
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i16> [[TMP2]], zeroinitializer
12-
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
13-
; CHECK-NEXT: [[TMP6:%.*]] = zext i16 [[TMP5]] to i32
14-
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
15-
; CHECK-NEXT: [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
16-
; CHECK-NEXT: [[ADD:%.*]] = or i32 [[TMP6]], [[TMP8]]
9+
; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32>
10+
; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i8> [[TMP1]], zeroinitializer
11+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
12+
; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
13+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
14+
; CHECK-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32
15+
; CHECK-NEXT: [[ADD:%.*]] = or i32 [[TMP5]], [[TMP7]]
1716
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD]], 1
1817
; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SHR]] to i8
1918
; CHECK-NEXT: store i8 [[CONV9]], ptr null, align 1
20-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
19+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
2120
; CHECK-NEXT: ret void
2221
;
2322
entry:

llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,20 @@ define void @test(i64 %d.promoted.i) {
66
; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) {
77
; CHECK-NEXT: entry:
88
; CHECK-NEXT: [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]]
9+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I]], i32 1
10+
; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x i1>
11+
; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer
912
; CHECK-NEXT: [[AND_1_I_1:%.*]] = and i64 0, 0
10-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I_1]], i32 1
11-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i64> [[TMP0]], i64 [[AND_1_I]], i32 9
12-
; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i1>
13-
; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i1> [[TMP2]], zeroinitializer
14-
; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
15-
; CHECK-NEXT: [[TMP5:%.*]] = zext i1 [[TMP4]] to i32
16-
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 0
17-
; CHECK-NEXT: store i32 [[TMP6]], ptr null, align 4
13+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I_1]], i32 1
14+
; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1>
15+
; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer
16+
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]])
17+
; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i32
18+
; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]])
19+
; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
20+
; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]]
21+
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[OP_RDX]], 0
22+
; CHECK-NEXT: store i32 [[TMP10]], ptr null, align 4
1823
; CHECK-NEXT: ret void
1924
;
2025
entry:

llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,15 @@ target triple = "x86_64-unknown-linux-gnu"
1717
define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
1818
; SSE-LABEL: @PR31243_zext(
1919
; SSE-NEXT: entry:
20-
; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
21-
; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
22-
; SSE-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
23-
; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
24-
; SSE-NEXT: [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
25-
; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
20+
; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
21+
; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
22+
; SSE-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
23+
; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
24+
; SSE-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
25+
; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
26+
; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
27+
; SSE-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
28+
; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
2629
; SSE-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1
2730
; SSE-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1
2831
; SSE-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -73,12 +76,15 @@ entry:
7376
define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
7477
; SSE-LABEL: @PR31243_sext(
7578
; SSE-NEXT: entry:
76-
; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
77-
; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
78-
; SSE-NEXT: [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
79-
; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
80-
; SSE-NEXT: [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
81-
; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
79+
; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
80+
; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
81+
; SSE-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
82+
; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
83+
; SSE-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
84+
; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
85+
; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
86+
; SSE-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
87+
; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
8288
; SSE-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1
8389
; SSE-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1
8490
; SSE-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -89,13 +95,12 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
8995
; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
9096
; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
9197
; AVX-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
92-
; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
93-
; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0
94-
; AVX-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
95-
; AVX-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]]
96-
; AVX-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1
97-
; AVX-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
98-
; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]]
98+
; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
99+
; AVX-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
100+
; AVX-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
101+
; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
102+
; AVX-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
103+
; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
99104
; AVX-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1
100105
; AVX-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1
101106
; AVX-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]]

llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ define i32 @phi3UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
1515
; CHECK-NEXT: br label [[BB3]]
1616
; CHECK: bb3:
1717
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 undef, i8 undef, i8 undef>, [[ENTRY:%.*]] ]
18-
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
19-
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
18+
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
19+
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
2020
; CHECK-NEXT: ret i32 [[TMP6]]
2121
;
2222
entry:
@@ -52,8 +52,8 @@ define i32 @phi2UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
5252
; CHECK-NEXT: br label [[BB3]]
5353
; CHECK: bb3:
5454
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 undef, i8 undef>, [[ENTRY:%.*]] ]
55-
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
56-
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
55+
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
56+
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
5757
; CHECK-NEXT: ret i32 [[TMP6]]
5858
;
5959
entry:
@@ -89,8 +89,8 @@ define i32 @phi1UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
8989
; CHECK-NEXT: br label [[BB3]]
9090
; CHECK: bb3:
9191
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 0, i8 undef>, [[ENTRY:%.*]] ]
92-
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
93-
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
92+
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
93+
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
9494
; CHECK-NEXT: ret i32 [[TMP6]]
9595
;
9696
entry:
@@ -127,8 +127,8 @@ define i32 @phi1Undef1PoisonInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %ar
127127
; CHECK-NEXT: br label [[BB3]]
128128
; CHECK: bb3:
129129
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
130-
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
131-
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
130+
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
131+
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
132132
; CHECK-NEXT: ret i32 [[TMP6]]
133133
;
134134
entry:
@@ -165,8 +165,8 @@ define i32 @phi1Undef2PoisonInputs(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %a
165165
; CHECK-NEXT: br label [[BB3]]
166166
; CHECK: bb3:
167167
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 poison, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
168-
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
169-
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
168+
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
169+
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
170170
; CHECK-NEXT: ret i32 [[TMP6]]
171171
;
172172
entry:
@@ -202,8 +202,8 @@ define i32 @phi1Undef1PoisonGapInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8
202202
; CHECK-NEXT: br label [[BB3]]
203203
; CHECK: bb3:
204204
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
205-
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
206-
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
205+
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
206+
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
207207
; CHECK-NEXT: ret i32 [[TMP6]]
208208
;
209209
entry:

0 commit comments

Comments
 (0)