Skip to content

Commit 9529597

Browse files
author
Sjoerd Meijer
committed
Recommit #2: "[LV] Induction Variable does not remain scalar under tail-folding."
This was reverted because of a miscompilation. On closer inspection, the problem was actually visible in a changed LLVM regression test too. This one-line follow-up fix/recommit splats the IV (which, in general, is what we are trying to avoid when it is unnecessary) if tail-folding is requested, even if all users are scalar instructions after vectorisation, because with tail-folding the splat IV is used by the predicate of the masked load/store instructions. The previous version omitted this, which caused the miscompilation. The original commit message was: If tail-folding of the scalar remainder loop is applied, the primary induction variable is splat to a vector and used by the masked load/store vector instructions, thus the IV does not remain scalar. Because we now mark that the IV does not remain scalar for these cases, we don't emit the vector IV if it is not used. Thus, the vectoriser produces less dead code. Thanks to Ayal Zaks for the direction on how to fix this.
1 parent 897d8ee commit 9529597

29 files changed

+3213
-1364
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+10-4
Original file line numberDiff line numberDiff line change
@@ -1909,11 +1909,12 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
19091909
return;
19101910
}
19111911

1912-
// If we haven't yet vectorized the induction variable, splat the scalar
1913-
// induction variable, and build the necessary step vectors.
1914-
// TODO: Don't do it unless the vectorized IV is really required.
1912+
// All IV users are scalar instructions, so only emit a scalar IV, not a
1913+
// vectorised IV. Except when we tail-fold, then the splat IV feeds the
1914+
// predicate used by the masked loads/stores.
19151915
Value *ScalarIV = CreateScalarIV(Step);
1916-
CreateSplatIV(ScalarIV, Step);
1916+
if (!Cost->isScalarEpilogueAllowed())
1917+
CreateSplatIV(ScalarIV, Step);
19171918
buildScalarSteps(ScalarIV, Step, EntryVal, ID);
19181919
}
19191920

@@ -4594,6 +4595,11 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
45944595
if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
45954596
continue;
45964597

4598+
// If tail-folding is applied, the primary induction variable will be used
4599+
// to feed a vector compare.
4600+
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4601+
continue;
4602+
45974603
// Determine if all users of the induction variable are scalar after
45984604
// vectorization.
45994605
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {

llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll

-6
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,6 @@
1616

1717
; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
1818
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
19-
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
20-
; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
21-
; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
2219
; FORCED-NEXT: %0 = add i32 %index, 0
2320
; FORCED-NEXT: %1 = extractvalue { i64, i64 } %sv, 0
2421
; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 0
@@ -68,9 +65,6 @@ declare float @pow(float, float) readnone nounwind
6865

6966
; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
7067
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
71-
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
72-
; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
73-
; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
7468
; FORCED-NEXT: %0 = add i32 %index, 0
7569
; FORCED-NEXT: %1 = extractvalue { float, float } %sv, 0
7670
; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 0

llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll

-6
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,9 @@ define void @_Z1dv() local_unnamed_addr #0 {
6565
; CHECK: vector.body:
6666
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
6767
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
68-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0
69-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
70-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
7168
; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0
7269
; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = add i64 [[TMP0]], [[INDEX]]
7370
; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32
74-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP18]], i32 0
75-
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
76-
; CHECK-NEXT: [[INDUCTION7:%.*]] = add <4 x i32> [[BROADCAST_SPLAT6]], <i32 0, i32 1, i32 2, i32 3>
7771
; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 0
7872
; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]]
7973
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64

llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll

-3
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,6 @@ define i32 @test(float* nocapture readonly %x) {
4444
; CHECK: vector.body:
4545
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
4646
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
47-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
48-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
49-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
5047
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
5148
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]]
5249
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0

llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll

+1-2
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@ target triple = "hexagon"
1414

1515
; CHECK-LABEL: @test1
1616
; CHECK: vector.body:
17-
; CHECK: %induction = add <64 x i32>
18-
; CHECK: icmp ule <64 x i32> %induction
17+
; CHECK: icmp ule <64 x i32> %vec.ind
1918
; CHECK-NOT: load <{{.*}} x i32>
2019

2120

llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll

-3
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@ define dso_local void @test(i32* %Arr, i32 signext %Len) {
1515
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1616
; CHECK: vector.body:
1717
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
18-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
19-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
20-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
2118
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
2219
; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
2320
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[TMP1]]

llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll

+3-4
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,10 @@ define void @func_21() {
1919
; CHECK: vector.body:
2020
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
2121
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 undef, i32 0>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[PRED_STORE_CONTINUE4]] ]
22-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[INDEX]], i32 0
23-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer
24-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
22+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
2523
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
2624
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
27-
; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[INDUCTION]], <i64 4, i64 4>
25+
; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], <i64 4, i64 4>
2826
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
2927
; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
3028
; CHECK: pred.load.if:
@@ -61,6 +59,7 @@ define void @func_21() {
6159
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
6260
; CHECK: pred.store.continue4:
6361
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
62+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
6463
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6
6564
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
6665
; CHECK: middle.block:

llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll

+19
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,25 @@ define void @f1() {
3232
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
3333
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
3434
; CHECK: middle.block:
35+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2
36+
; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]]
37+
; CHECK: scalar.ph:
38+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[BB1:%.*]] ]
39+
; CHECK-NEXT: br label [[BB2:%.*]]
40+
; CHECK: bb2:
41+
; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ]
42+
; CHECK-NEXT: [[_TMP1:%.*]] = zext i16 0 to i64
43+
; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 [[_TMP1]]
44+
; CHECK-NEXT: [[_TMP4:%.*]] = bitcast %rec8* [[_TMP2]] to i16*
45+
; CHECK-NEXT: [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64
46+
; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[_TMP6]]
47+
; CHECK-NEXT: store i16* [[_TMP4]], i16** [[_TMP7]]
48+
; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1
49+
; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
50+
; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop !2
51+
; CHECK: bb3:
52+
; CHECK-NEXT: ret void
53+
;
3554

3655
bb1:
3756
br label %bb2

llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll

-3
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,6 @@ define double @sumIfVector(double* nocapture readonly %arr) {
9797
; AVX: vector.body:
9898
; AVX-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
9999
; AVX-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
100-
; AVX-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
101-
; AVX-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
102-
; AVX-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
103100
; AVX-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
104101
; AVX-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
105102
; AVX-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0

0 commit comments

Comments
 (0)