
Commit 6b40331

[SVE] Scalarize fixed length masked loads and stores.
When adding support for scalable vector masked loads and stores, we accidentally enabled the same support for fixed length vectors. This patch restricts the support to scalable vectors only, ensuring fixed length vectors are treated the same regardless of SVE support.

Differential Revision: https://reviews.llvm.org/D83341
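For illustration, a minimal sketch (not part of the commit) of the distinction the patch draws: under +sve, a masked load of a scalable type such as <vscale x 2 x i64> is still reported legal by isLegalMaskedLoadStore and left intact, while the fixed-length forms exercised by the new tests below are scalarized whether or not SVE is available. Note that both RUN lines in the new tests check against the same FileCheck prefix, confirming identical expansion with and without +sve.

; Hypothetical example, not from this commit: a scalable-vector masked load
; that remains a legal masked intrinsic under +sve, so the
; -scalarize-masked-mem-intrin pass does not expand it.
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)

define <vscale x 2 x i64> @legal_scalable_load(<vscale x 2 x i64>* %p, <vscale x 2 x i1> %mask, <vscale x 2 x i64> %passthru) {
  %ret = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* %p, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> %passthru)
  ret <vscale x 2 x i64> %ret
}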
Parent: 614fb09

3 files changed: 194 additions, 2 deletions


llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 2 additions & 2 deletions
@@ -159,10 +159,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
 
   bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
-    if (!isa<VectorType>(DataType) || !ST->hasSVE())
+    if (!isa<ScalableVectorType>(DataType) || !ST->hasSVE())
       return false;
 
-    Type *Ty = cast<VectorType>(DataType)->getElementType();
+    Type *Ty = cast<ScalableVectorType>(DataType)->getElementType();
     if (Ty->isBFloatTy() || Ty->isHalfTy() ||
         Ty->isFloatTy() || Ty->isDoubleTy())
       return true;
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck %s
; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s

define <2 x i64> @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %passthru) {
; CHECK-LABEL: @scalarize_v2i64(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK: cond.load:
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP5]], i64 0
; CHECK-NEXT: br label [[ELSE]]
; CHECK: else:
; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP6]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-NEXT: [[TMP7:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i2 [[TMP7]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK: cond.load1:
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP9]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP10]], i64 1
; CHECK-NEXT: br label [[ELSE2]]
; CHECK: else2:
; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP11]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
;
  %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 128, <2 x i1> %mask, <2 x i64> %passthru)
  ret <2 x i64> %ret
}

define <2 x i64> @scalarize_v2i64_ones_mask(<2 x i64>* %p, <2 x i64> %passthru) {
; CHECK-LABEL: @scalarize_v2i64_ones_mask(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[P:%.*]], align 8
; CHECK-NEXT: ret <2 x i64> [[TMP1]]
;
  %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
  ret <2 x i64> %ret
}

define <2 x i64> @scalarize_v2i64_zero_mask(<2 x i64>* %p, <2 x i64> %passthru) {
; CHECK-LABEL: @scalarize_v2i64_zero_mask(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
; CHECK-NEXT: ret <2 x i64> [[PASSTHRU:%.*]]
;
  %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
  ret <2 x i64> %ret
}

define <2 x i64> @scalarize_v2i64_const_mask(<2 x i64>* %p, <2 x i64> %passthru) {
; CHECK-LABEL: @scalarize_v2i64_const_mask(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP3]], i64 1
; CHECK-NEXT: ret <2 x i64> [[TMP4]]
;
  %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
  ret <2 x i64> %ret
}
; This uses a byte-sized but non-power-of-2 element size. It used to crash due to a bad alignment calculation.
define <2 x i24> @scalarize_v2i24(<2 x i24>* %p, <2 x i1> %mask, <2 x i24> %passthru) {
; CHECK-LABEL: @scalarize_v2i24(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i24>* [[P:%.*]] to i24*
; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK: cond.load:
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i24, i24* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i24, i24* [[TMP4]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP5]], i64 0
; CHECK-NEXT: br label [[ELSE]]
; CHECK: else:
; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP6]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-NEXT: [[TMP7:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i2 [[TMP7]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK: cond.load1:
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i24, i24* [[TMP1]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = load i24, i24* [[TMP9]], align 1
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP10]], i64 1
; CHECK-NEXT: br label [[ELSE2]]
; CHECK: else2:
; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP11]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-NEXT: ret <2 x i24> [[RES_PHI_ELSE3]]
;
  %ret = call <2 x i24> @llvm.masked.load.v2i24.p0v2i24(<2 x i24>* %p, i32 8, <2 x i1> %mask, <2 x i24> %passthru)
  ret <2 x i24> %ret
}
; This also uses a byte-sized but non-power-of-2 element size. It used to crash due to a bad alignment calculation.
define <2 x i48> @scalarize_v2i48(<2 x i48>* %p, <2 x i1> %mask, <2 x i48> %passthru) {
; CHECK-LABEL: @scalarize_v2i48(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i48>* [[P:%.*]] to i48*
; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK: cond.load:
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i48, i48* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = load i48, i48* [[TMP4]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i48> [[PASSTHRU:%.*]], i48 [[TMP5]], i64 0
; CHECK-NEXT: br label [[ELSE]]
; CHECK: else:
; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i48> [ [[TMP6]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-NEXT: [[TMP7:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i2 [[TMP7]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK: cond.load1:
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i48, i48* [[TMP1]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = load i48, i48* [[TMP9]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i48> [[RES_PHI_ELSE]], i48 [[TMP10]], i64 1
; CHECK-NEXT: br label [[ELSE2]]
; CHECK: else2:
; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i48> [ [[TMP11]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-NEXT: ret <2 x i48> [[RES_PHI_ELSE3]]
;
  %ret = call <2 x i48> @llvm.masked.load.v2i48.p0v2i48(<2 x i48>* %p, i32 16, <2 x i1> %mask, <2 x i48> %passthru)
  ret <2 x i48> %ret
}

declare <2 x i24> @llvm.masked.load.v2i24.p0v2i24(<2 x i24>*, i32, <2 x i1>, <2 x i24>)
declare <2 x i48> @llvm.masked.load.v2i48.p0v2i48(<2 x i48>*, i32, <2 x i1>, <2 x i48>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck %s
; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s

define void @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %data) {
; CHECK-LABEL: @scalarize_v2i64(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
; CHECK: cond.store:
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
; CHECK-NEXT: store i64 [[TMP4]], i64* [[TMP5]], align 8
; CHECK-NEXT: br label [[ELSE]]
; CHECK: else:
; CHECK-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
; CHECK: cond.store1:
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
; CHECK-NEXT: store i64 [[TMP8]], i64* [[TMP9]], align 8
; CHECK-NEXT: br label [[ELSE2]]
; CHECK: else2:
; CHECK-NEXT: ret void
;
  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %data, <2 x i64>* %p, i32 128, <2 x i1> %mask)
  ret void
}

define void @scalarize_v2i64_ones_mask(<2 x i64>* %p, <2 x i64> %data) {
; CHECK-LABEL: @scalarize_v2i64_ones_mask(
; CHECK-NEXT: store <2 x i64> [[DATA:%.*]], <2 x i64>* [[P:%.*]], align 8
; CHECK-NEXT: ret void
;
  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %data, <2 x i64>* %p, i32 8, <2 x i1> <i1 true, i1 true>)
  ret void
}

define void @scalarize_v2i64_zero_mask(<2 x i64>* %p, <2 x i64> %data) {
; CHECK-LABEL: @scalarize_v2i64_zero_mask(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
; CHECK-NEXT: ret void
;
  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %data, <2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 false>)
  ret void
}

define void @scalarize_v2i64_const_mask(<2 x i64>* %p, <2 x i64> %data) {
; CHECK-LABEL: @scalarize_v2i64_const_mask(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
; CHECK-NEXT: store i64 [[TMP2]], i64* [[TMP3]], align 8
; CHECK-NEXT: ret void
;
  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %data, <2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 true>)
  ret void
}

declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
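As a counterpart to the fixed-length stores above, a hypothetical sketch (not part of the commit) of the scalable-vector store that isLegalMaskedLoadStore continues to accept under +sve, so the pass leaves it as a masked intrinsic rather than expanding it:

; Hypothetical example, not from this commit: the scalable equivalent of the
; fixed-length stores above; with +sve this remains legal and is not scalarized.
declare void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)

define void @legal_scalable_store(<vscale x 2 x i64>* %p, <vscale x 2 x i1> %mask, <vscale x 2 x i64> %data) {
  call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64>* %p, i32 8, <vscale x 2 x i1> %mask)
  ret void
}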
