Skip to content

Commit 4855a10

Browse files
committed
[X86] Convert fadd/fmul _mm_reduce_* intrinsics to emit llvm.reduction intrinsics (PR47506)
Followup to D87604, having confirmed on PR47506 that we can use the llvm codegen expansion for fadd/fmul as well. Differential Revision: https://reviews.llvm.org/D92940
1 parent 47321c3 commit 4855a10

File tree

4 files changed

+32
-127
lines changed

4 files changed

+32
-127
lines changed

clang/include/clang/Basic/BuiltinsX86.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1876,6 +1876,10 @@ TARGET_BUILTIN(__builtin_ia32_reduce_add_d512, "iV16i", "ncV:512:", "avx512f")
18761876
TARGET_BUILTIN(__builtin_ia32_reduce_add_q512, "OiV8Oi", "ncV:512:", "avx512f")
18771877
TARGET_BUILTIN(__builtin_ia32_reduce_and_d512, "iV16i", "ncV:512:", "avx512f")
18781878
TARGET_BUILTIN(__builtin_ia32_reduce_and_q512, "OiV8Oi", "ncV:512:", "avx512f")
1879+
TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f")
1880+
TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f")
1881+
TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f")
1882+
TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f")
18791883
TARGET_BUILTIN(__builtin_ia32_reduce_mul_d512, "iV16i", "ncV:512:", "avx512f")
18801884
TARGET_BUILTIN(__builtin_ia32_reduce_mul_q512, "OiV8Oi", "ncV:512:", "avx512f")
18811885
TARGET_BUILTIN(__builtin_ia32_reduce_or_d512, "iV16i", "ncV:512:", "avx512f")

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13631,6 +13631,18 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1363113631
CGM.getIntrinsic(Intrinsic::vector_reduce_and, Ops[0]->getType());
1363213632
return Builder.CreateCall(F, {Ops[0]});
1363313633
}
13634+
case X86::BI__builtin_ia32_reduce_fadd_pd512:
13635+
case X86::BI__builtin_ia32_reduce_fadd_ps512: {
13636+
Function *F =
13637+
CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
13638+
return Builder.CreateCall(F, {Ops[0], Ops[1]});
13639+
}
13640+
case X86::BI__builtin_ia32_reduce_fmul_pd512:
13641+
case X86::BI__builtin_ia32_reduce_fmul_ps512: {
13642+
Function *F =
13643+
CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
13644+
return Builder.CreateCall(F, {Ops[0], Ops[1]});
13645+
}
1363413646
case X86::BI__builtin_ia32_reduce_mul_d512:
1363513647
case X86::BI__builtin_ia32_reduce_mul_q512: {
1363613648
Function *F =

clang/lib/Headers/avx512fintrin.h

Lines changed: 8 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -9345,37 +9345,25 @@ _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
93459345
return __builtin_ia32_reduce_or_q512(__W);
93469346
}
93479347

9348-
#define _mm512_mask_reduce_operator(op) \
9349-
__m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \
9350-
__m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \
9351-
__m256d __t3 = __t1 op __t2; \
9352-
__m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
9353-
__m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
9354-
__m128d __t6 = __t4 op __t5; \
9355-
__m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
9356-
__m128d __t8 = __t6 op __t7; \
9357-
return __t8[0]
9358-
93599348
static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
9360-
_mm512_mask_reduce_operator(+);
9349+
return __builtin_ia32_reduce_fadd_pd512(0.0, __W);
93619350
}
93629351

93639352
static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
9364-
_mm512_mask_reduce_operator(*);
9353+
return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
93659354
}
93669355

93679356
static __inline__ double __DEFAULT_FN_ATTRS512
93689357
_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
93699358
__W = _mm512_maskz_mov_pd(__M, __W);
9370-
_mm512_mask_reduce_operator(+);
9359+
return __builtin_ia32_reduce_fadd_pd512(0.0, __W);
93719360
}
93729361

93739362
static __inline__ double __DEFAULT_FN_ATTRS512
93749363
_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
93759364
__W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
9376-
_mm512_mask_reduce_operator(*);
9365+
return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
93779366
}
9378-
#undef _mm512_mask_reduce_operator
93799367

93809368
static __inline__ int __DEFAULT_FN_ATTRS512
93819369
_mm512_reduce_add_epi32(__m512i __W) {
@@ -9421,41 +9409,27 @@ _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
94219409
return __builtin_ia32_reduce_or_d512((__v16si)__W);
94229410
}
94239411

9424-
#define _mm512_mask_reduce_operator(op) \
9425-
__m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \
9426-
__m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \
9427-
__m256 __t3 = __t1 op __t2; \
9428-
__m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
9429-
__m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
9430-
__m128 __t6 = __t4 op __t5; \
9431-
__m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
9432-
__m128 __t8 = __t6 op __t7; \
9433-
__m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
9434-
__m128 __t10 = __t8 op __t9; \
9435-
return __t10[0]
9436-
94379412
static __inline__ float __DEFAULT_FN_ATTRS512
94389413
_mm512_reduce_add_ps(__m512 __W) {
9439-
_mm512_mask_reduce_operator(+);
9414+
return __builtin_ia32_reduce_fadd_ps512(0.0f, __W);
94409415
}
94419416

94429417
static __inline__ float __DEFAULT_FN_ATTRS512
94439418
_mm512_reduce_mul_ps(__m512 __W) {
9444-
_mm512_mask_reduce_operator(*);
9419+
return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
94459420
}
94469421

94479422
static __inline__ float __DEFAULT_FN_ATTRS512
94489423
_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
94499424
__W = _mm512_maskz_mov_ps(__M, __W);
9450-
_mm512_mask_reduce_operator(+);
9425+
return __builtin_ia32_reduce_fadd_ps512(0.0f, __W);
94519426
}
94529427

94539428
static __inline__ float __DEFAULT_FN_ATTRS512
94549429
_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
94559430
__W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
9456-
_mm512_mask_reduce_operator(*);
9431+
return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
94579432
}
9458-
#undef _mm512_mask_reduce_operator
94599433

94609434
static __inline__ long long __DEFAULT_FN_ATTRS512
94619435
_mm512_reduce_max_epi64(__m512i __V) {

clang/test/CodeGen/X86/avx512-reduceIntrin.c

Lines changed: 8 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -115,141 +115,56 @@ int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){
115115

116116
double test_mm512_reduce_add_pd(__m512d __W){
117117
// CHECK-LABEL: @test_mm512_reduce_add_pd(
118-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
119-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
120-
// CHECK: fadd <4 x double> %{{.*}}, %{{.*}}
121-
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
122-
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
123-
// CHECK: fadd <2 x double> %{{.*}}, %{{.*}}
124-
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
125-
// CHECK: fadd <2 x double> %{{.*}}, %{{.*}}
126-
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
118+
// CHECK: call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
127119
return _mm512_reduce_add_pd(__W);
128120
}
129121

130122
double test_mm512_reduce_mul_pd(__m512d __W){
131123
// CHECK-LABEL: @test_mm512_reduce_mul_pd(
132-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
133-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
134-
// CHECK: fmul <4 x double> %{{.*}}, %{{.*}}
135-
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
136-
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
137-
// CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
138-
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
139-
// CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
140-
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
124+
// CHECK: call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
141125
return _mm512_reduce_mul_pd(__W);
142126
}
143127

144128
float test_mm512_reduce_add_ps(__m512 __W){
145129
// CHECK-LABEL: @test_mm512_reduce_add_ps(
146-
// CHECK: bitcast <16 x float> %{{.*}} to <8 x double>
147-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
148-
// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
149-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
150-
// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
151-
// CHECK: fadd <8 x float> %{{.*}}, %{{.*}}
152-
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
153-
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
154-
// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
155-
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
156-
// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
157-
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
158-
// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
159-
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
130+
// CHECK: call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
160131
return _mm512_reduce_add_ps(__W);
161132
}
162133

163134
float test_mm512_reduce_mul_ps(__m512 __W){
164135
// CHECK-LABEL: @test_mm512_reduce_mul_ps(
165-
// CHECK: bitcast <16 x float> %{{.*}} to <8 x double>
166-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
167-
// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
168-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
169-
// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
170-
// CHECK: fmul <8 x float> %{{.*}}, %{{.*}}
171-
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
172-
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
173-
// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
174-
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
175-
// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
176-
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
177-
// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
178-
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
136+
// CHECK: call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
179137
return _mm512_reduce_mul_ps(__W);
180138
}
181139

182140
double test_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W){
183141
// CHECK-LABEL: @test_mm512_mask_reduce_add_pd(
184142
// CHECK: bitcast i8 %{{.*}} to <8 x i1>
185143
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
186-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
187-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
188-
// CHECK: fadd <4 x double> %{{.*}}, %{{.*}}
189-
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
190-
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
191-
// CHECK: fadd <2 x double> %{{.*}}, %{{.*}}
192-
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
193-
// CHECK: fadd <2 x double> %{{.*}}, %{{.*}}
194-
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
144+
// CHECK: call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
195145
return _mm512_mask_reduce_add_pd(__M, __W);
196146
}
197147

198148
double test_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W){
199149
// CHECK-LABEL: @test_mm512_mask_reduce_mul_pd(
200150
// CHECK: bitcast i8 %{{.*}} to <8 x i1>
201151
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
202-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
203-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
204-
// CHECK: fmul <4 x double> %{{.*}}, %{{.*}}
205-
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
206-
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
207-
// CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
208-
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
209-
// CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
210-
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
152+
// CHECK: call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
211153
return _mm512_mask_reduce_mul_pd(__M, __W);
212154
}
213155

214156
float test_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W){
215157
// CHECK-LABEL: @test_mm512_mask_reduce_add_ps(
216-
// CHECK-NEXT: entry:
217158
// CHECK: bitcast i16 %{{.*}} to <16 x i1>
218159
// CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
219-
// CHECK: bitcast <16 x float> %{{.*}} to <8 x double>
220-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
221-
// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
222-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
223-
// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
224-
// CHECK: fadd <8 x float> %{{.*}}, %{{.*}}
225-
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
226-
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
227-
// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
228-
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
229-
// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
230-
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
231-
// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
232-
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
160+
// CHECK: call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
233161
return _mm512_mask_reduce_add_ps(__M, __W);
234162
}
235163

236164
float test_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W){
237165
// CHECK-LABEL: @test_mm512_mask_reduce_mul_ps(
238166
// CHECK: bitcast i16 %{{.*}} to <16 x i1>
239167
// CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> %{{.*}}
240-
// CHECK: bitcast <16 x float> %{{.*}} to <8 x double>
241-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
242-
// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
243-
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
244-
// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
245-
// CHECK: fmul <8 x float> %{{.*}}, %{{.*}}
246-
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
247-
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
248-
// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
249-
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
250-
// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
251-
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
252-
// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
253-
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
168+
// CHECK: call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
254169
return _mm512_mask_reduce_mul_ps(__M, __W);
255170
}

0 commit comments

Comments
 (0)