@@ -162,42 +162,72 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind {
162
162
define <2 x i32 > @splatvar_funnnel_v2i32 (<2 x i32 > %x , <2 x i32 > %amt ) nounwind {
163
163
; SSE2-LABEL: splatvar_funnnel_v2i32:
164
164
; SSE2: # %bb.0:
165
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
166
+ ; SSE2-NEXT: pslld $23, %xmm1
165
167
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
166
- ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
167
- ; SSE2-NEXT: psllq %xmm1, %xmm2
168
- ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
169
- ; SSE2-NEXT: psllq %xmm1, %xmm0
170
- ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
168
+ ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
169
+ ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
170
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
171
+ ; SSE2-NEXT: pmuludq %xmm1, %xmm0
172
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
173
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
174
+ ; SSE2-NEXT: pmuludq %xmm2, %xmm1
175
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
176
+ ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
177
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
178
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
179
+ ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
180
+ ; SSE2-NEXT: por %xmm3, %xmm0
171
181
; SSE2-NEXT: retq
172
182
;
173
183
; SSE41-LABEL: splatvar_funnnel_v2i32:
174
184
; SSE41: # %bb.0:
185
+ ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
186
+ ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
187
+ ; SSE41-NEXT: pslld $23, %xmm1
175
188
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
176
- ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
177
- ; SSE41-NEXT: psllq %xmm1, %xmm2
178
- ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
179
- ; SSE41-NEXT: psllq %xmm1, %xmm0
180
- ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
189
+ ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
190
+ ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
191
+ ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
192
+ ; SSE41-NEXT: pmuludq %xmm2, %xmm3
193
+ ; SSE41-NEXT: pmuludq %xmm1, %xmm0
194
+ ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
195
+ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
196
+ ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
197
+ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
198
+ ; SSE41-NEXT: por %xmm1, %xmm0
181
199
; SSE41-NEXT: retq
182
200
;
183
201
; AVX1-LABEL: splatvar_funnnel_v2i32:
184
202
; AVX1: # %bb.0:
203
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
204
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
205
+ ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
185
206
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
186
- ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
187
- ; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
188
- ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
189
- ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
190
- ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
207
+ ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
208
+ ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
209
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
210
+ ; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
211
+ ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
212
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
213
+ ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
214
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
215
+ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
216
+ ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
191
217
; AVX1-NEXT: retq
192
218
;
193
219
; AVX2-LABEL: splatvar_funnnel_v2i32:
194
220
; AVX2: # %bb.0:
195
- ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
196
- ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
197
- ; AVX2-NEXT: vpsllq %xmm1, %xmm2, %xmm2
198
- ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
199
- ; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0
200
- ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
221
+ ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
222
+ ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
223
+ ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
224
+ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
225
+ ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2
226
+ ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
227
+ ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
228
+ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
229
+ ; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0
230
+ ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
201
231
; AVX2-NEXT: retq
202
232
;
203
233
; AVX512F-LABEL: splatvar_funnnel_v2i32:
@@ -259,12 +289,22 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind
259
289
;
260
290
; X86-SSE2-LABEL: splatvar_funnnel_v2i32:
261
291
; X86-SSE2: # %bb.0:
292
+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
293
+ ; X86-SSE2-NEXT: pslld $23, %xmm1
262
294
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
263
- ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
264
- ; X86-SSE2-NEXT: psllq %xmm1, %xmm2
265
- ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
266
- ; X86-SSE2-NEXT: psllq %xmm1, %xmm0
267
- ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
295
+ ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
296
+ ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
297
+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
298
+ ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
299
+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
300
+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
301
+ ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
302
+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
303
+ ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
304
+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
305
+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
306
+ ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
307
+ ; X86-SSE2-NEXT: por %xmm3, %xmm0
268
308
; X86-SSE2-NEXT: retl
269
309
%splat = shufflevector <2 x i32 > %amt , <2 x i32 > undef , <2 x i32 > zeroinitializer
270
310
%res = call <2 x i32 > @llvm.fshl.v2i32 (<2 x i32 > %x , <2 x i32 > %x , <2 x i32 > %splat )
0 commit comments