Skip to content

Commit 5bc8364

Browse files
committed
[X86] LowerEXTEND_VECTOR_INREG - add sign_extend_vector_inreg fast path for all-signbits source values
If the source operand is already all-signbits, we don't need to create the sign-extended elements - just splat each source element to the destination element width.
1 parent c425cfa commit 5bc8364

14 files changed

+473
-493
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26789,6 +26789,19 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
2678926789
// We should only get here for sign extend.
2679026790
assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
2679126791
assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
26792+
unsigned InNumElts = InVT.getVectorNumElements();
26793+
26794+
// If the source elements are already all-signbits, we don't need to extend,
26795+
// just splat the elements.
26796+
APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
26797+
if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
26798+
unsigned Scale = InNumElts / NumElts;
26799+
SmallVector<int, 16> ShuffleMask;
26800+
for (unsigned I = 0; I != NumElts; ++I)
26801+
ShuffleMask.append(Scale, I);
26802+
return DAG.getBitcast(VT,
26803+
DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
26804+
}
2679226805

2679326806
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
2679426807
SDValue Curr = In;
@@ -26801,8 +26814,6 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
2680126814

2680226815
unsigned DestWidth = DestVT.getScalarSizeInBits();
2680326816
unsigned Scale = DestWidth / InSVT.getSizeInBits();
26804-
26805-
unsigned InNumElts = InVT.getVectorNumElements();
2680626817
unsigned DestElts = DestVT.getVectorNumElements();
2680726818

2680826819
// Build a shuffle mask that takes each input element and places it in the

llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -182,11 +182,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
182182
; SSE2: # %bb.0:
183183
; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
184184
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
185-
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
185+
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
186186
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
187187
; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
188-
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
189-
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
188+
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
189+
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,1,1,4,5,6,7]
190190
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
191191
; SSE2-NEXT: pand %xmm0, %xmm1
192192
; SSE2-NEXT: movmskpd %xmm1, %eax
@@ -196,7 +196,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
196196
; SSSE3-LABEL: v2i8:
197197
; SSSE3: # %bb.0:
198198
; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
199-
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <u,u,u,0,u,u,u,0,u,u,u,1,u,u,u,1>
199+
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
200200
; SSSE3-NEXT: pshufb %xmm1, %xmm0
201201
; SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
202202
; SSSE3-NEXT: pshufb %xmm1, %xmm2
@@ -249,10 +249,10 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
249249
; SSE2-SSSE3-LABEL: v2i16:
250250
; SSE2-SSSE3: # %bb.0:
251251
; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
252-
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
252+
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
253253
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
254254
; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
255-
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7]
255+
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,1,1,4,5,6,7]
256256
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
257257
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
258258
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
@@ -450,10 +450,10 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
450450
; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
451451
; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
452452
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
453-
; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
454-
; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
455-
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
456-
; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
453+
; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
454+
; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
455+
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
456+
; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
457457
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
458458
; SSE2-SSSE3-NEXT: retq
459459
;
@@ -503,9 +503,9 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
503503
; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
504504
; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
505505
; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
506-
; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
507-
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
508-
; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
506+
; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
507+
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
508+
; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
509509
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
510510
; SSE2-SSSE3-NEXT: retq
511511
;
@@ -555,10 +555,10 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
555555
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
556556
; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
557557
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
558-
; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
559-
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
560-
; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm1
561-
; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %eax
558+
; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
559+
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
560+
; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm2
561+
; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
562562
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
563563
; SSE2-SSSE3-NEXT: retq
564564
;

llvm/test/CodeGen/X86/bitcast-setcc-128.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
148148
; SSE2: # %bb.0:
149149
; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
150150
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
151-
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
151+
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
152152
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
153153
; SSE2-NEXT: movmskpd %xmm0, %eax
154154
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -195,7 +195,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
195195
; SSE2-SSSE3-LABEL: v2i16:
196196
; SSE2-SSSE3: # %bb.0:
197197
; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
198-
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
198+
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
199199
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
200200
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
201201
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax

llvm/test/CodeGen/X86/icmp-abs-C-vec.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -963,7 +963,6 @@ define <4 x i1> @eq_or_to_abs_vec4x16(<4 x i16> %x) {
963963
; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
964964
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
965965
; SSE2-NEXT: por %xmm1, %xmm0
966-
; SSE2-NEXT: psrad $16, %xmm0
967966
; SSE2-NEXT: retq
968967
%cmp1 = icmp eq <4 x i16> %x, <i16 88, i16 88, i16 88, i16 88>
969968
%cmp2 = icmp eq <4 x i16> %x, <i16 -88, i16 -88, i16 -88, i16 -88>
@@ -1061,16 +1060,14 @@ define <4 x i1> @ne_and_to_abs_vec4x8(<4 x i8> %x) {
10611060
; SSE2: # %bb.0:
10621061
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <88,88,88,88,u,u,u,u,u,u,u,u,u,u,u,u>
10631062
; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
1064-
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
1065-
; SSE2-NEXT: pxor %xmm2, %xmm1
10661063
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
10671064
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1065+
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
10681066
; SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1069-
; SSE2-NEXT: pxor %xmm2, %xmm0
10701067
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
10711068
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1072-
; SSE2-NEXT: pand %xmm1, %xmm0
1073-
; SSE2-NEXT: psrad $24, %xmm0
1069+
; SSE2-NEXT: por %xmm1, %xmm0
1070+
; SSE2-NEXT: pxor %xmm2, %xmm0
10741071
; SSE2-NEXT: retq
10751072
%cmp1 = icmp ne <4 x i8> %x, <i8 88, i8 88, i8 88, i8 88>
10761073
%cmp2 = icmp ne <4 x i8> %x, <i8 -88, i8 -88, i8 -88, i8 -88>

llvm/test/CodeGen/X86/sext-vsetcc.ll

Lines changed: 35 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,11 @@ define <8 x i16> @cmp_ne_load_const(ptr %x) nounwind {
1010
; SSE-LABEL: cmp_ne_load_const:
1111
; SSE: # %bb.0:
1212
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
13+
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1314
; SSE-NEXT: pxor %xmm1, %xmm1
1415
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
1516
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
1617
; SSE-NEXT: pxor %xmm1, %xmm0
17-
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
18-
; SSE-NEXT: psraw $8, %xmm0
1918
; SSE-NEXT: retq
2019
;
2120
; AVX-LABEL: cmp_ne_load_const:
@@ -36,12 +35,11 @@ define <8 x i16> @cmp_ne_load_const_volatile(ptr %x) nounwind {
3635
; SSE-LABEL: cmp_ne_load_const_volatile:
3736
; SSE: # %bb.0:
3837
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
38+
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3939
; SSE-NEXT: pxor %xmm1, %xmm1
4040
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
4141
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
4242
; SSE-NEXT: pxor %xmm1, %xmm0
43-
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
44-
; SSE-NEXT: psraw $8, %xmm0
4543
; SSE-NEXT: retq
4644
;
4745
; AVX2-LABEL: cmp_ne_load_const_volatile:
@@ -75,15 +73,15 @@ define <8 x i16> @cmp_ne_load_const_extra_use1(ptr %x) nounwind {
7573
; SSE-LABEL: cmp_ne_load_const_extra_use1:
7674
; SSE: # %bb.0:
7775
; SSE-NEXT: subq $24, %rsp
78-
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
79-
; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
76+
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
77+
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
8078
; SSE-NEXT: callq use_v8i8@PLT
81-
; SSE-NEXT: pxor %xmm0, %xmm0
82-
; SSE-NEXT: pcmpeqb (%rsp), %xmm0 # 16-byte Folded Reload
83-
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
84-
; SSE-NEXT: pxor %xmm0, %xmm1
85-
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
86-
; SSE-NEXT: psraw $8, %xmm0
79+
; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
80+
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
81+
; SSE-NEXT: pxor %xmm1, %xmm1
82+
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
83+
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
84+
; SSE-NEXT: pxor %xmm1, %xmm0
8785
; SSE-NEXT: addq $24, %rsp
8886
; SSE-NEXT: retq
8987
;
@@ -135,9 +133,8 @@ define <8 x i16> @cmp_ne_load_const_extra_use2(ptr %x) nounwind {
135133
; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
136134
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
137135
; SSE-NEXT: callq use_v8i1@PLT
138-
; SSE-NEXT: punpcklbw (%rsp), %xmm0 # 16-byte Folded Reload
139-
; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
140-
; SSE-NEXT: psraw $8, %xmm0
136+
; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
137+
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
141138
; SSE-NEXT: addq $24, %rsp
142139
; SSE-NEXT: retq
143140
;
@@ -183,12 +180,11 @@ define <8 x i16> @cmp_ne_no_load_const(i64 %x) nounwind {
183180
; SSE-LABEL: cmp_ne_no_load_const:
184181
; SSE: # %bb.0:
185182
; SSE-NEXT: movq %rdi, %xmm0
183+
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
186184
; SSE-NEXT: pxor %xmm1, %xmm1
187185
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
188186
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
189187
; SSE-NEXT: pxor %xmm1, %xmm0
190-
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
191-
; SSE-NEXT: psraw $8, %xmm0
192188
; SSE-NEXT: retq
193189
;
194190
; AVX2-LABEL: cmp_ne_no_load_const:
@@ -223,11 +219,10 @@ define <4 x i32> @cmp_ult_load_const(ptr %x) nounwind {
223219
; SSE-NEXT: movdqa {{.*#+}} xmm1 = <42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
224220
; SSE-NEXT: pmaxub %xmm0, %xmm1
225221
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
222+
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
223+
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
226224
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
227225
; SSE-NEXT: pxor %xmm1, %xmm0
228-
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
229-
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
230-
; SSE-NEXT: psrad $24, %xmm0
231226
; SSE-NEXT: retq
232227
;
233228
; AVX-LABEL: cmp_ult_load_const:
@@ -251,11 +246,10 @@ define <3 x i32> @cmp_ult_load_const_bad_type(ptr %x) nounwind {
251246
; SSE-NEXT: movdqa {{.*#+}} xmm1 = <42,214,0,u,u,u,u,u,u,u,u,u,u,u,u,u>
252247
; SSE-NEXT: pmaxub %xmm0, %xmm1
253248
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
249+
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
250+
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
254251
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
255252
; SSE-NEXT: pxor %xmm1, %xmm0
256-
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
257-
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
258-
; SSE-NEXT: psrad $24, %xmm0
259253
; SSE-NEXT: retq
260254
;
261255
; AVX2-LABEL: cmp_ult_load_const_bad_type:
@@ -288,12 +282,11 @@ define <3 x i32> @cmp_ult_load_const_bad_type(ptr %x) nounwind {
288282
define <4 x i32> @cmp_slt_load_const(ptr %x) nounwind {
289283
; SSE-LABEL: cmp_slt_load_const:
290284
; SSE: # %bb.0:
291-
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
292-
; SSE-NEXT: movdqa {{.*#+}} xmm1 = <42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
293-
; SSE-NEXT: pcmpgtb %xmm0, %xmm1
294-
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
285+
; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
286+
; SSE-NEXT: movdqa {{.*#+}} xmm0 = <42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
287+
; SSE-NEXT: pcmpgtb %xmm1, %xmm0
288+
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
295289
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
296-
; SSE-NEXT: psrad $24, %xmm0
297290
; SSE-NEXT: retq
298291
;
299292
; AVX-LABEL: cmp_slt_load_const:
@@ -314,11 +307,9 @@ define <2 x i64> @cmp_ne_zextload(ptr %x, ptr %y) nounwind {
314307
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
315308
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
316309
; SSE-NEXT: pcmpeqd %xmm0, %xmm1
310+
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
317311
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
318312
; SSE-NEXT: pxor %xmm1, %xmm0
319-
; SSE-NEXT: pxor %xmm1, %xmm1
320-
; SSE-NEXT: pcmpgtd %xmm0, %xmm1
321-
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
322313
; SSE-NEXT: retq
323314
;
324315
; AVX2-LABEL: cmp_ne_zextload:
@@ -353,10 +344,9 @@ define <8 x i16> @cmp_ugt_zextload(ptr %x, ptr %y) nounwind {
353344
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
354345
; SSE-NEXT: pminub %xmm0, %xmm1
355346
; SSE-NEXT: pcmpeqb %xmm0, %xmm1
347+
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
356348
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
357349
; SSE-NEXT: pxor %xmm1, %xmm0
358-
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
359-
; SSE-NEXT: psraw $8, %xmm0
360350
; SSE-NEXT: retq
361351
;
362352
; AVX-LABEL: cmp_ugt_zextload:
@@ -381,7 +371,6 @@ define <8 x i16> @cmp_sgt_zextload(ptr %x, ptr %y) nounwind {
381371
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
382372
; SSE-NEXT: pcmpgtb %xmm1, %xmm0
383373
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
384-
; SSE-NEXT: psraw $8, %xmm0
385374
; SSE-NEXT: retq
386375
;
387376
; AVX-LABEL: cmp_sgt_zextload:
@@ -407,10 +396,9 @@ define <8 x i32> @cmp_ne_zextload_from_legal_op(ptr %x, ptr %y) {
407396
; SSE-NEXT: pcmpeqw (%rsi), %xmm0
408397
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
409398
; SSE-NEXT: pxor %xmm0, %xmm1
399+
; SSE-NEXT: movdqa %xmm1, %xmm0
410400
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
411-
; SSE-NEXT: psrad $16, %xmm0
412401
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
413-
; SSE-NEXT: psrad $16, %xmm1
414402
; SSE-NEXT: retq
415403
;
416404
; AVX2-LABEL: cmp_ne_zextload_from_legal_op:
@@ -448,16 +436,15 @@ define <8 x i32> @PR50055(ptr %src, ptr %dst) nounwind {
448436
; SSE-NEXT: movdqa %xmm1, %xmm0
449437
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
450438
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
439+
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
451440
; SSE-NEXT: pcmpeqb %xmm3, %xmm2
452441
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
453442
; SSE-NEXT: pxor %xmm2, %xmm3
454-
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
455-
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
456-
; SSE-NEXT: psrad $24, %xmm3
457-
; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
458-
; SSE-NEXT: psrad $24, %xmm2
459-
; SSE-NEXT: movdqa %xmm2, 16(%rsi)
460-
; SSE-NEXT: movdqa %xmm3, (%rsi)
443+
; SSE-NEXT: movdqa %xmm3, %xmm2
444+
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
445+
; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
446+
; SSE-NEXT: movdqa %xmm3, 16(%rsi)
447+
; SSE-NEXT: movdqa %xmm2, (%rsi)
461448
; SSE-NEXT: retq
462449
;
463450
; AVX-LABEL: PR50055:
@@ -484,12 +471,11 @@ define <8 x i16> @multi_use_narrower_size(ptr %src, ptr %dst) nounwind {
484471
; SSE-NEXT: pxor %xmm2, %xmm2
485472
; SSE-NEXT: movdqa %xmm1, %xmm0
486473
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
487-
; SSE-NEXT: pcmpeqb %xmm2, %xmm1
488474
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
475+
; SSE-NEXT: pcmpeqb %xmm2, %xmm1
476+
; SSE-NEXT: movdqa %xmm1, %xmm2
489477
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
490-
; SSE-NEXT: psrad $24, %xmm2
491478
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
492-
; SSE-NEXT: psrad $24, %xmm1
493479
; SSE-NEXT: movdqa %xmm1, 16(%rsi)
494480
; SSE-NEXT: movdqa %xmm2, (%rsi)
495481
; SSE-NEXT: retq
@@ -524,9 +510,8 @@ define <8 x i32> @multi_use_wider_size(ptr %src, ptr %dst) nounwind {
524510
; SSE-NEXT: movdqa %xmm1, %xmm0
525511
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
526512
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
527-
; SSE-NEXT: pcmpeqb %xmm3, %xmm2
528513
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
529-
; SSE-NEXT: psraw $8, %xmm2
514+
; SSE-NEXT: pcmpeqb %xmm3, %xmm2
530515
; SSE-NEXT: movdqa %xmm2, (%rsi)
531516
; SSE-NEXT: retq
532517
;
@@ -556,13 +541,12 @@ define <4 x i64> @PR50055_signed(ptr %src, ptr %dst) {
556541
; SSE-NEXT: psrad $24, %xmm0
557542
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
558543
; SSE-NEXT: psrad $24, %xmm1
544+
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
559545
; SSE-NEXT: pxor %xmm3, %xmm3
560546
; SSE-NEXT: pcmpgtb %xmm3, %xmm2
561-
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
547+
; SSE-NEXT: movdqa %xmm2, %xmm3
562548
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
563-
; SSE-NEXT: psrad $24, %xmm3
564549
; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
565-
; SSE-NEXT: psrad $24, %xmm2
566550
; SSE-NEXT: movdqa %xmm2, 16(%rsi)
567551
; SSE-NEXT: movdqa %xmm3, (%rsi)
568552
; SSE-NEXT: retq

0 commit comments

Comments
 (0)