Skip to content

Commit e229086

Browse files
committed
[X86] ReplaceNodeResults - widen sub-128-bit vector truncations if it would allow them to use PACKSS/PACKUS
We currently just scalarize sub-128-bit vector truncations, but if the input vector has sufficient sign bits/zero bits then we should try to use PACKSS/PACKUS on a widened vector whose upper elements are don't-care. Shuffle lowering will struggle to detect this if we wait until the scalarization has been revectorized as a shuffle. Another step towards issue llvm#63710
1 parent d84450b commit e229086

10 files changed

+219
-267
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3716,6 +3716,12 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
37163716
return DAG.getBuildVector(ResultVT, dl,
37173717
Vec->ops().slice(IdxVal, ElemsPerChunk));
37183718

3719+
// Check if we're extracting the upper undef of a widening pattern.
3720+
if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
3721+
Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
3722+
isNullConstant(Vec.getOperand(2)))
3723+
return DAG.getUNDEF(ResultVT);
3724+
37193725
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
37203726
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
37213727
}
@@ -20016,6 +20022,14 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
2001620022
SDValue Lo, Hi;
2001720023
std::tie(Lo, Hi) = splitVector(In, DAG, DL);
2001820024

20025+
// If Hi is undef, then don't bother packing it and widen the result instead.
20026+
if (Hi.isUndef()) {
20027+
EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20028+
if (SDValue Res =
20029+
truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20030+
return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20031+
}
20032+
2001920033
unsigned SubSizeInBits = SrcSizeInBits / 2;
2002020034
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
2002120035
OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
@@ -31974,9 +31988,45 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
3197431988
EVT InEltVT = InVT.getVectorElementType();
3197531989
EVT EltVT = VT.getVectorElementType();
3197631990
unsigned WidenNumElts = WidenVT.getVectorNumElements();
31977-
3197831991
unsigned InBits = InVT.getSizeInBits();
31992+
3197931993
if (128 % InBits == 0) {
31994+
// See if there are sufficient leading bits to perform a PACKUS/PACKSS.
31995+
// Skip for AVX512 unless this will be a single stage truncation.
31996+
if ((InEltVT == MVT::i16 || InEltVT == MVT::i32) &&
31997+
(EltVT == MVT::i8 || EltVT == MVT::i16) &&
31998+
(!Subtarget.hasAVX512() || InBits == (2 * VT.getSizeInBits()))) {
31999+
unsigned NumPackedSignBits =
32000+
std::min<unsigned>(EltVT.getSizeInBits(), 16);
32001+
unsigned NumPackedZeroBits =
32002+
Subtarget.hasSSE41() ? NumPackedSignBits : 8;
32003+
32004+
// Use PACKUS if the input has zero-bits that extend all the way to the
32005+
// packed/truncated value. e.g. masks, zext_in_reg, etc.
32006+
KnownBits Known = DAG.computeKnownBits(In);
32007+
unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
32008+
bool UsePACKUS =
32009+
NumLeadingZeroBits >= (InEltVT.getSizeInBits() - NumPackedZeroBits);
32010+
32011+
// Use PACKSS if the input has sign-bits that extend all the way to the
32012+
// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
32013+
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
32014+
bool UsePACKSS =
32015+
NumSignBits > (InEltVT.getSizeInBits() - NumPackedSignBits);
32016+
32017+
if (UsePACKUS || UsePACKSS) {
32018+
SDValue WidenIn =
32019+
widenSubVector(In, false, Subtarget, DAG, dl,
32020+
InEltVT.getSizeInBits() * WidenNumElts);
32021+
if (SDValue Res = truncateVectorWithPACK(
32022+
UsePACKUS ? X86ISD::PACKUS : X86ISD::PACKSS, WidenVT, WidenIn,
32023+
dl, DAG, Subtarget)) {
32024+
Results.push_back(Res);
32025+
return;
32026+
}
32027+
}
32028+
}
32029+
3198032030
// 128 bit and smaller inputs should avoid truncate all together and
3198132031
// just use a build_vector that will become a shuffle.
3198232032
// TODO: Widen and use a shuffle directly?
@@ -31992,6 +32042,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
3199232042
Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
3199332043
return;
3199432044
}
32045+
3199532046
// With AVX512 there are some cases that can use a target specific
3199632047
// truncate node to go from 256/512 to less than 128 with zeros in the
3199732048
// upper elements of the 128 bit result.

llvm/test/CodeGen/X86/avx512-vec-cmp.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,9 +1458,7 @@ define void @half_vec_compare(ptr %x, ptr %y) {
14581458
; KNL-NEXT: cmovpl %ecx, %eax ## encoding: [0x0f,0x4a,0xc1]
14591459
; KNL-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
14601460
; KNL-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc2,0x01]
1461-
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
1462-
; KNL-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
1463-
; KNL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
1461+
; KNL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0]
14641462
; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
14651463
; KNL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
14661464
; KNL-NEXT: vpextrw $0, %xmm0, (%rsi) ## encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
@@ -1492,9 +1490,7 @@ define void @half_vec_compare(ptr %x, ptr %y) {
14921490
; AVX512BW-NEXT: cmovpl %ecx, %eax ## encoding: [0x0f,0x4a,0xc1]
14931491
; AVX512BW-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
14941492
; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x01]
1495-
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
1496-
; AVX512BW-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
1497-
; AVX512BW-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
1493+
; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0]
14981494
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
14991495
; AVX512BW-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
15001496
; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]

llvm/test/CodeGen/X86/buildvec-insertvec.ll

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,18 @@ define void @foo(<3 x float> %in, ptr nocapture %out) nounwind {
88
; SSE2-LABEL: foo:
99
; SSE2: # %bb.0:
1010
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
11-
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
12-
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
13-
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
14-
; SSE2-NEXT: shll $8, %ecx
15-
; SSE2-NEXT: orl %eax, %ecx
16-
; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
17-
; SSE2-NEXT: shll $16, %eax
18-
; SSE2-NEXT: orl %ecx, %eax
11+
; SSE2-NEXT: packuswb %xmm0, %xmm0
12+
; SSE2-NEXT: packuswb %xmm0, %xmm0
13+
; SSE2-NEXT: movd %xmm0, %eax
1914
; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000
2015
; SSE2-NEXT: movl %eax, (%rdi)
2116
; SSE2-NEXT: retq
2217
;
2318
; SSE41-LABEL: foo:
2419
; SSE41: # %bb.0:
2520
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
26-
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
21+
; SSE41-NEXT: packusdw %xmm0, %xmm0
22+
; SSE41-NEXT: packuswb %xmm0, %xmm0
2723
; SSE41-NEXT: movl $255, %eax
2824
; SSE41-NEXT: pinsrb $3, %eax, %xmm0
2925
; SSE41-NEXT: movd %xmm0, (%rdi)
@@ -32,7 +28,8 @@ define void @foo(<3 x float> %in, ptr nocapture %out) nounwind {
3228
; AVX-LABEL: foo:
3329
; AVX: # %bb.0:
3430
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
35-
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
31+
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
32+
; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
3633
; AVX-NEXT: movl $255, %eax
3734
; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
3835
; AVX-NEXT: vmovd %xmm0, (%rdi)

llvm/test/CodeGen/X86/fpclamptosat_vec.ll

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -701,13 +701,13 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) {
701701
; CHECK-NEXT: pand %xmm2, %xmm0
702702
; CHECK-NEXT: pandn %xmm1, %xmm2
703703
; CHECK-NEXT: por %xmm0, %xmm2
704-
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
705-
; CHECK-NEXT: movdqa %xmm2, %xmm1
706-
; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
707-
; CHECK-NEXT: pand %xmm1, %xmm2
708-
; CHECK-NEXT: pandn %xmm0, %xmm1
709-
; CHECK-NEXT: por %xmm2, %xmm1
710-
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
704+
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <4294934528,4294934528,u,u>
705+
; CHECK-NEXT: movdqa %xmm2, %xmm0
706+
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
707+
; CHECK-NEXT: pand %xmm0, %xmm2
708+
; CHECK-NEXT: pandn %xmm1, %xmm0
709+
; CHECK-NEXT: por %xmm2, %xmm0
710+
; CHECK-NEXT: packssdw %xmm0, %xmm0
711711
; CHECK-NEXT: retq
712712
entry:
713713
%conv = fptosi <2 x double> %x to <2 x i32>
@@ -2265,13 +2265,13 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) {
22652265
; CHECK-NEXT: pand %xmm2, %xmm0
22662266
; CHECK-NEXT: pandn %xmm1, %xmm2
22672267
; CHECK-NEXT: por %xmm0, %xmm2
2268-
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
2269-
; CHECK-NEXT: movdqa %xmm2, %xmm1
2270-
; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
2271-
; CHECK-NEXT: pand %xmm1, %xmm2
2272-
; CHECK-NEXT: pandn %xmm0, %xmm1
2273-
; CHECK-NEXT: por %xmm2, %xmm1
2274-
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
2268+
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <4294934528,4294934528,u,u>
2269+
; CHECK-NEXT: movdqa %xmm2, %xmm0
2270+
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
2271+
; CHECK-NEXT: pand %xmm0, %xmm2
2272+
; CHECK-NEXT: pandn %xmm1, %xmm0
2273+
; CHECK-NEXT: por %xmm2, %xmm0
2274+
; CHECK-NEXT: packssdw %xmm0, %xmm0
22752275
; CHECK-NEXT: retq
22762276
entry:
22772277
%conv = fptosi <2 x double> %x to <2 x i32>

llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5185,9 +5185,8 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
51855185
; SSE2-NEXT: pand %xmm3, %xmm4
51865186
; SSE2-NEXT: pandn %xmm0, %xmm3
51875187
; SSE2-NEXT: por %xmm4, %xmm3
5188-
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
5189-
; SSE2-NEXT: packuswb %xmm3, %xmm3
5190-
; SSE2-NEXT: packuswb %xmm3, %xmm3
5188+
; SSE2-NEXT: packssdw %xmm3, %xmm3
5189+
; SSE2-NEXT: packsswb %xmm3, %xmm3
51915190
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
51925191
; SSE2-NEXT: movmskps %xmm2, %ecx
51935192
; SSE2-NEXT: xorl $15, %ecx

llvm/test/CodeGen/X86/masked_store_trunc_usat.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4956,11 +4956,9 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
49564956
; SSE2-NEXT: pxor %xmm0, %xmm3
49574957
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
49584958
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
4959-
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
4960-
; SSE2-NEXT: pxor %xmm4, %xmm3
4961-
; SSE2-NEXT: pand %xmm0, %xmm4
4962-
; SSE2-NEXT: por %xmm3, %xmm4
4963-
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
4959+
; SSE2-NEXT: pand %xmm4, %xmm0
4960+
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
4961+
; SSE2-NEXT: por %xmm0, %xmm4
49644962
; SSE2-NEXT: packuswb %xmm4, %xmm4
49654963
; SSE2-NEXT: packuswb %xmm4, %xmm4
49664964
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2

llvm/test/CodeGen/X86/pmulh.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -114,22 +114,14 @@ define <4 x i16> @ashr_mulhw_v4i16(<4 x i32> %a, <4 x i32> %b) {
114114
; SSE41-NEXT: pmulhw %xmm1, %xmm0
115115
; SSE41-NEXT: retq
116116
;
117-
; AVX2-LABEL: ashr_mulhw_v4i16:
118-
; AVX2: # %bb.0:
119-
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
120-
; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1
121-
; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1
122-
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
123-
; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
124-
; AVX2-NEXT: retq
125-
;
126-
; AVX512-LABEL: ashr_mulhw_v4i16:
127-
; AVX512: # %bb.0:
128-
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
129-
; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1
130-
; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
131-
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
132-
; AVX512-NEXT: retq
117+
; AVX-LABEL: ashr_mulhw_v4i16:
118+
; AVX: # %bb.0:
119+
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
120+
; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
121+
; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1
122+
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
123+
; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
124+
; AVX-NEXT: retq
133125
%a1 = ashr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
134126
%b1 = ashr <4 x i32> %b, <i32 16, i32 16, i32 16, i32 16>
135127
%c = mul <4 x i32> %a1, %b1

llvm/test/CodeGen/X86/vector-trunc-packus.ll

Lines changed: 51 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -4169,37 +4169,21 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
41694169
}
41704170

41714171
define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"="256" {
4172-
; SSE2-LABEL: trunc_packus_v4i32_v4i8:
4173-
; SSE2: # %bb.0:
4174-
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
4175-
; SSE2-NEXT: movdqa %xmm1, %xmm2
4176-
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
4177-
; SSE2-NEXT: pand %xmm2, %xmm0
4178-
; SSE2-NEXT: pandn %xmm1, %xmm2
4179-
; SSE2-NEXT: por %xmm2, %xmm0
4180-
; SSE2-NEXT: pxor %xmm1, %xmm1
4181-
; SSE2-NEXT: movdqa %xmm0, %xmm2
4182-
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
4183-
; SSE2-NEXT: pand %xmm2, %xmm0
4184-
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4185-
; SSE2-NEXT: packuswb %xmm0, %xmm0
4186-
; SSE2-NEXT: packuswb %xmm0, %xmm0
4187-
; SSE2-NEXT: retq
4188-
;
4189-
; SSSE3-LABEL: trunc_packus_v4i32_v4i8:
4190-
; SSSE3: # %bb.0:
4191-
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
4192-
; SSSE3-NEXT: movdqa %xmm1, %xmm2
4193-
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
4194-
; SSSE3-NEXT: pand %xmm2, %xmm0
4195-
; SSSE3-NEXT: pandn %xmm1, %xmm2
4196-
; SSSE3-NEXT: por %xmm2, %xmm0
4197-
; SSSE3-NEXT: pxor %xmm1, %xmm1
4198-
; SSSE3-NEXT: movdqa %xmm0, %xmm2
4199-
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
4200-
; SSSE3-NEXT: pand %xmm2, %xmm0
4201-
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
4202-
; SSSE3-NEXT: retq
4172+
; SSE2-SSSE3-LABEL: trunc_packus_v4i32_v4i8:
4173+
; SSE2-SSSE3: # %bb.0:
4174+
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
4175+
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
4176+
; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
4177+
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
4178+
; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
4179+
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
4180+
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm1
4181+
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
4182+
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
4183+
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
4184+
; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
4185+
; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
4186+
; SSE2-SSSE3-NEXT: retq
42034187
;
42044188
; SSE41-LABEL: trunc_packus_v4i32_v4i8:
42054189
; SSE41: # %bb.0:
@@ -4274,39 +4258,22 @@ define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"
42744258
}
42754259

42764260
define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
4277-
; SSE2-LABEL: trunc_packus_v4i32_v4i8_store:
4278-
; SSE2: # %bb.0:
4279-
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
4280-
; SSE2-NEXT: movdqa %xmm1, %xmm2
4281-
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
4282-
; SSE2-NEXT: pand %xmm2, %xmm0
4283-
; SSE2-NEXT: pandn %xmm1, %xmm2
4284-
; SSE2-NEXT: por %xmm0, %xmm2
4285-
; SSE2-NEXT: pxor %xmm0, %xmm0
4286-
; SSE2-NEXT: movdqa %xmm2, %xmm1
4287-
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
4288-
; SSE2-NEXT: pand %xmm2, %xmm1
4289-
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
4290-
; SSE2-NEXT: packuswb %xmm1, %xmm1
4291-
; SSE2-NEXT: packuswb %xmm1, %xmm1
4292-
; SSE2-NEXT: movd %xmm1, (%rdi)
4293-
; SSE2-NEXT: retq
4294-
;
4295-
; SSSE3-LABEL: trunc_packus_v4i32_v4i8_store:
4296-
; SSSE3: # %bb.0:
4297-
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
4298-
; SSSE3-NEXT: movdqa %xmm1, %xmm2
4299-
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
4300-
; SSSE3-NEXT: pand %xmm2, %xmm0
4301-
; SSSE3-NEXT: pandn %xmm1, %xmm2
4302-
; SSSE3-NEXT: por %xmm0, %xmm2
4303-
; SSSE3-NEXT: pxor %xmm0, %xmm0
4304-
; SSSE3-NEXT: movdqa %xmm2, %xmm1
4305-
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
4306-
; SSSE3-NEXT: pand %xmm2, %xmm1
4307-
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
4308-
; SSSE3-NEXT: movd %xmm1, (%rdi)
4309-
; SSSE3-NEXT: retq
4261+
; SSE2-SSSE3-LABEL: trunc_packus_v4i32_v4i8_store:
4262+
; SSE2-SSSE3: # %bb.0:
4263+
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
4264+
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
4265+
; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
4266+
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
4267+
; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
4268+
; SSE2-SSSE3-NEXT: por %xmm0, %xmm2
4269+
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm0
4270+
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
4271+
; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
4272+
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
4273+
; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1
4274+
; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1
4275+
; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi)
4276+
; SSE2-SSSE3-NEXT: retq
43104277
;
43114278
; SSE41-LABEL: trunc_packus_v4i32_v4i8_store:
43124279
; SSE41: # %bb.0:
@@ -4328,16 +4295,26 @@ define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
43284295
; AVX1-NEXT: vmovd %xmm0, (%rdi)
43294296
; AVX1-NEXT: retq
43304297
;
4331-
; AVX2-LABEL: trunc_packus_v4i32_v4i8_store:
4332-
; AVX2: # %bb.0:
4333-
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
4334-
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
4335-
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
4336-
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
4337-
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
4338-
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
4339-
; AVX2-NEXT: vmovd %xmm0, (%rdi)
4340-
; AVX2-NEXT: retq
4298+
; AVX2-SLOW-LABEL: trunc_packus_v4i32_v4i8_store:
4299+
; AVX2-SLOW: # %bb.0:
4300+
; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
4301+
; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
4302+
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
4303+
; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
4304+
; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
4305+
; AVX2-SLOW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
4306+
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
4307+
; AVX2-SLOW-NEXT: retq
4308+
;
4309+
; AVX2-FAST-LABEL: trunc_packus_v4i32_v4i8_store:
4310+
; AVX2-FAST: # %bb.0:
4311+
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
4312+
; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0
4313+
; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
4314+
; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
4315+
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
4316+
; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
4317+
; AVX2-FAST-NEXT: retq
43414318
;
43424319
; AVX512F-LABEL: trunc_packus_v4i32_v4i8_store:
43434320
; AVX512F: # %bb.0:

0 commit comments

Comments
 (0)