Skip to content

Commit a84790e

Browse files
authored
[X86] SimplifyDemandedVectorEltsForTargetNode - reduce width of X86 conversion nodes when upper elements are not demanded. (llvm#102882)
Fixes llvm#83402
1 parent a426ffd commit a84790e

File tree

2 files changed

+28
-11
lines changed

2 files changed

+28
-11
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+20
Original file line number | Diff line number | Diff line change
@@ -42524,6 +42524,26 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
4252442524
SDValue Insert =
4252542525
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
4252642526
return TLO.CombineTo(Op, Insert);
42527+
}
42528+
// Conversions.
42529+
// TODO: Add more CVT opcodes when we have test coverage.
42530+
case X86ISD::CVTTP2SI:
42531+
case X86ISD::CVTTP2UI:
42532+
case X86ISD::CVTPH2PS: {
42533+
SDLoc DL(Op);
42534+
unsigned Scale = SizeInBits / ExtSizeInBits;
42535+
SDValue SrcOp = Op.getOperand(0);
42536+
MVT SrcVT = SrcOp.getSimpleValueType();
42537+
unsigned SrcExtSize =
42538+
std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
42539+
MVT ExtVT = MVT::getVectorVT(VT.getSimpleVT().getScalarType(),
42540+
ExtSizeInBits / VT.getScalarSizeInBits());
42541+
SDValue ExtOp = TLO.DAG.getNode(
42542+
Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
42543+
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42544+
SDValue Insert =
42545+
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42546+
return TLO.CombineTo(Op, Insert);
4252742547
}
4252842548
// Zero upper elements.
4252942549
case X86ISD::VZEXT_MOVL:

llvm/test/CodeGen/X86/vector-half-conversions.ll

+8 -11
Original file line number | Diff line number | Diff line change
@@ -4990,6 +4990,7 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
49904990
ret <4 x i32> %ext
49914991
}
49924992

4993+
; PR83402
49934994
define <4 x i32> @fptosi_4f16_to_4i32(<4 x half> %a) nounwind {
49944995
; AVX-LABEL: fptosi_4f16_to_4i32:
49954996
; AVX: # %bb.0:
@@ -5024,16 +5025,14 @@ define <4 x i32> @fptosi_4f16_to_4i32(<4 x half> %a) nounwind {
50245025
;
50255026
; F16C-LABEL: fptosi_4f16_to_4i32:
50265027
; F16C: # %bb.0:
5027-
; F16C-NEXT: vcvtph2ps %xmm0, %ymm0
5028+
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
50285029
; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
5029-
; F16C-NEXT: vzeroupper
50305030
; F16C-NEXT: retq
50315031
;
50325032
; AVX512-LABEL: fptosi_4f16_to_4i32:
50335033
; AVX512: # %bb.0:
5034-
; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
5034+
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
50355035
; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
5036-
; AVX512-NEXT: vzeroupper
50375036
; AVX512-NEXT: retq
50385037
%cvt = fptosi <4 x half> %a to <4 x i32>
50395038
ret <4 x i32> %cvt
@@ -5213,13 +5212,12 @@ define <4 x i32> @fptoui_4f16_to_4i32(<4 x half> %a) nounwind {
52135212
;
52145213
; F16C-LABEL: fptoui_4f16_to_4i32:
52155214
; F16C: # %bb.0:
5216-
; F16C-NEXT: vcvtph2ps %xmm0, %ymm0
5217-
; F16C-NEXT: vcvttps2dq %ymm0, %ymm1
5218-
; F16C-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
5219-
; F16C-NEXT: vcvttps2dq %ymm0, %ymm0
5215+
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
5216+
; F16C-NEXT: vcvttps2dq %xmm0, %xmm1
5217+
; F16C-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5218+
; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
52205219
; F16C-NEXT: vorps %xmm0, %xmm1, %xmm0
52215220
; F16C-NEXT: vblendvps %xmm1, %xmm0, %xmm1, %xmm0
5222-
; F16C-NEXT: vzeroupper
52235221
; F16C-NEXT: retq
52245222
;
52255223
; AVX512F-LABEL: fptoui_4f16_to_4i32:
@@ -5232,9 +5230,8 @@ define <4 x i32> @fptoui_4f16_to_4i32(<4 x half> %a) nounwind {
52325230
;
52335231
; AVX512-FASTLANE-LABEL: fptoui_4f16_to_4i32:
52345232
; AVX512-FASTLANE: # %bb.0:
5235-
; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm0, %ymm0
5233+
; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm0, %xmm0
52365234
; AVX512-FASTLANE-NEXT: vcvttps2udq %xmm0, %xmm0
5237-
; AVX512-FASTLANE-NEXT: vzeroupper
52385235
; AVX512-FASTLANE-NEXT: retq
52395236
%cvt = fptoui <4 x half> %a to <4 x i32>
52405237
ret <4 x i32> %cvt

0 commit comments

Comments
 (0)