Skip to content

Commit d746a21

Browse files
committed
[x86] use more phadd for reductions
This is part of what is requested by PR42023: https://bugs.llvm.org/show_bug.cgi?id=42023 There's an extension needed for FP add, but exactly how we would specify that using flags is not clear to me, so I left that as a TODO. We're still missing patterns for partial reductions when the input vector is 256-bit or 512-bit, but I think that's a failure of vector narrowing. If we can reduce the widths, then this matching should work on those tests. Differential Revision: https://reviews.llvm.org/D64760 llvm-svn: 366268
1 parent f4c2d57 commit d746a21

File tree

4 files changed

+86
-56
lines changed

4 files changed

+86
-56
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35624,6 +35624,57 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
3562435624
llvm_unreachable("All opcodes should return within switch");
3562535625
}
3562635626

35627+
/// Try to convert a vector reduction sequence composed of binops and shuffles
35628+
/// into horizontal ops.
35629+
static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
35630+
const X86Subtarget &Subtarget) {
35631+
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
35632+
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
35633+
if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
35634+
return SDValue();
35635+
SDValue Index = ExtElt->getOperand(1);
35636+
if (!isNullConstant(Index))
35637+
return SDValue();
35638+
35639+
// TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
35640+
ISD::NodeType Opc;
35641+
SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
35642+
if (!Rdx)
35643+
return SDValue();
35644+
35645+
EVT VT = ExtElt->getValueType(0);
35646+
EVT VecVT = ExtElt->getOperand(0).getValueType();
35647+
if (VecVT.getScalarType() != VT)
35648+
return SDValue();
35649+
35650+
unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
35651+
SDLoc DL(ExtElt);
35652+
35653+
// 256-bit horizontal instructions operate on 128-bit chunks rather than
35654+
// across the whole vector, so we need an extract + hop preliminary stage.
35655+
// This is the only step where the operands of the hop are not the same value.
35656+
// TODO: We could extend this to handle 512-bit or even longer vectors.
35657+
if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
35658+
((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
35659+
unsigned NumElts = VecVT.getVectorNumElements();
35660+
SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
35661+
SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
35662+
VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
35663+
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
35664+
}
35665+
if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
35666+
!((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
35667+
return SDValue();
35668+
35669+
// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
35670+
assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
35671+
unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
35672+
for (unsigned i = 0; i != ReductionSteps; ++i)
35673+
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
35674+
35675+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
35676+
}
35677+
3562735678
/// Detect vector gather/scatter index generation and convert it from being a
3562835679
/// bunch of shuffles and extracts into a somewhat faster sequence.
3562935680
/// For i686, the best sequence is apparently storing the value and loading
@@ -35710,6 +35761,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
3571035761
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
3571135762
return MinMax;
3571235763

35764+
if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
35765+
return V;
35766+
3571335767
if (SDValue V = scalarizeExtEltFP(N, DAG))
3571435768
return V;
3571535769

llvm/test/CodeGen/X86/phaddsub-extract.ll

Lines changed: 16 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1903,10 +1903,8 @@ define i16 @hadd16_8(<8 x i16> %x223) {
19031903
;
19041904
; SSE3-FAST-LABEL: hadd16_8:
19051905
; SSE3-FAST: # %bb.0:
1906-
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1907-
; SSE3-FAST-NEXT: paddw %xmm0, %xmm1
1908-
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
1909-
; SSE3-FAST-NEXT: paddw %xmm1, %xmm0
1906+
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
1907+
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
19101908
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
19111909
; SSE3-FAST-NEXT: movd %xmm0, %eax
19121910
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1926,10 +1924,8 @@ define i16 @hadd16_8(<8 x i16> %x223) {
19261924
;
19271925
; AVX-FAST-LABEL: hadd16_8:
19281926
; AVX-FAST: # %bb.0:
1929-
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1930-
; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
1931-
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1932-
; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
1927+
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
1928+
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
19331929
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
19341930
; AVX-FAST-NEXT: vmovd %xmm0, %eax
19351931
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1956,10 +1952,9 @@ define i32 @hadd32_4(<4 x i32> %x225) {
19561952
;
19571953
; SSE3-FAST-LABEL: hadd32_4:
19581954
; SSE3-FAST: # %bb.0:
1959-
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1960-
; SSE3-FAST-NEXT: paddd %xmm0, %xmm1
1961-
; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
1962-
; SSE3-FAST-NEXT: movd %xmm1, %eax
1955+
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
1956+
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
1957+
; SSE3-FAST-NEXT: movd %xmm0, %eax
19631958
; SSE3-FAST-NEXT: retq
19641959
;
19651960
; AVX-SLOW-LABEL: hadd32_4:
@@ -1973,8 +1968,7 @@ define i32 @hadd32_4(<4 x i32> %x225) {
19731968
;
19741969
; AVX-FAST-LABEL: hadd32_4:
19751970
; AVX-FAST: # %bb.0:
1976-
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1977-
; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1971+
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
19781972
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
19791973
; AVX-FAST-NEXT: vmovd %xmm0, %eax
19801974
; AVX-FAST-NEXT: retq
@@ -2097,21 +2091,17 @@ define i32 @hadd32_16(<16 x i32> %x225) {
20972091
define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize {
20982092
; SSE3-LABEL: hadd16_8_optsize:
20992093
; SSE3: # %bb.0:
2100-
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
2101-
; SSE3-NEXT: paddw %xmm0, %xmm1
2102-
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
2103-
; SSE3-NEXT: paddw %xmm1, %xmm0
2094+
; SSE3-NEXT: phaddw %xmm0, %xmm0
2095+
; SSE3-NEXT: phaddw %xmm0, %xmm0
21042096
; SSE3-NEXT: phaddw %xmm0, %xmm0
21052097
; SSE3-NEXT: movd %xmm0, %eax
21062098
; SSE3-NEXT: # kill: def $ax killed $ax killed $eax
21072099
; SSE3-NEXT: retq
21082100
;
21092101
; AVX-LABEL: hadd16_8_optsize:
21102102
; AVX: # %bb.0:
2111-
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
2112-
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2113-
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
2114-
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2103+
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
2104+
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
21152105
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
21162106
; AVX-NEXT: vmovd %xmm0, %eax
21172107
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -2129,16 +2119,14 @@ define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize {
21292119
define i32 @hadd32_4_optsize(<4 x i32> %x225) optsize {
21302120
; SSE3-LABEL: hadd32_4_optsize:
21312121
; SSE3: # %bb.0:
2132-
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
2133-
; SSE3-NEXT: paddd %xmm0, %xmm1
2134-
; SSE3-NEXT: phaddd %xmm1, %xmm1
2135-
; SSE3-NEXT: movd %xmm1, %eax
2122+
; SSE3-NEXT: phaddd %xmm0, %xmm0
2123+
; SSE3-NEXT: phaddd %xmm0, %xmm0
2124+
; SSE3-NEXT: movd %xmm0, %eax
21362125
; SSE3-NEXT: retq
21372126
;
21382127
; AVX-LABEL: hadd32_4_optsize:
21392128
; AVX: # %bb.0:
2140-
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
2141-
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2129+
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
21422130
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
21432131
; AVX-NEXT: vmovd %xmm0, %eax
21442132
; AVX-NEXT: retq

llvm/test/CodeGen/X86/vector-reduce-add-widen.ll

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -254,8 +254,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
254254
;
255255
; AVX1-FAST-LABEL: test_v4i32:
256256
; AVX1-FAST: # %bb.0:
257-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
258-
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
257+
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
259258
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
260259
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
261260
; AVX1-FAST-NEXT: retq
@@ -307,9 +306,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
307306
; AVX1-FAST-LABEL: test_v8i32:
308307
; AVX1-FAST: # %bb.0:
309308
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
310-
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
311-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
312-
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
309+
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
310+
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
313311
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
314312
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
315313
; AVX1-FAST-NEXT: vzeroupper
@@ -635,10 +633,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
635633
;
636634
; AVX1-FAST-LABEL: test_v8i16:
637635
; AVX1-FAST: # %bb.0:
638-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
639-
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
640-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
641-
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
636+
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
637+
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
642638
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
643639
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
644640
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -704,11 +700,9 @@ define i16 @test_v16i16(<16 x i16> %a0) {
704700
; AVX1-FAST-LABEL: test_v16i16:
705701
; AVX1-FAST: # %bb.0:
706702
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
707-
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
708-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
709-
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
710-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
711-
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
703+
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
704+
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
705+
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
712706
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
713707
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
714708
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax

llvm/test/CodeGen/X86/vector-reduce-add.ll

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -241,8 +241,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
241241
;
242242
; AVX1-FAST-LABEL: test_v4i32:
243243
; AVX1-FAST: # %bb.0:
244-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
245-
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
244+
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
246245
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
247246
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
248247
; AVX1-FAST-NEXT: retq
@@ -294,9 +293,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
294293
; AVX1-FAST-LABEL: test_v8i32:
295294
; AVX1-FAST: # %bb.0:
296295
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
297-
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
298-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
299-
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
296+
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
297+
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
300298
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
301299
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
302300
; AVX1-FAST-NEXT: vzeroupper
@@ -605,10 +603,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
605603
;
606604
; AVX1-FAST-LABEL: test_v8i16:
607605
; AVX1-FAST: # %bb.0:
608-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
609-
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
610-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
611-
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
606+
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
607+
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
612608
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
613609
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
614610
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -674,11 +670,9 @@ define i16 @test_v16i16(<16 x i16> %a0) {
674670
; AVX1-FAST-LABEL: test_v16i16:
675671
; AVX1-FAST: # %bb.0:
676672
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
677-
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
678-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
679-
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
680-
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
681-
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
673+
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
674+
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
675+
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
682676
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
683677
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
684678
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax

0 commit comments

Comments (0)