
Commit afe7517

[x86] Teach the new vector shuffle lowering about VBROADCAST and
VPBROADCAST.

This has the somewhat expected pervasive impact. I don't know why I forgot
about this. Everything seems good with lots of significant improvements in
the tests.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218724 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent a7ac986 · commit afe7517

10 files changed: +437 −263 lines

lib/Target/X86/X86ISelLowering.cpp (106 additions, 0 deletions)
@@ -7824,6 +7824,57 @@ static SDValue lowerVectorShuffleAsElementInsertion(
   return V2;
 }
 
+/// \brief Try to lower broadcast of a single element.
+///
+/// For convenience, this code also bundles all of the subtarget feature set
+/// filtering. While a little annoying to re-dispatch on type here, there isn't
+/// a convenient way to factor it out.
+static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+                                             ArrayRef<int> Mask,
+                                             const X86Subtarget *Subtarget,
+                                             SelectionDAG &DAG) {
+  if (!Subtarget->hasAVX())
+    return SDValue();
+  if (VT.isInteger() && !Subtarget->hasAVX2())
+    return SDValue();
+
+  // Check that the mask is a broadcast.
+  int BroadcastIdx = -1;
+  for (int M : Mask)
+    if (M >= 0 && BroadcastIdx == -1)
+      BroadcastIdx = M;
+    else if (M >= 0 && M != BroadcastIdx)
+      return SDValue();
+
+  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
+                                            "a sorted mask where the broadcast "
+                                            "comes from V1.");
+
+  // Check if this is a broadcast of a scalar load -- those are more widely
+  // supported than broadcasting in-register values.
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+    SDValue BroadcastV = V.getOperand(BroadcastIdx);
+    if (ISD::isNON_EXTLoad(BroadcastV.getNode())) {
+      // We can directly broadcast from memory.
+      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, BroadcastV);
+    }
+  }
+
+  // We can't broadcast from a register w/o AVX2.
+  if (!Subtarget->hasAVX2())
+    return SDValue();
+
+  // Check if this is a broadcast of a BUILD_VECTOR which we can always handle,
+  // or is a broadcast of the zero element.
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, V.getOperand(BroadcastIdx));
+  else if (BroadcastIdx != 0)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
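The mask walk in the new function is the core of the change: a shuffle mask is a broadcast exactly when every defined (non-negative) element names the same source element. A minimal standalone sketch of that check, using a hypothetical helper name and plain std::vector rather than LLVM's own types:

#include <vector>

// Returns the splatted element index if every defined mask element selects
// the same source element, or -1 if the mask is not a broadcast. Negative
// entries model "undef" lanes, as in LLVM shuffle masks.
static int getSplatIndex(const std::vector<int> &Mask) {
  int BroadcastIdx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;            // Undef lane: compatible with anything.
    if (BroadcastIdx == -1)
      BroadcastIdx = M;    // First defined lane picks the candidate.
    else if (M != BroadcastIdx)
      return -1;           // A second distinct element: not a broadcast.
  }
  return BroadcastIdx;
}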
@@ -7900,6 +7951,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
 
   if (isSingleInputShuffleMask(Mask)) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
     // We have to map the mask as it is actually a v4i32 shuffle instruction.

@@ -8057,6 +8113,11 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
   if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     if (Subtarget->hasAVX()) {
       // If we have AVX, we can use VPERMILPS which will allow folding a load
       // into the shuffle.

@@ -8157,6 +8218,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
   if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
     // We coerce the shuffle pattern to be compatible with UNPCK instructions

@@ -8253,6 +8319,11 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);

@@ -9036,6 +9107,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // For single-input shuffles, there are some nicer lowering tricks we can use.
   if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
     // Notably, this handles splat and partial-splat shuffles more efficiently.
     // However, it only makes sense if the pre-duplication shuffle simplifies
@@ -9455,6 +9531,11 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
 
   if (isSingleInputShuffleMask(Mask)) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
       // Non-half-crossing single input shuffles can be lowered with an
       // interleaved permutation.
@@ -9538,6 +9619,11 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Subtarget, DAG))
     return Blend;
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
   // use lower latency instructions that will operate on both 128-bit lanes.
   SmallVector<int, 2> RepeatedMask;

@@ -9592,6 +9678,11 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Subtarget, DAG))
     return Blend;
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   // options to efficiently lower the shuffle.
   SmallVector<int, 4> RepeatedMask;

@@ -9665,6 +9756,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Subtarget, DAG))
     return Blend;
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // If the shuffle mask is repeated in each 128-bit lane we can use more
   // efficient instructions that mirror the shuffles across the two 128-bit
   // lanes.

@@ -9714,6 +9810,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // There are no generalized cross-lane shuffle operations available on i16
   // element types.
   if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))

@@ -9779,6 +9880,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // There are no generalized cross-lane shuffle operations available on i8
   // element types.
   if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
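Taken together, the call sites above all defer to the same subtarget gating, which is easy to restate in isolation. A condensed sketch of just that logic (not LLVM code, simply the conditions from lowerVectorShuffleAsBroadcast restated):

// AVX provides FP broadcasts; AVX2 adds integer broadcasts and the
// register-source (non-load) forms.
static bool canLowerAsBroadcast(bool IsInteger, bool FromScalarLoad,
                                bool HasAVX, bool HasAVX2) {
  if (!HasAVX)
    return false;                  // No broadcast instructions at all.
  if (IsInteger && !HasAVX2)
    return false;                  // VPBROADCAST* is AVX2-only.
  if (!FromScalarLoad && !HasAVX2)
    return false;                  // AVX1 can only broadcast from memory.
  return true;
}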

lib/Target/X86/X86InstrSSE.td (21 additions, 0 deletions)
@@ -8711,6 +8711,27 @@ let Predicates = [HasAVX2] in {
   def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
             (VBROADCASTSDYrr VR128:$src)>;
 
+  // Provide aliases for broadcast from the same register class that
+  // automatically does the extract.
+  def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
+            (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))),
+            (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))),
+            (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))),
+            (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
+            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
+            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
+                                                    sub_xmm)))>;
+
   // Provide fallback in case the load node that is used in the patterns above
   // is used by additional users, which prevents the pattern selection.
   let AddedComplexity = 20 in {
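One note on why these patterns are needed: the Yrr broadcast forms all read a 128-bit source register, so splatting element 0 of a 256-bit register requires extracting its low xmm subregister first (a free operation). A hedged source-level reproducer, assuming Clang's vector extensions and building with -O2 -mavx2; the exact instruction choice may vary by compiler version:

typedef char v32i8 __attribute__((vector_size(32)));

v32i8 splat_byte0(v32i8 a) {
  // Splat lane 0 across all 32 bytes; with the patterns above this can
  // select a single vpbroadcastb from the low xmm half of the ymm input.
  return __builtin_shufflevector(a, a,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}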

test/CodeGen/X86/vector-shuffle-128-v16.ll (31 additions, 15 deletions)
@@ -36,11 +36,16 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(
 ; SSE41-NEXT:    pshufb %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i8> %shuffle
 }

@@ -200,10 +205,15 @@ define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v16i8_0101010101010101:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v16i8_0101010101010101:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_0101010101010101:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   ret <16 x i8> %shuffle
 }

@@ -231,12 +241,18 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
   ret <16 x i8> %shuffle
 }
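The 0101... case is worth a note: at the byte level the mask is not a splat, but it repeats every two bytes, so the lowering treats it as a v8i16 broadcast of word 0 and emits a single vpbroadcastw. A minimal reproducer under the same Clang vector-extension assumption (the front end may canonicalize the shuffle before instruction selection):

typedef char v16i8 __attribute__((vector_size(16)));

v16i8 splat_word0(v16i8 a) {
  // Repeating bytes {0,1} across the vector is equivalent to splatting
  // 16-bit element 0, which AVX2 handles with one vpbroadcastw.
  return __builtin_shufflevector(a, a,
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
}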

test/CodeGen/X86/vector-shuffle-128-v2.ll (18 additions, 8 deletions)
@@ -14,10 +14,15 @@ define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) {
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v2i64_00:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v2i64_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0>
   ret <2 x i64> %shuffle
 }

@@ -53,10 +58,15 @@ define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) {
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v2i64_22:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v2i64_22:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_22:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 2>
   ret <2 x i64> %shuffle
 }
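These tests all broadcast from a register, which the new lowering only allows with AVX2. The scalar-load path is wider: plain AVX already has vbroadcastss/vbroadcastsd from memory. A hedged illustration of that case, again assuming Clang's vector extensions; with -O2 -mavx this is expected to fold the load into a single vbroadcastsd:

typedef double v4f64 __attribute__((vector_size(32)));

v4f64 splat_load(const double *p) {
  // Broadcasting a scalar load needs only AVX for floating-point types,
  // matching the ISD::isNON_EXTLoad fast path in
  // lowerVectorShuffleAsBroadcast.
  double d = *p;
  v4f64 r = {d, d, d, d};
  return r;
}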
