@@ -7824,6 +7824,57 @@ static SDValue lowerVectorShuffleAsElementInsertion(
   return V2;
 }
 
+/// \brief Try to lower broadcast of a single element.
+///
+/// For convenience, this code also bundles all of the subtarget feature set
+/// filtering. While a little annoying to re-dispatch on type here, there isn't
+/// a convenient way to factor it out.
+static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+                                             ArrayRef<int> Mask,
+                                             const X86Subtarget *Subtarget,
+                                             SelectionDAG &DAG) {
+  if (!Subtarget->hasAVX())
+    return SDValue();
+  if (VT.isInteger() && !Subtarget->hasAVX2())
+    return SDValue();
+
+  // Check that the mask is a broadcast.
+  int BroadcastIdx = -1;
+  for (int M : Mask)
+    if (M >= 0 && BroadcastIdx == -1)
+      BroadcastIdx = M;
+    else if (M >= 0 && M != BroadcastIdx)
+      return SDValue();
+
+  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
+                                            "a sorted mask where the broadcast "
+                                            "comes from V1.");
+
+  // Check if this is a broadcast of a scalar load -- those are more widely
+  // supported than broadcasting in-register values.
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
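+    // (SCALAR_TO_VECTOR only defines element 0, so it can only feed a
+    //  broadcast of index 0; a BUILD_VECTOR exposes all of its operands.)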
+    SDValue BroadcastV = V.getOperand(BroadcastIdx);
+    if (ISD::isNON_EXTLoad(BroadcastV.getNode())) {
+      // We can directly broadcast from memory.
+      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, BroadcastV);
+    }
+  }
+
+  // We can't broadcast from a register w/o AVX2.
+  if (!Subtarget->hasAVX2())
+    return SDValue();
+
+  // Check if this is a broadcast of a BUILD_VECTOR which we can always handle,
+  // or is a broadcast of the zero element.
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, V.getOperand(BroadcastIdx));
+  else if (BroadcastIdx != 0)
+    return SDValue();
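+  // (An in-register X86ISD::VBROADCAST splats element 0 of its input, which
+  //  is why the BUILD_VECTOR case re-packages the chosen operand as a
+  //  SCALAR_TO_VECTOR first.)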
+
+  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -7900,6 +7951,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
 
   if (isSingleInputShuffleMask(Mask)) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
     // We have to map the mask as it is actually a v4i32 shuffle instruction.
@@ -8057,6 +8113,11 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
   if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     if (Subtarget->hasAVX()) {
       // If we have AVX, we can use VPERMILPS which will allow folding a load
       // into the shuffle.
@@ -8157,6 +8218,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
   if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
     // We coerce the shuffle pattern to be compatible with UNPCK instructions
@@ -8253,6 +8319,11 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
@@ -9036,6 +9107,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // For single-input shuffles, there are some nicer lowering tricks we can use.
   if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
     // Notably, this handles splat and partial-splat shuffles more efficiently.
     // However, it only makes sense if the pre-duplication shuffle simplifies
@@ -9455,6 +9531,11 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
 
   if (isSingleInputShuffleMask(Mask)) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
       // Non-half-crossing single input shuffles can be lowered with an
       // interleaved permutation.
@@ -9538,6 +9619,11 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Subtarget, DAG))
     return Blend;
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
   // use lower latency instructions that will operate on both 128-bit lanes.
   SmallVector<int, 2> RepeatedMask;
@@ -9592,6 +9678,11 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Subtarget, DAG))
     return Blend;
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   // options to efficiently lower the shuffle.
   SmallVector<int, 4> RepeatedMask;
@@ -9665,6 +9756,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Subtarget, DAG))
     return Blend;
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // If the shuffle mask is repeated in each 128-bit lane we can use more
   // efficient instructions that mirror the shuffles across the two 128-bit
   // lanes.
@@ -9714,6 +9810,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // There are no generalized cross-lane shuffle operations available on i16
   // element types.
   if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
@@ -9779,6 +9880,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
   // There are no generalized cross-lane shuffle operations available on i8
   // element types.
   if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
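
For illustration, the shape of IR this change targets (a sketch in the period's IR syntax, not a test from this commit; with the experimental shuffle lowering enabled and -mattr=+avx, the splat below should now select a single memory-operand vbroadcastss via the non-extending-load path above):

    define <4 x float> @splat_from_mem(float* %p) {
      ; Scalar load feeding lane 0 of the vector...
      %f = load float* %p
      %v = insertelement <4 x float> undef, float %f, i32 0
      ; ...then lane 0 splatted across all four lanes, which
      ; lowerVectorShuffleAsBroadcast matches as a broadcast mask.
      %s = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
      ret <4 x float> %s
    }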