Skip to content

Commit 7706c7a

Browse files
committed
[X86] Fold vXi1 OR(KSHIFTL(X,NumElts/2),Y) -> KUNPCK
Convert shift+or bool vector patterns into CONCAT_VECTORS when we know the result will be lowered to KUNPCK (which requires 16+ vector elements). Fixes PR32547.
1 parent 2abad34 commit 7706c7a

File tree

2 files changed

+34
-16
lines changed

2 files changed

+34
-16
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+28
Original file line numberDiff line numberDiff line change
@@ -42625,6 +42625,34 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
4262542625
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
4262642626
return R;
4262742627

42628+
// Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
42629+
// Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
42630+
// iff the upper elements of the non-shifted arg are zero.
42631+
// KUNPCK requires 16+ bool vector elements.
42632+
if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
42633+
unsigned NumElts = VT.getVectorNumElements();
42634+
unsigned HalfElts = NumElts / 2;
42635+
APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
42636+
if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
42637+
N1.getConstantOperandAPInt(1) == HalfElts &&
42638+
DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
42639+
SDLoc dl(N);
42640+
return DAG.getNode(
42641+
ISD::CONCAT_VECTORS, dl, VT,
42642+
extractSubVector(N0, 0, DAG, dl, HalfElts),
42643+
extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
42644+
}
42645+
if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
42646+
N0.getConstantOperandAPInt(1) == HalfElts &&
42647+
DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
42648+
SDLoc dl(N);
42649+
return DAG.getNode(
42650+
ISD::CONCAT_VECTORS, dl, VT,
42651+
extractSubVector(N1, 0, DAG, dl, HalfElts),
42652+
extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
42653+
}
42654+
}
42655+
4262842656
// Attempt to recursively combine an OR of shuffles.
4262942657
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
4263042658
SDValue Op(N, 0);

llvm/test/CodeGen/X86/vector-shuffle-v1.ll

+6-16
Original file line numberDiff line numberDiff line change
@@ -892,10 +892,7 @@ define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float>
892892
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
893893
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
894894
; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1
895-
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
896-
; AVX512F-NEXT: kshiftlw $8, %k1, %k1
897-
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
898-
; AVX512F-NEXT: korw %k1, %k0, %k1
895+
; AVX512F-NEXT: kunpckbw %k1, %k0, %k1
899896
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
900897
; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
901898
; AVX512F-NEXT: vzeroupper
@@ -905,8 +902,7 @@ define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float>
905902
; AVX512VL: # %bb.0: # %entry
906903
; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0
907904
; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1
908-
; AVX512VL-NEXT: kshiftlw $8, %k0, %k0
909-
; AVX512VL-NEXT: korw %k1, %k0, %k1
905+
; AVX512VL-NEXT: kunpckbw %k1, %k0, %k1
910906
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
911907
; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1}
912908
; AVX512VL-NEXT: vzeroupper
@@ -916,8 +912,7 @@ define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float>
916912
; VL_BW_DQ: # %bb.0: # %entry
917913
; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0
918914
; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1
919-
; VL_BW_DQ-NEXT: kshiftlw $8, %k0, %k0
920-
; VL_BW_DQ-NEXT: korw %k1, %k0, %k1
915+
; VL_BW_DQ-NEXT: kunpckbw %k1, %k0, %k1
921916
; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
922917
; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
923918
; VL_BW_DQ-NEXT: vzeroupper
@@ -945,10 +940,7 @@ define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x f
945940
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
946941
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
947942
; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1
948-
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
949-
; AVX512F-NEXT: kshiftlw $8, %k1, %k1
950-
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
951-
; AVX512F-NEXT: korw %k0, %k1, %k1
943+
; AVX512F-NEXT: kunpckbw %k1, %k0, %k1
952944
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
953945
; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
954946
; AVX512F-NEXT: vzeroupper
@@ -958,8 +950,7 @@ define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x f
958950
; AVX512VL: # %bb.0: # %entry
959951
; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0
960952
; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1
961-
; AVX512VL-NEXT: kshiftlw $8, %k0, %k0
962-
; AVX512VL-NEXT: korw %k0, %k1, %k1
953+
; AVX512VL-NEXT: kunpckbw %k1, %k0, %k1
963954
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
964955
; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1}
965956
; AVX512VL-NEXT: vzeroupper
@@ -969,8 +960,7 @@ define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x f
969960
; VL_BW_DQ: # %bb.0: # %entry
970961
; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0
971962
; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1
972-
; VL_BW_DQ-NEXT: kshiftlw $8, %k0, %k0
973-
; VL_BW_DQ-NEXT: korw %k0, %k1, %k1
963+
; VL_BW_DQ-NEXT: kunpckbw %k1, %k0, %k1
974964
; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
975965
; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
976966
; VL_BW_DQ-NEXT: vzeroupper

0 commit comments

Comments
 (0)