Commit 7dbfcfa

[DAG] combineInsertEltToShuffle - if EXTRACT_VECTOR_ELT fails to match an existing shuffle op, try to replace an undef op if there is one.
This should fix a number of shuffle regressions in D127115, where the re-ordered combines mean we fail to fold an EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT sequence into a BUILD_VECTOR if we extract from more than one vector source.
1 parent ae10b8a commit 7dbfcfa
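
As a rough illustration of the new fold (hand-written here, not taken from the patch): for a 4-lane shuffle whose second operand is undef, inserting an element extracted from an unrelated source V can now claim the undef slot, since shuffle mask indices >= 4 address the second operand:

  before: insert_vector_elt(shuffle(X, undef, <0,1,2,3>), extract_vector_elt(V, 2), 1)
  after:  shuffle(X, V, <0,6,2,3>)   ; lane 1 reads V[2] via mask index 4+2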

2 files changed: +14 -9 lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 8 additions & 0 deletions
@@ -19233,6 +19233,14 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
     }
   }
 
+  // If we failed to find a match, see if we can replace an UNDEF shuffle
+  // operand.
+  if (ElementOffset == -1 && Y.isUndef() &&
+      InsertVal0.getValueType() == Y.getValueType()) {
+    ElementOffset = Mask.size();
+    Y = InsertVal0;
+  }
+
   if (ElementOffset != -1) {
     SmallVector<int, 16> NewMask(Mask.begin(), Mask.end());
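
To make the mask bookkeeping above concrete, here is a minimal standalone C++ sketch of the same idea, modeling a shuffle as two optional source ids plus an integer mask. All names here (Shuffle, foldInsertExtract, Src, ExtIdx) are hypothetical, not DAGCombiner APIs; only the undef-takeover branch mirrors the lines added in this commit.

// Minimal, hypothetical sketch of the combine's mask bookkeeping; the
// types and names are made up for illustration and are not LLVM APIs.
#include <cstdio>
#include <optional>
#include <vector>

struct Shuffle {
  std::optional<int> X, Y; // source vector ids; nullopt models an undef op
  std::vector<int> Mask;   // lane i reads X[Mask[i]], or Y[Mask[i] - N]
};

// Fold "insert (element ExtIdx of vector Src) into lane InsIndex of an
// existing shuffle". Mirrors the commit: if Src matches neither shuffle
// operand but one operand is undef, let Src take over that slot.
bool foldInsertExtract(Shuffle &S, int Src, int ExtIdx, unsigned InsIndex) {
  const int N = S.Mask.size();
  int ElementOffset = -1;
  if (S.X == Src)
    ElementOffset = 0;
  else if (S.Y == Src)
    ElementOffset = N;
  // New in this commit: no operand matched, so claim the undef operand.
  if (ElementOffset == -1 && !S.Y) {
    ElementOffset = N;
    S.Y = Src;
  }
  if (ElementOffset == -1)
    return false; // still no usable slot; the fold does not apply
  S.Mask[InsIndex] = ElementOffset + ExtIdx;
  return true;
}

int main() {
  // shuffle(v0, undef, <0,1,2,3>), then insert v1[2] into lane 1.
  Shuffle S{/*X=*/0, /*Y=*/std::nullopt, /*Mask=*/{0, 1, 2, 3}};
  if (foldInsertExtract(S, /*Src=*/1, /*ExtIdx=*/2, /*InsIndex=*/1))
    for (int M : S.Mask)
      std::printf("%d ", M); // prints: 0 6 2 3
  std::printf("\n");
  return 0;
}

The example prints "0 6 2 3": once the undef slot is claimed, mask entries >= Mask.size() select from the adopted second source, which is exactly what ElementOffset = Mask.size() arranges in the patch above.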

llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll

Lines changed: 6 additions & 9 deletions
@@ -298,15 +298,13 @@ define void @load_i32_stride6_vf4(<24 x i32>* %in.vec, <4 x i32>* %out.vec0, <4
 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
 ; AVX512-NEXT: vpextrd $2, %xmm5, %eax
 ; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm8
-; AVX512-NEXT: vpextrd $3, %xmm0, %eax
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm6, %xmm6
 ; AVX512-NEXT: vpextrd $1, %xmm3, %eax
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm0[2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
 ; AVX512-NEXT: vpinsrd $2, %eax, %xmm6, %xmm6
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3]
-; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm7
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm7, %xmm7
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm1[0,1],xmm2[2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,0,2,3]
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3]
 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4
 ; AVX512-NEXT: vmovd %xmm4, %eax
@@ -323,10 +321,9 @@ define void @load_i32_stride6_vf4(<24 x i32>* %in.vec, <4 x i32>* %out.vec0, <4
 ; AVX512-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
 ; AVX512-NEXT: vpextrd $2, %xmm4, %eax
 ; AVX512-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
 ; AVX512-NEXT: vpextrd $1, %xmm5, %eax
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
 ; AVX512-NEXT: vmovdqa %xmm8, (%rsi)
