Skip to content

Commit df672f6

Browse files
committed
[DAG] scalarizeExtractedVectorLoad - replace getABITypeAlign with allowsMemoryAccess (PR45116)
One of the cases identified in PR45116 - we don't need to limit extracted loads to ABI alignment; we can use allowsMemoryAccess instead, which tests the alignment against getABITypeAlign but also falls back to allowsMisalignedMemoryAccesses to check whether the target permits (fast) misaligned memory loads. I've also cleaned up the alignment calculation code - if we have a constant extraction index then the alignment can be derived from the element's byte offset relative to the original vector load alignment, but for non-constant indices we should assume the worst (single element alignment only). Differential Revision: https://reviews.llvm.org/D110486
1 parent dff3454 commit df672f6

File tree

3 files changed

+37
-40
lines changed

3 files changed

+37
-40
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18662,32 +18662,35 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
1866218662
if (!VecEltVT.isByteSized())
1866318663
return SDValue();
1866418664

18665-
Align Alignment = OriginalLoad->getAlign();
18666-
Align NewAlign = DAG.getDataLayout().getABITypeAlign(
18667-
VecEltVT.getTypeForEVT(*DAG.getContext()));
18668-
18669-
if (NewAlign > Alignment ||
18670-
!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
18665+
ISD::LoadExtType ExtTy =
18666+
ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
18667+
if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
18668+
!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
1867118669
return SDValue();
1867218670

18673-
ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
18674-
ISD::NON_EXTLOAD : ISD::EXTLOAD;
18675-
if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
18676-
return SDValue();
18677-
18678-
Alignment = NewAlign;
18679-
18671+
Align Alignment = OriginalLoad->getAlign();
1868018672
MachinePointerInfo MPI;
1868118673
SDLoc DL(EVE);
1868218674
if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
1868318675
int Elt = ConstEltNo->getZExtValue();
1868418676
unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
1868518677
MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
18678+
Alignment = commonAlignment(Alignment, PtrOff);
1868618679
} else {
1868718680
// Discard the pointer info except the address space because the memory
1868818681
// operand can't represent this new access since the offset is variable.
1868918682
MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
18683+
Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
1869018684
}
18685+
18686+
bool IsFast = false;
18687+
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
18688+
OriginalLoad->getAddressSpace(), Alignment,
18689+
OriginalLoad->getMemOperand()->getFlags(),
18690+
&IsFast) ||
18691+
!IsFast)
18692+
return SDValue();
18693+
1869118694
SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(),
1869218695
InVecVT, EltNo);
1869318696

llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9143,18 +9143,12 @@ define i32 @load_single_extract_variable_index_i32(<4 x i32>* %A, i32 %idx) {
91439143
define i32 @load_single_extract_variable_index_v3i32_small_align(<3 x i32>* %A, i32 %idx) {
91449144
; CHECK-LABEL: load_single_extract_variable_index_v3i32_small_align:
91459145
; CHECK: ; %bb.0:
9146-
; CHECK-NEXT: sub sp, sp, #16
9147-
; CHECK-NEXT: .cfi_def_cfa_offset 16
9148-
; CHECK-NEXT: ldr d0, [x0]
9149-
; CHECK-NEXT: add x8, x0, #8
91509146
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
9151-
; CHECK-NEXT: mov x9, sp
9152-
; CHECK-NEXT: ld1.s { v0 }[2], [x8]
9153-
; CHECK-NEXT: and x8, x1, #0x3
9154-
; CHECK-NEXT: bfi x9, x8, #2, #2
9155-
; CHECK-NEXT: str q0, [sp]
9156-
; CHECK-NEXT: ldr w0, [x9]
9157-
; CHECK-NEXT: add sp, sp, #16
9147+
; CHECK-NEXT: sxtw x8, w1
9148+
; CHECK-NEXT: cmp x8, #2
9149+
; CHECK-NEXT: mov w9, #2
9150+
; CHECK-NEXT: csel x8, x8, x9, lo
9151+
; CHECK-NEXT: ldr w0, [x0, x8, lsl #2]
91589152
; CHECK-NEXT: ret
91599153
%lv = load <3 x i32>, <3 x i32>* %A, align 2
91609154
%e = extractelement <3 x i32> %lv, i32 %idx

llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -332,14 +332,14 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
332332
; X86-SSE4A: # %bb.0:
333333
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
334334
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
335-
; X86-SSE4A-NEXT: movups (%ecx), %xmm0
336-
; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
335+
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
336+
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
337337
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
338338
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
339-
; X86-SSE4A-NEXT: movntsd %xmm2, 8(%eax)
340339
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
340+
; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax)
341341
; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
342-
; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
342+
; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax)
343343
; X86-SSE4A-NEXT: retl
344344
;
345345
; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
@@ -360,14 +360,14 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
360360
;
361361
; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
362362
; X64-SSE4A: # %bb.0:
363-
; X64-SSE4A-NEXT: movups (%rdi), %xmm0
364-
; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
363+
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
364+
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
365365
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
366366
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
367-
; X64-SSE4A-NEXT: movntsd %xmm2, 8(%rsi)
368367
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
368+
; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi)
369369
; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
370-
; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
370+
; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi)
371371
; X64-SSE4A-NEXT: retq
372372
;
373373
; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
@@ -445,14 +445,14 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
445445
; X86-SSE4A: # %bb.0:
446446
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
447447
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
448-
; X86-SSE4A-NEXT: movups (%ecx), %xmm0
449-
; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
448+
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
449+
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
450450
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
451451
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
452-
; X86-SSE4A-NEXT: movntsd %xmm2, 8(%eax)
453452
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
453+
; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax)
454454
; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
455-
; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
455+
; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax)
456456
; X86-SSE4A-NEXT: retl
457457
;
458458
; X64-SSE2-LABEL: merge_2_v4f32_align1:
@@ -473,14 +473,14 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
473473
;
474474
; X64-SSE4A-LABEL: merge_2_v4f32_align1:
475475
; X64-SSE4A: # %bb.0:
476-
; X64-SSE4A-NEXT: movups (%rdi), %xmm0
477-
; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
476+
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
477+
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
478478
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
479479
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
480-
; X64-SSE4A-NEXT: movntsd %xmm2, 8(%rsi)
481480
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
481+
; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi)
482482
; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
483-
; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
483+
; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi)
484484
; X64-SSE4A-NEXT: retq
485485
;
486486
; X64-SSE41-LABEL: merge_2_v4f32_align1:

0 commit comments

Comments
 (0)