Skip to content

Commit 5c7957d

Browse files
committed
[AArch64] Allow i16->f64 uitofp tbl shuffles
Just as we lower an i8->f32 uitofp to a tbl shuffle that performs the zext, we can do the same for i16->f64 conversions.
1 parent c2b93e0 commit 5c7957d

File tree

2 files changed

+185
-5
lines changed

2 files changed

+185
-5
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16615,7 +16615,7 @@ bool AArch64TargetLowering::shouldSinkOperands(
1661516615
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
1661616616
unsigned NumElts, bool IsLittleEndian,
1661716617
SmallVectorImpl<int> &Mask) {
16618-
if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
16618+
if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
1661916619
return false;
1662016620

1662116621
assert(DstWidth % SrcWidth == 0 &&
@@ -16649,7 +16649,7 @@ static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
1664916649
return nullptr;
1665016650

1665116651
auto *FirstEltZero = Builder.CreateInsertElement(
16652-
PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
16652+
PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
1665316653
Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
1665416654
Result = Builder.CreateBitCast(Result, DstTy);
1665516655
if (DstTy != ZExtTy)
@@ -16670,7 +16670,7 @@ static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
1667016670
return nullptr;
1667116671

1667216672
auto *FirstEltZero = Builder.CreateInsertElement(
16673-
PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
16673+
PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
1667416674

1667516675
return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
1667616676
}
@@ -16847,6 +16847,9 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
1684716847
return false;
1684816848
}
1684916849

16850+
if (DstTy->getScalarSizeInBits() >= 64)
16851+
return false;
16852+
1685016853
IRBuilder<> Builder(ZExt);
1685116854
Value *Result = createTblShuffleForZExt(
1685216855
Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
@@ -16859,8 +16862,10 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
1685916862
}
1686016863

1686116864
auto *UIToFP = dyn_cast<UIToFPInst>(I);
16862-
if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16863-
DstTy->getElementType()->isFloatTy()) {
16865+
if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
16866+
DstTy->getElementType()->isFloatTy()) ||
16867+
(SrcTy->getElementType()->isIntegerTy(16) &&
16868+
DstTy->getElementType()->isDoubleTy()))) {
1686416869
IRBuilder<> Builder(I);
1686516870
Value *ZExt = createTblShuffleForZExt(
1686616871
Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),

llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,3 +648,178 @@ loop:
648648
exit:
649649
ret void
650650
}
651+
652+
define void @uitofp_v8i16_to_v8f64(ptr nocapture noundef readonly %x, ptr nocapture noundef writeonly %y, i32 noundef %n) {
653+
; CHECK-LABEL: uitofp_v8i16_to_v8f64:
654+
; CHECK: ; %bb.0: ; %entry
655+
; CHECK-NEXT: Lloh22:
656+
; CHECK-NEXT: adrp x8, lCPI10_0@PAGE
657+
; CHECK-NEXT: Lloh23:
658+
; CHECK-NEXT: adrp x9, lCPI10_1@PAGE
659+
; CHECK-NEXT: Lloh24:
660+
; CHECK-NEXT: adrp x10, lCPI10_2@PAGE
661+
; CHECK-NEXT: Lloh25:
662+
; CHECK-NEXT: ldr q0, [x8, lCPI10_0@PAGEOFF]
663+
; CHECK-NEXT: Lloh26:
664+
; CHECK-NEXT: adrp x8, lCPI10_3@PAGE
665+
; CHECK-NEXT: Lloh27:
666+
; CHECK-NEXT: ldr q1, [x9, lCPI10_1@PAGEOFF]
667+
; CHECK-NEXT: Lloh28:
668+
; CHECK-NEXT: ldr q2, [x10, lCPI10_2@PAGEOFF]
669+
; CHECK-NEXT: Lloh29:
670+
; CHECK-NEXT: ldr q3, [x8, lCPI10_3@PAGEOFF]
671+
; CHECK-NEXT: mov x8, xzr
672+
; CHECK-NEXT: LBB10_1: ; %vector.body
673+
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
674+
; CHECK-NEXT: ldr q4, [x0, x8]
675+
; CHECK-NEXT: add x9, x1, x8
676+
; CHECK-NEXT: add x8, x8, #64
677+
; CHECK-NEXT: cmp x8, #2, lsl #12 ; =8192
678+
; CHECK-NEXT: tbl.16b v5, { v4 }, v0
679+
; CHECK-NEXT: tbl.16b v6, { v4 }, v1
680+
; CHECK-NEXT: tbl.16b v7, { v4 }, v2
681+
; CHECK-NEXT: tbl.16b v4, { v4 }, v3
682+
; CHECK-NEXT: ucvtf.2d v5, v5
683+
; CHECK-NEXT: ucvtf.2d v6, v6
684+
; CHECK-NEXT: ucvtf.2d v7, v7
685+
; CHECK-NEXT: ucvtf.2d v4, v4
686+
; CHECK-NEXT: stp q6, q5, [x9, #32]
687+
; CHECK-NEXT: stp q4, q7, [x9]
688+
; CHECK-NEXT: b.ne LBB10_1
689+
; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup
690+
; CHECK-NEXT: ret
691+
; CHECK-NEXT: .loh AdrpLdr Lloh26, Lloh29
692+
; CHECK-NEXT: .loh AdrpLdr Lloh24, Lloh28
693+
; CHECK-NEXT: .loh AdrpLdr Lloh23, Lloh27
694+
; CHECK-NEXT: .loh AdrpAdrp Lloh22, Lloh26
695+
; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh25
696+
entry:
697+
br label %vector.body
698+
699+
vector.body:
700+
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
701+
%.idx = shl nsw i64 %index, 3
702+
%g = getelementptr inbounds i8, ptr %x, i64 %.idx
703+
%wide.vec = load <8 x i16>, ptr %g, align 2
704+
%u = uitofp <8 x i16> %wide.vec to <8 x double>
705+
%h = getelementptr inbounds double, ptr %y, i64 %index
706+
store <8 x double> %u, ptr %h, align 8
707+
%index.next = add nuw i64 %index, 8
708+
%c = icmp eq i64 %index.next, 1024
709+
br i1 %c, label %for.cond.cleanup, label %vector.body
710+
711+
for.cond.cleanup:
712+
ret void
713+
}
714+
715+
define void @uitofp_ld4_v32i16_to_v8f64(ptr nocapture noundef readonly %x, ptr nocapture noundef writeonly %y, i32 noundef %n) {
716+
; CHECK-LABEL: uitofp_ld4_v32i16_to_v8f64:
717+
; CHECK: ; %bb.0: ; %entry
718+
; CHECK-NEXT: Lloh30:
719+
; CHECK-NEXT: adrp x8, lCPI11_0@PAGE
720+
; CHECK-NEXT: Lloh31:
721+
; CHECK-NEXT: adrp x9, lCPI11_1@PAGE
722+
; CHECK-NEXT: Lloh32:
723+
; CHECK-NEXT: adrp x10, lCPI11_2@PAGE
724+
; CHECK-NEXT: Lloh33:
725+
; CHECK-NEXT: ldr q0, [x8, lCPI11_0@PAGEOFF]
726+
; CHECK-NEXT: Lloh34:
727+
; CHECK-NEXT: adrp x8, lCPI11_3@PAGE
728+
; CHECK-NEXT: Lloh35:
729+
; CHECK-NEXT: ldr q1, [x9, lCPI11_1@PAGEOFF]
730+
; CHECK-NEXT: Lloh36:
731+
; CHECK-NEXT: ldr q2, [x10, lCPI11_2@PAGEOFF]
732+
; CHECK-NEXT: Lloh37:
733+
; CHECK-NEXT: ldr q3, [x8, lCPI11_3@PAGEOFF]
734+
; CHECK-NEXT: mov x8, xzr
735+
; CHECK-NEXT: LBB11_1: ; %vector.body
736+
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
737+
; CHECK-NEXT: add x9, x0, x8
738+
; CHECK-NEXT: ldp q5, q4, [x9, #32]
739+
; CHECK-NEXT: ldp q7, q6, [x9]
740+
; CHECK-NEXT: add x9, x1, x8
741+
; CHECK-NEXT: add x8, x8, #64
742+
; CHECK-NEXT: tbl.16b v16, { v4 }, v0
743+
; CHECK-NEXT: tbl.16b v17, { v5 }, v0
744+
; CHECK-NEXT: tbl.16b v21, { v4 }, v1
745+
; CHECK-NEXT: tbl.16b v18, { v6 }, v0
746+
; CHECK-NEXT: tbl.16b v19, { v7 }, v0
747+
; CHECK-NEXT: tbl.16b v20, { v7 }, v1
748+
; CHECK-NEXT: tbl.16b v22, { v5 }, v1
749+
; CHECK-NEXT: tbl.16b v23, { v5 }, v2
750+
; CHECK-NEXT: tbl.16b v24, { v4 }, v2
751+
; CHECK-NEXT: tbl.16b v25, { v7 }, v2
752+
; CHECK-NEXT: tbl.16b v5, { v5 }, v3
753+
; CHECK-NEXT: tbl.16b v4, { v4 }, v3
754+
; CHECK-NEXT: tbl.16b v7, { v7 }, v3
755+
; CHECK-NEXT: tbl.16b v26, { v6 }, v1
756+
; CHECK-NEXT: tbl.16b v27, { v6 }, v2
757+
; CHECK-NEXT: tbl.16b v6, { v6 }, v3
758+
; CHECK-NEXT: ucvtf.2d v17, v17
759+
; CHECK-NEXT: ucvtf.2d v16, v16
760+
; CHECK-NEXT: ucvtf.2d v19, v19
761+
; CHECK-NEXT: ucvtf.2d v18, v18
762+
; CHECK-NEXT: ucvtf.2d v22, v22
763+
; CHECK-NEXT: ucvtf.2d v23, v23
764+
; CHECK-NEXT: ucvtf.2d v5, v5
765+
; CHECK-NEXT: ucvtf.2d v21, v21
766+
; CHECK-NEXT: ucvtf.2d v24, v24
767+
; CHECK-NEXT: ucvtf.2d v4, v4
768+
; CHECK-NEXT: cmp x8, #2, lsl #12 ; =8192
769+
; CHECK-NEXT: ucvtf.2d v20, v20
770+
; CHECK-NEXT: ucvtf.2d v25, v25
771+
; CHECK-NEXT: ucvtf.2d v7, v7
772+
; CHECK-NEXT: ucvtf.2d v26, v26
773+
; CHECK-NEXT: ucvtf.2d v27, v27
774+
; CHECK-NEXT: ucvtf.2d v6, v6
775+
; CHECK-NEXT: fadd.2d v17, v22, v17
776+
; CHECK-NEXT: fadd.2d v5, v23, v5
777+
; CHECK-NEXT: fadd.2d v16, v21, v16
778+
; CHECK-NEXT: fadd.2d v4, v24, v4
779+
; CHECK-NEXT: fadd.2d v19, v20, v19
780+
; CHECK-NEXT: fadd.2d v7, v25, v7
781+
; CHECK-NEXT: fadd.2d v18, v26, v18
782+
; CHECK-NEXT: fadd.2d v6, v27, v6
783+
; CHECK-NEXT: fadd.2d v5, v17, v5
784+
; CHECK-NEXT: fadd.2d v4, v16, v4
785+
; CHECK-NEXT: fadd.2d v7, v19, v7
786+
; CHECK-NEXT: fadd.2d v6, v18, v6
787+
; CHECK-NEXT: stp q5, q4, [x9, #32]
788+
; CHECK-NEXT: stp q7, q6, [x9]
789+
; CHECK-NEXT: b.ne LBB11_1
790+
; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup
791+
; CHECK-NEXT: ret
792+
; CHECK-NEXT: .loh AdrpLdr Lloh34, Lloh37
793+
; CHECK-NEXT: .loh AdrpLdr Lloh32, Lloh36
794+
; CHECK-NEXT: .loh AdrpLdr Lloh31, Lloh35
795+
; CHECK-NEXT: .loh AdrpAdrp Lloh30, Lloh34
796+
; CHECK-NEXT: .loh AdrpLdr Lloh30, Lloh33
797+
entry:
798+
br label %vector.body
799+
800+
vector.body:
801+
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
802+
%.idx = shl nsw i64 %index, 3
803+
%0 = getelementptr inbounds i8, ptr %x, i64 %.idx
804+
%wide.vec = load <32 x i16>, ptr %0, align 2
805+
%strided.vec = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
806+
%strided.vec36 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
807+
%strided.vec37 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
808+
%strided.vec38 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
809+
%1 = uitofp <8 x i16> %strided.vec to <8 x double>
810+
%2 = uitofp <8 x i16> %strided.vec36 to <8 x double>
811+
%3 = fadd fast <8 x double> %2, %1
812+
%4 = uitofp <8 x i16> %strided.vec37 to <8 x double>
813+
%5 = fadd fast <8 x double> %3, %4
814+
%6 = uitofp <8 x i16> %strided.vec38 to <8 x double>
815+
%7 = fadd fast <8 x double> %5, %6
816+
%8 = getelementptr inbounds double, ptr %y, i64 %index
817+
store <8 x double> %7, ptr %8, align 8
818+
%index.next = add nuw i64 %index, 8
819+
%9 = icmp eq i64 %index.next, 1024
820+
br i1 %9, label %for.cond.cleanup, label %vector.body
821+
822+
for.cond.cleanup:
823+
ret void
824+
}
825+

0 commit comments

Comments
 (0)