Skip to content

Commit 0138cc0

Browse files
author
Justin Hibbits
committed
PowerPC: Treat llvm.fma.f* intrinsic as using CTR with SPE
Summary: The SPE doesn't have a 'fma' instruction, so the intrinsic becomes a libcall. It really should become an expansion to two instructions, but for some reason the compiler doesn't think that's as optimal as a branch. Since this lowering is done after CTR is allocated for loops, tell the optimizer that CTR may be used in this case. This prevents an "Invalid PPC CTR loop!" assertion in the case that a fma() function call is used in a C/C++ file, and clang converts it into an intrinsic. Reviewed By: shchenz Differential Revision: https://reviews.llvm.org/D78668
1 parent 293c6d3 commit 0138cc0

File tree

2 files changed

+68
-0
lines changed

2 files changed

+68
-0
lines changed

llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
319319
return true;
320320
else
321321
continue; // ISD::FCOPYSIGN is never a library call.
322+
case Intrinsic::fma: Opcode = ISD::FMA; break;
322323
case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
323324
case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
324325
case Intrinsic::ceil: Opcode = ISD::FCEIL; break;

llvm/test/CodeGen/PowerPC/spe.ll

+67
Original file line numberDiff line numberDiff line change
@@ -1355,3 +1355,70 @@ return:
13551355
ret double %1
13561356

13571357
}
1358+
1359+
define dso_local float @test_fma(i32 %d) local_unnamed_addr #0 {
1360+
; CHECK-LABEL: test_fma:
1361+
; CHECK: # %bb.0: # %entry
1362+
; CHECK-NEXT: mflr 0
1363+
; CHECK-NEXT: stw 0, 4(1)
1364+
; CHECK-NEXT: stwu 1, -48(1)
1365+
; CHECK-NEXT: .cfi_def_cfa_offset 48
1366+
; CHECK-NEXT: .cfi_offset lr, 4
1367+
; CHECK-NEXT: .cfi_offset r29, -12
1368+
; CHECK-NEXT: .cfi_offset r30, -8
1369+
; CHECK-NEXT: .cfi_offset r29, -40
1370+
; CHECK-NEXT: .cfi_offset r30, -32
1371+
; CHECK-NEXT: cmpwi 3, 1
1372+
; CHECK-NEXT: stw 29, 36(1) # 4-byte Folded Spill
1373+
; CHECK-NEXT: stw 30, 40(1) # 4-byte Folded Spill
1374+
; CHECK-NEXT: evstdd 29, 8(1) # 8-byte Folded Spill
1375+
; CHECK-NEXT: evstdd 30, 16(1) # 8-byte Folded Spill
1376+
; CHECK-NEXT: blt 0, .LBB57_3
1377+
; CHECK-NEXT: # %bb.1: # %for.body.preheader
1378+
; CHECK-NEXT: mr 30, 3
1379+
; CHECK-NEXT: li 29, 0
1380+
; CHECK-NEXT: # implicit-def: $r5
1381+
; CHECK-NEXT: .LBB57_2: # %for.body
1382+
; CHECK-NEXT: #
1383+
; CHECK-NEXT: efscfsi 3, 29
1384+
; CHECK-NEXT: mr 4, 3
1385+
; CHECK-NEXT: bl fmaf
1386+
; CHECK-NEXT: addi 29, 29, 1
1387+
; CHECK-NEXT: cmplw 30, 29
1388+
; CHECK-NEXT: mr 5, 3
1389+
; CHECK-NEXT: bne 0, .LBB57_2
1390+
; CHECK-NEXT: b .LBB57_4
1391+
; CHECK-NEXT: .LBB57_3:
1392+
; CHECK-NEXT: # implicit-def: $r5
1393+
; CHECK-NEXT: .LBB57_4: # %for.cond.cleanup
1394+
; CHECK-NEXT: evldd 30, 16(1) # 8-byte Folded Reload
1395+
; CHECK-NEXT: evldd 29, 8(1) # 8-byte Folded Reload
1396+
; CHECK-NEXT: mr 3, 5
1397+
; CHECK-NEXT: lwz 30, 40(1) # 4-byte Folded Reload
1398+
; CHECK-NEXT: lwz 29, 36(1) # 4-byte Folded Reload
1399+
; CHECK-NEXT: lwz 0, 52(1)
1400+
; CHECK-NEXT: addi 1, 1, 48
1401+
; CHECK-NEXT: mtlr 0
1402+
; CHECK-NEXT: blr
1403+
entry:
1404+
%cmp8 = icmp sgt i32 %d, 0
1405+
br i1 %cmp8, label %for.body, label %for.cond.cleanup
1406+
1407+
for.cond.cleanup: ; preds = %for.body, %entry
1408+
%e.0.lcssa = phi float [ undef, %entry ], [ %0, %for.body ]
1409+
ret float %e.0.lcssa
1410+
1411+
for.body: ; preds = %for.body, %entry
1412+
%f.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
1413+
%e.09 = phi float [ %0, %for.body ], [ undef, %entry ]
1414+
%conv = sitofp i32 %f.010 to float
1415+
%0 = tail call float @llvm.fma.f32(float %conv, float %conv, float %e.09)
1416+
%inc = add nuw nsw i32 %f.010, 1
1417+
%exitcond = icmp eq i32 %inc, %d
1418+
br i1 %exitcond, label %for.cond.cleanup, label %for.body
1419+
}
1420+
1421+
; Function Attrs: nounwind readnone speculatable willreturn
1422+
declare float @llvm.fma.f32(float, float, float) #1
1423+
1424+
attributes #1 = { nounwind readnone speculatable willreturn }

0 commit comments

Comments
 (0)