From 87d24181f7df1da6f5828f17624ca3224c65030b Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonsobordado@az8.co>
Date: Sun, 21 Aug 2022 17:31:34 +0100
Subject: [PATCH] Use native scalar `fma` instruction

Cranelift 0.87 now supports lowering `fma` as a libcall on x86 [0], with
0.88 enabling the native x86 instruction under the `has_fma` flag.

aarch64 and s390x already support this as a native instruction, so it's
nice that we emit it for those.

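We can't lower the SIMD version using the `fma` instruction since the
lowering can fail if the x86 `has_fma` flag is not enabled. Cranelift
doesn't yet know how to fall back for these cases, so the SIMD intrinsic
still operates lane by lane, now using the scalar `fma` instruction for
each lane.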

[0]: https://github.com/bytecodealliance/wasmtime/commit/709716bb8e6adaf7e65f3497168af23ce0cf09ef
---
 src/intrinsics/mod.rs  |  6 ++++++
 src/intrinsics/simd.rs | 18 ++++++------------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs
index cb620822f..ef3d5ccea 100644
--- a/src/intrinsics/mod.rs
+++ b/src/intrinsics/mod.rs
@@ -303,6 +303,12 @@ fn codegen_float_intrinsic_call<'tcx>(
 
     let layout = fx.layout_of(ty);
     let res = match intrinsic {
+        sym::fmaf32 | sym::fmaf64 => {
+            let a = args[0].load_scalar(fx);
+            let b = args[1].load_scalar(fx);
+            let c = args[2].load_scalar(fx);
+            CValue::by_val(fx.bcx.ins().fma(a, b, c), layout)
+        }
         sym::copysignf32 | sym::copysignf64 => {
             let a = args[0].load_scalar(fx);
             let b = args[1].load_scalar(fx);
diff --git a/src/intrinsics/simd.rs b/src/intrinsics/simd.rs
index c7efdb392..a32b413d4 100644
--- a/src/intrinsics/simd.rs
+++ b/src/intrinsics/simd.rs
@@ -397,21 +397,15 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
 
             let layout = a.layout();
             let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let res_lane_layout = fx.layout_of(lane_ty);
 
             for lane in 0..lane_count {
-                let a_lane = a.value_lane(fx, lane);
-                let b_lane = b.value_lane(fx, lane);
-                let c_lane = c.value_lane(fx, lane);
+                let a_lane = a.value_lane(fx, lane).load_scalar(fx);
+                let b_lane = b.value_lane(fx, lane).load_scalar(fx);
+                let c_lane = c.value_lane(fx, lane).load_scalar(fx);
 
-                let res_lane = match lane_ty.kind() {
-                    ty::Float(FloatTy::F32) => {
-                        fx.easy_call("fmaf", &[a_lane, b_lane, c_lane], lane_ty)
-                    }
-                    ty::Float(FloatTy::F64) => {
-                        fx.easy_call("fma", &[a_lane, b_lane, c_lane], lane_ty)
-                    }
-                    _ => unreachable!(),
-                };
+                let res_lane = fx.bcx.ins().fma(a_lane, b_lane, c_lane);
+                let res_lane = CValue::by_val(res_lane, res_lane_layout);
 
                 ret.place_lane(fx, lane).write_cvalue(fx, res_lane);
             }