
Commit 284f2d0

Optimize intrinsics on wasm32
Profiling a recent demo I was playing with on `wasm32-unknown-unknown` pointed me to the surprising result that 15% of the execution time was spent in the `sqrt` intrinsic (there's a lot of math here). Upon investigation I remembered that wasm unconditionally has a native `f32.sqrt` instruction! I was briefly confused because a plain `f.sqrt()` in Rust does codegen to `f32.sqrt`, but then I realized that the intrinsics in this library are often implemented in terms of other intrinsics. The intrinsic actually being called here, `acos`, internally calls `sqrt` at some point, and that internal call wasn't using the optimized implementation!

To help fix this, this PR provides the infrastructure for optimized implementations (via LLVM code generation) to be used for each intrinsic. I've gone through the various math instructions that wasm has available and updated each of the corresponding intrinsic implementations in this crate to optionally use the LLVM intrinsic versions, which are known to unconditionally compile down to a single instruction on wasm (unlike on an arbitrary platform, where we don't know what the LLVM intrinsic will compile down to!). To do this I created a new macro to wrap the invocation of LLVM intrinsics. Invoking LLVM intrinsics is turned off by default, through a new and on-by-default feature, `stable`. When the `stable` feature is disabled, however, the wasm32 target specifically will use the LLVM intrinsics. I've additionally added a CI builder on Travis which should verify that both configurations continue to build.

After this I intend to update the submodule in the `compiler-builtins` repository so we can pull in the optimized implementations there; `compiler-builtins` naturally won't set `feature = "stable"` when compiling, so all the intrinsics should get compiled in by default. After a further update of the `libcompiler_builtins` submodule in rust-lang/rust we should be good to go!
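Every intrinsic touched in the diff below gains the same shape of fast path. Here is a condensed, self-contained sketch of that shape (an illustration, not code from the commit: std's `f64::sqrt` stands in for the long portable soft-float routine, and the `not(feature = "stable")` half of the real `cfg` check is only noted in comments):

```rust
#![cfg_attr(target_arch = "wasm32", feature(core_intrinsics))]

pub fn sqrt(x: f64) -> f64 {
    // Fast path: compiled only on wasm32 (and, in the real crate, only when
    // the default `stable` feature is disabled). LLVM lowers the intrinsic
    // below to a single `f64.sqrt` instruction.
    #[cfg(target_arch = "wasm32")]
    {
        // The `if true` mirrors the new macro's workaround for the
        // unreachable-code lint on the fallback further down.
        if true {
            return if x < 0.0 {
                f64::NAN
            } else {
                unsafe { core::intrinsics::sqrtf64(x) }
            };
        }
    }

    // Portable fallback, used on every other target (and on wasm32 when the
    // `stable` feature stays enabled). The crate's real soft-float routine is
    // long, so std's `sqrt` stands in for it here.
    x.sqrt()
}

fn main() {
    assert_eq!(sqrt(4.0), 2.0);
    assert!(sqrt(-1.0).is_nan());
}
```

Note that the wasm32 build of this sketch, like the crate itself with `stable` disabled, requires a nightly compiler for the `core_intrinsics` feature.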
1 parent 8e857c7 · commit 284f2d0

14 files changed: +114 −0 lines

.travis.yml

Lines changed: 7 additions & 0 deletions
```diff
@@ -29,6 +29,13 @@ matrix:
     - env: TARGET=cargo-fmt
       rust: beta
 
+    - env: TARGET=wasm32-unknown-unknown
+      rust: nightly
+      install: rustup target add $TARGET
+      script:
+        - cargo build --target $TARGET
+        - cargo build --no-default-features --target $TARGET
+
 before_install: set -e
 
 install:
```

Cargo.toml

Lines changed: 4 additions & 0 deletions
```diff
@@ -24,3 +24,7 @@ members = [
 
 [dev-dependencies]
 shared = { path = "shared" }
+
+[features]
+default = ['stable']
+stable = []
```

src/lib.rs

Lines changed: 4 additions & 0 deletions
```diff
@@ -11,6 +11,10 @@
 
 #![deny(warnings)]
 #![no_std]
+#![cfg_attr(
+    all(target_arch = "wasm32", not(feature = "stable")),
+    feature(core_intrinsics)
+)]
 
 mod math;
```

src/math/ceil.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -4,6 +4,14 @@ const TOINT: f64 = 1. / f64::EPSILON;
 
 #[inline]
 pub fn ceil(x: f64) -> f64 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f64.ceil` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::ceilf64(x) }
+        }
+    }
     let u: u64 = x.to_bits();
     let e: i64 = (u >> 52 & 0x7ff) as i64;
     let y: f64;
```

src/math/ceilf.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -2,6 +2,14 @@ use core::f32;
 
 #[inline]
 pub fn ceilf(x: f32) -> f32 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f32.ceil` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::ceilf32(x) }
+        }
+    }
     let mut ui = x.to_bits();
     let e = (((ui >> 23) & 0xff) - 0x7f) as i32;
```

src/math/fabs.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -2,5 +2,13 @@ use core::u64;
 
 #[inline]
 pub fn fabs(x: f64) -> f64 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f64.abs` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::fabsf64(x) }
+        }
+    }
     f64::from_bits(x.to_bits() & (u64::MAX / 2))
 }
```

src/math/fabsf.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -1,4 +1,12 @@
 #[inline]
 pub fn fabsf(x: f32) -> f32 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f32.abs` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::fabsf32(x) }
+        }
+    }
     f32::from_bits(x.to_bits() & 0x7fffffff)
 }
```

src/math/floor.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -4,6 +4,14 @@ const TOINT: f64 = 1. / f64::EPSILON;
 
 #[inline]
 pub fn floor(x: f64) -> f64 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f64.floor` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::floorf64(x) }
+        }
+    }
     let ui = x.to_bits();
     let e = ((ui >> 52) & 0x7ff) as i32;
```

src/math/floorf.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -2,6 +2,14 @@ use core::f32;
 
 #[inline]
 pub fn floorf(x: f32) -> f32 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f32.floor` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::floorf32(x) }
+        }
+    }
     let mut ui = x.to_bits();
     let e = (((ui >> 23) & 0xff) - 0x7f) as i32;
```

src/math/mod.rs

Lines changed: 11 additions & 0 deletions
```diff
@@ -58,6 +58,17 @@ macro_rules! i {
     };
 }
 
+macro_rules! llvm_intrinsically_optimized {
+    (#[cfg($($clause:tt)*)] $e:expr) => {
+        #[cfg(all(not(feature = "stable"), $($clause)*))]
+        {
+            if true { // thwart the dead code lint
+                $e
+            }
+        }
+    };
+}
+
 // Public modules
 mod acos;
 mod acosf;
```
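The macro merges the caller's `cfg` clause with the `not(feature = "stable")` check and wraps the body in `if true` to "thwart the dead code lint": the gated block ends in an early `return`, and under the crate's `#![deny(warnings)]` an unguarded early return would turn the unreachable-code warning on the portable code that follows into a hard error on wasm32. A standalone illustration of that lint behaviour (not code from the commit):

```rust
#![deny(warnings)]

// With the `if true` wrapper the lint does not flag the code after the early
// return, which is exactly what the macro relies on.
fn guarded(x: f64) -> f64 {
    if true {
        return x;
    }
    x + 1.0
}

// Without the wrapper the same shape is rejected outright:
//
//     fn unguarded(x: f64) -> f64 {
//         return x;
//         x + 1.0 // error: unreachable expression (warning denied)
//     }

fn main() {
    assert_eq!(guarded(2.0), 2.0);
}
```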

src/math/sqrt.rs

Lines changed: 12 additions & 0 deletions
```diff
@@ -82,6 +82,18 @@ const TINY: f64 = 1.0e-300;
 
 #[inline]
 pub fn sqrt(x: f64) -> f64 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f64.sqrt` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return if x < 0.0 {
+                f64::NAN
+            } else {
+                unsafe { ::core::intrinsics::sqrtf64(x) }
+            }
+        }
+    }
     let mut z: f64;
     let sign: u32 = 0x80000000;
     let mut ix0: i32;
```
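Unlike the other intrinsics, the two `sqrt` fast paths keep an explicit negative-input guard, presumably because LLVM's `llvm.sqrt.*` intrinsics do not guarantee a NaN result for negative arguments, so the guard preserves the portable implementation's behaviour. A quick standalone check of those edge cases (not part of the commit; it assumes this crate is pulled in as the `libm` dependency):

```rust
fn main() {
    // Negative inputs must keep producing NaN, matching the soft-float path.
    assert!(libm::sqrt(-1.0_f64).is_nan());
    assert!(libm::sqrtf(-1.0_f32).is_nan());

    // Ordinary values and the IEEE 754 -0.0 case are unaffected.
    assert_eq!(libm::sqrt(4.0_f64), 2.0);
    assert_eq!(libm::sqrtf(9.0_f32), 3.0);
    assert_eq!(libm::sqrt(-0.0_f64).to_bits(), (-0.0_f64).to_bits());
}
```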

src/math/sqrtf.rs

Lines changed: 12 additions & 0 deletions
```diff
@@ -17,6 +17,18 @@ const TINY: f32 = 1.0e-30;
 
 #[inline]
 pub fn sqrtf(x: f32) -> f32 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f32.sqrt` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return if x < 0.0 {
+                ::core::f32::NAN
+            } else {
+                unsafe { ::core::intrinsics::sqrtf32(x) }
+            }
+        }
+    }
     let mut z: f32;
     let sign: i32 = 0x80000000u32 as i32;
     let mut ix: i32;
```

src/math/trunc.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -2,6 +2,14 @@ use core::f64;
 
 #[inline]
 pub fn trunc(x: f64) -> f64 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f64.trunc` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::truncf64(x) }
+        }
+    }
     let x1p120 = f64::from_bits(0x4770000000000000); // 0x1p120f === 2 ^ 120
 
     let mut i: u64 = x.to_bits();
```

src/math/truncf.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -2,6 +2,14 @@ use core::f32;
 
 #[inline]
 pub fn truncf(x: f32) -> f32 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f32.trunc` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::truncf32(x) }
+        }
+    }
     let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120
 
     let mut i: u32 = x.to_bits();
```
