Implement u256 with two u128s rather than u64

tgross35 · tgross35 · commit 6fab367310eb · 2025-02-09T23:41:51.000-06:00
This produces better assembly, e.g. on aarch64:

    .globl libm::u128_wmul
    .p2align 2
    libm::u128_wmul:
    Lfunc_begin124:
        .cfi_startproc
        mul x9, x2, x0
        umulh x10, x2, x0
        umulh x11, x3, x0
        mul x12, x3, x0
        umulh x13, x2, x1
        mul x14, x2, x1
        umulh x15, x3, x1
        mul x16, x3, x1
        adds x10, x10, x14
        cinc x13, x13, hs
        adds x13, x13, x16
        cinc x14, x15, hs
        adds x10, x10, x12
        cinc x11, x11, hs
        adds x11, x13, x11
        stp x9, x10, [x8]
        cinc x9, x14, hs
        stp x11, x9, [x8, #16]
        ret

The original was ~70 instructions so the improvement is significant. With these changes, the result is reasonably close to what LLVM generates using `u256` operands [1].

[1]: https://llvm.godbolt.org/z/re1aGdaqY
diff --git a/crates/libm-test/benches/icount.rs b/crates/libm-test/benches/icount.rs
@@ -77,7 +77,6 @@ fn setup_u128_mul() -> Vec<(u128, u128)> {
     v
 }
 
-/*
 fn setup_u256_add() -> Vec<(u256, u256)> {
     let mut v = Vec::new();
     for (x, y) in setup_u128_mul() {
@@ -88,7 +87,6 @@ fn setup_u256_add() -> Vec<(u256, u256)> {
     v.push((u256::MAX, u256::MAX));
     v
 }
-*/
 
 fn setup_u256_shift() -> Vec<(u256, u32)> {
     let mut v = Vec::new();
@@ -116,7 +114,6 @@ library_benchmark_group!(
     benchmarks = icount_bench_u128_widen_mul
 );
 
-/* Not yet implemented
 #[library_benchmark]
 #[bench::linspace(setup_u256_add())]
 fn icount_bench_u256_add(cases: Vec<(u256, u256)>) {
@@ -129,7 +126,6 @@ library_benchmark_group!(
     name = icount_bench_u256_add_group;
     benchmarks = icount_bench_u256_add
 );
-*/
 
 #[library_benchmark]
 #[bench::linspace(setup_u256_shift())]
@@ -148,7 +144,7 @@ main!(
     library_benchmark_groups =
     // u256-related benchmarks
     icount_bench_u128_widen_mul_group,
-    // icount_bench_u256_add_group,
+    icount_bench_u256_add_group,
     icount_bench_u256_shr_group,
     // verify-apilist-start
     // verify-sorted-start
diff --git a/crates/libm-test/src/gen/random.rs b/crates/libm-test/src/gen/random.rs
@@ -14,7 +14,7 @@ use crate::run_cfg::{int_range, iteration_count};
 
 pub(crate) const SEED_ENV: &str = "LIBM_SEED";
 
-pub(crate) static SEED: LazyLock<[u8; 32]> = LazyLock::new(|| {
+pub static SEED: LazyLock<[u8; 32]> = LazyLock::new(|| {
     let s = env::var(SEED_ENV).unwrap_or_else(|_| {
         let mut rng = rand::thread_rng();
         (0..32).map(|_| rng.sample(Alphanumeric) as char).collect()
diff --git a/crates/libm-test/src/lib.rs b/crates/libm-test/src/lib.rs
@@ -29,7 +29,10 @@ pub use op::{
 };
 pub use precision::{MaybeOverride, SpecialCase, default_ulp};
 use run_cfg::extensive_max_iterations;
-pub use run_cfg::{CheckBasis, CheckCtx, EXTENSIVE_ENV, GeneratorKind, skip_extensive_test};
+pub use run_cfg::{
+    CheckBasis, CheckCtx, EXTENSIVE_ENV, GeneratorKind, bigint_fuzz_iteration_count,
+    skip_extensive_test,
+};
 pub use test_traits::{CheckOutput, Hex, TupleCall};
 
 /// Result type for tests is usually from `anyhow`. Most times there is no success value to
diff --git a/crates/libm-test/src/run_cfg.rs b/crates/libm-test/src/run_cfg.rs
@@ -158,14 +158,6 @@ impl TestEnv {
         let op = id.math_op();
 
         let will_run_mp = cfg!(feature = "build-mpfr");
-
-        // Tests are pretty slow on non-64-bit targets, x86 MacOS, and targets that run in QEMU. Start
-        // with a reduced number on these platforms.
-        let slow_on_ci = crate::emulated()
-            || usize::BITS < 64
-            || cfg!(all(target_arch = "x86_64", target_vendor = "apple"));
-        let slow_platform = slow_on_ci && crate::ci();
-
         let large_float_ty = match op.float_ty {
             FloatTy::F16 | FloatTy::F32 => false,
             FloatTy::F64 | FloatTy::F128 => true,
@@ -176,7 +168,7 @@ impl TestEnv {
         let input_count = op.rust_sig.args.len();
 
         Self {
-            slow_platform,
+            slow_platform: slow_platform(),
             large_float_ty,
             should_run_extensive: will_run_extensive,
             mp_tests_enabled: will_run_mp,
@@ -185,6 +177,17 @@ impl TestEnv {
     }
 }
 
+/// Reports whether tests should start with a reduced iteration count.
+///
+/// Tests are pretty slow on non-64-bit targets, x86 MacOS, and targets that run in QEMU, so
+/// those count as slow, but only while running in CI.
+fn slow_platform() -> bool {
+    let narrow_pointers = usize::BITS < 64;
+    let x86_macos = cfg!(all(target_arch = "x86_64", target_vendor = "apple"));
+
+    if !(crate::emulated() || narrow_pointers || x86_macos) {
+        return false;
+    }
+
+    // If not running in CI, there is no need to reduce iteration count.
+    crate::ci()
+}
+
 /// The number of iterations to run for a given test.
 pub fn iteration_count(ctx: &CheckCtx, argnum: usize) -> u64 {
     let t_env = TestEnv::from_env(ctx);
@@ -351,3 +354,12 @@ pub fn skip_extensive_test(ctx: &CheckCtx) -> bool {
     let t_env = TestEnv::from_env(ctx);
     !t_env.should_run_extensive
 }
+
+/// The number of iterations to run for `u256` fuzz tests.
+///
+/// Builds without the `optimizations_enabled` cfg get 1000 iterations. Otherwise the count is
+/// 100,000 when `slow_platform()` returns true and 5,000,000 when it does not.
+pub fn bigint_fuzz_iteration_count() -> u64 {
+    if cfg!(optimizations_enabled) {
+        if slow_platform() { 100_000 } else { 5_000_000 }
+    } else {
+        1000
+    }
+}
diff --git a/crates/libm-test/tests/u256.rs b/crates/libm-test/tests/u256.rs
@@ -0,0 +1,147 @@
+//! Test the u256 implementation. The ops already get exercised reasonably well through the `f128`
+//! routines, so this only does a few million fuzz iterations against GMP.
+
+#![cfg(feature = "build-mpfr")]
+
+use std::sync::LazyLock;
+
+use libm::support::{HInt, u256};
+type BigInt = rug::Integer;
+
+use libm_test::bigint_fuzz_iteration_count;
+use libm_test::gen::random::SEED;
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+use rug::Assign;
+use rug::integer::Order;
+use rug::ops::NotAssign;
+
+/// The largest 256-bit unsigned value (two all-ones `u128` digits) as a `BigInt`, built on
+/// first use.
+static BIGINT_U256_MAX: LazyLock<BigInt> = LazyLock::new(|| {
+    let all_ones = [u128::MAX; 2];
+    BigInt::from_digits(&all_ones, Order::Lsf)
+});
+
+/// Copied from the test module.
+///
+/// Formats `v` as `0x` followed by the high half and then the low half, each zero-padded to 32
+/// lowercase hex digits.
+fn hexu(v: u256) -> String {
+    let u256 { lo, hi } = v;
+    format!("0x{hi:032x}{lo:032x}")
+}
+
+/// Draw a `u256` from `rng`, generating the low half first and the high half second.
+fn random_u256(rng: &mut ChaCha8Rng) -> u256 {
+    // Field initializers run in the order written, so `lo` is still drawn before `hi`.
+    u256 { lo: rng.gen(), hi: rng.gen() }
+}
+
+/// Overwrite `bx` with the value of `x`, passing its halves least-significant first.
+fn assign_bigint(bx: &mut BigInt, x: u256) {
+    let digits = [x.lo, x.hi];
+    bx.assign_digits(&digits, Order::Lsf);
+}
+
+/// Convert `bx` to a `u256`, keeping only the bits selected by `BIGINT_U256_MAX`.
+///
+/// `bx` is modified in place: it is masked first and reset to zero before returning.
+fn from_bigint(bx: &mut BigInt) -> u256 {
+    // Truncate so the result fits into `[u128; 2]`. This makes all ops overflowing.
+    *bx &= &*BIGINT_U256_MAX;
+
+    let mut digits = [0u128; 2];
+    bx.write_digits(&mut digits, Order::Lsf);
+    bx.assign(0);
+
+    let [lo, hi] = digits;
+    u256 { lo, hi }
+}
+
+/// Compare `actual` against `expected` (converted via `from_bigint`) and panic on a mismatch.
+///
+/// `x` and `y` render the inputs for the failure message. They are only invoked when the
+/// comparison fails; `y` may return `None` to omit the second input line.
+fn check_one(
+    x: impl FnOnce() -> String,
+    y: impl FnOnce() -> Option<String>,
+    actual: u256,
+    expected: &mut BigInt,
+) {
+    let expected = from_bigint(expected);
+    if actual == expected {
+        return;
+    }
+
+    let xmsg = x();
+    let ymsg = match y() {
+        Some(y) => format!("y:        {y}\n"),
+        None => String::new(),
+    };
+    panic!(
+        "Results do not match\n\
+        input:    {xmsg}\n\
+        {ymsg}\
+        actual:   {}\n\
+        expected: {}\
+        ",
+        hexu(actual),
+        hexu(expected),
+    )
+}
+
+/// Fuzz `u256 | u256` against the `BigInt` result.
+#[test]
+fn mp_u256_bitor() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let (mut big_lhs, mut big_rhs) = (BigInt::new(), BigInt::new());
+    let iterations = bigint_fuzz_iteration_count();
+
+    for _ in 0..iterations {
+        let lhs = random_u256(&mut rng);
+        let rhs = random_u256(&mut rng);
+
+        assign_bigint(&mut big_lhs, lhs);
+        assign_bigint(&mut big_rhs, rhs);
+        big_lhs |= &big_rhs;
+
+        check_one(|| hexu(lhs), || Some(hexu(rhs)), lhs | rhs, &mut big_lhs);
+    }
+}
+
+/// Fuzz `!u256` against the `BigInt` result.
+#[test]
+fn mp_u256_not() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut big_input = BigInt::new();
+    let iterations = bigint_fuzz_iteration_count();
+
+    for _ in 0..iterations {
+        let input = random_u256(&mut rng);
+
+        assign_bigint(&mut big_input, input);
+        big_input.not_assign();
+
+        check_one(|| hexu(input), || None, !input, &mut big_input);
+    }
+}
+
+/// Fuzz `u256 + u256` against the `BigInt` sum, which `check_one` truncates before comparing.
+#[test]
+fn mp_u256_add() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let (mut big_lhs, mut big_rhs) = (BigInt::new(), BigInt::new());
+    let iterations = bigint_fuzz_iteration_count();
+
+    for _ in 0..iterations {
+        let lhs = random_u256(&mut rng);
+        let rhs = random_u256(&mut rng);
+
+        assign_bigint(&mut big_lhs, lhs);
+        assign_bigint(&mut big_rhs, rhs);
+        let sum = lhs + rhs;
+        big_lhs += &big_rhs;
+
+        check_one(|| hexu(lhs), || Some(hexu(rhs)), sum, &mut big_lhs);
+    }
+}
+
+/// Fuzz `u256 >> u32` against the `BigInt` result for every shift amount from 0 through 255.
+#[test]
+fn mp_u256_shr() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        // Inclusive upper bound: 255 is the largest shift below the 256-bit width, and an
+        // exclusive `0..255` range would never exercise it.
+        let shift: u32 = rng.gen_range(0..=255);
+        assign_bigint(&mut bx, x);
+        let actual = x >> shift;
+        bx >>= shift;
+        check_one(|| hexu(x), || Some(shift.to_string()), actual, &mut bx);
+    }
+}
+
+/// Fuzz `u128::widen_mul` against the `BigInt` product of the same two operands.
+#[test]
+fn mp_u256_widen_mul() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let (mut big_lhs, mut big_rhs) = (BigInt::new(), BigInt::new());
+    let iterations = bigint_fuzz_iteration_count();
+
+    for _ in 0..iterations {
+        let lhs: u128 = rng.gen();
+        let rhs: u128 = rng.gen();
+
+        big_lhs.assign(lhs);
+        big_rhs.assign(rhs);
+        let product = lhs.widen_mul(rhs);
+        big_lhs *= &big_rhs;
+
+        check_one(
+            || format!("{lhs:#034x}"),
+            || Some(format!("{rhs:#034x}")),
+            product,
+            &mut big_lhs,
+        );
+    }
+}
diff --git a/src/math/support/big.rs b/src/math/support/big.rs
diff --git a/src/math/support/big/tests.rs b/src/math/support/big/tests.rs