Skip to content

Commit a3c1c57

Browse files
committed
Add f128 division
Use the new generic division algorithm to expose `__divtf3` and `__divkf3`.
1 parent a5ce398 commit a3c1c57

File tree

5 files changed

+59
-26
lines changed

5 files changed

+59
-26
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ of being added to Rust.
256256

257257
- [x] addtf3.c
258258
- [x] comparetf2.c
259-
- [ ] divtf3.c
259+
- [x] divtf3.c
260260
- [x] extenddftf2.c
261261
- [x] extendhfsf2.c
262262
- [x] extendhftf2.c

build.rs

-1
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,6 @@ mod c {
526526
("__floatsitf", "floatsitf.c"),
527527
("__floatunditf", "floatunditf.c"),
528528
("__floatunsitf", "floatunsitf.c"),
529-
("__divtf3", "divtf3.c"),
530529
("__powitf2", "powitf2.c"),
531530
("__fe_getround", "fp_mode.c"),
532531
("__fe_raise_inexact", "fp_mode.c"),

examples/intrinsics.rs

+5
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,10 @@ mod intrinsics {
256256
a * b
257257
}
258258

259+
pub fn divtf(a: f128, b: f128) -> f128 {
260+
a / b
261+
}
262+
259263
pub fn subtf(a: f128, b: f128) -> f128 {
260264
a - b
261265
}
@@ -440,6 +444,7 @@ fn run() {
440444
bb(aeabi_uldivmod(bb(2), bb(3)));
441445
bb(ashlti3(bb(2), bb(2)));
442446
bb(ashrti3(bb(2), bb(2)));
447+
bb(divtf(bb(2.), bb(2.)));
443448
bb(divti3(bb(2), bb(2)));
444449
bb(eqtf(bb(2.), bb(2.)));
445450
bb(extendhfdf(bb(2.)));

src/float/div.rs

+37-24
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,15 @@
22
//!
33
//! This module documentation gives an overview of the method used. More documentation is inline.
44
//!
5-
//! Relevant notation:
5+
//! # Relevant notation
66
//!
77
//! - `m_a`: the mantissa of `a`, in base 2
88
//! - `p_a`: the exponent of `a`, in base 2. I.e. `a = m_a * 2^p_a`
99
//! - `uqN` (e.g. `uq1`): this refers to Q notation for fixed-point numbers. UQ1.31 is an unsigned
1010
//! fixed-point number with 1 integral bit, and 31 decimal bits. A `uqN` variable of type `uM`
1111
//! will have N bits of integer and M-N bits of fraction.
1212
//! - `hw`: half width, i.e. for `f64` this will be a `u32`.
13-
//! - `x` is the best estimate of `1/b`
13+
//! - `x` is the best estimate of `1/m_b`
1414
//!
1515
//! # Method Overview
1616
//!
@@ -22,10 +22,10 @@
2222
//! - Check for early exits (infinity, zero, etc).
2323
//! - If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
2424
//! - Set the implicit bit so math is correct.
25-
//! - Shift the significand (with implicit bit) fully left such that fixed point UQ1 or UQ0
26-
//! numbers can be used for mantissa math. These will have greater precision than the actual
27-
//! mantissa, which is important for correct rounding.
28-
//! - Calculate the reciprocal of `b`, `x`.
25+
//! - Shift mantissa significant digits (with implicit bit) fully left such that fixed-point UQ1
26+
//! or UQ0 numbers can be used for mantissa math. These will have greater precision than the
27+
//! actual mantissa, which is important for correct rounding.
28+
//! - Calculate the reciprocal of `m_b`, `x`.
2929
//! - Use the reciprocal to multiply rather than divide: `res = m_a * x_b * 2^{p_a - p_b}`.
3030
//! - Reapply rounding.
3131
//!
@@ -37,7 +37,7 @@
3737
//!
3838
//! In general, Newton's method takes the following form:
3939
//!
40-
//! ```
40+
//! ```text
4141
//! `x_n` is a guess or the result of a previous iteration. Increasing `n` converges to the
4242
//! desired result.
4343
//!
@@ -46,7 +46,7 @@
4646
//! x_{n+1} = x_n - f(x_n) / f'(x_n)
4747
//! ```
4848
//!
49-
//! Applying this to finding the reciprocal:
49+
//! Applying this to find the reciprocal:
5050
//!
5151
//! ```text
5252
//! 1 / x = b
@@ -59,23 +59,23 @@
5959
//! x_{n+1} = 2*x_n - b*x_n^2
6060
//! ```
6161
//!
62-
//! This is a process that can be repeated a known number of times to calculate the reciprocal with
63-
//! enough precision to achieve a correctly rounded result for the overall division operation.
62+
//! This is a process that can be repeated to calculate the reciprocal with enough precision to
63+
//! achieve a correctly rounded result for the overall division operation. The maximum required
64+
//! number of iterations is known since precision doubles with each iteration.
6465
//!
6566
//! # Half-width operations
6667
//!
6768
//! Calculating the reciprocal requires widening multiplication and performing arithmetic on the
6869
//! results, meaning that emulated integer arithmetic on `u128` (for `f64`) and `u256` (for `f128`)
6970
//! gets used instead of native math.
7071
//!
71-
//! To make this more efficient, all but the final operation can be computed with half-width
72+
//! To make this more efficient, all but the final operation can be computed using half-width
7273
//! integers. For example, rather than computing four iterations using 128-bit integers for `f64`,
7374
//! we can instead perform three iterations using native 64-bit integers and only one final
7475
//! iteration using the full 128 bits.
7576
//!
76-
//! This works because precision doubles with each round, so only one round is needed to extend
77-
//! precision from half bits to near the full bumber of bits (some leeway is allowed here because
78-
//! our fixed point number has more bits than the final mantissa will).
77+
//! This works because of precision doubling. Some leeway is allowed here because the fixed-point
78+
//! number has more bits than the final mantissa will.
7979
//!
8080
//! [Newton-Raphson method]: https://en.wikipedia.org/wiki/Newton%27s_method
8181
@@ -504,33 +504,38 @@ where
504504
F::from_repr(abs_result | quotient_sign)
505505
}
506506

507-
/// Calculate the number of iterations required to get needed precision of a float type.
507+
/// Calculate the number of iterations required for a float type's precision.
508+
///
509+
/// This returns `(h, f)` where `h` is the number of iterations to be done using integers at half
510+
/// the float's bit width, and `f` is the number of iterations done using integers of the float's
511+
/// full width. This is further explained in the module documentation.
508512
///
509-
/// This returns `(h, f)` where `h` is the number of iterations to be donei using integers
510-
/// at half the float's width, and `f` is the number of iterations done using integers of the
511-
/// float's full width. Doing some iterations at half width is an optimization when the float
512-
/// is larger than a word.
513+
/// # Requirements
513514
///
514-
/// ASSUMPTION: the initial estimate should have at least 8 bits of precision. If this is not
515-
/// true, results will be inaccurate.
515+
/// The initial estimate should have at least 8 bits of precision. If this is not true, results
516+
/// will be inaccurate.
516517
const fn get_iterations<F: Float>() -> (usize, usize) {
517-
// Precision doubles with each iteration
518+
// Precision doubles with each iteration. Assume we start with 8 bits of precision.
518519
let total_iterations = F::BITS.ilog2() as usize - 2;
519520

520521
if 2 * size_of::<F>() <= size_of::<*const ()>() {
521522
// If widening multiplication will be efficient (uses word-sized integers), there is no
522523
// reason to use half-sized iterations.
523524
(0, total_iterations)
524525
} else {
526+
// Otherwise, do as many iterations as possible at half width.
525527
(total_iterations - 1, 1)
526528
}
527529
}
528530

529-
/// u_n for different precisions (with N-1 half-width iterations):
531+
/// `u_n` for different precisions (with N-1 half-width iterations).
532+
///
530533
/// W0 is the precision of C
531534
/// u_0 = (3/4 - 1/sqrt(2) + 2^-W0) * 2^HW
532535
///
533536
/// Estimated with bc:
537+
///
538+
/// ```text
534539
/// define half1(un) { return 2.0 * (un + un^2) / 2.0^hw + 1.0; }
535540
/// define half2(un) { return 2.0 * un / 2.0^hw + 2.0; }
536541
/// define full1(un) { return 4.0 * (un + 3.01) / 2.0^hw + 2.0 * (un + 3.01)^2 + 4.0; }
@@ -543,8 +548,9 @@ const fn get_iterations<F: Float>() -> (usize, usize) {
543548
/// u_3 | < 7.31 | | < 7.31 | < 27054456580
544549
/// u_4 | | | | < 80.4
545550
/// Final (U_N) | same as u_3 | < 72 | < 218 | < 13920
551+
/// ````
546552
///
547-
/// Add 2 to U_N due to final decrement.
553+
/// Add 2 to `U_N` due to final decrement.
548554
const fn reciprocal_precision<F: Float>() -> u16 {
549555
let (half_iterations, full_iterations) = get_iterations::<F>();
550556

@@ -612,6 +618,13 @@ intrinsics! {
612618
div(a, b)
613619
}
614620

621+
#[avr_skip]
622+
#[ppc_alias = __divkf3]
623+
#[cfg(not(feature = "no-f16-f128"))]
624+
pub extern "C" fn __divtf3(a: f128, b: f128) -> f128 {
625+
div(a, b)
626+
}
627+
615628
#[cfg(target_arch = "arm")]
616629
pub extern "C" fn __divsf3vfp(a: f32, b: f32) -> f32 {
617630
a / b

testcrate/tests/div_rem.rs

+16
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#![feature(f128)]
12
#![allow(unused_macros)]
23

34
use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4};
@@ -152,4 +153,19 @@ mod float_div {
152153
f32, __divsf3vfp, Single, all();
153154
f64, __divdf3vfp, Double, all();
154155
}
156+
157+
#[cfg(not(feature = "no-f16-f128"))]
158+
#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
159+
float! {
160+
f128, __divtf3, Quad,
161+
// FIXME(llvm): there is a bug in LLVM rt.
162+
// See <https://github.com/llvm/llvm-project/issues/91840>.
163+
not(any(feature = "no-sys-f128", all(target_arch = "aarch64", target_os = "linux")));
164+
}
165+
166+
#[cfg(not(feature = "no-f16-f128"))]
167+
#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
168+
float! {
169+
f128, __divkf3, Quad, not(feature = "no-sys-f128");
170+
}
155171
}

0 commit comments

Comments
 (0)