2
2
//!
3
3
//! This module documentation gives an overview of the method used. More documentation is inline.
4
4
//!
5
- //! Relevant notation:
5
+ //! # Relevant notation
6
6
//!
7
7
//! - `m_a`: the mantissa of `a`, in base 2
8
8
//! - `p_a`: the exponent of `a`, in base 2. I.e. `a = m_a * 2^p_a`
9
9
//! - `uqN` (e.g. `uq1`): this refers to Q notation for fixed-point numbers. UQ1.31 is an unsigned
10
10
//! fixed-point number with 1 integral bit, and 31 decimal bits. A `uqN` variable of type `uM`
11
11
//! will have N bits of integer and M-N bits of fraction.
12
12
//! - `hw`: half width, i.e. for `f64` this will be a `u32`.
13
- //! - `x` is the best estimate of `1/b `
13
+ //! - `x` is the best estimate of `1/m_b `
14
14
//!
15
15
//! # Method Overview
16
16
//!
22
22
//! - Check for early exits (infinity, zero, etc).
23
23
//! - If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
24
24
//! - Set the implicit bit so math is correct.
25
- //! - Shift the significand (with implicit bit) fully left such that fixed point UQ1 or UQ0
26
- //! numbers can be used for mantissa math. These will have greater precision than the actual
27
- //! mantissa, which is important for correct rounding.
28
- //! - Calculate the reciprocal of `b `, `x`.
25
+ //! - Shift mantissa significant digits (with implicit bit) fully left such that fixed- point UQ1
26
+ //! or UQ0 numbers can be used for mantissa math. These will have greater precision than the
27
+ //! actual mantissa, which is important for correct rounding.
28
+ //! - Calculate the reciprocal of `m_b `, `x`.
29
29
//! - Use the reciprocal to multiply rather than divide: `res = m_a * x_b * 2^{p_a - p_b}`.
30
30
//! - Reapply rounding.
31
31
//!
37
37
//!
38
38
//! In general, Newton's method takes the following form:
39
39
//!
40
- //! ```
40
+ //! ```text
41
41
//! `x_n` is a guess or the result of a previous iteration. Increasing `n` converges to the
42
42
//! desired result.
43
43
//!
46
46
//! x_{n+1} = x_n - f(x_n) / f'(x_n)
47
47
//! ```
48
48
//!
49
- //! Applying this to finding the reciprocal:
49
+ //! Applying this to find the reciprocal:
50
50
//!
51
51
//! ```text
52
52
//! 1 / x = b
59
59
//! x_{n+1} = 2*x_n - b*x_n^2
60
60
//! ```
61
61
//!
62
- //! This is a process that can be repeated a known number of times to calculate the reciprocal with
63
- //! enough precision to achieve a correctly rounded result for the overall division operation.
62
+ //! This is a process that can be repeated to calculate the reciprocal with enough precision to
63
+ //! achieve a correctly rounded result for the overall division operation. The maximum required
64
+ //! number of iterations is known since precision doubles with each iteration.
64
65
//!
65
66
//! # Half-width operations
66
67
//!
67
68
//! Calculating the reciprocal requires widening multiplication and performing arithmetic on the
68
69
//! results, meaning that emulated integer arithmetic on `u128` (for `f64`) and `u256` (for `f128`)
69
70
//! gets used instead of native math.
70
71
//!
71
- //! To make this more efficient, all but the final operation can be computed with half-width
72
+ //! To make this more efficient, all but the final operation can be computed using half-width
72
73
//! integers. For example, rather than computing four iterations using 128-bit integers for `f64`,
73
74
//! we can instead perform three iterations using native 64-bit integers and only one final
74
75
//! iteration using the full 128 bits.
75
76
//!
76
- //! This works because precision doubles with each round, so only one round is needed to extend
77
- //! precision from half bits to near the full bumber of bits (some leeway is allowed here because
78
- //! our fixed point number has more bits than the final mantissa will).
77
+ //! This works because of precision doubling. Some leeway is allowed here because the fixed-point
78
+ //! number has more bits than the final mantissa will.
79
79
//!
80
80
//! [Newton-Raphson method]: https://en.wikipedia.org/wiki/Newton%27s_method
81
81
@@ -504,33 +504,38 @@ where
504
504
F :: from_repr ( abs_result | quotient_sign)
505
505
}
506
506
507
- /// Calculate the number of iterations required to get needed precision of a float type.
507
+ /// Calculate the number of iterations required for a float type's precision.
508
+ ///
509
+ /// This returns `(h, f)` where `h` is the number of iterations to be done using integers at half
510
+ /// the float's bit width, and `f` is the number of iterations done using integers of the float's
511
+ /// full width. This is further explained in the module documentation.
508
512
///
509
- /// This returns `(h, f)` where `h` is the number of iterations to be donei using integers
510
- /// at half the float's width, and `f` is the number of iterations done using integers of the
511
- /// float's full width. Doing some iterations at half width is an optimization when the float
512
- /// is larger than a word.
513
+ /// # Requirements
513
514
///
514
- /// ASSUMPTION: the initial estimate should have at least 8 bits of precision. If this is not
515
- /// true, results will be inaccurate.
515
+ /// The initial estimate should have at least 8 bits of precision. If this is not true, results
516
+ /// will be inaccurate.
516
517
const fn get_iterations < F : Float > ( ) -> ( usize , usize ) {
517
- // Precision doubles with each iteration
518
+ // Precision doubles with each iteration. Assume we start with 8 bits of precision.
518
519
let total_iterations = F :: BITS . ilog2 ( ) as usize - 2 ;
519
520
520
521
if 2 * size_of :: < F > ( ) <= size_of :: < * const ( ) > ( ) {
521
522
// If widening multiplication will be efficient (uses word-sized integers), there is no
522
523
// reason to use half-sized iterations.
523
524
( 0 , total_iterations)
524
525
} else {
526
+ // Otherwise, do as many iterations as possible at half width.
525
527
( total_iterations - 1 , 1 )
526
528
}
527
529
}
528
530
529
- /// u_n for different precisions (with N-1 half-width iterations):
531
+ /// `u_n` for different precisions (with N-1 half-width iterations).
532
+ ///
530
533
/// W0 is the precision of C
531
534
/// u_0 = (3/4 - 1/sqrt(2) + 2^-W0) * 2^HW
532
535
///
533
536
/// Estimated with bc:
537
+ ///
538
+ /// ```text
534
539
/// define half1(un) { return 2.0 * (un + un^2) / 2.0^hw + 1.0; }
535
540
/// define half2(un) { return 2.0 * un / 2.0^hw + 2.0; }
536
541
/// define full1(un) { return 4.0 * (un + 3.01) / 2.0^hw + 2.0 * (un + 3.01)^2 + 4.0; }
@@ -543,8 +548,9 @@ const fn get_iterations<F: Float>() -> (usize, usize) {
543
548
/// u_3 | < 7.31 | | < 7.31 | < 27054456580
544
549
/// u_4 | | | | < 80.4
545
550
/// Final (U_N) | same as u_3 | < 72 | < 218 | < 13920
551
+ /// ````
546
552
///
547
- /// Add 2 to U_N due to final decrement.
553
+ /// Add 2 to ` U_N` due to final decrement.
548
554
const fn reciprocal_precision < F : Float > ( ) -> u16 {
549
555
let ( half_iterations, full_iterations) = get_iterations :: < F > ( ) ;
550
556
@@ -612,6 +618,13 @@ intrinsics! {
612
618
div( a, b)
613
619
}
614
620
621
+ #[ avr_skip]
622
+ #[ ppc_alias = __divkf3]
623
+ #[ cfg( not( feature = "no-f16-f128" ) ) ]
624
+ pub extern "C" fn __divtf3( a: f128, b: f128) -> f128 {
625
+ div( a, b)
626
+ }
627
+
615
628
#[ cfg( target_arch = "arm" ) ]
616
629
pub extern "C" fn __divsf3vfp( a: f32 , b: f32 ) -> f32 {
617
630
a / b
0 commit comments