|
26 | 26 | namespace LIBC_NAMESPACE_DECL {
|
27 | 27 | namespace fputil {
|
28 | 28 |
|
| 29 | +// Decide whether to round a UInt up, down or not at all at a given bit |
| 30 | +// position, based on the current rounding mode. The assumption is that the |
| 31 | +// caller is going to make the integer `value >> rshift`, and then might need |
| 32 | +// to round it up by 1 depending on the value of the bits shifted off the |
| 33 | +// bottom. |
| 34 | +// |
| 35 | +// `logical_sign` causes the behavior of FE_DOWNWARD and FE_UPWARD to |
| 36 | +// be reversed, which is what you'd want if this is the mantissa of a |
| 37 | +// negative floating-point number. |
| 38 | +// |
| 39 | +// Return value is +1 if the value should be rounded up; -1 if it should be |
| 40 | +// rounded down; 0 if it's exact and needs no rounding. |
| 41 | +template <size_t Bits> |
| 42 | +LIBC_INLINE constexpr int |
| 43 | +rounding_direction(const LIBC_NAMESPACE::UInt<Bits> &value, size_t rshift, |
| 44 | + Sign logical_sign) { |
| 45 | + if (rshift == 0 || (rshift < Bits && (value << (Bits - rshift)) == 0) || |
| 46 | + (rshift >= Bits && value == 0)) |
| 47 | + return 0; // exact |
| 48 | + |
| 49 | + switch (quick_get_round()) { |
| 50 | + case FE_TONEAREST: |
| 51 | + if (rshift > 0 && rshift <= Bits && value.get_bit(rshift - 1)) { |
| 52 | + // We round up, unless the value is an exact halfway case and |
| 53 | + // the bit that will end up in the units place is 0, in which |
| 54 | + // case tie-break-to-even says round down. |
| 55 | + bool round_bit = rshift < Bits ? value.get_bit(rshift) : 0; |
| 56 | + return round_bit != 0 || (value << (Bits - rshift + 1)) != 0 ? +1 : -1; |
| 57 | + } else { |
| 58 | + return -1; |
| 59 | + } |
| 60 | + case FE_TOWARDZERO: |
| 61 | + return -1; |
| 62 | + case FE_DOWNWARD: |
| 63 | + return logical_sign.is_neg() && |
| 64 | + (rshift < Bits && (value << (Bits - rshift)) != 0) |
| 65 | + ? +1 |
| 66 | + : -1; |
| 67 | + case FE_UPWARD: |
| 68 | + return logical_sign.is_pos() && |
| 69 | + (rshift < Bits && (value << (Bits - rshift)) != 0) |
| 70 | + ? +1 |
| 71 | + : -1; |
| 72 | + default: |
| 73 | + __builtin_unreachable(); |
| 74 | + } |
| 75 | +} |
| 76 | + |
29 | 77 | // A generic class to perform computations of high precision floating points.
|
30 | 78 | // We store the value in dyadic format, including 3 fields:
|
31 | 79 | // sign : boolean value - false means positive, true means negative
|
@@ -101,6 +149,27 @@ template <size_t Bits> struct DyadicFloat {
|
101 | 149 | return exponent + (Bits - 1);
|
102 | 150 | }
|
103 | 151 |
|
| 152 | + // Produce a correctly rounded DyadicFloat from a too-large mantissa, |
| 153 | + // by shifting it down and rounding if necessary. |
| 154 | + template <size_t MantissaBits> |
| 155 | + LIBC_INLINE constexpr static DyadicFloat<Bits> |
| 156 | + round(Sign result_sign, int result_exponent, |
| 157 | + const LIBC_NAMESPACE::UInt<MantissaBits> &input_mantissa, |
| 158 | + size_t rshift) { |
| 159 | + MantissaType result_mantissa(input_mantissa >> rshift); |
| 160 | + if (rounding_direction(input_mantissa, rshift, result_sign) > 0) { |
| 161 | + ++result_mantissa; |
| 162 | + if (result_mantissa == 0) { |
| 163 | + // Rounding up made the mantissa integer wrap round to 0, |
| 164 | + // carrying a bit off the top. So we've rounded up to the next |
| 165 | + // exponent. |
| 166 | + result_mantissa.set_bit(Bits - 1); |
| 167 | + ++result_exponent; |
| 168 | + } |
| 169 | + } |
| 170 | + return DyadicFloat(result_sign, result_exponent, result_mantissa); |
| 171 | + } |
| 172 | + |
104 | 173 | #ifdef LIBC_TYPES_HAS_FLOAT16
|
105 | 174 | template <typename T, bool ShouldSignalExceptions>
|
106 | 175 | LIBC_INLINE constexpr cpp::enable_if_t<
|
@@ -374,6 +443,39 @@ template <size_t Bits> struct DyadicFloat {
|
374 | 443 |
|
375 | 444 | return new_mant;
|
376 | 445 | }
|
| 446 | + |
| 447 | + LIBC_INLINE constexpr MantissaType |
| 448 | + as_mantissa_type_rounded(int *round_dir_out = nullptr) const { |
| 449 | + int round_dir = 0; |
| 450 | + MantissaType new_mant; |
| 451 | + if (mantissa.is_zero()) { |
| 452 | + new_mant = 0; |
| 453 | + } else { |
| 454 | + new_mant = mantissa; |
| 455 | + if (exponent > 0) { |
| 456 | + new_mant <<= exponent; |
| 457 | + } else if (exponent < 0) { |
| 458 | + size_t shift = -exponent; |
| 459 | + new_mant >>= shift; |
| 460 | + round_dir = rounding_direction(mantissa, shift, sign); |
| 461 | + if (round_dir > 0) |
| 462 | + ++new_mant; |
| 463 | + } |
| 464 | + |
| 465 | + if (sign.is_neg()) { |
| 466 | + new_mant = (~new_mant) + 1; |
| 467 | + } |
| 468 | + } |
| 469 | + |
| 470 | + if (round_dir_out) |
| 471 | + *round_dir_out = round_dir; |
| 472 | + |
| 473 | + return new_mant; |
| 474 | + } |
| 475 | + |
| 476 | + LIBC_INLINE constexpr DyadicFloat operator-() const { |
| 477 | + return DyadicFloat(sign.negate(), exponent, mantissa); |
| 478 | + } |
377 | 479 | };
|
378 | 480 |
|
379 | 481 | // Quick add - Add 2 dyadic floats with rounding toward 0 and then normalize the
|
@@ -433,6 +535,12 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a,
|
433 | 535 | return result.normalize();
|
434 | 536 | }
|
435 | 537 |
|
| 538 | +template <size_t Bits> |
| 539 | +LIBC_INLINE constexpr DyadicFloat<Bits> quick_sub(DyadicFloat<Bits> a, |
| 540 | + DyadicFloat<Bits> b) { |
| 541 | + return quick_add(a, -b); |
| 542 | +} |
| 543 | + |
436 | 544 | // Quick Mul - Slightly less accurate but efficient multiplication of 2 dyadic
|
437 | 545 | // floats with rounding toward 0 and then normalize the output:
|
438 | 546 | // result.exponent = a.exponent + b.exponent + Bits,
|
@@ -464,6 +572,95 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_mul(const DyadicFloat<Bits> &a,
|
464 | 572 | return result;
|
465 | 573 | }
|
466 | 574 |
|
| 575 | +// Correctly rounded multiplication of 2 dyadic floats, assuming the |
| 576 | +// exponent remains within range. |
| 577 | +template <size_t Bits> |
| 578 | +LIBC_INLINE constexpr DyadicFloat<Bits> |
| 579 | +rounded_mul(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b) { |
| 580 | + using DblMant = LIBC_NAMESPACE::UInt<(2 * Bits)>; |
| 581 | + Sign result_sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS; |
| 582 | + int result_exponent = a.exponent + b.exponent + static_cast<int>(Bits); |
| 583 | + auto product = DblMant(a.mantissa) * DblMant(b.mantissa); |
| 584 | + // As in quick_mul(), renormalize by 1 bit manually rather than countl_zero |
| 585 | + if (product.get_bit(2 * Bits - 1) == 0) { |
| 586 | + product <<= 1; |
| 587 | + result_exponent -= 1; |
| 588 | + } |
| 589 | + |
| 590 | + return DyadicFloat<Bits>::round(result_sign, result_exponent, product, Bits); |
| 591 | +} |
| 592 | + |
| 593 | +// Approximate reciprocal - given a nonzero a, make a good approximation to 1/a. |
| 594 | +// The method is Newton-Raphson iteration, based on quick_mul. |
| 595 | +template <size_t Bits, typename = cpp::enable_if_t<(Bits >= 32)>> |
| 596 | +LIBC_INLINE constexpr DyadicFloat<Bits> |
| 597 | +approx_reciprocal(const DyadicFloat<Bits> &a) { |
| 598 | + // Given an approximation x to 1/a, a better one is x' = x(2-ax). |
| 599 | + // |
| 600 | + // You can derive this by using the Newton-Raphson formula with the function |
| 601 | + // f(x) = 1/x - a. But another way to see that it works is to say: suppose |
| 602 | + // that ax = 1-e for some small error e. Then ax' = ax(2-ax) = (1-e)(1+e) = |
| 603 | + // 1-e^2. So the error in x' is the square of the error in x, i.e. the number |
| 604 | + // of correct bits in x' is double the number in x. |
| 605 | + |
| 606 | + // An initial approximation to the reciprocal |
| 607 | + DyadicFloat<Bits> x(Sign::POS, -32 - a.exponent - Bits, |
| 608 | + uint64_t(0xFFFFFFFFFFFFFFFF) / |
| 609 | + static_cast<uint64_t>(a.mantissa >> (Bits - 32))); |
| 610 | + |
| 611 | + // The constant 2, which we'll need in every iteration |
| 612 | + DyadicFloat<Bits> two(Sign::POS, 1, 1); |
| 613 | + |
| 614 | + // We expect at least 31 correct bits from our 32-bit starting approximation |
| 615 | + size_t ok_bits = 31; |
| 616 | + |
| 617 | + // The number of good bits doubles in each iteration, except that rounding |
| 618 | + // errors introduce a little extra each time. Subtract a bit from our |
| 619 | + // accuracy assessment to account for that. |
| 620 | + while (ok_bits < Bits) { |
| 621 | + x = quick_mul(x, quick_sub(two, quick_mul(a, x))); |
| 622 | + ok_bits = 2 * ok_bits - 1; |
| 623 | + } |
| 624 | + |
| 625 | + return x; |
| 626 | +} |
| 627 | + |
| 628 | +// Correctly rounded division of 2 dyadic floats, assuming the |
| 629 | +// exponent remains within range. |
| 630 | +template <size_t Bits> |
| 631 | +LIBC_INLINE constexpr DyadicFloat<Bits> |
| 632 | +rounded_div(const DyadicFloat<Bits> &af, const DyadicFloat<Bits> &bf) { |
| 633 | + using DblMant = LIBC_NAMESPACE::UInt<(Bits * 2 + 64)>; |
| 634 | + |
| 635 | + // Make an approximation to the quotient as a * (1/b). Both the |
| 636 | + // multiplication and the reciprocal are a bit sloppy, which doesn't |
| 637 | + // matter, because we're going to correct for that below. |
| 638 | + auto qf = fputil::quick_mul(af, fputil::approx_reciprocal(bf)); |
| 639 | + |
| 640 | + // Switch to BigInt and stop using quick_add and quick_mul: now |
| 641 | + // we're working in exact integers so as to get the true remainder. |
| 642 | + DblMant a = af.mantissa, b = bf.mantissa, q = qf.mantissa; |
| 643 | + q <<= 2; // leave room for a round bit, even if exponent decreases |
| 644 | + a <<= af.exponent - bf.exponent - qf.exponent + 2; |
| 645 | + DblMant qb = q * b; |
| 646 | + if (qb < a) { |
| 647 | + DblMant too_small = a - b; |
| 648 | + while (qb <= too_small) { |
| 649 | + qb += b; |
| 650 | + ++q; |
| 651 | + } |
| 652 | + } else { |
| 653 | + while (qb > a) { |
| 654 | + qb -= b; |
| 655 | + --q; |
| 656 | + } |
| 657 | + } |
| 658 | + |
| 659 | + DyadicFloat<(Bits * 2)> qbig(qf.sign, qf.exponent - 2, q); |
| 660 | + return DyadicFloat<Bits>::round(qbig.sign, qbig.exponent + Bits, |
| 661 | + qbig.mantissa, Bits); |
| 662 | +} |
| 663 | + |
467 | 664 | // Simple polynomial approximation.
|
468 | 665 | template <size_t Bits>
|
469 | 666 | LIBC_INLINE constexpr DyadicFloat<Bits>
|
|
0 commit comments