Skip to content

Commit b53da77

Browse files
authored
[libc] Alternative algorithm for decimal FP printf (#123643)
The existing options for bin→dec float conversion are all based on the Ryū algorithm, which generates 9 output digits at a time using a table lookup. For users who can't afford the space cost of the table, the table-lookup subroutine is replaced with one that computes the needed table entry on demand, but the algorithm is otherwise unmodified. The performance problem with computing table entries on demand is that now you need to calculate a power of 10 for each 9 digits you output. But if you're calculating a custom power of 10 anyway, it's easier to just compute one, and multiply the _whole_ mantissa by it. This patch adds a header file alongside `float_dec_converter.h`, which replaces the whole Ryū system instead of just the table-lookup routine, implementing this alternative simpler algorithm. The result is accurate enough to satisfy (minimally) the accuracy demands of IEEE 754-2019 even in 128-bit long double. The new float128 test cases demonstrate this by testing the cases closest to the 39-digit rounding boundary. In my tests of generating 39 output digits (the maximum number supported by this algorithm) this code is also both faster and smaller than the USE_DYADIC_FLOAT version of the existing Ryū code.
1 parent c06d0ff commit b53da77

File tree

12 files changed

+1028
-9
lines changed

12 files changed

+1028
-9
lines changed

libc/config/config.json

+4
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030
"value": false,
3131
"doc": "Use the same mode for double and long double in printf."
3232
},
33+
"LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_FLOAT320": {
34+
"value": false,
35+
"doc": "Use an alternative printf float implementation based on 320-bit floats."
36+
},
3337
"LIBC_CONF_PRINTF_DISABLE_FIXED_POINT": {
3438
"value": false,
3539
"doc": "Disable printing fixed point values in printf and friends."

libc/docs/configure.rst

+1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ to learn about the defaults for your platform and target.
4343
- ``LIBC_CONF_PRINTF_DISABLE_WRITE_INT``: Disable handling of %n in printf format string.
4444
- ``LIBC_CONF_PRINTF_FLOAT_TO_STR_NO_SPECIALIZE_LD``: Use the same mode for double and long double in printf.
4545
- ``LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_DYADIC_FLOAT``: Use dyadic float for faster and smaller but less accurate printf doubles.
46+
- ``LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_FLOAT320``: Use an alternative printf float implementation based on 320-bit floats.
4647
- ``LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE``: Use large table for better printf long double performance.
4748
* **"pthread" options**
4849
- ``LIBC_CONF_RAW_MUTEX_DEFAULT_SPIN_COUNT``: Default number of spins before blocking if a mutex is in contention (default to 100).

libc/src/__support/CPP/algorithm.h

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ template <class T> LIBC_INLINE constexpr const T &min(const T &a, const T &b) {
2626
return (a < b) ? a : b;
2727
}
2828

29+
template <class T> LIBC_INLINE constexpr T abs(T a) { return a < 0 ? -a : a; }
30+
2931
template <class InputIt, class UnaryPred>
3032
LIBC_INLINE constexpr InputIt find_if_not(InputIt first, InputIt last,
3133
UnaryPred q) {

libc/src/__support/FPUtil/dyadic_float.h

+197
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,54 @@
2626
namespace LIBC_NAMESPACE_DECL {
2727
namespace fputil {
2828

29+
// Decide whether to round a UInt up, down or not at all at a given bit
30+
// position, based on the current rounding mode. The assumption is that the
31+
// caller is going to make the integer `value >> rshift`, and then might need
32+
// to round it up by 1 depending on the value of the bits shifted off the
33+
// bottom.
34+
//
35+
// A negative `logical_sign` causes the behavior of FE_DOWNWARD and FE_UPWARD to
36+
// be reversed, which is what you'd want if this is the mantissa of a
37+
// negative floating-point number.
38+
//
39+
// Return value is +1 if the value should be rounded up; -1 if it should be
40+
// rounded down; 0 if it's exact and needs no rounding.
41+
template <size_t Bits>
42+
LIBC_INLINE constexpr int
43+
rounding_direction(const LIBC_NAMESPACE::UInt<Bits> &value, size_t rshift,
44+
Sign logical_sign) {
45+
if (rshift == 0 || (rshift < Bits && (value << (Bits - rshift)) == 0) ||
46+
(rshift >= Bits && value == 0))
47+
return 0; // exact
48+
49+
switch (quick_get_round()) {
50+
case FE_TONEAREST:
51+
if (rshift > 0 && rshift <= Bits && value.get_bit(rshift - 1)) {
52+
// We round up, unless the value is an exact halfway case and
53+
// the bit that will end up in the units place is 0, in which
54+
// case tie-break-to-even says round down.
55+
bool round_bit = rshift < Bits ? value.get_bit(rshift) : 0;
56+
return round_bit != 0 || (value << (Bits - rshift + 1)) != 0 ? +1 : -1;
57+
} else {
58+
return -1;
59+
}
60+
case FE_TOWARDZERO:
61+
return -1;
62+
case FE_DOWNWARD:
63+
return logical_sign.is_neg() &&
64+
(rshift < Bits && (value << (Bits - rshift)) != 0)
65+
? +1
66+
: -1;
67+
case FE_UPWARD:
68+
return logical_sign.is_pos() &&
69+
(rshift < Bits && (value << (Bits - rshift)) != 0)
70+
? +1
71+
: -1;
72+
default:
73+
__builtin_unreachable();
74+
}
75+
}
76+
2977
// A generic class to perform computations of high precision floating points.
3078
// We store the value in dyadic format, including 3 fields:
3179
// sign : boolean value - false means positive, true means negative
@@ -101,6 +149,27 @@ template <size_t Bits> struct DyadicFloat {
101149
return exponent + (Bits - 1);
102150
}
103151

152+
// Produce a correctly rounded DyadicFloat from a too-large mantissa,
153+
// by shifting it down and rounding if necessary.
154+
template <size_t MantissaBits>
155+
LIBC_INLINE constexpr static DyadicFloat<Bits>
156+
round(Sign result_sign, int result_exponent,
157+
const LIBC_NAMESPACE::UInt<MantissaBits> &input_mantissa,
158+
size_t rshift) {
159+
MantissaType result_mantissa(input_mantissa >> rshift);
160+
if (rounding_direction(input_mantissa, rshift, result_sign) > 0) {
161+
++result_mantissa;
162+
if (result_mantissa == 0) {
163+
// Rounding up made the mantissa integer wrap round to 0,
164+
// carrying a bit off the top. So we've rounded up to the next
165+
// exponent.
166+
result_mantissa.set_bit(Bits - 1);
167+
++result_exponent;
168+
}
169+
}
170+
return DyadicFloat(result_sign, result_exponent, result_mantissa);
171+
}
172+
104173
#ifdef LIBC_TYPES_HAS_FLOAT16
105174
template <typename T, bool ShouldSignalExceptions>
106175
LIBC_INLINE constexpr cpp::enable_if_t<
@@ -374,6 +443,39 @@ template <size_t Bits> struct DyadicFloat {
374443

375444
return new_mant;
376445
}
446+
447+
LIBC_INLINE constexpr MantissaType
448+
as_mantissa_type_rounded(int *round_dir_out = nullptr) const {
449+
int round_dir = 0;
450+
MantissaType new_mant;
451+
if (mantissa.is_zero()) {
452+
new_mant = 0;
453+
} else {
454+
new_mant = mantissa;
455+
if (exponent > 0) {
456+
new_mant <<= exponent;
457+
} else if (exponent < 0) {
458+
size_t shift = -exponent;
459+
new_mant >>= shift;
460+
round_dir = rounding_direction(mantissa, shift, sign);
461+
if (round_dir > 0)
462+
++new_mant;
463+
}
464+
465+
if (sign.is_neg()) {
466+
new_mant = (~new_mant) + 1;
467+
}
468+
}
469+
470+
if (round_dir_out)
471+
*round_dir_out = round_dir;
472+
473+
return new_mant;
474+
}
475+
476+
LIBC_INLINE constexpr DyadicFloat operator-() const {
477+
return DyadicFloat(sign.negate(), exponent, mantissa);
478+
}
377479
};
378480

379481
// Quick add - Add 2 dyadic floats with rounding toward 0 and then normalize the
@@ -433,6 +535,12 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a,
433535
return result.normalize();
434536
}
435537

538+
template <size_t Bits>
539+
LIBC_INLINE constexpr DyadicFloat<Bits> quick_sub(DyadicFloat<Bits> a,
540+
DyadicFloat<Bits> b) {
541+
return quick_add(a, -b);
542+
}
543+
436544
// Quick Mul - Slightly less accurate but efficient multiplication of 2 dyadic
437545
// floats with rounding toward 0 and then normalize the output:
438546
// result.exponent = a.exponent + b.exponent + Bits,
@@ -464,6 +572,95 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_mul(const DyadicFloat<Bits> &a,
464572
return result;
465573
}
466574

575+
// Correctly rounded multiplication of 2 dyadic floats, assuming the
576+
// exponent remains within range.
577+
template <size_t Bits>
578+
LIBC_INLINE constexpr DyadicFloat<Bits>
579+
rounded_mul(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b) {
580+
using DblMant = LIBC_NAMESPACE::UInt<(2 * Bits)>;
581+
Sign result_sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS;
582+
int result_exponent = a.exponent + b.exponent + static_cast<int>(Bits);
583+
auto product = DblMant(a.mantissa) * DblMant(b.mantissa);
584+
// As in quick_mul(), renormalize by 1 bit manually rather than countl_zero
585+
if (product.get_bit(2 * Bits - 1) == 0) {
586+
product <<= 1;
587+
result_exponent -= 1;
588+
}
589+
590+
return DyadicFloat<Bits>::round(result_sign, result_exponent, product, Bits);
591+
}
592+
593+
// Approximate reciprocal - given a nonzero a, make a good approximation to 1/a.
594+
// The method is Newton-Raphson iteration, based on quick_mul.
595+
template <size_t Bits, typename = cpp::enable_if_t<(Bits >= 32)>>
596+
LIBC_INLINE constexpr DyadicFloat<Bits>
597+
approx_reciprocal(const DyadicFloat<Bits> &a) {
598+
// Given an approximation x to 1/a, a better one is x' = x(2-ax).
599+
//
600+
// You can derive this by using the Newton-Raphson formula with the function
601+
// f(x) = 1/x - a. But another way to see that it works is to say: suppose
602+
// that ax = 1-e for some small error e. Then ax' = ax(2-ax) = (1-e)(1+e) =
603+
// 1-e^2. So the error in x' is the square of the error in x, i.e. the number
604+
// of correct bits in x' is double the number in x.
605+
606+
// An initial approximation to the reciprocal
607+
DyadicFloat<Bits> x(Sign::POS, -32 - a.exponent - Bits,
608+
uint64_t(0xFFFFFFFFFFFFFFFF) /
609+
static_cast<uint64_t>(a.mantissa >> (Bits - 32)));
610+
611+
// The constant 2, which we'll need in every iteration
612+
DyadicFloat<Bits> two(Sign::POS, 1, 1);
613+
614+
// We expect at least 31 correct bits from our 32-bit starting approximation
615+
size_t ok_bits = 31;
616+
617+
// The number of good bits doubles in each iteration, except that rounding
618+
// errors introduce a little extra inaccuracy each time. Subtract a bit from our
619+
// accuracy assessment to account for that.
620+
while (ok_bits < Bits) {
621+
x = quick_mul(x, quick_sub(two, quick_mul(a, x)));
622+
ok_bits = 2 * ok_bits - 1;
623+
}
624+
625+
return x;
626+
}
627+
628+
// Correctly rounded division of 2 dyadic floats, assuming the
629+
// exponent remains within range.
630+
template <size_t Bits>
631+
LIBC_INLINE constexpr DyadicFloat<Bits>
632+
rounded_div(const DyadicFloat<Bits> &af, const DyadicFloat<Bits> &bf) {
633+
using DblMant = LIBC_NAMESPACE::UInt<(Bits * 2 + 64)>;
634+
635+
// Make an approximation to the quotient as a * (1/b). Both the
636+
// multiplication and the reciprocal are a bit sloppy, which doesn't
637+
// matter, because we're going to correct for that below.
638+
auto qf = fputil::quick_mul(af, fputil::approx_reciprocal(bf));
639+
640+
// Switch to BigInt and stop using quick_add and quick_mul: now
641+
// we're working in exact integers so as to get the true remainder.
642+
DblMant a = af.mantissa, b = bf.mantissa, q = qf.mantissa;
643+
q <<= 2; // leave room for a round bit, even if exponent decreases
644+
a <<= af.exponent - bf.exponent - qf.exponent + 2;
645+
DblMant qb = q * b;
646+
if (qb < a) {
647+
DblMant too_small = a - b;
648+
while (qb <= too_small) {
649+
qb += b;
650+
++q;
651+
}
652+
} else {
653+
while (qb > a) {
654+
qb -= b;
655+
--q;
656+
}
657+
}
658+
659+
DyadicFloat<(Bits * 2)> qbig(qf.sign, qf.exponent - 2, q);
660+
return DyadicFloat<Bits>::round(qbig.sign, qbig.exponent + Bits,
661+
qbig.mantissa, Bits);
662+
}
663+
467664
// Simple polynomial approximation.
468665
template <size_t Bits>
469666
LIBC_INLINE constexpr DyadicFloat<Bits>

libc/src/__support/big_int.h

+13-7
Original file line numberDiff line numberDiff line change
@@ -936,6 +936,18 @@ struct BigInt {
936936
// Return the i-th word of the number.
937937
LIBC_INLINE constexpr WordType &operator[](size_t i) { return val[i]; }
938938

939+
// Return the i-th bit of the number.
940+
LIBC_INLINE constexpr bool get_bit(size_t i) const {
941+
const size_t word_index = i / WORD_SIZE;
942+
return 1 & (val[word_index] >> (i % WORD_SIZE));
943+
}
944+
945+
// Set the i-th bit of the number.
946+
LIBC_INLINE constexpr void set_bit(size_t i) {
947+
const size_t word_index = i / WORD_SIZE;
948+
val[word_index] |= WordType(1) << (i % WORD_SIZE);
949+
}
950+
939951
private:
940952
LIBC_INLINE friend constexpr int cmp(const BigInt &lhs, const BigInt &rhs) {
941953
constexpr auto compare = [](WordType a, WordType b) {
@@ -968,7 +980,7 @@ struct BigInt {
968980
}
969981

970982
LIBC_INLINE constexpr void decrement() {
971-
multiword::add_with_carry(val, cpp::array<WordType, 1>{1});
983+
multiword::sub_with_borrow(val, cpp::array<WordType, 1>{1});
972984
}
973985

974986
LIBC_INLINE constexpr void extend(size_t index, bool is_neg) {
@@ -989,12 +1001,6 @@ struct BigInt {
9891001
LIBC_INLINE constexpr void clear_msb() {
9901002
val.back() &= mask_trailing_ones<WordType, WORD_SIZE - 1>();
9911003
}
992-
993-
LIBC_INLINE constexpr void set_bit(size_t i) {
994-
const size_t word_index = i / WORD_SIZE;
995-
val[word_index] |= WordType(1) << (i % WORD_SIZE);
996-
}
997-
9981004
LIBC_INLINE constexpr static Division divide_unsigned(const BigInt &dividend,
9991005
const BigInt &divider) {
10001006
BigInt remainder = dividend;

libc/src/__support/sign.h

+2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ struct Sign {
2929
static const Sign POS;
3030
static const Sign NEG;
3131

32+
LIBC_INLINE constexpr Sign negate() const { return Sign(!is_negative); }
33+
3234
private:
3335
LIBC_INLINE constexpr explicit Sign(bool is_negative)
3436
: is_negative(is_negative) {}

libc/src/stdio/printf_core/CMakeLists.txt

+3
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ endif()
1616
if(LIBC_CONF_PRINTF_FLOAT_TO_STR_NO_SPECIALIZE_LD)
1717
list(APPEND printf_config_copts "-DLIBC_COPT_FLOAT_TO_STR_NO_SPECIALIZE_LD")
1818
endif()
19+
if(LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_FLOAT320)
20+
list(APPEND printf_config_copts "-DLIBC_COPT_FLOAT_TO_STR_USE_FLOAT320")
21+
endif()
1922
if(LIBC_CONF_PRINTF_DISABLE_FIXED_POINT)
2023
list(APPEND printf_config_copts "-DLIBC_COPT_PRINTF_DISABLE_FIXED_POINT")
2124
endif()

libc/src/stdio/printf_core/converter_atlas.h

+4
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,11 @@
2626
// defines convert_float_decimal
2727
// defines convert_float_dec_exp
2828
// defines convert_float_dec_auto
29+
#ifdef LIBC_COPT_FLOAT_TO_STR_USE_FLOAT320
30+
#include "src/stdio/printf_core/float_dec_converter_limited.h"
31+
#else
2932
#include "src/stdio/printf_core/float_dec_converter.h"
33+
#endif
3034
// defines convert_float_hex_exp
3135
#include "src/stdio/printf_core/float_hex_converter.h"
3236
#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT

0 commit comments

Comments
 (0)