@@ -164,6 +164,170 @@ template <size_t radix> using Custom = details::Fmt<radix>;
164
164
165
165
} // namespace radix
166
166
167
+ // Extract the low-order decimal digit from a value of integer type T. The
168
+ // returned value is the digit itself, from 0 to 9. The input value is passed
169
+ // by reference, and modified by dividing by 10, so that iterating this
170
+ // function extracts all the digits of the original number one at a time from
171
+ // low to high.
172
+ template <typename T>
173
+ LIBC_INLINE cpp::enable_if_t <cpp::is_integral_v<T>, uint8_t >
174
+ extract_decimal_digit (T &value) {
175
+ const uint8_t digit (static_cast <uint8_t >(value % 10 ));
176
+ // For built-in integer types, we assume that an adequately fast division is
177
+ // available. If hardware division isn't implemented, then with a divisor
178
+ // known at compile time the compiler might be able to generate an optimized
179
+ // sequence instead.
180
+ value /= 10 ;
181
+ return digit;
182
+ }
183
+
184
+ // A specialization of extract_decimal_digit for the BigInt type in big_int.h,
185
+ // avoiding the use of general-purpose BigInt division which is very slow.
186
+ template <typename T>
187
+ LIBC_INLINE cpp::enable_if_t <is_big_int_v<T>, uint8_t >
188
+ extract_decimal_digit (T &value) {
189
+ // There are two essential ways you can turn n into (n/10,n%10). One is
190
+ // ordinary integer division. The other is a modular-arithmetic approach in
191
+ // which you first compute n%10 by bit twiddling, then subtract it off to get
192
+ // a value that is definitely a multiple of 10. Then you divide that by 10 in
193
+ // two steps: shift right to divide off a factor of 2, and then divide off a
194
+ // factor of 5 by multiplying by the modular inverse of 5 mod 2^BITS. (That
195
+ // last step only works if you know there's no remainder, which is why you
196
+ // had to subtract off the output digit first.)
197
+ //
198
+ // Either approach can be made to work in linear time. This code uses the
199
+ // modular-arithmetic technique, because the other approach either does a lot
200
+ // of integer divisions (requiring a fast hardware divider), or else uses a
201
+ // "multiply by an approximation to the reciprocal" technique which depends
202
+ // on careful error analysis which might go wrong in an untested edge case.
203
+
204
+ using Word = typename T::word_type;
205
+
206
+ // Find the remainder (value % 10). We do this by breaking up the input
207
+ // integer into chunks of size WORD_SIZE/2, so that the sum of them doesn't
208
+ // overflow a Word. Then we sum all the half-words times 6, except the bottom
209
+ // one, which is added to that sum without scaling.
210
+ //
211
+ // Why 6? Because you can imagine that the original number had the form
212
+ //
213
+ // halfwords[0] + K*halfwords[1] + K^2*halfwords[2] + ...
214
+ //
215
+ // where K = 2^(WORD_SIZE/2). Since WORD_SIZE is expected to be a multiple of
216
+ // 8, that makes WORD_SIZE/2 a multiple of 4, so that K is a power of 16. And
217
+ // all powers of 16 (larger than 1) are congruent to 6 mod 10, by induction:
218
+ // 16 itself is, and 6^2=36 is also congruent to 6.
219
+ Word acc_remainder = 0 ;
220
+ constexpr Word HALFWORD_BITS = T::WORD_SIZE / 2 ;
221
+ constexpr Word HALFWORD_MASK = ((Word (1 ) << HALFWORD_BITS) - 1 );
222
+ // Sum both halves of all words except the low one.
223
+ for (size_t i = 1 ; i < T::WORD_COUNT; i++) {
224
+ acc_remainder += value.val [i] >> HALFWORD_BITS;
225
+ acc_remainder += value.val [i] & HALFWORD_MASK;
226
+ }
227
+ // Add the high half of the low word. Then we have everything that needs to
228
+ // be multiplied by 6, so do that.
229
+ acc_remainder += value.val [0 ] >> HALFWORD_BITS;
230
+ acc_remainder *= 6 ;
231
+ // Having multiplied it by 6, add the lowest half-word, and then reduce mod
232
+ // 10 by normal integer division to finish.
233
+ acc_remainder += value.val [0 ] & HALFWORD_MASK;
234
+ uint8_t digit = acc_remainder % 10 ;
235
+
236
+ // Now we have the output digit. Subtract it from the input value, and shift
237
+ // right to divide by 2.
238
+ value -= digit;
239
+ value >>= 1 ;
240
+
241
+ // Now all that's left is to multiply by the inverse of 5 mod 2^BITS. No
242
+ // matter what the value of BITS, the inverse of 5 has the very convenient
243
+ // form 0xCCCC...CCCD, with as many C hex digits in the middle as necessary.
244
+ //
245
+ // We could construct a second BigInt with all words 0xCCCCCCCCCCCCCCCC,
246
+ // increment the bottom word, and call a general-purpose multiply function.
247
+ // But we can do better, by taking advantage of the regularity: we can do
248
+ // this particular operation in linear time, whereas a general multiplier
249
+ // would take superlinear time (quadratic in small cases).
250
+ //
251
+ // To begin with, instead of computing n*0xCCCC...CCCD, we'll compute
252
+ // n*0xCCCC...CCCC and then add it to the original n. Then all the words of
253
+ // the multiplier have the same value 0xCCCCCCCCCCCCCCCC, which I'll just
254
+ // denote as C. If we also write t = 2^WORD_SIZE, and imagine (as an example)
255
+ // that the input number has three words x,y,z with x being the low word,
256
+ // then we're computing
257
+ //
258
+ // (x + y t + z t^2) * (C + C t + C t^2)
259
+ //
260
+ // = x C + y C t + z C t^2
261
+ // + x C t + y C t^2 + z C t^3
262
+ // + x C t^2 + y C t^3 + z C t^4
263
+ //
264
+ // but we're working mod t^3, so the high-order terms vanish and this becomes
265
+ //
266
+ // x C + y C t + z C t^2
267
+ // + x C t + y C t^2
268
+ // + x C t^2
269
+ //
270
+ // = x C + (x+y) C t + (x+y+z) C t^2
271
+ //
272
+ // So all you have to do is to work from the low word of the integer upwards,
273
+ // accumulating C times the sum of all the words you've seen so far to get
274
+ // x*C, (x+y)*C, (x+y+z)*C and so on. In each step you add another product to
275
+ // the accumulator, and add the accumulator to the corresponding word of the
276
+ // original number (so that we end up with value*CCCD, not just value*CCCC).
277
+ //
278
+ // If you do that literally, then your accumulator has to be three words
279
+ // wide, because the sum of words can overflow into a second word, and
280
+ // multiplying by C adds another word. But we can do slightly better by
281
+ // breaking each product word*C up into a bottom half and a top half. If we
282
+ // write x*C = xl + xh*t, and similarly for y and z, then our sum becomes
283
+ //
284
+ // (xl + xh t) + (yl + yh t) t + (zl + zh t) t^2
285
+ // + (xl + xh t) t + (yl + yh t) t^2
286
+ // + (xl + xh t) t^2
287
+ //
288
+ // and if you expand out again, collect terms, and discard t^3 terms, you get
289
+ //
290
+ // (xl)
291
+ // + (xl + xh + yl) t
292
+ // + (xl + xh + yl + yh + zl) t^2
293
+ //
294
+ // in which each coefficient is the sum of all the low words of the products
295
+ // up to _and including_ the current word, plus all the high words up to but
296
+ // _not_ including the current word. So now you only have to retain two words
297
+ // of sum instead of three.
298
+ //
299
+ // We do this entire procedure in a single in-place pass over the input
300
+ // number, reading each word to make its product with C and then adding the
301
+ // low word of the accumulator to it.
302
+ constexpr Word C = Word (-1 ) / 5 * 4 ; // calculate 0xCCCC as 4/5 of 0xFFFF
303
+ Word acc_lo = 0 , acc_hi = 0 ; // accumulator of all the half-products so far
304
+ Word carry_bit, carry_word = 0 ;
305
+
306
+ for (size_t i = 0 ; i < T::WORD_COUNT; i++) {
307
+ // Make the two-word product of C with the current input word.
308
+ multiword::DoubleWide<Word> product = multiword::mul2 (C, value.val [i]);
309
+
310
+ // Add the low half of the product to our accumulator, but not yet the high
311
+ // half.
312
+ acc_lo = add_with_carry<Word>(acc_lo, product[0 ], 0 , carry_bit);
313
+ acc_hi += carry_bit;
314
+
315
+ // Now the accumulator contains exactly the value we need to add to the
316
+ // current input word. Add it, plus any carries from lower words, and make
317
+ // a new word of carry data to propagate into the next iteration.
318
+ value.val [i] = add_with_carry<Word>(value.val [i], carry_word, 0 , carry_bit);
319
+ carry_word = acc_hi + carry_bit;
320
+ value.val [i] = add_with_carry<Word>(value.val [i], acc_lo, 0 , carry_bit);
321
+ carry_word += carry_bit;
322
+
323
+ // Now add the high half of the current product to our accumulator.
324
+ acc_lo = add_with_carry<Word>(acc_lo, product[1 ], 0 , carry_bit);
325
+ acc_hi += carry_bit;
326
+ }
327
+
328
+ return digit;
329
+ }
330
+
167
331
// See file header for documentation.
168
332
template <typename T, typename Fmt = radix::Dec> class IntegerToString {
169
333
static_assert (cpp::is_integral_v<T> || is_big_int_v<T>);
@@ -229,6 +393,15 @@ template <typename T, typename Fmt = radix::Dec> class IntegerToString {
229
393
}
230
394
}
231
395
396
+ LIBC_INLINE static void
397
+ write_unsigned_number_dec (UNSIGNED_T value,
398
+ details::BackwardStringBufferWriter &sink) {
399
+ while (sink.ok () && value != 0 ) {
400
+ const uint8_t digit = extract_decimal_digit (value);
401
+ sink.push (digit_char (digit));
402
+ }
403
+ }
404
+
232
405
// Returns the absolute value of 'value' as 'UNSIGNED_T'.
233
406
LIBC_INLINE static UNSIGNED_T abs (T value) {
234
407
if (cpp::is_unsigned_v<T> || value >= 0 )
@@ -256,7 +429,7 @@ template <typename T, typename Fmt = radix::Dec> class IntegerToString {
256
429
LIBC_INLINE static void write (T value,
257
430
details::BackwardStringBufferWriter &sink) {
258
431
if constexpr (Fmt::BASE == 10 ) {
259
- write_unsigned_number (abs (value), sink);
432
+ write_unsigned_number_dec (abs (value), sink);
260
433
} else {
261
434
write_unsigned_number (static_cast <UNSIGNED_T>(value), sink);
262
435
}
0 commit comments