Skip to content

Commit 001d094

Browse files
committed
Some small performance improvements on 32-bit architectures.
1 parent 08eb078 commit 001d094

File tree

7 files changed

+122
-38
lines changed

7 files changed

+122
-38
lines changed

src/ec/ec_c25519_m31.c

Lines changed: 62 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -372,8 +372,7 @@ reduce_final_f255(uint32_t *d)
372372
static void
373373
f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
374374
{
375-
uint32_t t[18];
376-
uint64_t cc, w;
375+
uint32_t t[18], cc;
377376
int i;
378377

379378
/*
@@ -389,21 +388,42 @@ f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
389388
* offset 9*30 = 270, word 9+k must be added to word k with
390389
* a factor of 19*2^15 = 622592. The extra bits in word 8 are also
391390
* added that way.
391+
*
392+
* Keeping the carry on 32 bits helps with 32-bit architectures,
393+
* and does not noticeably impact performance on 64-bit systems.
392394
*/
393-
cc = MUL31(t[8] >> 15, 19);
395+
cc = MUL15(t[8] >> 15, 19); /* at most 19*(2^15-1) = 622573 */
394396
t[8] &= 0x7FFF;
395397
for (i = 0; i < 9; i ++) {
396-
w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
398+
uint64_t w;
399+
400+
w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
397401
t[i] = (uint32_t)w & 0x3FFFFFFF;
398-
cc = w >> 30;
402+
cc = (uint32_t)(w >> 30); /* at most 622592 */
399403
}
400-
cc = MUL31(w >> 15, 19);
404+
405+
/*
406+
* Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
407+
* This was split into two parts (upper of 257 bits, lower of 255
408+
* bits), and the upper was added to the lower with a factor 19,
409+
* which means that the intermediate value is less than 77*2^255
410+
* (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
411+
* less than 77, and the initial carry cc is at most 76*19 = 1444.
412+
*/
413+
cc = MUL15(t[8] >> 15, 19);
401414
t[8] &= 0x7FFF;
402415
for (i = 0; i < 9; i ++) {
403-
w = t[i] + cc;
404-
d[i] = (uint32_t)w & 0x3FFFFFFF;
405-
cc = w >> 30;
416+
uint32_t z;
417+
418+
z = t[i] + cc;
419+
d[i] = z & 0x3FFFFFFF;
420+
cc = z >> 30;
406421
}
422+
423+
/*
424+
* Final result is at most 2^255 + 1443. In particular, the last
425+
* carry is necessarily 0, since t[8] was truncated to 15 bits.
426+
*/
407427
}
408428

409429
/*
@@ -415,8 +435,7 @@ f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
415435
static void
416436
f255_square(uint32_t *d, const uint32_t *a)
417437
{
418-
uint32_t t[18];
419-
uint64_t cc, w;
438+
uint32_t t[18], cc;
420439
int i;
421440

422441
/*
@@ -428,24 +447,25 @@ f255_square(uint32_t *d, const uint32_t *a)
428447

429448
/*
430449
* Modular reduction: each high word is added where necessary.
431-
* Since the modulus is 2^255-19 and word 9 corresponds to
432-
* offset 9*30 = 270, word 9+k must be added to word k with
433-
* a factor of 19*2^15 = 622592. The extra bits in word 8 are also
434-
* added that way.
450+
* See f255_mul() for details on the reduction and carry limits.
435451
*/
436-
cc = MUL31(t[8] >> 15, 19);
452+
cc = MUL15(t[8] >> 15, 19);
437453
t[8] &= 0x7FFF;
438454
for (i = 0; i < 9; i ++) {
439-
w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
455+
uint64_t w;
456+
457+
w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
440458
t[i] = (uint32_t)w & 0x3FFFFFFF;
441-
cc = w >> 30;
459+
cc = (uint32_t)(w >> 30);
442460
}
443-
cc = MUL31(w >> 15, 19);
461+
cc = MUL15(t[8] >> 15, 19);
444462
t[8] &= 0x7FFF;
445463
for (i = 0; i < 9; i ++) {
446-
w = t[i] + cc;
447-
d[i] = (uint32_t)w & 0x3FFFFFFF;
448-
cc = w >> 30;
464+
uint32_t z;
465+
466+
z = t[i] + cc;
467+
d[i] = z & 0x3FFFFFFF;
468+
cc = z >> 30;
449469
}
450470
}
451471

@@ -515,20 +535,31 @@ static void
515535
f255_mul_a24(uint32_t *d, const uint32_t *a)
516536
{
517537
int i;
518-
uint64_t cc, w;
538+
uint64_t w;
539+
uint32_t cc;
519540

541+
/*
542+
* a[] is represented on 256 bits (eight 30-bit words plus a[8]), thus a[8] has length at most 16 bits.
543+
* We single out the processing of the last word: intermediate
544+
* value w is up to 121665*2^16, yielding a carry for the next
545+
* loop of at most 19*(121665*2^16/2^15) = 4623270.
546+
*/
520547
cc = 0;
521-
for (i = 0; i < 9; i ++) {
522-
w = MUL31(a[i], 121665) + cc;
548+
for (i = 0; i < 8; i ++) {
549+
w = MUL31(a[i], 121665) + (uint64_t)cc;
523550
d[i] = (uint32_t)w & 0x3FFFFFFF;
524-
cc = w >> 30;
551+
cc = (uint32_t)(w >> 30);
525552
}
526-
cc = MUL31((uint32_t)(w >> 15), 19);
527-
d[8] &= 0x7FFF;
553+
w = MUL31(a[8], 121665) + (uint64_t)cc;
554+
d[8] = (uint32_t)w & 0x7FFF;
555+
cc = MUL15((uint32_t)(w >> 15), 19);
556+
528557
for (i = 0; i < 9; i ++) {
529-
w = (uint64_t)d[i] + cc;
530-
d[i] = w & 0x3FFFFFFF;
531-
cc = w >> 30;
558+
uint32_t z;
559+
560+
z = d[i] + cc;
561+
d[i] = z & 0x3FFFFFFF;
562+
cc = z >> 30;
532563
}
533564
}
534565

src/ec/ec_p256_m15.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1739,7 +1739,7 @@ p256_decode(p256_jacobian *P, const void *src, size_t len)
17391739
memcpy(P->y, ty, sizeof ty);
17401740
memset(P->z, 0, sizeof P->z);
17411741
P->z[0] = 1;
1742-
return NEQ(bad, 0) ^ 1;
1742+
return EQ(bad, 0);
17431743
}
17441744

17451745
/*

src/ec/ec_p256_m31.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1089,7 +1089,7 @@ p256_decode(p256_jacobian *P, const void *src, size_t len)
10891089
memcpy(P->y, ty, sizeof ty);
10901090
memset(P->z, 0, sizeof P->z);
10911091
P->z[0] = 1;
1092-
return NEQ(bad, 0) ^ 1;
1092+
return EQ(bad, 0);
10931093
}
10941094

10951095
/*

src/inner.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@
114114
#define BR_64 1
115115
#elif defined(__x86_64__) || defined(_M_X64)
116116
#define BR_64 1
117+
#elif defined(__aarch64__) || defined(_M_ARM64)
118+
#define BR_64 1
117119
#endif
118120
#endif
119121

src/int/i31_montmul.c

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,45 @@ void
2929
br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
3030
const uint32_t *m, uint32_t m0i)
3131
{
32+
/*
33+
* Each outer loop iteration computes:
34+
* d <- (d + xu*y + f*m) / 2^31
35+
* We have xu <= 2^31-1 and f <= 2^31-1.
36+
* Thus, if d <= 2*m-1 on input, then:
37+
* 2*m-1 + 2*(2^31-1)*m <= (2^32)*m-1
38+
* and the new d value is less than 2*m.
39+
*
40+
* We represent d over 31-bit words, with an extra word 'dh'
41+
* which can thus be only 0 or 1.
42+
*/
3243
size_t len, len4, u, v;
33-
uint64_t dh;
44+
uint32_t dh;
3445

3546
len = (m[0] + 31) >> 5;
3647
len4 = len & ~(size_t)3;
3748
br_i31_zero(d, m[0]);
3849
dh = 0;
3950
for (u = 0; u < len; u ++) {
51+
/*
52+
* The carry for each operation fits on 32 bits:
53+
* d[v+1] <= 2^31-1
54+
* xu*y[v+1] <= (2^31-1)*(2^31-1)
55+
* f*m[v+1] <= (2^31-1)*(2^31-1)
56+
* r <= 2^32-1
57+
* (2^31-1) + 2*(2^31-1)*(2^31-1) + (2^32-1) = 2^63 - 2^31
58+
* After division by 2^31, the new r is then at most 2^32-1
59+
*
60+
* Using a 32-bit carry has performance benefits on 32-bit
61+
* systems; however, on 64-bit architectures, we prefer to
62+
* keep the carry (r) in a 64-bit register, thus avoiding some
63+
* "clear high bits" operations.
64+
*/
4065
uint32_t f, xu;
41-
uint64_t r, zh;
66+
#if BR_64
67+
uint64_t r;
68+
#else
69+
uint32_t r;
70+
#endif
4271

4372
xu = x[u + 1];
4473
f = MUL31_lo((d[1] + MUL31_lo(x[u + 1], y[1])), m0i);
@@ -73,9 +102,14 @@ br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
73102
d[v] = (uint32_t)z & 0x7FFFFFFF;
74103
}
75104

76-
zh = dh + r;
77-
d[len] = (uint32_t)zh & 0x7FFFFFFF;
78-
dh = zh >> 31;
105+
/*
106+
* Since the new dh can only be 0 or 1, the addition of
107+
* the old dh with the carry MUST fit on 32 bits, and
108+
* thus can be done into dh itself.
109+
*/
110+
dh += r;
111+
d[len] = dh & 0x7FFFFFFF;
112+
dh >>= 31;
79113
}
80114

81115
/*

src/int/i31_mulacc.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,20 @@ br_i31_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b)
4545
for (u = 0; u < blen; u ++) {
4646
uint32_t f;
4747
size_t v;
48+
49+
/*
50+
* Carry always fits on 31 bits; we want to keep it in a
51+
* 32-bit register on 32-bit architectures (on a 64-bit
52+
* architecture, cast down from 64 to 32 bits means
53+
* clearing the high bits, which is not free; on a 32-bit
54+
* architecture, the same operation really means ignoring
55+
* the top register, which has negative or zero cost).
56+
*/
57+
#if BR_64
4858
uint64_t cc;
59+
#else
60+
uint32_t cc;
61+
#endif
4962

5063
f = b[1 + u];
5164
cc = 0;

src/int/i32_mulacc.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,11 @@ br_i32_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b)
3636
for (u = 0; u < blen; u ++) {
3737
uint32_t f;
3838
size_t v;
39+
#if BR_64
3940
uint64_t cc;
41+
#else
42+
uint32_t cc;
43+
#endif
4044

4145
f = b[1 + u];
4246
cc = 0;

0 commit comments

Comments
 (0)