@@ -99,6 +99,9 @@ f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
99
99
unsigned __int128 w ;
100
100
uint64_t t ;
101
101
102
+ /*
103
+ * Do the addition, with an extra carry in t.
104
+ */
102
105
w = (unsigned __int128 )a [0 ] + b [0 ];
103
106
d [0 ] = (uint64_t )w ;
104
107
w = (unsigned __int128 )a [1 ] + b [1 ] + (w >> 64 );
@@ -110,7 +113,7 @@ f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
110
113
t = (uint64_t )(w >> 64 );
111
114
112
115
/*
113
- * 2^256 = 2^224 - 2^192 - 2^96 + 1 in the field .
116
+ * Fold carry t, using: 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p .
114
117
*/
115
118
w = (unsigned __int128 )d [0 ] + t ;
116
119
d [0 ] = (uint64_t )w ;
@@ -119,8 +122,22 @@ f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
119
122
/* Here, carry "w >> 64" can only be 0 or -1 */
120
123
w = (unsigned __int128 )d [2 ] - ((w >> 64 ) & 1 );
121
124
d [2 ] = (uint64_t )w ;
122
- /* Again, carry is 0 or -1 */
123
- d [3 ] += (uint64_t )(w >> 64 ) + (t << 32 ) - t ;
125
+ /* Again, carry is 0 or -1. But there can be a carry only if t = 1,
126
+ in which case the addition of (t << 32) - t is positive. */
127
+ w = (unsigned __int128 )d [3 ] - ((w >> 64 ) & 1 ) + (t << 32 ) - t ;
128
+ d [3 ] = (uint64_t )w ;
129
+ t = (uint64_t )(w >> 64 );
130
+
131
+ /*
132
+ * There can be an extra carry here, which we must fold again.
133
+ */
134
+ w = (unsigned __int128 )d [0 ] + t ;
135
+ d [0 ] = (uint64_t )w ;
136
+ w = (unsigned __int128 )d [1 ] + (w >> 64 ) - (t << 32 );
137
+ d [1 ] = (uint64_t )w ;
138
+ w = (unsigned __int128 )d [2 ] - ((w >> 64 ) & 1 );
139
+ d [2 ] = (uint64_t )w ;
140
+ d [3 ] += (t << 32 ) - t - (uint64_t )((w >> 64 ) & 1 );
124
141
125
142
#elif BR_UMUL128
126
143
@@ -140,6 +157,15 @@ f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
140
157
cc = _addcarry_u64 (cc , d [0 ], 0 , & d [0 ]);
141
158
cc = _addcarry_u64 (cc , d [1 ], - (t << 32 ), & d [1 ]);
142
159
cc = _addcarry_u64 (cc , d [2 ], - t , & d [2 ]);
160
+ cc = _addcarry_u64 (cc , d [3 ], (t << 32 ) - (t << 1 ), & d [3 ]);
161
+
162
+ /*
163
+ * We have to do it again if there still is a carry.
164
+ */
165
+ t = cc ;
166
+ cc = _addcarry_u64 (cc , d [0 ], 0 , & d [0 ]);
167
+ cc = _addcarry_u64 (cc , d [1 ], - (t << 32 ), & d [1 ]);
168
+ cc = _addcarry_u64 (cc , d [2 ], - t , & d [2 ]);
143
169
(void )_addcarry_u64 (cc , d [3 ], (t << 32 ) - (t << 1 ), & d [3 ]);
144
170
145
171
#endif
@@ -167,6 +193,7 @@ f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
167
193
t = (uint64_t )(w >> 64 ) & 1 ;
168
194
169
195
/*
196
+ * If there is a borrow (t = 1), then we must add the modulus
170
197
* p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
171
198
*/
172
199
w = (unsigned __int128 )d [0 ] - t ;
@@ -177,6 +204,20 @@ f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
177
204
w = (unsigned __int128 )d [2 ] + (w >> 64 );
178
205
d [2 ] = (uint64_t )w ;
179
206
/* Again, carry is 0 or +1 */
207
+ w = (unsigned __int128 )d [3 ] + (w >> 64 ) - (t << 32 ) + t ;
208
+ d [3 ] = (uint64_t )w ;
209
+ t = (uint64_t )(w >> 64 ) & 1 ;
210
+
211
+ /*
212
+ * There may again be a borrow, in which case we must add the
213
+ * modulus again.
214
+ */
215
+ w = (unsigned __int128 )d [0 ] - t ;
216
+ d [0 ] = (uint64_t )w ;
217
+ w = (unsigned __int128 )d [1 ] + (t << 32 ) - ((w >> 64 ) & 1 );
218
+ d [1 ] = (uint64_t )w ;
219
+ w = (unsigned __int128 )d [2 ] + (w >> 64 );
220
+ d [2 ] = (uint64_t )w ;
180
221
d [3 ] += (uint64_t )(w >> 64 ) - (t << 32 ) + t ;
181
222
182
223
#elif BR_UMUL128
@@ -190,13 +231,23 @@ f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
190
231
cc = _subborrow_u64 (cc , a [3 ], b [3 ], & d [3 ]);
191
232
192
233
/*
193
- * If there is a carry, then we need to add p.
234
+ * If there is a borrow, then we need to add p. We (virtually)
235
+ * add 2^256, then subtract 2^256 - p.
236
+ */
237
+ t = cc ;
238
+ cc = _subborrow_u64 (0 , d [0 ], t , & d [0 ]);
239
+ cc = _subborrow_u64 (cc , d [1 ], - (t << 32 ), & d [1 ]);
240
+ cc = _subborrow_u64 (cc , d [2 ], - t , & d [2 ]);
241
+ cc = _subborrow_u64 (cc , d [3 ], (t << 32 ) - (t << 1 ), & d [3 ]);
242
+
243
+ /*
244
+ * If there still is a borrow, then we need to add p again.
194
245
*/
195
246
t = cc ;
196
- cc = _addcarry_u64 (0 , d [0 ], - t , & d [0 ]);
197
- cc = _addcarry_u64 (cc , d [1 ], ( - t ) >> 32 , & d [1 ]);
198
- cc = _addcarry_u64 (cc , d [2 ], 0 , & d [2 ]);
199
- (void )_addcarry_u64 (cc , d [3 ], t - (t << 32 ), & d [3 ]);
247
+ cc = _subborrow_u64 (0 , d [0 ], t , & d [0 ]);
248
+ cc = _subborrow_u64 (cc , d [1 ], - ( t << 32 ) , & d [1 ]);
249
+ cc = _subborrow_u64 (cc , d [2 ], - t , & d [2 ]);
250
+ (void )_subborrow_u64 (cc , d [3 ], ( t << 32 ) - (t << 1 ), & d [3 ]);
200
251
201
252
#endif
202
253
}
0 commit comments