Skip to content

Commit b5abdc1

Browse files
committed
Extract problematic lambdas into named functions
1 parent 17b25ed commit b5abdc1

File tree

1 file changed

+56
-56
lines changed

1 file changed

+56
-56
lines changed

poly1305/src/backend/avx2/helpers.rs

Lines changed: 56 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -448,62 +448,9 @@ impl Unreduced130 {
448448
pub(super) fn reduce(self) -> Aligned130 {
449449
unsafe {
450450
// Starting with the following limb layout:
451-
// x.v1 = [ _, _, _, t_4]
452-
// x.v0 = [t_3, t_2, t_1, t_0]
453-
let x = self;
454-
455-
// Carry chain
456-
let adc = |v1: __m256i, v0: __m256i| -> (__m256i, __m256i) {
457-
// [t_3, t_2 % 2^26, t_1 % 2^26, t_0 % 2^26]
458-
// + [t_2 >> 26, t_1 >> 26, t_0 >> 26, 0 ]
459-
// = [
460-
// t_3 + t_2 >> 26,
461-
// t_2 % 2^26 + t_1 >> 26,
462-
// t_1 % 2^26 + t_0 >> 26,
463-
// t_0 % 2^26,
464-
// ]
465-
let v0 = _mm256_add_epi64(
466-
_mm256_and_si256(v0, _mm256_set_epi64x(-1, 0x3ffffff, 0x3ffffff, 0x3ffffff)),
467-
_mm256_permute4x64_epi64(
468-
_mm256_srlv_epi64(v0, _mm256_set_epi64x(64, 26, 26, 26)),
469-
set02(2, 1, 0, 3),
470-
),
471-
);
472-
// [_, _, _, t_4]
473-
// + [
474-
// (t_2 % 2^26 + t_1 >> 26) >> 26,
475-
// (t_1 % 2^26 + t_0 >> 26) >> 26,
476-
// (t_0 % 2^26 ) >> 26,
477-
// (t_3 + t_2 >> 26) >> 26,
478-
// ]
479-
// = [_, _, _, t_4 + (t_3 + t_2 >> 26) >> 26]
480-
let v1 = _mm256_add_epi64(
481-
v1,
482-
_mm256_permute4x64_epi64(_mm256_srli_epi64(v0, 26), set02(2, 1, 0, 3)),
483-
);
484-
// [
485-
// (t_3 + t_2 >> 26) % 2^26,
486-
// t_2 % 2^26 + t_1 >> 26,
487-
// t_1 % 2^26 + t_0 >> 26,
488-
// t_0 % 2^26,
489-
// ]
490-
let chain = _mm256_and_si256(v0, _mm256_set_epi64x(0x3ffffff, -1, -1, -1));
491-
492-
(v1, chain)
493-
};
494-
495-
// Reduction modulus 2^130-5
496-
let red = |v1: __m256i, v0: __m256i| -> (__m256i, __m256i) {
497-
// t = [0, 0, 0, t_4 >> 26]
498-
let t = _mm256_srlv_epi64(v1, _mm256_set_epi64x(64, 64, 64, 26));
499-
// v0 + 5·t = [t_3, t_2, t_1, t_0 + 5·(t_4 >> 26)]
500-
let red_0 = _mm256_add_epi64(_mm256_add_epi64(v0, t), _mm256_slli_epi64(t, 2));
501-
// [0, 0, 0, t_4 % 2^26]
502-
let red_1 = _mm256_and_si256(v1, _mm256_set_epi64x(0, 0, 0, 0x3ffffff));
503-
(red_1, red_0)
504-
};
505-
506-
let (red_1, red_0) = adc(x.v1, x.v0);
451+
// self.v1 = [ _, _, _, t_4]
452+
// self.v0 = [t_3, t_2, t_1, t_0]
453+
let (red_1, red_0) = adc(self.v1, self.v0);
507454
let (red_1, red_0) = red(red_1, red_0);
508455
let (red_1, red_0) = adc(red_1, red_0);
509456

@@ -517,6 +464,59 @@ impl Unreduced130 {
517464
}
518465
}
519466

467+
/// Carry chain
///
/// Runs one carry-propagation pass over five 26-bit limbs held as
/// `v0 = [t_3, t_2, t_1, t_0]` and `v1 = [_, _, _, t_4]`. Lanes are written
/// from highest to lowest, and `_` marks a lane whose value is not used.
///
/// Returns `(v1, chain)`:
/// - `v1` is the input `v1` with the carry out of limb 3 added into its lowest
///   lane. Its other three lanes also receive shifted values, but those are
///   the `_` (don't-care) lanes.
/// - `chain` is the updated `v0`, with limb 3 masked back down to 26 bits.
///
/// Each carry moves up by one limb only, so limbs 1 and 2 of `chain` may still
/// be wider than 26 bits after a single pass. `reduce` runs `adc`, then `red`,
/// then `adc` again.
///
/// In the lane diagrams below, `a + b >> 26` is to be read as `a + (b >> 26)`.
///
/// # Safety
///
/// The body uses AVX2 intrinsics, so the caller must ensure that the running
/// CPU supports AVX2.
468+
#[inline(always)]
469+
unsafe fn adc(v1: __m256i, v0: __m256i) -> (__m256i, __m256i) {
470+
// Step 1: keep the low 26 bits of limbs 0..2 and add each limb's carry
// (its bits above 26) to the next limb up. Limb 3 is left unmasked (mask -1)
// and its shift count of 64 produces 0, so it contributes no carry here.
//
// NOTE(review): `set02` is defined outside this view. Judging by the lane
// diagram, `set02(2, 1, 0, 3)` rotates every lane up by one position, with
// lane 3 wrapping around to lane 0 — confirm against its definition.
//
// [t_3, t_2 % 2^26, t_1 % 2^26, t_0 % 2^26]
471+
// + [t_2 >> 26, t_1 >> 26, t_0 >> 26, 0 ]
472+
// = [
473+
// t_3 + t_2 >> 26,
474+
// t_2 % 2^26 + t_1 >> 26,
475+
// t_1 % 2^26 + t_0 >> 26,
476+
// t_0 % 2^26,
477+
// ]
478+
let v0 = _mm256_add_epi64(
479+
_mm256_and_si256(v0, _mm256_set_epi64x(-1, 0x3ffffff, 0x3ffffff, 0x3ffffff)),
480+
_mm256_permute4x64_epi64(
481+
_mm256_srlv_epi64(v0, _mm256_set_epi64x(64, 26, 26, 26)),
482+
set02(2, 1, 0, 3),
483+
),
484+
);
485+
// Step 2: move the carry out of the updated limb 3 into limb 4. The same
// rotation is applied to all lanes, so the three upper lanes of `v1` pick up
// values as well; only the lowest lane (t_4) is meaningful.
//
// [_, _, _, t_4]
486+
// + [
487+
// (t_2 % 2^26 + t_1 >> 26) >> 26,
488+
// (t_1 % 2^26 + t_0 >> 26) >> 26,
489+
// (t_0 % 2^26 ) >> 26,
490+
// (t_3 + t_2 >> 26) >> 26,
491+
// ]
492+
// = [_, _, _, t_4 + (t_3 + t_2 >> 26) >> 26]
493+
let v1 = _mm256_add_epi64(
494+
v1,
495+
_mm256_permute4x64_epi64(_mm256_srli_epi64(v0, 26), set02(2, 1, 0, 3)),
496+
);
497+
// Step 3: the carry out of limb 3 now lives in `v1`, so drop it from limb 3.
// The other limbs are passed through unchanged (mask -1).
//
// [
498+
// (t_3 + t_2 >> 26) % 2^26,
499+
// t_2 % 2^26 + t_1 >> 26,
500+
// t_1 % 2^26 + t_0 >> 26,
501+
// t_0 % 2^26,
502+
// ]
503+
let chain = _mm256_and_si256(v0, _mm256_set_epi64x(0x3ffffff, -1, -1, -1));
504+
505+
(v1, chain)
506+
}
507+
508+
/// Reduction modulo 2^130 - 5
///
/// Folds the part of limb 4 that lies above 26 bits back into limb 0.
/// With five 26-bit limbs, bit 26 of limb 4 has weight 2^130, and
/// 2^130 ≡ 5 (mod 2^130 - 5), so that overflow is multiplied by 5 and added
/// to the lowest limb.
///
/// Takes `v1 = [_, _, _, t_4]` and `v0 = [t_3, t_2, t_1, t_0]` (lanes written
/// from highest to lowest) and returns `(red_1, red_0)`:
/// - `red_1 = [0, 0, 0, t_4 % 2^26]`
/// - `red_0 = [t_3, t_2, t_1, t_0 + 5·(t_4 >> 26)]`
///
/// No carries are propagated between the limbs of `v0` here.
///
/// # Safety
///
/// The body uses AVX2 intrinsics, so the caller must ensure that the running
/// CPU supports AVX2.
509+
#[inline(always)]
510+
unsafe fn red(v1: __m256i, v0: __m256i) -> (__m256i, __m256i) {
511+
// A shift count of 64 zeroes a lane, which clears the three unused upper
// lanes of `v1`; the lowest lane keeps only the bits of t_4 above bit 25.
// t = [0, 0, 0, t_4 >> 26]
512+
let t = _mm256_srlv_epi64(v1, _mm256_set_epi64x(64, 64, 64, 26));
513+
// 5·t is computed as t + (t << 2), avoiding a multiplication.
// v0 + 5·t = [t_3, t_2, t_1, t_0 + 5·(t_4 >> 26)]
514+
let red_0 = _mm256_add_epi64(_mm256_add_epi64(v0, t), _mm256_slli_epi64(t, 2));
515+
// Keep the low 26 bits of t_4 and zero the unused upper lanes.
// [0, 0, 0, t_4 % 2^26]
516+
let red_1 = _mm256_and_si256(v1, _mm256_set_epi64x(0, 0, 0, 0x3ffffff));
517+
(red_1, red_0)
518+
}
519+
520520
/// A pair of `Aligned130`s.
521521
#[derive(Clone, Debug)]
522522
pub(super) struct Aligned2x130 {

0 commit comments

Comments
 (0)