|
1 | 1 | use super::{super::fold_tail, Simd, __cpuid_count, __m256i, _mm256_set_epi64x, _mm256_xor_si256};
|
2 | 2 | use core::ops::BitXor;
|
3 | 3 | use lazy_static::lazy_static;
|
4 |
| - |
5 |
| -// PCLMULQDQ can be used without avx512vl. However, this is only addressed by rust recently --- so we |
6 |
| -// need to manually specify the intrinsic, otherwise rustc will inline it poorly. |
7 |
| -#[allow(improper_ctypes)] |
8 |
| -extern "C" { |
9 |
| - #[link_name = "llvm.x86.pclmulqdq.256"] |
10 |
| - fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i; |
11 |
| -} |
| 4 | +use std::arch::x86_64::_mm256_clmulepi64_epi128; |
12 | 5 |
|
/// Newtype wrapper around a 256-bit AVX register lane.
///
/// Exists so the crate can implement `BitXor` and the carry-less-multiply
/// folding helpers (see `fold_32`) on it. `Copy` is appropriate: the type is
/// just 32 bytes of register data with no ownership semantics.
#[derive(Clone, Copy, Debug)]
pub struct Simd256(__m256i);
|
15 | 8 |
|
| 9 | +// this lazy_static bit takes throughput from ~39GiB/s to ~52GiB/s |
16 | 10 | lazy_static! {
|
17 | 11 | static ref VPCLMULQDQ_SUPPORTED : bool = {
|
18 | 12 | let avx2 = is_x86_feature_detected!("avx2");
|
@@ -45,8 +39,8 @@ impl Simd256 {
|
45 | 39 | #[inline]
|
46 | 40 | #[target_feature(enable = "avx2", enable = "vpclmulqdq")]
|
47 | 41 | pub unsafe fn fold_32(self, coeff: Self) -> Self {
|
48 |
| - let h = pclmulqdq_256(self.0, coeff.0, 0x11); |
49 |
| - let l = pclmulqdq_256(self.0, coeff.0, 0x00); |
| 42 | + let h = _mm256_clmulepi64_epi128(self.0, coeff.0, 0x11); |
| 43 | + let l = _mm256_clmulepi64_epi128(self.0, coeff.0, 0x00); |
50 | 44 | Self(h) ^ Self(l)
|
51 | 45 | }
|
52 | 46 | }
|
|
0 commit comments