Skip to content

Commit a945288

Browse files
committed
Use internal Rust functions
1 parent 602e0e9 commit a945288

File tree

2 files changed

+5
-11
lines changed

2 files changed

+5
-11
lines changed

src/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
2727
#![cfg_attr(
2828
feature = "vpclmulqdq",
29-
feature(simd_ffi, link_llvm_intrinsics, avx512_target_feature,)
29+
feature(avx512_target_feature, stdarch_x86_avx512)
3030
)]
3131

3232
mod pclmulqdq;

src/pclmulqdq/x86_64/vpclmulqdq.rs

+4-10
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,12 @@
11
use super::{super::fold_tail, Simd, __cpuid_count, __m256i, _mm256_set_epi64x, _mm256_xor_si256};
22
use core::ops::BitXor;
33
use lazy_static::lazy_static;
4-
5-
// PCLMULQDQ can be used without avx512vl. However, Rust has only recently addressed this --- so we
6-
// need to manually specify the intrinsic; otherwise, rustc will inline it poorly.
7-
#[allow(improper_ctypes)]
8-
extern "C" {
9-
#[link_name = "llvm.x86.pclmulqdq.256"]
10-
fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i;
11-
}
4+
use std::arch::x86_64::_mm256_clmulepi64_epi128;
125

136
#[derive(Clone, Copy, Debug)]
147
pub struct Simd256(__m256i);
158

9+
// This lazy_static block takes throughput from ~39 GiB/s to ~52 GiB/s.
1610
lazy_static! {
1711
static ref VPCLMULQDQ_SUPPORTED : bool = {
1812
let avx2 = is_x86_feature_detected!("avx2");
@@ -45,8 +39,8 @@ impl Simd256 {
4539
#[inline]
4640
#[target_feature(enable = "avx2", enable = "vpclmulqdq")]
4741
pub unsafe fn fold_32(self, coeff: Self) -> Self {
48-
let h = pclmulqdq_256(self.0, coeff.0, 0x11);
49-
let l = pclmulqdq_256(self.0, coeff.0, 0x00);
42+
let h = _mm256_clmulepi64_epi128(self.0, coeff.0, 0x11);
43+
let l = _mm256_clmulepi64_epi128(self.0, coeff.0, 0x00);
5044
Self(h) ^ Self(l)
5145
}
5246
}

0 commit comments

Comments
 (0)