Skip to content

Commit cc66833

Browse files
committed
Merge branch 'add-avx512-vpclmulqdq'
2 parents a680157 + 7edf216 commit cc66833

File tree

6 files changed

+317
-31
lines changed

6 files changed

+317
-31
lines changed

Diff for: Cargo.toml

+2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ rust-version = "1.70.0"
1616

1717
[dependencies]
1818
crc = "3"
19+
lazy_static = { version = "1.4.0", optional = true }
1920

2021
[dev-dependencies]
2122
crc = "3"
@@ -25,6 +26,7 @@ rand = "0.8"
2526

2627
[features]
2728
pmull = [] # deprecated, no longer has any effect.
29+
vpclmulqdq = ["lazy_static"]
2830
fake-simd = []
2931

3032
[[bench]]

Diff for: README.md

+21-6
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ crc64fast-nvme
88
SIMD-accelerated carryless-multiplication [CRC-64/NVME](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-64-nvme) checksum computation
99
(similar to [crc32fast](https://crates.io/crates/crc32fast) and forked from [crc64fast](https://github.com/tikv/crc64fast) which calculates [CRC-64/XZ](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-64-xz) [a.k.a `CRC-64/GO-ECMA`]).
1010

11-
`CRC-64/NVME` comes from the [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) (Revision 1.0d, December 2023) and has also been implemented in the [Linux kernel](https://github.com/torvalds/linux/blob/786c8248dbd33a5a7a07f7c6e55a7bfc68d2ca48/lib/crc64.c#L66-L73) (where it's called `CRC-64/Rocksoft`) and [AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) as `CRC64-NVME`. (Note that the Check value in the spec uses incorrect endianness (Section 5.2.1.3.4, Figure 120, page 83).
11+
`CRC-64/NVME` comes from the [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) (Revision 1.0d, December 2023) and has also been implemented in the [Linux kernel](https://github.com/torvalds/linux/blob/786c8248dbd33a5a7a07f7c6e55a7bfc68d2ca48/lib/crc64.c#L66-L73) (where it's called `CRC-64/Rocksoft`) and is [AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) as `CRC64-NVME`. (Note that the Check value in the spec uses incorrect endianness [Section 5.2.1.3.4, Figure 120, page 83]).
1212

1313
SIMD-accelerated carryless-multiplication is based on the Intel [Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction](https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf) paper.
1414

@@ -48,14 +48,28 @@ be chosen based on CPU feature at runtime.
4848
* using PCLMULQDQ + SSE 4.1 on x86/x86_64
4949
* using PMULL + NEON on AArch64 (64-bit ARM)
5050

51-
| Algorithm | Throughput (x86_64) | Throughput (aarch64) |
52-
|:-----------------------|--------------------:|---------------------:|
53-
| [crc 3.0.1] | 0.5 GiB/s | 0.3 GiB/s |
54-
| crc64fast-nvme (table) | 2.3 GiB/s | 1.8 GiB/s |
55-
| crc64fast-nvme (simd) | 28.2 GiB/s | 20.0 GiB/s |
51+
| Algorithm | Throughput (x86_64) | Throughput (aarch64) |
52+
|:----------------------------|--------------------:|---------------------:|
53+
| [crc 3.0.1] | 0.5 GiB/s | 0.3 GiB/s |
54+
| crc64fast-nvme (table) | 2.3 GiB/s | 1.8 GiB/s |
55+
| crc64fast-nvme (SIMD) | 28.2 GiB/s | 20.0 GiB/s |
56+
| crc64fast-nvme (VPCLMULQDQ) | 52 GiB/s | n/a |
5657

5758
[crc 3.0.1]: https://docs.rs/crc/3.0.1/crc/index.html
5859

60+
## Experimental "Vector Carry-Less Multiplication of Quadwords" (VPCLMULQDQ) support
61+
62+
Using Rust's support for [AVX512 intrinsics](https://github.com/rust-lang/rust/issues/111137), specifically [VPCLMULQDQ](https://doc.rust-lang.org/src/core/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs.html), we can massively improve throughput for x86_64 processors which support them (Intel Ice Lake+ and AMD Zen4+).
63+
64+
Specifically, on an `m7i.8xlarge` EC2 instance (4th gen Xeon, a.k.a. Sapphire Rapids), throughput approximately _doubles_ from ~26 GiB/s to ~52 GiB/s.
65+
66+
Since these are currently marked as unstable features in Rust, you'll need to build with `nightly` and enable the `vpclmulqdq` feature:
67+
68+
```
69+
rustup toolchain install nightly
70+
cargo +nightly build --features="vpclmulqdq" -r
71+
```
72+
5973
## References
6074

6175
* [crc32-fast](https://crates.io/crates/crc32fast) - Original `crc32` implementation in Rust.
@@ -70,6 +84,7 @@ be chosen based on CPU feature at runtime.
7084
* [StackOverflow PCLMULQDQ CRC32 question](https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq) - Insightful question & answer to CRC32 implementation details.
7185
* [AWS S3 announcement about CRC64-NVME support](https://aws.amazon.com/blogs/aws/introducing-default-data-integrity-protections-for-new-objects-in-amazon-s3/)
7286
* [AWS S3 docs on checking object integrity using CRC64-NVME](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html)
87+
* [Vector Carry-Less Multiplication of Quadwords (VPCLMULQDQ) details](https://en.wikichip.org/wiki/x86/vpclmulqdq)
7388

7489
## License
7590

Diff for: src/lib.rs

+15-2
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,22 @@
1717
//! let checksum = c.sum64();
1818
//! assert_eq!(checksum, 0xd9160d1fa8e418e3);
1919
//! ```
20+
//!
21+
//! Tracking links for the unstable features used when the `vpclmulqdq` cargo feature is enabled (which requires a nightly toolchain):
22+
//!
23+
//! - simd_ffi: https://github.com/rust-lang/rust/issues/27731
24+
//! - link_llvm_intrinsics: https://github.com/rust-lang/rust/issues/29602
25+
//! - avx512_target_feature: https://github.com/rust-lang/rust/issues/44839
//! - stdarch_x86_avx512: https://github.com/rust-lang/rust/issues/111137
26+
27+
#![cfg_attr(
28+
feature = "vpclmulqdq",
29+
feature(avx512_target_feature, stdarch_x86_avx512)
30+
)]
2031

2132
mod pclmulqdq;
2233
mod table;
2334

24-
type UpdateFn = fn(u64, &[u8]) -> u64;
35+
type UpdateFn = unsafe fn(u64, &[u8]) -> u64;
2536

2637
/// Represents an in-progress CRC-64 computation.
2738
#[derive(Clone)]
@@ -52,7 +63,9 @@ impl Digest {
5263

5364
/// Writes some data into the digest.
5465
pub fn write(&mut self, bytes: &[u8]) {
55-
self.state = (self.computer)(self.state, bytes);
66+
unsafe {
67+
self.state = (self.computer)(self.state, bytes);
68+
}
5669
}
5770

5871
/// Computes the current CRC-64/NVME value.

Diff for: src/pclmulqdq/mod.rs

+42-12
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,24 @@
77
//!
88
//! [white paper]: https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
99
10+
use std::{
11+
fmt::Debug,
12+
ops::{BitXor, BitXorAssign},
13+
};
14+
15+
use super::table;
16+
17+
use self::arch::Simd;
18+
1019
#[cfg(not(feature = "fake-simd"))]
11-
#[cfg_attr(target_arch = "x86_64", path = "x86_64.rs")]
20+
#[cfg_attr(target_arch = "x86_64", path = "x86_64/mod.rs")]
1221
#[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")]
1322
#[cfg_attr(target_arch = "x86", path = "x86.rs")]
1423
mod arch;
1524

1625
#[cfg(feature = "fake-simd")]
1726
mod arch;
1827

19-
use self::arch::Simd;
20-
use super::table;
21-
use std::{
22-
fmt::Debug,
23-
ops::{BitXor, BitXorAssign},
24-
};
25-
2628
/// This trait must be implemented on `self::arch::Simd` to provide the
2729
/// platform-specific SIMD implementations.
2830
trait SimdExt: Copy + Debug + BitXor {
@@ -71,24 +73,47 @@ impl BitXorAssign for Simd {
7173
}
7274

7375
pub fn get_update() -> super::UpdateFn {
76+
#[cfg(feature = "vpclmulqdq")]
77+
{
78+
use arch::vpclmulqdq::*;
79+
if Simd256::is_supported() {
80+
return update_256_batch;
81+
}
82+
}
83+
7484
if Simd::is_supported() {
75-
update
85+
update_128_batch
7686
} else {
7787
table::update
7888
}
7989
}
8090

81-
fn update(mut state: u64, bytes: &[u8]) -> u64 {
82-
let (left, middle, right) = unsafe { bytes.align_to::<[Simd; 8]>() };
91+
// Unsafe because it calls `update_simd`, which uses platform-specific SIMD instructions; callers must first check `Simd::is_supported()` (as `get_update` does).
92+
unsafe fn update_128_batch(mut state: u64, bytes: &[u8]) -> u64 {
93+
let (left, middle, right) = bytes.align_to::<[Simd; 8]>();
8394
if let Some((first, rest)) = middle.split_first() {
8495
state = table::update(state, left);
85-
state = unsafe { update_simd(state, first, rest) };
96+
state = update_simd(state, first, rest);
8697
table::update(state, right)
8798
} else {
8899
table::update(state, bytes)
89100
}
90101
}
91102

103+
#[cfg(feature = "vpclmulqdq")]
104+
#[target_feature(enable = "avx2", enable = "vpclmulqdq")]
105+
unsafe fn update_256_batch(mut state: u64, bytes: &[u8]) -> u64 {
106+
use arch::vpclmulqdq::*;
107+
let (left, middle, right) = bytes.align_to::<[[Simd256; 4]; 2]>();
108+
if let Some((first, rest)) = middle.split_first() {
109+
state = update_128_batch(state, left);
110+
state = update_vpclmulqdq(state, first, rest);
111+
update_128_batch(state, right)
112+
} else {
113+
update_128_batch(state, bytes)
114+
}
115+
}
116+
92117
#[cfg_attr(
93118
any(target_arch = "x86", target_arch = "x86_64"),
94119
target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")
@@ -112,6 +137,11 @@ unsafe fn update_simd(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]]) -> u64
112137
}
113138
}
114139

140+
fold_tail(x)
141+
}
142+
143+
#[inline(always)]
144+
unsafe fn fold_tail(x: [Simd; 8]) -> u64 {
115145
let coeffs = [
116146
Simd::new(table::K_895, table::K_959), // fold by distance of 112 bytes
117147
Simd::new(table::K_767, table::K_831), // fold by distance of 96 bytes

Diff for: src/pclmulqdq/x86_64.rs renamed to src/pclmulqdq/x86_64/mod.rs

+21-11
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
// Copyright 2020 TiKV Project Authors. Licensed under MIT or Apache-2.0.
22

3-
//! x86_64 implementation of the PCLMULQDQ-based CRC calculation.
3+
//! x86_64 implementation of the PCLMULQDQ-based CRC calculation.
44
55
#[cfg(target_arch = "x86_64")]
66
use std::arch::x86_64::*;
77
use std::ops::BitXor;
88

9+
#[cfg(feature = "vpclmulqdq")]
10+
pub mod vpclmulqdq;
11+
912
#[repr(transparent)]
1013
#[derive(Copy, Clone, Debug)]
1114
pub struct Simd(__m128i);
@@ -14,41 +17,48 @@ impl super::SimdExt for Simd {
1417
fn is_supported() -> bool {
1518
is_x86_feature_detected!("pclmulqdq") // _mm_clmulepi64_si128
1619
&& is_x86_feature_detected!("sse2") // (all other _mm_*)
17-
&& is_x86_feature_detected!("sse4.1") // _mm_extract_epi64
20+
&& is_x86_feature_detected!("sse4.1")
1821
}
1922

2023
#[inline]
2124
#[target_feature(enable = "sse2")]
2225
unsafe fn new(high: u64, low: u64) -> Self {
23-
Self(_mm_set_epi64x(high as i64, low as i64))
26+
// Split each u64 into 32-bit halves; `_mm_set_epi32` takes lanes from most to least significant, so the result equals `_mm_set_epi64x(high as i64, low as i64)`.
27+
let high_low = (high & 0xFFFFFFFF) as i32;
28+
let high_high = ((high >> 32) & 0xFFFFFFFF) as i32;
29+
let low_low = (low & 0xFFFFFFFF) as i32;
30+
let low_high = ((low >> 32) & 0xFFFFFFFF) as i32;
31+
32+
// Create the 128-bit vector using 32-bit parts
33+
Self(_mm_set_epi32(high_high, high_low, low_high, low_low))
2434
}
2535

2636
#[inline]
2737
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
2838
unsafe fn fold_16(self, coeff: Self) -> Self {
29-
let h = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x11));
30-
let l = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x00));
39+
let h = Self(_mm_clmulepi64_si128::<0x11>(self.0, coeff.0));
40+
let l = Self(_mm_clmulepi64_si128::<0x00>(self.0, coeff.0));
3141
h ^ l
3242
}
3343

3444
#[inline]
3545
#[target_feature(enable = "sse2", enable = "pclmulqdq")]
3646
unsafe fn fold_8(self, coeff: u64) -> Self {
3747
let coeff = Self::new(0, coeff);
38-
let h = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x00));
39-
let l = Self(_mm_srli_si128(self.0, 8));
48+
let h = Self(_mm_clmulepi64_si128::<0x00>(self.0, coeff.0));
49+
let l = Self(_mm_srli_si128::<8>(self.0));
4050
h ^ l
4151
}
4252

4353
#[inline]
4454
#[target_feature(enable = "sse2", enable = "sse4.1", enable = "pclmulqdq")]
4555
unsafe fn barrett(self, poly: u64, mu: u64) -> u64 {
4656
let polymu = Self::new(poly, mu);
47-
let t1 = _mm_clmulepi64_si128(self.0, polymu.0, 0x00);
48-
let h = Self(_mm_slli_si128(t1, 8));
49-
let l = Self(_mm_clmulepi64_si128(t1, polymu.0, 0x10));
57+
let t1 = _mm_clmulepi64_si128::<0x00>(self.0, polymu.0);
58+
let h = Self(_mm_slli_si128::<8>(t1));
59+
let l = Self(_mm_clmulepi64_si128::<0x10>(t1, polymu.0));
5060
let reduced = h ^ l ^ self;
51-
_mm_extract_epi64(reduced.0, 1) as u64
61+
_mm_extract_epi64::<1>(reduced.0) as u64
5262
}
5363
}
5464

0 commit comments

Comments
 (0)