-
Notifications
You must be signed in to change notification settings - Fork 235
Use REP MOVSQ/STOSQ on x86_64 #365
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 8 commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
c6c7621
mem: Move mem* functions to separate directory
josephlr 80b7c01
memcpy: Create separate memcpy.rs file
josephlr ee54782
benches: Add benchmarks for mem* functions
josephlr fb03d26
mem: Add REP MOVSB/STOSB implementations
josephlr 2a0132c
mem: Add documentations for REP string insturctions
josephlr aa75260
Use quad-word rep string instructions
josephlr de4ed28
Prevent panic when compiled in debug mode
josephlr fe71a12
Add tests for mem* functions
josephlr aa326a3
Add build/test with the "asm" feature
josephlr d4a180a
Add byte length to Bencher
josephlr File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
use super::c_int; | ||
|
||
/// Copies `n` bytes from `src` to `dest`, one byte at a time, front to back.
///
/// Returns `dest` unchanged.
///
/// # Safety
///
/// `src` must be readable and `dest` writable for `n` bytes. Because bytes are
/// copied in ascending address order, the regions should not overlap in a way
/// that requires a backward copy (use `memmove` for that).
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    let mut to = dest;
    let mut from = src;
    let mut remaining = n;
    while remaining != 0 {
        *to = *from;
        to = to.add(1);
        from = from.add(1);
        remaining -= 1;
    }
    dest
}
|
||
/// Copies `n` bytes from `src` to `dest`, tolerating overlapping regions.
///
/// When `src` lies below `dest` the bytes are copied from the last one down to
/// the first, so that an overlapping tail of the source is read before it is
/// overwritten. Otherwise the bytes are copied from the first one upwards.
/// Returns `dest` unchanged.
///
/// # Safety
///
/// `src` must be readable and `dest` writable for `n` bytes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    let mut remaining = n;
    if (dest as *const u8) > src {
        // Backward copy: start one past the end and step down before each byte.
        let mut to = dest.add(n);
        let mut from = src.add(n);
        while remaining != 0 {
            to = to.sub(1);
            from = from.sub(1);
            *to = *from;
            remaining -= 1;
        }
    } else {
        // Forward copy: start at the first byte and step up after each byte.
        let mut to = dest;
        let mut from = src;
        while remaining != 0 {
            *to = *from;
            to = to.add(1);
            from = from.add(1);
            remaining -= 1;
        }
    }
    dest
}
|
||
/// Fills the `n` bytes starting at `s` with the low 8 bits of `c`.
///
/// Returns `s` unchanged.
///
/// # Safety
///
/// `s` must be writable for `n` bytes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
    // Only the low byte of `c` is stored; compute it once up front.
    let fill = c as u8;
    let mut cursor = s;
    let mut remaining = n;
    while remaining != 0 {
        *cursor = fill;
        cursor = cursor.add(1);
        remaining -= 1;
    }
    s
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
use super::c_int; | ||
|
||
// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have | ||
// been enhanced to perform better than a simple qword loop, making them ideal | ||
// for implementing memcpy/memset. Note that "rep cmps" has received no such | ||
// enhancement, so it is not used to implement memcmp. | ||
// | ||
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been | ||
// further enhanced to automatically select the best microarchitectural | ||
// implementation based on length and alignment. See the following features from | ||
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual": | ||
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later) | ||
// - FSRM - Fast Short REP MOV (Ice Lake and later) | ||
// - Fast Zero-Length MOVSB (On no current hardware) | ||
// - Fast Short STOSB (On no current hardware) | ||
// However, to avoid run-time feature detection, we don't use these byte-based | ||
// instructions for most of the copying, preferring the qword variants. | ||
|
||
/// Copies `count` bytes from `src` to `dest` using x86_64 string instructions.
///
/// The bulk of the data is moved eight bytes at a time with `rep movsq`; the
/// remaining 0..=7 bytes are then moved with `rep movsb`. Returns `dest`.
///
/// # Safety
///
/// `src` must be readable and `dest` writable for `count` bytes. The copy runs
/// in ascending address order, so it relies on the direction flag being clear
/// on entry.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
    // Split the length into whole 8-byte words and the leftover tail bytes.
    let tail_bytes = count & 0b111;
    let whole_qwords = count >> 3;
    asm!(
        // rcx = number of qwords; advances rsi/rdi by 8 per iteration.
        "rep movsq [rdi], [rsi]",
        // rcx is zero here; reload it with the tail length (writing ecx
        // zero-extends into rcx).
        "mov ecx, {tail:e}",
        "rep movsb [rdi], [rsi]",
        tail = in(reg) tail_bytes,
        inout("rsi") src => _,
        inout("rdi") dest => _,
        inout("rcx") whole_qwords => _,
        options(preserves_flags, nostack)
    );
    dest
}
|
||
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)] | ||
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 { | ||
let delta = (dest as usize).wrapping_sub(src as usize); | ||
if delta >= count { | ||
// We can copy forwards because either dest is far enough ahead of src, | ||
// or src is ahead of dest (and delta overflowed). | ||
return self::memcpy(dest, src, count); | ||
} | ||
// copy backwards | ||
let qword_count = count >> 3; | ||
let byte_count = count & 0b111; | ||
asm!( | ||
"std", | ||
"rep movsq [rdi], [rsi]", | ||
"mov ecx, {byte_count:e}", | ||
"add rdi, 7", | ||
"add rsi, 7", | ||
"rep movsb [rdi], [rsi]", | ||
"cld", | ||
byte_count = in(reg) byte_count, | ||
inout("rcx") qword_count => _, | ||
inout("rdi") dest.offset(count as isize).wrapping_sub(8) => _, | ||
inout("rsi") src.offset(count as isize).wrapping_sub(8) => _, | ||
options(nostack) | ||
); | ||
dest | ||
} | ||
|
||
/// Fills `count` bytes at `dest` with the low 8 bits of `c` using x86_64
/// string instructions.
///
/// Whole qwords are written with `rep stosq` from a register holding the fill
/// byte replicated eight times; the remaining 0..=7 bytes are written with
/// `rep stosb`. Returns `dest`.
///
/// # Safety
///
/// `dest` must be writable for `count` bytes. The fill runs in ascending
/// address order, so it relies on the direction flag being clear on entry.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
    // Broadcast the fill byte into every byte lane of a 64-bit word.
    let pattern = u64::from(c as u8) * 0x0101_0101_0101_0101;
    let tail_bytes = count & 0b111;
    let whole_qwords = count >> 3;
    asm!(
        "rep stosq [rdi], rax",
        // rcx is zero here; reload it with the tail length.
        "mov ecx, {tail:e}",
        "rep stosb [rdi], al",
        tail = in(reg) tail_bytes,
        in("rax") pattern,
        inout("rdi") dest => _,
        inout("rcx") whole_qwords => _,
        options(preserves_flags, nostack)
    );
    dest
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
#![feature(test)] | ||
|
||
extern crate test; | ||
use test::{black_box, Bencher}; | ||
|
||
extern crate compiler_builtins; | ||
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset}; | ||
|
||
fn memcpy_builtin(b: &mut Bencher, n: usize) { | ||
let v1 = vec![1u8; n]; | ||
let mut v2 = vec![0u8; n]; | ||
b.iter(|| { | ||
josephlr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
let src: &[u8] = black_box(&v1); | ||
let dst: &mut [u8] = black_box(&mut v2); | ||
dst.copy_from_slice(src); | ||
}) | ||
} | ||
|
||
fn memcpy_rust(b: &mut Bencher, n: usize) { | ||
let v1 = vec![1u8; n]; | ||
let mut v2 = vec![0u8; n]; | ||
b.iter(|| { | ||
let src: &[u8] = black_box(&v1); | ||
let dst: &mut [u8] = black_box(&mut v2); | ||
unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) } | ||
}) | ||
} | ||
|
||
fn memset_builtin(b: &mut Bencher, n: usize) { | ||
let mut v1 = vec![0u8; n]; | ||
b.iter(|| { | ||
let dst: &mut [u8] = black_box(&mut v1); | ||
let val: u8 = black_box(27); | ||
for b in dst { | ||
*b = val; | ||
} | ||
}) | ||
} | ||
|
||
fn memset_rust(b: &mut Bencher, n: usize) { | ||
let mut v1 = vec![0u8; n]; | ||
b.iter(|| { | ||
let dst: &mut [u8] = black_box(&mut v1); | ||
let val = black_box(27); | ||
unsafe { memset(dst.as_mut_ptr(), val, n) } | ||
}) | ||
} | ||
|
||
fn memcmp_builtin(b: &mut Bencher, n: usize) { | ||
let v1 = vec![0u8; n]; | ||
let mut v2 = vec![0u8; n]; | ||
v2[n - 1] = 1; | ||
b.iter(|| { | ||
let s1: &[u8] = black_box(&v1); | ||
let s2: &[u8] = black_box(&v2); | ||
s1.cmp(s2) | ||
}) | ||
} | ||
|
||
fn memcmp_rust(b: &mut Bencher, n: usize) { | ||
let v1 = vec![0u8; n]; | ||
let mut v2 = vec![0u8; n]; | ||
v2[n - 1] = 1; | ||
b.iter(|| { | ||
let s1: &[u8] = black_box(&v1); | ||
let s2: &[u8] = black_box(&v2); | ||
unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n) } | ||
}) | ||
} | ||
|
||
fn memmove_builtin(b: &mut Bencher, n: usize) { | ||
let mut v = vec![0u8; n + n / 2]; | ||
b.iter(|| { | ||
let s: &mut [u8] = black_box(&mut v); | ||
s.copy_within(0..n, n / 2); | ||
}) | ||
} | ||
|
||
fn memmove_rust(b: &mut Bencher, n: usize) { | ||
let mut v = vec![0u8; n + n / 2]; | ||
b.iter(|| { | ||
let dst: *mut u8 = black_box(&mut v[n / 2..]).as_mut_ptr(); | ||
let src: *const u8 = black_box(&v).as_ptr(); | ||
unsafe { memmove(dst, src, n) }; | ||
}) | ||
} | ||
|
||
#[bench] | ||
fn memcpy_builtin_4096(b: &mut Bencher) { | ||
memcpy_builtin(b, 4096) | ||
} | ||
#[bench] | ||
fn memcpy_rust_4096(b: &mut Bencher) { | ||
memcpy_rust(b, 4096) | ||
} | ||
#[bench] | ||
fn memcpy_builtin_1048576(b: &mut Bencher) { | ||
memcpy_builtin(b, 1048576) | ||
} | ||
#[bench] | ||
fn memcpy_rust_1048576(b: &mut Bencher) { | ||
memcpy_rust(b, 1048576) | ||
} | ||
|
||
#[bench] | ||
fn memset_builtin_4096(b: &mut Bencher) { | ||
memset_builtin(b, 4096) | ||
} | ||
#[bench] | ||
fn memset_rust_4096(b: &mut Bencher) { | ||
memset_rust(b, 4096) | ||
} | ||
#[bench] | ||
fn memset_builtin_1048576(b: &mut Bencher) { | ||
memset_builtin(b, 1048576) | ||
} | ||
#[bench] | ||
fn memset_rust_1048576(b: &mut Bencher) { | ||
memset_rust(b, 1048576) | ||
} | ||
|
||
#[bench] | ||
fn memcmp_builtin_4096(b: &mut Bencher) { | ||
memcmp_builtin(b, 4096) | ||
} | ||
#[bench] | ||
fn memcmp_rust_4096(b: &mut Bencher) { | ||
memcmp_rust(b, 4096) | ||
} | ||
#[bench] | ||
fn memcmp_builtin_1048576(b: &mut Bencher) { | ||
memcmp_builtin(b, 1048576) | ||
} | ||
#[bench] | ||
fn memcmp_rust_1048576(b: &mut Bencher) { | ||
memcmp_rust(b, 1048576) | ||
} | ||
|
||
#[bench] | ||
fn memmove_builtin_4096(b: &mut Bencher) { | ||
memmove_builtin(b, 4096) | ||
} | ||
#[bench] | ||
fn memmove_rust_4096(b: &mut Bencher) { | ||
memmove_rust(b, 4096) | ||
} | ||
#[bench] | ||
fn memmove_builtin_1048576(b: &mut Bencher) { | ||
memmove_builtin(b, 1048576) | ||
} | ||
#[bench] | ||
fn memmove_rust_1048576(b: &mut Bencher) { | ||
memmove_rust(b, 1048576) | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.