Skip to content

Commit 5e41b86

Browse files
josephlr and AaronKutch
authored and committed
Use REP MOVSB/STOSB when the ERMSB feature is present (rust-lang#392)
* Reorganize mem functions — this reduces the amount of platform-specific code. Signed-off-by: Joe Richey <[email protected]>
* Use ERMSB implementations if the feature is set. Signed-off-by: Joe Richey <[email protected]>
* Add non-aligned benchmarks. Signed-off-by: Joe Richey <[email protected]>
1 parent 332790c commit 5e41b86

File tree

5 files changed

+148
-88
lines changed

5 files changed

+148
-88
lines changed

src/mem/impls.rs

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
use super::c_int;
2+
3+
/// Copies `n` bytes from `src` to `dest`, walking forward from byte 0.
///
/// Forward iteration is only correct for overlapping buffers when
/// `dest` precedes `src`; `memmove` picks between this and
/// `copy_backward` accordingly.
///
/// # Safety
/// `src` must be valid for reads of `n` bytes and `dest` must be valid
/// for writes of `n` bytes.
#[inline(always)]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) {
    let mut i = 0;
    while i < n {
        // `add(i)` expresses the same offset as `offset(i as isize)`
        // without the lossy usize -> isize cast.
        *dest.add(i) = *src.add(i);
        i += 1;
    }
}
11+
12+
/// Copies `n` bytes from `src` to `dest`, walking backward from the end.
///
/// Backward iteration makes the copy correct for overlapping buffers
/// where `dest` lies after `src` (each source byte is read before the
/// forward-moving destination would have overwritten it).
///
/// # Safety
/// `src` must be valid for reads of `n` bytes and `dest` must be valid
/// for writes of `n` bytes.
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) {
    // Start one past the last byte and pre-decrement, so `i` is always
    // a valid in-bounds index and n == 0 copies nothing.
    let mut i = n;
    while i != 0 {
        i -= 1;
        // `add(i)` avoids the lossy usize -> isize cast of
        // `offset(i as isize)`.
        *dest.add(i) = *src.add(i);
    }
}
21+
22+
/// Fills the first `n` bytes at `s` with the byte value `c`.
///
/// # Safety
/// `s` must be valid for writes of `n` bytes.
#[inline(always)]
pub unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) {
    let mut i = 0;
    while i < n {
        // `add(i)` expresses the same offset as `offset(i as isize)`
        // without the lossy usize -> isize cast.
        *s.add(i) = c;
        i += 1;
    }
}

src/mem/memcpy.rs

-41
This file was deleted.

src/mem/mod.rs

+26-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,32 @@ use core::ops::{BitOr, Shl};
1111

1212
// memcpy/memmove/memset have optimized implementations on some architectures
1313
#[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
14-
mod memcpy;
15-
pub use self::memcpy::*;
14+
mod impls;
15+
16+
// `memcpy` intrinsic: copies `n` bytes from `src` to `dest` and returns
// `dest`. The buffers must not overlap (C contract); overlapping copies
// must go through `memmove` below. The symbol is only exported unmangled
// when the "mem" feature is on and "mangled-names" is off.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
17+
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
18+
impls::copy_forward(dest, src, n);
19+
// C's memcpy returns the destination pointer.
dest
20+
}
21+
22+
// `memmove` intrinsic: copies `n` bytes from `src` to `dest`, correct even
// when the buffers overlap, and returns `dest`.
//
// Direction selection via one wrapping subtraction: `delta = dest - src`
// (mod 2^word). If `delta >= n`, either `dest` starts at least `n` bytes
// after `src` (no forward-copy hazard) or `dest < src` (the subtraction
// wrapped to a huge value) — forward copy is safe in both cases. Only when
// `dest` lands strictly inside `(src, src + n)` is a backward copy needed.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
23+
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
24+
let delta = (dest as usize).wrapping_sub(src as usize);
25+
if delta >= n {
26+
// We can copy forwards because either dest is far enough ahead of src,
27+
// or src is ahead of dest (and delta overflowed).
28+
impls::copy_forward(dest, src, n);
29+
} else {
30+
impls::copy_backward(dest, src, n);
31+
}
32+
// C's memmove returns the destination pointer.
dest
33+
}
34+
35+
// `memset` intrinsic: fills the first `n` bytes at `s` with `c` and
// returns `s`. Per the C signature the fill value arrives as an `int`;
// only its low byte is used (`c as u8` truncates), matching C semantics.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
36+
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
37+
impls::set_bytes(s, c as u8, n);
38+
// C's memset returns the destination pointer.
s
39+
}
1640

1741
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
1842
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {

src/mem/x86_64.rs

+37-21
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
use super::c_int;
2-
31
// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
42
// been enhanced to perform better than an simple qword loop, making them ideal
53
// for implementing memcpy/memset. Note that "rep cmps" has received no such
@@ -13,11 +11,26 @@ use super::c_int;
1311
// - FSRM - Fast Short REP MOV (Ice Lake and later)
1412
// - Fast Zero-Length MOVSB (On no current hardware)
1513
// - Fast Short STOSB (On no current hardware)
16-
// However, to avoid run-time feature detection, we don't use these byte-based
17-
// instructions for most of the copying, preferring the qword variants.
14+
//
15+
// To simplify things, we switch to using the byte-based variants if the "ermsb"
16+
// feature is present at compile-time. We don't bother detecting other features.
17+
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
18+
19+
// ERMSB forward copy: with Enhanced REP MOVSB/STOSB, a single
// byte-granular `rep movsb` is the preferred forward copy, so the
// qword main loop of the non-ERMSB variant is unnecessary.
// `rep movsb` moves rcx bytes from [rsi] to [rdi]; all three registers
// are advanced by the instruction, hence the `=> _` clobbers.
// NOTE(review): assumes DF=0 (forward direction) on entry, as the
// x86-64 ABI requires — confirm no caller sets DF.
#[inline(always)]
20+
#[cfg(target_feature = "ermsb")]
21+
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
22+
asm!(
23+
"rep movsb [rdi], [rsi]",
24+
inout("rcx") count => _,
25+
inout("rdi") dest => _,
26+
inout("rsi") src => _,
27+
// rep movsb reads but never writes the flags, so preserves_flags holds.
options(nostack, preserves_flags)
28+
);
29+
}
1830

19-
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
20-
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
31+
#[inline(always)]
32+
#[cfg(not(target_feature = "ermsb"))]
33+
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
2134
let qword_count = count >> 3;
2235
let byte_count = count & 0b111;
2336
asm!(
@@ -30,18 +43,10 @@ pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) ->
3043
inout("rsi") src => _,
3144
options(nostack, preserves_flags)
3245
);
33-
dest
3446
}
3547

36-
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
37-
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
38-
let delta = (dest as usize).wrapping_sub(src as usize);
39-
if delta >= count {
40-
// We can copy forwards because either dest is far enough ahead of src,
41-
// or src is ahead of dest (and delta overflowed).
42-
return self::memcpy(dest, src, count);
43-
}
44-
// copy backwards
48+
#[inline(always)]
49+
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
4550
let qword_count = count >> 3;
4651
let byte_count = count & 0b111;
4752
asm!(
@@ -58,11 +63,23 @@ pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) ->
5863
inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
5964
options(nostack)
6065
);
61-
dest
6266
}
6367

64-
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
65-
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
68+
// ERMSB memset: `rep stosb` stores al into [rdi] rcx times, advancing
// rdi as it goes — with ERMSB this byte-granular form is preferred over
// the qword loop of the non-ERMSB variant. The fill byte is pinned to
// the `al` register because `stosb` implicitly reads it.
// NOTE(review): assumes DF=0 on entry per the x86-64 ABI.
#[inline(always)]
69+
#[cfg(target_feature = "ermsb")]
70+
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
71+
asm!(
72+
"rep stosb [rdi], al",
73+
inout("rcx") count => _,
74+
inout("rdi") dest => _,
75+
inout("al") c => _,
76+
options(nostack, preserves_flags)
77+
// No trailing semicolon: the `asm!` is the tail expression of a
// unit-returning fn, which is valid (though a `;` would match the
// style of the sibling functions).
)
78+
}
79+
80+
#[inline(always)]
81+
#[cfg(not(target_feature = "ermsb"))]
82+
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
6683
let qword_count = count >> 3;
6784
let byte_count = count & 0b111;
6885
asm!(
@@ -72,8 +89,7 @@ pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u
7289
byte_count = in(reg) byte_count,
7390
inout("rcx") qword_count => _,
7491
inout("rdi") dest => _,
75-
in("rax") (c as u8 as u64) * 0x0101010101010101,
92+
in("rax") (c as u64) * 0x0101010101010101,
7693
options(nostack, preserves_flags)
7794
);
78-
dest
7995
}

testcrate/benches/mem.rs

+56-24
Original file line numberDiff line numberDiff line change
@@ -6,45 +6,45 @@ use test::{black_box, Bencher};
66
extern crate compiler_builtins;
77
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
88

9-
fn memcpy_builtin(b: &mut Bencher, n: usize) {
10-
let v1 = vec![1u8; n];
11-
let mut v2 = vec![0u8; n];
9+
fn memcpy_builtin(b: &mut Bencher, n: usize, offset: usize) {
10+
let v1 = vec![1u8; n + offset];
11+
let mut v2 = vec![0u8; n + offset];
1212
b.bytes = n as u64;
1313
b.iter(|| {
14-
let src: &[u8] = black_box(&v1);
15-
let dst: &mut [u8] = black_box(&mut v2);
14+
let src: &[u8] = black_box(&v1[offset..]);
15+
let dst: &mut [u8] = black_box(&mut v2[offset..]);
1616
dst.copy_from_slice(src);
1717
})
1818
}
1919

20-
fn memcpy_rust(b: &mut Bencher, n: usize) {
21-
let v1 = vec![1u8; n];
22-
let mut v2 = vec![0u8; n];
20+
fn memcpy_rust(b: &mut Bencher, n: usize, offset: usize) {
21+
let v1 = vec![1u8; n + offset];
22+
let mut v2 = vec![0u8; n + offset];
2323
b.bytes = n as u64;
2424
b.iter(|| {
25-
let src: &[u8] = black_box(&v1);
26-
let dst: &mut [u8] = black_box(&mut v2);
25+
let src: &[u8] = black_box(&v1[offset..]);
26+
let dst: &mut [u8] = black_box(&mut v2[offset..]);
2727
unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
2828
})
2929
}
3030

31-
fn memset_builtin(b: &mut Bencher, n: usize) {
32-
let mut v1 = vec![0u8; n];
31+
fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
32+
let mut v1 = vec![0u8; n + offset];
3333
b.bytes = n as u64;
3434
b.iter(|| {
35-
let dst: &mut [u8] = black_box(&mut v1);
35+
let dst: &mut [u8] = black_box(&mut v1[offset..]);
3636
let val: u8 = black_box(27);
3737
for b in dst {
3838
*b = val;
3939
}
4040
})
4141
}
4242

43-
fn memset_rust(b: &mut Bencher, n: usize) {
44-
let mut v1 = vec![0u8; n];
43+
fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
44+
let mut v1 = vec![0u8; n + offset];
4545
b.bytes = n as u64;
4646
b.iter(|| {
47-
let dst: &mut [u8] = black_box(&mut v1);
47+
let dst: &mut [u8] = black_box(&mut v1[offset..]);
4848
let val = black_box(27);
4949
unsafe { memset(dst.as_mut_ptr(), val, n) }
5050
})
@@ -95,36 +95,68 @@ fn memmove_rust(b: &mut Bencher, n: usize) {
9595

9696
#[bench]
9797
fn memcpy_builtin_4096(b: &mut Bencher) {
98-
memcpy_builtin(b, 4096)
98+
memcpy_builtin(b, 4096, 0)
9999
}
100100
#[bench]
101101
fn memcpy_rust_4096(b: &mut Bencher) {
102-
memcpy_rust(b, 4096)
102+
memcpy_rust(b, 4096, 0)
103103
}
104104
#[bench]
105105
fn memcpy_builtin_1048576(b: &mut Bencher) {
106-
memcpy_builtin(b, 1048576)
106+
memcpy_builtin(b, 1048576, 0)
107107
}
108108
#[bench]
109109
fn memcpy_rust_1048576(b: &mut Bencher) {
110-
memcpy_rust(b, 1048576)
110+
memcpy_rust(b, 1048576, 0)
111+
}
112+
#[bench]
113+
fn memcpy_builtin_4096_offset(b: &mut Bencher) {
114+
memcpy_builtin(b, 4096, 65)
115+
}
116+
#[bench]
117+
fn memcpy_rust_4096_offset(b: &mut Bencher) {
118+
memcpy_rust(b, 4096, 65)
119+
}
120+
#[bench]
121+
fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
122+
memcpy_builtin(b, 1048576, 65)
123+
}
124+
#[bench]
125+
fn memcpy_rust_1048576_offset(b: &mut Bencher) {
126+
memcpy_rust(b, 1048576, 65)
111127
}
112128

113129
#[bench]
114130
fn memset_builtin_4096(b: &mut Bencher) {
115-
memset_builtin(b, 4096)
131+
memset_builtin(b, 4096, 0)
116132
}
117133
#[bench]
118134
fn memset_rust_4096(b: &mut Bencher) {
119-
memset_rust(b, 4096)
135+
memset_rust(b, 4096, 0)
120136
}
121137
#[bench]
122138
fn memset_builtin_1048576(b: &mut Bencher) {
123-
memset_builtin(b, 1048576)
139+
memset_builtin(b, 1048576, 0)
124140
}
125141
#[bench]
126142
fn memset_rust_1048576(b: &mut Bencher) {
127-
memset_rust(b, 1048576)
143+
memset_rust(b, 1048576, 0)
144+
}
145+
#[bench]
146+
fn memset_builtin_4096_offset(b: &mut Bencher) {
147+
memset_builtin(b, 4096, 65)
148+
}
149+
#[bench]
150+
fn memset_rust_4096_offset(b: &mut Bencher) {
151+
memset_rust(b, 4096, 65)
152+
}
153+
#[bench]
154+
fn memset_builtin_1048576_offset(b: &mut Bencher) {
155+
memset_builtin(b, 1048576, 65)
156+
}
157+
#[bench]
158+
fn memset_rust_1048576_offset(b: &mut Bencher) {
159+
memset_rust(b, 1048576, 65)
128160
}
129161

130162
#[bench]

0 commit comments

Comments (0)