
Commit db0ca0c

Align destination in mem* instructions.
While misaligned reads are generally fast, misaligned writes aren't and can have severe penalties.
1 parent f10dbd9 commit db0ca0c
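
The penalty the message refers to typically shows up when an 8-byte store straddles a cache-line (or page) boundary; a store to an 8-byte-aligned address never crosses a 64-byte line. A minimal standalone sketch of that boundary check, not part of the commit and assuming the usual 64-byte x86_64 cache line:

// Returns true if an 8-byte store at `addr` would span two 64-byte cache lines.
fn straddles_cache_line(addr: usize) -> bool {
    const LINE: usize = 64; // typical x86_64 cache-line size
    addr / LINE != (addr + 7) / LINE
}

fn main() {
    assert!(straddles_cache_line(0x103D)); // misaligned store crossing a line boundary
    assert!(!straddles_cache_line(0x1038)); // 8-byte-aligned store stays within one line
}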

File tree

1 file changed, +94 -36 lines changed

src/mem/x86_64.rs

@@ -16,6 +16,7 @@
 // feature is present at compile-time. We don't bother detecting other features.
 // Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

+use core::arch::asm;
 use core::intrinsics;
 use core::mem;

@@ -34,40 +35,61 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {

 #[inline(always)]
 #[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe movsq (%rsi), (%rdi)",
-        "mov {byte_count:e}, %ecx",
-        "repe movsb (%rsi), (%rdi)",
-        byte_count = in(reg) byte_count,
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    // It also allows us to trivially skip the rep movsb, which is faster when memcpying
+    // aligned data.
+    if pre_byte_count > 0 {
+        asm!(
+            "rep movsb",
+            inout("ecx") pre_byte_count => _,
+            inout("rdi") dest => dest,
+            inout("rsi") src => src,
+            options(nostack, preserves_flags)
+        );
+    }
+    asm!(
+        "rep movsq",
         inout("rcx") qword_count => _,
-        inout("rdi") dest => _,
-        inout("rsi") src => _,
-        options(att_syntax, nostack, preserves_flags)
+        inout("rdi") dest => dest,
+        inout("rsi") src => src,
+        options(nostack, preserves_flags)
     );
+    if byte_count > 0 {
+        asm!(
+            "rep movsb",
+            inout("ecx") byte_count => _,
+            inout("rdi") dest => _,
+            inout("rsi") src => _,
+            options(nostack, preserves_flags)
+        );
+    }
 }

 #[inline(always)]
 pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
+    let (pre_byte_count, qword_count, byte_count) = rep_param_rev(dest, count);
+    // We can't separate this block due to std/cld
+    asm!(
         "std",
-        "repe movsq (%rsi), (%rdi)",
-        "movl {byte_count:e}, %ecx",
-        "addq $7, %rdi",
-        "addq $7, %rsi",
-        "repe movsb (%rsi), (%rdi)",
+        "rep movsb",
+        "sub rsi, 7",
+        "sub rdi, 7",
+        "mov rcx, {qword_count}",
+        "rep movsq",
+        "add rsi, 7",
+        "add rdi, 7",
+        "mov ecx, {byte_count:e}",
+        "rep movsb",
         "cld",
         byte_count = in(reg) byte_count,
-        inout("rcx") qword_count => _,
-        inout("rdi") dest.add(count).wrapping_sub(8) => _,
-        inout("rsi") src.add(count).wrapping_sub(8) => _,
-        options(att_syntax, nostack)
+        qword_count = in(reg) qword_count,
+        inout("ecx") pre_byte_count => _,
+        inout("rdi") dest.add(count - 1) => _,
+        inout("rsi") src.add(count - 1) => _,
+        // We modify flags, but we restore it afterwards
+        options(nostack, preserves_flags)
     );
 }

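Since the backward block above packs three rep phases into one asm! with DF set, here is a rough pure-Rust sketch, not part of the commit, of the order in which bytes actually move. The function name and loop structure are illustrative; the split is the same one rep_param_rev (added at the end of this diff) computes.

// Copies count bytes from src to dest back-to-front, mirroring the asm above:
// tail bytes until the end of the destination is 8-byte aligned, then whole
// qwords, then the remaining head bytes. Like copy_backward, it stays correct
// for overlapping regions where dest sits above src.
unsafe fn copy_backward_sketch(dest: *mut u8, src: *const u8, count: usize) {
    // Same split as rep_param_rev.
    let pre_byte_count = ((dest as usize + count) & 0b111).min(count);
    let qword_count = (count - pre_byte_count) >> 3;
    let byte_count = (count - pre_byte_count) & 0b111;

    // 1. First "rep movsb" (DF=1): copy the last pre_byte_count bytes so the end
    //    of the still-uncopied destination range lands on an 8-byte boundary.
    for off in (count - pre_byte_count..count).rev() {
        *dest.add(off) = *src.add(off);
    }
    // 2. "rep movsq" (DF=1): copy whole qwords backwards; every destination
    //    store now starts at an 8-byte-aligned address.
    for i in (0..qword_count).rev() {
        let off = byte_count + i * 8;
        (dest.add(off) as *mut u64)
            .write_unaligned((src.add(off) as *const u64).read_unaligned());
    }
    // 3. Final "rep movsb": copy the byte_count bytes left at the front.
    for off in (0..byte_count).rev() {
        *dest.add(off) = *src.add(off);
    }
}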

@@ -86,20 +108,36 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {

 #[inline(always)]
 #[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe stosq %rax, (%rdi)",
-        "mov {byte_count:e}, %ecx",
-        "repe stosb %al, (%rdi)",
-        byte_count = in(reg) byte_count,
+pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    // It also allows us to trivially skip the rep stosb, which is faster when memcpying
+    // aligned data.
+    if pre_byte_count > 0 {
+        asm!(
+            "rep stosb",
+            inout("ecx") pre_byte_count => _,
+            inout("rdi") dest => dest,
+            in("al") c,
+            options(nostack, preserves_flags)
+        );
+    }
+    asm!(
+        "rep stosq",
         inout("rcx") qword_count => _,
-        inout("rdi") dest => _,
+        inout("rdi") dest => dest,
         in("rax") (c as u64) * 0x0101010101010101,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
+    if byte_count > 0 {
+        asm!(
+            "rep stosb",
+            inout("ecx") byte_count => _,
+            inout("rdi") dest => _,
+            in("al") c,
+            options(nostack, preserves_flags)
+        );
+    }
 }

 #[inline(always)]
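
One small detail in set_bytes above: the rep stosq phase needs the fill byte replicated across all eight bytes of RAX, which is what the multiplication by 0x0101010101010101 does. A tiny standalone check, not part of the commit:

// Broadcasts a byte into every byte of a u64, as fed to "rep stosq".
fn broadcast(c: u8) -> u64 {
    (c as u64) * 0x0101_0101_0101_0101
}

fn main() {
    assert_eq!(broadcast(0xAB), 0xABAB_ABAB_ABAB_ABAB);
    assert_eq!(broadcast(0xAB).to_ne_bytes(), [0xAB; 8]);
}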
@@ -156,3 +194,23 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
         c16(a.cast(), b.cast(), n)
     }
 }
+
+/// Determine optimal parameters for a `rep` instruction.
+fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+    // Unaligned writes are still slow on modern processors, so align the destination address.
+    let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
+    count -= pre_byte_count;
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    (pre_byte_count, qword_count, byte_count)
+}
+
+/// Determine optimal parameters for a reverse `rep` instruction (i.e. direction bit is set).
+fn rep_param_rev(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+    // Unaligned writes are still slow on modern processors, so align the destination address.
+    let pre_byte_count = ((dest as usize + count) & 0b111).min(count);
+    count -= pre_byte_count;
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    (pre_byte_count, qword_count, byte_count)
+}
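
A short usage sketch, not part of the commit, showing the split rep_param produces for a hypothetical misaligned destination, and that the following qword phase therefore starts on an 8-byte boundary. The helper is repeated here so the example compiles on its own.

fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
    let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    let qword_count = count >> 3;
    let byte_count = count & 0b111;
    (pre_byte_count, qword_count, byte_count)
}

fn main() {
    // Hypothetical destination, 3 bytes past an 8-byte boundary.
    let dest = 0x1003 as *mut u8;
    let count = 100usize;

    let (pre, qwords, tail) = rep_param(dest, count);
    assert_eq!((pre, qwords, tail), (5, 11, 7)); // 5 + 11 * 8 + 7 == 100
    // After the pre-copy, the destination address is 8-byte aligned.
    assert_eq!((dest as usize + pre) % 8, 0);
}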
