 // feature is present at compile-time. We don't bother detecting other features.
 // Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
 
+use core::arch::asm;
 use core::intrinsics;
 use core::mem;
 
@@ -34,40 +35,61 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
 
 #[inline(always)]
 #[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe movsq (%rsi), (%rdi)",
-        "mov {byte_count:e}, %ecx",
-        "repe movsb (%rsi), (%rdi)",
-        byte_count = in(reg) byte_count,
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    // It also allows us to trivially skip the rep movsb, which is faster when memcpying
+    // aligned data.
+    if pre_byte_count > 0 {
+        asm!(
+            "rep movsb",
+            inout("ecx") pre_byte_count => _,
+            inout("rdi") dest => dest,
+            inout("rsi") src => src,
+            options(nostack, preserves_flags)
+        );
+    }
+    asm!(
+        "rep movsq",
         inout("rcx") qword_count => _,
-        inout("rdi") dest => _,
-        inout("rsi") src => _,
-        options(att_syntax, nostack, preserves_flags)
+        inout("rdi") dest => dest,
+        inout("rsi") src => src,
+        options(nostack, preserves_flags)
     );
+    if byte_count > 0 {
+        asm!(
+            "rep movsb",
+            inout("ecx") byte_count => _,
+            inout("rdi") dest => _,
+            inout("rsi") src => _,
+            options(nostack, preserves_flags)
+        );
+    }
 }
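// A minimal plain-Rust sketch of what the three asm blocks above do, assuming
// the `rep_param` helper added at the end of this diff. The name
// `copy_forward_sketch` and its body are illustrative, not part of the patch.
unsafe fn copy_forward_sketch(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Head ("rep movsb"): copy single bytes until `dest` is 8-byte aligned.
    core::ptr::copy_nonoverlapping(src, dest, pre_byte_count);
    src = src.add(pre_byte_count);
    dest = dest.add(pre_byte_count);
    // Body ("rep movsq"): copy whole 8-byte qwords from the aligned destination.
    core::ptr::copy_nonoverlapping(src, dest, qword_count * 8);
    src = src.add(qword_count * 8);
    dest = dest.add(qword_count * 8);
    // Tail ("rep movsb"): copy the 0..=7 leftover bytes.
    core::ptr::copy_nonoverlapping(src, dest, byte_count);
}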
 
 #[inline(always)]
 pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
+    let (pre_byte_count, qword_count, byte_count) = rep_param_rev(dest, count);
+    // We can't separate this into multiple asm! blocks because the std/cld pair
+    // must surround all of the rep instructions.
+    asm!(
         "std",
-        "repe movsq (%rsi), (%rdi)",
-        "movl {byte_count:e}, %ecx",
-        "addq $7, %rdi",
-        "addq $7, %rsi",
-        "repe movsb (%rsi), (%rdi)",
+        "rep movsb",
+        "sub rsi, 7",
+        "sub rdi, 7",
+        "mov rcx, {qword_count}",
+        "rep movsq",
+        "add rsi, 7",
+        "add rdi, 7",
+        "mov ecx, {byte_count:e}",
+        "rep movsb",
         "cld",
         byte_count = in(reg) byte_count,
-        inout("rcx") qword_count => _,
-        inout("rdi") dest.add(count).wrapping_sub(8) => _,
-        inout("rsi") src.add(count).wrapping_sub(8) => _,
-        options(att_syntax, nostack)
+        qword_count = in(reg) qword_count,
+        inout("ecx") pre_byte_count => _,
+        inout("rdi") dest.add(count - 1) => _,
+        inout("rsi") src.add(count - 1) => _,
+        // We modify flags, but we restore them afterwards
+        options(nostack, preserves_flags)
     );
 }
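// Worked trace of the reversed copy above, assuming dest as usize == 0x1000 and
// count == 20, so dest + count == 0x1014 and rep_param_rev returns
// (pre_byte_count, qword_count, byte_count) == (4, 2, 0):
// - rdi/rsi start at offset 19 (count - 1). With DF set, "rep movsb" with
//   ecx == 4 copies bytes 19, 18, 17, 16 and leaves rdi at offset 15.
// - "sub rsi/rdi, 7" repoints both at offset 8, the low byte of the highest
//   remaining qword: a DF=1 "movsq" reads the qword starting at [rsi] and then
//   decrements both pointers by 8, so two iterations copy bytes 15..=0.
// - "add rsi/rdi, 7" undoes that bias, and the final "rep movsb" with
//   ecx == byte_count == 0 is a no-op in this example; it handles counts that
//   leave a remainder after the qword phase.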
 
@@ -86,20 +108,36 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
 
 #[inline(always)]
 #[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe stosq %rax, (%rdi)",
-        "mov {byte_count:e}, %ecx",
-        "repe stosb %al, (%rdi)",
-        byte_count = in(reg) byte_count,
+pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    // It also allows us to trivially skip the rep stosb, which is faster when memsetting
+    // aligned data.
+    if pre_byte_count > 0 {
+        asm!(
+            "rep stosb",
+            inout("ecx") pre_byte_count => _,
+            inout("rdi") dest => dest,
+            in("al") c,
+            options(nostack, preserves_flags)
+        );
+    }
+    asm!(
+        "rep stosq",
         inout("rcx") qword_count => _,
-        inout("rdi") dest => _,
+        inout("rdi") dest => dest,
         in("rax") (c as u64) * 0x0101010101010101,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
+    if byte_count > 0 {
+        asm!(
+            "rep stosb",
+            inout("ecx") byte_count => _,
+            inout("rdi") dest => _,
+            in("al") c,
+            options(nostack, preserves_flags)
+        );
+    }
 }
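// The "rep stosq" block relies on `c` being replicated into all eight bytes of
// rax. A small illustrative helper (hypothetical, not part of the patch) showing
// that multiplying by 0x0101010101010101 performs that byte splat:
fn splat(c: u8) -> u64 {
    // Each 0x01 byte in the constant receives one copy of `c`; no carries occur
    // because c * 0x01 never exceeds a byte.
    (c as u64) * 0x0101_0101_0101_0101
}
// e.g. splat(0xAB) == 0xABAB_ABAB_ABAB_ABAB, matching the in("rax") operand above.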
 
 #[inline(always)]
@@ -156,3 +194,23 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
         c16(a.cast(), b.cast(), n)
     }
 }
+
+/// Determine optimal parameters for a `rep` instruction.
+fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+    // Unaligned writes are still slow on modern processors, so align the destination address.
+    let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
+    count -= pre_byte_count;
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    (pre_byte_count, qword_count, byte_count)
+}
+
+/// Determine optimal parameters for a reverse `rep` instruction (i.e. direction bit is set).
+fn rep_param_rev(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+    // Unaligned writes are still slow on modern processors, so align the destination address.
+    let pre_byte_count = ((dest as usize + count) & 0b111).min(count);
+    count -= pre_byte_count;
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    (pre_byte_count, qword_count, byte_count)
+}
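// Worked example for both helpers, assuming dest as usize == 0x1005 and count == 30:
// - rep_param: pre_byte_count == (8 - (0x1005 & 0b111)) & 0b111 == 3, the bytes
//   needed to reach the next 8-byte boundary; the remaining 27 bytes split into
//   qword_count == 3 and byte_count == 3, and 3 + 3 * 8 + 3 == 30.
// - rep_param_rev: pre_byte_count == (0x1005 + 30) & 0b111 == 3, the bytes past
//   the last 8-byte boundary, which the backwards copy consumes first; the
//   remaining 27 bytes again give qword_count == 3 and byte_count == 3.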