@@ -173,88 +173,82 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
173
173
c16 ( a. cast ( ) , b. cast ( ) , n)
174
174
}
175
175
176
+
177
+ // In order to process more than one byte simultaneously when executing strlen,
178
+ // two things must be considered:
179
+ // * An n byte read with an n-byte aligned address will never cross
180
+ // a page boundary and will always succeed. Any smaller alignment
181
+ // may result in a read that will cross a page boundary, which may
182
+ // trigger an access violation.
183
+ // * Surface Rust considers any kind of out-of-bounds read as undefined
184
+ // behaviour. To dodge this, memory access operations are written
185
+ // using inline assembly.
186
+
176
187
/// Returns the number of bytes that precede the first zero byte of the
/// C string starting at `s` (the terminator itself is not counted).
///
/// The first four bytes are checked one at a time so that short strings never
/// pay for the vector set-up. After that the pointer is rounded down to a
/// 16-byte boundary and the string is scanned one 16-byte chunk at a time.
///
/// # Safety
///
/// `s` must point to a readable, zero-terminated sequence of bytes. The
/// chunked scan reads whole aligned 16-byte chunks and may therefore touch
/// bytes before the start and after the terminator of the string; an aligned
/// 16-byte read never crosses a page boundary, and the loads are performed
/// through inline assembly so that they are not out-of-bounds reads from the
/// point of view of surface Rust.
#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    /// Loads the 16-byte-aligned chunk at `chunk` with `movdqa`.
    #[inline(always)]
    unsafe fn load_aligned(chunk: *const __m128i) -> __m128i {
        let r: __m128i;
        asm!(
            "movdqa ({addr}), {dest}",
            addr = in(reg) chunk,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    }

    let mut n: usize = 0;

    // The use of _mm_movemask_epi8 and company allows for speedups,
    // but they aren't cheap by themselves. Thus, possibly small strings
    // are handled in a simple loop.
    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16
    // byte boundary. The shaved-off bits are used to correct the first
    // iteration.
    let align = s as usize & 15;
    let mut chunk = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

    // First chunk: it may start before the current position, so the compare
    // results belonging to those leading bytes are shifted out.
    let first = load_aligned(chunk);
    let cmp = (_mm_movemask_epi8(_mm_cmpeq_epi8(first, zero)) as u32) >> align;
    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }

    // Only `16 - align` bytes of the first chunk belonged to the string.
    n += 16 - align;
    chunk = chunk.add(1);

    // Remaining chunks are fully part of the scan; bit `i` of the mask is set
    // when byte `i` of the chunk is zero.
    loop {
        let x = load_aligned(chunk);
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp != 0 {
            return n + cmp.trailing_zeros() as usize;
        }

        n += 16;
        chunk = chunk.add(1);
    }
}
259
253
260
254
// Provided for scenarios like kernel development, where SSE might not
@@ -263,11 +257,52 @@ pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
263
257
/// Returns the number of bytes that precede the first zero byte of the
/// C string starting at `s` (the terminator itself is not counted).
///
/// Bytes are inspected individually until the pointer reaches an eight-byte
/// boundary; from there on the string is scanned eight bytes at a time.
///
/// # Safety
///
/// `s` must point to a readable, zero-terminated sequence of bytes. The
/// word-wise scan reads whole aligned eight-byte words and may therefore
/// touch bytes located after the terminator.
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    // Constants of the "does this word contain a zero byte" test from
    // https://graphics.stanford.edu/~seander/bithacks.html
    const LOW_BITS: u64 = 0x0101010101010101;
    const HIGH_BITS: u64 = 0x8080808080808080;

    let mut len: usize = 0;

    // Byte-wise prologue: stop at a terminator or once `s` is aligned to
    // an eight-byte boundary, whichever comes first.
    while s as usize % 8 != 0 {
        if *s == 0 {
            return len;
        }

        len += 1;
        s = s.add(1);
    }

    // Word-wise scan over aligned eight-byte words.
    let mut word_ptr = s.cast::<u64>();

    loop {
        // The load is issued through inline assembly rather than a plain
        // pointer dereference.
        let word: u64;
        asm!(
            "mov ({addr}), {dest}",
            addr = in(reg) word_ptr,
            dest = out(reg) word,
            options(att_syntax, nostack),
        );

        // Non-zero exactly when at least one byte of `word` is zero. The
        // lowest flagged byte is the first zero byte in memory order, since
        // the word was loaded little-endian.
        let zero_mask = word.wrapping_sub(LOW_BITS) & !word & HIGH_BITS;
        if zero_mask != 0 {
            return len + (zero_mask.trailing_zeros() / 8) as usize;
        }

        len += 8;
        word_ptr = word_ptr.add(1);
    }
}
272
307
273
308
/// Determine optimal parameters for a `rep` instruction.
0 commit comments