@@ -173,6 +173,136 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
173
173
c16 ( a. cast ( ) , b. cast ( ) , n)
174
174
}
175
175
176
+ // In order to process more than one byte simultaneously when executing strlen,
177
+ // two things must be considered:
178
+ // * An n byte read with an n-byte aligned address will never cross
179
+ // a page boundary and will always succeed. Any smaller alignment
180
+ // may result in a read that will cross a page boundary, which may
181
+ // trigger an access violation.
182
+ // * Surface Rust considers any kind of out-of-bounds read as undefined
183
+ // behaviour. To dodge this, memory access operations are written
184
+ // using inline assembly.
185
+
186
/// Returns the number of bytes that precede the first zero byte of the C string at `s`.
///
/// The first four bytes are probed one at a time so that very short strings
/// never pay for the vector setup. After that the pointer is rounded down to a
/// 16-byte boundary and the string is scanned one aligned 16-byte chunk at a
/// time with SSE2 compares.
///
/// # Safety
///
/// `s` must point to a readable, zero-terminated byte sequence. The chunked scan
/// loads whole 16-byte aligned chunks, so bytes in front of `s` and behind the
/// terminator that share a chunk with the string are loaded as well (through
/// inline assembly) but never contribute to the result.
#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    // Aligned 16-byte load, done in assembly so that the compiler cannot
    // reason about the bytes read outside of the string.
    #[inline(always)]
    unsafe fn load_chunk(chunk: *const __m128i) -> __m128i {
        let value;
        asm!(
            "movdqa ({addr}), {dest}",
            addr = in(reg) chunk,
            dest = out(xmm_reg) value,
            options(att_syntax, nostack),
        );
        value
    }

    // Scalar prefix: strings shorter than four bytes return from here.
    let mut len: usize = 0;
    while len < 4 {
        if *s == 0 {
            return len;
        }
        s = s.add(1);
        len += 1;
    }

    // Round the address down to a 16-byte boundary. `misalign` is the number
    // of bytes in the first chunk that lie in front of `s`.
    let misalign = s as usize & 15;
    let mut chunk = (s as usize - misalign) as *const __m128i;
    let zeros = _mm_set1_epi8(0);

    // First chunk: shift the match mask so that matches in front of `s` are
    // dropped and bit 0 corresponds to `s` itself.
    let head = _mm_movemask_epi8(_mm_cmpeq_epi8(load_chunk(chunk), zeros)) >> misalign;
    if head != 0 {
        return len + head.trailing_zeros() as usize;
    }
    len += 16 - misalign;

    // Every following chunk is fully part of the string until a zero byte shows up.
    loop {
        chunk = chunk.add(1);
        let hits = _mm_movemask_epi8(_mm_cmpeq_epi8(load_chunk(chunk), zeros)) as u32;
        if hits != 0 {
            return len + hits.trailing_zeros() as usize;
        }
        len += 16;
    }
}
252
+
253
/// Returns the number of bytes that precede the first zero byte of the C string at `s`.
///
/// Fallback for builds without SSE2, provided for scenarios like kernel
/// development where SSE might not be available. Bytes are checked one at a
/// time until the pointer is aligned to eight bytes; from there the string is
/// scanned one aligned 64-bit word at a time.
///
/// # Safety
///
/// `s` must point to a readable, zero-terminated byte sequence. The word-wise
/// scan loads whole 8-byte aligned words, so bytes behind the terminator that
/// share a word with it are loaded as well (through inline assembly) but never
/// contribute to the result.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    // Constants of the "word has a zero byte" test from
    // https://graphics.stanford.edu/~seander/bithacks.html
    const LOW_BITS: u64 = 0x0101010101010101;
    const HIGH_BITS: u64 = 0x8080808080808080;

    let mut n = 0;

    // Check bytes in steps of one until either a zero byte is discovered or
    // the pointer is aligned to an eight byte boundary.
    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero byte is discovered.
    let mut s = s as *const u64;

    loop {
        // Aligned 8-byte load, done in assembly so that the compiler cannot
        // reason about the bytes read behind the terminator.
        let cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };

        // Bit 7 of a byte is set in `zero_mask` if that byte of `cs` is zero.
        // Bytes above the first zero byte may be flagged spuriously (the
        // subtraction borrows into them), but bytes below it never are, so the
        // lowest set bit always belongs to the first zero byte. x86 is
        // little-endian, hence the lowest byte of the word is the first byte
        // in memory and `trailing_zeros() / 8` is its offset within the word.
        let zero_mask = cs.wrapping_sub(LOW_BITS) & !cs & HIGH_BITS;
        if zero_mask != 0 {
            return n + (zero_mask.trailing_zeros() / 8) as usize;
        }

        n += 8;
        s = s.add(1);
    }
}
305
+
176
306
/// Determine optimal parameters for a `rep` instruction.
177
307
fn rep_param ( dest : * mut u8 , mut count : usize ) -> ( usize , usize , usize ) {
178
308
// Unaligned writes are still slow on modern processors, so align the destination address.
0 commit comments