Skip to content

Commit b788cf3

Browse files
authored
Merge pull request #516 from TDecki/master
2 parents 72c0856 + 6488b26 commit b788cf3

File tree

3 files changed

+141
-7
lines changed

3 files changed

+141
-7
lines changed

src/mem/impls.rs

+10
Original file line numberDiff line numberDiff line change
@@ -279,3 +279,13 @@ pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
279279
}
280280
0
281281
}
282+
283+
#[inline(always)]
pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
    // Advance a cursor to the NUL terminator, then report the distance
    // travelled from the start of the string.
    let mut end = s;
    while *end != 0 {
        end = end.add(1);
    }
    end.offset_from(s) as usize
}

src/mem/mod.rs

+1-7
Original file line numberDiff line numberDiff line change
intrinsics! {
    #[mem_builtin]
    #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")]
    // C `strlen`: number of bytes in the NUL-terminated string `s`, not
    // counting the terminator. Delegates to the shared `c_string_length`
    // helper instead of an open-coded loop.
    // NOTE(review): presumably `impls` resolves to an architecture-specific
    // module (e.g. the SSE2 x86_64 variant) when available — confirm against
    // the module wiring in this file's imports.
    pub unsafe extern "C" fn strlen(s: *const core::ffi::c_char) -> usize {
        impls::c_string_length(s)
    }
}
7569

src/mem/x86_64.rs

+130
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,136 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
173173
c16(a.cast(), b.cast(), n)
174174
}
175175

176+
// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n byte read with an n-byte aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that will cross a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.

#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    let mut n = 0;

    // The use of _mm_movemask_epi8 and company allow for speedups,
    // but they aren't cheap by themselves. Thus, possibly small strings
    // are handled in simple loops.

    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16
    // byte boundary. The shaved-off bits are used to correct the first iteration.

    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

    // Aligned 16-byte load via inline asm. The read may cover up to `align`
    // bytes *before* the current cursor — in-bounds at machine level (same
    // aligned chunk, so it cannot cross a page), but something surface Rust
    // would treat as UB, hence the asm.
    let x = {
        let r;
        asm!(
            "movdqa ({addr}), {dest}",
            addr = in(reg) s,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    };
    // One mask bit per byte flags the zero bytes; shift out the `align`
    // bytes that precede the string cursor so they can't match.
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;

    if cmp != 0 {
        // Index of the first NUL within the (corrected) chunk.
        return n + cmp.trailing_zeros() as usize;
    }

    // No NUL in the first partial chunk: account for the bytes it covered.
    n += 16 - align;
    s = s.add(1);

    // Main loop: scan 16 aligned bytes at a time until a chunk contains a NUL.
    loop {
        let x = {
            let r;
            asm!(
                "movdqa ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(xmm_reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            // trailing_zeros gives the offset of the first NUL byte in the chunk.
            return n + cmp.trailing_zeros() as usize;
        }
    }
}
252+
253+
// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    let mut n = 0;

    // Check bytes in steps of one until
    // either a zero byte is discovered or
    // pointer is aligned to an eight byte boundary.

    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero
    // byte is discovered.

    let mut s = s as *const u64;

    loop {
        // Aligned 8-byte load via inline asm: the word may extend past the
        // NUL terminator, which would be an out-of-bounds read in surface
        // Rust, but an aligned read cannot cross a page boundary.
        let mut cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            // A NUL is somewhere in this word; scan byte by byte. On x86-64
            // (little-endian) the low byte of `cs` is the earliest string byte.
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}
305+
176306
/// Determine optimal parameters for a `rep` instruction.
177307
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
178308
// Unaligned writes are still slow on modern processors, so align the destination address.

0 commit comments

Comments
 (0)