Specialize strlen for x86_64. #516


Merged
merged 11 commits on Mar 12, 2023
10 changes: 10 additions & 0 deletions src/mem/impls.rs
@@ -279,3 +279,13 @@ pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
}
0
}

#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
let mut n = 0;
while *s != 0 {
n += 1;
s = s.add(1);
}
n
}
8 changes: 1 addition & 7 deletions src/mem/mod.rs
@@ -63,13 +63,7 @@ intrinsics! {
#[mem_builtin]
#[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")]
pub unsafe extern "C" fn strlen(s: *const core::ffi::c_char) -> usize {
-let mut n = 0;
-let mut s = s;
-while *s != 0 {
-n += 1;
-s = s.offset(1);
-}
-n
+impls::c_string_length(s)
}
}
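With this change the weak strlen symbol just forwards to impls::c_string_length (or to the x86_64 specialization below). Purely as an illustration of how the symbol ends up being exercised, a caller declares the usual C prototype; on targets that link a libc this resolves to the system strlen instead, with the same result. The example program is assumed, not part of the PR:

extern "C" {
    fn strlen(s: *const core::ffi::c_char) -> usize;
}

fn main() {
    let msg = b"compiler-builtins\0";
    // 17 bytes precede the terminating NUL.
    let n = unsafe { strlen(msg.as_ptr() as *const core::ffi::c_char) };
    assert_eq!(n, 17);
}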

83 changes: 83 additions & 0 deletions src/mem/x86_64.rs
@@ -173,6 +173,89 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
c16(a.cast(), b.cast(), n)
}

#[inline(always)]
pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
let mut n: usize;

asm!(
// For small sizes, we avoid invoking SSE instructions
// and make manual comparisons instead.
"xor %eax, %eax",
"cmpb $0, (%rdi)",
"je 3f",
"mov $1, %eax",
"cmpb $0, 1(%rdi)",
"je 3f",
"mov $2, %eax",
"cmpb $0, 2(%rdi)",
"je 3f",
"mov $3, %eax",
"cmpb $0, 3(%rdi)",
"je 3f",

// Adjust address
"add $4, %rdi",

// Align the address to 16 bytes (xmm register size).
// This is important, since an n-byte read
// with n-byte alignment is guaranteed never to cross
// a page boundary and thus will never try to access
// memory which may not be accessible.
"mov %edi, %ecx",
"and $15, %ecx",
"and $-16, %rdi",

// zero out an xmm register for comparisons with zero.
"pxor %xmm0, %xmm0",

// One manual iteration of a zero byte search.
// Ensuring proper alignment may cause us to read
// memory _before_ the actual string start.
// Thus, one separate iteration is needed to handle this special case.
"movdqa (%rdi), %xmm1",
"pcmpeqb %xmm0, %xmm1",
"pmovmskb %xmm1, %eax",
// Shift out comparisons that don't belong to the actual string.
"shr %cl, %eax",
// Check if there was a zero
"test %eax, %eax",
"jz 1f",

// A zero was found: calculate result and exit.
"bsf %eax, %eax",
"add $4, %eax",
"jmp 3f",

// No zero was found: prepare main loop.
"1:",
"add $16, %rdi",
"neg %rcx",
"add $4, %rcx",

// main loop
"2:",
"movdqa (%rdi), %xmm1",
"add $16, %rdi",
"add $16, %rcx",
"pcmpeqb %xmm0, %xmm1",
"pmovmskb %xmm1, %eax",
// Check if there was a zero
"test %eax, %eax",
"jz 2b",

// A zero was found: calculate result and exit.
"bsf %eax, %eax",
"add %rcx, %rax",
"3:",
inout("rdi") s => _,
out("rax") n,
out("rcx") _,
options(att_syntax, nostack),
);

n
}
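In short, the routine checks the first four bytes one at a time, aligns the pointer down to 16 bytes, does one masked SSE compare to discard matches that precede the string, and then scans 16 aligned bytes per iteration. For illustration only, here is roughly the same search written with core::arch SSE2 intrinsics. It is a sketch under assumptions, not a drop-in replacement: the PR uses inline asm precisely because the aligned loads can read bytes before the string start and past the NUL, which ordinary Rust loads are not allowed to do, and the function name below is made up for this example.

#[cfg(target_arch = "x86_64")]
unsafe fn c_string_length_sse2_sketch(s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::*;

    let mut p = s as *const u8;
    // Check the first four bytes without SSE, mirroring the asm prologue.
    for i in 0..4 {
        if *p.add(i) == 0 {
            return i;
        }
    }
    p = p.add(4);

    // Align down to 16 bytes; `misalign` counts the bytes before the
    // string start that the first aligned load will also pick up.
    let misalign = p as usize & 15;
    p = p.sub(misalign);
    let zero = _mm_setzero_si128();

    // First aligned block: shift out matches that precede the string.
    let block = _mm_load_si128(p as *const __m128i);
    let mask = (_mm_movemask_epi8(_mm_cmpeq_epi8(block, zero)) as u32) >> misalign;
    if mask != 0 {
        return 4 + mask.trailing_zeros() as usize;
    }

    // Main loop: one aligned 16-byte block per iteration.
    let mut offset = 4 + (16 - misalign);
    loop {
        p = p.add(16);
        let block = _mm_load_si128(p as *const __m128i);
        let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(block, zero)) as u32;
        if mask != 0 {
            return offset + mask.trailing_zeros() as usize;
        }
        offset += 16;
    }
}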

/// Determine optimal parameters for a `rep` instruction.
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
// Unaligned writes are still slow on modern processors, so align the destination address.