Specialize strlen for x86_64 #516
Conversation
src/mem/x86_64.rs (outdated)

```rust
    asm!(
        // search for a zero byte
        "xor al, al",
```
`xor eax, eax` avoids a potential partial register stall and is 1 byte shorter, I believe.
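To illustrate the point, a minimal, self-contained sketch (the function and its name are hypothetical, not part of the PR):

```rust
use std::arch::asm;

// `xor eax, eax` is a recognized zeroing idiom: it writes the full register
// (the upper 32 bits of RAX are cleared implicitly) and breaks any dependency
// on the old value, whereas `xor al, al` writes only the low byte and can
// leave a partial-register dependency on some microarchitectures.
fn zero_rax() -> u64 {
    let out: u64;
    unsafe {
        asm!("xor eax, eax", out("rax") out, options(nomem, nostack));
    }
    out
}

fn main() {
    assert_eq!(zero_rax(), 0);
}
```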
src/mem/x86_64.rs (outdated)

```rust
        "xor al, al",

        // unbounded memory region
        "xor rcx, rcx",
```
`xor ecx, ecx` has the same effect and saves a REX prefix.
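For reference, a small sketch of the encoding difference (the byte sequences in the comments are the standard x86-64 encodings; the function is hypothetical):

```rust
use std::arch::asm;

// Both instructions clear RCX, but the 32-bit form needs no REX.W prefix:
//   48 31 c9    xor rcx, rcx
//   31 c9       xor ecx, ecx   (writing ECX zero-extends into RCX)
fn zero_rcx() -> u64 {
    let out: u64;
    unsafe {
        asm!("xor ecx, ecx", out("rcx") out, options(nomem, nostack));
    }
    out
}

fn main() {
    assert_eq!(zero_rcx(), 0);
}
```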
src/mem/x86_64.rs
Outdated
"xor rcx, rcx", | ||
"not rcx", | ||
|
||
// forward direction |
I believe the direction flag is already guaranteed to be set to the forward direction due to ABI requirements.
Have you profiled this to confirm that it is indeed faster than the generic version? I was under the impression that the x86 string instructions tend to have relatively poor performance due to being microcoded.
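For context, a minimal sketch of what a `repne scasb`-based strlen looks like, pieced together from the fragments quoted above (the function name and exact register handling are assumptions, not the PR's code):

```rust
use std::arch::asm;

unsafe fn strlen_scasb(s: *const u8) -> usize {
    let len: usize;
    asm!(
        "xor eax, eax",  // AL = 0: the byte to search for
        "xor ecx, ecx",
        "not rcx",       // RCX = usize::MAX: effectively unbounded scan length
        // The direction flag is already clear (forward) per the ABI, so no `cld`.
        "repne scasb",   // advance RDI until the byte equal to AL is found
        "not rcx",
        "dec rcx",       // RCX now holds the number of bytes before the NUL
        inout("rdi") s => _,
        out("rcx") len,
        out("eax") _,
        options(nostack),
    );
    len
}

fn main() {
    let s = b"hello\0";
    assert_eq!(unsafe { strlen_scasb(s.as_ptr()) }, 5);
}
```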
Well... I thought I had benchmarked it when I submitted this PR. Turns out my benchmarking routine had a bug. I reran the benchmark and uploaded the results as well as the program used. I'll rewrite this PR; please be patient.
Is it really necessary to implement this in assembly? I feel that an implementation that used the SSE intrinsics would be much more readable and easier to maintain.
The issue lies with the memory accesses: the aligned 16-byte loads can read past the ends of the string, which is fine at the hardware level but not something a plain intrinsics load is allowed to do as far as the compiler is concerned. I did find a possible compromise in the snippet below, which uses assembly only for the memory accesses:
```rust
pub unsafe extern "C" fn strlen(mut s: *const std::ffi::c_char) -> usize {
    use std::arch::x86_64::*;
    use std::arch::asm;
    let mut n = 0;
    // Handle the first few bytes one at a time; very short strings return here
    // without touching SSE.
    for _ in 0..4 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }
    // Round the pointer down to a 16-byte boundary. The aligned load may read
    // bytes before `s`, but they stay within the same aligned block (and page)
    // and are masked off by the shift below.
    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);
    let x = {
        let r;
        asm!("movdqa {dest}, [{addr}]", addr = in(reg) s, dest = out(xmm_reg) r);
        r
    };
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;
    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }
    n += 16 - align;
    s = s.add(1);
    // Scan 16 aligned bytes at a time until a zero byte is found.
    loop {
        let x = {
            let r;
            asm!("movdqa {dest}, [{addr}]", addr = in(reg) s, dest = out(xmm_reg) r);
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            return n + cmp.trailing_zeros() as usize;
        }
    }
}
```
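For reference, a hypothetical way to exercise the snippet above (the test string is just an example):

```rust
fn main() {
    let s = b"hello, world\0";
    let len = unsafe { strlen(s.as_ptr() as *const std::ffi::c_char) };
    assert_eq!(len, 12);
}
```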
I think this version using assembly just for the accesses is fine and definitely more readable. Have you benchmarked it?
Yes. Benchmarking was done using this:

```rust
use criterion::*;

#[inline(never)]
pub unsafe extern "C" fn strlen_naive(mut s: *const std::ffi::c_char) -> usize {
    let mut n = 0;
    while *s != 0 {
        n += 1;
        s = s.add(1);
    }
    n
}

#[inline(never)]
pub unsafe extern "C" fn strlen_kernel(mut s: *const std::ffi::c_char) -> usize {
    use std::arch::asm;
    let mut n = 0;
    // Advance byte by byte until the pointer is 8-byte aligned.
    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }
    let mut s = s as *const u64;
    loop {
        let mut cs = {
            let r: u64;
            asm!("mov {dest}, [{addr}]", addr = in(reg) s, dest = out(reg) r);
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
        // (the expression is nonzero exactly when some byte of cs is zero)
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}

pub unsafe extern "C" fn strlen_sse(mut s: *const std::ffi::c_char) -> usize {
    use std::arch::x86_64::*;
    use std::arch::asm;
    let mut n = 0;
    for _ in 0..4 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }
    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);
    let x = {
        let r;
        asm!("movdqa {dest}, [{addr}]", addr = in(reg) s, dest = out(xmm_reg) r);
        r
    };
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;
    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }
    n += 16 - align;
    s = s.add(1);
    loop {
        let x = {
            let r;
            asm!("movdqa {dest}, [{addr}]", addr = in(reg) s, dest = out(xmm_reg) r);
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            return n + cmp.trailing_zeros() as usize;
        }
    }
}

fn bench_strlen(c: &mut Criterion, len: usize) {
    let mut v = vec![1i8; len];
    v[len - 1] = 0;
    let mut group = c.benchmark_group(format!("strlen, length {}", len));
    group.bench_function("strlen_naive", |b| b.iter(|| {
        black_box(&mut v);
        let r = unsafe {
            strlen_naive(v.as_ptr())
        };
        assert_eq!(r, len - 1);
    }));
    group.bench_function("strlen_kernel", |b| b.iter(|| {
        black_box(&mut v);
        let r = unsafe {
            strlen_kernel(v.as_ptr())
        };
        assert_eq!(r, len - 1);
    }));
    group.bench_function("strlen_sse", |b| b.iter(|| {
        black_box(&mut v);
        let r = unsafe {
            strlen_sse(v.as_ptr())
        };
        assert_eq!(r, len - 1);
    }));
}

fn bench_strlen_1(c: &mut Criterion) {
    bench_strlen(c, 1)
}
fn bench_strlen_7(c: &mut Criterion) {
    bench_strlen(c, 7)
}
fn bench_strlen_15(c: &mut Criterion) {
    bench_strlen(c, 15)
}
fn bench_strlen_300(c: &mut Criterion) {
    bench_strlen(c, 300)
}
fn bench_strlen_2048(c: &mut Criterion) {
    bench_strlen(c, 2048)
}
fn bench_strlen_10_000(c: &mut Criterion) {
    bench_strlen(c, 10_000)
}
fn bench_strlen_50_000(c: &mut Criterion) {
    bench_strlen(c, 50_000)
}
fn bench_strlen_100_000(c: &mut Criterion) {
    bench_strlen(c, 100_000)
}
fn bench_strlen_1_000_000(c: &mut Criterion) {
    bench_strlen(c, 1_000_000)
}

criterion_group! { bench_strlen_group,
    bench_strlen_1,
    bench_strlen_7,
    bench_strlen_15,
    bench_strlen_300,
    bench_strlen_2048,
    bench_strlen_10_000,
    bench_strlen_50_000,
    bench_strlen_100_000,
    bench_strlen_1_000_000,
}

criterion_main!(bench_strlen_group);
```

The results are the following:
Perfect! I'll merge this once you update the PR to use assembly only for the memory accesses.
@Amanieu The PR is ready. |