
Commit 1df0d1c

Final version.
1 parent afa3d3e

File tree

1 file changed: +108 −73 lines changed

src/mem/x86_64.rs

Lines changed: 108 additions & 73 deletions
@@ -173,88 +173,82 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
     c16(a.cast(), b.cast(), n)
 }
 
+// In order to process more than one byte simultaneously when executing strlen,
+// two things must be considered:
+// * An n-byte read with an n-byte aligned address will never cross
+//   a page boundary and will always succeed. Any smaller alignment
+//   may result in a read that will cross a page boundary, which may
+//   trigger an access violation.
+// * Surface Rust considers any kind of out-of-bounds read as undefined
+//   behaviour. To dodge this, memory access operations are written
+//   using inline assembly.
+
 #[cfg(target_feature = "sse2")]
 #[inline(always)]
 pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
-    let mut n: usize;
-
-    asm!(
-        // For small sizes, we avoid invoking SSE instructions
-        // and make manual comparisons instead.
-        "xor %eax, %eax",
-        "cmpb $0, (%rdi)",
-        "je 3f",
-        "mov $1, %eax",
-        "cmpb $0, 1(%rdi)",
-        "je 3f",
-        "mov $2, %eax",
-        "cmpb $0, 2(%rdi)",
-        "je 3f",
-        "mov $3, %eax",
-        "cmpb $0, 3(%rdi)",
-        "je 3f",
-
-        // Adjust address
-        "add $4, %rdi",
+    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};
 
-        // Align the address to 16 bytes (xmm register size).
-        // This is important, since an n-byte read
-        // with n-byte alignment is guaranteed to never cross
-        // a page boundary and thus will never try to access
-        // memory which may not be accessible.
-        "mov %edi, %ecx",
-        "and $15, %ecx",
-        "and $-16, %rdi",
+    let mut n = 0;
+
+    // The use of _mm_movemask_epi8 and company allows for speedups,
+    // but they aren't cheap by themselves. Thus, possibly small strings
+    // are handled in simple loops.
 
-        // Zero out an xmm register for comparisons with zero.
-        "pxor %xmm0, %xmm0",
+    for _ in 0..4 {
+        if *s == 0 {
+            return n;
+        }
 
-        // One manual iteration of a zero byte search.
-        // Ensuring proper alignment may cause us to read
-        // memory _before_ the actual string start.
-        // Thus, one separate iteration is needed to handle this special case.
-        "movdqa (%rdi), %xmm1",
-        "pcmpeqb %xmm0, %xmm1",
-        "pmovmskb %xmm1, %eax",
-        // Shift out comparisons that don't belong to the actual string.
-        "shr %cl, %eax",
-        // Check if there was a zero.
-        "test %eax, %eax",
-        "jz 1f",
+        n += 1;
+        s = s.add(1);
+    }
+
+    // Shave off the least significant bits to align the address to a 16-byte
+    // boundary. The shaved-off bits are used to correct the first iteration.
 
-        // A zero was found: calculate result and exit.
-        "bsf %eax, %eax",
-        "add $4, %eax",
-        "jmp 3f",
+    let align = s as usize & 15;
+    let mut s = ((s as usize) - align) as *const __m128i;
+    let zero = _mm_set1_epi8(0);
 
-        // No zero was found: prepare main loop.
-        "1:",
-        "add $16, %rdi",
-        "neg %rcx",
-        "add $4, %rcx",
+    let x = {
+        let r;
+        asm!(
+            "movdqa ({addr}), {dest}",
+            addr = in(reg) s,
+            dest = out(xmm_reg) r,
+            options(att_syntax, nostack),
+        );
+        r
+    };
+    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;
 
-        // Main loop.
-        "2:",
-        "movdqa (%rdi), %xmm1",
-        "add $16, %rdi",
-        "add $16, %rcx",
-        "pcmpeqb %xmm0, %xmm1",
-        "pmovmskb %xmm1, %eax",
-        // Check if there was a zero.
-        "test %eax, %eax",
-        "jz 2b",
+    if cmp != 0 {
+        return n + cmp.trailing_zeros() as usize;
+    }
 
-        // A zero was found: calculate result and exit.
-        "bsf %eax, %eax",
-        "add %rcx, %rax",
-        "3:",
-        inout("rdi") s => _,
-        out("rax") n,
-        out("rcx") _,
-        options(att_syntax, nostack),
-    );
+    n += 16 - align;
+    s = s.add(1);
 
-    n
+    loop {
+        let x = {
+            let r;
+            asm!(
+                "movdqa ({addr}), {dest}",
+                addr = in(reg) s,
+                dest = out(xmm_reg) r,
+                options(att_syntax, nostack),
+            );
+            r
+        };
+        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
+        if cmp == 0 {
+            n += 16;
+            s = s.add(1);
+        } else {
+            return n + cmp.trailing_zeros() as usize;
+        }
+    }
 }
 
 // Provided for scenarios like kernel development, where SSE might not
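
The new SSE2 path combines two tricks: _mm_cmpeq_epi8 plus _mm_movemask_epi8 turn sixteen byte comparisons into one bit mask (bit i is set iff byte i is zero), and the `>> align` shift discards mask bits belonging to bytes read from before the string start, since the aligned movdqa load may begin up to 15 bytes early. A minimal standalone sketch of both ideas, assuming an x86_64 target with SSE2; the Aligned wrapper and first_zero helper are illustrative names, not part of the commit:

use core::arch::x86_64::{_mm_cmpeq_epi8, _mm_load_si128, _mm_movemask_epi8, _mm_set1_epi8};

// 16-byte alignment makes the movdqa-style aligned load below valid.
#[repr(align(16))]
struct Aligned([u8; 16]);

/// Offset of the first zero byte, measured from the string start at
/// block[align..], if the block contains one.
fn first_zero(block: &Aligned, align: usize) -> Option<usize> {
    unsafe {
        // Aligned 16-byte load, the intrinsic counterpart of movdqa.
        let x = _mm_load_si128(block.0.as_ptr().cast());
        // Bytes equal to zero become 0xFF, all others 0x00.
        let eq = _mm_cmpeq_epi8(x, _mm_set1_epi8(0));
        // Pack the per-byte results into bits 0..16, then shift out the
        // `align` bytes that precede the string start.
        let mask = (_mm_movemask_epi8(eq) as u32) >> align;
        (mask != 0).then(|| mask.trailing_zeros() as usize)
    }
}

fn main() {
    // The string starts 4 bytes into the block; its terminator sits at
    // block offset 9, i.e. 5 bytes past the string start.
    let mut b = Aligned([b'x'; 16]);
    b.0[9] = 0;
    assert_eq!(first_zero(&b, 4), Some(5));

    // A zero byte _before_ the string start is shifted out and ignored.
    let mut c = Aligned([b'x'; 16]);
    c.0[2] = 0;
    assert_eq!(first_zero(&c, 4), None);
}

The commit reaches for asm! instead of a plain load because its reads may extend past the end of the string's allocation, which surface Rust treats as undefined behaviour; the sketch sidesteps that by reading only a fully initialized block.
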
@@ -263,11 +257,52 @@ pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
 #[inline(always)]
 pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
     let mut n = 0;
-    while *s != 0 {
+
+    // Check bytes in steps of one until
+    // either a zero byte is discovered or
+    // the pointer is aligned to an eight-byte boundary.
+
+    while s as usize & 7 != 0 {
+        if *s == 0 {
+            return n;
+        }
+
         n += 1;
         s = s.add(1);
     }
-    n
+
+    // Check bytes in steps of eight until a zero
+    // byte is discovered.
+
+    let mut s = s as *const u64;
+
+    loop {
+        let mut cs = {
+            let r: u64;
+            asm!(
+                "mov ({addr}), {dest}",
+                addr = in(reg) s,
+                dest = out(reg) r,
+                options(att_syntax, nostack),
+            );
+            r
+        };
+        // Detect if a word has a zero byte, taken from
+        // https://graphics.stanford.edu/~seander/bithacks.html
+        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
+            loop {
+                if cs & 255 == 0 {
+                    return n;
+                } else {
+                    cs >>= 8;
+                    n += 1;
+                }
+            }
+        } else {
+            n += 8;
+            s = s.add(1);
+        }
+    }
 }
 
 /// Determine optimal parameters for a `rep` instruction.
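
The zero-byte test in the new fallback is the classic haszero trick from the cited bit-twiddling page: in v.wrapping_sub(0x01…01) & !v & 0x80…80, subtracting 1 from a zero byte borrows through its high bit, while the !v factor filters out bytes whose high bit was already set, so the result is nonzero exactly when some byte of v is zero. A small self-check of the identity (the helper name is ours, for illustration):

/// True iff some byte of `v` is zero; the same identity the fallback uses.
fn has_zero_byte(v: u64) -> bool {
    v.wrapping_sub(0x0101_0101_0101_0101) & !v & 0x8080_8080_8080_8080 != 0
}

fn main() {
    // No zero byte anywhere.
    assert!(!has_zero_byte(u64::from_le_bytes(*b"abcdefgh")));
    // A NUL terminator in the middle is detected.
    assert!(has_zero_byte(u64::from_le_bytes(*b"abc\0defg")));
    // Near-miss byte values (0x01, 0x80) produce no false positive.
    assert!(!has_zero_byte(0x8001_8001_8001_8001));
}

Note that the byte-at-a-time scan after a hit (cs >>= 8, testing cs & 255) walks bytes in little-endian order, which matches memory order on x86_64.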

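
Both implementations share the same overall structure: a byte-wise prologue up to an alignment boundary, then a word- or vector-sized main loop. A safe-Rust model of the fallback's word-at-a-time scan (ours, operating on a slice instead of a raw C pointer, so it returns the slice length when no terminator exists) is convenient for unit-testing that logic without inline assembly:

use std::convert::TryInto;

const LO: u64 = 0x0101_0101_0101_0101;
const HI: u64 = 0x8080_8080_8080_8080;

// Model of the fallback scan: whole u64 words first, then the hit word
// (or the short tail) byte by byte.
fn strlen_words(s: &[u8]) -> usize {
    let mut n = 0;
    for chunk in s.chunks_exact(8) {
        let mut w = u64::from_le_bytes(chunk.try_into().unwrap());
        if w.wrapping_sub(LO) & !w & HI != 0 {
            // Some byte of this word is zero: locate it byte by byte.
            while w & 0xFF != 0 {
                w >>= 8;
                n += 1;
            }
            return n;
        }
        n += 8;
    }
    // Tail shorter than a word: plain byte scan.
    s[n..].iter().position(|&b| b == 0).map_or(s.len(), |i| n + i)
}

fn main() {
    assert_eq!(strlen_words(b"hello, world\0junk"), 12);
    assert_eq!(strlen_words(b"exactly8\0"), 8);
    assert_eq!(strlen_words(b"short\0"), 5);
}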