From 2a67ad74b77dd5c7dbd77a27156176136b0b606d Mon Sep 17 00:00:00 2001
From: Tobias Decking
Date: Tue, 21 Feb 2023 23:13:02 +0100
Subject: [PATCH 01/11] Specialize `strlen` for `x86_64`.

---
 src/mem/impls.rs  | 10 ++++++++++
 src/mem/mod.rs    |  8 +-------
 src/mem/x86_64.rs | 29 +++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/src/mem/impls.rs b/src/mem/impls.rs
index 72003a5c4..23c9d8d32 100644
--- a/src/mem/impls.rs
+++ b/src/mem/impls.rs
@@ -279,3 +279,13 @@ pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
     }
     0
 }
+
+#[inline(always)]
+pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
+    let mut n = 0;
+    while *s != 0 {
+        n += 1;
+        s = s.add(1);
+    }
+    n
+}
diff --git a/src/mem/mod.rs b/src/mem/mod.rs
index c5b0ddc16..be118778b 100644
--- a/src/mem/mod.rs
+++ b/src/mem/mod.rs
@@ -63,13 +63,7 @@ intrinsics! {
     #[mem_builtin]
     #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")]
     pub unsafe extern "C" fn strlen(s: *const core::ffi::c_char) -> usize {
-        let mut n = 0;
-        let mut s = s;
-        while *s != 0 {
-            n += 1;
-            s = s.offset(1);
-        }
-        n
+        impls::c_string_length(s)
     }
 }
 
diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index 17b461f79..ea8f6d819 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -173,6 +173,35 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
     c16(a.cast(), b.cast(), n)
 }
 
+#[inline(always)]
+pub unsafe fn c_string_length(s: *const std::ffi::c_char) -> usize {
+    let mut n: usize;
+
+    std::arch::asm!(
+        // search for a zero byte
+        "xor al, al",
+
+        // unbounded memory region
+        "xor rcx, rcx",
+        "not rcx",
+
+        // forward direction
+        "cld",
+
+        // perform search
+        "repne scasb",
+
+        // extract length
+        "not rcx",
+        "dec rcx",
+        inout("rdi") s => _,
+        out("rcx") n,
+        options(nostack),
+    );
+
+    n
+}
+
 /// Determine optimal parameters for a `rep` instruction.
 fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
     // Unaligned writes are still slow on modern processors, so align the destination address.

From 7711331f0a41355dc03a202c193cef5a7c31be6a Mon Sep 17 00:00:00 2001
From: Tobias Decking
Date: Tue, 21 Feb 2023 23:32:39 +0100
Subject: [PATCH 02/11] Correct path.

---
 src/mem/x86_64.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index ea8f6d819..13e186e64 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -177,7 +177,7 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
 pub unsafe fn c_string_length(s: *const std::ffi::c_char) -> usize {
     let mut n: usize;
 
-    std::arch::asm!(
+    asm!(
         // search for a zero byte
         "xor al, al",
 
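A note on the `repne scasb` bookkeeping above: `al` is zeroed, `rcx` starts at !0 as an unbounded count, and the instruction decrements `rcx` once per byte examined, including the terminating NUL, so `not rcx; dec rcx` leaves exactly the string length. The same arithmetic in plain Rust (an illustrative model with made-up names, not code from this series; it assumes the input contains a NUL):

    // Model of `repne scasb`: rcx starts "unbounded" at !0 and is
    // decremented once per byte examined, including the terminating NUL.
    fn scasb_model(bytes: &[u8]) -> usize {
        let mut rcx: u64 = !0;
        for &b in bytes {
            rcx = rcx.wrapping_sub(1);
            if b == 0 {
                break;
            }
        }
        // `not rcx` counts the examined bytes; `dec rcx` drops the NUL.
        (!rcx - 1) as usize
    }

    fn main() {
        assert_eq!(scasb_model(b"hello\0"), 5);
    }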
From 1fdf932338e9440c1602deb4efc572dc92efabc9 Mon Sep 17 00:00:00 2001
From: Tobias Decking
Date: Tue, 21 Feb 2023 23:36:47 +0100
Subject: [PATCH 03/11] Update path for argument.

---
 src/mem/x86_64.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index 13e186e64..282074a6c 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -174,7 +174,7 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
 }
 
 #[inline(always)]
-pub unsafe fn c_string_length(s: *const std::ffi::c_char) -> usize {
+pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
     let mut n: usize;
 
     asm!(

From 0a0fa0b9fb0bf16776c7eec392f7947c3efe811b Mon Sep 17 00:00:00 2001
From: Tobias Decking
Date: Wed, 22 Feb 2023 00:07:41 +0100
Subject: [PATCH 04/11] Improve assembly quality + AT&T syntax.

---
 src/mem/x86_64.rs | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index 282074a6c..fe93ae7ae 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -179,24 +179,25 @@ pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
 
     asm!(
         // search for a zero byte
-        "xor al, al",
+        "xor %eax, %eax",
 
         // unbounded memory region
-        "xor rcx, rcx",
-        "not rcx",
+        "xor %ecx, %ecx",
+        "not %rcx",
 
         // forward direction
-        "cld",
+        // (already set thanks to abi)
+        //"cld",
 
         // perform search
-        "repne scasb",
+        "repne scasb (%rdi), %al",
 
         // extract length
-        "not rcx",
-        "dec rcx",
+        "not %rcx",
+        "dec %rcx",
         inout("rdi") s => _,
         out("rcx") n,
-        options(nostack),
+        options(att_syntax, nostack),
     );
 
     n

From 1a2f3b21d53555af4c596e48359aeee7c14671d8 Mon Sep 17 00:00:00 2001
From: Tobias Decking
Date: Wed, 22 Feb 2023 00:10:46 +0100
Subject: [PATCH 05/11] Remove superfluous comment.

---
 src/mem/x86_64.rs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index fe93ae7ae..5752005a4 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -185,10 +185,6 @@ pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
         "xor %ecx, %ecx",
         "not %rcx",
 
-        // forward direction
-        // (already set thanks to abi)
-        //"cld",
-
         // perform search
         "repne scasb (%rdi), %al",
 
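The rewrite that follows replaces the byte-wise `scasb` search with an SSE2 idiom: `pcmpeqb` compares 16 bytes against zero at once, and `pmovmskb` packs the 16 lane results into an integer mask, so the offset of the first NUL is the position of the mask's lowest set bit. The same idiom written with `core::arch` intrinsics (a sketch that assumes an in-bounds 16-byte read and uses an unaligned load; the patch itself relies on aligned `movdqa` loads):

    #[cfg(target_feature = "sse2")]
    unsafe fn first_zero_in_block(block: *const u8) -> Option<usize> {
        use core::arch::x86_64::{
            __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_setzero_si128,
        };

        // Bytes equal to zero become 0xFF lanes...
        let x = _mm_loadu_si128(block as *const __m128i);
        let eq = _mm_cmpeq_epi8(x, _mm_setzero_si128());
        // ...and each lane's high bit is packed into bit i of `mask`.
        let mask = _mm_movemask_epi8(eq);
        if mask == 0 {
            None
        } else {
            // Bit i set means byte i was zero; the lowest set bit wins.
            Some(mask.trailing_zeros() as usize)
        }
    }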
+ "pxor %xmm0, %xmm0", + + // One manual iteration of a zero byte search. + // Ensuring proper alignment may cause us to read + // memory _before_ the actual string start. + // Thus, one separate iteration is needed to handle this special case. + "movdqa (%rdi), %xmm1", + "pcmpeqb %xmm0, %xmm1", + "pmovmskb %xmm1, %eax", + // Shift out comparisons that don't belong to the actual string. + "shr %cl, %eax", + // Check if there was a zero + "test %eax, %eax", + "jz 1f", + + // A zero was found: calculate result and exit. + "bsf %eax, %eax", + "add $4, %eax", + "jmp 3f", + + // No zero was found: prepare main loop. + "1:", + "add $16, %rdi", + "neg %rcx", + "add $4, %rcx", + + // main loop + "2:", + "movdqa (%rdi), %xmm1", + "add $16, %rdi", + "add $16, %rcx", + "pcmpeqb %xmm0, %xmm1", + "pmovmskb %xmm1, %eax", + // Check if there was a zero + "test %eax, %eax", + "jz 2b", + + // A zero was found: calculate result and exit. + "bsf %eax, %eax", + "add %rcx, %rax", + "3:", inout("rdi") s => _, - out("rcx") n, + out("rax") n, + out("rcx") _, options(att_syntax, nostack), ); From 9c0a19c33d717ea794a8c5452bdf50022ee40820 Mon Sep 17 00:00:00 2001 From: Tobias Decking Date: Wed, 22 Feb 2023 22:16:29 +0100 Subject: [PATCH 07/11] Provide a non-sse version for x86_64. --- src/mem/x86_64.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs index daa92098e..e9003310c 100644 --- a/src/mem/x86_64.rs +++ b/src/mem/x86_64.rs @@ -173,6 +173,7 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { c16(a.cast(), b.cast(), n) } +#[cfg(target_feature="sse2")] #[inline(always)] pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize { let mut n: usize; @@ -256,6 +257,19 @@ pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize { n } +// Provided for scenarios like kernel development, where SSE might not +// be available. +#[cfg(not(target_feature="sse2"))] +#[inline(always)] +pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize { + let mut n = 0; + while *s != 0 { + n += 1; + s = s.add(1); + } + n +} + /// Determine optimal parameters for a `rep` instruction. fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) { // Unaligned writes are still slow on modern processors, so align the destination address. From afa3d3ed3a9050a560aa9a9011f3d4e1d1be0d79 Mon Sep 17 00:00:00 2001 From: Tobias Decking Date: Wed, 22 Feb 2023 22:19:10 +0100 Subject: [PATCH 08/11] Formatting --- src/mem/x86_64.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs index e9003310c..321d59296 100644 --- a/src/mem/x86_64.rs +++ b/src/mem/x86_64.rs @@ -173,7 +173,7 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { c16(a.cast(), b.cast(), n) } -#[cfg(target_feature="sse2")] +#[cfg(target_feature = "sse2")] #[inline(always)] pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize { let mut n: usize; @@ -259,7 +259,7 @@ pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize { // Provided for scenarios like kernel development, where SSE might not // be available. -#[cfg(not(target_feature="sse2"))] +#[cfg(not(target_feature = "sse2"))] #[inline(always)] pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize { let mut n = 0; From 1df0d1c146f57c8767357782d9fffd7e4d0c83aa Mon Sep 17 00:00:00 2001 From: Tobias Decking Date: Mon, 6 Mar 2023 19:20:30 +0100 Subject: [PATCH 09/11] Final version. 
From 1df0d1c146f57c8767357782d9fffd7e4d0c83aa Mon Sep 17 00:00:00 2001
From: Tobias Decking
Date: Mon, 6 Mar 2023 19:20:30 +0100
Subject: [PATCH 09/11] Final version.

---
 src/mem/x86_64.rs | 181 +++++++++++++++++++++++++++-------------------
 1 file changed, 108 insertions(+), 73 deletions(-)

diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index 321d59296..ad6ff9d17 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -173,88 +173,82 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
     c16(a.cast(), b.cast(), n)
 }
 
+
+// In order to process more than one byte simultaneously when executing strlen,
+// two things must be considered:
+// * An n byte read with an n-byte aligned address will never cross
+//   a page boundary and will always succeed. Any smaller alignment
+//   may result in a read that will cross a page boundary, which may
+//   trigger an access violation.
+// * Surface Rust considers any kind of out-of-bounds read as undefined
+//   behaviour. To dodge this, memory access operations are written
+//   using inline assembly.
+
 #[cfg(target_feature = "sse2")]
 #[inline(always)]
 pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
-    let mut n: usize;
-
-    asm!(
-        // For small sizes, we avoid invoking SSE instructions.
-        // make manual comparisons instead.
-        "xor %eax, %eax",
-        "cmpb $0, (%rdi)",
-        "je 3f",
-        "mov $1, %eax",
-        "cmpb $0, 1(%rdi)",
-        "je 3f",
-        "mov $2, %eax",
-        "cmpb $0, 2(%rdi)",
-        "je 3f",
-        "mov $3, %eax",
-        "cmpb $0, 3(%rdi)",
-        "je 3f",
-
-        // Adjust address
-        "add $4, %rdi",
+    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};
 
-        // Align the address to 16 bytes (xmm register size).
-        // This is important, since an n byte read
-        // with n byte alignment is guranteed to never cross
-        // a page boundary and thus will never try to access
-        // memory which may not be accessible.
-        "mov %edi, %ecx",
-        "and $15, %ecx",
-        "and $-16, %rdi",
+    let mut n = 0;
+    
+    // The use of _mm_movemask_epi8 and company allows for speedups,
+    // but they aren't cheap by themselves. Thus, possibly small strings
+    // are handled in simple loops.
 
-        // zero out an xmm register for comparisons with zero.
-        "pxor %xmm0, %xmm0",
+    for _ in 0..4 {
+        if *s == 0 {
+            return n;
+        }
+
-        // One manual iteration of a zero byte search.
-        // Ensuring proper alignment may cause us to read
-        // memory _before_ the actual string start.
-        // Thus, one separate iteration is needed to handle this special case.
-        "movdqa (%rdi), %xmm1",
-        "pcmpeqb %xmm0, %xmm1",
-        "pmovmskb %xmm1, %eax",
-        // Shift out comparisons that don't belong to the actual string.
-        "shr %cl, %eax",
-        // Check if there was a zero
-        "test %eax, %eax",
-        "jz 1f",
+        n += 1;
+        s = s.add(1);
+    }
+    
+    // Shave off the least significant bits to align the address to a 16
+    // byte boundary. The shaved-off bits are used to correct the first iteration.
 
-        // A zero was found: calculate result and exit.
-        "bsf %eax, %eax",
-        "add $4, %eax",
-        "jmp 3f",
+    let align = s as usize & 15;
+    let mut s = ((s as usize) - align) as *const __m128i;
+    let zero = _mm_set1_epi8(0);
 
-        // No zero was found: prepare main loop.
- "1:", - "add $16, %rdi", - "neg %rcx", - "add $4, %rcx", + let x = { + let r; + asm!( + "movdqa ({addr}), {dest}", + addr = in(reg) s, + dest = out(xmm_reg) r, + options(att_syntax, nostack), + ); + r + }; + let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align; - // main loop - "2:", - "movdqa (%rdi), %xmm1", - "add $16, %rdi", - "add $16, %rcx", - "pcmpeqb %xmm0, %xmm1", - "pmovmskb %xmm1, %eax", - // Check if there was a zero - "test %eax, %eax", - "jz 2b", + if cmp != 0 { + return n + cmp.trailing_zeros() as usize; + } - // A zero was found: calculate result and exit. - "bsf %eax, %eax", - "add %rcx, %rax", - "3:", - inout("rdi") s => _, - out("rax") n, - out("rcx") _, - options(att_syntax, nostack), - ); + n += 16 - align; + s = s.add(1); - n + loop { + let x = { + let r; + asm!( + "movdqa ({addr}), {dest}", + addr = in(reg) s, + dest = out(xmm_reg) r, + options(att_syntax, nostack), + ); + r + }; + let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32; + if cmp == 0 { + n += 16; + s = s.add(1); + } else { + return n + cmp.trailing_zeros() as usize; + } + } } // Provided for scenarios like kernel development, where SSE might not @@ -263,11 +257,52 @@ pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize { #[inline(always)] pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize { let mut n = 0; - while *s != 0 { + + // Check bytes in steps of one until + // either a zero byte is discovered or + // pointer is aligned to an eight byte boundary. + + while s as usize & 7 != 0 { + if *s == 0 { + return n; + } + n += 1; s = s.add(1); } - n + + // Check bytes in steps of eight until a zero + // byte is discovered. + + let mut s = s as *const u64; + + loop { + let mut cs = { + let r: u64; + asm!( + "mov ({addr}), {dest}", + addr = in(reg) s, + dest = out(reg) r, + options(att_syntax, nostack), + ); + r + }; + // Detect if a word has a zero byte, taken from + // https://graphics.stanford.edu/~seander/bithacks.html + if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 { + loop { + if cs & 255 == 0 { + return n; + } else { + cs >>= 8; + n += 1; + } + } + } else { + n += 8; + s = s.add(1); + } + } } /// Determine optimal parameters for a `rep` instruction. From 4f77170ea5637745a369394ee58ecf6a2d62a50e Mon Sep 17 00:00:00 2001 From: Tobias Decking Date: Mon, 6 Mar 2023 19:24:02 +0100 Subject: [PATCH 10/11] formatting --- src/mem/x86_64.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs index ad6ff9d17..5377f0423 100644 --- a/src/mem/x86_64.rs +++ b/src/mem/x86_64.rs @@ -173,7 +173,6 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { c16(a.cast(), b.cast(), n) } - // In order to process more than on byte simultaneously when executing strlen, // two things must be considered: // * An n byte read with an n-byte aligned address will never cross @@ -190,7 +189,7 @@ pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize { use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8}; let mut n = 0; - + // The use of _mm_movemask_epi8 and company allow for speedups, // but they aren't cheap by themselves. Thus, possibly small strings // are handled in simple loops. @@ -266,11 +265,10 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize { if *s == 0 { return n; } - n += 1; s = s.add(1); } - + // Check bytes in steps of eight until a zero // byte is discovered. 
From 6488b26a05078639def6f74b19c494fbd64c9697 Mon Sep 17 00:00:00 2001
From: Tobias Decking
Date: Mon, 6 Mar 2023 19:28:49 +0100
Subject: [PATCH 11/11] more fixing

---
 src/mem/x86_64.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index 5377f0423..40b67093f 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -185,7 +185,7 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
 
 #[cfg(target_feature = "sse2")]
 #[inline(always)]
-pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
+pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
     use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};
 
     let mut n = 0;
@@ -202,7 +202,7 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
         n += 1;
         s = s.add(1);
     }
-    
+
     // Shave off the least significant bits to align the address to a 16
     // byte boundary. The shaved-off bits are used to correct the first iteration.
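Taken together, the final SSE2 algorithm reads: scan at most four head bytes one at a time, round the pointer down to a 16-byte boundary, shift the first block's zero-mask right by `align` so that bytes before the string cannot register as hits, then consume whole aligned blocks, which never cross a page boundary. The same bookkeeping as a safe reference model (illustrative names, array indexing in place of raw loads; not the implementation, and it assumes a NUL exists at or after `start`):

    fn strlen_model(mem: &[u8], start: usize) -> usize {
        // Bitmask of zero bytes in the 16-byte block at `base`.
        let mask = |base: usize| -> u32 {
            let mut m = 0u32;
            for k in 0..16 {
                if mem[base + k] == 0 {
                    m |= 1 << k;
                }
            }
            m
        };

        // Head: up to four byte-wise checks.
        let mut n = 0;
        let mut i = start;
        for _ in 0..4 {
            if mem[i] == 0 {
                return n;
            }
            n += 1;
            i += 1;
        }

        // First block: discard mask bits for bytes before the string.
        let align = i & 15;
        let mut base = i - align;
        let first = mask(base) >> align;
        if first != 0 {
            return n + first.trailing_zeros() as usize;
        }
        n += 16 - align;
        base += 16;

        // Main loop: one aligned 16-byte block per iteration.
        loop {
            let m = mask(base);
            if m != 0 {
                return n + m.trailing_zeros() as usize;
            }
            n += 16;
            base += 16;
        }
    }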