Skip to content

Commit b788cf3

Browse files
authored
Merge pull request #516 from TDecki/master
2 parents 72c0856 + 6488b26 commit b788cf3

File tree

3 files changed

+141
-7
lines changed

3 files changed

+141
-7
lines changed

src/mem/impls.rs

+10
Original file line numberDiff line numberDiff line change
@@ -279,3 +279,13 @@ pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
279279
}
280280
0
281281
}
282+
283+
#[inline(always)]
pub unsafe fn c_string_length(s: *const core::ffi::c_char) -> usize {
    // Advance a cursor to the NUL terminator, then report the distance
    // travelled from the start of the string.
    let mut end = s;
    while *end != 0 {
        end = end.add(1);
    }
    end.offset_from(s) as usize
}

src/mem/mod.rs

+1-7
Original file line numberDiff line numberDiff line change
intrinsics! {
    #[mem_builtin]
    #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")]
    // C `strlen`: number of bytes in the NUL-terminated string `s`, not
    // counting the terminator. Delegates to the shared `c_string_length`
    // helper instead of an open-coded loop.
    // NOTE(review): presumably `impls` resolves to an architecture-specific
    // module (e.g. the SSE2 x86_64 variant) when available — confirm against
    // the module wiring in this file's imports.
    pub unsafe extern "C" fn strlen(s: *const core::ffi::c_char) -> usize {
        impls::c_string_length(s)
    }
}
7569

src/mem/x86_64.rs

+130
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,136 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
173173
c16(a.cast(), b.cast(), n)
174174
}
175175

176+
// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n byte read with an n-byte aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that will cross a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.

#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    let mut n = 0;

    // The use of _mm_movemask_epi8 and company allow for speedups,
    // but they aren't cheap by themselves. Thus, possibly small strings
    // are handled in simple loops.

    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16
    // byte boundary. The shaved-off bits are used to correct the first iteration.

    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

    // Aligned 16-byte load via inline asm. The read may cover up to `align`
    // bytes *before* the current cursor — in-bounds at machine level (same
    // aligned chunk, so it cannot cross a page), but something surface Rust
    // would treat as UB, hence the asm.
    let x = {
        let r;
        asm!(
            "movdqa ({addr}), {dest}",
            addr = in(reg) s,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    };
    // One mask bit per byte flags the zero bytes; shift out the `align`
    // bytes that precede the string cursor so they can't match.
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;

    if cmp != 0 {
        // Index of the first NUL within the (corrected) chunk.
        return n + cmp.trailing_zeros() as usize;
    }

    // No NUL in the first partial chunk: account for the bytes it covered.
    n += 16 - align;
    s = s.add(1);

    // Main loop: scan 16 aligned bytes at a time until a chunk contains a NUL.
    loop {
        let x = {
            let r;
            asm!(
                "movdqa ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(xmm_reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            // trailing_zeros gives the offset of the first NUL byte in the chunk.
            return n + cmp.trailing_zeros() as usize;
        }
    }
}
252+
253+
// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    let mut n = 0;

    // Check bytes in steps of one until
    // either a zero byte is discovered or
    // pointer is aligned to an eight byte boundary.

    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero
    // byte is discovered.

    let mut s = s as *const u64;

    loop {
        // Aligned 8-byte load via inline asm: the word may extend past the
        // NUL terminator, which would be an out-of-bounds read in surface
        // Rust, but an aligned read cannot cross a page boundary.
        let mut cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            // A NUL is somewhere in this word; scan byte by byte. On x86-64
            // (little-endian) the low byte of `cs` is the earliest string byte.
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}
305+
176306
/// Determine optimal parameters for a `rep` instruction.
177307
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
178308
// Unaligned writes are still slow on modern processors, so align the destination address.

0 commit comments

Comments
 (0)