Add codegen test to ensure the pattern used in convert_while_ascii gets vectorized

jhorstmann · jhorstmann · commit a7aafd740a96 · 2024-06-03T21:43:45.000+02:00
diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs
@@ -613,7 +613,7 @@ pub unsafe fn from_boxed_utf8_unchecked(v: Box<[u8]>) -> Box<str> {
 #[cfg(not(test))]
 #[cfg(not(no_global_oom_handling))]
 fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &str) {
-    // process the input in chunks to enable auto-vectorization
+    // Process the input in chunks to enable auto-vectorization
     const USIZE_SIZE: usize = mem::size_of::<usize>();
     const MAGIC_UNROLL: usize = 2;
     const N: usize = USIZE_SIZE * MAGIC_UNROLL;
@@ -635,9 +635,10 @@ fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &str) {
             is_ascii[j] = chunk[j] <= 127;
         }
 
-        // auto-vectorization for this check is a bit fragile,
-        // sum and comparing against the chunk size gives the best result,
-        // specifically a pmovmsk instruction on x86.
+        // Auto-vectorization for this check is a bit fragile; summing and comparing against the
+        // chunk size gives the best result, specifically a pmovmsk instruction on x86.
+        // There is a codegen test in `issue-123712-str-to-lower-autovectorization.rs` which should
+        // be updated when this method is changed.
         if is_ascii.iter().map(|x| *x as u8).sum::<u8>() as usize != N {
             break;
         }
diff --git a/tests/codegen/issues/issue-123712-str-to-lower-autovectorization.rs b/tests/codegen/issues/issue-123712-str-to-lower-autovectorization.rs
@@ -0,0 +1,48 @@
+//@ compile-flags: -Copt-level=3
+#![crate_type = "lib"]
+
+/// Ensure that the ascii-prefix loop for `str::to_lowercase` and `str::to_uppercase` uses vector
+/// instructions. Since these methods do not get inlined, the relevant code is duplicated here and
+/// should be updated when the implementation changes. Returns the number of bytes lowercased.
+// CHECK-LABEL: @lower_while_ascii
+// CHECK: [[A:%[0-9]]] = load <16 x i8>
+// CHECK-NEXT: [[B:%[0-9]]] = icmp slt <16 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[C:%[0-9]]] = bitcast <16 x i1> [[B]] to i16
+#[no_mangle]
+pub fn lower_while_ascii(mut input: &[u8], mut output: &mut [u8]) -> usize {
+    // Process the input in `N`-byte chunks (16 on 64-bit targets) to enable auto-vectorization.
+    const USIZE_SIZE: usize = core::mem::size_of::<usize>();
+    const MAGIC_UNROLL: usize = 2;
+    const N: usize = USIZE_SIZE * MAGIC_UNROLL;
+    // Truncate `output` to `input.len()`; panics if `output` is shorter than `input`.
+    output = &mut output[..input.len()];
+
+    let mut ascii_prefix_len = 0_usize;
+    let mut is_ascii = [false; N];
+    // SAFETY (unchecked slicing): in the loop both slices have the same length, at least `N`.
+    while input.len() >= N {
+        let chunk = unsafe { input.get_unchecked(..N) };
+        let out_chunk = unsafe { output.get_unchecked_mut(..N) };
+
+        for j in 0..N {
+            is_ascii[j] = chunk[j] <= 127;
+        }
+
+        // Auto-vectorization for this check is a bit fragile: summing the flags and comparing
+        // against the chunk size gives the best result, specifically a pmovmsk instruction on
+        // x86. Keep this in sync with `convert_while_ascii` in `library/alloc/src/str.rs`.
+        if is_ascii.iter().map(|x| *x as u8).sum::<u8>() as usize != N {
+            break;
+        }
+
+        for j in 0..N {
+            out_chunk[j] = chunk[j].to_ascii_lowercase();
+        }
+
+        ascii_prefix_len += N;
+        input = unsafe { input.get_unchecked(N..) };
+        output = unsafe { output.get_unchecked_mut(N..) };
+    }
+
+    ascii_prefix_len
+}