Skip to content

Commit 1b5c02b

Browse files
committed
Add is_ascii function optimized for x86-64 for [u8]
The new `is_ascii` function is optimized to use the `pmovmskb` vector instruction, which builds a mask from the high bit of each byte lane. A byte is ASCII exactly when its high bit is unset, so ASCII validity checking can be vectorized. This instruction does not exist on other platforms, where this code structure is likely to regress performance, so the new implementation is gated to `all(target_arch = "x86_64", target_feature = "sse2")`. Add a codegen test. Remove the `crate::mem` import for functions included in the prelude.
1 parent d7d67ad commit 1b5c02b

File tree

3 files changed

+85
-21
lines changed

3 files changed

+85
-21
lines changed

library/core/benches/ascii/is_ascii.rs

+11-9
Original file line numberDiff line numberDiff line change
@@ -54,35 +54,37 @@ benches! {
5454
}
5555

5656
fn case04_while_loop(bytes: &[u8]) {
57-
// Constant chosen to enable `pmovmskb` instruction on x86-64
58-
const N: usize = 32;
57+
// Process chunks of 32 bytes at a time in the fast path to enable
58+
// auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
59+
// can be OR'd together and then the resulting vector can be tested for
60+
// non-ASCII bytes.
61+
const CHUNK_SIZE: usize = 32;
5962

6063
let mut i = 0;
6164

62-
while i + N <= bytes.len() {
63-
let chunk_end = i + N;
65+
while i + CHUNK_SIZE <= bytes.len() {
66+
let chunk_end = i + CHUNK_SIZE;
6467

6568
// Get LLVM to produce a `pmovmskb` instruction on x86-64 which
6669
// creates a mask from the most significant bit of each byte.
6770
// ASCII bytes are less than 128 (0x80), so their most significant
68-
// bit is unset. Thus, detecting non-ASCII bytes can be done in one
69-
// instruction.
71+
// bit is unset.
7072
let mut count = 0;
7173
while i < chunk_end {
72-
count += (bytes[i] <= 127) as u8;
74+
count += bytes[i].is_ascii() as u8;
7375
i += 1;
7476
}
7577

7678
// All bytes should be <= 127 so count is equal to chunk size.
77-
if count != N as u8 {
79+
if count != CHUNK_SIZE as u8 {
7880
return false;
7981
}
8082
}
8183

8284
// Process the remaining `bytes.len() % CHUNK_SIZE` bytes.
8385
let mut is_ascii = true;
8486
while i < bytes.len() {
85-
is_ascii &= bytes[i] <= 127;
87+
is_ascii &= bytes[i].is_ascii();
8688
i += 1;
8789
}
8890

library/core/src/slice/ascii.rs

+58-12
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
use core::ascii::EscapeDefault;
44

55
use crate::fmt::{self, Write};
6+
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
67
use crate::intrinsics::const_eval_select;
7-
use crate::{ascii, iter, mem, ops};
8+
use crate::{ascii, iter, ops};
89

910
#[cfg(not(test))]
1011
impl [u8] {
@@ -308,14 +309,6 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
308309
}
309310
}
310311

311-
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
312-
/// from `../str/mod.rs`, which does something similar for utf8 validation.
313-
#[inline]
314-
const fn contains_nonascii(v: usize) -> bool {
315-
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
316-
(NONASCII_MASK & v) != 0
317-
}
318-
319312
/// ASCII test *without* the chunk-at-a-time optimizations.
320313
///
321314
/// This is carefully structured to produce nice small code -- it's smaller in
@@ -346,6 +339,7 @@ pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
346339
///
347340
/// If any of these loads produces something for which `contains_nonascii`
348341
/// (defined inside this function) returns true, then we know the answer is false.
342+
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
349343
#[inline]
350344
#[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
351345
const fn is_ascii(s: &[u8]) -> bool {
@@ -356,7 +350,14 @@ const fn is_ascii(s: &[u8]) -> bool {
356350
if const {
357351
is_ascii_simple(s)
358352
} else {
359-
const USIZE_SIZE: usize = mem::size_of::<usize>();
353+
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
354+
/// from `../str/mod.rs`, which does something similar for utf8 validation.
355+
const fn contains_nonascii(v: usize) -> bool {
356+
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
357+
(NONASCII_MASK & v) != 0
358+
}
359+
360+
const USIZE_SIZE: usize = size_of::<usize>();
360361

361362
let len = s.len();
362363
let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
@@ -366,7 +367,7 @@ const fn is_ascii(s: &[u8]) -> bool {
366367
//
367368
// We also do this for architectures where `size_of::<usize>()` isn't
368369
// sufficient alignment for `usize`, because it's a weird edge case.
369-
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
370+
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
370371
return is_ascii_simple(s);
371372
}
372373

@@ -400,7 +401,7 @@ const fn is_ascii(s: &[u8]) -> bool {
400401
// have alignment information it should have given a `usize::MAX` for
401402
// `align_offset` earlier, sending things through the scalar path instead of
402403
// this one, so this check should pass if it's reachable.
403-
debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>()));
404+
debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));
404405

405406
// Read subsequent words until the last aligned word, excluding the last
406407
// aligned word by itself to be done in tail check later, to ensure that
@@ -435,3 +436,48 @@ const fn is_ascii(s: &[u8]) -> bool {
435436
}
436437
)
437438
}
439+
440+
/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
441+
/// platforms.
442+
///
443+
/// Other platforms are not likely to benefit from this code structure, so they
444+
/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
445+
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
446+
#[inline]
447+
const fn is_ascii(bytes: &[u8]) -> bool {
448+
// Process chunks of 32 bytes at a time in the fast path to enable
449+
// auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
450+
// can be OR'd together and then the resulting vector can be tested for
451+
// non-ASCII bytes.
452+
const CHUNK_SIZE: usize = 32;
453+
454+
let mut i = 0;
455+
456+
while i + CHUNK_SIZE <= bytes.len() {
457+
let chunk_end = i + CHUNK_SIZE;
458+
459+
// Get LLVM to produce a `pmovmskb` instruction on x86-64 which
460+
// creates a mask from the most significant bit of each byte.
461+
// ASCII bytes are less than 128 (0x80), so their most significant
462+
// bit is unset.
463+
let mut count = 0;
464+
while i < chunk_end {
465+
count += bytes[i].is_ascii() as u8;
466+
i += 1;
467+
}
468+
469+
// All bytes should be <= 127 so count is equal to chunk size.
470+
if count != CHUNK_SIZE as u8 {
471+
return false;
472+
}
473+
}
474+
475+
// Process the remaining `bytes.len() % CHUNK_SIZE` bytes.
476+
let mut is_ascii = true;
477+
while i < bytes.len() {
478+
is_ascii &= bytes[i].is_ascii();
479+
i += 1;
480+
}
481+
482+
is_ascii
483+
}

tests/codegen/slice-is-ascii.rs

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
//@ only-x86_64
2+
//@ compile-flags: -C opt-level=3
3+
#![crate_type = "lib"]
4+
5+
/// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction.
6+
/// Platforms lacking an equivalent instruction use other techniques for
7+
/// optimizing `is_ascii`.
8+
// CHECK-LABEL: @is_ascii_autovectorized
9+
#[no_mangle]
10+
pub fn is_ascii_autovectorized(s: &[u8]) -> bool {
11+
// CHECK: load <32 x i8>
12+
// CHECK-NEXT: icmp slt <32 x i8>
13+
// CHECK-NEXT: bitcast <32 x i1>
14+
// CHECK-NEXT: icmp eq i32
15+
s.is_ascii()
16+
}

0 commit comments

Comments
 (0)