Make some usize-typed mask definitions agnostic to the size of usize

eduardosm · eduardosm · commit 93ae6f80e377 · 2022-04-15T17:04:59.000+02:00
Some masks were defined as
```rust
const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
```
where it was assumed that `usize` is never wider than 64 bits, which is currently true.

To keep those constants valid on a hypothetical 128-bit target, they have been redefined in a `usize`-width-agnostic way:
```rust
const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; size_of::<usize>()]);
```

There are already some cases where Rust anticipates the possibility of supporting 128-bit targets, such as not implementing `From<usize>` for `u64`.
diff --git a/library/core/benches/ascii/is_ascii.rs b/library/core/benches/ascii/is_ascii.rs
@@ -77,6 +77,6 @@ fn is_ascii_align_to_unrolled(bytes: &[u8]) -> bool {
 
 #[inline]
 fn contains_nonascii(v: usize) -> bool {
-    const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
+    const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; core::mem::size_of::<usize>()]);
     (NONASCII_MASK & v) != 0
 }
diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs
@@ -890,6 +890,27 @@ impl usize {
     widening_impl! { usize, u128, 64, unsigned }
 }
 
+impl usize {
+    /// Returns a `usize` in which every byte is equal to `x`.
+    #[inline]
+    pub(crate) const fn repeat_u8(x: u8) -> usize {
+        usize::from_ne_bytes([x; mem::size_of::<usize>()])
+    }
+
+    /// Returns a `usize` in which every byte pair (16-bit chunk) is equal to `x`.
+    #[inline]
+    pub(crate) const fn repeat_u16(x: u16) -> usize {
+        let mut r = 0usize;
+        let mut i = 0;
+        while i < mem::size_of::<usize>() {
+            // Use `wrapping_shl` so the 16-bit shift cannot overflow on targets with 16-bit `usize`
+            r = r.wrapping_shl(16) | (x as usize);
+            i += 2;
+        }
+        r
+    }
+}
+
 /// A classification of floating point numbers.
 ///
 /// This `enum` is used as the return type for [`f32::classify`] and [`f64::classify`]. See
diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
@@ -235,7 +235,7 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
 /// from `../str/mod.rs`, which does something similar for utf8 validation.
 #[inline]
 fn contains_nonascii(v: usize) -> bool {
-    const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
+    const NONASCII_MASK: usize = usize::repeat_u8(0x80);
     (NONASCII_MASK & v) != 0
 }
 
diff --git a/library/core/src/slice/memchr.rs b/library/core/src/slice/memchr.rs
@@ -4,12 +4,8 @@
 use crate::cmp;
 use crate::mem;
 
-const LO_U64: u64 = 0x0101010101010101;
-const HI_U64: u64 = 0x8080808080808080;
-
-// Use truncation.
-const LO_USIZE: usize = LO_U64 as usize;
-const HI_USIZE: usize = HI_U64 as usize;
+const LO_USIZE: usize = usize::repeat_u8(0x01);
+const HI_USIZE: usize = usize::repeat_u8(0x80);
 const USIZE_BYTES: usize = mem::size_of::<usize>();
 
 /// Returns `true` if `x` contains any zero byte.
diff --git a/library/core/src/str/count.rs b/library/core/src/str/count.rs
@@ -112,16 +112,16 @@ fn do_count_chars(s: &str) -> usize {
 // true)
 #[inline]
 fn contains_non_continuation_byte(w: usize) -> usize {
-    const LSB: usize = 0x0101_0101_0101_0101u64 as usize;
+    const LSB: usize = usize::repeat_u8(0x01);
     ((!w >> 7) | (w >> 6)) & LSB
 }
 
 // Morally equivalent to `values.to_ne_bytes().into_iter().sum::<usize>()`, but
 // more efficient.
 #[inline]
 fn sum_bytes_in_usize(values: usize) -> usize {
-    const LSB_SHORTS: usize = 0x0001_0001_0001_0001_u64 as usize;
-    const SKIP_BYTES: usize = 0x00ff_00ff_00ff_00ff_u64 as usize;
+    const LSB_SHORTS: usize = usize::repeat_u16(0x0001);
+    const SKIP_BYTES: usize = usize::repeat_u16(0x00ff);
 
     let pair_sum: usize = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES);
     pair_sum.wrapping_mul(LSB_SHORTS) >> ((USIZE_SIZE - 2) * 8)
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
@@ -112,8 +112,7 @@ where
     Some(ch)
 }
 
-// use truncation to fit u64 into usize
-const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
+const NONASCII_MASK: usize = usize::repeat_u8(0x80);
 
 /// Returns `true` if any byte in the word `x` is nonascii (>= 128).
 #[inline]

Original file line number	Diff line number	Diff line change
`@@ -77,6 +77,6 @@ fn is_ascii_align_to_unrolled(bytes: &[u8]) -> bool {`
`77`	`77`
`78`	`78`	`#[inline]`
`79`	`79`	`fn contains_nonascii(v: usize) -> bool {`
`80`		`- const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;`
	`80`	`+ const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; core::mem::size_of::<usize>()]);`
`81`	`81`	`(NONASCII_MASK & v) != 0`
`82`	`82`	`}`
Original file line number	Diff line number	Diff line change
`@@ -235,7 +235,7 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {`
`235`	`235`	/// from `../str/mod.rs`, which does something similar for utf8 validation.
`236`	`236`	`#[inline]`
`237`	`237`	`fn contains_nonascii(v: usize) -> bool {`
`238`		`- const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;`
	`238`	`+ const NONASCII_MASK: usize = usize::repeat_u8(0x80);`
`239`	`239`	`(NONASCII_MASK & v) != 0`
`240`	`240`	`}`
`241`	`241`
Original file line number	Diff line number	Diff line change
`@@ -112,8 +112,7 @@ where`
`112`	`112`	`Some(ch)`
`113`	`113`	`}`
`114`	`114`
`115`		`-// use truncation to fit u64 into usize`
`116`		`-const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;`
	`115`	`+const NONASCII_MASK: usize = usize::repeat_u8(0x80);`
`117`	`116`
`118`	`117`	/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
`119`	`118`	`#[inline]`