rust-lang · bors · Nov 21, 2021 · Oct 6, 2021
diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs
@@ -39,10 +39,9 @@ impl<'a> Iterator for Chars<'a> {
 
     #[inline]
     fn next(&mut self) -> Option<char> {
-        next_code_point(&mut self.iter).map(|ch| {
-            // SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value.
-            unsafe { char::from_u32_unchecked(ch) }
-        })
+        // SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
+        // the resulting `ch` is a valid Unicode Scalar Value.
+        unsafe { next_code_point(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
     }
 
     #[inline]
@@ -81,10 +80,9 @@ impl fmt::Debug for Chars<'_> {
 impl<'a> DoubleEndedIterator for Chars<'a> {
     #[inline]
     fn next_back(&mut self) -> Option<char> {
-        next_code_point_reverse(&mut self.iter).map(|ch| {
-            // SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value.
-            unsafe { char::from_u32_unchecked(ch) }
-        })
+        // SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
+        // the resulting `ch` is a valid Unicode Scalar Value.
+        unsafe { next_code_point_reverse(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
     }
 }
 

diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
@@ -25,19 +25,15 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
     (byte as i8) < -64
 }
 
-#[inline]
-const fn unwrap_or_0(opt: Option<&u8>) -> u8 {
-    match opt {
-        Some(&byte) => byte,
-        None => 0,
-    }
-}
-
 /// Reads the next code point out of a byte iterator (assuming a
 /// UTF-8-like encoding).
+///
+/// # Safety
+///
+/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
 #[unstable(feature = "str_internals", issue = "none")]
 #[inline]
-pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
+pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
     // Decode UTF-8
     let x = *bytes.next()?;
     if x < 128 {
@@ -48,18 +44,24 @@ pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<
     // Decode from a byte combination out of: [[[x y] z] w]
     // NOTE: Performance is sensitive to the exact formulation here
     let init = utf8_first_byte(x, 2);
-    let y = unwrap_or_0(bytes.next());
+    // SAFETY: `bytes` produces an UTF-8-like string,
+    // so the iterator must produce a value here.
+    let y = unsafe { *bytes.next().unwrap_unchecked() };
     let mut ch = utf8_acc_cont_byte(init, y);
     if x >= 0xE0 {
         // [[x y z] w] case
         // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
-        let z = unwrap_or_0(bytes.next());
+        // SAFETY: `bytes` produces an UTF-8-like string,
+        // so the iterator must produce a value here.
+        let z = unsafe { *bytes.next().unwrap_unchecked() };
         let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
         ch = init << 12 | y_z;
         if x >= 0xF0 {
             // [x y z w] case
             // use only the lower 3 bits of `init`
-            let w = unwrap_or_0(bytes.next());
+            // SAFETY: `bytes` produces an UTF-8-like string,
+            // so the iterator must produce a value here.
+            let w = unsafe { *bytes.next().unwrap_unchecked() };
             ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
         }
     }
@@ -69,8 +71,12 @@ pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<
 
 /// Reads the last code point out of a byte iterator (assuming a
 /// UTF-8-like encoding).
+///
+/// # Safety
+///
+/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
 #[inline]
-pub(super) fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
+pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
 where
     I: DoubleEndedIterator<Item = &'a u8>,
 {
@@ -83,13 +89,19 @@ where
     // Multibyte case follows
     // Decode from a byte combination out of: [x [y [z w]]]
     let mut ch;
-    let z = unwrap_or_0(bytes.next_back());
+    // SAFETY: `bytes` produces an UTF-8-like string,
+    // so the iterator must produce a value here.
+    let z = unsafe { *bytes.next_back().unwrap_unchecked() };
     ch = utf8_first_byte(z, 2);
     if utf8_is_cont_byte(z) {
-        let y = unwrap_or_0(bytes.next_back());
+        // SAFETY: `bytes` produces an UTF-8-like string,
+        // so the iterator must produce a value here.
+        let y = unsafe { *bytes.next_back().unwrap_unchecked() };
         ch = utf8_first_byte(y, 3);
         if utf8_is_cont_byte(y) {
-            let x = unwrap_or_0(bytes.next_back());
+            // SAFETY: `bytes` produces an UTF-8-like string,
+            // so the iterator must produce a value here.
+            let x = unsafe { *bytes.next_back().unwrap_unchecked() };
             ch = utf8_first_byte(x, 4);
             ch = utf8_acc_cont_byte(ch, y);
         }

diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs
@@ -809,7 +809,8 @@ impl<'a> Iterator for Wtf8CodePoints<'a> {
 
     #[inline]
     fn next(&mut self) -> Option<CodePoint> {
-        next_code_point(&mut self.bytes).map(|c| CodePoint { value: c })
+        // SAFETY: `self.bytes` has been created from a WTF-8 string
+        unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) }
     }
 
     #[inline]