Skip to content

libcore: assume the input of next_code_point and next_code_point_reverse is UTF-8-like #89611

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions library/core/src/str/iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,9 @@ impl<'a> Iterator for Chars<'a> {

#[inline]
fn next(&mut self) -> Option<char> {
next_code_point(&mut self.iter).map(|ch| {
// SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value.
unsafe { char::from_u32_unchecked(ch) }
})
// SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
// the resulting `ch` is a valid Unicode Scalar Value.
unsafe { next_code_point(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
}

#[inline]
Expand Down Expand Up @@ -81,10 +80,9 @@ impl fmt::Debug for Chars<'_> {
impl<'a> DoubleEndedIterator for Chars<'a> {
#[inline]
fn next_back(&mut self) -> Option<char> {
next_code_point_reverse(&mut self.iter).map(|ch| {
// SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value.
unsafe { char::from_u32_unchecked(ch) }
})
// SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
// the resulting `ch` is a valid Unicode Scalar Value.
unsafe { next_code_point_reverse(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
}
}

Expand Down
44 changes: 28 additions & 16 deletions library/core/src/str/validations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,15 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
(byte as i8) < -64
}

#[inline]
const fn unwrap_or_0(opt: Option<&u8>) -> u8 {
match opt {
Some(&byte) => byte,
None => 0,
}
}

/// Reads the next code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
///
/// # Safety
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
// Decode UTF-8
let x = *bytes.next()?;
if x < 128 {
Expand All @@ -48,18 +44,24 @@ pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte(x, 2);
let y = unwrap_or_0(bytes.next());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let y = unsafe { *bytes.next().unwrap_unchecked() };
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(bytes.next());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let z = unsafe { *bytes.next().unwrap_unchecked() };
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(bytes.next());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let w = unsafe { *bytes.next().unwrap_unchecked() };
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
}
Expand All @@ -69,8 +71,12 @@ pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<

/// Reads the last code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
///
/// # Safety
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
#[inline]
pub(super) fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
where
I: DoubleEndedIterator<Item = &'a u8>,
{
Expand All @@ -83,13 +89,19 @@ where
// Multibyte case follows
// Decode from a byte combination out of: [x [y [z w]]]
let mut ch;
let z = unwrap_or_0(bytes.next_back());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let z = unsafe { *bytes.next_back().unwrap_unchecked() };
ch = utf8_first_byte(z, 2);
if utf8_is_cont_byte(z) {
let y = unwrap_or_0(bytes.next_back());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let y = unsafe { *bytes.next_back().unwrap_unchecked() };
ch = utf8_first_byte(y, 3);
if utf8_is_cont_byte(y) {
let x = unwrap_or_0(bytes.next_back());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let x = unsafe { *bytes.next_back().unwrap_unchecked() };
ch = utf8_first_byte(x, 4);
ch = utf8_acc_cont_byte(ch, y);
}
Expand Down
3 changes: 2 additions & 1 deletion library/std/src/sys_common/wtf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,8 @@ impl<'a> Iterator for Wtf8CodePoints<'a> {

#[inline]
fn next(&mut self) -> Option<CodePoint> {
next_code_point(&mut self.bytes).map(|c| CodePoint { value: c })
// SAFETY: `self.bytes` has been created from a WTF-8 string
unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) }
}

#[inline]
Expand Down