-
Notifications
You must be signed in to change notification settings - Fork 13.4k
speed up `String::push` and `String::insert` #124810
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1806,39 +1806,71 @@ const fn len_utf16(code: u32) -> usize { | |
#[inline] | ||
pub const fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] { | ||
let len = len_utf8(code); | ||
match (len, &mut *dst) { | ||
(1, [a, ..]) => { | ||
*a = code as u8; | ||
} | ||
(2, [a, b, ..]) => { | ||
*a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; | ||
*b = (code & 0x3F) as u8 | TAG_CONT; | ||
} | ||
(3, [a, b, c, ..]) => { | ||
*a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; | ||
*b = (code >> 6 & 0x3F) as u8 | TAG_CONT; | ||
*c = (code & 0x3F) as u8 | TAG_CONT; | ||
} | ||
(4, [a, b, c, d, ..]) => { | ||
*a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; | ||
*b = (code >> 12 & 0x3F) as u8 | TAG_CONT; | ||
*c = (code >> 6 & 0x3F) as u8 | TAG_CONT; | ||
*d = (code & 0x3F) as u8 | TAG_CONT; | ||
} | ||
_ => { | ||
const_panic!( | ||
"encode_utf8: buffer does not have enough bytes to encode code point", | ||
"encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}", | ||
code: u32 = code, | ||
len: usize = len, | ||
dst_len: usize = dst.len(), | ||
) | ||
} | ||
}; | ||
if dst.len() < len { | ||
const_panic!( | ||
"encode_utf8: buffer does not have enough bytes to encode code point", | ||
"encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}", | ||
code: u32 = code, | ||
len: usize = len, | ||
dst_len: usize = dst.len(), | ||
); | ||
} | ||
|
||
// SAFETY: `dst` is checked to be at least the length needed to encode the codepoint. | ||
unsafe { encode_utf8_raw_unchecked(code, dst.as_mut_ptr()) }; | ||
|
||
// SAFETY: `<&mut [u8]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds. | ||
unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) } | ||
} | ||
|
||
/// Encodes a raw `u32` value as UTF-8 into the byte buffer pointed to by `dst`. | ||
/// | ||
/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range. | ||
/// (Creating a `char` in the surrogate range is UB.) | ||
/// The result is valid [generalized UTF-8] but not valid UTF-8. | ||
/// | ||
/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8 | ||
/// | ||
/// # Safety | ||
/// | ||
/// The behavior is undefined if the buffer pointed to by `dst` is not | ||
/// large enough to hold the encoded codepoint. A buffer of length four | ||
/// is large enough to encode any `char`. | ||
/// | ||
/// For a safe version of this function, see the [`encode_utf8_raw`] function. | ||
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] | ||
#[doc(hidden)] | ||
#[inline] | ||
pub const unsafe fn encode_utf8_raw_unchecked(code: u32, dst: *mut u8) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's been long enough that I'm forgetting context here, but why was this changed away from pub const unsafe fn encode_utf8_raw_unchecked(
code: u32, dst: &mut [MaybeUninit<u8>]
) -> &mut [u8] {
// Write the characters then call MaybeUninit::assume_init_ref
} Then lengths get checked and There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The motivation for the change from It might still work, but without the non-const |
||
let len = len_utf8(code); | ||
// SAFETY: The caller must guarantee that the buffer pointed to by `dst` | ||
// is at least `len` bytes long. | ||
unsafe { | ||
match len { | ||
1 => { | ||
*dst = code as u8; | ||
} | ||
2 => { | ||
*dst = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; | ||
*dst.add(1) = (code & 0x3F) as u8 | TAG_CONT; | ||
} | ||
3 => { | ||
*dst = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; | ||
*dst.add(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT; | ||
*dst.add(2) = (code & 0x3F) as u8 | TAG_CONT; | ||
} | ||
4 => { | ||
*dst = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; | ||
*dst.add(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT; | ||
*dst.add(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT; | ||
*dst.add(3) = (code & 0x3F) as u8 | TAG_CONT; | ||
} | ||
// SAFETY: `char` always takes between 1 and 4 bytes to encode in UTF-8. | ||
_ => crate::hint::unreachable_unchecked(), | ||
} | ||
} | ||
} | ||
|
||
/// Encodes a raw `u32` value as native endian UTF-16 into the provided `u16` buffer, | ||
/// and then returns the subslice of the buffer that contains the encoded character. | ||
/// | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
//! Check that `String::push` is optimized enough not to call `memcpy`. | ||
|
||
//@ compile-flags: -O | ||
#![crate_type = "lib"] | ||
|
||
// CHECK-LABEL: @string_push_does_not_call_memcpy
/// Appends `ch` to `s` with `String::push`.
///
/// Exported unmangled so the FileCheck directives around it can find this
/// function in the optimized LLVM IR and verify that no `memcpy` call is emitted.
#[no_mangle]
pub fn string_push_does_not_call_memcpy(s: &mut String, ch: char) {
    // CHECK-NOT: call void @llvm.memcpy
    s.push(ch);
}
Uh oh!
There was an error while loading. Please reload this page.