@@ -1806,39 +1806,71 @@ const fn len_utf16(code: u32) -> usize {
1806
1806
#[ inline]
1807
1807
pub const fn encode_utf8_raw ( code : u32 , dst : & mut [ u8 ] ) -> & mut [ u8 ] {
1808
1808
let len = len_utf8 ( code) ;
1809
- match ( len, & mut * dst) {
1810
- ( 1 , [ a, ..] ) => {
1811
- * a = code as u8 ;
1812
- }
1813
- ( 2 , [ a, b, ..] ) => {
1814
- * a = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
1815
- * b = ( code & 0x3F ) as u8 | TAG_CONT ;
1816
- }
1817
- ( 3 , [ a, b, c, ..] ) => {
1818
- * a = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
1819
- * b = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1820
- * c = ( code & 0x3F ) as u8 | TAG_CONT ;
1821
- }
1822
- ( 4 , [ a, b, c, d, ..] ) => {
1823
- * a = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
1824
- * b = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
1825
- * c = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1826
- * d = ( code & 0x3F ) as u8 | TAG_CONT ;
1827
- }
1828
- _ => {
1829
- const_panic ! (
1830
- "encode_utf8: buffer does not have enough bytes to encode code point" ,
1831
- "encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}" ,
1832
- code: u32 = code,
1833
- len: usize = len,
1834
- dst_len: usize = dst. len( ) ,
1835
- )
1836
- }
1837
- } ;
1809
+ if dst. len ( ) < len {
1810
+ const_panic ! (
1811
+ "encode_utf8: buffer does not have enough bytes to encode code point" ,
1812
+ "encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}" ,
1813
+ code: u32 = code,
1814
+ len: usize = len,
1815
+ dst_len: usize = dst. len( ) ,
1816
+ ) ;
1817
+ }
1818
+
1819
+ // SAFETY: `dst` is checked to be at least the length needed to encode the codepoint.
1820
+ unsafe { encode_utf8_raw_unchecked ( code, dst. as_mut_ptr ( ) ) } ;
1821
+
1838
1822
// SAFETY: `<&mut [u8]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds.
1839
1823
unsafe { slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , len) }
1840
1824
}
1841
1825
1826
+ /// Encodes a raw `u32` value as UTF-8 into the byte buffer pointed to by `dst`.
1827
+ ///
1828
+ /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
1829
+ /// (Creating a `char` in the surrogate range is UB.)
1830
+ /// The result is valid [generalized UTF-8] but not valid UTF-8.
1831
+ ///
1832
+ /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
1833
+ ///
1834
+ /// # Safety
1835
+ ///
1836
+ /// The behavior is undefined if the buffer pointed to by `dst` is not
1837
+ /// large enough to hold the encoded codepoint. A buffer of length four
1838
+ /// is large enough to encode any `char`.
1839
+ ///
1840
+ /// For a safe version of this function, see the [`encode_utf8_raw`] function.
1841
+ #[ unstable( feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" ) ]
1842
+ #[ doc( hidden) ]
1843
+ #[ inline]
1844
+ pub const unsafe fn encode_utf8_raw_unchecked ( code : u32 , dst : * mut u8 ) {
1845
+ let len = len_utf8 ( code) ;
1846
+ // SAFETY: The caller must guarantee that the buffer pointed to by `dst`
1847
+ // is at least `len` bytes long.
1848
+ unsafe {
1849
+ match len {
1850
+ 1 => {
1851
+ * dst = code as u8 ;
1852
+ }
1853
+ 2 => {
1854
+ * dst = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
1855
+ * dst. add ( 1 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1856
+ }
1857
+ 3 => {
1858
+ * dst = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
1859
+ * dst. add ( 1 ) = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1860
+ * dst. add ( 2 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1861
+ }
1862
+ 4 => {
1863
+ * dst = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
1864
+ * dst. add ( 1 ) = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
1865
+ * dst. add ( 2 ) = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1866
+ * dst. add ( 3 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1867
+ }
1868
+ // SAFETY: `char` always takes between 1 and 4 bytes to encode in UTF-8.
1869
+ _ => crate :: hint:: unreachable_unchecked ( ) ,
1870
+ }
1871
+ }
1872
+ }
1873
+
1842
1874
/// Encodes a raw `u32` value as native endian UTF-16 into the provided `u16` buffer,
1843
1875
/// and then returns the subslice of the buffer that contains the encoded character.
1844
1876
///
0 commit comments