@@ -34,6 +34,102 @@ use core::ptr::addr_of;
34
34
35
35
use super :: c;
36
36
37
+ /// Creates a null-terminated UTF-16 string from a str.
38
+ pub macro wide_str ( $str: literal) { {
39
+ const _: ( ) = {
40
+ if core:: slice:: memchr:: memchr ( 0 , $str. as_bytes ( ) ) . is_some ( ) {
41
+ panic ! ( "null terminated strings cannot contain interior nulls" ) ;
42
+ }
43
+ } ;
44
+ crate :: sys:: pal:: windows:: api:: utf16!( concat!( $str , '\0' ) )
45
+ } }
46
+
47
+ /// Creates a UTF-16 string from a str without null termination.
48
+ pub macro utf16 ( $str: expr) { {
49
+ const UTF8 : & str = $str;
50
+ const UTF16_LEN : usize = crate :: sys:: pal:: windows:: api:: utf16_len ( UTF8 ) ;
51
+ const UTF16 : [ u16 ; UTF16_LEN ] = crate :: sys:: pal:: windows:: api:: to_utf16 ( UTF8 ) ;
52
+ & UTF16
53
+ } }
54
+
55
+ #[ cfg( test) ]
56
+ mod tests;
57
+
58
/// Counts the UTF-16 code units needed to encode `s`.
///
/// Used by the `utf16` macro to size its output array. Written with plain
/// `while` loops and indexing so that it can be evaluated in const contexts.
pub const fn utf16_len(s: &str) -> usize {
    let bytes = s.as_bytes();
    let mut units = 0;
    let mut idx = 0;
    while idx < bytes.len() {
        // The first byte of a UTF-8 sequence announces the sequence width via
        // its count of leading one bits; ASCII bytes have none yet occupy one byte.
        let lead_ones = bytes[idx].leading_ones() as usize;
        let width = if lead_ones == 0 { 1 } else { lead_ones };
        // Only four-byte sequences (code points above U+FFFF) need a surrogate
        // pair. A `&str` can never contain encoded surrogates (U+D800..=U+DFFF),
        // so no special handling is needed for them (unlike with WTF-8).
        units += if width >= 4 { 2 } else { 1 };
        idx += width;
    }
    units
}
77
+
78
/// Re-encodes a UTF-8 string as a fixed-size array of UTF-16 code units.
///
/// `UTF16_LEN` is the length of the returned array. Any slots beyond the
/// encoded text keep their initial value of zero, and indexing panics if the
/// array is too small to hold the encoded text.
///
/// This exists for const evaluation, so clarity is preferred over speed.
pub const fn to_utf16<const UTF16_LEN: usize>(s: &str) -> [u16; UTF16_LEN] {
    // Extracts the six payload bits of a UTF-8 continuation byte (`10xxxxxx`).
    const fn payload(byte: u8) -> u32 {
        (byte & 0b0011_1111) as u32
    }

    let bytes = s.as_bytes();
    let mut units = [0_u16; UTF16_LEN];
    let mut written = 0;
    let mut idx = 0;
    while idx < bytes.len() {
        let lead = bytes[idx];
        // The number of leading one bits in the first byte selects the
        // UTF-8 sequence layout (see https://en.wikipedia.org/wiki/UTF-8).
        match lead.leading_ones() {
            0 => {
                // 0xxxxxxx: ASCII maps to the same value in UTF-16.
                units[written] = lead as u16;
                idx += 1;
                written += 1;
            }
            2 => {
                // 110xxxxx 10xxxxxx
                let code_point = (((lead & 0b0001_1111) as u32) << 6) | payload(bytes[idx + 1]);
                units[written] = code_point as u16;
                idx += 2;
                written += 1;
            }
            3 => {
                // 1110xxxx 10xxxxxx 10xxxxxx
                let code_point = (((lead & 0b0000_1111) as u32) << 12)
                    | (payload(bytes[idx + 1]) << 6)
                    | payload(bytes[idx + 2]);
                units[written] = code_point as u16;
                idx += 3;
                written += 1;
            }
            4 => {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                let code_point = (((lead & 0b0000_0111) as u32) << 18)
                    | (payload(bytes[idx + 1]) << 12)
                    | (payload(bytes[idx + 2]) << 6)
                    | payload(bytes[idx + 3]);
                // Surrogate pair (see https://en.wikipedia.org/wiki/UTF-16):
                // subtract 0x10000, then the upper ten bits go on top of 0xD800
                // and the lower ten bits go on top of 0xDC00.
                let offset = code_point - 0x10000;
                units[written] = (0xD800 + (offset >> 10)) as u16;
                units[written + 1] = (0xDC00 + (offset & 0b11_1111_1111)) as u16;
                idx += 4;
                written += 2;
            }
            // A valid `&str` never starts a sequence with any other bit pattern.
            _ => unreachable!(),
        }
    }
    units
}
132
+
37
133
/// Helper method for getting the size of `T` as a u32.
38
134
/// Errors at compile time if the size would overflow.
39
135
///
0 commit comments