@@ -34,6 +34,100 @@ use core::ptr::addr_of;
34
34
35
35
use super :: c;
36
36
37
+ /// Creates a null-terminated UTF-16 string from a str.
38
+ pub macro wide_str ( $str: literal) { {
39
+ const _: ( ) = {
40
+ if core:: slice:: memchr:: memchr ( 0 , $str. as_bytes ( ) ) . is_some ( ) {
41
+ panic ! ( "null terminated strings cannot contain interior nulls" ) ;
42
+ }
43
+ } ;
44
+ crate :: sys:: pal:: windows:: api:: utf16!( concat!( $str , '\0' ) )
45
+ } }
46
+
47
+ /// Creates a UTF-16 string from a str without null termination.
48
+ pub macro utf16 ( $str: expr) { {
49
+ const UTF8 : & str = $str;
50
+ const UTF16_LEN : usize = crate :: sys:: pal:: windows:: api:: utf16_len ( UTF8 ) ;
51
+ const UTF16 : [ u16 ; UTF16_LEN ] = crate :: sys:: pal:: windows:: api:: to_utf16 ( UTF8 ) ;
52
+ & UTF16
53
+ } }
54
+
55
+ #[ cfg( test) ]
56
+ mod tests;
57
+
58
/// Gets the UTF-16 length of a UTF-8 string, in `u16` code units.
///
/// Used by the `utf16` macro (and through it by `wide_str`) to size the
/// output array that `to_utf16` fills.
///
/// Written as a plain `while` loop over bytes so that it can run in const
/// contexts.
pub const fn utf16_len(s: &str) -> usize {
    let s = s.as_bytes();
    let mut i = 0;
    let mut len = 0;
    while i < s.len() {
        // the length of a UTF-8 encoded code-point is given by the number of
        // leading ones, except in the case of ASCII.
        let utf8_len = match s[i].leading_ones() {
            0 => 1,
            n => n as usize,
        };
        // Skip the whole sequence so the next iteration starts on a lead byte.
        i += utf8_len;
        // Sequences shorter than four bytes become a single UTF-16 unit;
        // four-byte sequences become a surrogate pair.
        len += if utf8_len < 4 { 1 } else { 2 };
    }
    len
}
75
+
76
/// Const convert UTF-8 to UTF-16, for use in the `utf16` and `wide_str` macros.
///
/// `UTF16_LEN` is the number of `u16` code units in the output; `utf16_len`
/// computes the matching value for a given string.
///
/// Note that this is designed for use in const contexts so is not optimized.
///
/// # Panics
///
/// Panics (a compile error when evaluated in a const context) if `UTF16_LEN`
/// is too small to hold the encoded string, because the output array is
/// indexed out of bounds. If `UTF16_LEN` is larger than needed, the unused
/// tail of the array keeps its initial value of zero.
pub const fn to_utf16<const UTF16_LEN: usize>(s: &str) -> [u16; UTF16_LEN] {
    let mut output = [0_u16; UTF16_LEN];
    // Next free slot in `output`.
    let mut pos = 0;
    let s = s.as_bytes();
    // Index of the lead byte of the sequence being decoded.
    let mut i = 0;
    while i < s.len() {
        match s[i].leading_ones() {
            // Decode UTF-8 based on its length.
            // See https://en.wikipedia.org/wiki/UTF-8
            0 => {
                // ASCII is the same in both encodings
                output[pos] = s[i] as u16;
                i += 1;
                pos += 1;
            }
            2 => {
                // Bits: 110xxxxx 10xxxxxx
                output[pos] = ((s[i] as u16 & 0b11111) << 6) | (s[i + 1] as u16 & 0b111111);
                i += 2;
                pos += 1;
            }
            3 => {
                // Bits: 1110xxxx 10xxxxxx 10xxxxxx
                output[pos] = ((s[i] as u16 & 0b1111) << 12)
                    | ((s[i + 1] as u16 & 0b111111) << 6)
                    | (s[i + 2] as u16 & 0b111111);
                i += 3;
                pos += 1;
            }
            4 => {
                // Bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                // Decoded into a u32 because the code point does not fit in 16 bits.
                let mut c = ((s[i] as u32 & 0b111) << 18)
                    | ((s[i + 1] as u32 & 0b111111) << 12)
                    | ((s[i + 2] as u32 & 0b111111) << 6)
                    | (s[i + 3] as u32 & 0b111111);
                // re-encode as UTF-16 (see https://en.wikipedia.org/wiki/UTF-16)
                // - Subtract 0x10000 from the code point
                // - For the high surrogate, shift right by 10 then add 0xD800
                // - For the low surrogate, take the low 10 bits then add 0xDC00
                c -= 0x10000;
                output[pos] = ((c >> 10) + 0xD800) as u16;
                output[pos + 1] = ((c & 0b1111111111) + 0xDC00) as u16;
                i += 4;
                pos += 2;
            }
            // valid UTF-8 cannot have any other values
            _ => unreachable!(),
        }
    }
    output
}
130
+
37
131
/// Helper method for getting the size of `T` as a u32.
38
132
/// Errors at compile time if the size would overflow.
39
133
///
0 commit comments