Skip to content

Commit 3f90d4a

Browse files
committed
Add const UTF-8 to UTF-16 conversion macros
`wide_str!` creates a null terminated UTF-16 string whereas `utf16!` just creates a UTF-16 string without adding a null.
1 parent 5b6d1cc commit 3f90d4a

File tree

3 files changed

+113
-2
lines changed

3 files changed

+113
-2
lines changed

Diff for: std/src/sys/pal/windows/api.rs

+94
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,100 @@ use core::ptr::addr_of;
3434

3535
use super::c;
3636

37+
/// Creates a null-terminated UTF-16 string from a string literal.
///
/// Expands to `utf16!` applied to the literal with `'\0'` appended, so the
/// result is whatever `utf16!` yields for that concatenated string.
///
/// Panics during constant evaluation if the literal itself contains a NUL
/// byte, since the terminator would then no longer mark the end of the string.
38+
pub macro wide_str($str:literal) {{
39+
// Anonymous constant: exists only so the interior-NUL check below is
// evaluated as a constant expression.
const _: () = {
40+
if core::slice::memchr::memchr(0, $str.as_bytes()).is_some() {
41+
panic!("null terminated strings cannot contain interior nulls");
42+
}
43+
};
44+
// Append the terminator at the string level, then convert the whole thing.
crate::sys::pal::windows::api::utf16!(concat!($str, '\0'))
45+
}}
46+
47+
/// Creates a UTF-16 string from a str without null termination.
///
/// `$str` is bound to a `const` `&str`, so it must be a constant expression.
/// The macro evaluates to a reference to a `[u16; N]` constant, where `N` is
/// computed by `utf16_len` and the contents by `to_utf16`.
48+
pub macro utf16($str:expr) {{
49+
const UTF8: &str = $str;
50+
// The array length has to be its own constant so it can be used in the
// array type on the next line.
const UTF16_LEN: usize = crate::sys::pal::windows::api::utf16_len(UTF8);
51+
const UTF16: [u16; UTF16_LEN] = crate::sys::pal::windows::api::to_utf16(UTF8);
52+
&UTF16
53+
}}
54+
55+
#[cfg(test)]
56+
mod tests;
57+
58+
/// Gets the UTF-16 length of a UTF-8 string, counted in `u16` code units.
///
/// This is used by the `utf16!` macro (and therefore by `wide_str!`, which
/// expands to `utf16!`) to size the output array before conversion.
///
/// The function only inspects the first byte of each encoded code point and
/// relies on `s` being valid UTF-8, which `&str` guarantees.
pub const fn utf16_len(s: &str) -> usize {
    let s = s.as_bytes();
    let mut i = 0;
    let mut len = 0;
    while i < s.len() {
        // The length of a UTF-8 encoded code point is given by the number of
        // leading ones in its first byte, except in the case of ASCII, which
        // has no leading ones and occupies a single byte.
        let utf8_len = match s[i].leading_ones() {
            0 => 1,
            n => n as usize,
        };
        i += utf8_len;
        // Sequences shorter than four bytes become one UTF-16 code unit;
        // four-byte sequences become a surrogate pair (two code units).
        len += if utf8_len < 4 { 1 } else { 2 };
    }
    len
}
75+
76+
/// Const convert UTF-8 to UTF-16, for use in the `utf16!` macro (and therefore
/// `wide_str!`, which expands to `utf16!`).
///
/// `UTF16_LEN` must be exactly the number of UTF-16 code units needed to
/// encode `s`, as computed by `utf16_len`.
///
/// Note that this is designed for use in const contexts so is not optimized.
///
/// # Panics
///
/// Panics if `UTF16_LEN` does not match the UTF-16 length of `s`. When the
/// call is evaluated in a const context this surfaces as a compile-time error.
pub const fn to_utf16<const UTF16_LEN: usize>(s: &str) -> [u16; UTF16_LEN] {
    let mut output = [0_u16; UTF16_LEN];
    let mut pos = 0;
    let s = s.as_bytes();
    let mut i = 0;
    while i < s.len() {
        match s[i].leading_ones() {
            // Decode UTF-8 based on its length.
            // See https://en.wikipedia.org/wiki/UTF-8
            0 => {
                // ASCII is the same in both encodings
                output[pos] = s[i] as u16;
                i += 1;
                pos += 1;
            }
            2 => {
                // Bits: 110xxxxx 10xxxxxx
                output[pos] = ((s[i] as u16 & 0b11111) << 6) | (s[i + 1] as u16 & 0b111111);
                i += 2;
                pos += 1;
            }
            3 => {
                // Bits: 1110xxxx 10xxxxxx 10xxxxxx
                output[pos] = ((s[i] as u16 & 0b1111) << 12)
                    | ((s[i + 1] as u16 & 0b111111) << 6)
                    | (s[i + 2] as u16 & 0b111111);
                i += 3;
                pos += 1;
            }
            4 => {
                // Bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                let mut c = ((s[i] as u32 & 0b111) << 18)
                    | ((s[i + 1] as u32 & 0b111111) << 12)
                    | ((s[i + 2] as u32 & 0b111111) << 6)
                    | (s[i + 3] as u32 & 0b111111);
                // re-encode as UTF-16 (see https://en.wikipedia.org/wiki/UTF-16)
                // - Subtract 0x10000 from the code point
                // - For the high surrogate, shift right by 10 then add 0xD800
                // - For the low surrogate, take the low 10 bits then add 0xDC00
                c -= 0x10000;
                output[pos] = ((c >> 10) + 0xD800) as u16;
                output[pos + 1] = ((c & 0b1111111111) + 0xDC00) as u16;
                i += 4;
                pos += 2;
            }
            // valid UTF-8 cannot have any other values
            _ => unreachable!(),
        }
    }
    // A buffer that is too small already fails on the indexing above. A buffer
    // that is too large would otherwise be returned silently padded with
    // trailing zeros, so reject that mismatch as well.
    assert!(pos == UTF16_LEN, "UTF16_LEN must equal the UTF-16 length of the string");
    output
}
130+
37131
/// Helper method for getting the size of `T` as a u32.
38132
/// Errors at compile time if the size would overflow.
39133
///

Diff for: std/src/sys/pal/windows/api/tests.rs

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
use crate::sys::pal::windows::api::{utf16, wide_str};
2+
3+
// Checks both conversion macros against `str::encode_utf16` for one literal:
// `wide_str!` must produce the same code units followed by a single 0, and
// `utf16!` must produce exactly the same code units.
macro_rules! check_utf16 {
4+
($str:literal) => {{
5+
assert!(wide_str!($str).iter().copied().eq($str.encode_utf16().chain([0])));
6+
assert!(utf16!($str).iter().copied().eq($str.encode_utf16()));
7+
}};
8+
}
9+
10+
// Runs `check_utf16!` over literals that between them cover every UTF-8
// sequence length handled by the converter.
#[test]
11+
fn test_utf16_macros() {
12+
// ASCII only.
check_utf16!("hello world");
13+
// '€' is a three-byte UTF-8 sequence.
check_utf16!("€4.50");
14+
// The first character lies outside the Basic Multilingual Plane, so it
// needs a surrogate pair in UTF-16.
check_utf16!("𨉟呐㗂越");
15+
// Polish text mixing ASCII with two-byte UTF-8 sequences.
check_utf16!("Pchnąć w tę łódź jeża lub ośm skrzyń fig");
16+
}

Diff for: std/src/sys/pal/windows/mod.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@ use crate::io::ErrorKind;
55
use crate::mem::MaybeUninit;
66
use crate::os::windows::ffi::{OsStrExt, OsStringExt};
77
use crate::path::PathBuf;
8+
use crate::sys::pal::windows::api::wide_str;
89
use crate::time::Duration;
910

1011
pub use self::rand::hashmap_random_keys;
1112

1213
#[macro_use]
1314
pub mod compat;
1415

16+
mod api;
17+
1518
pub mod alloc;
1619
pub mod args;
1720
pub mod c;
@@ -41,8 +44,6 @@ cfg_if::cfg_if! {
4144
}
4245
}
4346

44-
mod api;
45-
4647
/// Map a Result<T, WinError> to io::Result<T>.
4748
trait IoResult<T> {
4849
fn io_result(self) -> crate::io::Result<T>;

0 commit comments

Comments
 (0)