Skip to content

Commit fa05853

Browse files
borsgitbot
authored and
gitbot
committed
Auto merge of rust-lang#83342 - Count-Count:win-console-incomplete-utf8, r=m-ou-se
Allow writing of incomplete UTF-8 sequences to the Windows console via stdout/stderr # Problem Writes of just an incomplete UTF-8 byte sequence (e.g. `b"\xC3"` or `b"\xF0\x9F"`) to stdout/stderr with a Windows console attached error with `io::ErrorKind::InvalidData, "Windows stdio in console mode does not support writing non-UTF-8 byte sequences"` even though further writes could complete the codepoint. This is currently a rare occurrence since the [linewritershim](https://github.com/rust-lang/rust/blob/2c56ea38b045624dc8b42ec948fc169eaff1206a/library/std/src/io/buffered/linewritershim.rs) implementation flushes complete lines immediately and buffers up to 1024 bytes for incomplete lines. It can still happen as described in rust-lang#83258. The problem will become more pronounced once the developer can switch stdout/stderr from line-buffered to block-buffered or immediate when the changes in the "Switchable buffering for Stdout" pull request (rust-lang#78515) get merged. # Patch description If there is at least one valid UTF-8 codepoint, all valid UTF-8 is passed through to the extracted `write_valid_utf8_to_console()` fn. The new code only comes into play if `write()` is being passed a short byte slice comprising an incomplete UTF-8 codepoint. In this case, up to three bytes are buffered in the `IncompleteUtf8` struct associated with `Stdout` / `Stderr`. The bytes are accepted one at a time. As soon as an error can be detected, `io::ErrorKind::InvalidData, "Windows stdio in console mode does not support writing non-UTF-8 byte sequences"` is returned. Once a complete UTF-8 codepoint is received, it is passed to `write_valid_utf8_to_console()` and the buffer length is set to zero. Calling `flush()` will neither error nor write anything if an incomplete codepoint is present in the buffer. # Tests Currently there are no Windows-specific tests for console writing code at all. 
Writing (regression) tests for this problem is a bit challenging since unit tests and UI tests don't run in a console, and suddenly popping up another console window might be surprising to developers running the testsuite and might not work at all in CI builds. To just test the new functionality in unit tests, the code would need to be refactored. Some guidance on how to proceed would be appreciated. # Public API changes * `std::str::validations::utf8_char_width()` would be exposed as `std::str::utf8_char_width()` behind the "str_internals" feature gate. # Related issues * Fixes rust-lang#83258. * PR rust-lang#78515 will exacerbate the problem. # Open questions * Add tests? * Squash into one commit with better commit message?
2 parents 9d72ed0 + 0dbd0ce commit fa05853

File tree

2 files changed

+91
-15
lines changed

2 files changed

+91
-15
lines changed

core/src/str/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ pub use iter::SplitAsciiWhitespace;
6969
pub use iter::SplitInclusive;
7070

7171
#[unstable(feature = "str_internals", issue = "none")]
72-
pub use validations::next_code_point;
72+
pub use validations::{next_code_point, utf8_char_width};
7373

7474
use iter::MatchIndicesInternal;
7575
use iter::SplitInternal;

std/src/sys/windows/stdio.rs

+90-14
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,25 @@ use crate::str;
88
use crate::sys::c;
99
use crate::sys::cvt;
1010
use crate::sys::handle::Handle;
11+
use core::str::utf8_char_width;
1112

1213
// Don't cache handles but get them fresh for every read/write. This allows us to track changes to
1314
// the value over time (such as if a process calls `SetStdHandle` while it's running). See #40490.
1415
pub struct Stdin {
1516
surrogate: u16,
1617
}
17-
pub struct Stdout;
18-
pub struct Stderr;
18+
pub struct Stdout {
19+
incomplete_utf8: IncompleteUtf8,
20+
}
21+
22+
pub struct Stderr {
23+
incomplete_utf8: IncompleteUtf8,
24+
}
25+
26+
struct IncompleteUtf8 {
27+
bytes: [u8; 4],
28+
len: u8,
29+
}
1930

2031
// Apparently Windows doesn't handle large reads on stdin or writes to stdout/stderr well (see
2132
// #13304 for details).
@@ -50,7 +61,15 @@ fn is_console(handle: c::HANDLE) -> bool {
5061
unsafe { c::GetConsoleMode(handle, &mut mode) != 0 }
5162
}
5263

53-
fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> {
64+
fn write(
65+
handle_id: c::DWORD,
66+
data: &[u8],
67+
incomplete_utf8: &mut IncompleteUtf8,
68+
) -> io::Result<usize> {
69+
if data.is_empty() {
70+
return Ok(0);
71+
}
72+
5473
let handle = get_handle(handle_id)?;
5574
if !is_console(handle) {
5675
let handle = Handle::new(handle);
@@ -59,22 +78,73 @@ fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> {
5978
return ret;
6079
}
6180

62-
// As the console is meant for presenting text, we assume bytes of `data` come from a string
63-
// and are encoded as UTF-8, which needs to be encoded as UTF-16.
81+
if incomplete_utf8.len > 0 {
82+
assert!(
83+
incomplete_utf8.len < 4,
84+
"Unexpected number of bytes for incomplete UTF-8 codepoint."
85+
);
86+
if data[0] >> 6 != 0b10 {
87+
// not a continuation byte - reject
88+
incomplete_utf8.len = 0;
89+
return Err(io::Error::new_const(
90+
io::ErrorKind::InvalidData,
91+
&"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
92+
));
93+
}
94+
incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0];
95+
incomplete_utf8.len += 1;
96+
let char_width = utf8_char_width(incomplete_utf8.bytes[0]);
97+
if (incomplete_utf8.len as usize) < char_width {
98+
// more bytes needed
99+
return Ok(1);
100+
}
101+
let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]);
102+
incomplete_utf8.len = 0;
103+
match s {
104+
Ok(s) => {
105+
assert_eq!(char_width, s.len());
106+
let written = write_valid_utf8_to_console(handle, s)?;
107+
assert_eq!(written, s.len()); // guaranteed by write_valid_utf8_to_console() for single codepoint writes
108+
return Ok(1);
109+
}
110+
Err(_) => {
111+
return Err(io::Error::new_const(
112+
io::ErrorKind::InvalidData,
113+
&"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
114+
));
115+
}
116+
}
117+
}
118+
119+
// As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8,
120+
// which needs to be encoded as UTF-16.
64121
//
65122
// If the data is not valid UTF-8 we write out as many bytes as are valid.
66-
// Only when there are no valid bytes (which will happen on the next call), return an error.
123+
// If the first byte is invalid it is either first byte of a multi-byte sequence but the
124+
// provided byte slice is too short or it is the first byte of an invalid multi-byte sequence.
67125
let len = cmp::min(data.len(), MAX_BUFFER_SIZE / 2);
68126
let utf8 = match str::from_utf8(&data[..len]) {
69127
Ok(s) => s,
70128
Err(ref e) if e.valid_up_to() == 0 => {
71-
return Err(io::Error::new_const(
72-
io::ErrorKind::InvalidData,
73-
&"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
74-
));
129+
let first_byte_char_width = utf8_char_width(data[0]);
130+
if first_byte_char_width > 1 && data.len() < first_byte_char_width {
131+
incomplete_utf8.bytes[0] = data[0];
132+
incomplete_utf8.len = 1;
133+
return Ok(1);
134+
} else {
135+
return Err(io::Error::new_const(
136+
io::ErrorKind::InvalidData,
137+
&"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
138+
));
139+
}
75140
}
76141
Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(),
77142
};
143+
144+
write_valid_utf8_to_console(handle, utf8)
145+
}
146+
147+
fn write_valid_utf8_to_console(handle: c::HANDLE, utf8: &str) -> io::Result<usize> {
78148
let mut utf16 = [0u16; MAX_BUFFER_SIZE / 2];
79149
let mut len_utf16 = 0;
80150
for (chr, dest) in utf8.encode_utf16().zip(utf16.iter_mut()) {
@@ -254,15 +324,21 @@ fn utf16_to_utf8(utf16: &[u16], utf8: &mut [u8]) -> io::Result<usize> {
254324
Ok(written)
255325
}
256326

327+
impl IncompleteUtf8 {
328+
pub const fn new() -> IncompleteUtf8 {
329+
IncompleteUtf8 { bytes: [0; 4], len: 0 }
330+
}
331+
}
332+
257333
impl Stdout {
258334
pub const fn new() -> Stdout {
259-
Stdout
335+
Stdout { incomplete_utf8: IncompleteUtf8::new() }
260336
}
261337
}
262338

263339
impl io::Write for Stdout {
264340
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
265-
write(c::STD_OUTPUT_HANDLE, buf)
341+
write(c::STD_OUTPUT_HANDLE, buf, &mut self.incomplete_utf8)
266342
}
267343

268344
fn flush(&mut self) -> io::Result<()> {
@@ -272,13 +348,13 @@ impl io::Write for Stdout {
272348

273349
impl Stderr {
274350
pub const fn new() -> Stderr {
275-
Stderr
351+
Stderr { incomplete_utf8: IncompleteUtf8::new() }
276352
}
277353
}
278354

279355
impl io::Write for Stderr {
280356
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
281-
write(c::STD_ERROR_HANDLE, buf)
357+
write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8)
282358
}
283359

284360
fn flush(&mut self) -> io::Result<()> {

0 commit comments

Comments
 (0)