Rollup merge of #91754 - Patrick-Poitras:rm-4byte-minimum-stdio-windows, r=Mark-Simulacrum

matthiaskrgr · web-flow · commit efaf4299411c · 2022-01-03T23:47:28.000+01:00
Modifications to `std::io::Stdin` on Windows so that there is no longer a 4-byte buffer minimum in `read()`. This is an attempted fix of issue #91722, where a too-small buffer was passed to the read function of stdio on Windows. This caused an error to be returned when `read_to_end` or `read_to_string` was called. Both delegate to `std::io::default_read_to_end`, which creates a buffer of length >0 and forwards it to `std::io::Stdin::read()`. The latter method returns an error if the length of the buffer is less than 4, as there might not be enough space to hold one arbitrary character once it has been converted from UTF-16 to UTF-8 (up to 4 bytes). This creates a problem when the buffer length `N` satisfies `0 < N < 4`, causing the bug. The current modification creates an internal buffer, much like the one used for the write functions. I'd also like to acknowledge the help of `@agausmann` and `@hkratz` in detecting and isolating the bug, and for suggestions that made the fix possible. A couple of disclaimers: - Firstly, I didn't know where to put code to replicate the bug found in the issue. It would probably be wise to add that case to the testing suite, but I'm afraid that I don't know _where_ that test should be added. - Secondly, the code is fairly fundamental to IO operations, so my fear is that this may cause some undesired side effects ~or performance loss in benchmarks.~ The testing suite runs on my computer, and it does fix the issue noted in #91722. - Thirdly, I left the "surrogate" field in the Stdin struct, but from a cursory glance, it seems to be serving the same purpose for other functions. Perhaps merging the two would be appropriate. Finally, this is my first pull request to the Rust language, and as such some things may be weird/unidiomatic/just plain bad. If there are any obvious improvements I could make to the code, or any other suggestions, I would appreciate them. Edit: Closes #91722
diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs
@@ -15,7 +15,9 @@ use core::str::utf8_char_width;
 // the value over time (such as if a process calls `SetStdHandle` while it's running). See #40490.
 pub struct Stdin {
     surrogate: u16,
+    incomplete_utf8: IncompleteUtf8,
 }
+
 pub struct Stdout {
     incomplete_utf8: IncompleteUtf8,
 }
@@ -29,6 +31,25 @@ struct IncompleteUtf8 {
     len: u8,
 }
 
+impl IncompleteUtf8 {
+    // Drains pending bytes into `buf` and returns how many were copied; used by `Stdin::read`.
+    fn read(&mut self, buf: &mut [u8]) -> usize {
+        // Copy as many pending bytes as fit: the smaller of `buf.len()` and `self.len`.
+        let to_write = cmp::min(buf.len(), self.len as usize);
+        buf[..to_write].copy_from_slice(&self.bytes[..to_write]);
+
+        // If `buf` was too small, move the leftover bytes to the front for the next call.
+        if usize::from(self.len) > buf.len() {
+            self.bytes.copy_within(to_write.., 0);
+            self.len -= to_write as u8;
+        } else {
+            self.len = 0;
+        }
+
+        to_write
+    }
+}
+
 // Apparently Windows doesn't handle large reads on stdin or writes to stdout/stderr well (see
 // #13304 for details).
 //
@@ -205,7 +226,7 @@ fn write_u16s(handle: c::HANDLE, data: &[u16]) -> io::Result<usize> {
 
 impl Stdin {
     pub const fn new() -> Stdin {
-        Stdin { surrogate: 0 }
+        Stdin { surrogate: 0, incomplete_utf8: IncompleteUtf8::new() }
     }
 }
 
@@ -221,24 +242,39 @@ impl io::Read for Stdin {
             }
         }
 
-        if buf.len() == 0 {
-            return Ok(0);
-        } else if buf.len() < 4 {
-            return Err(io::Error::new_const(
-                io::ErrorKind::InvalidInput,
-                &"Windows stdin in console mode does not support a buffer too small to \
-                 guarantee holding one arbitrary UTF-8 character (4 bytes)",
-            ));
+        // If there are bytes in the incomplete utf-8, start with those.
+        // (No-op if there is nothing in the buffer.)
+        let mut bytes_copied = self.incomplete_utf8.read(buf);
+
+        if bytes_copied == buf.len() {
+            return Ok(bytes_copied);
+        } else if buf.len() - bytes_copied < 4 {
+            // Not enough space to get a UTF-8 byte. We will use the incomplete UTF8.
+            let mut utf16_buf = [0u16; 1];
+            // Read one u16 character.
+            let read = read_u16s_fixup_surrogates(handle, &mut utf16_buf, 1, &mut self.surrogate)?;
+            // Read bytes, using the (now-empty) self.incomplete_utf8 as extra space.
+            let read_bytes = utf16_to_utf8(&utf16_buf[..read], &mut self.incomplete_utf8.bytes)?;
+
+            // Read in the bytes from incomplete_utf8 until the buffer is full.
+            self.incomplete_utf8.len = read_bytes as u8;
+            // No-op if no bytes.
+            bytes_copied += self.incomplete_utf8.read(&mut buf[bytes_copied..]);
+            Ok(bytes_copied)
+        } else {
+            let mut utf16_buf = [0u16; MAX_BUFFER_SIZE / 2];
+            // In the worst case, a UTF-8 string can take 3 bytes for every `u16` of a UTF-16. So
+            // we can read at most a third of `buf.len()` chars and uphold the guarantee no data gets
+            // lost.
+            let amount = cmp::min(buf.len() / 3, utf16_buf.len());
+            let read =
+                read_u16s_fixup_surrogates(handle, &mut utf16_buf, amount, &mut self.surrogate)?;
+
+            match utf16_to_utf8(&utf16_buf[..read], buf) {
+                Ok(value) => return Ok(bytes_copied + value),
+                Err(e) => return Err(e),
+            }
         }
-
-        let mut utf16_buf = [0u16; MAX_BUFFER_SIZE / 2];
-        // In the worst case, a UTF-8 string can take 3 bytes for every `u16` of a UTF-16. So
-        // we can read at most a third of `buf.len()` chars and uphold the guarantee no data gets
-        // lost.
-        let amount = cmp::min(buf.len() / 3, utf16_buf.len());
-        let read = read_u16s_fixup_surrogates(handle, &mut utf16_buf, amount, &mut self.surrogate)?;
-
-        utf16_to_utf8(&utf16_buf[..read], buf)
     }
 }