Correct WTF-8 parsing

purplesyringa · lucacasonato · purplesyringa · commit 96ae60445df6 · 2024-08-12T21:12:03.000+03:00
Closes #877. This is a good time to make ByteBuf parsing more consistent as I'm rewriting it anyway. This commit integrates the changes from #877 and also handles a leading surrogate followed by a surrogate pair correctly. This does not affect performance significantly. Co-authored-by: Luca Casonato <hello@lcas.dev>
diff --git a/src/de.rs b/src/de.rs
@@ -1575,7 +1575,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
     ///
     /// The behavior of serde_json is specified to fail on non-UTF-8 strings
     /// when deserializing into Rust UTF-8 string types such as String, and
-    /// succeed with non-UTF-8 bytes when deserializing using this method.
+    /// succeed with the [WTF-8] encoding of the string's code points (lone
+    /// surrogates included) when deserializing using this method.
+    ///
+    /// [WTF-8]: https://simonsapin.github.io/wtf-8
     ///
     /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
     /// still checked if the hex number represents a valid Unicode code point.
diff --git a/src/read.rs b/src/read.rs
@@ -898,7 +898,7 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
     validate: bool,
     scratch: &mut Vec<u8>,
 ) -> Result<()> {
-    let n = tri!(read.decode_hex_escape());
+    let mut n = tri!(read.decode_hex_escape());
 
     // Non-BMP characters are encoded as a sequence of two hex
     // escapes, representing UTF-16 surrogates. If deserializing a
@@ -909,56 +909,64 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
         return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
     }
 
-    if n < 0xD800 || n > 0xDBFF {
-        // Every u16 outside of the surrogate ranges is guaranteed to be a
-        // legal char.
-        push_wtf8_codepoint(n as u32, scratch);
-        return Ok(());
-    }
+    loop {
+        if n < 0xD800 || n > 0xDBFF {
+            // Not a leading surrogate: either a legal BMP char or a lone
+            // trailing surrogate (accepted for byte strings); emit as WTF-8.
+            push_wtf8_codepoint(n as u32, scratch);
+            return Ok(());
+        }
 
-    // n is a leading surrogate, we now expect a trailing surrogate.
-    let n1 = n;
+        // n is a leading surrogate, we now expect a trailing surrogate.
+        let n1 = n;
 
-    if tri!(peek_or_eof(read)) == b'\\' {
-        read.discard();
-    } else {
-        return if validate {
+        if tri!(peek_or_eof(read)) == b'\\' {
             read.discard();
-            error(read, ErrorCode::UnexpectedEndOfHexEscape)
         } else {
-            push_wtf8_codepoint(n1 as u32, scratch);
-            Ok(())
-        };
-    }
+            return if validate {
+                read.discard();
+                error(read, ErrorCode::UnexpectedEndOfHexEscape)
+            } else {
+                push_wtf8_codepoint(n1 as u32, scratch);
+                Ok(())
+            };
+        }
 
-    if tri!(peek_or_eof(read)) == b'u' {
-        read.discard();
-    } else {
-        return if validate {
+        if tri!(peek_or_eof(read)) == b'u' {
             read.discard();
-            error(read, ErrorCode::UnexpectedEndOfHexEscape)
         } else {
-            push_wtf8_codepoint(n1 as u32, scratch);
-            // The \ prior to this byte started an escape sequence,
-            // so we need to parse that now. This recursive call
-            // does not blow the stack on malicious input because
-            // the escape is not \u, so it will be handled by one
-            // of the easy nonrecursive cases.
-            parse_escape(read, validate, scratch)
-        };
-    }
+            return if validate {
+                read.discard();
+                error(read, ErrorCode::UnexpectedEndOfHexEscape)
+            } else {
+                push_wtf8_codepoint(n1 as u32, scratch);
+                // The \ prior to this byte started an escape sequence,
+                // so we need to parse that now. This recursive call
+                // does not blow the stack on malicious input because
+                // the escape is not \u, so it will be handled by one
+                // of the easy nonrecursive cases.
+                parse_escape(read, validate, scratch)
+            };
+        }
 
-    let n2 = tri!(read.decode_hex_escape());
+        let n2 = tri!(read.decode_hex_escape());
 
-    if n2 < 0xDC00 || n2 > 0xDFFF {
-        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
-    }
+        if n2 < 0xDC00 || n2 > 0xDFFF {
+            if validate {
+                return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+            }
+            push_wtf8_codepoint(n1 as u32, scratch);
+            // If n2 is a leading surrogate, we need to restart.
+            n = n2;
+            continue;
+        }
 
-    // This value is in range U+10000..=U+10FFFF, which is always a
-    // valid codepoint.
-    let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
-    push_wtf8_codepoint(n, scratch);
-    Ok(())
+        // This value is in range U+10000..=U+10FFFF, which is always a
+        // valid codepoint.
+        let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
+        push_wtf8_codepoint(n, scratch);
+        return Ok(());
+    }
 }
 
 /// Adds a WTF-8 codepoint to the end of the buffer. This is a more efficient
diff --git a/tests/test.rs b/tests/test.rs
@@ -1707,7 +1707,7 @@ fn test_byte_buf_de() {
 }
 
 #[test]
-fn test_byte_buf_de_lone_surrogate() {
+fn test_byte_buf_de_invalid_surrogates() {
     let bytes = ByteBuf::from(vec![237, 160, 188]);
     let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
     assert_eq!(v, bytes);
@@ -1720,23 +1720,54 @@ fn test_byte_buf_de_lone_surrogate() {
     let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
     assert_eq!(v, bytes);
 
-    let bytes = ByteBuf::from(vec![237, 176, 129]);
-    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
-    assert_eq!(v, bytes);
-
     let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
     assert!(res.is_err());
 
     let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
     assert!(res.is_err());
 
-    let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
-    assert!(res.is_err());
+    // lone trailing surrogate
+    let bytes = ByteBuf::from(vec![237, 176, 129]);
+    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by another leading surrogate
+    let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
+    let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by "a" (U+0061) in \u encoding
+    let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+0080
+    let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+FFFF
+    let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
+    let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
+    assert_eq!(v, bytes);
+}
+
+#[test]
+fn test_byte_buf_de_surrogate_pair() {
+    // leading surrogate followed by trailing surrogate
+    let bytes = ByteBuf::from(vec![240, 159, 128, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by a surrogate pair
+    let bytes = ByteBuf::from(vec![237, 160, 188, 240, 159, 128, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\ud83c\udc00""#).unwrap();
+    assert_eq!(v, bytes);
 }
 
 #[cfg(feature = "raw_value")]
 #[test]
-fn test_raw_de_lone_surrogate() {
+fn test_raw_de_invalid_surrogates() {
     use serde_json::value::RawValue;
 
     assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
@@ -1746,6 +1777,17 @@ fn test_raw_de_lone_surrogate() {
     assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
     assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
     assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
+}
+
+#[cfg(feature = "raw_value")]
+#[test]
+fn test_raw_de_surrogate_pair() {
+    use serde_json::value::RawValue;
+
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
 }
 
 #[test]