Skip to content

Commit 96ae604

Browse files
Correct WTF-8 parsing
Closes #877. This is a good time to make ByteBuf parsing more consistent as I'm rewriting it anyway. This commit integrates the changes from #877 and also handles a leading surrogate followed by a surrogate pair correctly. This does not affect performance significantly. Co-authored-by: Luca Casonato <[email protected]>
1 parent 236cc82 commit 96ae604

File tree

3 files changed

+103
-50
lines changed

3 files changed

+103
-50
lines changed

src/de.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -1575,7 +1575,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
15751575
///
15761576
/// The behavior of serde_json is specified to fail on non-UTF-8 strings
15771577
/// when deserializing into Rust UTF-8 string types such as String, and
1578-
/// succeed with non-UTF-8 bytes when deserializing using this method.
1578+
/// succeed with the bytes representing the [WTF-8] encoding of code points
1579+
/// when deserializing using this method.
1580+
///
1581+
/// [WTF-8]: https://simonsapin.github.io/wtf-8
15791582
///
15801583
/// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
15811584
/// still checked if the hex number represents a valid Unicode code point.

src/read.rs

+49-41
Original file line numberDiff line numberDiff line change
@@ -898,7 +898,7 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
898898
validate: bool,
899899
scratch: &mut Vec<u8>,
900900
) -> Result<()> {
901-
let n = tri!(read.decode_hex_escape());
901+
let mut n = tri!(read.decode_hex_escape());
902902

903903
// Non-BMP characters are encoded as a sequence of two hex
904904
// escapes, representing UTF-16 surrogates. If deserializing a
@@ -909,56 +909,64 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
909909
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
910910
}
911911

912-
if n < 0xD800 || n > 0xDBFF {
913-
// Every u16 outside of the surrogate ranges is guaranteed to be a
914-
// legal char.
915-
push_wtf8_codepoint(n as u32, scratch);
916-
return Ok(());
917-
}
912+
loop {
913+
if n < 0xD800 || n > 0xDBFF {
914+
// Every u16 outside of the surrogate ranges is guaranteed to be a
915+
// legal char.
916+
push_wtf8_codepoint(n as u32, scratch);
917+
return Ok(());
918+
}
918919

919-
// n is a leading surrogate, we now expect a trailing surrogate.
920-
let n1 = n;
920+
// n is a leading surrogate, we now expect a trailing surrogate.
921+
let n1 = n;
921922

922-
if tri!(peek_or_eof(read)) == b'\\' {
923-
read.discard();
924-
} else {
925-
return if validate {
923+
if tri!(peek_or_eof(read)) == b'\\' {
926924
read.discard();
927-
error(read, ErrorCode::UnexpectedEndOfHexEscape)
928925
} else {
929-
push_wtf8_codepoint(n1 as u32, scratch);
930-
Ok(())
931-
};
932-
}
926+
return if validate {
927+
read.discard();
928+
error(read, ErrorCode::UnexpectedEndOfHexEscape)
929+
} else {
930+
push_wtf8_codepoint(n1 as u32, scratch);
931+
Ok(())
932+
};
933+
}
933934

934-
if tri!(peek_or_eof(read)) == b'u' {
935-
read.discard();
936-
} else {
937-
return if validate {
935+
if tri!(peek_or_eof(read)) == b'u' {
938936
read.discard();
939-
error(read, ErrorCode::UnexpectedEndOfHexEscape)
940937
} else {
941-
push_wtf8_codepoint(n1 as u32, scratch);
942-
// The \ prior to this byte started an escape sequence,
943-
// so we need to parse that now. This recursive call
944-
// does not blow the stack on malicious input because
945-
// the escape is not \u, so it will be handled by one
946-
// of the easy nonrecursive cases.
947-
parse_escape(read, validate, scratch)
948-
};
949-
}
938+
return if validate {
939+
read.discard();
940+
error(read, ErrorCode::UnexpectedEndOfHexEscape)
941+
} else {
942+
push_wtf8_codepoint(n1 as u32, scratch);
943+
// The \ prior to this byte started an escape sequence,
944+
// so we need to parse that now. This recursive call
945+
// does not blow the stack on malicious input because
946+
// the escape is not \u, so it will be handled by one
947+
// of the easy nonrecursive cases.
948+
parse_escape(read, validate, scratch)
949+
};
950+
}
950951

951-
let n2 = tri!(read.decode_hex_escape());
952+
let n2 = tri!(read.decode_hex_escape());
952953

953-
if n2 < 0xDC00 || n2 > 0xDFFF {
954-
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
955-
}
954+
if n2 < 0xDC00 || n2 > 0xDFFF {
955+
if validate {
956+
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
957+
}
958+
push_wtf8_codepoint(n1 as u32, scratch);
959+
// If n2 is a leading surrogate, we need to restart.
960+
n = n2;
961+
continue;
962+
}
956963

957-
// This value is in range U+10000..=U+10FFFF, which is always a
958-
// valid codepoint.
959-
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
960-
push_wtf8_codepoint(n, scratch);
961-
Ok(())
964+
// This value is in range U+10000..=U+10FFFF, which is always a
965+
// valid codepoint.
966+
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
967+
push_wtf8_codepoint(n, scratch);
968+
return Ok(());
969+
}
962970
}
963971

964972
/// Adds a WTF-8 codepoint to the end of the buffer. This is a more efficient

tests/test.rs

+50-8
Original file line numberDiff line numberDiff line change
@@ -1707,7 +1707,7 @@ fn test_byte_buf_de() {
17071707
}
17081708

17091709
#[test]
1710-
fn test_byte_buf_de_lone_surrogate() {
1710+
fn test_byte_buf_de_invalid_surrogates() {
17111711
let bytes = ByteBuf::from(vec![237, 160, 188]);
17121712
let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
17131713
assert_eq!(v, bytes);
@@ -1720,23 +1720,54 @@ fn test_byte_buf_de_lone_surrogate() {
17201720
let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
17211721
assert_eq!(v, bytes);
17221722

1723-
let bytes = ByteBuf::from(vec![237, 176, 129]);
1724-
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
1725-
assert_eq!(v, bytes);
1726-
17271723
let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
17281724
assert!(res.is_err());
17291725

17301726
let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
17311727
assert!(res.is_err());
17321728

1733-
let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
1734-
assert!(res.is_err());
1729+
// lone trailing surrogate
1730+
let bytes = ByteBuf::from(vec![237, 176, 129]);
1731+
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
1732+
assert_eq!(v, bytes);
1733+
1734+
// leading surrogate followed by other leading surrogate
1735+
let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
1736+
let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
1737+
assert_eq!(v, bytes);
1738+
1739+
// leading surrogate followed by "a" (U+0061) in \u encoding
1740+
let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
1741+
let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
1742+
assert_eq!(v, bytes);
1743+
1744+
// leading surrogate followed by U+0080
1745+
let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
1746+
let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
1747+
assert_eq!(v, bytes);
1748+
1749+
// leading surrogate followed by U+FFFF
1750+
let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
1751+
let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
1752+
assert_eq!(v, bytes);
1753+
}
1754+
1755+
#[test]
1756+
fn test_byte_buf_de_surrogate_pair() {
1757+
// leading surrogate followed by trailing surrogate
1758+
let bytes = ByteBuf::from(vec![240, 159, 128, 128]);
1759+
let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
1760+
assert_eq!(v, bytes);
1761+
1762+
// leading surrogate followed by a surrogate pair
1763+
let bytes = ByteBuf::from(vec![237, 160, 188, 240, 159, 128, 128]);
1764+
let v: ByteBuf = from_str(r#""\ud83c\ud83c\udc00""#).unwrap();
1765+
assert_eq!(v, bytes);
17351766
}
17361767

17371768
#[cfg(feature = "raw_value")]
17381769
#[test]
1739-
fn test_raw_de_lone_surrogate() {
1770+
fn test_raw_de_invalid_surrogates() {
17401771
use serde_json::value::RawValue;
17411772

17421773
assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
@@ -1746,6 +1777,17 @@ fn test_raw_de_lone_surrogate() {
17461777
assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
17471778
assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
17481779
assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
1780+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
1781+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
1782+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
1783+
}
1784+
1785+
#[cfg(feature = "raw_value")]
1786+
#[test]
1787+
fn test_raw_de_surrogate_pair() {
1788+
use serde_json::value::RawValue;
1789+
1790+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
17491791
}
17501792

17511793
#[test]

0 commit comments

Comments
 (0)