Skip to content

Commit a38dbf3

Browse files
committed
Mark \u parsing as cold
This counterintuitively speeds up parsing of War and Peace from 275 to 290 MB/s (+5%) by enabling inlining of encode_utf8 and extend_from_slice.
1 parent cf771a0 commit a38dbf3

File tree

1 file changed

+81
-71
lines changed

1 file changed

+81
-71
lines changed

src/read.rs

+81-71
Original file line numberDiff line numberDiff line change
@@ -882,87 +882,97 @@ fn parse_escape<'de, R: Read<'de>>(
882882
b'n' => scratch.push(b'\n'),
883883
b'r' => scratch.push(b'\r'),
884884
b't' => scratch.push(b'\t'),
885-
b'u' => {
886-
fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
887-
scratch.extend_from_slice(&[
888-
(n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
889-
(n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
890-
(n & 0b0011_1111) as u8 | 0b1000_0000,
891-
]);
892-
}
893-
894-
let c = match tri!(read.decode_hex_escape()) {
895-
n @ 0xDC00..=0xDFFF => {
896-
return if validate {
897-
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
898-
} else {
899-
encode_surrogate(scratch, n);
900-
Ok(())
901-
};
902-
}
885+
b'u' => return parse_unicode_escape(read, validate, scratch),
886+
_ => {
887+
return error(read, ErrorCode::InvalidEscape);
888+
}
889+
}
903890

904-
// Non-BMP characters are encoded as a sequence of two hex
905-
// escapes, representing UTF-16 surrogates. If deserializing a
906-
// utf-8 string the surrogates are required to be paired,
907-
// whereas deserializing a byte string accepts lone surrogates.
908-
n1 @ 0xD800..=0xDBFF => {
909-
if tri!(peek_or_eof(read)) == b'\\' {
910-
read.discard();
911-
} else {
912-
return if validate {
913-
read.discard();
914-
error(read, ErrorCode::UnexpectedEndOfHexEscape)
915-
} else {
916-
encode_surrogate(scratch, n1);
917-
Ok(())
918-
};
919-
}
891+
Ok(())
892+
}
920893

921-
if tri!(peek_or_eof(read)) == b'u' {
922-
read.discard();
923-
} else {
924-
return if validate {
925-
read.discard();
926-
error(read, ErrorCode::UnexpectedEndOfHexEscape)
927-
} else {
928-
encode_surrogate(scratch, n1);
929-
// The \ prior to this byte started an escape sequence,
930-
// so we need to parse that now. This recursive call
931-
// does not blow the stack on malicious input because
932-
// the escape is not \u, so it will be handled by one
933-
// of the easy nonrecursive cases.
934-
parse_escape(read, validate, scratch)
935-
};
936-
}
894+
/// Parses a JSON \u escape and appends it into the scratch space. Assumes \u
895+
/// has just been read.
896+
#[cold]
897+
fn parse_unicode_escape<'de, R: Read<'de>>(
898+
read: &mut R,
899+
validate: bool,
900+
scratch: &mut Vec<u8>,
901+
) -> Result<()> {
902+
fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
903+
scratch.extend_from_slice(&[
904+
(n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
905+
(n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
906+
(n & 0b0011_1111) as u8 | 0b1000_0000,
907+
]);
908+
}
909+
910+
let c = match tri!(read.decode_hex_escape()) {
911+
n @ 0xDC00..=0xDFFF => {
912+
return if validate {
913+
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
914+
} else {
915+
encode_surrogate(scratch, n);
916+
Ok(())
917+
};
918+
}
937919

938-
let n2 = tri!(read.decode_hex_escape());
920+
// Non-BMP characters are encoded as a sequence of two hex
921+
// escapes, representing UTF-16 surrogates. If deserializing a
922+
// utf-8 string the surrogates are required to be paired,
923+
// whereas deserializing a byte string accepts lone surrogates.
924+
n1 @ 0xD800..=0xDBFF => {
925+
if tri!(peek_or_eof(read)) == b'\\' {
926+
read.discard();
927+
} else {
928+
return if validate {
929+
read.discard();
930+
error(read, ErrorCode::UnexpectedEndOfHexEscape)
931+
} else {
932+
encode_surrogate(scratch, n1);
933+
Ok(())
934+
};
935+
}
939936

940-
if n2 < 0xDC00 || n2 > 0xDFFF {
941-
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
942-
}
937+
if tri!(peek_or_eof(read)) == b'u' {
938+
read.discard();
939+
} else {
940+
return if validate {
941+
read.discard();
942+
error(read, ErrorCode::UnexpectedEndOfHexEscape)
943+
} else {
944+
encode_surrogate(scratch, n1);
945+
// The \ prior to this byte started an escape sequence,
946+
// so we need to parse that now. This recursive call
947+
// does not blow the stack on malicious input because
948+
// the escape is not \u, so it will be handled by one
949+
// of the easy nonrecursive cases.
950+
parse_escape(read, validate, scratch)
951+
};
952+
}
943953

944-
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
954+
let n2 = tri!(read.decode_hex_escape());
945955

946-
match char::from_u32(n) {
947-
Some(c) => c,
948-
None => {
949-
return error(read, ErrorCode::InvalidUnicodeCodePoint);
950-
}
951-
}
952-
}
956+
if n2 < 0xDC00 || n2 > 0xDFFF {
957+
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
958+
}
953959

954-
// Every u16 outside of the surrogate ranges above is guaranteed
955-
// to be a legal char.
956-
n => char::from_u32(n as u32).unwrap(),
957-
};
960+
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
958961

959-
scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes());
960-
}
961-
_ => {
962-
return error(read, ErrorCode::InvalidEscape);
962+
match char::from_u32(n) {
963+
Some(c) => c,
964+
None => {
965+
return error(read, ErrorCode::InvalidUnicodeCodePoint);
966+
}
967+
}
963968
}
964-
}
965969

970+
// Every u16 outside of the surrogate ranges above is guaranteed
971+
// to be a legal char.
972+
n => char::from_u32(n as u32).unwrap(),
973+
};
974+
975+
scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes());
966976
Ok(())
967977
}
968978

0 commit comments

Comments
 (0)