@@ -882,87 +882,97 @@ fn parse_escape<'de, R: Read<'de>>(
882
882
b'n' => scratch. push ( b'\n' ) ,
883
883
b'r' => scratch. push ( b'\r' ) ,
884
884
b't' => scratch. push ( b'\t' ) ,
885
- b'u' => {
886
- fn encode_surrogate ( scratch : & mut Vec < u8 > , n : u16 ) {
887
- scratch. extend_from_slice ( & [
888
- ( n >> 12 & 0b0000_1111 ) as u8 | 0b1110_0000 ,
889
- ( n >> 6 & 0b0011_1111 ) as u8 | 0b1000_0000 ,
890
- ( n & 0b0011_1111 ) as u8 | 0b1000_0000 ,
891
- ] ) ;
892
- }
893
-
894
- let c = match tri ! ( read. decode_hex_escape( ) ) {
895
- n @ 0xDC00 ..=0xDFFF => {
896
- return if validate {
897
- error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape )
898
- } else {
899
- encode_surrogate ( scratch, n) ;
900
- Ok ( ( ) )
901
- } ;
902
- }
885
+ b'u' => return parse_unicode_escape ( read, validate, scratch) ,
886
+ _ => {
887
+ return error ( read, ErrorCode :: InvalidEscape ) ;
888
+ }
889
+ }
903
890
904
- // Non-BMP characters are encoded as a sequence of two hex
905
- // escapes, representing UTF-16 surrogates. If deserializing a
906
- // utf-8 string the surrogates are required to be paired,
907
- // whereas deserializing a byte string accepts lone surrogates.
908
- n1 @ 0xD800 ..=0xDBFF => {
909
- if tri ! ( peek_or_eof( read) ) == b'\\' {
910
- read. discard ( ) ;
911
- } else {
912
- return if validate {
913
- read. discard ( ) ;
914
- error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
915
- } else {
916
- encode_surrogate ( scratch, n1) ;
917
- Ok ( ( ) )
918
- } ;
919
- }
891
+ Ok ( ( ) )
892
+ }
920
893
921
- if tri ! ( peek_or_eof( read) ) == b'u' {
922
- read. discard ( ) ;
923
- } else {
924
- return if validate {
925
- read. discard ( ) ;
926
- error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
927
- } else {
928
- encode_surrogate ( scratch, n1) ;
929
- // The \ prior to this byte started an escape sequence,
930
- // so we need to parse that now. This recursive call
931
- // does not blow the stack on malicious input because
932
- // the escape is not \u, so it will be handled by one
933
- // of the easy nonrecursive cases.
934
- parse_escape ( read, validate, scratch)
935
- } ;
936
- }
894
+ /// Parses a JSON \u escape and appends it into the scratch space. Assumes \u
895
+ /// has just been read.
896
+ #[ cold]
897
+ fn parse_unicode_escape < ' de , R : Read < ' de > > (
898
+ read : & mut R ,
899
+ validate : bool ,
900
+ scratch : & mut Vec < u8 > ,
901
+ ) -> Result < ( ) > {
902
+ fn encode_surrogate ( scratch : & mut Vec < u8 > , n : u16 ) {
903
+ scratch. extend_from_slice ( & [
904
+ ( n >> 12 & 0b0000_1111 ) as u8 | 0b1110_0000 ,
905
+ ( n >> 6 & 0b0011_1111 ) as u8 | 0b1000_0000 ,
906
+ ( n & 0b0011_1111 ) as u8 | 0b1000_0000 ,
907
+ ] ) ;
908
+ }
909
+
910
+ let c = match tri ! ( read. decode_hex_escape( ) ) {
911
+ n @ 0xDC00 ..=0xDFFF => {
912
+ return if validate {
913
+ error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape )
914
+ } else {
915
+ encode_surrogate ( scratch, n) ;
916
+ Ok ( ( ) )
917
+ } ;
918
+ }
937
919
938
- let n2 = tri ! ( read. decode_hex_escape( ) ) ;
920
+ // Non-BMP characters are encoded as a sequence of two hex
921
+ // escapes, representing UTF-16 surrogates. If deserializing a
922
+ // utf-8 string the surrogates are required to be paired,
923
+ // whereas deserializing a byte string accepts lone surrogates.
924
+ n1 @ 0xD800 ..=0xDBFF => {
925
+ if tri ! ( peek_or_eof( read) ) == b'\\' {
926
+ read. discard ( ) ;
927
+ } else {
928
+ return if validate {
929
+ read. discard ( ) ;
930
+ error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
931
+ } else {
932
+ encode_surrogate ( scratch, n1) ;
933
+ Ok ( ( ) )
934
+ } ;
935
+ }
939
936
940
- if n2 < 0xDC00 || n2 > 0xDFFF {
941
- return error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape ) ;
942
- }
937
+ if tri ! ( peek_or_eof( read) ) == b'u' {
938
+ read. discard ( ) ;
939
+ } else {
940
+ return if validate {
941
+ read. discard ( ) ;
942
+ error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
943
+ } else {
944
+ encode_surrogate ( scratch, n1) ;
945
+ // The \ prior to this byte started an escape sequence,
946
+ // so we need to parse that now. This recursive call
947
+ // does not blow the stack on malicious input because
948
+ // the escape is not \u, so it will be handled by one
949
+ // of the easy nonrecursive cases.
950
+ parse_escape ( read, validate, scratch)
951
+ } ;
952
+ }
943
953
944
- let n = ( ( ( n1 - 0xD800 ) as u32 ) << 10 | ( n2 - 0xDC00 ) as u32 ) + 0x1_0000 ;
954
+ let n2 = tri ! ( read . decode_hex_escape ( ) ) ;
945
955
946
- match char:: from_u32 ( n) {
947
- Some ( c) => c,
948
- None => {
949
- return error ( read, ErrorCode :: InvalidUnicodeCodePoint ) ;
950
- }
951
- }
952
- }
956
+ if n2 < 0xDC00 || n2 > 0xDFFF {
957
+ return error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape ) ;
958
+ }
953
959
954
- // Every u16 outside of the surrogate ranges above is guaranteed
955
- // to be a legal char.
956
- n => char:: from_u32 ( n as u32 ) . unwrap ( ) ,
957
- } ;
960
+ let n = ( ( ( n1 - 0xD800 ) as u32 ) << 10 | ( n2 - 0xDC00 ) as u32 ) + 0x1_0000 ;
958
961
959
- scratch. extend_from_slice ( c. encode_utf8 ( & mut [ 0_u8 ; 4 ] ) . as_bytes ( ) ) ;
960
- }
961
- _ => {
962
- return error ( read, ErrorCode :: InvalidEscape ) ;
962
+ match char:: from_u32 ( n) {
963
+ Some ( c) => c,
964
+ None => {
965
+ return error ( read, ErrorCode :: InvalidUnicodeCodePoint ) ;
966
+ }
967
+ }
963
968
}
964
- }
965
969
970
+ // Every u16 outside of the surrogate ranges above is guaranteed
971
+ // to be a legal char.
972
+ n => char:: from_u32 ( n as u32 ) . unwrap ( ) ,
973
+ } ;
974
+
975
+ scratch. extend_from_slice ( c. encode_utf8 ( & mut [ 0_u8 ; 4 ] ) . as_bytes ( ) ) ;
966
976
Ok ( ( ) )
967
977
}
968
978
0 commit comments