Skip to content

Commit 4c62ab1

Browse files
committed
auto merge of #13469 : kmcallister/rust/utf16, r=huonw
This fixes two separate issues related to character encoding.

* Add `encode_utf16` to the `Char` trait, analogous to `encode_utf8`. `&str` already supports UTF-16 encoding but only with a heap allocation. Also fix `encode_utf8` docs and add tests.
* Correctly decode non-BMP hex escapes in JSON (#13064).
2 parents 770b2fe + cee9a83 commit 4c62ab1

File tree

3 files changed

+126
-44
lines changed

3 files changed

+126
-44
lines changed

src/libserialize/json.rs

Lines changed: 65 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ use std::io::MemWriter;
239239
use std::io;
240240
use std::num;
241241
use std::str;
242+
use std::str::ScalarValue;
242243
use std::strbuf::StrBuf;
243244

244245
use Encodable;
@@ -1129,6 +1130,35 @@ impl<T : Iterator<char>> Parser<T> {
11291130
Ok(res)
11301131
}
11311132

1133+
/// Parses the four hexadecimal digits of a `\u` escape into a `u16`.
///
/// Every loop iteration calls `bump()` before inspecting `ch_or_null()`, so
/// the digits consumed are the four characters that follow the parser's
/// position on entry (presumably the `u` of the escape -- confirm against
/// the caller).
///
/// Returns an error if a character is not a hex digit (upper or lower
/// case), or if the loop stops on `eof()` with fewer than four digits
/// accumulated.
fn decode_hex_escape(&mut self) -> DecodeResult<u16> {
1134+
// `i` counts the digits consumed so far; `n` accumulates their value.
let mut i = 0u;
1135+
let mut n = 0u16;
1136+
while i < 4u && !self.eof() {
1137+
self.bump();
1138+
// Shift the accumulated value left by one hex digit and add the new
// digit. At most four digits are read, so the result fits in a u16.
n = match self.ch_or_null() {
1139+
c @ '0' .. '9' => n * 16_u16 + ((c as u16) - ('0' as u16)),
1140+
'a' | 'A' => n * 16_u16 + 10_u16,
1141+
'b' | 'B' => n * 16_u16 + 11_u16,
1142+
'c' | 'C' => n * 16_u16 + 12_u16,
1143+
'd' | 'D' => n * 16_u16 + 13_u16,
1144+
'e' | 'E' => n * 16_u16 + 14_u16,
1145+
'f' | 'F' => n * 16_u16 + 15_u16,
1146+
_ => return self.error(
1147+
~"invalid \\u escape (unrecognized hex)")
1148+
};
1149+
1150+
i += 1u;
1151+
}
1152+
1153+
// Error out if we didn't parse 4 digits.
1154+
if i != 4u {
1155+
return self.error(
1156+
~"invalid \\u escape (not four digits)");
1157+
}
1158+
1159+
Ok(n)
1160+
}
1161+
11321162
fn parse_str(&mut self) -> DecodeResult<~str> {
11331163
let mut escape = false;
11341164
let mut res = StrBuf::new();
@@ -1149,35 +1179,35 @@ impl<T : Iterator<char>> Parser<T> {
11491179
'n' => res.push_char('\n'),
11501180
'r' => res.push_char('\r'),
11511181
't' => res.push_char('\t'),
1152-
'u' => {
1153-
// Parse \u1234.
1154-
let mut i = 0u;
1155-
let mut n = 0u;
1156-
while i < 4u && !self.eof() {
1157-
self.bump();
1158-
n = match self.ch_or_null() {
1159-
c @ '0' .. '9' => n * 16u + (c as uint) - ('0' as uint),
1160-
'a' | 'A' => n * 16u + 10u,
1161-
'b' | 'B' => n * 16u + 11u,
1162-
'c' | 'C' => n * 16u + 12u,
1163-
'd' | 'D' => n * 16u + 13u,
1164-
'e' | 'E' => n * 16u + 14u,
1165-
'f' | 'F' => n * 16u + 15u,
1182+
'u' => match try!(self.decode_hex_escape()) {
1183+
0xDC00 .. 0xDFFF => return self.error(
1184+
~"lone trailing surrogate in hex escape"),
1185+
1186+
// Non-BMP characters are encoded as a sequence of
1187+
// two hex escapes, representing UTF-16 surrogates.
1188+
n1 @ 0xD800 .. 0xDBFF => {
1189+
let c1 = self.next_char();
1190+
let c2 = self.next_char();
1191+
match (c1, c2) {
1192+
(Some('\\'), Some('u')) => (),
11661193
_ => return self.error(
1167-
~"invalid \\u escape (unrecognized hex)")
1168-
};
1169-
1170-
i += 1u;
1171-
}
1194+
~"unexpected end of non-BMP hex escape"),
1195+
}
11721196

1173-
// Error out if we didn't parse 4 digits.
1174-
if i != 4u {
1175-
return self.error(
1176-
~"invalid \\u escape (not four digits)");
1197+
let buf = [n1, try!(self.decode_hex_escape())];
1198+
match str::utf16_items(buf.as_slice()).next() {
1199+
Some(ScalarValue(c)) => res.push_char(c),
1200+
_ => return self.error(
1201+
~"lone leading surrogate in hex escape"),
1202+
}
11771203
}
11781204

1179-
res.push_char(char::from_u32(n as u32).unwrap());
1180-
}
1205+
n => match char::from_u32(n as u32) {
1206+
Some(c) => res.push_char(c),
1207+
None => return self.error(
1208+
format!("invalid Unicode codepoint {:u}", n)),
1209+
},
1210+
},
11811211
_ => return self.error(~"invalid escape"),
11821212
}
11831213
escape = false;
@@ -2139,6 +2169,16 @@ mod tests {
21392169
assert_eq!(from_str(" \"foo\" "), Ok(String(~"foo")));
21402170
assert_eq!(from_str("\"\\u12ab\""), Ok(String(~"\u12ab")));
21412171
assert_eq!(from_str("\"\\uAB12\""), Ok(String(~"\uAB12")));
2172+
2173+
// Non-BMP escapes. The exact error messages and positions are kind of
2174+
// arbitrary.
2175+
assert_eq!(from_str("\"\\ud83d\\udca9\""), Ok(String(~"\U0001F4A9")));
2176+
assert!(from_str("\"\\ud83d\"").is_err());
2177+
assert!(from_str("\"\\udca9\"").is_err());
2178+
assert!(from_str("\"\\ud83d\\ud83d\"").is_err());
2179+
assert!(from_str("\"\\ud83dx\"").is_err());
2180+
assert!(from_str("\"\\udca9\\udca9\"").is_err());
2181+
assert!(from_str("\"\\udca9x\"").is_err());
21422182
}
21432183

21442184
#[test]

src/libstd/char.rs

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ use unicode::{derived_property, property, general_category, decompose, conversio
3232

3333
#[cfg(test)] use str::Str;
3434
#[cfg(test)] use strbuf::StrBuf;
35+
#[cfg(test)] use slice::ImmutableVector;
3536

3637
#[cfg(not(test))] use cmp::{Eq, Ord};
3738
#[cfg(not(test))] use default::Default;
@@ -560,11 +561,19 @@ pub trait Char {
560561

561562
/// Encodes this character as UTF-8 into the provided byte buffer.
562563
///
563-
/// The buffer must be at least 4 bytes long or a runtime failure will
564+
/// The buffer must be at least 4 bytes long or a runtime failure may
564565
/// occur.
565566
///
566-
/// This will then return the number of characters written to the slice.
567+
/// This will then return the number of bytes written to the slice.
567568
fn encode_utf8(&self, dst: &mut [u8]) -> uint;
569+
570+
/// Encodes this character as UTF-16 into the provided `u16` buffer.
571+
///
572+
/// The buffer must be at least 2 elements long or a runtime failure may
573+
/// occur.
574+
///
575+
/// This will then return the number of `u16`s written to the slice.
576+
fn encode_utf16(&self, dst: &mut [u16]) -> uint;
568577
}
569578

570579
impl Char for char {
@@ -602,7 +611,7 @@ impl Char for char {
602611

603612
fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
604613

605-
fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint {
614+
fn encode_utf8(&self, dst: &mut [u8]) -> uint {
606615
let code = *self as uint;
607616
if code < MAX_ONE_B {
608617
dst[0] = code as u8;
@@ -624,6 +633,24 @@ impl Char for char {
624633
return 4;
625634
}
626635
}
636+
637+
// Writes this character into `dst` as UTF-16: a single `u16` when the code
// point fits in 16 bits, otherwise a surrogate pair in `dst[0]` and
// `dst[1]`. Returns the number of `u16`s written (1 or 2).
fn encode_utf16(&self, dst: &mut [u16]) -> uint {
638+
let mut ch = *self as uint;
639+
// True exactly when `ch` has no bits set above the low 16.
if (ch & 0xFFFF_u) == ch {
640+
// The BMP falls through (assuming non-surrogate, as it
641+
// should)
642+
assert!(ch <= 0xD7FF_u || ch >= 0xE000_u);
643+
dst[0] = ch as u16;
644+
1
645+
} else {
646+
// Supplementary planes break into surrogates.
647+
assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u);
648+
// With the 0x1_0000 offset removed at most 20 bits remain: the upper
// 10 go into the leading surrogate (0xD800 base) and the lower 10 into
// the trailing surrogate (0xDC00 base).
ch -= 0x1_0000_u;
649+
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
650+
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
651+
2
652+
}
653+
}
627654
}
628655

629656
#[cfg(not(test))]
@@ -788,3 +815,31 @@ fn test_to_str() {
788815
let s = 't'.to_str();
789816
assert_eq!(s, ~"t");
790817
}
818+
819+
#[test]
820+
// Checks `encode_utf8` against hand-written byte sequences, one case for
// each encoded length from one to four bytes.
fn test_encode_utf8() {
821+
// Encodes `input` into a 4-byte scratch buffer and compares the first
// `n` bytes (the count returned by `encode_utf8`) with `expect`.
fn check(input: char, expect: &[u8]) {
822+
let mut buf = [0u8, ..4];
823+
let n = input.encode_utf8(buf /* as mut slice! */);
824+
assert_eq!(buf.slice_to(n), expect);
825+
}
826+
827+
check('x', [0x78]);
828+
check('\u00e9', [0xc3, 0xa9]);
829+
check('\ua66e', [0xea, 0x99, 0xae]);
830+
check('\U0001f4a9', [0xf0, 0x9f, 0x92, 0xa9]);
831+
}
832+
833+
#[test]
834+
// Checks `encode_utf16` on three characters that encode to a single `u16`
// and one that encodes to a surrogate pair.
fn test_encode_utf16() {
835+
// Encodes `input` into a 2-element scratch buffer and compares the first
// `n` units (the count returned by `encode_utf16`) with `expect`.
fn check(input: char, expect: &[u16]) {
836+
let mut buf = [0u16, ..2];
837+
let n = input.encode_utf16(buf /* as mut slice! */);
838+
assert_eq!(buf.slice_to(n), expect);
839+
}
840+
841+
check('x', [0x0078]);
842+
check('\u00e9', [0x00e9]);
843+
check('\ua66e', [0xa66e]);
844+
check('\U0001f4a9', [0xd83d, 0xdca9]);
845+
}

src/libstd/str.rs

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2555,22 +2555,9 @@ impl<'a> StrSlice<'a> for &'a str {
25552555
// Builds and returns an owned vector holding the UTF-16 code units of every
// character in this string, in order.
fn to_utf16(&self) -> ~[u16] {
25562556
let mut u = ~[];
25572557
for ch in self.chars() {
2558-
// Arithmetic with u32 literals is easier on the eyes than chars.
2559-
let mut ch = ch as u32;
2560-
2561-
if (ch & 0xFFFF_u32) == ch {
2562-
// The BMP falls through (assuming non-surrogate, as it
2563-
// should)
2564-
assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
2565-
u.push(ch as u16)
2566-
} else {
2567-
// Supplementary planes break into surrogates.
2568-
assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
2569-
ch -= 0x1_0000_u32;
2570-
let w1 = 0xD800_u16 | ((ch >> 10) as u16);
2571-
let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
2572-
u.push_all([w1, w2])
2573-
}
2558+
// Encode into a 2-element stack buffer, then append only the `n` units
// that `encode_utf16` reports having written.
let mut buf = [0u16, ..2];
2559+
let n = ch.encode_utf16(buf /* as mut slice! */);
2560+
u.push_all(buf.slice_to(n));
25742561
}
25752562
u
25762563
}

0 commit comments

Comments
 (0)