@@ -564,6 +564,18 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
564
564
Section: Misc
565
565
*/
566
566
567
+ // Return the initial code point accumulator: the payload bits of lead byte `$byte` of a `$width`-byte sequence, as a `uint`.
568
+ // The lead byte is special: only its bottom 5 bits carry data for width 2, 4 bits
569
+ // for width 3, and 3 bits for width 4, hence the mask `0x7F >> $width` (0x1F, 0x0F, 0x07).
570
+ macro_rules! utf8_first_byte(
571
+ ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as uint)
572
+ )
573
+
574
+ // Return the value of `$ch` shifted left by 6 bits with the low 6 bits of continuation byte `$byte` OR-ed in.
575
+ macro_rules! utf8_acc_cont_byte(
576
+ ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as uint) // `& 63u8` drops the top two (tag) bits of the continuation byte
577
+ )
578
+
567
579
/// Determines if a vector of bytes contains valid UTF-8
568
580
pub fn is_utf8 ( v : & [ u8 ] ) -> bool {
569
581
let mut i = 0 u;
@@ -577,11 +589,26 @@ pub fn is_utf8(v: &[u8]) -> bool {
577
589
578
590
let nexti = i + w;
579
591
if nexti > total { return false ; }
592
+ // 1. Make sure the correct number of continuation bytes is present
593
+ // 2. Check codepoint ranges (deny overlong encodings)
594
+ // 2-byte encoding is for codepoints \u0080 to \u07ff
595
+ // 3-byte encoding is for codepoints \u0800 to \uffff
596
+ // 4-byte encoding is for codepoints \u10000 to \u10ffff
580
597
598
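+ // 2-byte encodings need no range check here: lead bytes 0xC0 and 0xC1 (the only overlong 2-byte leads) have width 0 in UTF8_CHAR_WIDTH, so only the continuation byte is checked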
581
599
if v[ i + 1 ] & 192u8 != TAG_CONT_U8 { return false ; }
582
600
if w > 2 {
601
+ let mut ch;
602
+ ch = utf8_first_byte ! ( v[ i] , w) ;
603
+ ch = utf8_acc_cont_byte ! ( ch, v[ i + 1 ] ) ;
583
604
if v[ i + 2 ] & 192u8 != TAG_CONT_U8 { return false ; }
584
- if w > 3 && ( v[ i + 3 ] & 192u8 != TAG_CONT_U8 ) { return false ; }
605
+ ch = utf8_acc_cont_byte ! ( ch, v[ i + 2 ] ) ;
606
+ if w == 3 && ch < MAX_TWO_B { return false ; }
607
+ if w > 3 {
608
+ if v[ i + 3 ] & 192u8 != TAG_CONT_U8 { return false ; }
609
+ ch = utf8_acc_cont_byte ! ( ch, v[ i + 3 ] ) ;
610
+ if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false ; }
611
+ }
585
612
}
586
613
587
614
i = nexti;
@@ -712,7 +739,7 @@ static UTF8_CHAR_WIDTH: [u8, ..256] = [
712
739
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0x9F
713
740
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
714
741
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xBF
715
- 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
742
+ 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
716
743
2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , // 0xDF
717
744
3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // 0xEF
718
745
4 , 4 , 4 , 4 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xFF
@@ -738,6 +765,7 @@ static MAX_TWO_B: uint = 2048u;
738
765
static TAG_THREE_B : uint = 224 u; // 0xE0 (0b1110_0000)
739
766
static MAX_THREE_B : uint = 65536 u; // 0x10000; is_utf8 rejects 4-byte sequences that decode below this as overlong
740
767
static TAG_FOUR_B : uint = 240 u; // 0xF0 (0b1111_0000)
768
+ static MAX_UNICODE : uint = 1114112 u; // 0x110000, one past the largest code point 0x10FFFF; used as an exclusive upper bound
741
769
742
770
/// Unsafe operations
743
771
pub mod raw {
@@ -1665,12 +1693,10 @@ impl<'self> StrSlice<'self> for &'self str {
1665
1693
let w = UTF8_CHAR_WIDTH[val] as uint;
1666
1694
assert!((w != 0));
1667
1695
1668
- // First byte is special, only want bottom 5 bits for width 2, 4 bits
1669
- // for width 3, and 3 bits for width 4
1670
- val &= 0x7Fu >> w;
1671
- val = (val << 6) | (s[i + 1] & 63u8) as uint;
1672
- if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
1673
- if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
1696
+ val = utf8_first_byte!(val, w);
1697
+ val = utf8_acc_cont_byte!(val, s[i + 1]);
1698
+ if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
1699
+ if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
1674
1700
1675
1701
return CharRange {ch: val as char, next: i + w};
1676
1702
}
@@ -2035,7 +2061,7 @@ impl OwnedStr for ~str {
2035
2061
/// Appends a character to the back of a string
2036
2062
#[inline]
2037
2063
fn push_char(&mut self, c: char) {
2038
- assert!(c as uint <= 0x10ffff ); // FIXME: #7609: should be enforced on all `char`
2064
+ assert!(( c as uint) < MAX_UNICODE ); // FIXME: #7609: should be enforced on all `char`
2039
2065
unsafe {
2040
2066
let code = c as uint;
2041
2067
let nb = if code < MAX_ONE_B { 1u }
@@ -2802,6 +2828,17 @@ mod tests {
2802
2828
assert_eq!(ss, from_bytes(bb));
2803
2829
}
2804
2830
2831
+ #[test]
2832
+ fn test_is_utf8_deny_overlong() { // every input encodes a code point using more bytes than necessary and must be rejected
2833
+ assert!(!is_utf8([0xc0, 0x80])); // 2-byte form of U+0000
2834
+ assert!(!is_utf8([0xc0, 0xae])); // 2-byte form of U+002E ('.')
2835
+ assert!(!is_utf8([0xe0, 0x80, 0x80])); // 3-byte form of U+0000
2836
+ assert!(!is_utf8([0xe0, 0x80, 0xaf])); // 3-byte form of U+002F ('/')
2837
+ assert!(!is_utf8([0xe0, 0x81, 0x81])); // 3-byte form of U+0041 ('A')
2838
+ assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac])); // 4-byte form of U+20AC (euro sign)
2839
+ }
2840
+
2841
+
2805
2842
#[test]
2806
2843
#[ignore(cfg(windows))]
2807
2844
fn test_from_bytes_fail() {
0 commit comments