@@ -564,18 +564,6 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
564
564
Section: Misc
565
565
*/
566
566
567
- // Return the initial codepoint accumulator for the first byte.
568
- // The first byte is special, only want bottom 5 bits for width 2, 4 bits
569
- // for width 3, and 3 bits for width 4
570
- macro_rules! utf8_first_byte(
571
- ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as uint)
572
- )
573
-
574
- // return the value of $ch updated with continuation byte $byte
575
- macro_rules! utf8_acc_cont_byte(
576
- ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as uint)
577
- )
578
-
579
567
/// Determines if a vector of bytes contains valid UTF-8
580
568
pub fn is_utf8 ( v : & [ u8 ] ) -> bool {
581
569
let mut i = 0 u;
@@ -589,26 +577,11 @@ pub fn is_utf8(v: &[u8]) -> bool {
589
577
590
578
let nexti = i + w;
591
579
if nexti > total { return false ; }
592
- // 1. Make sure the correct number of continuation bytes are present
593
- // 2. Check codepoint ranges (deny overlong encodings)
594
- // 2-byte encoding is for codepoints \u0080 to \u07ff
595
- // 3-byte encoding is for codepoints \u0800 to \uffff
596
- // 4-byte encoding is for codepoints \u10000 to \u10ffff
597
580
598
- // 2-byte encodings are correct if the width and continuation match up
599
581
if v[ i + 1 ] & 192u8 != TAG_CONT_U8 { return false ; }
600
582
if w > 2 {
601
- let mut ch;
602
- ch = utf8_first_byte ! ( v[ i] , w) ;
603
- ch = utf8_acc_cont_byte ! ( ch, v[ i + 1 ] ) ;
604
583
if v[ i + 2 ] & 192u8 != TAG_CONT_U8 { return false ; }
605
- ch = utf8_acc_cont_byte ! ( ch, v[ i + 2 ] ) ;
606
- if w == 3 && ch < MAX_TWO_B { return false ; }
607
- if w > 3 {
608
- if v[ i + 3 ] & 192u8 != TAG_CONT_U8 { return false ; }
609
- ch = utf8_acc_cont_byte ! ( ch, v[ i + 3 ] ) ;
610
- if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false ; }
611
- }
584
+ if w > 3 && ( v[ i + 3 ] & 192u8 != TAG_CONT_U8 ) { return false ; }
612
585
}
613
586
614
587
i = nexti;
@@ -726,7 +699,7 @@ pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint {
726
699
}
727
700
728
701
// https://tools.ietf.org/html/rfc3629
729
- priv static UTF8_CHAR_WIDTH : [ u8 , ..256 ] = [
702
+ static UTF8_CHAR_WIDTH : [ u8 , ..256 ] = [
730
703
1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
731
704
1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x1F
732
705
1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
@@ -739,7 +712,7 @@ priv static UTF8_CHAR_WIDTH: [u8, ..256] = [
739
712
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0x9F
740
713
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
741
714
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xBF
742
- 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
715
+ 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
743
716
2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , // 0xDF
744
717
3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // 0xEF
745
718
4 , 4 , 4 , 4 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xFF
@@ -757,15 +730,14 @@ pub struct CharRange {
757
730
}
758
731
759
732
// UTF-8 tags and ranges
760
- priv static TAG_CONT_U8 : u8 = 128u8 ;
761
- priv static TAG_CONT : uint = 128 u;
762
- priv static MAX_ONE_B : uint = 128 u;
763
- priv static TAG_TWO_B : uint = 192 u;
764
- priv static MAX_TWO_B : uint = 2048 u;
765
- priv static TAG_THREE_B : uint = 224 u;
766
- priv static MAX_THREE_B : uint = 65536 u;
767
- priv static TAG_FOUR_B : uint = 240 u;
768
- priv static MAX_UNICODE : uint = 1114112 u;
733
+ static TAG_CONT_U8 : u8 = 128u8 ;
734
+ static TAG_CONT : uint = 128 u;
735
+ static MAX_ONE_B : uint = 128 u;
736
+ static TAG_TWO_B : uint = 192 u;
737
+ static MAX_TWO_B : uint = 2048 u;
738
+ static TAG_THREE_B : uint = 224 u;
739
+ static MAX_THREE_B : uint = 65536 u;
740
+ static TAG_FOUR_B : uint = 240 u;
769
741
770
742
/// Unsafe operations
771
743
pub mod raw {
@@ -1693,10 +1665,12 @@ impl<'self> StrSlice<'self> for &'self str {
1693
1665
let w = UTF8_CHAR_WIDTH[val] as uint;
1694
1666
assert!((w != 0));
1695
1667
1696
- val = utf8_first_byte!(val, w);
1697
- val = utf8_acc_cont_byte!(val, s[i + 1]);
1698
- if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
1699
- if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
1668
+ // First byte is special, only want bottom 5 bits for width 2, 4 bits
1669
+ // for width 3, and 3 bits for width 4
1670
+ val &= 0x7Fu >> w;
1671
+ val = (val << 6) | (s[i + 1] & 63u8) as uint;
1672
+ if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
1673
+ if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
1700
1674
1701
1675
return CharRange {ch: val as char, next: i + w};
1702
1676
}
@@ -2061,7 +2035,7 @@ impl OwnedStr for ~str {
2061
2035
/// Appends a character to the back of a string
2062
2036
#[inline]
2063
2037
fn push_char(&mut self, c: char) {
2064
- assert!(( c as uint) < MAX_UNICODE ); // FIXME: #7609: should be enforced on all `char`
2038
+ assert!(c as uint <= 0x10ffff ); // FIXME: #7609: should be enforced on all `char`
2065
2039
unsafe {
2066
2040
let code = c as uint;
2067
2041
let nb = if code < MAX_ONE_B { 1u }
@@ -2825,23 +2799,9 @@ mod tests {
2825
2799
0x20_u8, 0x4e_u8, 0x61_u8,
2826
2800
0x6d_u8];
2827
2801
2828
-
2829
2802
assert_eq!(ss, from_bytes(bb));
2830
- assert_eq!(~" 𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰",
2831
- from_bytes(bytes!(" 𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰")));
2832
2803
}
2833
2804
2834
- #[test]
2835
- fn test_is_utf8_deny_overlong() {
2836
- assert!(!is_utf8([0xc0, 0x80]));
2837
- assert!(!is_utf8([0xc0, 0xae]));
2838
- assert!(!is_utf8([0xe0, 0x80, 0x80]));
2839
- assert!(!is_utf8([0xe0, 0x80, 0xaf]));
2840
- assert!(!is_utf8([0xe0, 0x81, 0x81]));
2841
- assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
2842
- }
2843
-
2844
-
2845
2805
#[test]
2846
2806
#[ignore(cfg(windows))]
2847
2807
fn test_from_bytes_fail() {
0 commit comments