@@ -722,17 +722,29 @@ pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint {
722
722
end - start
723
723
}
724
724
725
+ // https://tools.ietf.org/html/rfc3629
726
+ static UTF8_CHAR_WIDTH : [ u8 , ..256 ] = [
727
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
728
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x1F
729
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
730
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x3F
731
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
732
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x5F
733
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
734
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x7F
735
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
736
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0x9F
737
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
738
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xBF
739
+ 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
740
+ 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , // 0xDF
741
+ 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // 0xEF
742
+ 4 , 4 , 4 , 4 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xFF
743
+ ] ;
744
+
725
745
/// Given a first byte, determine how many bytes are in this UTF-8 character
726
746
pub fn utf8_char_width ( b : u8 ) -> uint {
727
- let byte: uint = b as uint ;
728
- if byte < 128 u { return 1 u; }
729
- // Not a valid start byte
730
- if byte < 192 u { return 0 u; }
731
- if byte < 224 u { return 2 u; }
732
- if byte < 240 u { return 3 u; }
733
- if byte < 248 u { return 4 u; }
734
- if byte < 252 u { return 5 u; }
735
- return 6 u;
747
+ return UTF8_CHAR_WIDTH [ b] as uint ;
736
748
}
737
749
738
750
#[ allow( missing_doc) ]
@@ -1714,26 +1726,29 @@ impl<'self> StrSlice<'self> for &'self str {
1714
1726
* If `i` is greater than or equal to the length of the string.
1715
1727
* If `i` is not the index of the beginning of a valid UTF-8 character.
1716
1728
*/
1729
+ #[inline]
1717
1730
fn char_range_at(&self, i: uint) -> CharRange {
1718
- let b0 = self[i];
1719
- let w = utf8_char_width(b0);
1720
- assert!((w != 0u));
1721
- if w == 1u { return CharRange {ch: b0 as char, next: i + 1u}; }
1722
- let mut val = 0u;
1723
- let end = i + w;
1724
- let mut i = i + 1u;
1725
- while i < end {
1726
- let byte = self[i];
1727
- assert_eq!(byte & 192u8, TAG_CONT_U8);
1728
- val <<= 6u;
1729
- val += (byte & 63u8) as uint;
1730
- i += 1u;
1731
+ if (self[i] < 128u8) {
1732
+ return CharRange {ch: self[i] as char, next: i + 1 };
1731
1733
}
1732
- // Clunky way to get the right bits from the first byte. Uses two shifts,
1733
- // the first to clip off the marker bits at the left of the byte, and then
1734
- // a second (as uint) to get it to the right position.
1735
- val += ((b0 << ((w + 1u) as u8)) as uint) << ((w - 1u) * 6u - w - 1u);
1736
- return CharRange {ch: val as char, next: i};
1734
+
1735
+ // Multibyte case is a fn to allow char_range_at to inline cleanly
1736
+ fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
1737
+ let mut val = s[i] as uint;
1738
+ let w = UTF8_CHAR_WIDTH[val] as uint;
1739
+ assert!((w != 0));
1740
+
1741
+ // First byte is special, only want bottom 5 bits for width 2, 4 bits
1742
+ // for width 3, and 3 bits for width 4
1743
+ val &= 0x7Fu >> w;
1744
+ val = (val << 6) | (s[i + 1] & 63u8) as uint;
1745
+ if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
1746
+ if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
1747
+
1748
+ return CharRange {ch: val as char, next: i + w};
1749
+ }
1750
+
1751
+ return multibyte_char_range_at(*self, i);
1737
1752
}
1738
1753
1739
1754
/// Plucks the character starting at the `i`th byte of a string
@@ -2430,7 +2445,11 @@ mod tests {
2430
2445
fn test_push_char() {
2431
2446
let mut data = ~" ประเทศไทย中";
2432
2447
data.push_char('华');
2433
- assert_eq!(~" ประเทศไทย中华", data);
2448
+ data.push_char('b'); // 1 byte
2449
+ data.push_char('¢'); // 2 byte
2450
+ data.push_char('€'); // 3 byte
2451
+ data.push_char('𤭢'); // 4 byte
2452
+ assert_eq!(~" ประเทศไทย中华b¢€𤭢", data);
2434
2453
}
2435
2454
2436
2455
#[test]
@@ -3240,6 +3259,19 @@ mod tests {
3240
3259
" 22 ".cmp(& &" 1234 ") == Greater;
3241
3260
}
3242
3261
3262
+ #[test]
3263
+ fn test_char_range_at() {
3264
+ let data = ~" b¢€𤭢𤭢€¢b";
3265
+ assert_eq!('b', data.char_range_at(0).ch);
3266
+ assert_eq!('¢', data.char_range_at(1).ch);
3267
+ assert_eq!('€', data.char_range_at(3).ch);
3268
+ assert_eq!('𤭢', data.char_range_at(6).ch);
3269
+ assert_eq!('𤭢', data.char_range_at(10).ch);
3270
+ assert_eq!('€', data.char_range_at(14).ch);
3271
+ assert_eq!('¢', data.char_range_at(17).ch);
3272
+ assert_eq!('b', data.char_range_at(19).ch);
3273
+ }
3274
+
3243
3275
#[test]
3244
3276
fn test_char_range_at_reverse_underflow() {
3245
3277
assert_eq!(" abc".char_range_at_reverse(0).next, 0);
0 commit comments