Skip to content

Commit 1796373

Browse files
committed
char_range_at perf work
Moves multibyte code to its own function to make char_range_at easier to inline, and faster for single and multibyte chars. Benchmarked reading example.json 100 times, 1.18s before, 1.08s after.
1 parent e95fcfa commit 1796373

File tree

1 file changed

+60
-28
lines changed

1 file changed

+60
-28
lines changed

src/libstd/str.rs

Lines changed: 60 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -722,17 +722,29 @@ pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint {
722722
end - start
723723
}
724724

725+
// https://tools.ietf.org/html/rfc3629
726+
static UTF8_CHAR_WIDTH: [u8, ..256] = [
727+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
728+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
729+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
730+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
731+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
732+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
733+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
734+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
735+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
736+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
737+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
738+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
739+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
740+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
741+
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
742+
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
743+
];
744+
725745
/// Given a first byte, determine how many bytes are in this UTF-8 character
726746
pub fn utf8_char_width(b: u8) -> uint {
727-
let byte: uint = b as uint;
728-
if byte < 128u { return 1u; }
729-
// Not a valid start byte
730-
if byte < 192u { return 0u; }
731-
if byte < 224u { return 2u; }
732-
if byte < 240u { return 3u; }
733-
if byte < 248u { return 4u; }
734-
if byte < 252u { return 5u; }
735-
return 6u;
747+
return UTF8_CHAR_WIDTH[b] as uint;
736748
}
737749

738750
#[allow(missing_doc)]
@@ -1714,26 +1726,29 @@ impl<'self> StrSlice<'self> for &'self str {
17141726
* If `i` is greater than or equal to the length of the string.
17151727
* If `i` is not the index of the beginning of a valid UTF-8 character.
17161728
*/
1729+
#[inline]
17171730
fn char_range_at(&self, i: uint) -> CharRange {
1718-
let b0 = self[i];
1719-
let w = utf8_char_width(b0);
1720-
assert!((w != 0u));
1721-
if w == 1u { return CharRange {ch: b0 as char, next: i + 1u}; }
1722-
let mut val = 0u;
1723-
let end = i + w;
1724-
let mut i = i + 1u;
1725-
while i < end {
1726-
let byte = self[i];
1727-
assert_eq!(byte & 192u8, TAG_CONT_U8);
1728-
val <<= 6u;
1729-
val += (byte & 63u8) as uint;
1730-
i += 1u;
1731+
if (self[i] < 128u8) {
1732+
return CharRange {ch: self[i] as char, next: i + 1 };
17311733
}
1732-
// Clunky way to get the right bits from the first byte. Uses two shifts,
1733-
// the first to clip off the marker bits at the left of the byte, and then
1734-
// a second (as uint) to get it to the right position.
1735-
val += ((b0 << ((w + 1u) as u8)) as uint) << ((w - 1u) * 6u - w - 1u);
1736-
return CharRange {ch: val as char, next: i};
1734+
1735+
// Multibyte case is a fn to allow char_range_at to inline cleanly
1736+
fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
1737+
let mut val = s[i] as uint;
1738+
let w = UTF8_CHAR_WIDTH[val] as uint;
1739+
assert!((w != 0));
1740+
1741+
// First byte is special, only want bottom 5 bits for width 2, 4 bits
1742+
// for width 3, and 3 bits for width 4
1743+
val &= 0x7Fu >> w;
1744+
val = (val << 6) | (s[i + 1] & 63u8) as uint;
1745+
if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
1746+
if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
1747+
1748+
return CharRange {ch: val as char, next: i + w};
1749+
}
1750+
1751+
return multibyte_char_range_at(*self, i);
17371752
}
17381753
17391754
/// Plucks the character starting at the `i`th byte of a string
@@ -2430,7 +2445,11 @@ mod tests {
24302445
fn test_push_char() {
24312446
let mut data = ~"ประเทศไทย中";
24322447
data.push_char('华');
2433-
assert_eq!(~"ประเทศไทย中华", data);
2448+
data.push_char('b'); // 1 byte
2449+
data.push_char('¢'); // 2 byte
2450+
data.push_char('€'); // 3 byte
2451+
data.push_char('𤭢'); // 4 byte
2452+
assert_eq!(~"ประเทศไทย中华b¢€𤭢", data);
24342453
}
24352454
24362455
#[test]
@@ -3240,6 +3259,19 @@ mod tests {
32403259
"22".cmp(& &"1234") == Greater;
32413260
}
32423261
3262+
#[test]
3263+
fn test_char_range_at() {
3264+
let data = ~"b¢€𤭢𤭢€¢b";
3265+
assert_eq!('b', data.char_range_at(0).ch);
3266+
assert_eq!('¢', data.char_range_at(1).ch);
3267+
assert_eq!('€', data.char_range_at(3).ch);
3268+
assert_eq!('𤭢', data.char_range_at(6).ch);
3269+
assert_eq!('𤭢', data.char_range_at(10).ch);
3270+
assert_eq!('€', data.char_range_at(14).ch);
3271+
assert_eq!('¢', data.char_range_at(17).ch);
3272+
assert_eq!('b', data.char_range_at(19).ch);
3273+
}
3274+
32433275
#[test]
32443276
fn test_char_range_at_reverse_underflow() {
32453277
assert_eq!("abc".char_range_at_reverse(0).next, 0);

0 commit comments

Comments
 (0)