Skip to content

Commit 5aee5a1

Browse files
committed
Optimize is_utf8
Manually unroll the multibyte continuation-byte loop, and add a fast path for single-byte (ASCII) characters.
1 parent 1796373 commit 5aee5a1

File tree

2 files changed

+27
-8
lines changed

2 files changed

+27
-8
lines changed

src/libstd/str.rs

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -596,17 +596,25 @@ pub fn is_utf8(v: &[u8]) -> bool {
596596
let mut i = 0u;
597597
let total = v.len();
598598
while i < total {
599-
let mut chsize = utf8_char_width(v[i]);
600-
if chsize == 0u { return false; }
601-
if i + chsize > total { return false; }
602-
i += 1u;
603-
while chsize > 1u {
604-
if v[i] & 192u8 != TAG_CONT_U8 { return false; }
599+
if v[i] < 128u8 {
605600
i += 1u;
606-
chsize -= 1u;
601+
} else {
602+
let w = utf8_char_width(v[i]);
603+
if w == 0u { return false; }
604+
605+
let nexti = i + w;
606+
if nexti > total { return false; }
607+
608+
if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
609+
if w > 2 {
610+
if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
611+
if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; }
612+
}
613+
614+
i = nexti;
607615
}
608616
}
609-
return true;
617+
true
610618
}
611619

612620
/// Determines if a vector of `u16` contains valid UTF-16

src/test/run-pass/utf8_chars.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,20 @@ pub fn main() {
2727
assert!(s.char_at(1u) == 'é');
2828

2929
assert!((str::is_utf8(s.as_bytes())));
30+
// invalid prefix
3031
assert!((!str::is_utf8(~[0x80_u8])));
32+
// invalid 2 byte prefix
3133
assert!((!str::is_utf8(~[0xc0_u8])));
3234
assert!((!str::is_utf8(~[0xc0_u8, 0x10_u8])));
35+
// invalid 3 byte prefix
36+
assert!((!str::is_utf8(~[0xe0_u8])));
37+
assert!((!str::is_utf8(~[0xe0_u8, 0x10_u8])));
38+
assert!((!str::is_utf8(~[0xe0_u8, 0xff_u8, 0x10_u8])));
39+
// invalid 4 byte prefix
40+
assert!((!str::is_utf8(~[0xf0_u8])));
41+
assert!((!str::is_utf8(~[0xf0_u8, 0x10_u8])));
42+
assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0x10_u8])));
43+
assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0xff_u8, 0x10_u8])));
3344

3445
let mut stack = ~"a×c€";
3546
assert_eq!(stack.pop_char(), '€');

0 commit comments

Comments
 (0)