Skip to content

Commit 394ef49

Browse files
committed
---
yaml --- r: 108406 b: refs/heads/dist-snap c: a39056e h: refs/heads/master v: v3
1 parent 9d829b0 commit 394ef49

File tree

2 files changed

+12
-62
lines changed

2 files changed

+12
-62
lines changed

[refs]

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ refs/heads/try: f64fdf524a434f0e5cd0bc91d09c144723f3c90d
66
refs/tags/release-0.1: 1f5c5126e96c79d22cb7862f75304136e204f105
77
refs/heads/ndm: f3868061cd7988080c30d6d5bf352a5a5fe2460b
88
refs/heads/try2: 147ecfdd8221e4a4d4e090486829a06da1e0ca3c
9-
refs/heads/dist-snap: a68d10e6adc048b3a5f90e376f232a39223d1db8
9+
refs/heads/dist-snap: a39056e614b61489a8b8afc4171586e454d4dcbd
1010
refs/tags/release-0.2: c870d2dffb391e14efb05aa27898f1f6333a9596
1111
refs/tags/release-0.3: b5f0d0f648d9a6153664837026ba1be43d3e2503
1212
refs/heads/try3: 9387340aab40a73e8424c48fd42f0c521a4875c0

branches/dist-snap/src/libstd/str.rs

Lines changed: 11 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -813,69 +813,19 @@ pub fn is_utf8(v: &[u8]) -> bool {
813813

814814
#[inline(always)]
815815
fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
816-
let mut i = 0u;
817-
let total = v.len();
818-
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
819-
unsafe { *xs.unsafe_ref(i) }
820-
}
821-
while i < total {
822-
let v_i = unsafe_get(v, i);
823-
if v_i < 128u8 {
824-
i += 1u;
825-
} else {
826-
let w = utf8_char_width(v_i);
827-
if w == 0u { return Some(i); }
828-
829-
let nexti = i + w;
830-
if nexti > total { return Some(i); }
816+
let mut it = v.iter();
831817

832-
// 2-byte encoding is for codepoints \u0080 to \u07ff
833-
// first C2 80 last DF BF
834-
// 3-byte encoding is for codepoints \u0800 to \uffff
835-
// first E0 A0 80 last EF BF BF
836-
// excluding surrogates codepoints \ud800 to \udfff
837-
// ED A0 80 to ED BF BF
838-
// 4-byte encoding is for codepoints \u10000 to \u10ffff
839-
// first F0 90 80 80 last F4 8F BF BF
840-
//
841-
// Use the UTF-8 syntax from the RFC
842-
//
843-
// https://tools.ietf.org/html/rfc3629
844-
// UTF8-1 = %x00-7F
845-
// UTF8-2 = %xC2-DF UTF8-tail
846-
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
847-
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
848-
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
849-
// %xF4 %x80-8F 2( UTF8-tail )
850-
// UTF8-tail = %x80-BF
851-
match w {
852-
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
853-
return Some(i)
854-
},
855-
3 => match (v_i,
856-
unsafe_get(v, i + 1),
857-
unsafe_get(v, i + 2) & 192u8) {
858-
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
859-
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
860-
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
861-
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
862-
_ => return Some(i),
863-
},
864-
_ => match (v_i,
865-
unsafe_get(v, i + 1),
866-
unsafe_get(v, i + 2) & 192u8,
867-
unsafe_get(v, i + 3) & 192u8) {
868-
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
869-
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
870-
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
871-
_ => return Some(i)
872-
},
873-
}
874-
875-
i = nexti;
876-
}
818+
let ok = run_utf8_validation_iterator(&mut it);
819+
if ok {
820+
None
821+
} else {
822+
// work out how many valid bytes we've consumed
823+
// (run_utf8_validation_iterator resets the iterator to just
824+
// after the last good byte), which we can do because the
825+
// vector iterator size_hint is exact.
826+
let (remaining, _) = it.size_hint();
827+
Some(v.len() - remaining)
877828
}
878-
None
879829
}
880830

881831
/// Determines if a vector of `u16` contains valid UTF-16

0 commit comments

Comments
 (0)