@@ -824,41 +824,52 @@ pub fn is_utf16(v: &[u16]) -> bool {
824
824
}
825
825
}
826
826
827
- /// Iterates over the utf -16 characters in the specified slice, yielding each
828
- /// decoded unicode character to the function provided .
827
+ /// An iterator that decodes UTF-16 encoded codepoints from a vector
828
+ /// of `u16`s.
829
829
///
830
- /// # Failures
831
- ///
832
- /// * Fails on invalid utf-16 data
833
- pub fn utf16_chars ( v : & [ u16 ] , f: |char|) {
834
- let len = v. len ( ) ;
835
- let mut i = 0 u;
836
- while i < len && v[ i] != 0u16 {
837
- let u = v[ i] ;
838
-
839
- if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
840
- f ( unsafe { cast:: transmute ( u as u32 ) } ) ;
841
- i += 1 u;
830
+ /// Fails when it encounters invalid UTF-16 data.
831
+ pub struct UTF16Chars < ' a > {
832
+ priv iter : vec:: Items < ' a , u16 >
833
+ }
834
+ impl < ' a > Iterator < char > for UTF16Chars < ' a > {
835
+ fn next ( & mut self ) -> Option < char > {
836
+ let u = match self . iter . next ( ) {
837
+ Some ( u) => * u,
838
+ None => return None
839
+ } ;
840
+ match char:: from_u32 ( u as u32 ) {
841
+ Some ( c) => Some ( c) ,
842
+ None => {
843
+ let u2 = * self . iter . next ( ) . expect ( "UTF16Chars: unmatched lead surrogate" ) ;
844
+ if u < 0xD7FF || u > 0xDBFF ||
845
+ u2 < 0xDC00 || u2 > 0xDFFF {
846
+ fail ! ( "UTF16Chars: invalid surrogate pair" )
847
+ }
842
848
843
- } else {
844
- let u2 = v[ i+1 u] ;
845
- assert ! ( u >= 0xD800_u16 && u <= 0xDBFF_u16 ) ;
846
- assert ! ( u2 >= 0xDC00_u16 && u2 <= 0xDFFF_u16 ) ;
847
- let mut c: u32 = ( u - 0xD800_u16 ) as u32 ;
848
- c = c << 10 ;
849
- c |= ( u2 - 0xDC00_u16 ) as u32 ;
850
- c |= 0x1_0000_u32 ;
851
- f ( unsafe { cast:: transmute ( c) } ) ;
852
- i += 2 u;
849
+ let mut c = ( u - 0xD800 ) as u32 << 10 | ( u2 - 0xDC00 ) as u32 | 0x1_0000 ;
850
+ char:: from_u32 ( c)
851
+ }
853
852
}
854
853
}
854
+
855
+ fn size_hint ( & self ) -> ( uint , Option < uint > ) {
856
+ let ( low, high) = self . iter . size_hint ( ) ;
857
+ // we could be entirely surrogates (2 elements per char), or
858
+ // entirely non-surrogates (1 element per char)
859
+ ( low / 2 , high)
860
+ }
861
+ }
862
+
863
+ /// Create an iterator over the UTF-16 encoded codepoints in `v`.
864
+ ///
865
+ /// The iterator fails if it attempts to decode invalid UTF-16 data.
866
+ pub fn utf16_chars < ' a > ( v : & ' a [ u16 ] ) -> UTF16Chars < ' a > {
867
+ UTF16Chars { iter : v. iter ( ) }
855
868
}
856
869
857
870
/// Allocates a new string from the utf-16 slice provided
858
871
pub fn from_utf16 ( v : & [ u16 ] ) -> ~str {
859
- let mut buf = with_capacity ( v. len ( ) ) ;
860
- utf16_chars ( v, |ch| buf. push_char ( ch) ) ;
861
- buf
872
+ utf16_chars ( v) . collect ( )
862
873
}
863
874
864
875
/// Allocates a new string with the specified capacity. The string returned is
0 commit comments