@@ -97,47 +97,121 @@ impl<'a> CharEq for &'a [char] {
97
97
Section: Iterators
98
98
*/
99
99
100
- /// External iterator for a string's characters.
101
- /// Use with the `std::iter` module.
100
+ /// Iterator over the `char`s (*Unicode Scalar Values*) of a string.
101
+ ///
102
+ /// Created with the method `.chars()`.
102
103
#[ deriving( Clone ) ]
103
104
pub struct Chars < ' a > {
104
- /// The slice remaining to be iterated
105
- string : & ' a str ,
105
// Iterator over the raw bytes of the string; characters are decoded from
// these bytes on demand from either end.
+ iter : slice:: Items < ' a , u8 >
106
+ }
107
+
108
+ // Return the initial codepoint accumulator for the first byte.
109
+ // The first byte is special: only the bottom 5 bits are wanted for width 2,
110
+ // 4 bits for width 3, and 3 bits for width 4.
// The mask `0x7F >> $width` selects exactly those bits (0x1F, 0x0F, 0x07);
// the masked byte is then widened to u32.
111
+ macro_rules! utf8_first_byte(
112
+ ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as u32 )
113
+ )
114
+
115
+ // Return the value of $ch updated with continuation byte $byte:
// $ch is shifted left by 6 bits and the low 6 bits of $byte (mask 63)
// are OR-ed into the freed positions.
116
+ macro_rules! utf8_acc_cont_byte(
117
+ ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as u32 )
118
+ )
119
+
120
// Return true when $byte is a UTF-8 continuation byte, i.e. its top two
// bits are `10` (mask 192 = 0b1100_0000, expected value 128 = 0b1000_0000).
+ macro_rules! utf8_is_cont_byte(
121
+ ( $byte: expr) => ( ( $byte & 192u8 ) == 128 )
122
+ )
123
+
124
// Return the byte referenced by `opt`, or 0 when `opt` is `None`.
// Used by the multibyte decoders so that reading a byte never has to
// branch to a failure path.
// NOTE(review): presumably the `None` arm is never reached for valid UTF-8
// input, since every lead byte is followed by its continuation bytes.
+ #[ inline]
125
+ fn unwrap_or_0 ( opt : Option < & u8 > ) -> u8 {
126
+ match opt {
127
+ Some ( & byte) => byte,
128
+ None => 0 ,
129
+ }
106
130
}
107
131
108
132
// Forward iteration: decodes one `char` at a time from the front of the
// underlying byte iterator.
impl < ' a > Iterator < char > for Chars < ' a > {
109
133
// Return the next `char` from the front, or `None` once the byte iterator
// is exhausted.
#[ inline]
110
134
fn next ( & mut self ) -> Option < char > {
111
- // Decode the next codepoint, then update
112
- // the slice to be just the remaining part
113
- if self . string . len ( ) != 0 {
114
- let CharRange { ch, next} = self . string . char_range_at ( 0 ) ;
135
+ // Decode UTF-8, using the valid UTF-8 invariant
// `x` is the already-consumed lead byte (>= 128); the continuation bytes
// are pulled from `it`. No validation is performed.
136
+ #[ inline]
137
+ fn decode_multibyte < ' a > ( x : u8 , it : & mut slice:: Items < ' a , u8 > ) -> char {
138
+ // NOTE: Performance is very sensitive to the exact formulation here
139
+ // Decode from a byte combination out of: [[[x y] z] w]
140
+ let cont_mask = 0x3F ; // continuation byte mask
// `init` keeps the low 5 bits of the lead byte (the width-2 mask); the
// four-byte case below narrows it further with `& 7`.
141
+ let init = utf8_first_byte ! ( x, 2 ) ;
142
+ let y = unwrap_or_0 ( it. next ( ) ) ;
// Two-byte result; overwritten below when the sequence is longer.
143
+ let mut ch = utf8_acc_cont_byte ! ( init, y) ;
144
+ if x >= 0xE0 {
145
+ /* [[x y z] w] case */
146
+ let z = unwrap_or_0 ( it. next ( ) ) ;
147
+
// Combined 12 payload bits of the two continuation bytes y and z.
148
+ let y_z = ( ( ( y & cont_mask) as u32 ) << 6 ) | ( z & cont_mask) as u32 ;
149
+ ch = init << 12 | y_z;
150
+ if x >= 0xF0 {
151
+ /* [x y z w] case */
152
+ let w = unwrap_or_0 ( it. next ( ) ) ;
153
+ ch = ( init & 7 ) << 18 | y_z << 6 | ( w & cont_mask) as u32 ;
154
+ }
155
+ }
115
156
// Reinterprets the accumulated u32 as a `char` without any check; this
// relies on the valid UTF-8 invariant stated at the top of the function.
unsafe {
116
- self . string = raw:: slice_unchecked ( self . string , next, self . string . len ( ) ) ;
157
+ mem:: transmute ( ch)
158
+ }
159
+ }
160
+
161
+ match self . iter . next ( ) {
162
+ None => None ,
163
+ Some ( & next_byte) => {
// ASCII fast path: a byte below 128 is a complete one-byte character.
164
+ if next_byte < 128 {
165
+ Some ( next_byte as char )
166
+ } else {
167
+ Some ( decode_multibyte ( next_byte, & mut self . iter ) )
168
+ }
117
169
}
118
- Some ( ch)
119
- } else {
120
- None
121
170
}
122
171
}
123
172
124
173
// Bounds on the number of chars left, derived from the remaining byte
// count `len`: at least ceil(len / 4) (a char takes at most 4 bytes) and
// at most `len` (every remaining byte is its own char).
#[ inline]
125
174
fn size_hint ( & self ) -> ( uint , Option < uint > ) {
126
- ( self . string . len ( ) . saturating_add ( 3 ) /4 , Some ( self . string . len ( ) ) )
175
+ let ( len, _) = self . iter . size_hint ( ) ;
176
+ ( len. saturating_add ( 3 ) / 4 , Some ( len) )
127
177
}
128
178
}
129
179
130
180
// Backward iteration: decodes one `char` at a time from the back of the
// underlying byte iterator.
impl < ' a > DoubleEndedIterator < char > for Chars < ' a > {
131
181
// Return the last remaining `char`, or `None` once the byte iterator is
// exhausted.
#[ inline]
132
182
fn next_back ( & mut self ) -> Option < char > {
133
- if self . string . len ( ) != 0 {
134
- let CharRange { ch, next} = self . string . char_range_at_reverse ( self . string . len ( ) ) ;
183
// `w` is the already-consumed final byte of a multibyte sequence (>= 128);
// earlier bytes are pulled from the back of `it` until a byte that is not
// a continuation byte (the lead byte) has been read. No validation is
// performed.
+ #[ inline]
184
+ fn decode_multibyte_back < ' a > ( w : u8 , it : & mut slice:: Items < ' a , u8 > ) -> char {
185
+ // Decode from a byte combination out of: [x [y [z w]]]
186
+ let mut ch;
187
+ let z = unwrap_or_0 ( it. next_back ( ) ) ;
// Each assignment from `utf8_first_byte!` treats the byte just read as
// the lead byte; it is overwritten if that byte turns out to be a
// continuation byte and a further byte has to be read.
188
+ ch = utf8_first_byte ! ( z, 2 ) ;
189
+ if utf8_is_cont_byte ! ( z) {
190
+ let y = unwrap_or_0 ( it. next_back ( ) ) ;
191
+ ch = utf8_first_byte ! ( y, 3 ) ;
192
+ if utf8_is_cont_byte ! ( y) {
193
+ let x = unwrap_or_0 ( it. next_back ( ) ) ;
194
+ ch = utf8_first_byte ! ( x, 4 ) ;
195
+ ch = utf8_acc_cont_byte ! ( ch, y) ;
196
+ }
197
+ ch = utf8_acc_cont_byte ! ( ch, z) ;
198
+ }
// The final byte always contributes the lowest 6 bits.
199
+ ch = utf8_acc_cont_byte ! ( ch, w) ;
200
+
135
201
// Reinterprets the accumulated u32 as a `char` without any check.
// NOTE(review): presumably sound only because the bytes come from a
// valid UTF-8 string - confirm against the constructor.
unsafe {
136
- self . string = raw:: slice_unchecked ( self . string , 0 , next) ;
202
+ mem:: transmute ( ch)
203
+ }
204
+ }
205
+
206
+ match self . iter . next_back ( ) {
207
+ None => None ,
208
+ Some ( & back_byte) => {
// ASCII fast path: a byte below 128 is a complete one-byte character.
209
+ if back_byte < 128 {
210
+ Some ( back_byte as char )
211
+ } else {
212
+ Some ( decode_multibyte_back ( back_byte, & mut self . iter ) )
213
+ }
137
214
}
138
- Some ( ch)
139
- } else {
140
- None
141
215
}
142
216
}
143
217
}
@@ -146,18 +220,23 @@ impl<'a> DoubleEndedIterator<char> for Chars<'a> {
146
220
/// Use with the `std::iter` module.
147
221
#[ deriving( Clone ) ]
148
222
pub struct CharOffsets < ' a > {
149
- /// The original string to be iterated
150
- string : & ' a str ,
223
// Byte offset of the char that the next call to `next()` will yield.
+ front : uint ,
224
// Byte offset just past the last byte not yet consumed from the back;
// `next_back()` lowers it to the start of the char it yields.
+ back : uint ,
151
225
// The underlying char iterator; its remaining byte count is used to work
// out how many bytes each step consumed.
iter : Chars < ' a > ,
152
226
}
153
227
154
228
impl < ' a > Iterator < ( uint , char ) > for CharOffsets < ' a > {
155
229
// Return the next `(byte offset, char)` pair from the front, or `None` when
// the underlying char iterator is exhausted.
#[ inline]
156
230
fn next ( & mut self ) -> Option < ( uint , char ) > {
157
- // Compute the byte offset by using the pointer offset between
158
- // the original string slice and the iterator's remaining part
159
- let offset = self . iter . string . as_ptr ( ) as uint - self . string . as_ptr ( ) as uint ;
160
- self . iter . next ( ) . map ( |ch| ( offset, ch) )
231
+ match self . iter . next ( ) {
232
+ None => None ,
233
+ Some ( ch) => {
// The char just decoded started at the current front offset.
234
+ let index = self . front ;
// `len` is the number of bytes still left after decoding `ch`.
235
+ let ( len, _) = self . iter . iter . size_hint ( ) ;
// `back - front` was the byte span before this call, so subtracting
// `len` gives the bytes consumed by `ch`; advance `front` by that.
236
+ self . front += self . back - self . front - len;
237
+ Some ( ( index, ch) )
238
+ }
239
+ }
161
240
}
162
241
163
242
#[ inline]
@@ -169,11 +248,14 @@ impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
169
248
// Backward iteration over `(byte offset, char)` pairs.
impl < ' a > DoubleEndedIterator < ( uint , char ) > for CharOffsets < ' a > {
170
249
// Return the last remaining `(byte offset, char)` pair, or `None` when the
// underlying char iterator is exhausted. The offset is that of the first
// byte of the returned char.
#[ inline]
171
250
fn next_back ( & mut self ) -> Option < ( uint , char ) > {
172
- self . iter . next_back ( ) . map ( |ch| {
173
- let offset = self . iter . string . len ( ) +
174
- self . iter . string . as_ptr ( ) as uint - self . string . as_ptr ( ) as uint ;
175
- ( offset, ch)
176
- } )
251
+ match self . iter . next_back ( ) {
252
+ None => None ,
253
+ Some ( ch) => {
// `len` is the number of bytes still left after decoding `ch`.
254
+ let ( len, _) = self . iter . iter . size_hint ( ) ;
// `back - front - len` is the number of bytes `ch` occupied; moving
// `back` down by that amount leaves it at the start of `ch`.
255
+ self . back -= self . back - self . front - len;
256
+ Some ( ( self . back , ch) )
257
+ }
258
+ }
177
259
}
178
260
}
179
261
@@ -880,18 +962,6 @@ pub struct CharRange {
880
962
pub next : uint ,
881
963
}
882
964
883
- // Return the initial codepoint accumulator for the first byte.
884
- // The first byte is special, only want bottom 5 bits for width 2, 4 bits
885
- // for width 3, and 3 bits for width 4
886
- macro_rules! utf8_first_byte(
887
- ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as u32 )
888
- )
889
-
890
- // return the value of $ch updated with continuation byte $byte
891
- macro_rules! utf8_acc_cont_byte(
892
- ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as u32 )
893
- )
894
-
895
965
static TAG_CONT_U8 : u8 = 128u8 ;
896
966
897
967
/// Unsafe operations
@@ -1608,7 +1678,7 @@ impl<'a> StrSlice<'a> for &'a str {
1608
1678
1609
1679
// Build a `Chars` iterator that decodes characters from this string's
// bytes (`as_bytes().iter()`).
#[ inline]
1610
1680
fn chars ( & self ) -> Chars < ' a > {
1611
- Chars { string : * self }
1681
+ Chars { iter : self . as_bytes ( ) . iter ( ) }
1612
1682
}
1613
1683
1614
1684
#[ inline]
@@ -1618,7 +1688,7 @@ impl<'a> StrSlice<'a> for &'a str {
1618
1688
1619
1689
// Build a `CharOffsets` iterator covering the whole string: offsets start
// at 0 at the front and at `self.len()` at the back.
#[ inline]
1620
1690
fn char_indices ( & self ) -> CharOffsets < ' a > {
1621
- CharOffsets { string : * self , iter : self . chars ( ) }
1691
+ CharOffsets { front : 0 , back : self . len ( ) , iter : self . chars ( ) }
1622
1692
}
1623
1693
1624
1694
#[ inline]
0 commit comments