Skip to content

Commit 42357d7

Browse files
author
root
committed
core::str: Implement Chars iterator using slice::Items
Re-use the vector (slice) iterator to implement the chars iterator. The iterator relies on our guarantee that the string contains valid UTF-8; its only unsafe code is the transmute of the decoded u32 into a char.
1 parent d6b42c2 commit 42357d7

File tree

1 file changed

+114
-44
lines changed

1 file changed

+114
-44
lines changed

src/libcore/str.rs

+114-44
Original file line numberDiff line numberDiff line change
@@ -97,47 +97,121 @@ impl<'a> CharEq for &'a [char] {
9797
Section: Iterators
9898
*/
9999

100-
/// External iterator for a string's characters.
101-
/// Use with the `std::iter` module.
100+
/// Iterator over the `char`s (each representing a *Unicode Scalar Value*) of a string
101+
///
102+
/// Created with the method `.chars()`.
102103
#[deriving(Clone)]
103104
pub struct Chars<'a> {
104-
/// The slice remaining to be iterated
105-
string: &'a str,
105+
iter: slice::Items<'a, u8>
106+
}
107+
108+
// Return the initial codepoint accumulator for the first byte.
109+
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
110+
// for width 3, and 3 bits for width 4
111+
macro_rules! utf8_first_byte(
112+
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
113+
)
114+
115+
// return the value of $ch updated with continuation byte $byte
116+
macro_rules! utf8_acc_cont_byte(
117+
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
118+
)
119+
120+
macro_rules! utf8_is_cont_byte(
121+
($byte:expr) => (($byte & 192u8) == 128)
122+
)
123+
124+
#[inline]
125+
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
126+
match opt {
127+
Some(&byte) => byte,
128+
None => 0,
129+
}
106130
}
107131

108132
impl<'a> Iterator<char> for Chars<'a> {
109133
#[inline]
110134
fn next(&mut self) -> Option<char> {
111-
// Decode the next codepoint, then update
112-
// the slice to be just the remaining part
113-
if self.string.len() != 0 {
114-
let CharRange {ch, next} = self.string.char_range_at(0);
135+
// Decode UTF-8, using the valid UTF-8 invariant
136+
#[inline]
137+
fn decode_multibyte<'a>(x: u8, it: &mut slice::Items<'a, u8>) -> char {
138+
// NOTE: Performance is very sensitive to the exact formulation here
139+
// Decode from a byte combination out of: [[[x y] z] w]
140+
let cont_mask = 0x3F; // continuation byte mask
141+
let init = utf8_first_byte!(x, 2);
142+
let y = unwrap_or_0(it.next());
143+
let mut ch = utf8_acc_cont_byte!(init, y);
144+
if x >= 0xE0 {
145+
/* [[x y z] w] case */
146+
let z = unwrap_or_0(it.next());
147+
148+
let y_z = (((y & cont_mask) as u32) << 6) | (z & cont_mask) as u32;
149+
ch = init << 12 | y_z;
150+
if x >= 0xF0 {
151+
/* [x y z w] case */
152+
let w = unwrap_or_0(it.next());
153+
ch = (init & 7) << 18 | y_z << 6 | (w & cont_mask) as u32;
154+
}
155+
}
115156
unsafe {
116-
self.string = raw::slice_unchecked(self.string, next, self.string.len());
157+
mem::transmute(ch)
158+
}
159+
}
160+
161+
match self.iter.next() {
162+
None => None,
163+
Some(&next_byte) => {
164+
if next_byte < 128 {
165+
Some(next_byte as char)
166+
} else {
167+
Some(decode_multibyte(next_byte, &mut self.iter))
168+
}
117169
}
118-
Some(ch)
119-
} else {
120-
None
121170
}
122171
}
123172

124173
#[inline]
125174
fn size_hint(&self) -> (uint, Option<uint>) {
126-
(self.string.len().saturating_add(3)/4, Some(self.string.len()))
175+
let (len, _) = self.iter.size_hint();
176+
(len.saturating_add(3) / 4, Some(len))
127177
}
128178
}
129179

130180
impl<'a> DoubleEndedIterator<char> for Chars<'a> {
131181
#[inline]
132182
fn next_back(&mut self) -> Option<char> {
133-
if self.string.len() != 0 {
134-
let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
183+
#[inline]
184+
fn decode_multibyte_back<'a>(w: u8, it: &mut slice::Items<'a, u8>) -> char {
185+
// Decode from a byte combination out of: [x [y [z w]]]
186+
let mut ch;
187+
let z = unwrap_or_0(it.next_back());
188+
ch = utf8_first_byte!(z, 2);
189+
if utf8_is_cont_byte!(z) {
190+
let y = unwrap_or_0(it.next_back());
191+
ch = utf8_first_byte!(y, 3);
192+
if utf8_is_cont_byte!(y) {
193+
let x = unwrap_or_0(it.next_back());
194+
ch = utf8_first_byte!(x, 4);
195+
ch = utf8_acc_cont_byte!(ch, y);
196+
}
197+
ch = utf8_acc_cont_byte!(ch, z);
198+
}
199+
ch = utf8_acc_cont_byte!(ch, w);
200+
135201
unsafe {
136-
self.string = raw::slice_unchecked(self.string, 0, next);
202+
mem::transmute(ch)
203+
}
204+
}
205+
206+
match self.iter.next_back() {
207+
None => None,
208+
Some(&back_byte) => {
209+
if back_byte < 128 {
210+
Some(back_byte as char)
211+
} else {
212+
Some(decode_multibyte_back(back_byte, &mut self.iter))
213+
}
137214
}
138-
Some(ch)
139-
} else {
140-
None
141215
}
142216
}
143217
}
@@ -146,18 +220,23 @@ impl<'a> DoubleEndedIterator<char> for Chars<'a> {
146220
/// Use with the `std::iter` module.
147221
#[deriving(Clone)]
148222
pub struct CharOffsets<'a> {
149-
/// The original string to be iterated
150-
string: &'a str,
223+
front: uint,
224+
back: uint,
151225
iter: Chars<'a>,
152226
}
153227

154228
impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
155229
#[inline]
156230
fn next(&mut self) -> Option<(uint, char)> {
157-
// Compute the byte offset by using the pointer offset between
158-
// the original string slice and the iterator's remaining part
159-
let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
160-
self.iter.next().map(|ch| (offset, ch))
231+
match self.iter.next() {
232+
None => None,
233+
Some(ch) => {
234+
let index = self.front;
235+
let (len, _) = self.iter.iter.size_hint();
236+
self.front += self.back - self.front - len;
237+
Some((index, ch))
238+
}
239+
}
161240
}
162241

163242
#[inline]
@@ -169,11 +248,14 @@ impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
169248
impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
170249
#[inline]
171250
fn next_back(&mut self) -> Option<(uint, char)> {
172-
self.iter.next_back().map(|ch| {
173-
let offset = self.iter.string.len() +
174-
self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
175-
(offset, ch)
176-
})
251+
match self.iter.next_back() {
252+
None => None,
253+
Some(ch) => {
254+
let (len, _) = self.iter.iter.size_hint();
255+
self.back -= self.back - self.front - len;
256+
Some((self.back, ch))
257+
}
258+
}
177259
}
178260
}
179261

@@ -880,18 +962,6 @@ pub struct CharRange {
880962
pub next: uint,
881963
}
882964

883-
// Return the initial codepoint accumulator for the first byte.
884-
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
885-
// for width 3, and 3 bits for width 4
886-
macro_rules! utf8_first_byte(
887-
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
888-
)
889-
890-
// return the value of $ch updated with continuation byte $byte
891-
macro_rules! utf8_acc_cont_byte(
892-
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
893-
)
894-
895965
static TAG_CONT_U8: u8 = 128u8;
896966

897967
/// Unsafe operations
@@ -1608,7 +1678,7 @@ impl<'a> StrSlice<'a> for &'a str {
16081678

16091679
#[inline]
16101680
fn chars(&self) -> Chars<'a> {
1611-
Chars{string: *self}
1681+
Chars{iter: self.as_bytes().iter()}
16121682
}
16131683

16141684
#[inline]
@@ -1618,7 +1688,7 @@ impl<'a> StrSlice<'a> for &'a str {
16181688

16191689
#[inline]
16201690
fn char_indices(&self) -> CharOffsets<'a> {
1621-
CharOffsets{string: *self, iter: self.chars()}
1691+
CharOffsets{front: 0, back: self.len(), iter: self.chars()}
16221692
}
16231693

16241694
#[inline]

0 commit comments

Comments
 (0)