@@ -117,27 +117,99 @@ use crate::vec::Vec;
117
117
///
118
118
/// # UTF-8
119
119
///
120
- /// `String`s are always valid UTF-8. This has a few implications, the first of
121
- /// which is that if you need a non-UTF-8 string, consider [`OsString`]. It is
122
- /// similar, but without the UTF-8 constraint. The second implication is that
123
- /// you cannot index into a `String`:
120
+ /// `String`s are always valid UTF-8. If you need a non-UTF-8 string, consider
121
+ /// [`OsString`]. It is similar, but without the UTF-8 constraint. Because UTF-8
122
+ /// is a variable-width encoding, `String`s are typically smaller than an array of
123
+ /// the same `char`s:
124
+ ///
125
+ /// ```
126
+ /// use std::mem;
127
+ ///
128
+ /// // `s` is ASCII which represents each `char` as one byte
129
+ /// let s = "hello";
130
+ /// assert_eq!(s.len(), 5);
131
+ ///
132
+ /// // A `char` array with the same contents would be longer because
133
+ /// // every `char` is four bytes
134
+ /// let s = ['h', 'e', 'l', 'l', 'o'];
135
+ /// let size: usize = s.into_iter().map(|c| mem::size_of_val(&c)).sum();
136
+ /// assert_eq!(size, 20);
137
+ ///
138
+ /// // However, for non-ASCII strings, the difference will be smaller
139
+ /// // and sometimes they are the same
140
+ /// let s = "💖💖💖💖💖";
141
+ /// assert_eq!(s.len(), 20);
142
+ ///
143
+ /// let s = ['💖', '💖', '💖', '💖', '💖'];
144
+ /// let size: usize = s.into_iter().map(|c| mem::size_of_val(&c)).sum();
145
+ /// assert_eq!(size, 20);
146
+ /// ```
147
+ ///
148
+ /// This raises interesting questions as to how `s[i]` should work.
149
+ /// What should `i` be here? Several options include byte indices and
150
+ /// `char` indices but, because of UTF-8 encoding, only byte indices
151
+ /// would provide constant-time indexing. Getting the `i`th `char`, for
152
+ /// example, is available using [`chars`]:
153
+ ///
154
+ /// ```
155
+ /// let s = "hello";
156
+ /// let third_character = s.chars().nth(2);
157
+ /// assert_eq!(third_character, Some('l'));
158
+ ///
159
+ /// let s = "💖💖💖💖💖";
160
+ /// let third_character = s.chars().nth(2);
161
+ /// assert_eq!(third_character, Some('💖'));
162
+ /// ```
163
+ ///
164
+ /// Next, what should `s[i]` return? Because indexing returns a reference
165
+ /// to underlying data, it could be `&u8`, `&[u8]`, or something else similar.
166
+ /// Since we're only providing one index, `&u8` makes the most sense but that
167
+ /// might not be what the user expects and can be explicitly achieved with
168
+ /// [`as_bytes()`]:
169
+ ///
170
+ /// ```
171
+ /// // The first byte is 104 - the byte value of `'h'`
172
+ /// let s = "hello";
173
+ /// assert_eq!(s.as_bytes()[0], 104);
174
+ /// // or
175
+ /// assert_eq!(s.as_bytes()[0], b'h');
176
+ ///
177
+ /// // The first byte is 240 which isn't obviously useful
178
+ /// let s = "💖💖💖💖💖";
179
+ /// assert_eq!(s.as_bytes()[0], 240);
180
+ /// ```
181
+ ///
182
+ /// Due to these ambiguities/restrictions, indexing with a `usize` is simply
183
+ /// forbidden:
124
184
///
125
185
/// ```compile_fail,E0277
126
186
/// let s = "hello";
127
187
///
128
- /// println!("The first letter of s is {}", s[0]); // ERROR!!!
188
+ /// // The following will not compile!
189
+ /// println!("The first letter of s is {}", s[0]);
129
190
/// ```
130
191
///
192
+ /// It is clearer, however, how `&s[i..j]` should work (that is,
193
+ /// indexing with a range). It should accept byte indices (to be constant-time)
194
+ /// and return a `&str` which is UTF-8 encoded. This is also called "string slicing".
195
+ /// Note this will panic if the byte indices provided are not character
196
+ /// boundaries - see [`is_char_boundary`] for more details. See the implementations
197
+ /// for [`SliceIndex<str>`] for more details on string slicing. For a non-panicking
198
+ /// version of string slicing, see [`get`].
199
+ ///
131
200
/// [`OsString`]: ../../std/ffi/struct.OsString.html "ffi::OsString"
201
+ /// [`SliceIndex<str>`]: core::slice::SliceIndex
202
+ /// [`as_bytes()`]: str::as_bytes
203
+ /// [`get`]: str::get
204
+ /// [`is_char_boundary`]: str::is_char_boundary
132
205
///
133
- /// Indexing is intended to be a constant-time operation, but UTF-8 encoding
134
- /// does not allow us to do this. Furthermore, it's not clear what sort of
135
- /// thing the index should return: a byte, a codepoint, or a grapheme cluster.
136
- /// The [`bytes`] and [`chars`] methods return iterators over the first
137
- /// two, respectively.
206
+ /// The [`bytes`] and [`chars`] methods return iterators over the bytes and
207
+ /// codepoints of the string, respectively. To iterate over codepoints along
208
+ /// with byte indices, use [`char_indices`].
138
209
///
139
210
/// [`bytes`]: str::bytes
140
211
/// [`chars`]: str::chars
212
+ /// [`char_indices`]: str::char_indices
141
213
///
142
214
/// # Deref
143
215
///
0 commit comments