Commit dcb444a

Auto merge of #99884 - nnethercote:lexer-improvements, r=matklad

Lexer improvements: some cleanups and small speed improvements.

r? `@matklad`

2 parents: 1f5d8d4 + 99f5c79

10 files changed: +128 -117 lines
compiler/rustc_ast/src/util/comments.rs (+6 -4)

@@ -194,7 +194,7 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
     }
 
     for token in rustc_lexer::tokenize(&text[pos..]) {
-        let token_text = &text[pos..pos + token.len];
+        let token_text = &text[pos..pos + token.len as usize];
         match token.kind {
             rustc_lexer::TokenKind::Whitespace => {
                 if let Some(mut idx) = token_text.find('\n') {
@@ -211,8 +211,10 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
             }
             rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
                 if doc_style.is_none() {
-                    let code_to_the_right =
-                        !matches!(text[pos + token.len..].chars().next(), Some('\r' | '\n'));
+                    let code_to_the_right = !matches!(
+                        text[pos + token.len as usize..].chars().next(),
+                        Some('\r' | '\n')
+                    );
                     let style = match (code_to_the_left, code_to_the_right) {
                         (_, true) => CommentStyle::Mixed,
                         (false, false) => CommentStyle::Isolated,
@@ -246,7 +248,7 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
                 code_to_the_left = true;
            }
        }
-        pos += token.len;
+        pos += token.len as usize;
    }
 
     comments
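This hunk is representative of what callers of rustc_lexer now do: `Token::len` is a `u32` (see the lexer changes below), so any byte-offset arithmetic against a `&str` needs an explicit `as usize` cast. A minimal standalone sketch of the same consumer pattern (the `src` string here is invented for illustration):

use rustc_lexer::tokenize;

fn main() {
    let src = "let x = 42; // the answer";
    let mut pos = 0usize;
    for token in tokenize(src) {
        // `token.len` is a `u32` after this commit; cast before slicing.
        let token_text = &src[pos..pos + token.len as usize];
        println!("{:?}: {:?}", token.kind, token_text);
        pos += token.len as usize;
    }
}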

compiler/rustc_lexer/src/cursor.rs (+2 -2)

@@ -61,8 +61,8 @@ impl<'a> Cursor<'a> {
     }
 
     /// Returns amount of already consumed symbols.
-    pub(crate) fn len_consumed(&self) -> usize {
-        self.initial_len - self.chars.as_str().len()
+    pub(crate) fn len_consumed(&self) -> u32 {
+        (self.initial_len - self.chars.as_str().len()) as u32
    }
 
     /// Resets the number of bytes consumed to 0.
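The `as u32` cast truncates silently if more than `u32::MAX` bytes were consumed; presumably that is acceptable because rustc already restricts source files to `u32`-addressable sizes in its span machinery. For contrast, a checked variant would make the invariant explicit. A sketch, not what the commit does:

use std::convert::TryFrom;

fn len_consumed_checked(initial_len: usize, remaining_len: usize) -> u32 {
    // Panic loudly instead of silently wrapping if a file ever exceeds 4 GiB.
    u32::try_from(initial_len - remaining_len).expect("consumed more than u32::MAX bytes")
}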

compiler/rustc_lexer/src/lib.rs (+48 -41)

@@ -38,18 +38,17 @@ use std::convert::TryFrom;
 #[derive(Debug)]
 pub struct Token {
     pub kind: TokenKind,
-    pub len: usize,
+    pub len: u32,
 }
 
 impl Token {
-    fn new(kind: TokenKind, len: usize) -> Token {
+    fn new(kind: TokenKind, len: u32) -> Token {
         Token { kind, len }
     }
 }
 
 /// Enum representing common lexeme types.
-// perf note: Changing all `usize` to `u32` doesn't change performance. See #77629
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum TokenKind {
     // Multi-char tokens:
     /// "// comment"
@@ -76,7 +75,7 @@ pub enum TokenKind {
     /// tokens.
     UnknownPrefix,
     /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details.
-    Literal { kind: LiteralKind, suffix_start: usize },
+    Literal { kind: LiteralKind, suffix_start: u32 },
     /// "'a"
     Lifetime { starts_with_number: bool },
 
@@ -160,26 +159,24 @@ pub enum LiteralKind {
     Str { terminated: bool },
     /// "b"abc"", "b"abc"
     ByteStr { terminated: bool },
-    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
-    RawStr { n_hashes: u8, err: Option<RawStrError> },
-    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
-    RawByteStr { n_hashes: u8, err: Option<RawStrError> },
+    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
+    /// an invalid literal.
+    RawStr { n_hashes: Option<u8> },
+    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
+    /// indicates an invalid literal.
+    RawByteStr { n_hashes: Option<u8> },
 }
 
-/// Error produced validating a raw string. Represents cases like:
-/// - `r##~"abcde"##`: `InvalidStarter`
-/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
-/// - Too many `#`s (>255): `TooManyDelimiters`
-// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum RawStrError {
-    /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
+    /// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
     InvalidStarter { bad_char: char },
-    /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
-    /// may have intended to terminate it.
-    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
+    /// The string was not terminated, e.g. `r###"abcde"##`.
+    /// `possible_terminator_offset` is the number of characters after `r` or
+    /// `br` where they may have intended to terminate it.
+    NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
     /// More than 255 `#`s exist.
-    TooManyDelimiters { found: usize },
+    TooManyDelimiters { found: u32 },
 }
 
 /// Base of numeric literal encoding according to its prefix.
@@ -221,11 +218,25 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
 }
 
 /// Parses the first token from the provided input string.
+#[inline]
 pub fn first_token(input: &str) -> Token {
     debug_assert!(!input.is_empty());
     Cursor::new(input).advance_token()
 }
 
+/// Validates a raw string literal. Used for getting more information about a
+/// problem with a `RawStr`/`RawByteStr` with a `None` field.
+#[inline]
+pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
+    debug_assert!(!input.is_empty());
+    let mut cursor = Cursor::new(input);
+    // Move past the leading `r` or `br`.
+    for _ in 0..prefix_len {
+        cursor.bump().unwrap();
+    }
+    cursor.raw_double_quoted_string(prefix_len).map(|_| ())
+}
+
 /// Creates an iterator that produces tokens from the input string.
 pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
     let mut cursor = Cursor::new(input);
@@ -315,12 +326,12 @@ impl Cursor<'_> {
             'r' => match (self.first(), self.second()) {
                 ('#', c1) if is_id_start(c1) => self.raw_ident(),
                 ('#', _) | ('"', _) => {
-                    let (n_hashes, err) = self.raw_double_quoted_string(1);
+                    let res = self.raw_double_quoted_string(1);
                     let suffix_start = self.len_consumed();
-                    if err.is_none() {
+                    if res.is_ok() {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawStr { n_hashes, err };
+                    let kind = RawStr { n_hashes: res.ok() };
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident_or_unknown_prefix(),
@@ -350,12 +361,12 @@ impl Cursor<'_> {
                 }
                 ('r', '"') | ('r', '#') => {
                     self.bump();
-                    let (n_hashes, err) = self.raw_double_quoted_string(2);
+                    let res = self.raw_double_quoted_string(2);
                     let suffix_start = self.len_consumed();
-                    if err.is_none() {
+                    if res.is_ok() {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawByteStr { n_hashes, err };
+                    let kind = RawByteStr { n_hashes: res.ok() };
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident_or_unknown_prefix(),
@@ -698,19 +709,18 @@ impl Cursor<'_> {
     }
 
     /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
-    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u8, Option<RawStrError>) {
+    fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
         // Wrap the actual function to handle the error with too many hashes.
         // This way, it eats the whole raw string.
-        let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
+        let n_hashes = self.raw_string_unvalidated(prefix_len)?;
         // Only up to 255 `#`s are allowed in raw strings
         match u8::try_from(n_hashes) {
-            Ok(num) => (num, err),
-            // We lie about the number of hashes here :P
-            Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
+            Ok(num) => Ok(num),
+            Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }),
         }
     }
 
-    fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
+    fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
         debug_assert!(self.prev() == 'r');
         let start_pos = self.len_consumed();
         let mut possible_terminator_offset = None;
@@ -729,7 +739,7 @@ impl Cursor<'_> {
             Some('"') => (),
             c => {
                 let c = c.unwrap_or(EOF_CHAR);
-                return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
+                return Err(RawStrError::InvalidStarter { bad_char: c });
             }
         }
 
@@ -739,14 +749,11 @@ impl Cursor<'_> {
             self.eat_while(|c| c != '"');
 
             if self.is_eof() {
-                return (
-                    n_start_hashes,
-                    Some(RawStrError::NoTerminator {
-                        expected: n_start_hashes,
-                        found: max_hashes,
-                        possible_terminator_offset,
-                    }),
-                );
+                return Err(RawStrError::NoTerminator {
+                    expected: n_start_hashes,
+                    found: max_hashes,
+                    possible_terminator_offset,
+                });
             }
 
             // Eat closing double quote.
@@ -764,7 +771,7 @@ impl Cursor<'_> {
             }
 
             if n_end_hashes == n_start_hashes {
-                return (n_start_hashes, None);
+                return Ok(n_start_hashes);
             } else if n_end_hashes > max_hashes {
                 // Keep track of possible terminators to give a hint about
                 // where there might be a missing terminator
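Taken together, these changes split the hot path from the diagnostic path: `RawStr { n_hashes: None }` is now a cheap invalid-literal marker, and `validate_raw_str` re-lexes the literal only when a caller wants the concrete `RawStrError`. A sketch of how a consumer might combine the two new APIs; `report_raw_str_error` and its input are hypothetical, but the `rustc_lexer` items are the ones introduced above:

use rustc_lexer::{first_token, validate_raw_str, LiteralKind, RawStrError, TokenKind};

// Given source starting with an `r"..."` literal the lexer flagged as
// invalid, recover and print the concrete error.
fn report_raw_str_error(src: &str) {
    let token = first_token(src);
    if let TokenKind::Literal { kind: LiteralKind::RawStr { n_hashes: None }, .. } = token.kind {
        // `prefix_len` is 1 for `r...` literals, 2 for `br...` literals.
        match validate_raw_str(src, 1) {
            Err(RawStrError::InvalidStarter { bad_char }) => {
                eprintln!("bad character after `r`: {:?}", bad_char)
            }
            Err(RawStrError::NoTerminator { expected, found, .. }) => {
                eprintln!("expected {} closing `#`s, found {}", expected, found)
            }
            Err(RawStrError::TooManyDelimiters { found }) => {
                eprintln!("{} `#`s is more than the 255 allowed", found)
            }
            Ok(()) => unreachable!("the lexer marked this literal invalid"),
        }
    }
}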

compiler/rustc_lexer/src/tests.rs (+22 -28)

@@ -2,42 +2,39 @@ use super::*;
 
 use expect_test::{expect, Expect};
 
-fn check_raw_str(s: &str, expected_hashes: u8, expected_err: Option<RawStrError>) {
+fn check_raw_str(s: &str, expected: Result<u8, RawStrError>) {
     let s = &format!("r{}", s);
     let mut cursor = Cursor::new(s);
     cursor.bump();
-    let (n_hashes, err) = cursor.raw_double_quoted_string(0);
-    assert_eq!(n_hashes, expected_hashes);
-    assert_eq!(err, expected_err);
+    let res = cursor.raw_double_quoted_string(0);
+    assert_eq!(res, expected);
 }
 
 #[test]
 fn test_naked_raw_str() {
-    check_raw_str(r#""abc""#, 0, None);
+    check_raw_str(r#""abc""#, Ok(0));
 }
 
 #[test]
 fn test_raw_no_start() {
-    check_raw_str(r##""abc"#"##, 0, None);
+    check_raw_str(r##""abc"#"##, Ok(0));
 }
 
 #[test]
 fn test_too_many_terminators() {
     // this error is handled in the parser later
-    check_raw_str(r###"#"abc"##"###, 1, None);
+    check_raw_str(r###"#"abc"##"###, Ok(1));
 }
 
 #[test]
 fn test_unterminated() {
     check_raw_str(
         r#"#"abc"#,
-        1,
-        Some(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }),
+        Err(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }),
     );
     check_raw_str(
         r###"##"abc"#"###,
-        2,
-        Some(RawStrError::NoTerminator {
+        Err(RawStrError::NoTerminator {
             expected: 2,
             found: 1,
             possible_terminator_offset: Some(7),
@@ -46,41 +43,38 @@ fn test_unterminated() {
     // We're looking for "# not just any #
     check_raw_str(
         r###"##"abc#"###,
-        2,
-        Some(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }),
+        Err(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }),
     )
 }
 
 #[test]
 fn test_invalid_start() {
-    check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' }));
+    check_raw_str(r##"#~"abc"#"##, Err(RawStrError::InvalidStarter { bad_char: '~' }));
}
 
 #[test]
 fn test_unterminated_no_pound() {
     // https://github.com/rust-lang/rust/issues/70677
     check_raw_str(
         r#"""#,
-        0,
-        Some(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }),
+        Err(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }),
     );
 }
 
 #[test]
 fn test_too_many_hashes() {
     let max_count = u8::MAX;
-    let mut hashes: String = "#".repeat(max_count.into());
+    let hashes1 = "#".repeat(max_count as usize);
+    let hashes2 = "#".repeat(max_count as usize + 1);
+    let middle = "\"abc\"";
+    let s1 = [&hashes1, middle, &hashes1].join("");
+    let s2 = [&hashes2, middle, &hashes2].join("");
 
-    // Valid number of hashes (255 = 2^8 - 1 = u8::MAX), but invalid string.
-    check_raw_str(&hashes, max_count, Some(RawStrError::InvalidStarter { bad_char: '\u{0}' }));
+    // Valid number of hashes (255 = 2^8 - 1 = u8::MAX).
+    check_raw_str(&s1, Ok(255));
 
     // One more hash sign (256 = 2^8) becomes too many.
-    hashes.push('#');
-    check_raw_str(
-        &hashes,
-        0,
-        Some(RawStrError::TooManyDelimiters { found: usize::from(max_count) + 1 }),
-    );
+    check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
 }
 
 #[test]
@@ -251,7 +245,7 @@ fn raw_string() {
     check_lexing(
         "r###\"\"#a\\b\x00c\"\"###",
         expect![[r#"
-            Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 17 }, len: 17 }
+            Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 17 }, len: 17 }
        "#]],
    )
 }
@@ -295,9 +289,9 @@ br###"raw"###suffix
             Token { kind: Whitespace, len: 1 }
             Token { kind: Literal { kind: Int { base: Decimal, empty_int: false }, suffix_start: 1 }, len: 3 }
             Token { kind: Whitespace, len: 1 }
-            Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 12 }, len: 18 }
+            Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 12 }, len: 18 }
             Token { kind: Whitespace, len: 1 }
-            Token { kind: Literal { kind: RawByteStr { n_hashes: 3, err: None }, suffix_start: 13 }, len: 19 }
+            Token { kind: Literal { kind: RawByteStr { n_hashes: Some(3) }, suffix_start: 13 }, len: 19 }
             Token { kind: Whitespace, len: 1 }
         "#]],
     )
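`check_lexing`, used by `raw_string` and the snapshot tests above, is not part of this diff; presumably it renders each token with `Debug` and lets expect_test compare against the inline snapshot, roughly:

fn check_lexing(src: &str, expect: Expect) {
    let actual: String = tokenize(src).map(|token| format!("{:?}\n", token)).collect();
    expect.assert_eq(&actual)
}

This is why the snapshots change from `n_hashes: 3, err: None` to `n_hashes: Some(3)`: they are exactly the `Debug` output of `Token`.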
