Skip to content

Commit 6201eab

Browse files
committed
Auto merge of rust-lang#102302 - nnethercote:more-lexer-improvements, r=matklad
More lexer improvements. A follow-up to rust-lang#99884. r? `@matklad`
2 parents 837bf37 + d0a26ac commit 6201eab

File tree

7 files changed

+429
-443
lines changed

7 files changed

+429
-443
lines changed

compiler/rustc_ast/src/token.rs

+1-6
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ use rustc_span::symbol::{kw, sym};
1313
use rustc_span::symbol::{Ident, Symbol};
1414
use rustc_span::{self, edition::Edition, Span, DUMMY_SP};
1515
use std::borrow::Cow;
16-
use std::{fmt, mem};
16+
use std::fmt;
1717

1818
#[derive(Clone, Copy, PartialEq, Encodable, Decodable, Debug, HashStable_Generic)]
1919
pub enum CommentKind {
@@ -335,11 +335,6 @@ impl Token {
335335
Token::new(Ident(ident.name, ident.is_raw_guess()), ident.span)
336336
}
337337

338-
/// Return this token by value and leave a dummy token in its place.
339-
pub fn take(&mut self) -> Self {
340-
mem::replace(self, Token::dummy())
341-
}
342-
343338
/// For interpolated tokens, returns a span of the fragment to which the interpolated
344339
/// token refers. For all other tokens this is just a regular span.
345340
/// It is particularly important to use this for identifiers and lifetimes

compiler/rustc_errors/src/lib.rs

+2-1
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,8 @@ pub mod translation;
6262
pub use diagnostic_builder::IntoDiagnostic;
6363
pub use snippet::Style;
6464

65-
pub type PResult<'a, T> = Result<T, DiagnosticBuilder<'a, ErrorGuaranteed>>;
65+
pub type PErr<'a> = DiagnosticBuilder<'a, ErrorGuaranteed>;
66+
pub type PResult<'a, T> = Result<T, PErr<'a>>;
6667

6768
// `PResult` is used a lot. Make sure it doesn't unintentionally get bigger.
6869
// (See also the comment on `DiagnosticBuilder`'s `diagnostic` field.)

compiler/rustc_lexer/src/cursor.rs

+8-8
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,8 @@ use std::str::Chars;
44
///
55
/// Next characters can be peeked via `first` method,
66
/// and position can be shifted forward via `bump` method.
7-
pub(crate) struct Cursor<'a> {
8-
initial_len: usize,
7+
pub struct Cursor<'a> {
8+
len_remaining: usize,
99
/// Iterator over chars. Slightly faster than a &str.
1010
chars: Chars<'a>,
1111
#[cfg(debug_assertions)]
@@ -15,9 +15,9 @@ pub(crate) struct Cursor<'a> {
1515
pub(crate) const EOF_CHAR: char = '\0';
1616

1717
impl<'a> Cursor<'a> {
18-
pub(crate) fn new(input: &'a str) -> Cursor<'a> {
18+
pub fn new(input: &'a str) -> Cursor<'a> {
1919
Cursor {
20-
initial_len: input.len(),
20+
len_remaining: input.len(),
2121
chars: input.chars(),
2222
#[cfg(debug_assertions)]
2323
prev: EOF_CHAR,
@@ -61,13 +61,13 @@ impl<'a> Cursor<'a> {
6161
}
6262

6363
/// Returns the number of bytes consumed so far within the current token.
64-
pub(crate) fn len_consumed(&self) -> u32 {
65-
(self.initial_len - self.chars.as_str().len()) as u32
64+
pub(crate) fn pos_within_token(&self) -> u32 {
65+
(self.len_remaining - self.chars.as_str().len()) as u32
6666
}
6767

6868
/// Resets the number of bytes consumed to 0.
69-
pub(crate) fn reset_len_consumed(&mut self) {
70-
self.initial_len = self.chars.as_str().len();
69+
pub(crate) fn reset_pos_within_token(&mut self) {
70+
self.len_remaining = self.chars.as_str().len();
7171
}
7272

7373
/// Moves to the next character.

compiler/rustc_lexer/src/lib.rs

+26-27
Original file line number | Diff line number | Diff line change
@@ -29,9 +29,11 @@ pub mod unescape;
2929
#[cfg(test)]
3030
mod tests;
3131

32+
pub use crate::cursor::Cursor;
33+
3234
use self::LiteralKind::*;
3335
use self::TokenKind::*;
34-
use crate::cursor::{Cursor, EOF_CHAR};
36+
use crate::cursor::EOF_CHAR;
3537
use std::convert::TryFrom;
3638

3739
/// Parsed token.
@@ -139,6 +141,9 @@ pub enum TokenKind {
139141

140142
/// Unknown token, not expected by the lexer, e.g. "№"
141143
Unknown,
144+
145+
/// End of input.
146+
Eof,
142147
}
143148

144149
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -219,13 +224,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
219224
None
220225
}
221226

222-
/// Parses the first token from the provided input string.
223-
#[inline]
224-
pub fn first_token(input: &str) -> Token {
225-
debug_assert!(!input.is_empty());
226-
Cursor::new(input).advance_token()
227-
}
228-
229227
/// Validates a raw string literal. Used for getting more information about a
230228
/// problem with a `RawStr`/`RawByteStr` with a `None` field.
231229
#[inline]
@@ -243,12 +241,8 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
243241
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
244242
let mut cursor = Cursor::new(input);
245243
std::iter::from_fn(move || {
246-
if cursor.is_eof() {
247-
None
248-
} else {
249-
cursor.reset_len_consumed();
250-
Some(cursor.advance_token())
251-
}
244+
let token = cursor.advance_token();
245+
if token.kind != TokenKind::Eof { Some(token) } else { None }
252246
})
253247
}
254248

@@ -311,8 +305,11 @@ pub fn is_ident(string: &str) -> bool {
311305

312306
impl Cursor<'_> {
313307
/// Parses a token from the input string.
314-
fn advance_token(&mut self) -> Token {
315-
let first_char = self.bump().unwrap();
308+
pub fn advance_token(&mut self) -> Token {
309+
let first_char = match self.bump() {
310+
Some(c) => c,
311+
None => return Token::new(TokenKind::Eof, 0),
312+
};
316313
let token_kind = match first_char {
317314
// Slash, comment or block comment.
318315
'/' => match self.first() {
@@ -329,7 +326,7 @@ impl Cursor<'_> {
329326
('#', c1) if is_id_start(c1) => self.raw_ident(),
330327
('#', _) | ('"', _) => {
331328
let res = self.raw_double_quoted_string(1);
332-
let suffix_start = self.len_consumed();
329+
let suffix_start = self.pos_within_token();
333330
if res.is_ok() {
334331
self.eat_literal_suffix();
335332
}
@@ -344,7 +341,7 @@ impl Cursor<'_> {
344341
('\'', _) => {
345342
self.bump();
346343
let terminated = self.single_quoted_string();
347-
let suffix_start = self.len_consumed();
344+
let suffix_start = self.pos_within_token();
348345
if terminated {
349346
self.eat_literal_suffix();
350347
}
@@ -354,7 +351,7 @@ impl Cursor<'_> {
354351
('"', _) => {
355352
self.bump();
356353
let terminated = self.double_quoted_string();
357-
let suffix_start = self.len_consumed();
354+
let suffix_start = self.pos_within_token();
358355
if terminated {
359356
self.eat_literal_suffix();
360357
}
@@ -364,7 +361,7 @@ impl Cursor<'_> {
364361
('r', '"') | ('r', '#') => {
365362
self.bump();
366363
let res = self.raw_double_quoted_string(2);
367-
let suffix_start = self.len_consumed();
364+
let suffix_start = self.pos_within_token();
368365
if res.is_ok() {
369366
self.eat_literal_suffix();
370367
}
@@ -381,7 +378,7 @@ impl Cursor<'_> {
381378
// Numeric literal.
382379
c @ '0'..='9' => {
383380
let literal_kind = self.number(c);
384-
let suffix_start = self.len_consumed();
381+
let suffix_start = self.pos_within_token();
385382
self.eat_literal_suffix();
386383
TokenKind::Literal { kind: literal_kind, suffix_start }
387384
}
@@ -420,7 +417,7 @@ impl Cursor<'_> {
420417
// String literal.
421418
'"' => {
422419
let terminated = self.double_quoted_string();
423-
let suffix_start = self.len_consumed();
420+
let suffix_start = self.pos_within_token();
424421
if terminated {
425422
self.eat_literal_suffix();
426423
}
@@ -433,7 +430,9 @@ impl Cursor<'_> {
433430
}
434431
_ => Unknown,
435432
};
436-
Token::new(token_kind, self.len_consumed())
433+
let res = Token::new(token_kind, self.pos_within_token());
434+
self.reset_pos_within_token();
435+
res
437436
}
438437

439438
fn line_comment(&mut self) -> TokenKind {
@@ -618,7 +617,7 @@ impl Cursor<'_> {
618617

619618
if !can_be_a_lifetime {
620619
let terminated = self.single_quoted_string();
621-
let suffix_start = self.len_consumed();
620+
let suffix_start = self.pos_within_token();
622621
if terminated {
623622
self.eat_literal_suffix();
624623
}
@@ -643,7 +642,7 @@ impl Cursor<'_> {
643642
if self.first() == '\'' {
644643
self.bump();
645644
let kind = Char { terminated: true };
646-
Literal { kind, suffix_start: self.len_consumed() }
645+
Literal { kind, suffix_start: self.pos_within_token() }
647646
} else {
648647
Lifetime { starts_with_number }
649648
}
@@ -724,7 +723,7 @@ impl Cursor<'_> {
724723

725724
fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
726725
debug_assert!(self.prev() == 'r');
727-
let start_pos = self.len_consumed();
726+
let start_pos = self.pos_within_token();
728727
let mut possible_terminator_offset = None;
729728
let mut max_hashes = 0;
730729

@@ -778,7 +777,7 @@ impl Cursor<'_> {
778777
// Keep track of possible terminators to give a hint about
779778
// where there might be a missing terminator
780779
possible_terminator_offset =
781-
Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
780+
Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
782781
max_hashes = n_end_hashes;
783782
}
784783
}

0 commit comments

Comments
 (0)