Skip to content

Commit 466d3e7

Browse files
authored
Rollup merge of rust-lang#72884 - Julian-Wollersberger:raw_str_error_cleanup, r=petrochenkov
RawString error reporting cleanup I simplified how errors with raw strings are represented in the lexer and reported in the parser, by using one enum instead of two structs with impls. This makes 70 code lines obsolete. I also noticed some other things (2nd commit) and added a missing test for the `too many '#' symbols` error. My original intent was to improve performance, but the only thing I found was to inline some functions in `cursor.rs`. Its effect is barely measurable, though. There is one open question. Before, the compiler aborts when encountering the `too many '#' symbols` error. Now the lexer says in this case that there are 0 hashes, and then later the parser aborts on the error. I'm worrying that the parser may be changed to recover and continue, and then later stages will see the wrong number of hashes and act strange (e.g. the `format!` macro expansion). Is that possibility important enough today to worry about it?
2 parents eeaf497 + 7be8077 commit 466d3e7

File tree

3 files changed

+90
-240
lines changed

3 files changed

+90
-240
lines changed

src/librustc_lexer/src/lib.rs

+45-105
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ mod tests;
2929
use self::LiteralKind::*;
3030
use self::TokenKind::*;
3131
use crate::cursor::{Cursor, EOF_CHAR};
32-
use std::convert::TryInto;
32+
use std::convert::TryFrom;
3333

3434
/// Parsed token.
3535
/// It doesn't contain information about data that has been parsed,
@@ -142,84 +142,24 @@ pub enum LiteralKind {
142142
/// "b"abc"", "b"abc"
143143
ByteStr { terminated: bool },
144144
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
145-
RawStr(UnvalidatedRawStr),
145+
RawStr { n_hashes: u16, err: Option<RawStrError> },
146146
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
147-
RawByteStr(UnvalidatedRawStr),
148-
}
149-
150-
/// Represents something that looks like a raw string, but may have some
151-
/// problems. Use `.validate()` to convert it into something
152-
/// usable.
153-
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
154-
pub struct UnvalidatedRawStr {
155-
/// The prefix (`r###"`) is valid
156-
valid_start: bool,
157-
158-
/// The postfix (`"###`) is valid
159-
valid_end: bool,
160-
161-
/// The number of leading `#`
162-
n_start_hashes: usize,
163-
/// The number of trailing `#`. `n_end_hashes` <= `n_start_hashes`
164-
n_end_hashes: usize,
165-
/// The offset starting at `r` or `br` where the user may have intended to end the string.
166-
/// Currently, it is the longest sequence of pattern `"#+"`.
167-
possible_terminator_offset: Option<usize>,
147+
RawByteStr { n_hashes: u16, err: Option<RawStrError> },
168148
}
169149

170150
/// Error produced validating a raw string. Represents cases like:
171-
/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
172-
/// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
173-
/// - Too many `#`s (>65536): `TooManyDelimiters`
151+
/// - `r##~"abcde"##`: `InvalidStarter`
152+
/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
153+
/// - Too many `#`s (>65535): `TooManyDelimiters`
174154
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
175-
pub enum LexRawStrError {
155+
pub enum RawStrError {
176156
/// Non `#` characters exist between `r` and `"` eg. `r#~"..`
177-
InvalidStarter,
157+
InvalidStarter { bad_char: char },
178158
/// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
179159
/// may have intended to terminate it.
180160
NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
181-
/// More than 65536 `#`s exist.
182-
TooManyDelimiters,
183-
}
184-
185-
/// Raw String that contains a valid prefix (`#+"`) and postfix (`"#+`) where
186-
/// there are a matching number of `#` characters in both. Note that this will
187-
/// not consume extra trailing `#` characters: `r###"abcde"####` is lexed as a
188-
/// `ValidatedRawString { n_hashes: 3 }` followed by a `#` token.
189-
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
190-
pub struct ValidatedRawStr {
191-
n_hashes: u16,
192-
}
193-
194-
impl ValidatedRawStr {
195-
pub fn num_hashes(&self) -> u16 {
196-
self.n_hashes
197-
}
198-
}
199-
200-
impl UnvalidatedRawStr {
201-
pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
202-
if !self.valid_start {
203-
return Err(LexRawStrError::InvalidStarter);
204-
}
205-
206-
// Only up to 65535 `#`s are allowed in raw strings
207-
let n_start_safe: u16 =
208-
self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
209-
210-
if self.n_start_hashes > self.n_end_hashes || !self.valid_end {
211-
Err(LexRawStrError::NoTerminator {
212-
expected: self.n_start_hashes,
213-
found: self.n_end_hashes,
214-
possible_terminator_offset: self.possible_terminator_offset,
215-
})
216-
} else {
217-
// Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end,
218-
// they must be equal.
219-
debug_assert_eq!(self.n_start_hashes, self.n_end_hashes);
220-
Ok(ValidatedRawStr { n_hashes: n_start_safe })
221-
}
222-
}
161+
/// More than 65535 `#`s exist.
162+
TooManyDelimiters { found: usize },
223163
}
224164

225165
/// Base of numeric literal encoding according to its prefix.
@@ -354,12 +294,12 @@ impl Cursor<'_> {
354294
'r' => match (self.first(), self.second()) {
355295
('#', c1) if is_id_start(c1) => self.raw_ident(),
356296
('#', _) | ('"', _) => {
357-
let raw_str_i = self.raw_double_quoted_string(1);
297+
let (n_hashes, err) = self.raw_double_quoted_string(1);
358298
let suffix_start = self.len_consumed();
359-
if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
299+
if err.is_none() {
360300
self.eat_literal_suffix();
361301
}
362-
let kind = RawStr(raw_str_i);
302+
let kind = RawStr { n_hashes, err };
363303
Literal { kind, suffix_start }
364304
}
365305
_ => self.ident(),
@@ -389,14 +329,12 @@ impl Cursor<'_> {
389329
}
390330
('r', '"') | ('r', '#') => {
391331
self.bump();
392-
let raw_str_i = self.raw_double_quoted_string(2);
332+
let (n_hashes, err) = self.raw_double_quoted_string(2);
393333
let suffix_start = self.len_consumed();
394-
let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
395-
if terminated {
334+
if err.is_none() {
396335
self.eat_literal_suffix();
397336
}
398-
399-
let kind = RawByteStr(raw_str_i);
337+
let kind = RawByteStr { n_hashes, err };
400338
Literal { kind, suffix_start }
401339
}
402340
_ => self.ident(),
@@ -692,27 +630,34 @@ impl Cursor<'_> {
692630
false
693631
}
694632

695-
/// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
696-
fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
633+
/// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
634+
fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) {
635+
// Wrap the actual function to handle the error with too many hashes.
636+
// This way, it eats the whole raw string.
637+
let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
638+
// Only up to 65535 `#`s are allowed in raw strings
639+
match u16::try_from(n_hashes) {
640+
Ok(num) => (num, err),
641+
// We lie about the number of hashes here :P
642+
Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
643+
}
644+
}
645+
646+
fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
697647
debug_assert!(self.prev() == 'r');
698-
let mut valid_start: bool = false;
699648
let start_pos = self.len_consumed();
700-
let (mut possible_terminator_offset, mut max_hashes) = (None, 0);
649+
let mut possible_terminator_offset = None;
650+
let mut max_hashes = 0;
701651

702652
// Count opening '#' symbols.
703653
let n_start_hashes = self.eat_while(|c| c == '#');
704654

705655
// Check that string is started.
706656
match self.bump() {
707-
Some('"') => valid_start = true,
708-
_ => {
709-
return UnvalidatedRawStr {
710-
valid_start,
711-
valid_end: false,
712-
n_start_hashes,
713-
n_end_hashes: 0,
714-
possible_terminator_offset,
715-
};
657+
Some('"') => (),
658+
c => {
659+
let c = c.unwrap_or(EOF_CHAR);
660+
return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
716661
}
717662
}
718663

@@ -722,13 +667,14 @@ impl Cursor<'_> {
722667
self.eat_while(|c| c != '"');
723668

724669
if self.is_eof() {
725-
return UnvalidatedRawStr {
726-
valid_start,
727-
valid_end: false,
670+
return (
728671
n_start_hashes,
729-
n_end_hashes: max_hashes,
730-
possible_terminator_offset,
731-
};
672+
Some(RawStrError::NoTerminator {
673+
expected: n_start_hashes,
674+
found: max_hashes,
675+
possible_terminator_offset,
676+
}),
677+
);
732678
}
733679

734680
// Eat closing double quote.
@@ -737,7 +683,7 @@ impl Cursor<'_> {
737683
// Check that amount of closing '#' symbols
738684
// is equal to the amount of opening ones.
739685
// Note that this will not consume extra trailing `#` characters:
740-
// `r###"abcde"####` is lexed as a `LexedRawString { n_hashes: 3 }`
686+
// `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
741687
// followed by a `#` token.
742688
let mut hashes_left = n_start_hashes;
743689
let is_closing_hash = |c| {
@@ -751,13 +697,7 @@ impl Cursor<'_> {
751697
let n_end_hashes = self.eat_while(is_closing_hash);
752698

753699
if n_end_hashes == n_start_hashes {
754-
return UnvalidatedRawStr {
755-
valid_start,
756-
valid_end: true,
757-
n_start_hashes,
758-
n_end_hashes,
759-
possible_terminator_offset: None,
760-
};
700+
return (n_start_hashes, None);
761701
} else if n_end_hashes > max_hashes {
762702
// Keep track of possible terminators to give a hint about
763703
// where there might be a missing terminator

src/librustc_lexer/src/tests.rs

+16-84
Original file line numberDiff line numberDiff line change
@@ -2,92 +2,46 @@
22
mod tests {
33
use crate::*;
44

5-
fn check_raw_str(
6-
s: &str,
7-
expected: UnvalidatedRawStr,
8-
validated: Result<ValidatedRawStr, LexRawStrError>,
9-
) {
5+
fn check_raw_str(s: &str, expected_hashes: u16, expected_err: Option<RawStrError>) {
106
let s = &format!("r{}", s);
117
let mut cursor = Cursor::new(s);
128
cursor.bump();
13-
let tok = cursor.raw_double_quoted_string(0);
14-
assert_eq!(tok, expected);
15-
assert_eq!(tok.validate(), validated);
9+
let (n_hashes, err) = cursor.raw_double_quoted_string(0);
10+
assert_eq!(n_hashes, expected_hashes);
11+
assert_eq!(err, expected_err);
1612
}
1713

1814
#[test]
1915
fn test_naked_raw_str() {
20-
check_raw_str(
21-
r#""abc""#,
22-
UnvalidatedRawStr {
23-
n_start_hashes: 0,
24-
n_end_hashes: 0,
25-
valid_start: true,
26-
valid_end: true,
27-
possible_terminator_offset: None,
28-
},
29-
Ok(ValidatedRawStr { n_hashes: 0 }),
30-
);
16+
check_raw_str(r#""abc""#, 0, None);
3117
}
3218

3319
#[test]
3420
fn test_raw_no_start() {
35-
check_raw_str(
36-
r##""abc"#"##,
37-
UnvalidatedRawStr {
38-
n_start_hashes: 0,
39-
n_end_hashes: 0,
40-
valid_start: true,
41-
valid_end: true,
42-
possible_terminator_offset: None,
43-
},
44-
Ok(ValidatedRawStr { n_hashes: 0 }),
45-
);
21+
check_raw_str(r##""abc"#"##, 0, None);
4622
}
4723

4824
#[test]
4925
fn test_too_many_terminators() {
5026
// this error is handled in the parser later
51-
check_raw_str(
52-
r###"#"abc"##"###,
53-
UnvalidatedRawStr {
54-
n_start_hashes: 1,
55-
n_end_hashes: 1,
56-
valid_end: true,
57-
valid_start: true,
58-
possible_terminator_offset: None,
59-
},
60-
Ok(ValidatedRawStr { n_hashes: 1 }),
61-
);
27+
check_raw_str(r###"#"abc"##"###, 1, None);
6228
}
6329

6430
#[test]
6531
fn test_unterminated() {
6632
check_raw_str(
6733
r#"#"abc"#,
68-
UnvalidatedRawStr {
69-
n_start_hashes: 1,
70-
n_end_hashes: 0,
71-
valid_end: false,
72-
valid_start: true,
73-
possible_terminator_offset: None,
74-
},
75-
Err(LexRawStrError::NoTerminator {
34+
1,
35+
Some(RawStrError::NoTerminator {
7636
expected: 1,
7737
found: 0,
7838
possible_terminator_offset: None,
7939
}),
8040
);
8141
check_raw_str(
8242
r###"##"abc"#"###,
83-
UnvalidatedRawStr {
84-
n_start_hashes: 2,
85-
n_end_hashes: 1,
86-
valid_start: true,
87-
valid_end: false,
88-
possible_terminator_offset: Some(7),
89-
},
90-
Err(LexRawStrError::NoTerminator {
43+
2,
44+
Some(RawStrError::NoTerminator {
9145
expected: 2,
9246
found: 1,
9347
possible_terminator_offset: Some(7),
@@ -96,14 +50,8 @@ mod tests {
9650
// We're looking for "# not just any #
9751
check_raw_str(
9852
r###"##"abc#"###,
99-
UnvalidatedRawStr {
100-
n_start_hashes: 2,
101-
n_end_hashes: 0,
102-
valid_start: true,
103-
valid_end: false,
104-
possible_terminator_offset: None,
105-
},
106-
Err(LexRawStrError::NoTerminator {
53+
2,
54+
Some(RawStrError::NoTerminator {
10755
expected: 2,
10856
found: 0,
10957
possible_terminator_offset: None,
@@ -113,32 +61,16 @@ mod tests {
11361

11462
#[test]
11563
fn test_invalid_start() {
116-
check_raw_str(
117-
r##"#~"abc"#"##,
118-
UnvalidatedRawStr {
119-
n_start_hashes: 1,
120-
n_end_hashes: 0,
121-
valid_start: false,
122-
valid_end: false,
123-
possible_terminator_offset: None,
124-
},
125-
Err(LexRawStrError::InvalidStarter),
126-
);
64+
check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' }));
12765
}
12866

12967
#[test]
13068
fn test_unterminated_no_pound() {
13169
// https://github.com/rust-lang/rust/issues/70677
13270
check_raw_str(
13371
r#"""#,
134-
UnvalidatedRawStr {
135-
n_start_hashes: 0,
136-
n_end_hashes: 0,
137-
valid_start: true,
138-
valid_end: false,
139-
possible_terminator_offset: None,
140-
},
141-
Err(LexRawStrError::NoTerminator {
72+
0,
73+
Some(RawStrError::NoTerminator {
14274
expected: 0,
14375
found: 0,
14476
possible_terminator_offset: None,

0 commit comments

Comments
 (0)