Skip to content

Commit 1893ac6

Browse files
authored
Rollup merge of #62963 - estebank:homoglyph-recovery, r=petrochenkov
Allow lexer to recover from some homoglyphs
2 parents c6c8693 + 6844976 commit 1893ac6

6 files changed

+89
-36
lines changed

Diff for: src/libsyntax/parse/lexer/mod.rs

+12-2
Original file line numberDiff line numberDiff line change
@@ -389,8 +389,18 @@ impl<'a> StringReader<'a> {
389389
self.pos,
390390
"unknown start of token",
391391
c);
392-
unicode_chars::check_for_substitution(self, start, c, &mut err);
393-
return Err(err)
392+
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs into
393+
// the token, instead of keeping a table in `check_for_substitution`. Ideally,
394+
// this should be inside `rustc_lexer`. However, we should first remove compound
395+
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
396+
// as there will be less overall work to do this way.
397+
return match unicode_chars::check_for_substitution(self, start, c, &mut err) {
398+
Some(token) => {
399+
err.emit();
400+
Ok(token)
401+
}
402+
None => Err(err),
403+
}
394404
}
395405
};
396406
Ok(kind)

Diff for: src/libsyntax/parse/lexer/unicode_chars.rs

+40-33
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33

44
use super::StringReader;
55
use errors::{Applicability, DiagnosticBuilder};
6-
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
6+
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION, symbol::kw};
7+
use crate::parse::token;
78

89
#[rustfmt::skip] // for line breaks
910
const UNICODE_ARRAY: &[(char, &str, char)] = &[
@@ -297,53 +298,59 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
297298
('＞', "Fullwidth Greater-Than Sign", '>'),
298299
];
299300

300-
const ASCII_ARRAY: &[(char, &str)] = &[
301-
(' ', "Space"),
302-
('_', "Underscore"),
303-
('-', "Minus/Hyphen"),
304-
(',', "Comma"),
305-
(';', "Semicolon"),
306-
(':', "Colon"),
307-
('!', "Exclamation Mark"),
308-
('?', "Question Mark"),
309-
('.', "Period"),
310-
('\'', "Single Quote"),
311-
('"', "Quotation Mark"),
312-
('(', "Left Parenthesis"),
313-
(')', "Right Parenthesis"),
314-
('[', "Left Square Bracket"),
315-
(']', "Right Square Bracket"),
316-
('{', "Left Curly Brace"),
317-
('}', "Right Curly Brace"),
318-
('*', "Asterisk"),
319-
('/', "Slash"),
320-
('\\', "Backslash"),
321-
('&', "Ampersand"),
322-
('+', "Plus Sign"),
323-
('<', "Less-Than Sign"),
324-
('=', "Equals Sign"),
325-
('>', "Greater-Than Sign"),
301+
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs into tokens,
302+
// instead of keeping the substitution token in this table. Ideally, this should be inside
303+
// `rustc_lexer`. However, we should first remove compound tokens like `<<` from `rustc_lexer`,
304+
// and then add fancier error recovery to it, as there will be less overall work to do this way.
305+
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
306+
(' ', "Space", Some(token::Whitespace)),
307+
('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
308+
('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
309+
(',', "Comma", Some(token::Comma)),
310+
(';', "Semicolon", Some(token::Semi)),
311+
(':', "Colon", Some(token::Colon)),
312+
('!', "Exclamation Mark", Some(token::Not)),
313+
('?', "Question Mark", Some(token::Question)),
314+
('.', "Period", Some(token::Dot)),
315+
('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))),
316+
(')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))),
317+
('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))),
318+
(']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))),
319+
('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))),
320+
('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))),
321+
('*', "Asterisk", Some(token::BinOp(token::Star))),
322+
('/', "Slash", Some(token::BinOp(token::Slash))),
323+
('\\', "Backslash", None),
324+
('&', "Ampersand", Some(token::BinOp(token::And))),
325+
('+', "Plus Sign", Some(token::BinOp(token::Plus))),
326+
('<', "Less-Than Sign", Some(token::Lt)),
327+
('=', "Equals Sign", Some(token::Eq)),
328+
('>', "Greater-Than Sign", Some(token::Gt)),
329+
// FIXME: Literals are already lexed by this point, so we can't recover gracefully just by
330+
// spitting the correct token out.
331+
('\'', "Single Quote", None),
332+
('"', "Quotation Mark", None),
326333
];
327334

328335
crate fn check_for_substitution<'a>(
329336
reader: &StringReader<'a>,
330337
pos: BytePos,
331338
ch: char,
332339
err: &mut DiagnosticBuilder<'a>,
333-
) -> bool {
340+
) -> Option<token::TokenKind> {
334341
let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
335342
Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
336-
None => return false,
343+
None => return None,
337344
};
338345

339346
let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);
340347

341-
let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
342-
Some((_ascii_char, ascii_name)) => ascii_name,
348+
let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) {
349+
Some((_ascii_char, ascii_name, token)) => (ascii_name, token),
343350
None => {
344351
let msg = format!("substitution character not found for '{}'", ch);
345352
reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
346-
return false;
353+
return None;
347354
}
348355
};
349356

@@ -371,7 +378,7 @@ crate fn check_for_substitution<'a>(
371378
);
372379
err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
373380
}
374-
true
381+
token.clone()
375382
}
376383

377384
/// Extract string if found at current position with given delimiters
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
22
//~^ ERROR expected at least one digit in exponent
33
//~| ERROR unknown start of token: \u{2212}
4+
//~| ERROR cannot subtract `{integer}` from `{float}`
45

56
fn main() {}

Diff for: src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr

+10-1
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,14 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it
1414
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
1515
| ^
1616

17-
error: aborting due to 2 previous errors
17+
error[E0277]: cannot subtract `{integer}` from `{float}`
18+
--> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53
19+
|
20+
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
21+
| ^ no implementation for `{float} - {integer}`
22+
|
23+
= help: the trait `std::ops::Sub<{integer}>` is not implemented for `{float}`
24+
25+
error: aborting due to 3 previous errors
1826

27+
For more information about this error, try `rustc --explain E0277`.

Diff for: src/test/ui/parser/recover-from-homoglyph.rs

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
fn main() {
2+
println!("")； //~ ERROR unknown start of token: \u{37e}
3+
let x: usize = (); //~ ERROR mismatched types
4+
}

Diff for: src/test/ui/parser/recover-from-homoglyph.stderr

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
error: unknown start of token: \u{37e}
2+
--> $DIR/recover-from-homoglyph.rs:2:17
3+
|
4+
LL | println!("")；
5+
| ^
6+
help: Unicode character '；' (Greek Question Mark) looks like ';' (Semicolon), but it is not
7+
|
8+
LL | println!("");
9+
| ^
10+
11+
error[E0308]: mismatched types
12+
--> $DIR/recover-from-homoglyph.rs:3:20
13+
|
14+
LL | let x: usize = ();
15+
| ^^ expected usize, found ()
16+
|
17+
= note: expected type `usize`
18+
found type `()`
19+
20+
error: aborting due to 2 previous errors
21+
22+
For more information about this error, try `rustc --explain E0308`.

0 commit comments

Comments
 (0)