Skip to content

Commit b4dcb2c

Browse files
authored
Rollup merge of #88781 - estebank:emoji-idents, r=oli-obk
Tokenize emoji as if they were valid identifiers. In the lexer, consider emojis to be valid identifiers and reject them later to avoid knock-down parse errors. Partially addresses #86102.
2 parents a8a4504 + 30f9807 commit b4dcb2c

File tree

12 files changed

+231
-15
lines changed

12 files changed

+231
-15
lines changed

Cargo.lock

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3980,6 +3980,7 @@ name = "rustc_lexer"
39803980
version = "0.1.0"
39813981
dependencies = [
39823982
"expect-test",
3983+
"unic-emoji-char",
39833984
"unicode-xid",
39843985
]
39853986

@@ -5443,6 +5444,47 @@ version = "0.1.3"
54435444
source = "registry+https://github.com/rust-lang/crates.io-index"
54445445
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
54455446

5447+
[[package]]
5448+
name = "unic-char-property"
5449+
version = "0.9.0"
5450+
source = "registry+https://github.com/rust-lang/crates.io-index"
5451+
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
5452+
dependencies = [
5453+
"unic-char-range",
5454+
]
5455+
5456+
[[package]]
5457+
name = "unic-char-range"
5458+
version = "0.9.0"
5459+
source = "registry+https://github.com/rust-lang/crates.io-index"
5460+
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
5461+
5462+
[[package]]
5463+
name = "unic-common"
5464+
version = "0.9.0"
5465+
source = "registry+https://github.com/rust-lang/crates.io-index"
5466+
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
5467+
5468+
[[package]]
5469+
name = "unic-emoji-char"
5470+
version = "0.9.0"
5471+
source = "registry+https://github.com/rust-lang/crates.io-index"
5472+
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
5473+
dependencies = [
5474+
"unic-char-property",
5475+
"unic-char-range",
5476+
"unic-ucd-version",
5477+
]
5478+
5479+
[[package]]
5480+
name = "unic-ucd-version"
5481+
version = "0.9.0"
5482+
source = "registry+https://github.com/rust-lang/crates.io-index"
5483+
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
5484+
dependencies = [
5485+
"unic-common",
5486+
]
5487+
54465488
[[package]]
54475489
name = "unicase"
54485490
version = "2.6.0"

compiler/rustc_errors/src/emitter.rs

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ impl EmitterWriter {
721721
}
722722

723723
let source_string = match file.get_line(line.line_index - 1) {
724-
Some(s) => replace_tabs(&*s),
724+
Some(s) => normalize_whitespace(&*s),
725725
None => return Vec::new(),
726726
};
727727

@@ -1272,7 +1272,7 @@ impl EmitterWriter {
12721272
buffer.append(0, ": ", header_style);
12731273
}
12741274
for &(ref text, _) in msg.iter() {
1275-
buffer.append(0, &replace_tabs(text), header_style);
1275+
buffer.append(0, &normalize_whitespace(text), header_style);
12761276
}
12771277
}
12781278

@@ -1526,7 +1526,7 @@ impl EmitterWriter {
15261526

15271527
self.draw_line(
15281528
&mut buffer,
1529-
&replace_tabs(&unannotated_line),
1529+
&normalize_whitespace(&unannotated_line),
15301530
annotated_file.lines[line_idx + 1].line_index - 1,
15311531
last_buffer_line_num,
15321532
width_offset,
@@ -1648,7 +1648,7 @@ impl EmitterWriter {
16481648
buffer.puts(
16491649
row_num - 1,
16501650
max_line_num_len + 3,
1651-
&replace_tabs(
1651+
&normalize_whitespace(
16521652
&*file_lines
16531653
.file
16541654
.get_line(file_lines.lines[line_pos].line_index)
@@ -1674,7 +1674,7 @@ impl EmitterWriter {
16741674
}
16751675

16761676
// print the suggestion
1677-
buffer.append(row_num, &replace_tabs(line), Style::NoStyle);
1677+
buffer.append(row_num, &normalize_whitespace(line), Style::NoStyle);
16781678

16791679
// Colorize addition/replacements with green.
16801680
for &SubstitutionHighlight { start, end } in highlight_parts {
@@ -2054,8 +2054,17 @@ fn num_decimal_digits(num: usize) -> usize {
20542054
MAX_DIGITS
20552055
}
20562056

2057-
fn replace_tabs(str: &str) -> String {
2058-
str.replace('\t', " ")
2057+
const REPLACEMENTS: &[(char, &str)] = &[
2058+
('\t', " "),
2059+
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
2060+
];
2061+
2062+
fn normalize_whitespace(str: &str) -> String {
2063+
let mut output = str.to_string();
2064+
for (c, replacement) in REPLACEMENTS {
2065+
output = output.replace(*c, replacement);
2066+
}
2067+
output
20592068
}
20602069

20612070
fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {

compiler/rustc_interface/src/passes.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata};
3535
use rustc_session::search_paths::PathKind;
3636
use rustc_session::Session;
3737
use rustc_span::symbol::{Ident, Symbol};
38-
use rustc_span::FileName;
38+
use rustc_span::{FileName, MultiSpan};
3939
use rustc_trait_selection::traits;
4040
use rustc_typeck as typeck;
4141
use tempfile::Builder as TempFileBuilder;
@@ -445,6 +445,19 @@ pub fn configure_and_expand(
445445
}
446446
});
447447

448+
// Gate identifiers containing invalid Unicode codepoints that were recovered during lexing.
449+
sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| {
450+
let mut identifiers: Vec<_> = identifiers.drain().collect();
451+
identifiers.sort_by_key(|&(key, _)| key);
452+
for (ident, mut spans) in identifiers.into_iter() {
453+
spans.sort();
454+
sess.diagnostic().span_err(
455+
MultiSpan::from(spans),
456+
&format!("identifiers cannot contain emoji: `{}`", ident),
457+
);
458+
}
459+
});
460+
448461
Ok(krate)
449462
}
450463

compiler/rustc_lexer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ doctest = false
1717
# Note that this crate purposefully does not depend on other rustc crates
1818
[dependencies]
1919
unicode-xid = "0.2.0"
20+
unic-emoji-char = "0.9.0"
2021

2122
[dev-dependencies]
2223
expect-test = "1.0"

compiler/rustc_lexer/src/lib.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ pub enum TokenKind {
6464
/// "ident" or "continue"
6565
/// At this step keywords are also considered identifiers.
6666
Ident,
67+
/// Like the above, but containing invalid unicode codepoints.
68+
InvalidIdent,
6769
/// "r#ident"
6870
RawIdent,
6971
/// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
@@ -411,6 +413,10 @@ impl Cursor<'_> {
411413
let kind = Str { terminated };
412414
Literal { kind, suffix_start }
413415
}
416+
// Identifier starting with an emoji. Only lexed for graceful error recovery.
417+
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
418+
self.fake_ident_or_unknown_prefix()
419+
}
414420
_ => Unknown,
415421
};
416422
Token::new(token_kind, self.len_consumed())
@@ -492,10 +498,28 @@ impl Cursor<'_> {
492498
// we see a prefix here, it is definitely an unknown prefix.
493499
match self.first() {
494500
'#' | '"' | '\'' => UnknownPrefix,
501+
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
502+
self.fake_ident_or_unknown_prefix()
503+
}
495504
_ => Ident,
496505
}
497506
}
498507

508+
fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
509+
// Start is already eaten, eat the rest of identifier.
510+
self.eat_while(|c| {
511+
unicode_xid::UnicodeXID::is_xid_continue(c)
512+
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
513+
|| c == '\u{200d}'
514+
});
515+
// Known prefixes must have been handled earlier. So if
516+
// we see a prefix here, it is definitely an unknown prefix.
517+
match self.first() {
518+
'#' | '"' | '\'' => UnknownPrefix,
519+
_ => InvalidIdent,
520+
}
521+
}
522+
499523
fn number(&mut self, first_digit: char) -> LiteralKind {
500524
debug_assert!('0' <= self.prev() && self.prev() <= '9');
501525
let mut base = Base::Decimal;

compiler/rustc_parse/src/lexer/mod.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use crate::lexer::unicode_chars::UNICODE_ARRAY;
12
use rustc_ast::ast::{self, AttrStyle};
23
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
34
use rustc_ast::tokenstream::{Spacing, TokenStream};
@@ -191,6 +192,22 @@ impl<'a> StringReader<'a> {
191192
}
192193
token::Ident(sym, is_raw_ident)
193194
}
195+
rustc_lexer::TokenKind::InvalidIdent
196+
// Do not recover an identifier with emoji if the codepoint is a confusable
197+
// with a recoverable substitution token, like `➖`.
198+
if UNICODE_ARRAY
199+
.iter()
200+
.find(|&&(c, _, _)| {
201+
let sym = self.str_from(start);
202+
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
203+
})
204+
.is_none() =>
205+
{
206+
let sym = nfc_normalize(self.str_from(start));
207+
let span = self.mk_sp(start, self.pos);
208+
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
209+
token::Ident(sym, false)
210+
}
194211
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
195212
let suffix_start = start + BytePos(suffix_start as u32);
196213
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@@ -262,7 +279,7 @@ impl<'a> StringReader<'a> {
262279
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
263280
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
264281

265-
rustc_lexer::TokenKind::Unknown => {
282+
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
266283
let c = self.str_from(start).chars().next().unwrap();
267284
let mut err =
268285
self.struct_fatal_span_char(start, self.pos, "unknown start of token", c);

compiler/rustc_parse/src/lexer/unicode_chars.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder};
77
use rustc_span::{symbol::kw, BytePos, Pos, Span};
88

99
#[rustfmt::skip] // for line breaks
10-
const UNICODE_ARRAY: &[(char, &str, char)] = &[
10+
pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
1111
('\u{2028}', "Line Separator", ' '),
1212
('\u{2029}', "Paragraph Separator", ' '),
1313
(' ', "Ogham Space mark", ' '),

compiler/rustc_session/src/parse.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,13 @@ pub struct ParseSess {
119119
pub config: CrateConfig,
120120
pub edition: Edition,
121121
pub missing_fragment_specifiers: Lock<FxHashMap<Span, NodeId>>,
122-
/// Places where raw identifiers were used. This is used for feature-gating raw identifiers.
122+
/// Places where raw identifiers were used. This is used to avoid complaining about idents
123+
/// clashing with keywords in new editions.
123124
pub raw_identifier_spans: Lock<Vec<Span>>,
125+
/// Places where identifiers that contain invalid Unicode codepoints, but otherwise look like
126+
/// identifiers, were used. Useful to avoid bad tokenization when encountering emoji. We group
127+
/// them to provide a single error per unique incorrect identifier.
128+
pub bad_unicode_identifiers: Lock<FxHashMap<Symbol, Vec<Span>>>,
124129
source_map: Lrc<SourceMap>,
125130
pub buffered_lints: Lock<Vec<BufferedEarlyLint>>,
126131
/// Contains the spans of block expressions that could have been incomplete based on the
@@ -160,6 +165,7 @@ impl ParseSess {
160165
edition: ExpnId::root().expn_data().edition,
161166
missing_fragment_specifiers: Default::default(),
162167
raw_identifier_spans: Lock::new(Vec::new()),
168+
bad_unicode_identifiers: Lock::new(Default::default()),
163169
source_map,
164170
buffered_lints: Lock::new(vec![]),
165171
ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),

src/librustdoc/html/highlight.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ impl<'a> Classifier<'a> {
489489
},
490490
Some(c) => c,
491491
},
492-
TokenKind::RawIdent | TokenKind::UnknownPrefix => {
492+
TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
493493
Class::Ident(self.new_span(before, text))
494494
}
495495
TokenKind::Lifetime { .. } => Class::Lifetime,
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
struct ABig👩‍👩‍👧‍👧Family; //~ ERROR identifiers cannot contain emoji
2+
struct 👀; //~ ERROR identifiers cannot contain emoji
3+
impl 👀 {
4+
fn full_of_✨() -> 👀 { //~ ERROR identifiers cannot contain emoji
5+
👀
6+
}
7+
}
8+
fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emoji
9+
👀::full_of✨() //~ ERROR no function or associated item named `full_of✨` found for struct `👀`
10+
//~^ ERROR identifiers cannot contain emoji
11+
}
12+
fn main() {
13+
let _ = i_like_to_😄_a_lot() ➖ 4; //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
14+
//~^ ERROR identifiers cannot contain emoji
15+
//~| ERROR unknown start of token: \u{2796}
16+
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
error: unknown start of token: \u{2796}
2+
--> $DIR/emoji-identifiers.rs:13:33
3+
|
4+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
5+
| ^^
6+
|
7+
help: Unicode character '➖' (Heavy Minus Sign) looks like '-' (Minus/Hyphen), but it is not
8+
|
9+
LL | let _ = i_like_to_😄_a_lot() - 4;
10+
| ~
11+
12+
error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
13+
--> $DIR/emoji-identifiers.rs:13:13
14+
|
15+
LL | fn i_like_to_😅_a_lot() -> 👀 {
16+
| ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
17+
...
18+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
19+
| ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`
20+
21+
error: identifiers cannot contain emoji: `ABig👩👩👧👧Family`
22+
--> $DIR/emoji-identifiers.rs:1:8
23+
|
24+
LL | struct ABig👩👩👧👧Family;
25+
| ^^^^^^^^^^^^^^^^^^
26+
27+
error: identifiers cannot contain emoji: `👀`
28+
--> $DIR/emoji-identifiers.rs:2:8
29+
|
30+
LL | struct 👀;
31+
| ^^
32+
LL | impl 👀 {
33+
| ^^
34+
LL | fn full_of_✨() -> 👀 {
35+
| ^^
36+
LL | 👀
37+
| ^^
38+
...
39+
LL | fn i_like_to_😅_a_lot() -> 👀 {
40+
| ^^
41+
LL | 👀::full_of✨()
42+
| ^^
43+
44+
error: identifiers cannot contain emoji: `full_of_✨`
45+
--> $DIR/emoji-identifiers.rs:4:8
46+
|
47+
LL | fn full_of_✨() -> 👀 {
48+
| ^^^^^^^^^^
49+
50+
error: identifiers cannot contain emoji: `i_like_to_😅_a_lot`
51+
--> $DIR/emoji-identifiers.rs:8:4
52+
|
53+
LL | fn i_like_to_😅_a_lot() -> 👀 {
54+
| ^^^^^^^^^^^^^^^^^^
55+
56+
error: identifiers cannot contain emoji: `full_of✨`
57+
--> $DIR/emoji-identifiers.rs:9:8
58+
|
59+
LL | 👀::full_of✨()
60+
| ^^^^^^^^^
61+
62+
error: identifiers cannot contain emoji: `i_like_to_😄_a_lot`
63+
--> $DIR/emoji-identifiers.rs:13:13
64+
|
65+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
66+
| ^^^^^^^^^^^^^^^^^^
67+
68+
error[E0599]: no function or associated item named `full_of✨` found for struct `👀` in the current scope
69+
--> $DIR/emoji-identifiers.rs:9:8
70+
|
71+
LL | struct 👀;
72+
| ---------- function or associated item `full_of✨` not found for this
73+
...
74+
LL | 👀::full_of✨()
75+
| ^^^^^^^^^
76+
| |
77+
| function or associated item not found in `👀`
78+
| help: there is an associated function with a similar name: `full_of_✨`
79+
80+
error: aborting due to 9 previous errors
81+
82+
Some errors have detailed explanations: E0425, E0599.
83+
For more information about an error, try `rustc --explain E0425`.

0 commit comments

Comments
 (0)