Skip to content

Commit 57c57a5

Browse files
authored
Rollup merge of rust-lang#114193 - crlf0710:lexer_unicode15, r=Manishearth
Update lexer emoji diagnostics to Unicode 15.0. This replaces the `unic-emoji-char` dependency tree (which hasn't been updated for a while) with the `unicode-properties` crate, which contains Unicode 15.0 data. It improves diagnostics for emoji characters added in recent years (see the tests). cc rust-lang#101840, cc @Manishearth
2 parents 7c6942a + bca79a2 commit 57c57a5

File tree

6 files changed

+36
-76
lines changed

6 files changed

+36
-76
lines changed

Diff for: Cargo.lock

+7-42
Original file line numberDiff line numberDiff line change
@@ -3786,7 +3786,7 @@ name = "rustc_lexer"
37863786
version = "0.1.0"
37873787
dependencies = [
37883788
"expect-test",
3789-
"unic-emoji-char",
3789+
"unicode-properties",
37903790
"unicode-xid",
37913791
]
37923792

@@ -5446,38 +5446,6 @@ dependencies = [
54465446
"tempfile",
54475447
]
54485448

5449-
[[package]]
5450-
name = "unic-char-property"
5451-
version = "0.9.0"
5452-
source = "registry+https://github.com/rust-lang/crates.io-index"
5453-
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
5454-
dependencies = [
5455-
"unic-char-range",
5456-
]
5457-
5458-
[[package]]
5459-
name = "unic-char-range"
5460-
version = "0.9.0"
5461-
source = "registry+https://github.com/rust-lang/crates.io-index"
5462-
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
5463-
5464-
[[package]]
5465-
name = "unic-common"
5466-
version = "0.9.0"
5467-
source = "registry+https://github.com/rust-lang/crates.io-index"
5468-
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
5469-
5470-
[[package]]
5471-
name = "unic-emoji-char"
5472-
version = "0.9.0"
5473-
source = "registry+https://github.com/rust-lang/crates.io-index"
5474-
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
5475-
dependencies = [
5476-
"unic-char-property",
5477-
"unic-char-range",
5478-
"unic-ucd-version",
5479-
]
5480-
54815449
[[package]]
54825450
name = "unic-langid"
54835451
version = "0.9.1"
@@ -5521,15 +5489,6 @@ dependencies = [
55215489
"unic-langid-impl",
55225490
]
55235491

5524-
[[package]]
5525-
name = "unic-ucd-version"
5526-
version = "0.9.0"
5527-
source = "registry+https://github.com/rust-lang/crates.io-index"
5528-
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
5529-
dependencies = [
5530-
"unic-common",
5531-
]
5532-
55335492
[[package]]
55345493
name = "unicase"
55355494
version = "2.6.0"
@@ -5567,6 +5526,12 @@ dependencies = [
55675526
"tinyvec",
55685527
]
55695528

5529+
[[package]]
5530+
name = "unicode-properties"
5531+
version = "0.1.0"
5532+
source = "registry+https://github.com/rust-lang/crates.io-index"
5533+
checksum = "c7f91c8b21fbbaa18853c3d0801c78f4fc94cdb976699bb03e832e75f7fd22f0"
5534+
55705535
[[package]]
55715536
name = "unicode-script"
55725537
version = "0.5.5"

Diff for: compiler/rustc_lexer/Cargo.toml

+5-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,11 @@ Rust lexer used by rustc. No stability guarantees are provided.
1616
# Note that this crate purposefully does not depend on other rustc crates
1717
[dependencies]
1818
unicode-xid = "0.2.0"
19-
unic-emoji-char = "0.9.0"
19+
20+
[dependencies.unicode-properties]
21+
version = "0.1.0"
22+
default-features = false
23+
features = ["emoji"]
2024

2125
[dev-dependencies]
2226
expect-test = "1.4.0"

Diff for: compiler/rustc_lexer/src/lib.rs

+4-7
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ pub use crate::cursor::Cursor;
3434
use self::LiteralKind::*;
3535
use self::TokenKind::*;
3636
use crate::cursor::EOF_CHAR;
37+
use unicode_properties::UnicodeEmoji;
3738

3839
/// Parsed token.
3940
/// It doesn't contain information about data that has been parsed,
@@ -428,9 +429,7 @@ impl Cursor<'_> {
428429
Literal { kind, suffix_start }
429430
}
430431
// Identifier starting with an emoji. Only lexed for graceful error recovery.
431-
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
432-
self.fake_ident_or_unknown_prefix()
433-
}
432+
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
434433
_ => Unknown,
435434
};
436435
let res = Token::new(token_kind, self.pos_within_token());
@@ -514,9 +513,7 @@ impl Cursor<'_> {
514513
// we see a prefix here, it is definitely an unknown prefix.
515514
match self.first() {
516515
'#' | '"' | '\'' => UnknownPrefix,
517-
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
518-
self.fake_ident_or_unknown_prefix()
519-
}
516+
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
520517
_ => Ident,
521518
}
522519
}
@@ -525,7 +522,7 @@ impl Cursor<'_> {
525522
// Start is already eaten, eat the rest of identifier.
526523
self.eat_while(|c| {
527524
unicode_xid::UnicodeXID::is_xid_continue(c)
528-
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
525+
|| (!c.is_ascii() && c.is_emoji_char())
529526
|| c == '\u{200d}'
530527
});
531528
// Known prefixes must have been handled earlier. So if

Diff for: src/tools/tidy/src/deps.rs

+1-5
Original file line numberDiff line numberDiff line change
@@ -270,18 +270,14 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
270270
"twox-hash",
271271
"type-map",
272272
"typenum",
273-
"unic-char-property",
274-
"unic-char-range",
275-
"unic-common",
276-
"unic-emoji-char",
277273
"unic-langid",
278274
"unic-langid-impl",
279275
"unic-langid-macros",
280276
"unic-langid-macros-impl",
281-
"unic-ucd-version",
282277
"unicase",
283278
"unicode-ident",
284279
"unicode-normalization",
280+
"unicode-properties",
285281
"unicode-script",
286282
"unicode-security",
287283
"unicode-width",

Diff for: tests/ui/lexer/lex-emoji-identifiers.rs

+2-4
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
fn invalid_emoji_usages() {
22
let arrow↔️ = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
3-
// FIXME
4-
let planet🪐 = "basic emoji"; //~ ERROR: unknown start of token
5-
// FIXME
6-
let wireless🛜 = "basic emoji"; //~ ERROR: unknown start of token
3+
let planet🪐 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
4+
let wireless🛜 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
75
// FIXME
86
let key1️⃣ = "keycap sequence"; //~ ERROR: unknown start of token
97
//~^ WARN: identifier contains uncommon Unicode codepoints

Diff for: tests/ui/lexer/lex-emoji-identifiers.stderr

+17-17
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,5 @@
1-
error: unknown start of token: \u{1fa90}
2-
--> $DIR/lex-emoji-identifiers.rs:4:15
3-
|
4-
LL | let planet🪐 = "basic emoji";
5-
| ^^
6-
7-
error: unknown start of token: \u{1f6dc}
8-
--> $DIR/lex-emoji-identifiers.rs:6:17
9-
|
10-
LL | let wireless🛜 = "basic emoji";
11-
| ^^
12-
131
error: unknown start of token: \u{20e3}
14-
--> $DIR/lex-emoji-identifiers.rs:8:14
2+
--> $DIR/lex-emoji-identifiers.rs:6:14
153
|
164
LL | let key1️⃣ = "keycap sequence";
175
| ^
@@ -22,26 +10,38 @@ error: identifiers cannot contain emoji: `arrow↔️`
2210
LL | let arrow↔️ = "basic emoji";
2311
| ^^^^^^
2412

13+
error: identifiers cannot contain emoji: `planet🪐`
14+
--> $DIR/lex-emoji-identifiers.rs:3:9
15+
|
16+
LL | let planet🪐 = "basic emoji";
17+
| ^^^^^^^^
18+
19+
error: identifiers cannot contain emoji: `wireless🛜`
20+
--> $DIR/lex-emoji-identifiers.rs:4:9
21+
|
22+
LL | let wireless🛜 = "basic emoji";
23+
| ^^^^^^^^^^
24+
2525
error: identifiers cannot contain emoji: `flag🇺🇳`
26-
--> $DIR/lex-emoji-identifiers.rs:10:9
26+
--> $DIR/lex-emoji-identifiers.rs:8:9
2727
|
2828
LL | let flag🇺🇳 = "flag sequence";
2929
| ^^^^^^
3030

3131
error: identifiers cannot contain emoji: `wales🏴`
32-
--> $DIR/lex-emoji-identifiers.rs:11:9
32+
--> $DIR/lex-emoji-identifiers.rs:9:9
3333
|
3434
LL | let wales🏴 = "tag sequence";
3535
| ^^^^^^^
3636

3737
error: identifiers cannot contain emoji: `folded🙏🏿`
38-
--> $DIR/lex-emoji-identifiers.rs:12:9
38+
--> $DIR/lex-emoji-identifiers.rs:10:9
3939
|
4040
LL | let folded🙏🏿 = "modifier sequence";
4141
| ^^^^^^^^^^
4242

4343
warning: identifier contains uncommon Unicode codepoints
44-
--> $DIR/lex-emoji-identifiers.rs:8:9
44+
--> $DIR/lex-emoji-identifiers.rs:6:9
4545
|
4646
LL | let key1️⃣ = "keycap sequence";
4747
| ^^^^

0 commit comments

Comments
 (0)