Commit 99f5c79
Shrink Token.
From 72 bytes to 12 bytes (on x86-64). There are two parts to this:

- Changing various source code offsets from 64-bit to 32-bit. This is not a problem because the rest of rustc also uses 32-bit source code offsets. It means `Token` is no longer `Copy`, but this causes no problems.
- Removing the `RawStrError` from `LiteralKind`. Raw string literal invalidity is now indicated by a `None` value within `RawStr`/`RawByteStr`, and the new `validate_raw_str` function can be used to re-lex an invalid raw string literal to get the `RawStrError`.

There is one very small change in behaviour. Previously, if a raw string literal matched both the `InvalidStarter` and `TooManyDelimiters` cases, the latter would override the former. This has now changed, because `raw_double_quoted_string` now uses `?` and so returns immediately upon detecting the `InvalidStarter` case. I think reporting the earlier-detected error is a slight improvement, and it explains the change in the `test_too_many_hashes` test.

The commit also removes a couple of comments that refer to #77629 and say that the sizes of these types don't affect performance. Those comments are wrong, though the performance effect is small.
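The 12-byte figure can be sanity-checked with `std::mem::size_of`. A minimal sketch, not part of the commit, assuming x86-64 and that the post-commit `rustc_lexer` crate is available as a dependency:

    use rustc_lexer::Token;
    use std::mem::size_of;

    fn main() {
        // After this commit `Token` is `{ kind: TokenKind, len: u32 }`; the
        // largest `TokenKind` variant, `Literal`, holds a small `LiteralKind`
        // plus a `u32` suffix_start, so the whole token is 12 bytes.
        assert_eq!(size_of::<Token>(), 12);
        // `u8` has no unused bit patterns, so `Option<u8>` costs one extra
        // discriminant byte. That is still far cheaper than the inline
        // `Option<RawStrError>` field it replaces.
        assert_eq!(size_of::<Option<u8>>(), 2);
    }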
1 parent e6b9fcc commit 99f5c79

9 files changed (+111 -103 lines)

compiler/rustc_ast/src/util/comments.rs (+6 -4)
@@ -194,7 +194,7 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
     }
 
     for token in rustc_lexer::tokenize(&text[pos..]) {
-        let token_text = &text[pos..pos + token.len];
+        let token_text = &text[pos..pos + token.len as usize];
         match token.kind {
             rustc_lexer::TokenKind::Whitespace => {
                 if let Some(mut idx) = token_text.find('\n') {
@@ -211,8 +211,10 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
             }
             rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
                 if doc_style.is_none() {
-                    let code_to_the_right =
-                        !matches!(text[pos + token.len..].chars().next(), Some('\r' | '\n'));
+                    let code_to_the_right = !matches!(
+                        text[pos + token.len as usize..].chars().next(),
+                        Some('\r' | '\n')
+                    );
                     let style = match (code_to_the_left, code_to_the_right) {
                         (_, true) => CommentStyle::Mixed,
                         (false, false) => CommentStyle::Isolated,
@@ -246,7 +248,7 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
                 code_to_the_left = true;
             }
         }
-        pos += token.len;
+        pos += token.len as usize;
     }
 
     comments

compiler/rustc_lexer/src/cursor.rs (+2 -2)
@@ -61,8 +61,8 @@ impl<'a> Cursor<'a> {
     }
 
     /// Returns amount of already consumed symbols.
-    pub(crate) fn len_consumed(&self) -> usize {
-        self.initial_len - self.chars.as_str().len()
+    pub(crate) fn len_consumed(&self) -> u32 {
+        (self.initial_len - self.chars.as_str().len()) as u32
     }
 
     /// Resets the number of bytes consumed to 0.

compiler/rustc_lexer/src/lib.rs (+47 -41)
@@ -38,18 +38,17 @@ use std::convert::TryFrom;
 #[derive(Debug)]
 pub struct Token {
     pub kind: TokenKind,
-    pub len: usize,
+    pub len: u32,
 }
 
 impl Token {
-    fn new(kind: TokenKind, len: usize) -> Token {
+    fn new(kind: TokenKind, len: u32) -> Token {
         Token { kind, len }
     }
 }
 
 /// Enum representing common lexeme types.
-// perf note: Changing all `usize` to `u32` doesn't change performance. See #77629
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum TokenKind {
     // Multi-char tokens:
     /// "// comment"
@@ -76,7 +75,7 @@ pub enum TokenKind {
     /// tokens.
     UnknownPrefix,
     /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details.
-    Literal { kind: LiteralKind, suffix_start: usize },
+    Literal { kind: LiteralKind, suffix_start: u32 },
     /// "'a"
     Lifetime { starts_with_number: bool },
@@ -160,26 +159,24 @@ pub enum LiteralKind {
     Str { terminated: bool },
     /// "b"abc"", "b"abc"
     ByteStr { terminated: bool },
-    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
-    RawStr { n_hashes: u8, err: Option<RawStrError> },
-    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
-    RawByteStr { n_hashes: u8, err: Option<RawStrError> },
+    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
+    /// an invalid literal.
+    RawStr { n_hashes: Option<u8> },
+    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
+    /// indicates an invalid literal.
+    RawByteStr { n_hashes: Option<u8> },
 }
 
-/// Error produced validating a raw string. Represents cases like:
-/// - `r##~"abcde"##`: `InvalidStarter`
-/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
-/// - Too many `#`s (>255): `TooManyDelimiters`
-// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum RawStrError {
-    /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
+    /// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
     InvalidStarter { bad_char: char },
-    /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
-    /// may have intended to terminate it.
-    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
+    /// The string was not terminated, e.g. `r###"abcde"##`.
+    /// `possible_terminator_offset` is the number of characters after `r` or
+    /// `br` where they may have intended to terminate it.
+    NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
     /// More than 255 `#`s exist.
-    TooManyDelimiters { found: usize },
+    TooManyDelimiters { found: u32 },
 }
 
 /// Base of numeric literal encoding according to its prefix.
@@ -227,6 +224,19 @@ pub fn first_token(input: &str) -> Token {
     Cursor::new(input).advance_token()
 }
 
+/// Validates a raw string literal. Used for getting more information about a
+/// problem with a `RawStr`/`RawByteStr` with a `None` field.
+#[inline]
+pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
+    debug_assert!(!input.is_empty());
+    let mut cursor = Cursor::new(input);
+    // Move past the leading `r` or `br`.
+    for _ in 0..prefix_len {
+        cursor.bump().unwrap();
+    }
+    cursor.raw_double_quoted_string(prefix_len).map(|_| ())
+}
+
 /// Creates an iterator that produces tokens from the input string.
 pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
     let mut cursor = Cursor::new(input);
@@ -316,12 +326,12 @@ impl Cursor<'_> {
             'r' => match (self.first(), self.second()) {
                 ('#', c1) if is_id_start(c1) => self.raw_ident(),
                 ('#', _) | ('"', _) => {
-                    let (n_hashes, err) = self.raw_double_quoted_string(1);
+                    let res = self.raw_double_quoted_string(1);
                     let suffix_start = self.len_consumed();
-                    if err.is_none() {
+                    if res.is_ok() {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawStr { n_hashes, err };
+                    let kind = RawStr { n_hashes: res.ok() };
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident_or_unknown_prefix(),
@@ -351,12 +361,12 @@ impl Cursor<'_> {
                 }
                 ('r', '"') | ('r', '#') => {
                     self.bump();
-                    let (n_hashes, err) = self.raw_double_quoted_string(2);
+                    let res = self.raw_double_quoted_string(2);
                     let suffix_start = self.len_consumed();
-                    if err.is_none() {
+                    if res.is_ok() {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawByteStr { n_hashes, err };
+                    let kind = RawByteStr { n_hashes: res.ok() };
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident_or_unknown_prefix(),
@@ -699,19 +709,18 @@ impl Cursor<'_> {
     }
 
     /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
-    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u8, Option<RawStrError>) {
+    fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
         // Wrap the actual function to handle the error with too many hashes.
         // This way, it eats the whole raw string.
-        let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
+        let n_hashes = self.raw_string_unvalidated(prefix_len)?;
         // Only up to 255 `#`s are allowed in raw strings
         match u8::try_from(n_hashes) {
-            Ok(num) => (num, err),
-            // We lie about the number of hashes here :P
-            Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
+            Ok(num) => Ok(num),
+            Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }),
         }
     }
 
-    fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
+    fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
         debug_assert!(self.prev() == 'r');
         let start_pos = self.len_consumed();
         let mut possible_terminator_offset = None;
@@ -730,7 +739,7 @@ impl Cursor<'_> {
             Some('"') => (),
             c => {
                 let c = c.unwrap_or(EOF_CHAR);
-                return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
+                return Err(RawStrError::InvalidStarter { bad_char: c });
             }
         }
 
@@ -740,14 +749,11 @@ impl Cursor<'_> {
             self.eat_while(|c| c != '"');
 
             if self.is_eof() {
-                return (
-                    n_start_hashes,
-                    Some(RawStrError::NoTerminator {
-                        expected: n_start_hashes,
-                        found: max_hashes,
-                        possible_terminator_offset,
-                    }),
-                );
+                return Err(RawStrError::NoTerminator {
+                    expected: n_start_hashes,
+                    found: max_hashes,
+                    possible_terminator_offset,
+                });
             }
 
             // Eat closing double quote.
@@ -765,7 +771,7 @@ impl Cursor<'_> {
             }
 
             if n_end_hashes == n_start_hashes {
-                return (n_start_hashes, None);
+                return Ok(n_start_hashes);
             } else if n_end_hashes > max_hashes {
                 // Keep track of possible terminators to give a hint about
                 // where there might be a missing terminator
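Taken together, the new API splits raw-string lexing into a cheap pass and an optional detailed pass. A minimal sketch of a consumer, not part of this commit (`explain_raw_str` is a hypothetical helper; it assumes the post-commit `rustc_lexer` API shown above):

    use rustc_lexer::{first_token, validate_raw_str, LiteralKind, TokenKind};

    fn explain_raw_str(input: &str) {
        // Cheap pass: `n_hashes` is now just an `Option<u8>`.
        let token = first_token(input);
        if let TokenKind::Literal { kind: LiteralKind::RawStr { n_hashes }, .. } = token.kind {
            match n_hashes {
                Some(n) => println!("valid raw string with {} hashes", n),
                // Detailed pass: re-lex only the invalid literal.
                // `prefix_len` is 1 for the `r` prefix (2 for `br`).
                None => match validate_raw_str(input, 1) {
                    Err(e) => println!("invalid raw string: {:?}", e),
                    Ok(()) => unreachable!("re-lexing an invalid literal must fail"),
                },
            }
        }
    }

Called on `r#"abc` (an unterminated raw string), the first pass yields `n_hashes: None` and the second pass recovers `NoTerminator { expected: 1, found: 0, .. }`.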

compiler/rustc_lexer/src/tests.rs (+22 -28)
@@ -2,42 +2,39 @@ use super::*;
 
 use expect_test::{expect, Expect};
 
-fn check_raw_str(s: &str, expected_hashes: u8, expected_err: Option<RawStrError>) {
+fn check_raw_str(s: &str, expected: Result<u8, RawStrError>) {
     let s = &format!("r{}", s);
     let mut cursor = Cursor::new(s);
     cursor.bump();
-    let (n_hashes, err) = cursor.raw_double_quoted_string(0);
-    assert_eq!(n_hashes, expected_hashes);
-    assert_eq!(err, expected_err);
+    let res = cursor.raw_double_quoted_string(0);
+    assert_eq!(res, expected);
 }
 
 #[test]
 fn test_naked_raw_str() {
-    check_raw_str(r#""abc""#, 0, None);
+    check_raw_str(r#""abc""#, Ok(0));
 }
 
 #[test]
 fn test_raw_no_start() {
-    check_raw_str(r##""abc"#"##, 0, None);
+    check_raw_str(r##""abc"#"##, Ok(0));
 }
 
 #[test]
 fn test_too_many_terminators() {
     // this error is handled in the parser later
-    check_raw_str(r###"#"abc"##"###, 1, None);
+    check_raw_str(r###"#"abc"##"###, Ok(1));
 }
 
 #[test]
 fn test_unterminated() {
     check_raw_str(
         r#"#"abc"#,
-        1,
-        Some(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }),
+        Err(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }),
     );
     check_raw_str(
         r###"##"abc"#"###,
-        2,
-        Some(RawStrError::NoTerminator {
+        Err(RawStrError::NoTerminator {
             expected: 2,
             found: 1,
             possible_terminator_offset: Some(7),
@@ -46,41 +43,38 @@ fn test_unterminated() {
     // We're looking for "# not just any #
     check_raw_str(
         r###"##"abc#"###,
-        2,
-        Some(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }),
+        Err(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }),
     )
 }
 
 #[test]
 fn test_invalid_start() {
-    check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' }));
+    check_raw_str(r##"#~"abc"#"##, Err(RawStrError::InvalidStarter { bad_char: '~' }));
 }
 
 #[test]
 fn test_unterminated_no_pound() {
     // https://github.com/rust-lang/rust/issues/70677
     check_raw_str(
         r#"""#,
-        0,
-        Some(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }),
+        Err(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }),
     );
 }
 
 #[test]
 fn test_too_many_hashes() {
     let max_count = u8::MAX;
-    let mut hashes: String = "#".repeat(max_count.into());
+    let hashes1 = "#".repeat(max_count as usize);
+    let hashes2 = "#".repeat(max_count as usize + 1);
+    let middle = "\"abc\"";
+    let s1 = [&hashes1, middle, &hashes1].join("");
+    let s2 = [&hashes2, middle, &hashes2].join("");
 
-    // Valid number of hashes (255 = 2^8 - 1 = u8::MAX), but invalid string.
-    check_raw_str(&hashes, max_count, Some(RawStrError::InvalidStarter { bad_char: '\u{0}' }));
+    // Valid number of hashes (255 = 2^8 - 1 = u8::MAX).
+    check_raw_str(&s1, Ok(255));
 
     // One more hash sign (256 = 2^8) becomes too many.
-    hashes.push('#');
-    check_raw_str(
-        &hashes,
-        0,
-        Some(RawStrError::TooManyDelimiters { found: usize::from(max_count) + 1 }),
-    );
+    check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
 }
 
 #[test]
@@ -251,7 +245,7 @@ fn raw_string() {
     check_lexing(
         "r###\"\"#a\\b\x00c\"\"###",
         expect![[r#"
-            Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 17 }, len: 17 }
+            Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 17 }, len: 17 }
         "#]],
     )
 }
@@ -295,9 +289,9 @@ br###"raw"###suffix
             Token { kind: Whitespace, len: 1 }
             Token { kind: Literal { kind: Int { base: Decimal, empty_int: false }, suffix_start: 1 }, len: 3 }
             Token { kind: Whitespace, len: 1 }
-            Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 12 }, len: 18 }
+            Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 12 }, len: 18 }
             Token { kind: Whitespace, len: 1 }
-            Token { kind: Literal { kind: RawByteStr { n_hashes: 3, err: None }, suffix_start: 13 }, len: 19 }
+            Token { kind: Literal { kind: RawByteStr { n_hashes: Some(3) }, suffix_start: 13 }, len: 19 }
             Token { kind: Whitespace, len: 1 }
         "#]],
     )
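The updated `test_too_many_hashes` reflects the behaviour change described in the commit message. A hedged sketch of the change itself, using the public API under the same assumptions as above:

    use rustc_lexer::{validate_raw_str, RawStrError};

    fn main() {
        // 256 `#`s and then end of input. This matches both error cases:
        // `InvalidStarter` (EOF where `"` was expected) and
        // `TooManyDelimiters` (more than 255 `#`s).
        let s = format!("r{}", "#".repeat(256));
        // Previously `TooManyDelimiters` won; now the `?` in
        // `raw_double_quoted_string` propagates the earlier-detected
        // `InvalidStarter` before the hash count is ever checked.
        assert_eq!(
            validate_raw_str(&s, 1),
            Err(RawStrError::InvalidStarter { bad_char: '\u{0}' })
        );
    }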
