Skip to content

Commit 28499d0

Browse files
committed
WIP: parse frontmatter
1 parent 65fa0ab commit 28499d0

File tree

9 files changed

+245
-11
lines changed

9 files changed

+245
-11
lines changed

Diff for: compiler/rustc_lexer/src/cursor.rs

+8-1
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,19 @@ pub struct Cursor<'a> {
88
len_remaining: usize,
99
/// Iterator over chars. Slightly faster than a &str.
1010
chars: Chars<'a>,
11+
pub(crate) frontmatter_allowed: bool,
1112
#[cfg(debug_assertions)]
1213
prev: char,
1314
}
1415

1516
pub(crate) const EOF_CHAR: char = '\0';
1617

1718
impl<'a> Cursor<'a> {
18-
pub fn new(input: &'a str) -> Cursor<'a> {
19+
pub fn new(input: &'a str, frontmatter_allowed: bool) -> Cursor<'a> {
1920
Cursor {
2021
len_remaining: input.len(),
2122
chars: input.chars(),
23+
frontmatter_allowed,
2224
#[cfg(debug_assertions)]
2325
prev: EOF_CHAR,
2426
}
@@ -95,6 +97,11 @@ impl<'a> Cursor<'a> {
9597
Some(c)
9698
}
9799

100+
/// Moves to a substring by a number of bytes.
101+
pub(crate) fn bump_bytes(&mut self, n: usize) {
102+
self.chars = self.as_str()[n..].chars();
103+
}
104+
98105
/// Eats symbols while predicate returns true or until the end of file is reached.
99106
pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
100107
// It was tried making optimized version of this for eg. line comments, but

Diff for: compiler/rustc_lexer/src/lib.rs

+79-2
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ pub enum TokenKind {
6868
/// Any whitespace character sequence.
6969
Whitespace,
7070

71+
Frontmatter {
72+
has_invalid_preceding_whitespace: bool,
73+
invalid_infostring: bool,
74+
},
75+
7176
/// An identifier or keyword, e.g. `ident` or `continue`.
7277
Ident,
7378

@@ -280,7 +285,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
280285
#[inline]
281286
pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
282287
debug_assert!(!input.is_empty());
283-
let mut cursor = Cursor::new(input);
288+
let mut cursor = Cursor::new(input, false);
284289
// Move past the leading `r` or `br`.
285290
for _ in 0..prefix_len {
286291
cursor.bump().unwrap();
@@ -290,7 +295,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
290295

291296
/// Creates an iterator that produces tokens from the input string.
292297
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> {
293-
let mut cursor = Cursor::new(input);
298+
let mut cursor = Cursor::new(input, false);
294299
std::iter::from_fn(move || {
295300
let token = cursor.advance_token();
296301
if token.kind != TokenKind::Eof { Some(token) } else { None }
@@ -361,7 +366,27 @@ impl Cursor<'_> {
361366
Some(c) => c,
362367
None => return Token::new(TokenKind::Eof, 0),
363368
};
369+
364370
let token_kind = match first_char {
371+
c if self.frontmatter_allowed && is_whitespace(c) => {
372+
let mut last = first_char;
373+
while is_whitespace(self.first()) {
374+
let Some(c) = self.bump() else { break; };
375+
last = c;
376+
}
377+
// invalid frontmatter opening as whitespace preceding it isn't newline.
378+
// combine the whitespace and the frontmatter to a single token as we shall
379+
// error later.
380+
if last != '\n' && self.as_str().starts_with("---") {
381+
self.frontmatter(true)
382+
} else {
383+
Whitespace
384+
}
385+
}
386+
_ if self.frontmatter_allowed && self.as_str().starts_with("---") => {
387+
// happy path
388+
self.frontmatter(false)
389+
}
365390
// Slash, comment or block comment.
366391
'/' => match self.first() {
367392
'/' => self.line_comment(),
@@ -464,11 +489,63 @@ impl Cursor<'_> {
464489
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
465490
_ => Unknown,
466491
};
492+
if self.frontmatter_allowed {
493+
self.frontmatter_allowed = matches!(token_kind, Whitespace);
494+
}
467495
let res = Token::new(token_kind, self.pos_within_token());
468496
self.reset_pos_within_token();
469497
res
470498
}
471499

500+
fn frontmatter(&mut self, has_invalid_preceding_whitespace: bool) -> TokenKind {
501+
let pos = self.pos_within_token();
502+
self.eat_while(|c| c == '-');
503+
let length_opening = self.pos_within_token() - pos;
504+
505+
// must be ensured by the caller
506+
debug_assert!(length_opening >= 3);
507+
508+
let s = self.as_str();
509+
self.eat_identifier();
510+
self.eat_while(|ch| ch != '\n' && is_whitespace(ch));
511+
let invalid_infostring = self.first() != '\n';
512+
513+
if let Some(closing) = s.find(&"-".repeat(length_opening as usize)) {
514+
self.bump_bytes(closing);
515+
// in case like
516+
// ---cargo
517+
// --- blahblah
518+
// or
519+
// ---cargo
520+
// ----
521+
// combine those stuff into this frontmatter token such that it gets detected later.
522+
self.eat_until(b'\n');
523+
} else {
524+
// recovery strategy:
525+
// (1) a closing statement with precending whitespace/newline but not having enough characters
526+
// in this case we eat until there and report a mismatch
527+
let mut potential_closing = None;
528+
let mut rest = s;
529+
while let Some(closing) = rest.find("---") {
530+
let preceding_chars_start = rest[..closing].rfind("\n").map_or(0, |i| i + 1);
531+
if rest[preceding_chars_start..closing].chars().all(is_whitespace) {
532+
// candidate found
533+
potential_closing = Some(closing);
534+
} else {
535+
rest = &rest[closing + 3..];
536+
}
537+
}
538+
if let Some(potential_closing) = potential_closing {
539+
self.bump_bytes(potential_closing);
540+
} else {
541+
// eat everything.
542+
self.eat_while(|_| true);
543+
}
544+
};
545+
546+
Frontmatter { has_invalid_preceding_whitespace, invalid_infostring }
547+
}
548+
472549
fn line_comment(&mut self) -> TokenKind {
473550
debug_assert!(self.prev() == '/' && self.first() == '/');
474551
self.bump();

Diff for: compiler/rustc_lexer/src/tests.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use super::*;
44

55
fn check_raw_str(s: &str, expected: Result<u8, RawStrError>) {
66
let s = &format!("r{}", s);
7-
let mut cursor = Cursor::new(s);
7+
let mut cursor = Cursor::new(s, false);
88
cursor.bump();
99
let res = cursor.raw_double_quoted_string(0);
1010
assert_eq!(res, expected);

Diff for: compiler/rustc_parse/messages.ftl

+13
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,19 @@ parse_forgot_paren = perhaps you forgot parentheses?
297297
parse_found_expr_would_be_stmt = expected expression, found `{$token}`
298298
.label = expected expression
299299
300+
parse_frontmatter_extra_characters_after_close = extra characters after frontmatter close are not allowed
301+
parse_frontmatter_invalid_close_preceding_whitespace = invalid preceding whitespace for frontmatter close
302+
.label = frontmatter close should not be preceded by whitespace
303+
parse_frontmatter_invalid_infostring = invalid infostring for frontmatter
304+
.note = frontmatter infostrings must be an identifier immediately following the opening
305+
parse_frontmatter_invalid_opening_preceding_whitespace = invalid preceding whitespace for frontmatter opening
306+
.label = frontmatter opening should not be preceded by whitespace
307+
parse_frontmatter_unclosed = unclosed frontmatter
308+
.label = frontmatter opening here was not closed
309+
parse_frontmatter_length_mismatch = frontmatter close does not match the opening
310+
.label_opening = the opening here has {$len_opening} dashes..
311+
.label_close = ..while the close has {$len_close} dashes
312+
300313
parse_function_body_equals_expr = function body cannot be `= expression;`
301314
.suggestion = surround the expression with `{"{"}` and `{"}"}` instead of `=` and `;`
302315

Diff for: compiler/rustc_parse/src/errors.rs

+56
Original file line numberDiff line numberDiff line change
@@ -730,6 +730,62 @@ pub(crate) struct FoundExprWouldBeStmt {
730730
pub suggestion: ExprParenthesesNeeded,
731731
}
732732

733+
/// Error for non-whitespace text following the closing dashes of a frontmatter
/// block (message `parse_frontmatter_extra_characters_after_close`).
#[derive(Diagnostic)]
734+
#[diag(parse_frontmatter_extra_characters_after_close)]
735+
pub(crate) struct FrontmatterExtraCharactersAfterClose {
736+
/// Primary span of the error.
#[primary_span]
737+
pub span: Span,
738+
}
739+
740+
#[derive(Diagnostic)]
741+
#[diag(parse_frontmatter_invalid_infostring)]
742+
pub(crate) struct FrontmatterInvalidInfostring {
743+
#[primary_span]
744+
pub span: Span,
745+
}
746+
747+
/// Error for a frontmatter opening that is preceded by whitespace on its line
/// (message `parse_frontmatter_invalid_opening_preceding_whitespace`).
#[derive(Diagnostic)]
748+
#[diag(parse_frontmatter_invalid_opening_preceding_whitespace)]
749+
pub(crate) struct FrontmatterInvalidOpeningPrecedingWhitespace {
750+
/// Primary span of the error.
#[primary_span]
751+
pub span: Span,
752+
/// Span that receives the `.label` text of the message.
#[label]
753+
pub label_span: Span,
754+
}
755+
756+
757+
/// Error for a frontmatter block that has no closing line
/// (message `parse_frontmatter_unclosed`).
#[derive(Diagnostic)]
758+
#[diag(parse_frontmatter_unclosed)]
759+
pub(crate) struct FrontmatterUnclosed {
760+
/// Primary span of the error.
#[primary_span]
761+
pub span: Span,
762+
/// Span that receives the `.label` text of the message.
#[label]
763+
pub label_span: Span,
764+
}
765+
766+
/// Error for a frontmatter close that is preceded by whitespace on its line
/// (message `parse_frontmatter_invalid_close_preceding_whitespace`).
#[derive(Diagnostic)]
767+
#[diag(parse_frontmatter_invalid_close_preceding_whitespace)]
768+
pub(crate) struct FrontmatterInvalidClosingPrecedingWhitespace {
769+
/// Primary span of the error.
#[primary_span]
770+
pub span: Span,
771+
/// Span that receives the `.label` text of the message.
#[label]
772+
pub label_span: Span,
773+
}
774+
775+
/// Error for a frontmatter close whose number of dashes differs from the opening
/// (message `parse_frontmatter_length_mismatch`).
#[derive(Diagnostic)]
776+
#[diag(parse_frontmatter_length_mismatch)]
777+
pub(crate) struct FrontmatterLengthMismatch {
778+
/// Primary span of the error.
#[primary_span]
779+
pub span: Span,
780+
/// Span labelled with the `label_opening` text.
#[label(parse_label_opening)]
781+
pub opening: Span,
782+
/// Span labelled with the `label_close` text.
#[label(parse_label_close)]
783+
pub close: Span,
784+
/// Dash count interpolated into the message as `{$len_opening}`.
pub len_opening: usize,
785+
/// Dash count interpolated into the message as `{$len_close}`.
pub len_close: usize,
786+
}
787+
788+
733789
#[derive(Diagnostic)]
734790
#[diag(parse_leading_plus_not_supported)]
735791
pub(crate) struct LeadingPlusNotSupported {

Diff for: compiler/rustc_parse/src/lexer/mod.rs

+69-6
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use rustc_ast::tokenstream::TokenStream;
66
use rustc_ast::util::unicode::contains_text_flow_control_chars;
77
use rustc_errors::codes::*;
88
use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
9-
use rustc_lexer::{Base, Cursor, DocStyle, LiteralKind, RawStrError};
9+
use rustc_lexer::{is_whitespace, Base, Cursor, DocStyle, LiteralKind, RawStrError};
1010
use rustc_literal_escaper::{EscapeError, Mode, unescape_mixed, unescape_unicode};
1111
use rustc_session::lint::BuiltinLintDiag;
1212
use rustc_session::lint::builtin::{
@@ -55,7 +55,7 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
5555
start_pos = start_pos + BytePos::from_usize(shebang_len);
5656
}
5757

58-
let cursor = Cursor::new(src);
58+
let cursor = Cursor::new(src, true);
5959
let mut lexer = Lexer {
6060
psess,
6161
start_pos,
@@ -192,6 +192,11 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
192192
let content = self.str_from_to(content_start, content_end);
193193
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
194194
}
195+
rustc_lexer::TokenKind::Frontmatter { has_invalid_preceding_whitespace, invalid_infostring } => {
196+
self.validate_frontmatter(start, has_invalid_preceding_whitespace, invalid_infostring);
197+
preceded_by_whitespace = true;
198+
continue;
199+
}
195200
rustc_lexer::TokenKind::Whitespace => {
196201
preceded_by_whitespace = true;
197202
continue;
@@ -255,7 +260,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
255260
// was consumed.
256261
let lit_start = start + BytePos(prefix_len);
257262
self.pos = lit_start;
258-
self.cursor = Cursor::new(&str_before[prefix_len as usize..]);
263+
self.cursor = Cursor::new(&str_before[prefix_len as usize..], false);
259264

260265
self.report_unknown_prefix(start);
261266
let prefix_span = self.mk_sp(start, lit_start);
@@ -361,7 +366,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
361366
// Reset the state so we just lex the `'r`.
362367
let lt_start = start + BytePos(2);
363368
self.pos = lt_start;
364-
self.cursor = Cursor::new(&str_before[2 as usize..]);
369+
self.cursor = Cursor::new(&str_before[2 as usize..], false);
365370

366371
let lifetime_name = self.str_from(start);
367372
let ident = Symbol::intern(lifetime_name);
@@ -474,6 +479,64 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
474479
}
475480
}
476481

482+
fn validate_frontmatter(&self, start: BytePos, has_invalid_preceding_whitespace: bool, invalid_infostring: bool) {
483+
let s = self.str_from(start);
484+
let real_start = s.find("---").unwrap();
485+
let frontmatter_opening_pos = BytePos(real_start as u32) + start;
486+
let s_new = &s[real_start..];
487+
let within = s_new.trim_start_matches('-');
488+
let len_opening = s_new.len() - within.len();
489+
490+
let frontmatter_opening_end_pos = frontmatter_opening_pos + BytePos(len_opening as u32);
491+
if has_invalid_preceding_whitespace {
492+
let line_start = BytePos(s[..real_start].rfind("\n").map_or(0, |i| i as u32 + 1)) + start;
493+
let span = self.mk_sp(line_start, frontmatter_opening_end_pos);
494+
let label_span = self.mk_sp(line_start, frontmatter_opening_pos);
495+
self.dcx().emit_err(errors::FrontmatterInvalidOpeningPrecedingWhitespace { span, label_span });
496+
}
497+
498+
if invalid_infostring {
499+
let line_end = s[real_start..].find('\n').unwrap_or(s[real_start..].len());
500+
let span = self.mk_sp(frontmatter_opening_end_pos, frontmatter_opening_pos + BytePos(line_end as u32));
501+
self.dcx().emit_err(errors::FrontmatterInvalidInfostring { span });
502+
}
503+
504+
let last_line_start = within.rfind('\n').map_or(0, |i| i+1);
505+
let last_line = &within[last_line_start..];
506+
let last_line_trimmed = last_line.trim_start_matches(is_whitespace);
507+
let last_line_start_pos = frontmatter_opening_end_pos + BytePos(last_line_start as u32);
508+
509+
if !last_line_trimmed.starts_with("---") {
510+
let span = self.mk_sp(frontmatter_opening_pos, self.pos);
511+
let label_span = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
512+
self.dcx().emit_err(errors::FrontmatterUnclosed { span, label_span });
513+
return;
514+
}
515+
516+
if last_line_trimmed.len() != last_line.len() {
517+
let line_end = last_line_start_pos + BytePos(last_line.len() as u32);
518+
let span = self.mk_sp(last_line_start_pos, line_end);
519+
let whitespace_end = last_line_start_pos + BytePos((last_line.len() - last_line_trimmed.len()) as u32);
520+
let label_span = self.mk_sp(last_line_start_pos, whitespace_end);
521+
self.dcx().emit_err(errors::FrontmatterInvalidClosingPrecedingWhitespace { span, label_span });
522+
}
523+
524+
let rest = last_line.trim_start_matches('-');
525+
let len_close = last_line.len() - rest.len();
526+
if len_close != len_opening {
527+
let span = self.mk_sp(frontmatter_opening_pos, self.pos);
528+
let opening = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
529+
let last_line_close_pos = last_line_start_pos + BytePos(len_close as u32);
530+
let close = self.mk_sp(last_line_start_pos, last_line_close_pos);
531+
self.dcx().emit_err(errors::FrontmatterLengthMismatch { span, opening, close, len_opening, len_close });
532+
}
533+
534+
if !rest.trim_matches(is_whitespace).is_empty() {
535+
let span = self.mk_sp(last_line_start_pos, self.pos);
536+
self.dcx().emit_err(errors::FrontmatterExtraCharactersAfterClose { span });
537+
}
538+
}
539+
477540
fn cook_doc_comment(
478541
&self,
479542
content_start: BytePos,
@@ -838,7 +901,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
838901
let space_pos = start + BytePos(1);
839902
let space_span = self.mk_sp(space_pos, space_pos);
840903

841-
let mut cursor = Cursor::new(str_before);
904+
let mut cursor = Cursor::new(str_before, false);
842905

843906
let (is_string, span, unterminated) = match cursor.guarded_double_quoted_string() {
844907
Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
@@ -904,7 +967,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
904967
// For backwards compatibility, roll back to after just the first `#`
905968
// and return the `Pound` token.
906969
self.pos = start + BytePos(1);
907-
self.cursor = Cursor::new(&str_before[1..]);
970+
self.cursor = Cursor::new(&str_before[1..], false);
908971
token::Pound
909972
}
910973
}

Diff for: src/librustdoc/html/highlight.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -638,7 +638,7 @@ impl<'src> Classifier<'src> {
638638
/// Takes as argument the source code to HTML-ify, the rust edition to use and the source code
639639
/// file span which will be used later on by the `span_correspondence_map`.
640640
fn new(src: &'src str, file_span: Span, decoration_info: Option<&DecorationInfo>) -> Self {
641-
let tokens = PeekIter::new(TokenIter { src, cursor: Cursor::new(src) });
641+
let tokens = PeekIter::new(TokenIter { src, cursor: Cursor::new(src, true) });
642642
let decorations = decoration_info.map(Decorations::new);
643643
Classifier {
644644
tokens,
@@ -884,6 +884,7 @@ impl<'src> Classifier<'src> {
884884
| TokenKind::At
885885
| TokenKind::Tilde
886886
| TokenKind::Colon
887+
| TokenKind::Frontmatter { .. }
887888
| TokenKind::Unknown => return no_highlight(sink),
888889

889890
TokenKind::Question => Class::QuestionMark,

0 commit comments

Comments
 (0)