Skip to content

Commit 05973f4

Browse files
committed
feat(lexer): Add frontmatter stripping
1 parent b5d2a6c commit 05973f4

File tree

2 files changed

+165
-0
lines changed

2 files changed

+165
-0
lines changed

Diff for: compiler/rustc_lexer/src/lib.rs

+52
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,58 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
276276
None
277277
}
278278

279+
/// Frontmatter is a special attribute type reserved for use by external tools
280+
///
281+
/// This must be called after [`strip_shebang`]
282+
pub fn strip_frontmatter(input: &str) -> Option<usize> {
283+
// Whitespace may precede a frontmatter but must end with a newline
284+
let rest = input.trim_start_matches(is_whitespace);
285+
if rest.len() != input.len() {
286+
let trimmed_len = input.len() - rest.len();
287+
let last_trimmed_index = trimmed_len - 1;
288+
if input.as_bytes()[last_trimmed_index] != b'\n' {
289+
// either not a frontmatter or invalid opening
290+
return None;
291+
}
292+
}
293+
294+
// Opens with a line that starts with 3 or more `-` followed by an optional identifier
295+
const FENCE_CHAR: char = '-';
296+
let fence_length =
297+
rest.char_indices().find_map(|(i, c)| (c != FENCE_CHAR).then_some(i)).unwrap_or(rest.len());
298+
if fence_length < 3 {
299+
// either not a frontmatter or invalid frontmatter opening
300+
return None;
301+
}
302+
let (fence_pattern, rest) = rest.split_at(fence_length);
303+
let Some(info_end_index) = rest.find('\n') else {
304+
// frontmatter close is required
305+
return None;
306+
};
307+
let (info, rest) = rest.split_at(info_end_index);
308+
let info = info.trim_matches(is_whitespace);
309+
if !info.is_empty() && !is_ident(info) {
310+
// optional infostring is not an identifier
311+
return None;
312+
}
313+
314+
// Ends with a line that starts with a matching number of `-` only followed by whitespace
315+
let nl_fence_pattern = format!("\n{fence_pattern}");
316+
let Some(frontmatter_nl) = rest.find(&nl_fence_pattern) else {
317+
// frontmatter close is required
318+
return None;
319+
};
320+
let rest = &rest[frontmatter_nl + nl_fence_pattern.len()..];
321+
322+
let (after_closing_fence, rest) = rest.split_once("\n").unwrap_or((rest, ""));
323+
let after_closing_fence = after_closing_fence.trim_matches(is_whitespace);
324+
if !after_closing_fence.is_empty() {
325+
// extra characters beyond the original fence pattern, even if they are extra `-`
326+
return None;
327+
}
328+
Some(input.len() - rest.len())
329+
}
330+
279331
/// Validates a raw string literal. Used for getting more information about a
280332
/// problem with a `RawStr`/`RawByteStr` with a `None` field.
281333
#[inline]

Diff for: compiler/rustc_lexer/src/tests.rs

+113
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,119 @@ fn test_valid_shebang() {
124124
assert_eq!(strip_shebang(input), None);
125125
}
126126

127+
#[test]
fn test_frontmatter() {
    // Inputs are written with explicit `\n` escapes so that significant leading
    // whitespace inside the literals stays visible.

    // minimal frontmatter: the closing fence directly follows the opening one
    let input = "---\n---\n";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // frontmatter with content
    let input = "---\npackage.edition = '2024'\n\n[dependencies]\nregex = '1'\n---\n";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // allow ident infostring
    let input = "---cargo\n\n---\n";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // disallow non-ident infostring
    let input = "---cargo hello\n\n---\n";
    assert_eq!(strip_frontmatter(input), None);

    // ignore extra whitespace: blank lines before the opening fence and trailing spaces
    // after both fences
    let input = "\n\n\n---\u{0020}\n\n---\u{0020}\n";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // disallow indented opening/close
    let input = " ---\n---\n";
    assert_eq!(strip_frontmatter(input), None);

    // ignore inner dashes not at line start
    let input = "---\n\n  ---\n  ---\n\n---\n";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // ignore fewer dashes inside
    let input = "-----\n\n---\n---\n\n-----\n";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // disallow more dashes inside
    let input = "---\n\n-----\n-----\n\n---\n";
    assert_eq!(strip_frontmatter(input), None);

    // disallow mismatch close
    let input = "----\n\n---\n";
    assert_eq!(strip_frontmatter(input), None);

    // disallow unclosed
    let input = "---\n\n";
    assert_eq!(strip_frontmatter(input), None);

    // disallow short open/close
    let input = "--\n\n--\n";
    assert_eq!(strip_frontmatter(input), None);

    // disallow content before
    let input = "#![feature(frontmatter)]\n\n---\n---\n";
    assert_eq!(strip_frontmatter(input), None);

    // disallow trailing text after the closing fence; the input must open with a valid
    // fence, otherwise it is rejected before the closing line is ever inspected
    let input = "---\n---cargo\n";
    assert_eq!(strip_frontmatter(input), None);
}
239+
127240
fn check_lexing(src: &str, expect: Expect) {
128241
let actual: String = tokenize(src).map(|token| format!("{:?}\n", token)).collect();
129242
expect.assert_eq(&actual)

0 commit comments

Comments
 (0)