Skip to content

Commit 00a693f

Browse files
committed
feat(lexer): Add frontmatter stripping
1 parent 197154b commit 00a693f

File tree

2 files changed

+169
-0
lines changed

2 files changed

+169
-0
lines changed

Diff for: compiler/rustc_lexer/src/lib.rs

+56
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,62 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
277277
None
278278
}
279279

280+
/// Frontmatter is a special attribute type reserved for use by external tools
281+
///
282+
/// This must be called after [`strip_shebang`]
283+
pub fn strip_frontmatter(input: &str) -> Option<usize> {
284+
// Whitespace may precede a frontmatter but must end with a newline
285+
let mut rest = input;
286+
while !rest.is_empty() {
287+
let without_spaces = rest.trim_start_matches(|c| is_whitespace(c) && c != '\n');
288+
let without_nl = without_spaces.trim_start_matches('\n');
289+
if without_nl.len() == rest.len() {
290+
// nothing trimmed
291+
break;
292+
} else if without_nl.len() == without_spaces.len() {
293+
// either not a frontmatter or invalid opening
294+
return None;
295+
}
296+
rest = without_nl;
297+
}
298+
299+
// Opens with a line that starts with 3+ `-` followed by an optional identifier
300+
const FENCE_CHAR: char = '-';
301+
let fence_end =
302+
rest.char_indices().find_map(|(i, c)| (c != FENCE_CHAR).then_some(i)).unwrap_or(rest.len());
303+
if fence_end < 3 {
304+
// either not a frontmatter or invalid frontmatter opening
305+
return None;
306+
}
307+
let (fence_pattern, rest) = rest.split_at(fence_end);
308+
let (info, rest) = rest.split_once("\n").unwrap_or((rest, ""));
309+
let info = info.trim_matches(is_whitespace);
310+
if !info.is_empty() && !is_ident(info) {
311+
// invalid infostring
312+
return None;
313+
}
314+
315+
// Ends with a line that starts with a matching number of `-` only followed by whitespace
316+
let rest = if let Some(rest) = rest.strip_prefix(fence_pattern) {
317+
rest
318+
} else {
319+
let nl_fence_pattern = format!("\n{fence_pattern}");
320+
let Some(frontmatter_nl) = rest.find(&nl_fence_pattern) else {
321+
// frontmatter close is required
322+
return None;
323+
};
324+
&rest[frontmatter_nl + nl_fence_pattern.len()..]
325+
};
326+
327+
let (line, rest) = rest.split_once("\n").unwrap_or((rest, ""));
328+
let line = line.trim_matches(is_whitespace);
329+
if !line.is_empty() {
330+
// invalid close, even if there are extra `-`s
331+
return None;
332+
}
333+
Some(input.len() - rest.len())
334+
}
335+
280336
/// Validates a raw string literal. Used for getting more information about a
281337
/// problem with a `RawStr`/`RawByteStr` with a `None` field.
282338
#[inline]

Diff for: compiler/rustc_lexer/src/tests.rs

+113
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,119 @@ fn test_valid_shebang() {
124124
assert_eq!(strip_shebang(input), None);
125125
}
126126

127+
/// Exercises [`strip_frontmatter`] on accepted and rejected inputs.
///
/// Accepted inputs must report the whole input as frontmatter; rejected inputs must
/// yield `None`.
#[test]
fn test_frontmatter() {
    // empty frontmatter
    let input = "---
---
";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // frontmatter with content, including blank lines
    let input = "---
package.edition = '2024'

[dependencies]
regex = '1'
---
";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // allow ident infostring
    let input = "---cargo

---
";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // disallow non-ident infostring
    let input = "---cargo hello

---
";
    assert_eq!(strip_frontmatter(input), None);

    // ignore extra whitespace
    let input = "


---\u{0020}

---\u{0020}
";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // disallow indented opening/close
    let input = " ---
 ---
";
    assert_eq!(strip_frontmatter(input), None);

    // ignore inner dashes not at line start
    let input = "---

  ---
  ---

---
";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // ignore fewer dashes inside
    let input = "-----

---
---

-----
";
    assert_eq!(strip_frontmatter(input), Some(input.len()));

    // disallow more dashes inside
    let input = "---

-----
-----

---
";
    assert_eq!(strip_frontmatter(input), None);

    // disallow mismatch close
    let input = "----

---
";
    assert_eq!(strip_frontmatter(input), None);

    // disallow unclosed
    let input = "---

";
    assert_eq!(strip_frontmatter(input), None);

    // disallow short open/close
    let input = "--

--
";
    assert_eq!(strip_frontmatter(input), None);

    // disallow content before
    let input = "#![feature(frontmatter)]

---
---
";
    assert_eq!(strip_frontmatter(input), None);

    // disallow trailing text on the closing line; the input must start with a valid
    // opening, otherwise the rejection comes from the leading content instead
    let input = "---
---cargo
";
    assert_eq!(strip_frontmatter(input), None);

    // same, with a non-empty body before the bad close
    let input = "---
body
---cargo
";
    assert_eq!(strip_frontmatter(input), None);
}
239+
127240
fn check_lexing(src: &str, expect: Expect) {
128241
let actual: String = tokenize(src).map(|token| format!("{:?}\n", token)).collect();
129242
expect.assert_eq(&actual)

0 commit comments

Comments
 (0)