Skip to content

Commit 2567e14

Browse files
authored
Lexer should consider BOM for the start offset (#11732)
## Summary This PR fixes a bug where the lexer didn't take the BOM into account when computing the start offset. fixes: #11731 ## Test Plan Add multiple test cases that involve a BOM character in the source for the lexer, and verify the snapshots.
1 parent 3b19df0 commit 2567e14

4 files changed

+117
-15
lines changed

crates/ruff_python_parser/src/lexer.rs

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ mod cursor;
3030
mod fstring;
3131
mod indentation;
3232

33+
const BOM: char = '\u{feff}';
34+
3335
/// A lexer for Python source code.
3436
#[derive(Debug)]
3537
pub struct Lexer<'src> {
@@ -100,11 +102,10 @@ impl<'src> Lexer<'src> {
100102
errors: Vec::new(),
101103
};
102104

103-
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
104-
// spell-checker:ignore feff
105-
lexer.cursor.eat_char('\u{feff}');
106-
107-
if start_offset > TextSize::new(0) {
105+
if start_offset == TextSize::new(0) {
106+
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
107+
lexer.cursor.eat_char(BOM);
108+
} else {
108109
lexer.cursor.skip_bytes(start_offset.to_usize());
109110
}
110111

@@ -1922,8 +1923,8 @@ mod tests {
19221923
}
19231924
}
19241925

1925-
fn lex(source: &str, mode: Mode) -> LexerOutput {
1926-
let mut lexer = Lexer::new(source, mode, TextSize::default());
1926+
fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
1927+
let mut lexer = Lexer::new(source, mode, start_offset);
19271928
let mut tokens = Vec::new();
19281929
loop {
19291930
let kind = lexer.next_token();
@@ -1943,8 +1944,8 @@ mod tests {
19431944
}
19441945
}
19451946

1946-
fn lex_valid(source: &str, mode: Mode) -> LexerOutput {
1947-
let output = lex(source, mode);
1947+
fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
1948+
let output = lex(source, mode, start_offset);
19481949

19491950
if !output.errors.is_empty() {
19501951
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
@@ -1959,7 +1960,7 @@ mod tests {
19591960
}
19601961

19611962
fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
1962-
let output = lex(source, mode);
1963+
let output = lex(source, mode, TextSize::default());
19631964

19641965
assert!(
19651966
!output.errors.is_empty(),
@@ -1970,11 +1971,35 @@ mod tests {
19701971
}
19711972

19721973
fn lex_source(source: &str) -> LexerOutput {
1973-
lex_valid(source, Mode::Module)
1974+
lex_valid(source, Mode::Module, TextSize::default())
1975+
}
1976+
1977+
fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
1978+
lex_valid(source, Mode::Module, start_offset)
19741979
}
19751980

19761981
fn lex_jupyter_source(source: &str) -> LexerOutput {
1977-
lex_valid(source, Mode::Ipython)
1982+
lex_valid(source, Mode::Ipython, TextSize::default())
1983+
}
1984+
1985+
#[test]
1986+
fn bom() {
1987+
let source = "\u{feff}x = 1";
1988+
assert_snapshot!(lex_source(source));
1989+
}
1990+
1991+
#[test]
1992+
fn bom_with_offset() {
1993+
let source = "\u{feff}x + y + z";
1994+
assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
1995+
}
1996+
1997+
#[test]
1998+
fn bom_with_offset_edge() {
1999+
// BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z)
2000+
// doesn't panic. Refer https://github.com/astral-sh/ruff/issues/11731
2001+
let source = "\u{feff}x + y + z";
2002+
assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
19782003
}
19792004

19802005
fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
@@ -2118,7 +2143,7 @@ foo = ,func
21182143
def f(arg=%timeit a = b):
21192144
pass"
21202145
.trim();
2121-
let output = lex(source, Mode::Ipython);
2146+
let output = lex(source, Mode::Ipython, TextSize::default());
21222147
assert!(output.errors.is_empty());
21232148
assert_no_ipython_escape_command(&output.tokens);
21242149
}
@@ -2351,7 +2376,7 @@ if first:
23512376
}
23522377

23532378
fn get_tokens_only(source: &str) -> Vec<TokenKind> {
2354-
let output = lex(source, Mode::Module);
2379+
let output = lex(source, Mode::Module, TextSize::default());
23552380
assert!(output.errors.is_empty());
23562381
output.tokens.into_iter().map(|token| token.kind).collect()
23572382
}
@@ -2593,7 +2618,7 @@ f"{(lambda x:{x})}"
25932618
}
25942619

25952620
fn lex_fstring_error(source: &str) -> FStringErrorType {
2596-
let output = lex(source, Mode::Module);
2621+
let output = lex(source, Mode::Module, TextSize::default());
25972622
match output
25982623
.errors
25992624
.into_iter()
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
---
2+
source: crates/ruff_python_parser/src/lexer.rs
3+
expression: lex_source(source)
4+
---
5+
## Tokens
6+
```
7+
[
8+
(
9+
Name(
10+
"x",
11+
),
12+
3..4,
13+
),
14+
(
15+
Equal,
16+
5..6,
17+
),
18+
(
19+
Int(
20+
1,
21+
),
22+
7..8,
23+
),
24+
(
25+
Newline,
26+
8..8,
27+
),
28+
]
29+
```
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
---
2+
source: crates/ruff_python_parser/src/lexer.rs
3+
expression: "lex_source_with_offset(source, TextSize::new(7))"
4+
---
5+
## Tokens
6+
```
7+
[
8+
(
9+
Name(
10+
"y",
11+
),
12+
7..8,
13+
),
14+
(
15+
Plus,
16+
9..10,
17+
),
18+
(
19+
Name(
20+
"z",
21+
),
22+
11..12,
23+
),
24+
(
25+
Newline,
26+
12..12,
27+
),
28+
]
29+
```
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
---
2+
source: crates/ruff_python_parser/src/lexer.rs
3+
expression: "lex_source_with_offset(source, TextSize::new(11))"
4+
---
5+
## Tokens
6+
```
7+
[
8+
(
9+
Name(
10+
"z",
11+
),
12+
11..12,
13+
),
14+
(
15+
Newline,
16+
12..12,
17+
),
18+
]
19+
```

0 commit comments

Comments
 (0)