Skip to content

Commit 2567e14

Browse files
authored
Lexer should consider BOM for the start offset (#11732)
## Summary This PR fixes a bug where the lexer didn't take the BOM into account when computing the start offset. fixes: #11731 ## Test Plan Add multiple test cases that involve a BOM character in the source for the lexer, and verify the snapshots.
1 parent 3b19df0 commit 2567e14

4 files changed

+117
-15
lines changed

crates/ruff_python_parser/src/lexer.rs

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ mod cursor;
3030
mod fstring;
3131
mod indentation;
3232

33+
const BOM: char = '\u{feff}';
34+
3335
/// A lexer for Python source code.
3436
#[derive(Debug)]
3537
pub struct Lexer<'src> {
@@ -100,11 +102,10 @@ impl<'src> Lexer<'src> {
100102
errors: Vec::new(),
101103
};
102104

103-
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
104-
// spell-checker:ignore feff
105-
lexer.cursor.eat_char('\u{feff}');
106-
107-
if start_offset > TextSize::new(0) {
105+
if start_offset == TextSize::new(0) {
106+
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
107+
lexer.cursor.eat_char(BOM);
108+
} else {
108109
lexer.cursor.skip_bytes(start_offset.to_usize());
109110
}
110111

@@ -1922,8 +1923,8 @@ mod tests {
19221923
}
19231924
}
19241925

1925-
fn lex(source: &str, mode: Mode) -> LexerOutput {
1926-
let mut lexer = Lexer::new(source, mode, TextSize::default());
1926+
fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
1927+
let mut lexer = Lexer::new(source, mode, start_offset);
19271928
let mut tokens = Vec::new();
19281929
loop {
19291930
let kind = lexer.next_token();
@@ -1943,8 +1944,8 @@ mod tests {
19431944
}
19441945
}
19451946

1946-
fn lex_valid(source: &str, mode: Mode) -> LexerOutput {
1947-
let output = lex(source, mode);
1947+
fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
1948+
let output = lex(source, mode, start_offset);
19481949

19491950
if !output.errors.is_empty() {
19501951
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
@@ -1959,7 +1960,7 @@ mod tests {
19591960
}
19601961

19611962
fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
1962-
let output = lex(source, mode);
1963+
let output = lex(source, mode, TextSize::default());
19631964

19641965
assert!(
19651966
!output.errors.is_empty(),
@@ -1970,11 +1971,35 @@ mod tests {
19701971
}
19711972

19721973
fn lex_source(source: &str) -> LexerOutput {
1973-
lex_valid(source, Mode::Module)
1974+
lex_valid(source, Mode::Module, TextSize::default())
1975+
}
1976+
1977+
fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
1978+
lex_valid(source, Mode::Module, start_offset)
19741979
}
19751980

19761981
fn lex_jupyter_source(source: &str) -> LexerOutput {
1977-
lex_valid(source, Mode::Ipython)
1982+
lex_valid(source, Mode::Ipython, TextSize::default())
1983+
}
1984+
1985+
#[test]
1986+
fn bom() {
1987+
let source = "\u{feff}x = 1";
1988+
assert_snapshot!(lex_source(source));
1989+
}
1990+
1991+
#[test]
1992+
fn bom_with_offset() {
1993+
let source = "\u{feff}x + y + z";
1994+
assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
1995+
}
1996+
1997+
#[test]
1998+
fn bom_with_offset_edge() {
1999+
// BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z)
2000+
// doesn't panic. Refer https://github.com/astral-sh/ruff/issues/11731
2001+
let source = "\u{feff}x + y + z";
2002+
assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
19782003
}
19792004

19802005
fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
@@ -2118,7 +2143,7 @@ foo = ,func
21182143
def f(arg=%timeit a = b):
21192144
pass"
21202145
.trim();
2121-
let output = lex(source, Mode::Ipython);
2146+
let output = lex(source, Mode::Ipython, TextSize::default());
21222147
assert!(output.errors.is_empty());
21232148
assert_no_ipython_escape_command(&output.tokens);
21242149
}
@@ -2351,7 +2376,7 @@ if first:
23512376
}
23522377

23532378
fn get_tokens_only(source: &str) -> Vec<TokenKind> {
2354-
let output = lex(source, Mode::Module);
2379+
let output = lex(source, Mode::Module, TextSize::default());
23552380
assert!(output.errors.is_empty());
23562381
output.tokens.into_iter().map(|token| token.kind).collect()
23572382
}
@@ -2593,7 +2618,7 @@ f"{(lambda x:{x})}"
25932618
}
25942619

25952620
fn lex_fstring_error(source: &str) -> FStringErrorType {
2596-
let output = lex(source, Mode::Module);
2621+
let output = lex(source, Mode::Module, TextSize::default());
25972622
match output
25982623
.errors
25992624
.into_iter()
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
---
2+
source: crates/ruff_python_parser/src/lexer.rs
3+
expression: lex_source(source)
4+
---
5+
## Tokens
6+
```
7+
[
8+
(
9+
Name(
10+
"x",
11+
),
12+
3..4,
13+
),
14+
(
15+
Equal,
16+
5..6,
17+
),
18+
(
19+
Int(
20+
1,
21+
),
22+
7..8,
23+
),
24+
(
25+
Newline,
26+
8..8,
27+
),
28+
]
29+
```
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
---
2+
source: crates/ruff_python_parser/src/lexer.rs
3+
expression: "lex_source_with_offset(source, TextSize::new(7))"
4+
---
5+
## Tokens
6+
```
7+
[
8+
(
9+
Name(
10+
"y",
11+
),
12+
7..8,
13+
),
14+
(
15+
Plus,
16+
9..10,
17+
),
18+
(
19+
Name(
20+
"z",
21+
),
22+
11..12,
23+
),
24+
(
25+
Newline,
26+
12..12,
27+
),
28+
]
29+
```
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
---
2+
source: crates/ruff_python_parser/src/lexer.rs
3+
expression: "lex_source_with_offset(source, TextSize::new(11))"
4+
---
5+
## Tokens
6+
```
7+
[
8+
(
9+
Name(
10+
"z",
11+
),
12+
11..12,
13+
),
14+
(
15+
Newline,
16+
12..12,
17+
),
18+
]
19+
```

0 commit comments

Comments
 (0)