@@ -30,6 +30,8 @@ mod cursor;
30
30
mod fstring;
31
31
mod indentation;
32
32
33
/// The Unicode byte order mark (U+FEFF). The lexer consumes it when its start offset is 0.
+ const BOM : char = '\u{feff}' ;
34
+
33
35
/// A lexer for Python source code.
34
36
#[ derive( Debug ) ]
35
37
pub struct Lexer < ' src > {
@@ -100,11 +102,10 @@ impl<'src> Lexer<'src> {
100
102
errors : Vec :: new ( ) ,
101
103
} ;
102
104
103
- // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
104
- // spell-checker:ignore feff
105
- lexer. cursor . eat_char ( '\u{feff}' ) ;
106
-
107
- if start_offset > TextSize :: new ( 0 ) {
105
+ if start_offset == TextSize :: new ( 0 ) {
106
+ // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
107
+ lexer. cursor . eat_char ( BOM ) ;
108
+ } else {
108
109
lexer. cursor . skip_bytes ( start_offset. to_usize ( ) ) ;
109
110
}
110
111
@@ -1922,8 +1923,8 @@ mod tests {
1922
1923
}
1923
1924
}
1924
1925
1925
- fn lex ( source : & str , mode : Mode ) -> LexerOutput {
1926
- let mut lexer = Lexer :: new ( source, mode, TextSize :: default ( ) ) ;
1926
+ fn lex ( source : & str , mode : Mode , start_offset : TextSize ) -> LexerOutput {
1927
+ let mut lexer = Lexer :: new ( source, mode, start_offset ) ;
1927
1928
let mut tokens = Vec :: new ( ) ;
1928
1929
loop {
1929
1930
let kind = lexer. next_token ( ) ;
@@ -1943,8 +1944,8 @@ mod tests {
1943
1944
}
1944
1945
}
1945
1946
1946
- fn lex_valid ( source : & str , mode : Mode ) -> LexerOutput {
1947
- let output = lex ( source, mode) ;
1947
+ fn lex_valid ( source : & str , mode : Mode , start_offset : TextSize ) -> LexerOutput {
1948
+ let output = lex ( source, mode, start_offset ) ;
1948
1949
1949
1950
if !output. errors . is_empty ( ) {
1950
1951
let mut message = "Unexpected lexical errors for a valid source:\n " . to_string ( ) ;
@@ -1959,7 +1960,7 @@ mod tests {
1959
1960
}
1960
1961
1961
1962
fn lex_invalid ( source : & str , mode : Mode ) -> LexerOutput {
1962
- let output = lex ( source, mode) ;
1963
+ let output = lex ( source, mode, TextSize :: default ( ) ) ;
1963
1964
1964
1965
assert ! (
1965
1966
!output. errors. is_empty( ) ,
@@ -1970,11 +1971,35 @@ mod tests {
1970
1971
}
1971
1972
1972
1973
/// Test helper: lexes `source` in `Mode::Module` through `lex_valid`, using the default
/// `TextSize` as the start offset.
fn lex_source ( source : & str ) -> LexerOutput {
1973
- lex_valid ( source, Mode :: Module )
1974
+ lex_valid ( source, Mode :: Module , TextSize :: default ( ) )
1975
+ }
1976
+
1977
/// Test helper: lexes `source` in `Mode::Module` through `lex_valid`, passing `start_offset`
/// on as the offset at which the lexer starts.
+ fn lex_source_with_offset ( source : & str , start_offset : TextSize ) -> LexerOutput {
1978
+ lex_valid ( source, Mode :: Module , start_offset)
1974
1979
}
1975
1980
1976
1981
/// Test helper: lexes `source` in `Mode::Ipython` through `lex_valid`, using the default
/// `TextSize` as the start offset.
fn lex_jupyter_source ( source : & str ) -> LexerOutput {
1977
- lex_valid ( source, Mode :: Ipython )
1982
+ lex_valid ( source, Mode :: Ipython , TextSize :: default ( ) )
1983
+ }
1984
+
1985
/// Snapshot test: lexes a source that begins with a byte order mark, using the default
/// start offset.
+ #[ test]
1986
+ fn bom ( ) {
1987
+ let source = "\u{feff} x = 1" ;
1988
+ assert_snapshot ! ( lex_source( source) ) ;
1989
+ }
1990
+
1991
/// Snapshot test: lexes a source that begins with a byte order mark, starting at offset 7
/// instead of the beginning of the source.
+ #[ test]
1992
+ fn bom_with_offset ( ) {
1993
+ let source = "\u{feff} x + y + z" ;
1994
+ assert_snapshot ! ( lex_source_with_offset( source, TextSize :: new( 7 ) ) ) ;
1995
+ }
1996
+
1997
+ #[ test]
1998
+ fn bom_with_offset_edge ( ) {
1999
+ // The BOM shifts the first token by 3 bytes, so make sure that lexing from offset 11 (variable z)
2000
+ // doesn't panic. See https://github.com/astral-sh/ruff/issues/11731
2001
+ let source = "\u{feff} x + y + z" ;
2002
+ assert_snapshot ! ( lex_source_with_offset( source, TextSize :: new( 11 ) ) ) ;
1978
2003
}
1979
2004
1980
2005
fn ipython_escape_command_line_continuation_eol ( eol : & str ) -> LexerOutput {
@@ -2118,7 +2143,7 @@ foo = ,func
2118
2143
def f(arg=%timeit a = b):
2119
2144
pass"
2120
2145
. trim ( ) ;
2121
- let output = lex ( source, Mode :: Ipython ) ;
2146
+ let output = lex ( source, Mode :: Ipython , TextSize :: default ( ) ) ;
2122
2147
assert ! ( output. errors. is_empty( ) ) ;
2123
2148
assert_no_ipython_escape_command ( & output. tokens ) ;
2124
2149
}
@@ -2351,7 +2376,7 @@ if first:
2351
2376
}
2352
2377
2353
2378
/// Test helper: lexes `source` in `Mode::Module` with the default start offset, asserts that
/// no lexical errors were recorded, and returns only the kind of each token.
fn get_tokens_only ( source : & str ) -> Vec < TokenKind > {
2354
- let output = lex ( source, Mode :: Module ) ;
2379
+ let output = lex ( source, Mode :: Module , TextSize :: default ( ) ) ;
2355
2380
assert ! ( output. errors. is_empty( ) ) ;
2356
2381
output. tokens . into_iter ( ) . map ( |token| token. kind ) . collect ( )
2357
2382
}
@@ -2593,7 +2618,7 @@ f"{(lambda x:{x})}"
2593
2618
}
2594
2619
2595
2620
fn lex_fstring_error ( source : & str ) -> FStringErrorType {
2596
- let output = lex ( source, Mode :: Module ) ;
2621
+ let output = lex ( source, Mode :: Module , TextSize :: default ( ) ) ;
2597
2622
match output
2598
2623
. errors
2599
2624
. into_iter ( )
0 commit comments