@@ -1136,12 +1136,24 @@ impl<'a> Tokenizer<'a> {
             }
             // numbers and period
             '0'..='9' | '.' => {
-                let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());
+                // Some dialects support underscore as number separator
+                // There can only be one at a time and it must be followed by another digit
+                let is_number_separator = |ch: char, next_char: Option<char>| {
+                    self.dialect.supports_numeric_literal_underscores()
+                        && ch == '_'
+                        && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
+                };
+
+                let mut s = peeking_next_take_while(chars, |ch, next_ch| {
+                    ch.is_ascii_digit() || is_number_separator(ch, next_ch)
+                });

                 // match binary literal that starts with 0x
                 if s == "0" && chars.peek() == Some(&'x') {
                     chars.next();
-                    let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
+                    let s2 = peeking_next_take_while(chars, |ch, next_ch| {
+                        ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
+                    });
                     return Ok(Some(Token::HexStringLiteral(s2)));
                 }

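Note on the rule encoded above: an underscore is only swallowed into the number when the dialect opts in and the very next character is another (hex) digit, so trailing or consecutive underscores end the number token instead. A minimal, standalone sketch of that rule applied to a plain &str follows; the take_number helper is illustrative only and not part of the crate (the real tokenizer works on its own State type):

    // Sketch of the separator rule: digits always accepted, '_' accepted only
    // when underscores are supported and the following character is a digit.
    fn take_number(s: &str, underscores_supported: bool) -> String {
        let chars: Vec<char> = s.chars().collect();
        let mut out = String::new();
        let mut i = 0;
        while i < chars.len() {
            let ch = chars[i];
            let next = chars.get(i + 1).copied();
            let is_sep = underscores_supported
                && ch == '_'
                && next.is_some_and(|n| n.is_ascii_hexdigit());
            if ch.is_ascii_digit() || is_sep {
                out.push(ch);
                i += 1;
            } else {
                break;
            }
        }
        out
    }

    fn main() {
        assert_eq!(take_number("10_000,", true), "10_000"); // separator followed by a digit: kept
        assert_eq!(take_number("10_00_", true), "10_00");   // trailing underscore: stop before it
        assert_eq!(take_number("10___0", true), "10");      // consecutive underscores: stop at the first
        assert_eq!(take_number("10_000", false), "10");     // dialect opt-out: underscore ends the number
    }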
@@ -1150,7 +1162,10 @@ impl<'a> Tokenizer<'a> {
                     s.push('.');
                     chars.next();
                 }
-                s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());
+
+                s += &peeking_next_take_while(chars, |ch, next_ch| {
+                    ch.is_ascii_digit() || is_number_separator(ch, next_ch)
+                });

                 // No number -> Token::Period
                 if s == "." {
@@ -1946,6 +1961,24 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
     s
 }

+/// Same as peeking_take_while, but also passes the next character to the predicate.
+fn peeking_next_take_while(
+    chars: &mut State,
+    mut predicate: impl FnMut(char, Option<char>) -> bool,
+) -> String {
+    let mut s = String::new();
+    while let Some(&ch) = chars.peek() {
+        let next_char = chars.peekable.clone().nth(1);
+        if predicate(ch, next_char) {
+            chars.next(); // consume
+            s.push(ch);
+        } else {
+            break;
+        }
+    }
+    s
+}
+
 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
     Unescape::new(chars).unescape()
 }
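The one-character lookahead in peeking_next_take_while comes from cloning the underlying peekable iterator and calling nth(1), which inspects the character after the peeked one without consuming anything from the real stream. A standalone sketch of the same trick on Peekable<Chars> (variable names are illustrative, not from the crate):

    fn main() {
        let mut it = "1_0".chars().peekable();
        let current = it.peek().copied();  // Some('1'), not consumed
        let following = it.clone().nth(1); // Some('_'), also not consumed
        assert_eq!(current, Some('1'));
        assert_eq!(following, Some('_'));
        assert_eq!(it.next(), Some('1'));  // the original iterator is untouched until now
    }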
@@ -2227,6 +2260,41 @@ mod tests {
         compare(expected, tokens);
     }

+    #[test]
+    fn tokenize_numeric_literal_underscore() {
+        let dialect = GenericDialect {};
+        let sql = String::from("SELECT 10_000");
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let tokens = tokenizer.tokenize().unwrap();
+        let expected = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("10".to_string(), false),
+            Token::make_word("_000", None),
+        ];
+        compare(expected, tokens);
+
+        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
+            "SELECT 10_000, _10_000, 10_00_, 10___0",
+            vec![
+                Token::make_keyword("SELECT"),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("10_000".to_string(), false),
+                Token::Comma,
+                Token::Whitespace(Whitespace::Space),
+                Token::make_word("_10_000", None), // a leading underscore tokenizes as a word (parsed as a column identifier)
+                Token::Comma,
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("10_00".to_string(), false),
+                Token::make_word("_", None), // a trailing underscore tokenizes as a word (a syntax error in some dialects)
+                Token::Comma,
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("10".to_string(), false),
+                Token::make_word("___0", None), // consecutive underscores tokenize as a word (a syntax error in some dialects)
+            ],
+        );
+    }
+
     #[test]
     fn tokenize_select_exponent() {
         let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");