diff --git a/src/dialect/clickhouse.rs b/src/dialect/clickhouse.rs index 884dfcbcb..830b3da90 100644 --- a/src/dialect/clickhouse.rs +++ b/src/dialect/clickhouse.rs @@ -59,6 +59,10 @@ impl Dialect for ClickHouseDialect { true } + fn supports_numeric_literal_underscores(&self) -> bool { + true + } + // ClickHouse uses this for some FORMAT expressions in `INSERT` context, e.g. when inserting // with FORMAT JSONEachRow a raw JSON key-value expression is valid and expected. // diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 9fc16cd56..5f2b4e26a 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -304,6 +304,11 @@ pub trait Dialect: Debug + Any { false } + /// Returns true if the dialect supports numbers containing underscores, e.g. `10_000_000` + fn supports_numeric_literal_underscores(&self) -> bool { + false + } + /// Returns true if the dialects supports specifying null treatment /// as part of a window function's parameter list as opposed /// to after the parameter list. diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index d4f2a032e..5ce4250fb 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -249,6 +249,10 @@ impl Dialect for PostgreSqlDialect { fn supports_string_escape_constant(&self) -> bool { true } + + fn supports_numeric_literal_underscores(&self) -> bool { + true + } } pub fn parse_create(parser: &mut Parser) -> Option> { diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 309f09d81..7742e8fae 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1136,12 +1136,24 @@ impl<'a> Tokenizer<'a> { } // numbers and period '0'..='9' | '.' => { - let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit()); + // Some dialects support underscore as number separator + // There can only be one at a time and it must be followed by another digit + let is_number_separator = |ch: char, next_char: Option| { + self.dialect.supports_numeric_literal_underscores() + && ch == '_' + && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit()) + }; + + let mut s = peeking_next_take_while(chars, |ch, next_ch| { + ch.is_ascii_digit() || is_number_separator(ch, next_ch) + }); // match binary literal that starts with 0x if s == "0" && chars.peek() == Some(&'x') { chars.next(); - let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit()); + let s2 = peeking_next_take_while(chars, |ch, next_ch| { + ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch) + }); return Ok(Some(Token::HexStringLiteral(s2))); } @@ -1150,7 +1162,10 @@ impl<'a> Tokenizer<'a> { s.push('.'); chars.next(); } - s += &peeking_take_while(chars, |ch| ch.is_ascii_digit()); + + s += &peeking_next_take_while(chars, |ch, next_ch| { + ch.is_ascii_digit() || is_number_separator(ch, next_ch) + }); // No number -> Token::Period if s == "." { @@ -1946,6 +1961,24 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool s } +/// Same as peeking_take_while, but also passes the next character to the predicate. +fn peeking_next_take_while( + chars: &mut State, + mut predicate: impl FnMut(char, Option) -> bool, +) -> String { + let mut s = String::new(); + while let Some(&ch) = chars.peek() { + let next_char = chars.peekable.clone().nth(1); + if predicate(ch, next_char) { + chars.next(); // consume + s.push(ch); + } else { + break; + } + } + s +} + fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option { Unescape::new(chars).unescape() } @@ -2227,6 +2260,41 @@ mod tests { compare(expected, tokens); } + #[test] + fn tokenize_numeric_literal_underscore() { + let dialect = GenericDialect {}; + let sql = String::from("SELECT 10_000"); + let mut tokenizer = Tokenizer::new(&dialect, &sql); + let tokens = tokenizer.tokenize().unwrap(); + let expected = vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Number("10".to_string(), false), + Token::make_word("_000", None), + ]; + compare(expected, tokens); + + all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to( + "SELECT 10_000, _10_000, 10_00_, 10___0", + vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Number("10_000".to_string(), false), + Token::Comma, + Token::Whitespace(Whitespace::Space), + Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier) + Token::Comma, + Token::Whitespace(Whitespace::Space), + Token::Number("10_00".to_string(), false), + Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects) + Token::Comma, + Token::Whitespace(Whitespace::Space), + Token::Number("10".to_string(), false), + Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects) + ], + ); + } + #[test] fn tokenize_select_exponent() { let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10"); diff --git a/tests/sqlparser_clickhouse.rs b/tests/sqlparser_clickhouse.rs index 0f22db389..5b0638a4b 100644 --- a/tests/sqlparser_clickhouse.rs +++ b/tests/sqlparser_clickhouse.rs @@ -1649,6 +1649,21 @@ fn parse_table_sample() { clickhouse().verified_stmt("SELECT * FROM tbl SAMPLE 1 / 10 OFFSET 1 / 2"); } +#[test] +fn parse_numbers_with_underscore() { + let canonical = if cfg!(feature = "bigdecimal") { + "SELECT 10000" + } else { + "SELECT 10_000" + }; + let select = clickhouse().verified_only_select_with_canonical("SELECT 10_000", canonical); + + assert_eq!( + select.projection, + vec![SelectItem::UnnamedExpr(Expr::Value(number("10_000")))] + ) +} + fn clickhouse() -> TestedDialects { TestedDialects::new(vec![Box::new(ClickHouseDialect {})]) }