Skip to content

Support underscore separators in numbers for Clickhouse. Fixes #1659 #1677

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/dialect/clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ impl Dialect for ClickHouseDialect {
true
}

// Opts the ClickHouse dialect into underscore digit separators inside numeric
// literals (e.g. `10_000`), overriding the trait default.
fn supports_numeric_literal_underscores(&self) -> bool {
true
}

// ClickHouse uses this for some FORMAT expressions in `INSERT` context, e.g. when inserting
// with FORMAT JSONEachRow a raw JSON key-value expression is valid and expected.
//
Expand Down
5 changes: 5 additions & 0 deletions src/dialect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,11 @@ pub trait Dialect: Debug + Any {
false
}

/// Returns true if the dialect supports numbers containing underscores, e.g. `10_000_000`
///
/// The default is `false`; a dialect opts in by overriding this method.
/// When enabled, the tokenizer keeps an underscore that sits between digits inside the
/// number token (`10_000` is a single number). A leading, trailing, or repeated
/// underscore is not absorbed into the number and is tokenized as (part of) a word.
fn supports_numeric_literal_underscores(&self) -> bool {
false
}

/// Returns true if the dialects supports specifying null treatment
/// as part of a window function's parameter list as opposed
/// to after the parameter list.
Expand Down
4 changes: 4 additions & 0 deletions src/dialect/postgresql.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,10 @@ impl Dialect for PostgreSqlDialect {
fn supports_string_escape_constant(&self) -> bool {
true
}

/// Opts the PostgreSQL dialect into underscore digit separators inside numeric
/// literals (e.g. `10_000`), overriding the trait default.
fn supports_numeric_literal_underscores(&self) -> bool {
true
}
}

pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
Expand Down
74 changes: 71 additions & 3 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1136,12 +1136,24 @@ impl<'a> Tokenizer<'a> {
}
// numbers and period
'0'..='9' | '.' => {
let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());
// Some dialects support underscore as number separator
// There can only be one at a time and it must be followed by another digit
let is_number_separator = |ch: char, next_char: Option<char>| {
self.dialect.supports_numeric_literal_underscores()
&& ch == '_'
&& next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
};

let mut s = peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});

// match binary literal that starts with 0x
if s == "0" && chars.peek() == Some(&'x') {
chars.next();
let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
let s2 = peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
});
return Ok(Some(Token::HexStringLiteral(s2)));
}

Expand All @@ -1150,7 +1162,10 @@ impl<'a> Tokenizer<'a> {
s.push('.');
chars.next();
}
s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

s += &peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});

// No number -> Token::Period
if s == "." {
Expand Down Expand Up @@ -1946,6 +1961,24 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
s
}

/// Consumes characters from `chars` for as long as `predicate` accepts them and returns
/// the consumed characters as a `String`.
///
/// Behaves like `peeking_take_while`, except that the predicate is given one extra
/// character of lookahead: it is called with the character currently at the front of
/// the stream and the character right after it (`None` when there is no such character).
/// The first rejected character is left in the stream.
fn peeking_next_take_while(
    chars: &mut State,
    mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
    let mut taken = String::new();
    loop {
        let current = match chars.peek() {
            Some(&c) => c,
            None => break,
        };
        // Inspect a throwaway clone of the iterator so the real position is untouched;
        // element 0 of the clone is `current`, element 1 is the character after it.
        let lookahead = chars.peekable.clone().nth(1);
        if !predicate(current, lookahead) {
            break;
        }
        chars.next(); // advance past `current`
        taken.push(current);
    }
    taken
}

/// Unescapes a single-quoted string read from `chars`.
///
/// Thin wrapper that builds an `Unescape` over the character stream and returns whatever
/// its `unescape` method yields.
// NOTE(review): `None` presumably signals a malformed or unterminated string — confirm
// against `Unescape::unescape`.
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
Unescape::new(chars).unescape()
}
Expand Down Expand Up @@ -2227,6 +2260,41 @@ mod tests {
compare(expected, tokens);
}

#[test]
fn tokenize_numeric_literal_underscore() {
    // GenericDialect is expected to split `10_000` into the number `10` followed by
    // the word `_000`.
    let generic = GenericDialect {};
    let query = String::from("SELECT 10_000");
    let actual = Tokenizer::new(&generic, &query).tokenize().unwrap();
    let split_tokens = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        Token::Number("10".to_string(), false),
        Token::make_word("_000", None),
    ];
    compare(split_tokens, actual);

    // Every dialect that opts in must keep a single underscore between digits inside the
    // number token, and must not absorb leading, trailing, or repeated underscores.
    let opted_in = all_dialects_where(|d| d.supports_numeric_literal_underscores());
    opted_in.tokenizes_to(
        "SELECT 10_000, _10_000, 10_00_, 10___0",
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            // `10_000`: one number token.
            Token::Number("10_000".to_string(), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            // `_10_000`: a leading underscore tokenizes as a word (parsed as a column identifier).
            Token::make_word("_10_000", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            // `10_00_`: a trailing underscore tokenizes as a separate word
            // (a syntax error in some dialects).
            Token::Number("10_00".to_string(), false),
            Token::make_word("_", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            // `10___0`: consecutive underscores end the number; the rest tokenizes as a word
            // (a syntax error in some dialects).
            Token::Number("10".to_string(), false),
            Token::make_word("___0", None),
        ],
    );
}

#[test]
fn tokenize_select_exponent() {
let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
Expand Down
15 changes: 15 additions & 0 deletions tests/sqlparser_clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1649,6 +1649,21 @@ fn parse_table_sample() {
clickhouse().verified_stmt("SELECT * FROM tbl SAMPLE 1 / 10 OFFSET 1 / 2");
}

#[test]
fn parse_numbers_with_underscore() {
    // The canonical (re-serialized) SQL depends on the `bigdecimal` feature: with it the
    // literal is printed without the separator, without it the original text is kept.
    #[cfg(feature = "bigdecimal")]
    let canonical = "SELECT 10000";
    #[cfg(not(feature = "bigdecimal"))]
    let canonical = "SELECT 10_000";

    let expected_projection = vec![SelectItem::UnnamedExpr(Expr::Value(number("10_000")))];

    let select = clickhouse().verified_only_select_with_canonical("SELECT 10_000", canonical);
    assert_eq!(select.projection, expected_projection);
}

/// Builds the `TestedDialects` harness used by the tests in this file, containing only
/// `ClickHouseDialect`.
fn clickhouse() -> TestedDialects {
TestedDialects::new(vec![Box::new(ClickHouseDialect {})])
}
Expand Down
Loading