From b56b9bed7661f744109bdc080093e519ff273248 Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Mon, 30 Dec 2024 16:43:29 +0100
Subject: [PATCH 1/6] Correctly handle nested comments

The tokenizer currently throws an EOF error for `select 'foo' /*/**/*/`.

`last_ch` causes problems when tokenizing nested comments: we have to
consume the combination `/*` or `*/` as a unit.

The existing `tokenize_nested_multiline_comment` test fails after fixing
this logic:

    /*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/
    ^^ Start                                      ^^ End nested comment

Relevant: https://github.com/apache/datafusion-sqlparser-rs/pull/726
---
 src/tokenizer.rs | 72 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index da61303b4..28626dddd 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1855,22 +1855,30 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;
-        let mut last_ch = ' ';
 
         loop {
             match chars.next() {
                 Some(ch) => {
-                    if last_ch == '/' && ch == '*' {
+                    if ch == '/' && matches!(chars.peek(), Some('*')) {
+                        s.push(ch);
+                        s.push(chars.next().unwrap()); // consume the '*'
                         nested += 1;
-                    } else if last_ch == '*' && ch == '/' {
+                        continue;
+                    }
+
+                    if ch == '*' && matches!(chars.peek(), Some('/')) {
+                        s.push(ch);
+                        let slash = chars.next();
                         nested -= 1;
                         if nested == 0 {
-                            s.pop();
+                            s.pop(); // remove the last '/'
                             break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                         }
+                        s.push(slash.unwrap());
+                        continue;
                     }
+
                     s.push(ch);
-                    last_ch = ch;
                 }
                 None => {
                     break self.tokenizer_error(
@@ -2718,17 +2726,65 @@ mod tests {
     #[test]
     fn tokenize_nested_multiline_comment() {
-        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
         let dialect = GenericDialect {};
+
+        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
         let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
         let expected = vec![
             Token::Number("0".to_string(), false),
             Token::Whitespace(Whitespace::MultiLineComment(
-                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
+                "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
+            )),
+            Token::Whitespace(Whitespace::Space),
+            Token::Div,
+            Token::Word(Word {
+                value: "comment".to_string(),
+                quote_style: None,
+                keyword: Keyword::COMMENT,
+            }),
+            Token::Mul,
+            Token::Div,
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+
+        let sql2 = String::from("0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1");
+        let tokens2 = Tokenizer::new(&dialect, &sql2).tokenize().unwrap();
+        let expected2 = vec![
+            Token::Number("0".to_string(), false),
+            Token::Whitespace(Whitespace::MultiLineComment(
+                "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
             )),
             Token::Number("1".to_string(), false),
         ];
+        compare(expected2, tokens2);
+
+        let sql3 = String::from("SELECT 1 /* a /* b */ c */");
+        let tokens3 = Tokenizer::new(&dialect, &sql3).tokenize().unwrap();
+        let expected3 = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
+        ];
+        compare(expected3, tokens3);
+    }
+
+    #[test]
+    fn tokenize_nested_multiline_comment_empty() {
+        let sql = "select 'foo' /*/**/*/";
+
+        let dialect = GenericDialect {};
+        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
+        let expected = vec![
+            Token::make_keyword("select"),
+            Token::Whitespace(Whitespace::Space),
+            Token::SingleQuotedString("foo".to_string()),
+            Token::Whitespace(Whitespace::Space),
+            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
+        ];
         compare(expected, tokens);
     }

From 5c6d3ed2a04e11c9a92bbccc205fec4d0f5bf6ff Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Mon, 30 Dec 2024 16:58:54 +0100
Subject: [PATCH 2/6] Fix comment

---
 src/tokenizer.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 28626dddd..8e31918f5 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1871,7 +1871,7 @@ impl<'a> Tokenizer<'a> {
                         let slash = chars.next();
                         nested -= 1;
                         if nested == 0 {
-                            s.pop(); // remove the last '/'
+                            s.pop(); // remove the last '*'
                             break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                         }
                         s.push(slash.unwrap());

From f068f5eb45920810f0fbdc09b9e76194d9725ba8 Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Mon, 30 Dec 2024 17:10:24 +0100
Subject: [PATCH 3/6] Use loop with test cases

---
 src/tokenizer.rs | 84 +++++++++++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 40 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 8e31918f5..13ffdf4c4 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -2727,48 +2727,52 @@ mod tests {
     #[test]
     fn tokenize_nested_multiline_comment() {
         let dialect = GenericDialect {};
-
-        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
-        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
-        let expected = vec![
-            Token::Number("0".to_string(), false),
-            Token::Whitespace(Whitespace::MultiLineComment(
-                "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
-            )),
-            Token::Whitespace(Whitespace::Space),
-            Token::Div,
-            Token::Word(Word {
-                value: "comment".to_string(),
-                quote_style: None,
-                keyword: Keyword::COMMENT,
-            }),
-            Token::Mul,
-            Token::Div,
-            Token::Number("1".to_string(), false),
-        ];
-        compare(expected, tokens);
-
-        let sql2 = String::from("0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1");
-        let tokens2 = Tokenizer::new(&dialect, &sql2).tokenize().unwrap();
-        let expected2 = vec![
-            Token::Number("0".to_string(), false),
-            Token::Whitespace(Whitespace::MultiLineComment(
-                "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
-            )),
-            Token::Number("1".to_string(), false),
-        ];
-        compare(expected2, tokens2);
-
-        let sql3 = String::from("SELECT 1 /* a /* b */ c */");
-        let tokens3 = Tokenizer::new(&dialect, &sql3).tokenize().unwrap();
-        let expected3 = vec![
-            Token::make_keyword("SELECT"),
-            Token::Whitespace(Whitespace::Space),
-            Token::Number("1".to_string(), false),
-            Token::Whitespace(Whitespace::Space),
-            Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
-        ];
-        compare(expected3, tokens3);
+        let test_cases = vec![
+            (
+                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
+                vec![
+                    Token::Number("0".to_string(), false),
+                    Token::Whitespace(Whitespace::MultiLineComment(
+                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
+                    )),
+                    Token::Whitespace(Whitespace::Space),
+                    Token::Div,
+                    Token::Word(Word {
+                        value: "comment".to_string(),
+                        quote_style: None,
+                        keyword: Keyword::COMMENT,
+                    }),
+                    Token::Mul,
+                    Token::Div,
+                    Token::Number("1".to_string(), false),
+                ],
+            ),
+            (
+                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
+                vec![
+                    Token::Number("0".to_string(), false),
+                    Token::Whitespace(Whitespace::MultiLineComment(
+                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
+                    )),
+                    Token::Number("1".to_string(), false),
+                ],
+            ),
+            (
+                "SELECT 1/* a /* b */ c */0",
+                vec![
+                    Token::make_keyword("SELECT"),
+                    Token::Whitespace(Whitespace::Space),
+                    Token::Number("1".to_string(), false),
+                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
+                    Token::Number("0".to_string(), false),
+                ],
+            ),
+        ];
+
+        for (sql, expected) in test_cases {
+            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
+            compare(expected, tokens);
+        }
     }

From d7c8b3b7ff50cf62e6d295daa4519177aa8afd22 Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Mon, 30 Dec 2024 17:11:37 +0100
Subject: [PATCH 4/6] Add character after nested comment

---
 src/tokenizer.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 13ffdf4c4..33faadaf9 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -2777,16 +2777,16 @@ mod tests {
     #[test]
     fn tokenize_nested_multiline_comment_empty() {
-        let sql = "select 'foo' /*/**/*/";
+        let sql = "select 1/*/**/*/0";
 
         let dialect = GenericDialect {};
         let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
         let expected = vec![
             Token::make_keyword("select"),
             Token::Whitespace(Whitespace::Space),
-            Token::SingleQuotedString("foo".to_string()),
-            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
             Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
+            Token::Number("0".to_string(), false),
         ];
         compare(expected, tokens);
     }

From e1bd5c336acf115e02cd9881d0b437cbc0671530 Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Mon, 30 Dec 2024 18:27:54 +0100
Subject: [PATCH 5/6] Add option for nested comments to dialects

---
 src/dialect/generic.rs    |  4 ++++
 src/dialect/mod.rs        |  6 ++++++
 src/dialect/postgresql.rs |  4 ++++
 src/tokenizer.rs          | 23 ++++++++++++++++++++++-
 4 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
index f852152a0..e2a73de8a 100644
--- a/src/dialect/generic.rs
+++ b/src/dialect/generic.rs
@@ -131,4 +131,8 @@ impl Dialect for GenericDialect {
     fn supports_empty_projections(&self) -> bool {
         true
     }
+
+    fn supports_nested_comments(&self) -> bool {
+        true
+    }
 }
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index 1343efca6..9ffbd8ed8 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -682,6 +682,12 @@ pub trait Dialect: Debug + Any {
         false
     }
 
+    /// Returns true if the dialect supports nested comments
+    /// e.g. `/* /* nested */ */`
+    fn supports_nested_comments(&self) -> bool {
+        false
+    }
+
     /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
     /// as an alias assignment operator, rather than a boolean expression.
     /// For example: the following statements are equivalent for such a dialect:
diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs
index 6a13a386a..170b0a7c9 100644
--- a/src/dialect/postgresql.rs
+++ b/src/dialect/postgresql.rs
@@ -241,6 +241,10 @@ impl Dialect for PostgreSqlDialect {
     fn supports_empty_projections(&self) -> bool {
         true
     }
+
+    fn supports_nested_comments(&self) -> bool {
+        true
+    }
 }
 
 pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 33faadaf9..85aaa970f 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1855,11 +1855,12 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;
+        let supports_nested_comments = self.dialect.supports_nested_comments();
 
         loop {
             match chars.next() {
                 Some(ch) => {
-                    if ch == '/' && matches!(chars.peek(), Some('*')) {
+                    if ch == '/' && matches!(chars.peek(), Some('*')) && supports_nested_comments {
                         s.push(ch);
                         s.push(chars.next().unwrap()); // consume the '*'
                         nested += 1;
@@ -2792,6 +2793,26 @@ mod tests {
         compare(expected, tokens);
     }
 
+    #[test]
+    fn tokenize_nested_comments_if_not_supported() {
+        let dialect = SQLiteDialect {};
+        let sql = "SELECT 1/*/* nested comment */*/0";
+        let tokens = Tokenizer::new(&dialect, sql).tokenize();
+        let expected = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+            Token::Whitespace(Whitespace::MultiLineComment(
+                "/* nested comment ".to_string(),
+            )),
+            Token::Mul,
+            Token::Div,
+            Token::Number("0".to_string(), false),
+        ];
+
+        compare(expected, tokens.unwrap());
+    }
+
     #[test]
     fn tokenize_multiline_comment_with_even_asterisks() {
         let sql = String::from("\n/** Comment **/\n");

From 4a6ab2b91e22fb3a3d9567292f17bd3aeef46574 Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Sun, 5 Jan 2025 13:20:16 +0100
Subject: [PATCH 6/6] Move if statements to match inside loop and avoid unwrap

---
 src/tokenizer.rs | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 85aaa970f..38bd33d60 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1859,33 +1859,29 @@ impl<'a> Tokenizer<'a> {
 
         loop {
             match chars.next() {
-                Some(ch) => {
-                    if ch == '/' && matches!(chars.peek(), Some('*')) && supports_nested_comments {
-                        s.push(ch);
-                        s.push(chars.next().unwrap()); // consume the '*'
-                        nested += 1;
-                        continue;
-                    }
-
-                    if ch == '*' && matches!(chars.peek(), Some('/')) {
-                        s.push(ch);
-                        let slash = chars.next();
-                        nested -= 1;
-                        if nested == 0 {
-                            s.pop(); // remove the last '*'
-                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
-                        }
-                        s.push(slash.unwrap());
-                        continue;
-                    }
-
+                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
+                    chars.next(); // consume the '*'
+                    s.push('/');
+                    s.push('*');
+                    nested += 1;
+                }
+                Some('*') if matches!(chars.peek(), Some('/')) => {
+                    chars.next(); // consume the '/'
+                    nested -= 1;
+                    if nested == 0 {
+                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
+                    }
+                    s.push('*');
+                    s.push('/');
+                }
+                Some(ch) => {
                     s.push(ch);
                 }
                 None => {
                     break self.tokenizer_error(
                         chars.location(),
                         "Unexpected EOF while in a multi-line comment",
-                    )
+                    );
                 }
             }
         }
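
The behavior the series enables can be exercised through the crate's public
tokenizer API, using the same `Tokenizer` and `GenericDialect` calls the tests
above rely on. A minimal sketch follows (not taken from the patches; the query
string and the printing loop are made up for illustration):

    use sqlparser::dialect::GenericDialect;
    use sqlparser::tokenizer::Tokenizer;

    fn main() {
        // GenericDialect opts in via supports_nested_comments(), so the whole
        // "/* outer /* inner */ outer */" span is kept as a single
        // Whitespace::MultiLineComment token instead of ending at the first "*/".
        let dialect = GenericDialect {};
        let sql = "SELECT 1 /* outer /* inner */ outer */";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        for token in tokens {
            println!("{token:?}");
        }
    }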
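PATCH 5/6 defaults `supports_nested_comments` to false on the `Dialect` trait,
so dialects other than `GenericDialect` and `PostgreSqlDialect` keep their
previous behavior; a dialect opts in by overriding the method. A hypothetical
sketch, assuming a custom dialect (`MyDialect` is invented for the example;
`is_identifier_start` and `is_identifier_part` are the trait's required
methods):

    use sqlparser::dialect::Dialect;

    #[derive(Debug)]
    struct MyDialect;

    impl Dialect for MyDialect {
        fn is_identifier_start(&self, ch: char) -> bool {
            ch.is_ascii_alphabetic() || ch == '_'
        }

        fn is_identifier_part(&self, ch: char) -> bool {
            ch.is_ascii_alphanumeric() || ch == '_'
        }

        // Opt in to the nested-comment handling added in this series.
        fn supports_nested_comments(&self) -> bool {
            true
        }
    }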