Skip to content

Commit b56b9be

Browse files
committed
Correctly handle nested comments
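The tokenizer currently throws an EOF error for `select 'foo' /*/**/*/`.
Tracking `last_ch` causes problems when tokenizing nested comments: the two-character combinations `/*` and `*/` have to be consumed as a unit, so that a character already used in one delimiter is not reused as part of the next one.
The existing `tokenize_nested_multiline_comment` test fails after fixing this logic, because its expectation depended on the old behavior. For the input `/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/`, the nested comment starts at the first `/*` and ends at the `*/` that follows `/*comment*/*/ `; the trailing ` /comment*/` is therefore outside the comment and is tokenized separately.
Relevant: apache#726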
1 parent fe36020 commit b56b9be

File tree

1 file changed

+64
-8
lines changed

1 file changed

+64
-8
lines changed

src/tokenizer.rs

+64-8
Original file line numberDiff line numberDiff line change
@@ -1855,22 +1855,30 @@ impl<'a> Tokenizer<'a> {
18551855
) -> Result<Option<Token>, TokenizerError> {
18561856
let mut s = String::new();
18571857
let mut nested = 1;
1858-
let mut last_ch = ' ';
18591858

18601859
loop {
18611860
match chars.next() {
18621861
Some(ch) => {
1863-
if last_ch == '/' && ch == '*' {
1862+
if ch == '/' && matches!(chars.peek(), Some('*')) {
1863+
s.push(ch);
1864+
s.push(chars.next().unwrap()); // consume the '*'
18641865
nested += 1;
1865-
} else if last_ch == '*' && ch == '/' {
1866+
continue;
1867+
}
1868+
1869+
if ch == '*' && matches!(chars.peek(), Some('/')) {
1870+
s.push(ch);
1871+
let slash = chars.next();
18661872
nested -= 1;
18671873
if nested == 0 {
1868-
s.pop();
1874+
s.pop(); // remove the last '/'
18691875
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
18701876
}
1877+
s.push(slash.unwrap());
1878+
continue;
18711879
}
1880+
18721881
s.push(ch);
1873-
last_ch = ch;
18741882
}
18751883
None => {
18761884
break self.tokenizer_error(
@@ -2718,17 +2726,65 @@ mod tests {
27182726

27192727
#[test]
27202728
fn tokenize_nested_multiline_comment() {
2721-
let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
2722-
27232729
let dialect = GenericDialect {};
2730+
2731+
let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
27242732
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
27252733
let expected = vec![
27262734
Token::Number("0".to_string(), false),
27272735
Token::Whitespace(Whitespace::MultiLineComment(
2728-
"multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
2736+
"multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
2737+
)),
2738+
Token::Whitespace(Whitespace::Space),
2739+
Token::Div,
2740+
Token::Word(Word {
2741+
value: "comment".to_string(),
2742+
quote_style: None,
2743+
keyword: Keyword::COMMENT,
2744+
}),
2745+
Token::Mul,
2746+
Token::Div,
2747+
Token::Number("1".to_string(), false),
2748+
];
2749+
compare(expected, tokens);
2750+
2751+
let sql2 = String::from("0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1");
2752+
let tokens2 = Tokenizer::new(&dialect, &sql2).tokenize().unwrap();
2753+
let expected2 = vec![
2754+
Token::Number("0".to_string(), false),
2755+
Token::Whitespace(Whitespace::MultiLineComment(
2756+
"multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
27292757
)),
27302758
Token::Number("1".to_string(), false),
27312759
];
2760+
compare(expected2, tokens2);
2761+
2762+
let sql3 = String::from("SELECT 1 /* a /* b */ c */");
2763+
let tokens3 = Tokenizer::new(&dialect, &sql3).tokenize().unwrap();
2764+
let expected3 = vec![
2765+
Token::make_keyword("SELECT"),
2766+
Token::Whitespace(Whitespace::Space),
2767+
Token::Number("1".to_string(), false),
2768+
Token::Whitespace(Whitespace::Space),
2769+
Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
2770+
];
2771+
compare(expected3, tokens3);
2772+
}
2773+
2774+
#[test]
2775+
fn tokenize_nested_multiline_comment_empty() {
2776+
let sql = "select 'foo' /*/**/*/";
2777+
2778+
let dialect = GenericDialect {};
2779+
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2780+
let expected = vec![
2781+
Token::make_keyword("select"),
2782+
Token::Whitespace(Whitespace::Space),
2783+
Token::SingleQuotedString("foo".to_string()),
2784+
Token::Whitespace(Whitespace::Space),
2785+
Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
2786+
];
2787+
27322788
compare(expected, tokens);
27332789
}
27342790

0 commit comments

Comments
 (0)