Skip to content

Commit b0184b4

Browse files
committed
Correctly handle nested comments
The tokenizer currently throws an EOF error for `select 'foo' /*/**/*/`. Tracking only `last_ch` causes problems when tokenizing nested comments; instead, the tokenizer has to consume each `/*` or `*/` pair as a single unit. The existing `tokenize_nested_multiline_comment` test fails after fixing this logic: in `/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/`, the opening `/*` starts the comment and the `*/` after the final ` ` ends the nested comment, so the trailing `/comment*/` is not part of it. Relevant: apache#726
1 parent 3f5fdeb commit b0184b4

File tree

4 files changed

+108
-17
lines changed

4 files changed

+108
-17
lines changed

src/dialect/generic.rs

+4
Original file line numberDiff line numberDiff line change
@@ -107,4 +107,8 @@ impl Dialect for GenericDialect {
107107
fn supports_asc_desc_in_column_definition(&self) -> bool {
108108
true
109109
}
110+
111+
fn supports_nested_comments(&self) -> bool {
112+
true
113+
}
110114
}

src/dialect/mod.rs

+6
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,12 @@ pub trait Dialect: Debug + Any {
567567
fn supports_asc_desc_in_column_definition(&self) -> bool {
568568
false
569569
}
570+
571+
/// Returns true if the dialect supports nested comments
572+
/// e.g. `/* /* nested */ */`
573+
fn supports_nested_comments(&self) -> bool {
574+
false
575+
}
570576
}
571577

572578
/// This represents the operators for which precedence must be defined

src/dialect/postgresql.rs

+4
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,10 @@ impl Dialect for PostgreSqlDialect {
188188
fn supports_explain_with_utility_options(&self) -> bool {
189189
true
190190
}
191+
192+
fn supports_nested_comments(&self) -> bool {
193+
true
194+
}
191195
}
192196

193197
pub fn parse_comment(parser: &mut Parser) -> Result<Statement, ParserError> {

src/tokenizer.rs

+94-17
Original file line numberDiff line numberDiff line change
@@ -1603,28 +1603,33 @@ impl<'a> Tokenizer<'a> {
16031603
) -> Result<Option<Token>, TokenizerError> {
16041604
let mut s = String::new();
16051605
let mut nested = 1;
1606-
let mut last_ch = ' ';
1606+
let supports_nested_comments = self.dialect.supports_nested_comments();
16071607

16081608
loop {
16091609
match chars.next() {
1610-
Some(ch) => {
1611-
if last_ch == '/' && ch == '*' {
1612-
nested += 1;
1613-
} else if last_ch == '*' && ch == '/' {
1614-
nested -= 1;
1615-
if nested == 0 {
1616-
s.pop();
1617-
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
1618-
}
1610+
Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
1611+
chars.next(); // consume the '*'
1612+
s.push('/');
1613+
s.push('*');
1614+
nested += 1;
1615+
}
1616+
Some('*') if matches!(chars.peek(), Some('/')) => {
1617+
chars.next(); // consume the '/'
1618+
nested -= 1;
1619+
if nested == 0 {
1620+
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
16191621
}
1622+
s.push('*');
1623+
s.push('/');
1624+
}
1625+
Some(ch) => {
16201626
s.push(ch);
1621-
last_ch = ch;
16221627
}
16231628
None => {
16241629
break self.tokenizer_error(
16251630
chars.location(),
16261631
"Unexpected EOF while in a multi-line comment",
1627-
)
1632+
);
16281633
}
16291634
}
16301635
}
@@ -2466,18 +2471,90 @@ mod tests {
24662471

24672472
#[test]
24682473
fn tokenize_nested_multiline_comment() {
2469-
let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
2474+
let dialect = GenericDialect {};
2475+
let test_cases = vec![
2476+
(
2477+
"0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
2478+
vec![
2479+
Token::Number("0".to_string(), false),
2480+
Token::Whitespace(Whitespace::MultiLineComment(
2481+
"multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
2482+
)),
2483+
Token::Whitespace(Whitespace::Space),
2484+
Token::Div,
2485+
Token::Word(Word {
2486+
value: "comment".to_string(),
2487+
quote_style: None,
2488+
keyword: Keyword::COMMENT,
2489+
}),
2490+
Token::Mul,
2491+
Token::Div,
2492+
Token::Number("1".to_string(), false),
2493+
],
2494+
),
2495+
(
2496+
"0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
2497+
vec![
2498+
Token::Number("0".to_string(), false),
2499+
Token::Whitespace(Whitespace::MultiLineComment(
2500+
"multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
2501+
)),
2502+
Token::Number("1".to_string(), false),
2503+
],
2504+
),
2505+
(
2506+
"SELECT 1/* a /* b */ c */0",
2507+
vec![
2508+
Token::make_keyword("SELECT"),
2509+
Token::Whitespace(Whitespace::Space),
2510+
Token::Number("1".to_string(), false),
2511+
Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
2512+
Token::Number("0".to_string(), false),
2513+
],
2514+
),
2515+
];
2516+
2517+
for (sql, expected) in test_cases {
2518+
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2519+
compare(expected, tokens);
2520+
}
2521+
}
2522+
2523+
#[test]
2524+
fn tokenize_nested_multiline_comment_empty() {
2525+
let sql = "select 1/*/**/*/0";
24702526

24712527
let dialect = GenericDialect {};
2472-
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2528+
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
24732529
let expected = vec![
2530+
Token::make_keyword("select"),
2531+
Token::Whitespace(Whitespace::Space),
2532+
Token::Number("1".to_string(), false),
2533+
Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
24742534
Token::Number("0".to_string(), false),
2535+
];
2536+
2537+
compare(expected, tokens);
2538+
}
2539+
2540+
#[test]
2541+
fn tokenize_nested_comments_if_not_supported() {
2542+
let dialect = SQLiteDialect {};
2543+
let sql = "SELECT 1/*/* nested comment */*/0";
2544+
let tokens = Tokenizer::new(&dialect, sql).tokenize();
2545+
let expected = vec![
2546+
Token::make_keyword("SELECT"),
2547+
Token::Whitespace(Whitespace::Space),
2548+
Token::Number("1".to_string(), false),
24752549
Token::Whitespace(Whitespace::MultiLineComment(
2476-
"multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
2550+
"/* nested comment ".to_string(),
24772551
)),
2478-
Token::Number("1".to_string(), false),
2552+
Token::Mul,
2553+
Token::Div,
2554+
Token::Number("0".to_string(), false),
24792555
];
2480-
compare(expected, tokens);
2556+
2557+
compare(expected, tokens.unwrap());
24812558
}
24822559

24832560
#[test]

0 commit comments

Comments
 (0)