From b56b9bed7661f744109bdc080093e519ff273248 Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Mon, 30 Dec 2024 16:43:29 +0100
Subject: [PATCH 1/6] Correctly handle nested comments

The tokenizer currently throws an EOF error for `select 'foo' /*/**/*/`.

`last_ch` causes problems when tokenizing nested comments: we have to
consume the combination `/*` or `*/` as a unit.

The existing `tokenize_nested_multiline_comment` test fails after fixing
this logic:

    /*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/
    ^^ Start                                      ^^ End nested comment

Relevant: https://github.com/apache/datafusion-sqlparser-rs/pull/726
---
 src/tokenizer.rs | 72 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index da61303b4..28626dddd 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1855,22 +1855,30 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;
-        let mut last_ch = ' ';
 
         loop {
             match chars.next() {
                 Some(ch) => {
-                    if last_ch == '/' && ch == '*' {
+                    if ch == '/' && matches!(chars.peek(), Some('*')) {
+                        s.push(ch);
+                        s.push(chars.next().unwrap()); // consume the '*'
                         nested += 1;
-                    } else if last_ch == '*' && ch == '/' {
+                        continue;
+                    }
+
+                    if ch == '*' && matches!(chars.peek(), Some('/')) {
+                        s.push(ch);
+                        let slash = chars.next();
                         nested -= 1;
                         if nested == 0 {
-                            s.pop();
+                            s.pop(); // remove the last '/'
                             break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                         }
+                        s.push(slash.unwrap());
+                        continue;
                     }
+
                     s.push(ch);
-                    last_ch = ch;
                 }
                 None => {
                     break self.tokenizer_error(
@@ -2718,17 +2726,65 @@ mod tests {
     #[test]
     fn tokenize_nested_multiline_comment() {
-        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
         let dialect = GenericDialect {};
+
+        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
         let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
         let expected = vec![
             Token::Number("0".to_string(), false),
             Token::Whitespace(Whitespace::MultiLineComment(
-                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
+                "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
+            )),
+            Token::Whitespace(Whitespace::Space),
+            Token::Div,
+            Token::Word(Word {
+                value: "comment".to_string(),
+                quote_style: None,
+                keyword: Keyword::COMMENT,
+            }),
+            Token::Mul,
+            Token::Div,
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+
+        let sql2 = String::from("0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1");
+        let tokens2 = Tokenizer::new(&dialect, &sql2).tokenize().unwrap();
+        let expected2 = vec![
+            Token::Number("0".to_string(), false),
+            Token::Whitespace(Whitespace::MultiLineComment(
+                "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
             )),
             Token::Number("1".to_string(), false),
         ];
+        compare(expected2, tokens2);
+
+        let sql3 = String::from("SELECT 1 /* a /* b */ c */");
+        let tokens3 = Tokenizer::new(&dialect, &sql3).tokenize().unwrap();
+        let expected3 = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
+        ];
+        compare(expected3, tokens3);
+    }
+
+    #[test]
+    fn tokenize_nested_multiline_comment_empty() {
+        let sql = "select 'foo' /*/**/*/";
+
+        let dialect = GenericDialect {};
+        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
+        let expected = vec![
+            Token::make_keyword("select"),
+            Token::Whitespace(Whitespace::Space),
+            Token::SingleQuotedString("foo".to_string()),
+            Token::Whitespace(Whitespace::Space),
+            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
+        ];
         compare(expected, tokens);
     }

From 5c6d3ed2a04e11c9a92bbccc205fec4d0f5bf6ff Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Mon, 30 Dec 2024 16:58:54 +0100
Subject: [PATCH 2/6] Fix comment

---
 src/tokenizer.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 28626dddd..8e31918f5 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1871,7 +1871,7 @@ impl<'a> Tokenizer<'a> {
                         let slash = chars.next();
                         nested -= 1;
                         if nested == 0 {
-                            s.pop(); // remove the last '/'
+                            s.pop(); // remove the last '*'
                             break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                         }
                         s.push(slash.unwrap());

From f068f5eb45920810f0fbdc09b9e76194d9725ba8 Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Mon, 30 Dec 2024 17:10:24 +0100
Subject: [PATCH 3/6] Use loop with test cases

---
 src/tokenizer.rs | 84 +++++++++++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 40 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 8e31918f5..13ffdf4c4 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -2727,48 +2727,52 @@ mod tests {
     #[test]
     fn tokenize_nested_multiline_comment() {
         let dialect = GenericDialect {};
-
-        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
-        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
-        let expected = vec![
-            Token::Number("0".to_string(), false),
-            Token::Whitespace(Whitespace::MultiLineComment(
-                "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
-            )),
-            Token::Whitespace(Whitespace::Space),
-            Token::Div,
-            Token::Word(Word {
-                value: "comment".to_string(),
-                quote_style: None,
-                keyword: Keyword::COMMENT,
-            }),
-            Token::Mul,
-            Token::Div,
-            Token::Number("1".to_string(), false),
-        ];
-        compare(expected, tokens);
-
-        let sql2 = String::from("0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1");
-        let tokens2 = Tokenizer::new(&dialect, &sql2).tokenize().unwrap();
-        let expected2 = vec![
-            Token::Number("0".to_string(), false),
-            Token::Whitespace(Whitespace::MultiLineComment(
-                "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
-            )),
-            Token::Number("1".to_string(), false),
-        ];
-        compare(expected2, tokens2);
-
-        let sql3 = String::from("SELECT 1 /* a /* b */ c */");
-        let tokens3 = Tokenizer::new(&dialect, &sql3).tokenize().unwrap();
-        let expected3 = vec![
-            Token::make_keyword("SELECT"),
-            Token::Whitespace(Whitespace::Space),
-            Token::Number("1".to_string(), false),
-            Token::Whitespace(Whitespace::Space),
-            Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
-        ];
-        compare(expected3, tokens3);
+        let test_cases = vec![
+            (
+                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
+                vec![
+                    Token::Number("0".to_string(), false),
+                    Token::Whitespace(Whitespace::MultiLineComment(
+                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
+                    )),
+                    Token::Whitespace(Whitespace::Space),
+                    Token::Div,
+                    Token::Word(Word {
+                        value: "comment".to_string(),
+                        quote_style: None,
+                        keyword: Keyword::COMMENT,
+                    }),
+                    Token::Mul,
+                    Token::Div,
+                    Token::Number("1".to_string(), false),
+                ],
+            ),
+            (
+                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
+                vec![
+                    Token::Number("0".to_string(), false),
+                    Token::Whitespace(Whitespace::MultiLineComment(
+                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
+                    )),
+                    Token::Number("1".to_string(), false),
+                ],
+            ),
+            (
+                "SELECT 1/* a /* b */ c */0",
+                vec![
+                    Token::make_keyword("SELECT"),
+                    Token::Whitespace(Whitespace::Space),
+                    Token::Number("1".to_string(), false),
+                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
+                    Token::Number("0".to_string(), false),
+                ],
+            ),
+        ];
+
+        for (sql, expected) in test_cases {
+            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
+            compare(expected, tokens);
+        }
     }

From d7c8b3b7ff50cf62e6d295daa4519177aa8afd22 Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Mon, 30 Dec 2024 17:11:37 +0100
Subject: [PATCH 4/6] Add character after nested comment

---
 src/tokenizer.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 13ffdf4c4..33faadaf9 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -2777,16 +2777,16 @@ mod tests {
     #[test]
     fn tokenize_nested_multiline_comment_empty() {
-        let sql = "select 'foo' /*/**/*/";
+        let sql = "select 1/*/**/*/0";
 
         let dialect = GenericDialect {};
         let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
         let expected = vec![
             Token::make_keyword("select"),
             Token::Whitespace(Whitespace::Space),
-            Token::SingleQuotedString("foo".to_string()),
-            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
             Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
+            Token::Number("0".to_string(), false),
         ];
         compare(expected, tokens);
     }

From e1bd5c336acf115e02cd9881d0b437cbc0671530 Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Mon, 30 Dec 2024 18:27:54 +0100
Subject: [PATCH 5/6] Add option for nested comments to dialects

---
 src/dialect/generic.rs    |  4 ++++
 src/dialect/mod.rs        |  6 ++++++
 src/dialect/postgresql.rs |  4 ++++
 src/tokenizer.rs          | 23 ++++++++++++++++++++++-
 4 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
index f852152a0..e2a73de8a 100644
--- a/src/dialect/generic.rs
+++ b/src/dialect/generic.rs
@@ -131,4 +131,8 @@ impl Dialect for GenericDialect {
     fn supports_empty_projections(&self) -> bool {
         true
     }
+
+    fn supports_nested_comments(&self) -> bool {
+        true
+    }
 }
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index 1343efca6..9ffbd8ed8 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -682,6 +682,12 @@ pub trait Dialect: Debug + Any {
         false
     }
 
+    /// Returns true if the dialect supports nested comments
+    /// e.g. `/* /* nested */ */`
+    fn supports_nested_comments(&self) -> bool {
+        false
+    }
+
     /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
     /// as an alias assignment operator, rather than a boolean expression.
     /// For example: the following statements are equivalent for such a dialect:
diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs
index 6a13a386a..170b0a7c9 100644
--- a/src/dialect/postgresql.rs
+++ b/src/dialect/postgresql.rs
@@ -241,6 +241,10 @@ impl Dialect for PostgreSqlDialect {
     fn supports_empty_projections(&self) -> bool {
         true
     }
+
+    fn supports_nested_comments(&self) -> bool {
+        true
+    }
 }
 
 pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 33faadaf9..85aaa970f 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1855,11 +1855,12 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;
+        let supports_nested_comments = self.dialect.supports_nested_comments();
 
         loop {
             match chars.next() {
                 Some(ch) => {
-                    if ch == '/' && matches!(chars.peek(), Some('*')) {
+                    if ch == '/' && matches!(chars.peek(), Some('*')) && supports_nested_comments {
                         s.push(ch);
                         s.push(chars.next().unwrap()); // consume the '*'
                         nested += 1;
@@ -2792,6 +2793,26 @@ mod tests {
         compare(expected, tokens);
     }
 
+    #[test]
+    fn tokenize_nested_comments_if_not_supported() {
+        let dialect = SQLiteDialect {};
+        let sql = "SELECT 1/*/* nested comment */*/0";
+        let tokens = Tokenizer::new(&dialect, sql).tokenize();
+        let expected = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+            Token::Whitespace(Whitespace::MultiLineComment(
+                "/* nested comment ".to_string(),
+            )),
+            Token::Mul,
+            Token::Div,
+            Token::Number("0".to_string(), false),
+        ];
+
+        compare(expected, tokens.unwrap());
+    }
+
     #[test]
     fn tokenize_multiline_comment_with_even_asterisks() {
         let sql = String::from("\n/** Comment **/\n");

From 4a6ab2b91e22fb3a3d9567292f17bd3aeef46574 Mon Sep 17 00:00:00 2001
From: Hans Ott
Date: Sun, 5 Jan 2025 13:20:16 +0100
Subject: [PATCH 6/6] Move if statements to match inside loop and avoid unwrap

---
 src/tokenizer.rs | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 85aaa970f..38bd33d60 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1859,33 +1859,29 @@ impl<'a> Tokenizer<'a> {
 
         loop {
             match chars.next() {
-                Some(ch) => {
-                    if ch == '/' && matches!(chars.peek(), Some('*')) && supports_nested_comments {
-                        s.push(ch);
-                        s.push(chars.next().unwrap()); // consume the '*'
-                        nested += 1;
-                        continue;
-                    }
-
-                    if ch == '*' && matches!(chars.peek(), Some('/')) {
-                        s.push(ch);
-                        let slash = chars.next();
-                        nested -= 1;
-                        if nested == 0 {
-                            s.pop(); // remove the last '*'
-                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
-                        }
-                        s.push(slash.unwrap());
-                        continue;
-                    }
-
+                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
+                    chars.next(); // consume the '*'
+                    s.push('/');
+                    s.push('*');
+                    nested += 1;
+                }
+                Some('*') if matches!(chars.peek(), Some('/')) => {
+                    chars.next(); // consume the '/'
+                    nested -= 1;
+                    if nested == 0 {
+                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
+                    }
+                    s.push('*');
+                    s.push('/');
+                }
+                Some(ch) => {
                     s.push(ch);
                 }
                 None => {
                     break self.tokenizer_error(
                         chars.location(),
                         "Unexpected EOF while in a multi-line comment",
-                    )
+                    );
                 }
             }
         }
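
The behavior the series enables can be exercised through the crate's public
tokenizer API, using the same `Tokenizer` and `GenericDialect` calls the tests
above rely on. A minimal sketch follows (not taken from the patches; the query
string and the printing loop are made up for illustration):

    use sqlparser::dialect::GenericDialect;
    use sqlparser::tokenizer::Tokenizer;

    fn main() {
        // GenericDialect opts in via supports_nested_comments(), so the whole
        // "/* outer /* inner */ outer */" span is kept as a single
        // Whitespace::MultiLineComment token instead of ending at the first "*/".
        let dialect = GenericDialect {};
        let sql = "SELECT 1 /* outer /* inner */ outer */";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        for token in tokens {
            println!("{token:?}");
        }
    }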
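PATCH 5/6 defaults `supports_nested_comments` to false on the `Dialect` trait,
so dialects other than `GenericDialect` and `PostgreSqlDialect` keep their
previous behavior; a dialect opts in by overriding the method. A hypothetical
sketch, assuming a custom dialect (`MyDialect` is invented for the example;
`is_identifier_start` and `is_identifier_part` are the trait's required
methods):

    use sqlparser::dialect::Dialect;

    #[derive(Debug)]
    struct MyDialect;

    impl Dialect for MyDialect {
        fn is_identifier_start(&self, ch: char) -> bool {
            ch.is_ascii_alphabetic() || ch == '_'
        }

        fn is_identifier_part(&self, ch: char) -> bool {
            ch.is_ascii_alphanumeric() || ch == '_'
        }

        // Opt in to the nested-comment handling added in this series.
        fn supports_nested_comments(&self) -> bool {
            true
        }
    }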