From 113722efb230072d31b14eb991b947761006f43e Mon Sep 17 00:00:00 2001
From: Markus Westerlind
Date: Fri, 4 Feb 2022 17:28:22 +0100
Subject: [PATCH 1/5] fix: Handle double quotes inside quoted identifiers correctly

This fixes #410 for standard SQL; however, I don't know enough about other
dialects to know if they handle this differently. May need more extensive
testing as well.
---
 src/ast/mod.rs              | 15 +++++++++++++--
 src/tokenizer.rs            | 22 ++++++++++++++++++++--
 tests/sqlparser_postgres.rs |  5 +++++
 3 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 477a22890..81d98a820 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -23,7 +23,7 @@ use alloc::{
     string::{String, ToString},
     vec::Vec,
 };
-use core::fmt;
+use core::fmt::{self, Write};
 
 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
@@ -127,7 +127,18 @@ impl From<&str> for Ident {
 impl fmt::Display for Ident {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self.quote_style {
-            Some(q) if q == '"' || q == '\'' || q == '`' => write!(f, "{}{}{}", q, self.value, q),
+            Some(q) if q == '"' || q == '\'' || q == '`' => {
+                f.write_char(q)?;
+                let mut first = true;
+                for s in self.value.split_inclusive(q) {
+                    if !first {
+                        f.write_char(q)?;
+                    }
+                    first = false;
+                    f.write_str(s)?;
+                }
+                f.write_char(q)
+            }
             Some(q) if q == '[' => write!(f, "[{}]", self.value),
             None => f.write_str(&self.value),
             _ => panic!("unexpected quote style"),
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 296bcc64b..24c91ef95 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -418,8 +418,26 @@ impl<'a> Tokenizer<'a> {
             quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
                 chars.next(); // consume the opening quote
                 let quote_end = Word::matching_end_quote(quote_start);
-                let s = peeking_take_while(chars, |ch| ch != quote_end);
-                if chars.next() == Some(quote_end) {
+                let mut last_char = None;
+                let s = {
+                    let mut s = String::new();
+                    while let Some(ch) = chars.next() {
+                        if ch == quote_end {
+                            if chars.peek() == Some(&quote_end) {
+                                chars.next();
+                                s.push(ch);
+                            } else {
+                                last_char = Some(quote_end);
+                                break;
+                            }
+                        } else {
+                            s.push(ch);
+                        }
+                    }
+                    s
+                };
+
+                if last_char == Some(quote_end) {
                     Ok(Some(Token::make_word(&s, Some(quote_start))))
                 } else {
                     self.tokenizer_error(format!(
diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs
index 60d9c1cb4..0c1bb627c 100644
--- a/tests/sqlparser_postgres.rs
+++ b/tests/sqlparser_postgres.rs
@@ -834,6 +834,11 @@ fn parse_comments() {
     }
 }
 
+#[test]
+fn parse_quoted_identifier() {
+    pg_and_generic().verified_stmt(r#"SELECT "quoted "" ident""#);
+}
+
 fn pg() -> TestedDialects {
     TestedDialects {
         dialects: vec![Box::new(PostgreSqlDialect {})],

From 96e695fd2430d0e732d03126295ef3475a483f52 Mon Sep 17 00:00:00 2001
From: Markus Westerlind
Date: Mon, 7 Feb 2022 12:11:09 +0100
Subject: [PATCH 2/5] refactor: Make quoted identifier parsing a separate function

---
 src/tokenizer.rs | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 24c91ef95..0f13f53b4 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -418,24 +418,7 @@ impl<'a> Tokenizer<'a> {
             quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
                 chars.next(); // consume the opening quote
                 let quote_end = Word::matching_end_quote(quote_start);
-                let mut last_char = None;
-                let s = {
-                    let mut s = String::new();
-                    while let Some(ch) = chars.next() {
-                        if ch == quote_end {
-                            if chars.peek() == Some(&quote_end) {
-                                chars.next();
-                                s.push(ch);
-                            } else {
-                                last_char = Some(quote_end);
-                                break;
-                            }
-                        } else {
-                            s.push(ch);
-                        }
-                    }
-                    s
-                };
+                let (s, last_char) = parse_quoted_ident(chars, quote_end);
 
                 if last_char == Some(quote_end) {
                     Ok(Some(Token::make_word(&s, Some(quote_start))))
@@ -746,6 +729,25 @@ fn peeking_take_while(
     s
 }
 
+fn parse_quoted_ident(chars: &mut Peekable<Chars<'_>>, quote_end: char) -> (String, Option<char>) {
+    let mut last_char = None;
+    let mut s = String::new();
+    while let Some(ch) = chars.next() {
+        if ch == quote_end {
+            if chars.peek() == Some(&quote_end) {
+                chars.next();
+                s.push(ch);
+            } else {
+                last_char = Some(quote_end);
+                break;
+            }
+        } else {
+            s.push(ch);
+        }
+    }
+    (s, last_char)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

From e344aac30f8be12073a5f605f6429f1feec6a51e Mon Sep 17 00:00:00 2001
From: Markus Westerlind
Date: Mon, 7 Feb 2022 12:21:12 +0100
Subject: [PATCH 3/5] test: Check that quoted identifier tokenization works

Added `pretty_assertions` so that the `assert_eq!` output in the tokenization
test is readable.
---
 Cargo.toml       |  1 +
 src/lib.rs       |  4 ++++
 src/tokenizer.rs | 18 ++++++++++++++++++
 3 files changed, 23 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index bf65c052b..d53e3aa9e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,6 +36,7 @@ serde_json = { version = "1.0", optional = true }
 [dev-dependencies]
 simple_logger = "1.9"
 matches = "0.1"
+pretty_assertions = "1"
 
 [package.metadata.release]
 # Instruct `cargo release` to not run `cargo publish` locally:
diff --git a/src/lib.rs b/src/lib.rs
index f04ae07a9..880c6d6e1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -39,6 +39,10 @@
 #[cfg(not(feature = "std"))]
 extern crate alloc;
 
+#[macro_use]
+#[cfg(test)]
+extern crate pretty_assertions;
+
 pub mod ast;
 #[macro_use]
 pub mod dialect;
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 0f13f53b4..faf99cd4e 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1296,6 +1296,24 @@ mod tests {
         compare(expected, tokens);
     }
 
+    #[test]
+    fn tokenize_quoted_identifier() {
+        let sql = r#" "a "" b" "a """ "c """"" "#;
+        let dialect = GenericDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, sql);
+        let tokens = tokenizer.tokenize().unwrap();
+        let expected = vec![
+            Token::Whitespace(Whitespace::Space),
+            Token::make_word(r#"a " b"#.into(), Some('"')),
+            Token::Whitespace(Whitespace::Space),
+            Token::make_word(r#"a ""#.into(), Some('"')),
+            Token::Whitespace(Whitespace::Space),
+            Token::make_word(r#"c """#.into(), Some('"')),
+            Token::Whitespace(Whitespace::Space),
+        ];
+        compare(expected, tokens);
+    }
+
     fn compare(expected: Vec<Token>, actual: Vec<Token>) {
         //println!("------------------------------");
         //println!("tokens = {:?}", actual);

From f85c3991cd80cf62d7deae1e976143ca6207b3bc Mon Sep 17 00:00:00 2001
From: Markus Westerlind
Date: Mon, 7 Feb 2022 12:37:13 +0100
Subject: [PATCH 4/5] test: Check that quoted identifiers work in mysql

---
 tests/sqlparser_mysql.rs | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs
index f67d05c34..c1f558596 100644
--- a/tests/sqlparser_mysql.rs
+++ b/tests/sqlparser_mysql.rs
@@ -178,6 +178,37 @@ fn parse_quote_identifiers() {
     }
 }
 
+#[test]
+fn parse_quote_identifiers_2() {
+    let sql = "SELECT `quoted `` identifier`";
+    assert_eq!(
+        mysql().verified_stmt(sql),
+        Statement::Query(Box::new(Query {
+            with: None,
+            body: SetExpr::Select(Box::new(Select {
+                distinct: false,
+                top: None,
+                projection: vec![SelectItem::UnnamedExpr(Expr::Identifier(Ident {
+                    value: "quoted ` identifier".into(),
+                    quote_style: Some('`'),
+                }))],
+                from: vec![],
+                lateral_views: vec![],
+                selection: None,
+                group_by: vec![],
+                cluster_by: vec![],
+                distribute_by: vec![],
+                sort_by: vec![],
+                having: None,
+            })),
+            order_by: vec![],
+            limit: None,
+            offset: None,
+            fetch: None,
+        }))
+    );
+}
+
 #[test]
 fn parse_unterminated_escape() {
     let sql = r#"SELECT 'I\'m not fine\'"#;

From 2c9cd207e972c0271cbf7ff77b91d7e50eefaebe Mon Sep 17 00:00:00 2001
From: Markus Westerlind
Date: Mon, 7 Feb 2022 15:56:27 +0100
Subject: [PATCH 5/5] chore: cargo clippy

---
 src/tokenizer.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index faf99cd4e..05b530216 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1304,11 +1304,11 @@ mod tests {
         let tokens = tokenizer.tokenize().unwrap();
         let expected = vec![
             Token::Whitespace(Whitespace::Space),
-            Token::make_word(r#"a " b"#.into(), Some('"')),
+            Token::make_word(r#"a " b"#, Some('"')),
             Token::Whitespace(Whitespace::Space),
-            Token::make_word(r#"a ""#.into(), Some('"')),
+            Token::make_word(r#"a ""#, Some('"')),
             Token::Whitespace(Whitespace::Space),
-            Token::make_word(r#"c """#.into(), Some('"')),
+            Token::make_word(r#"c """#, Some('"')),
             Token::Whitespace(Whitespace::Space),
         ];
         compare(expected, tokens);
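
For reference, below is a minimal standalone sketch (not part of the patches and not the sqlparser API) of the escaping rule the series implements: inside a delimited identifier, a doubled closing quote is kept as a single literal quote character. The function name `read_quoted_ident` and the sample input are assumptions made only for this illustration.

use std::iter::Peekable;
use std::str::Chars;

// Reads characters up to the unescaped closing quote; a doubled quote is
// collapsed to one literal quote character. Returns the identifier body and
// whether the closing quote was actually found. (Illustrative only; the
// patched `parse_quoted_ident` returns the last char instead of a bool.)
fn read_quoted_ident(chars: &mut Peekable<Chars<'_>>, quote_end: char) -> (String, bool) {
    let mut s = String::new();
    while let Some(ch) = chars.next() {
        if ch == quote_end {
            if chars.peek() == Some(&quote_end) {
                chars.next(); // consume the second quote of the pair
                s.push(ch); // keep a single literal quote
            } else {
                return (s, true); // unescaped quote terminates the identifier
            }
        } else {
            s.push(ch);
        }
    }
    (s, false) // input ended before the closing quote
}

fn main() {
    // Input starts just after the opening quote, as the tokenizer would see it.
    let mut chars = r#"quoted "" ident" FROM t"#.chars().peekable();
    let (ident, closed) = read_quoted_ident(&mut chars, '"');
    assert_eq!(ident, r#"quoted " ident"#);
    assert!(closed);
}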