diff --git a/src/ast/value.rs b/src/ast/value.rs
index 95ea978d0..aad795f81 100644
--- a/src/ast/value.rs
+++ b/src/ast/value.rs
@@ -54,6 +54,7 @@ pub enum Value {
     HexStringLiteral(String),
     DoubleQuotedString(String),
+    OriginalString(String),
     /// Boolean value true or false
     Boolean(bool),
     /// `NULL` value
     Null,
@@ -70,6 +71,7 @@ impl fmt::Display for Value {
             Value::Number(v, l) => write!(f, "{}{long}", v, long = if *l { "L" } else { "" }),
             Value::DoubleQuotedString(v) => write!(f, "\"{v}\""),
             Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)),
+            Value::OriginalString(v) => write!(f, "{}", v),
             Value::DollarQuotedString(v) => write!(f, "{v}"),
             Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
             Value::NationalStringLiteral(v) => write!(f, "N'{v}'"),
diff --git a/src/ast/visitor.rs b/src/ast/visitor.rs
index 81343220a..e857087e7 100644
--- a/src/ast/visitor.rs
+++ b/src/ast/visitor.rs
@@ -632,7 +632,7 @@ mod tests {
 
     fn do_visit(sql: &str) -> Vec<String> {
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, sql);
+        let mut tokenizer = Tokenizer::new(&dialect, sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         let s = Parser::new(&dialect)
             .with_tokens(tokens)
diff --git a/src/parser.rs b/src/parser.rs
index b89077fb5..0be594738 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -198,6 +198,7 @@ const DEFAULT_REMAINING_DEPTH: usize = 50;
 #[derive(Debug, Default, Clone, PartialEq, Eq)]
 pub struct ParserOptions {
     pub trailing_commas: bool,
+    pub no_escape: bool,
 }
 
 pub struct Parser<'a> {
@@ -207,7 +208,7 @@ pub struct Parser<'a> {
     /// The current dialect to use
     dialect: &'a dyn Dialect,
     /// Additional options that allow you to mix & match behavior otherwise
-    /// constrained to certain dialects (e.g. trailing commas)
+    /// constrained to certain dialects (e.g. trailing commas) and/or parsing behavior (e.g. no-escape mode)
     options: ParserOptions,
     /// ensure the stack does not overflow by limiting recursion depth
     recursion_counter: RecursionCounter,
@@ -317,7 +318,10 @@ impl<'a> Parser<'a> {
     /// See example on [`Parser::new()`] for an example
     pub fn try_with_sql(self, sql: &str) -> Result<Self, ParserError> {
         debug!("Parsing sql '{}'...", sql);
-        let mut tokenizer = Tokenizer::new(self.dialect, sql);
+        let tokenizer_options = TokenizerOptions {
+            no_escape: self.options.no_escape,
+        };
+        let mut tokenizer = Tokenizer::new(self.dialect, sql, &tokenizer_options);
         let tokens = tokenizer.tokenize()?;
         Ok(self.with_tokens(tokens))
     }
@@ -4277,6 +4281,7 @@ impl<'a> Parser<'a> {
             },
             Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())),
             Token::DoubleQuotedString(ref s) => Ok(Value::DoubleQuotedString(s.to_string())),
+            Token::OriginalString(ref s) => Ok(Value::OriginalString(s.to_string())),
             Token::DollarQuotedString(ref s) => Ok(Value::DollarQuotedString(s.clone())),
             Token::SingleQuotedByteStringLiteral(ref s) => {
                 Ok(Value::SingleQuotedByteStringLiteral(s.clone()))
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index ffa1a96f2..a9a0e7e68 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -56,6 +56,8 @@ pub enum Token {
     SingleQuotedString(String),
     /// Double quoted string: i.e: "string"
     DoubleQuotedString(String),
+    /// String for no-escape mode
+    OriginalString(String),
     /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
     DollarQuotedString(DollarQuotedString),
     /// Byte string literal: i.e: b'string' or B'string'
@@ -193,6 +195,7 @@ impl fmt::Display for Token {
             Token::Char(ref c) => write!(f, "{c}"),
             Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
             Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
+            Token::OriginalString(ref s) => write!(f, "{s}"),
             Token::DollarQuotedString(ref s) => write!(f, "{s}"),
             Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
             Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
@@ -447,12 +450,21 @@ impl<'a> State<'a> {
 pub struct Tokenizer<'a> {
     dialect: &'a dyn Dialect,
     query: &'a str,
+    options: &'a TokenizerOptions,
+}
+
+pub struct TokenizerOptions {
+    pub no_escape: bool,
 }
 
 impl<'a> Tokenizer<'a> {
     /// Create a new SQL tokenizer for the specified SQL statement
-    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
-        Self { dialect, query }
+    pub fn new(dialect: &'a dyn Dialect, query: &'a str, options: &'a TokenizerOptions) -> Self {
+        Self {
+            dialect,
+            query,
+            options,
+        }
     }
 
     /// Tokenize the statement and produce a vector of tokens
@@ -508,19 +520,39 @@ impl<'a> Tokenizer<'a> {
             // BigQuery uses b or B for byte string literal
             b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                 chars.next(); // consume
-                match chars.peek() {
-                    Some('\'') => {
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
-                        Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
-                    }
-                    Some('\"') => {
-                        let s = self.tokenize_quoted_string(chars, '\"')?;
-                        Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
+                if self.options.no_escape {
+                    match chars.peek() {
+                        Some('\'') => {
+                            let _s = self.tokenize_quoted_string_with_no_escape(chars, '\'')?;
+                            let s = "\'".to_string() + &_s + "\'";
+                            Ok(Some(Token::OriginalString(s)))
+                        }
+                        Some('\"') => {
+                            let _s = self.tokenize_quoted_string_with_no_escape(chars, '\"')?;
+                            let s = "\"".to_string() + &_s + "\"";
+                            Ok(Some(Token::OriginalString(s)))
+                        }
+                        _ => {
+                            // regular identifier starting with a "b" or "B"
+                            let s = self.tokenize_word(b, chars);
+                            Ok(Some(Token::make_word(&s, None)))
+                        }
                     }
-                    _ => {
-                        // regular identifier starting with an "b" or "B"
-                        let s = self.tokenize_word(b, chars);
-                        Ok(Some(Token::make_word(&s, None)))
+                } else {
+                    match chars.peek() {
+                        Some('\'') => {
+                            let s = self.tokenize_quoted_string(chars, '\'')?;
+                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
+                        }
+                        Some('\"') => {
+                            let s = self.tokenize_quoted_string(chars, '\"')?;
+                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
+                        }
+                        _ => {
+                            // regular identifier starting with a "b" or "B"
+                            let s = self.tokenize_word(b, chars);
+                            Ok(Some(Token::make_word(&s, None)))
+                        }
                     }
                 }
             }
@@ -1156,6 +1188,37 @@ impl<'a> Tokenizer<'a> {
         self.tokenizer_error(error_loc, "Unterminated string literal")
     }
 
+    fn tokenize_quoted_string_with_no_escape(
+        &self,
+        chars: &mut State,
+        quote_style: char,
+    ) -> Result<String, TokenizerError> {
+        let mut s = String::new();
+        let error_loc = chars.location();
+
+        chars.next(); // consume the opening quote
+
+        while let Some(&ch) = chars.peek() {
+            match ch {
+                char if char == quote_style => {
+                    chars.next(); // consume
+                    if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {
+                        s.push(ch);
+                        s.push(ch);
+                        chars.next();
+                    } else {
+                        return Ok(s);
+                    }
+                }
+                _ => {
+                    chars.next(); // consume
+                    s.push(ch);
+                }
+            }
+        }
+        self.tokenizer_error(error_loc, "Unterminated string literal")
+    }
+
     fn tokenize_multiline_comment(
         &self,
         chars: &mut State,
@@ -1259,7 +1322,7 @@ mod tests {
     fn tokenize_select_1() {
         let sql = String::from("SELECT 1");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1275,7 +1338,7 @@ mod tests {
     fn tokenize_select_float() {
         let sql = String::from("SELECT .1");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
        let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1291,7 +1354,7 @@ mod tests {
     fn tokenize_select_exponent() {
         let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1326,7 +1389,7 @@ mod tests {
     fn tokenize_scalar_function() {
         let sql = String::from("SELECT sqrt(1)");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1345,7 +1408,7 @@ mod tests {
     fn tokenize_string_string_concat() {
         let sql = String::from("SELECT 'a' || 'b'");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1364,7 +1427,7 @@ mod tests {
     fn tokenize_bitwise_op() {
         let sql = String::from("SELECT one | two ^ three");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1388,7 +1451,7 @@ mod tests {
         let sql =
             String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1428,7 +1491,7 @@ mod tests {
     fn tokenize_simple_select() {
         let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1460,7 +1523,7 @@ mod tests {
     fn tokenize_explain_select() {
         let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1490,7 +1553,7 @@ mod tests {
     fn tokenize_explain_analyze_select() {
         let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1522,7 +1585,7 @@ mod tests {
     fn tokenize_string_predicate() {
         let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1551,7 +1614,7 @@ mod tests {
         let sql = String::from("\n💝مصطفىh");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         // println!("tokens: {:#?}", tokens);
         let expected = vec![
@@ -1567,7 +1630,7 @@ mod tests {
         let sql = String::from("'foo\r\nbar\nbaz'");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
         compare(expected, tokens);
@@ -1578,7 +1641,7 @@ mod tests {
         let sql = String::from("select 'foo");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         assert_eq!(
             tokenizer.tokenize(),
             Err(TokenizerError {
@@ -1594,7 +1657,7 @@ mod tests {
         let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         assert_eq!(
             tokenizer.tokenize(),
             Err(TokenizerError {
@@ -1610,7 +1673,7 @@ mod tests {
         let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         // println!("tokens: {:#?}", tokens);
         let expected = vec![
@@ -1634,7 +1697,7 @@ mod tests {
     fn tokenize_right_arrow() {
         let sql = String::from("FUNCTION(key=>value)");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         let expected = vec![
             Token::make_word("FUNCTION", None),
@@ -1651,7 +1714,7 @@ mod tests {
     fn tokenize_is_null() {
         let sql = String::from("a IS NULL");
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
 
         let expected = vec![
@@ -1670,7 +1733,7 @@ mod tests {
         let sql = String::from("0--this is a comment\n1");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         let expected = vec![
             Token::Number("0".to_string(), false),
@@ -1688,7 +1751,7 @@ mod tests {
         let sql = String::from("--this is a comment");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
             prefix: "--".to_string(),
@@ -1702,7 +1765,7 @@ mod tests {
         let sql = String::from("0/*multi-line\n* /comment*/1");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         let expected = vec![
             Token::Number("0".to_string(), false),
@@ -1719,7 +1782,7 @@ mod tests {
         let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         let expected = vec![
             Token::Number("0".to_string(), false),
@@ -1736,7 +1799,7 @@ mod tests {
         let sql = String::from("\n/** Comment **/\n");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         let expected = vec![
             Token::Whitespace(Whitespace::Newline),
@@ -1751,7 +1814,7 @@ mod tests {
         let sql = String::from(" \u{2003}\n");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         let tokens = tokenizer.tokenize().unwrap();
         let expected = vec![
             Token::Whitespace(Whitespace::Space),
@@ -1766,7 +1829,7 @@ mod tests {
         let sql = String::from("\"foo");
 
         let dialect = GenericDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false });
         assert_eq!(
             tokenizer.tokenize(),
             Err(TokenizerError {
@@ -1782,7 +1845,7 @@ mod tests {
         let sql = String::from("line1\nline2\rline3\r\nline4\r");
String::from("line1\nline2\rline3\r\nline4\r"); let dialect = GenericDialect {}; - let mut tokenizer = Tokenizer::new(&dialect, &sql); + let mut tokenizer = Tokenizer::new(&dialect, &sql, &TokenizerOptions { no_escape: false }); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_word("line1", None), @@ -1801,7 +1864,7 @@ mod tests { fn tokenize_mssql_top() { let sql = "SELECT TOP 5 [bar] FROM foo"; let dialect = MsSqlDialect {}; - let mut tokenizer = Tokenizer::new(&dialect, sql); + let mut tokenizer = Tokenizer::new(&dialect, sql, &TokenizerOptions { no_escape: false }); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), @@ -1823,7 +1886,7 @@ mod tests { fn tokenize_pg_regex_match() { let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'"; let dialect = GenericDialect {}; - let mut tokenizer = Tokenizer::new(&dialect, sql); + let mut tokenizer = Tokenizer::new(&dialect, sql, &TokenizerOptions { no_escape: false }); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), @@ -1862,7 +1925,7 @@ mod tests { fn tokenize_quoted_identifier() { let sql = r#" "a "" b" "a """ "c """"" "#; let dialect = GenericDialect {}; - let mut tokenizer = Tokenizer::new(&dialect, sql); + let mut tokenizer = Tokenizer::new(&dialect, sql, &TokenizerOptions { no_escape: false }); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::Whitespace(Whitespace::Space), @@ -1880,7 +1943,7 @@ mod tests { fn tokenize_with_location() { let sql = "SELECT a,\n b"; let dialect = GenericDialect {}; - let mut tokenizer = Tokenizer::new(&dialect, sql); + let mut tokenizer = Tokenizer::new(&dialect, sql, &TokenizerOptions { no_escape: false }); let tokens = tokenizer.tokenize_with_location().unwrap(); let expected = vec![ TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1), diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index ad3507058..3ffe2dd55 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -7068,6 +7068,7 @@ fn parse_trailing_comma() { dialects: vec![Box::new(GenericDialect {})], options: Some(ParserOptions { trailing_commas: true, + no_escape: false, }), }; @@ -7093,6 +7094,32 @@ fn parse_trailing_comma() { trailing_commas.verified_stmt("SELECT DISTINCT ON (album_id) name FROM track"); } +#[test] +fn parse_with_no_escape() { + let no_escape = TestedDialects { + dialects: vec![Box::new(MySqlDialect {})], // MySQL uses backslash as escape character + options: Some(ParserOptions { + trailing_commas: false, + no_escape: true, + }), + }; + + no_escape.one_statement_parses_to( + r#"INSERT INTO `table` VALUES ("I\'m a value")"#, + r#"INSERT INTO `table` VALUES ("I\'m a value")"#, + ); + + no_escape.one_statement_parses_to( + r#"INSERT INTO `table` VALUES ('I''m a value')"#, + r#"INSERT INTO `table` VALUES ('I''m a value')"#, + ); + + no_escape.one_statement_parses_to( + r#"INSERT INTO `table` VALUES ('I\\'m a value')"#, + r#"INSERT INTO `table` VALUES ('I\\'m a value')"#, + ); +} + #[test] fn parse_create_type() { let create_type = diff --git a/tests/sqlparser_snowflake.rs b/tests/sqlparser_snowflake.rs index 9a54c89cf..4649a559e 100644 --- a/tests/sqlparser_snowflake.rs +++ b/tests/sqlparser_snowflake.rs @@ -55,7 +55,7 @@ fn test_snowflake_create_transient_table() { fn test_snowflake_single_line_tokenize() { let sql = "CREATE TABLE# this is a comment \ntable_1"; let dialect = SnowflakeDialect {}; - let mut tokenizer = 
+    let mut tokenizer = Tokenizer::new(&dialect, sql, &TokenizerOptions { no_escape: false });
     let tokens = tokenizer.tokenize().unwrap();
 
     let expected = vec![
@@ -72,7 +72,7 @@ fn test_snowflake_single_line_tokenize() {
     assert_eq!(expected, tokens);
 
     let sql = "CREATE TABLE // this is a comment \ntable_1";
-    let mut tokenizer = Tokenizer::new(&dialect, sql);
+    let mut tokenizer = Tokenizer::new(&dialect, sql, &TokenizerOptions { no_escape: false });
     let tokens = tokenizer.tokenize().unwrap();
 
     let expected = vec![
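
Usage note (editor's addition, not part of the patch): the sketch below shows how the new option is meant to be driven end to end, reusing a statement from the `parse_with_no_escape` test above. It assumes sqlparser's existing `Parser::with_options` and `Parser::parse_statements` helpers, which this patch does not touch. Because `Tokenizer` now stores a `&TokenizerOptions`, the options are bound to a local before a reference is taken, the same pattern `try_with_sql` uses, so the borrow outlives the tokenizer. With `no_escape: true`, quoted literals are kept byte for byte, doubled quotes included (see `tokenize_quoted_string_with_no_escape`), and round-trip through `Token::OriginalString` / `Value::OriginalString`.

    use sqlparser::dialect::MySqlDialect;
    use sqlparser::parser::{Parser, ParserOptions};
    use sqlparser::tokenizer::{Tokenizer, TokenizerOptions};

    fn main() {
        let dialect = MySqlDialect {};
        // Same literal as the new test: the doubled single quote must survive as-is.
        let sql = r#"INSERT INTO `table` VALUES ('I''m a value')"#;

        // Tokenizer level: bind the options first, then tokenize with escaping disabled.
        let tokenizer_options = TokenizerOptions { no_escape: true };
        let mut tokenizer = Tokenizer::new(&dialect, sql, &tokenizer_options);
        let tokens = tokenizer.tokenize().unwrap();
        println!("{tokens:?}");

        // Parser level: `try_with_sql` forwards `no_escape` to the tokenizer,
        // so the parsed statement is expected to serialize back exactly as written.
        let options = ParserOptions {
            trailing_commas: false,
            no_escape: true,
        };
        let statements = Parser::new(&dialect)
            .with_options(options)
            .try_with_sql(sql)
            .unwrap()
            .parse_statements()
            .unwrap();
        assert_eq!(statements[0].to_string(), sql);
    }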