From 37892f04c5ffcdf492865c302f9720760be7ad04 Mon Sep 17 00:00:00 2001 From: jasonnnli Date: Mon, 26 Feb 2024 17:13:23 +0800 Subject: [PATCH 1/3] refactor: tokenize_escaped_single_quoted_string --- src/tokenizer.rs | 264 ++++++++++++++++++++++++++++-------- tests/sqlparser_postgres.rs | 53 ++++++++ 2 files changed, 263 insertions(+), 54 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 23f963bc8..a447ac2cd 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1199,61 +1199,11 @@ impl<'a> Tokenizer<'a> { starting_loc: Location, chars: &mut State, ) -> Result { - let mut s = String::new(); - - // This case is a bit tricky - - chars.next(); // consume the opening quote - - // slash escaping - let mut is_escaped = false; - while let Some(&ch) = chars.peek() { - macro_rules! escape_control_character { - ($ESCAPED:expr) => {{ - if is_escaped { - s.push($ESCAPED); - is_escaped = false; - } else { - s.push(ch); - } - - chars.next(); - }}; - } - - match ch { - '\'' => { - chars.next(); // consume - if is_escaped { - s.push(ch); - is_escaped = false; - } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) { - s.push(ch); - chars.next(); - } else { - return Ok(s); - } - } - '\\' => { - if is_escaped { - s.push('\\'); - is_escaped = false; - } else { - is_escaped = true; - } - - chars.next(); - } - 'r' => escape_control_character!('\r'), - 'n' => escape_control_character!('\n'), - 't' => escape_control_character!('\t'), - _ => { - is_escaped = false; - chars.next(); // consume - s.push(ch); - } - } + let mut unescape = Unescape { chars }; + if let Some(s) = unescape.unescape_single_quoted_string() { + return Ok(s); } + self.tokenizer_error(starting_loc, "Unterminated encoded string literal") } @@ -1406,6 +1356,147 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool s } +struct Unescape<'a: 'b, 'b> { + chars: &'b mut State<'a>, +} + +impl<'a: 'b, 'b> Unescape<'a, 'b> { + pub(crate) fn unescape_single_quoted_string(&mut self) -> Option { + let mut unescaped = String::new(); + + self.chars.next(); + + while let Some(c) = self.chars.next() { + if c == '\'' { + // case: '''' + if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) { + self.chars.next(); + unescaped.push('\''); + continue; + } + return Some(unescaped); + } + + if c != '\\' { + unescaped.push(c); + continue; + } + + let c = match self.chars.next()? { + 'b' => '\u{0008}', + 'f' => '\u{000C}', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'u' => self.unescape_unicode_16()?, + 'U' => self.unescape_unicode_32()?, + 'x' => self.unescape_hex()?, + c if c.is_digit(8) => self.unescape_octal(c)?, + c => c, + }; + + unescaped.push(Self::check_null(c)?); + } + + None + } + + #[inline] + fn check_null(c: char) -> Option { + if c == '\0' { + None + } else { + Some(c) + } + } + + #[inline] + fn byte_to_char(s: &str) -> Option { + // u32 is used here because Pg has an overflow operation rather than throwing an exception directly. + match u32::from_str_radix(s, RADIX) { + Err(_) => None, + Ok(n) => { + let n = n & 0xFF; + if n <= 127 { + char::from_u32(n) + } else { + None + } + } + } + } + + // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F) + fn unescape_hex(&mut self) -> Option { + let mut s = String::new(); + + for _ in 0..2 { + match self.next_hex_digit() { + Some(c) => s.push(c), + None => break, + } + } + + if s.is_empty() { + return Some('x'); + } + + Self::byte_to_char::<16>(&s) + } + + #[inline] + fn next_hex_digit(&mut self) -> Option { + match self.chars.peek() { + Some(c) if c.is_ascii_hexdigit() => self.chars.next(), + _ => None, + } + } + + // Octal byte value. \o, \oo, \ooo (o = 0–7) + fn unescape_octal(&mut self, c: char) -> Option { + let mut s = String::new(); + + s.push(c); + for _ in 0..2 { + match self.next_octal_digest() { + Some(c) => s.push(c), + None => break, + } + } + + Self::byte_to_char::<8>(&s) + } + + #[inline] + fn next_octal_digest(&mut self) -> Option { + match self.chars.peek() { + Some(c) if c.is_digit(8) => self.chars.next(), + _ => None, + } + } + + // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F) + fn unescape_unicode_16(&mut self) -> Option { + self.unescape_unicode::<4>() + } + + // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F) + fn unescape_unicode_32(&mut self) -> Option { + self.unescape_unicode::<8>() + } + + fn unescape_unicode(&mut self) -> Option { + let mut s = String::new(); + for _ in 0..NUM { + s.push(self.chars.next()?); + } + match u32::from_str_radix(&s, 16) { + Err(_) => None, + Ok(n) => char::from_u32(n), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -2139,4 +2230,69 @@ mod tests { //println!("------------------------------"); assert_eq!(expected, actual); } + + fn check_unescape(s: &str, expected: Option<&str>) { + let s = format!("'{}'", s); + let mut unescape = Unescape { + chars: &mut State { + peekable: s.chars().peekable(), + line: 0, + col: 0, + }, + }; + assert_eq!( + unescape.unescape_single_quoted_string(), + expected.map(|s| s.to_string()) + ); + } + + #[test] + fn test_unescape() { + check_unescape(r"\b", Some("\u{0008}")); + check_unescape(r"\f", Some("\u{000C}")); + check_unescape(r"\t", Some("\t")); + check_unescape(r"\r\n", Some("\r\n")); + check_unescape(r"\/", Some("/")); + check_unescape(r"/", Some("/")); + check_unescape(r"\\", Some("\\")); + + // 16 and 32-bit hexadecimal Unicode character value + check_unescape(r"\u0001", Some("\u{0001}")); + check_unescape(r"\u4c91", Some("\u{4c91}")); + check_unescape(r"\u4c916", Some("\u{4c91}6")); + check_unescape(r"\u4c", None); + check_unescape(r"\u0000", None); + check_unescape(r"\U0010FFFF", Some("\u{10FFFF}")); + check_unescape(r"\U00110000", None); + check_unescape(r"\U00000000", None); + check_unescape(r"\u", None); + check_unescape(r"\U", None); + check_unescape(r"\U1010FFFF", None); + + // hexadecimal byte value + check_unescape(r"\x4B", Some("\u{004b}")); + check_unescape(r"\x4", Some("\u{0004}")); + check_unescape(r"\x4L", Some("\u{0004}L")); + check_unescape(r"\x", Some("x")); + check_unescape(r"\xP", Some("xP")); + check_unescape(r"\x0", None); + check_unescape(r"\xCAD", None); + check_unescape(r"\xA9", None); + + // octal byte value + check_unescape(r"\1", Some("\u{0001}")); + check_unescape(r"\12", Some("\u{000a}")); + check_unescape(r"\123", Some("\u{0053}")); + check_unescape(r"\1232", Some("\u{0053}2")); + check_unescape(r"\4", Some("\u{0004}")); + check_unescape(r"\45", Some("\u{0025}")); + check_unescape(r"\450", Some("\u{0028}")); + check_unescape(r"\603", None); + check_unescape(r"\0", None); + check_unescape(r"\080", None); + + // others + check_unescape(r"\9", Some("9")); + check_unescape(r"''", Some("'")); + } } diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index c987822b8..36fbfa2af 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -2528,6 +2528,59 @@ fn parse_escaped_literal_string() { .to_string(), "sql parser error: Unterminated encoded string literal at Line: 1, Column 8" ); + + let sql = r"SELECT E'\u0001', E'\U0010FFFF', E'\xC', E'\x25', E'\2', E'\45', E'\445'"; + let canonical = ""; + let select = pg_and_generic().verified_only_select_with_canonical(sql, canonical); + assert_eq!(7, select.projection.len()); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("\u{0001}".to_string())), + expr_from_projection(&select.projection[0]) + ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("\u{10ffff}".to_string())), + expr_from_projection(&select.projection[1]) + ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("\u{000c}".to_string())), + expr_from_projection(&select.projection[2]) + ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("%".to_string())), + expr_from_projection(&select.projection[3]) + ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("\u{0002}".to_string())), + expr_from_projection(&select.projection[4]) + ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("%".to_string())), + expr_from_projection(&select.projection[5]) + ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("%".to_string())), + expr_from_projection(&select.projection[6]) + ); + + fn negative_cast(sqls: &[&str]) { + for sql in sqls { + assert_eq!( + pg_and_generic() + .parse_sql_statements(sql) + .unwrap_err() + .to_string(), + "sql parser error: Unterminated encoded string literal at Line: 1, Column 8" + ); + } + } + + negative_cast(&[ + r"SELECT E'\u0000'", + r"SELECT E'\U00110000'", + r"SELECT E'\u{0001}'", + r"SELECT E'\xCAD'", + r"SELECT E'\080'", + ]); } #[test] From 670ca62a8dd33115907d5e6ba1709e794640f229 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 29 Feb 2024 07:52:06 -0500 Subject: [PATCH 2/3] Cleanup interface --- src/tokenizer.rs | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index a447ac2cd..8fac233af 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1199,8 +1199,7 @@ impl<'a> Tokenizer<'a> { starting_loc: Location, chars: &mut State, ) -> Result { - let mut unescape = Unescape { chars }; - if let Some(s) = unescape.unescape_single_quoted_string() { + if let Some(s) = unescape_single_quoted_string(chars) { return Ok(s); } @@ -1356,12 +1355,19 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool s } +fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option { + Unescape::new(chars).unescape() +} + struct Unescape<'a: 'b, 'b> { chars: &'b mut State<'a>, } impl<'a: 'b, 'b> Unescape<'a, 'b> { - pub(crate) fn unescape_single_quoted_string(&mut self) -> Option { + fn new(chars: &'b mut State<'a>) -> Self { + Self { chars } + } + fn unescape(mut self) -> Option { let mut unescaped = String::new(); self.chars.next(); @@ -2233,15 +2239,14 @@ mod tests { fn check_unescape(s: &str, expected: Option<&str>) { let s = format!("'{}'", s); - let mut unescape = Unescape { - chars: &mut State { - peekable: s.chars().peekable(), - line: 0, - col: 0, - }, + let mut state = State { + peekable: s.chars().peekable(), + line: 0, + col: 0, }; + assert_eq!( - unescape.unescape_single_quoted_string(), + unescape_single_quoted_string(&mut state), expected.map(|s| s.to_string()) ); } From 1a0a346db3b763181fd9a6baf1be1bffeab2dffc Mon Sep 17 00:00:00 2001 From: jasonnnli Date: Thu, 29 Feb 2024 23:26:45 +0800 Subject: [PATCH 3/3] chore: add some test case --- src/tokenizer.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 8fac233af..ae0b022a7 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -2299,5 +2299,11 @@ mod tests { // others check_unescape(r"\9", Some("9")); check_unescape(r"''", Some("'")); + check_unescape( + r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232", + Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"), + ); + check_unescape(r"Hello\0", None); + check_unescape(r"Hello\xCADRust", None); } }