diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 23f963bc8..ae0b022a7 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1199,61 +1199,10 @@ impl<'a> Tokenizer<'a> {
         starting_loc: Location,
         chars: &mut State,
     ) -> Result<String, TokenizerError> {
-        let mut s = String::new();
-
-        // This case is a bit tricky
-
-        chars.next(); // consume the opening quote
-
-        // slash escaping
-        let mut is_escaped = false;
-        while let Some(&ch) = chars.peek() {
-            macro_rules! escape_control_character {
-                ($ESCAPED:expr) => {{
-                    if is_escaped {
-                        s.push($ESCAPED);
-                        is_escaped = false;
-                    } else {
-                        s.push(ch);
-                    }
-
-                    chars.next();
-                }};
-            }
-
-            match ch {
-                '\'' => {
-                    chars.next(); // consume
-                    if is_escaped {
-                        s.push(ch);
-                        is_escaped = false;
-                    } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
-                        s.push(ch);
-                        chars.next();
-                    } else {
-                        return Ok(s);
-                    }
-                }
-                '\\' => {
-                    if is_escaped {
-                        s.push('\\');
-                        is_escaped = false;
-                    } else {
-                        is_escaped = true;
-                    }
-
-                    chars.next();
-                }
-                'r' => escape_control_character!('\r'),
-                'n' => escape_control_character!('\n'),
-                't' => escape_control_character!('\t'),
-                _ => {
-                    is_escaped = false;
-                    chars.next(); // consume
-                    s.push(ch);
-                }
-            }
+        if let Some(s) = unescape_single_quoted_string(chars) {
+            return Ok(s);
         }
+
         self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
     }
 
@@ -1406,6 +1355,167 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
     s
 }
 
+/// Unescapes the body of a Postgres-style escape string literal (`E'...'`).
+///
+/// The first character of `chars` is consumed as the opening quote. Returns
+/// `None` if the input ends before the closing quote or if an escape is
+/// rejected: NUL, a byte value above 0x7F, or a malformed Unicode escape.
+fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
+    Unescape::new(chars).unescape()
+}
+
+/// Decoder for a single escape string literal, reading from the tokenizer state.
+struct Unescape<'a: 'b, 'b> {
+    chars: &'b mut State<'a>,
+}
+
+impl<'a: 'b, 'b> Unescape<'a, 'b> {
+    fn new(chars: &'b mut State<'a>) -> Self {
+        Self { chars }
+    }
+
+    /// Consumes the literal through its closing quote and returns the decoded text.
+    fn unescape(mut self) -> Option<String> {
+        let mut unescaped = String::new();
+
+        self.chars.next(); // consume the opening quote
+
+        while let Some(c) = self.chars.next() {
+            if c == '\'' {
+                // A doubled quote (`''`) stands for one literal quote.
+                if self.chars.peek() == Some(&'\'') {
+                    self.chars.next();
+                    unescaped.push('\'');
+                    continue;
+                }
+                return Some(unescaped);
+            }
+
+            if c != '\\' {
+                unescaped.push(c);
+                continue;
+            }
+
+            let c = match self.chars.next()? {
+                'b' => '\u{0008}',
+                'f' => '\u{000C}',
+                'n' => '\n',
+                'r' => '\r',
+                't' => '\t',
+                'u' => self.unescape_unicode_16()?,
+                'U' => self.unescape_unicode_32()?,
+                'x' => self.unescape_hex()?,
+                c if c.is_digit(8) => self.unescape_octal(c)?,
+                // Any other escaped character stands for itself.
+                c => c,
+            };
+
+            unescaped.push(Self::check_null(c)?);
+        }
+
+        // The input ended before the closing quote.
+        None
+    }
+
+    /// Rejects NUL, which may not appear in the decoded string.
+    #[inline]
+    fn check_null(c: char) -> Option<char> {
+        if c == '\0' {
+            None
+        } else {
+            Some(c)
+        }
+    }
+
+    /// Parses `s` in base `RADIX` as a byte value and converts it to a character.
+    ///
+    /// Only the low 8 bits of the value are kept, and only ASCII (0..=127) is accepted.
+    #[inline]
+    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
+        // u32 is used here because Pg lets the value overflow a byte (e.g. `\450`)
+        // rather than reporting an error directly.
+        let n = u32::from_str_radix(s, RADIX).ok()? & 0xFF;
+        if n <= 127 {
+            char::from_u32(n)
+        } else {
+            None
+        }
+    }
+
+    // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
+    fn unescape_hex(&mut self) -> Option<char> {
+        let mut s = String::new();
+
+        for _ in 0..2 {
+            match self.next_hex_digit() {
+                Some(c) => s.push(c),
+                None => break,
+            }
+        }
+
+        // `\x` not followed by a hex digit is a literal `x`.
+        if s.is_empty() {
+            return Some('x');
+        }
+
+        Self::byte_to_char::<16>(&s)
+    }
+
+    /// Consumes and returns the next character only if it is a hex digit.
+    #[inline]
+    fn next_hex_digit(&mut self) -> Option<char> {
+        match self.chars.peek() {
+            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
+            _ => None,
+        }
+    }
+
+    // Octal byte value. \o, \oo, \ooo (o = 0–7)
+    fn unescape_octal(&mut self, c: char) -> Option<char> {
+        let mut s = String::new();
+
+        s.push(c);
+        for _ in 0..2 {
+            match self.next_octal_digit() {
+                Some(c) => s.push(c),
+                None => break,
+            }
+        }
+
+        Self::byte_to_char::<8>(&s)
+    }
+
+    /// Consumes and returns the next character only if it is an octal digit.
+    #[inline]
+    fn next_octal_digit(&mut self) -> Option<char> {
+        match self.chars.peek() {
+            Some(c) if c.is_digit(8) => self.chars.next(),
+            _ => None,
+        }
+    }
+
+    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
+    fn unescape_unicode_16(&mut self) -> Option<char> {
+        self.unescape_unicode::<4>()
+    }
+
+    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
+    fn unescape_unicode_32(&mut self) -> Option<char> {
+        self.unescape_unicode::<8>()
+    }
+
+    /// Reads exactly `NUM` hex digits and converts them to a character.
+    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
+        let mut s = String::with_capacity(NUM);
+        for _ in 0..NUM {
+            // Accept hex digits only: `from_str_radix` on its own would also
+            // take a leading `+`, so `\u+041` must be rejected here.
+            s.push(self.next_hex_digit()?);
+        }
+        u32::from_str_radix(&s, 16).ok().and_then(char::from_u32)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -2139,4 +2249,78 @@ mod tests {
         //println!("------------------------------");
         assert_eq!(expected, actual);
     }
+
+    fn check_unescape(s: &str, expected: Option<&str>) {
+        let s = format!("'{}'", s);
+        let mut state = State {
+            peekable: s.chars().peekable(),
+            line: 0,
+            col: 0,
+        };
+
+        assert_eq!(
+            unescape_single_quoted_string(&mut state),
+            expected.map(|s| s.to_string())
+        );
+    }
+
+    #[test]
+    fn test_unescape() {
+        check_unescape(r"\b", Some("\u{0008}"));
+        check_unescape(r"\f", Some("\u{000C}"));
+        check_unescape(r"\t", Some("\t"));
+        check_unescape(r"\r\n", Some("\r\n"));
+        check_unescape(r"\/", Some("/"));
+        check_unescape(r"/", Some("/"));
+        check_unescape(r"\\", Some("\\"));
+
+        // 16 and 32-bit hexadecimal Unicode character value
+        check_unescape(r"\u0001", Some("\u{0001}"));
+        check_unescape(r"\u4c91", Some("\u{4c91}"));
+        check_unescape(r"\u4c916", Some("\u{4c91}6"));
+        check_unescape(r"\u4c", None);
+        check_unescape(r"\u0000", None);
+        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
+        check_unescape(r"\U00110000", None);
+        check_unescape(r"\U00000000", None);
+        check_unescape(r"\u", None);
+        check_unescape(r"\U", None);
+        check_unescape(r"\U1010FFFF", None);
+        // a sign or any other non-hex character is not part of the escape
+        check_unescape(r"\u+041", None);
+        check_unescape(r"\U+0000041", None);
+        check_unescape(r"\u00g1", None);
+
+        // hexadecimal byte value
+        check_unescape(r"\x4B", Some("\u{004b}"));
+        check_unescape(r"\x4", Some("\u{0004}"));
+        check_unescape(r"\x4L", Some("\u{0004}L"));
+        check_unescape(r"\x", Some("x"));
+        check_unescape(r"\xP", Some("xP"));
+        check_unescape(r"\x0", None);
+        check_unescape(r"\xCAD", None);
+        check_unescape(r"\xA9", None);
+
+        // octal byte value
+        check_unescape(r"\1", Some("\u{0001}"));
+        check_unescape(r"\12", Some("\u{000a}"));
+        check_unescape(r"\123", Some("\u{0053}"));
+        check_unescape(r"\1232", Some("\u{0053}2"));
+        check_unescape(r"\4", Some("\u{0004}"));
+        check_unescape(r"\45", Some("\u{0025}"));
+        check_unescape(r"\450", Some("\u{0028}"));
+        check_unescape(r"\603", None);
+        check_unescape(r"\0", None);
+        check_unescape(r"\080", None);
+
+        // others
+        check_unescape(r"\9", Some("9"));
+        check_unescape(r"''", Some("'"));
+        check_unescape(
+            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
+            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
+        );
+        check_unescape(r"Hello\0", None);
+        check_unescape(r"Hello\xCADRust", None);
+    }
 }
diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs
index 1131a79b0..ce5209e23 100644
--- a/tests/sqlparser_postgres.rs
+++ b/tests/sqlparser_postgres.rs
@@ -2531,6 +2531,59 @@ fn parse_escaped_literal_string() {
             .to_string(),
         "sql parser error: Unterminated encoded string literal at Line: 1, Column 8"
     );
+
+    let sql = r"SELECT E'\u0001', E'\U0010FFFF', E'\xC', E'\x25', E'\2', E'\45', E'\445'";
+    let canonical = "";
+    let select = pg_and_generic().verified_only_select_with_canonical(sql, canonical);
+    assert_eq!(7, select.projection.len());
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("\u{0001}".to_string())),
+        expr_from_projection(&select.projection[0])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("\u{10ffff}".to_string())),
+        expr_from_projection(&select.projection[1])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("\u{000c}".to_string())),
+        expr_from_projection(&select.projection[2])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("%".to_string())),
+        expr_from_projection(&select.projection[3])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("\u{0002}".to_string())),
+        expr_from_projection(&select.projection[4])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("%".to_string())),
+        expr_from_projection(&select.projection[5])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("%".to_string())),
+        expr_from_projection(&select.projection[6])
+    );
+
+    fn negative_cast(sqls: &[&str]) {
+        for sql in sqls {
+            assert_eq!(
+                pg_and_generic()
+                    .parse_sql_statements(sql)
+                    .unwrap_err()
+                    .to_string(),
+                "sql parser error: Unterminated encoded string literal at Line: 1, Column 8"
+            );
+        }
+    }
+
+    negative_cast(&[
+        r"SELECT E'\u0000'",
+        r"SELECT E'\U00110000'",
+        r"SELECT E'\u{0001}'",
+        r"SELECT E'\xCAD'",
+        r"SELECT E'\080'",
+    ]);
 }
 
 #[test]