From 37892f04c5ffcdf492865c302f9720760be7ad04 Mon Sep 17 00:00:00 2001
From: jasonnnli <jasonnnli@tencent.com>
Date: Mon, 26 Feb 2024 17:13:23 +0800
Subject: [PATCH 1/3] refactor: tokenize_escaped_single_quoted_string

---
 src/tokenizer.rs            | 264 ++++++++++++++++++++++++++++--------
 tests/sqlparser_postgres.rs |  53 ++++++++
 2 files changed, 263 insertions(+), 54 deletions(-)
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 23f963bc8..a447ac2cd 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1199,61 +1199,11 @@ impl<'a> Tokenizer<'a> {
         starting_loc: Location,
         chars: &mut State,
     ) -> Result<String, TokenizerError> {
-        let mut s = String::new();
-
-        // This case is a bit tricky
-
-        chars.next(); // consume the opening quote
-
-        // slash escaping
-        let mut is_escaped = false;
-        while let Some(&ch) = chars.peek() {
-            macro_rules! escape_control_character {
-                ($ESCAPED:expr) => {{
-                    if is_escaped {
-                        s.push($ESCAPED);
-                        is_escaped = false;
-                    } else {
-                        s.push(ch);
-                    }
-
-                    chars.next();
-                }};
-            }
-
-            match ch {
-                '\'' => {
-                    chars.next(); // consume
-                    if is_escaped {
-                        s.push(ch);
-                        is_escaped = false;
-                    } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
-                        s.push(ch);
-                        chars.next();
-                    } else {
-                        return Ok(s);
-                    }
-                }
-                '\\' => {
-                    if is_escaped {
-                        s.push('\\');
-                        is_escaped = false;
-                    } else {
-                        is_escaped = true;
-                    }
-
-                    chars.next();
-                }
-                'r' => escape_control_character!('\r'),
-                'n' => escape_control_character!('\n'),
-                't' => escape_control_character!('\t'),
-                _ => {
-                    is_escaped = false;
-                    chars.next(); // consume
-                    s.push(ch);
-                }
-            }
+        let mut unescape = Unescape { chars };
+        if let Some(s) = unescape.unescape_single_quoted_string() {
+            return Ok(s);
         }
+
         self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
     }
 
@@ -1406,6 +1356,147 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
     s
 }
 
+struct Unescape<'a: 'b, 'b> {
+    chars: &'b mut State<'a>,
+}
+
+impl<'a: 'b, 'b> Unescape<'a, 'b> {
+    pub(crate) fn unescape_single_quoted_string(&mut self) -> Option<String> {
+        let mut unescaped = String::new();
+
+        self.chars.next();
+
+        while let Some(c) = self.chars.next() {
+            if c == '\'' {
+                // case: a doubled quote ('') is an escaped quote, e.g. '''' is the one-char string '
+                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
+                    self.chars.next();
+                    unescaped.push('\'');
+                    continue;
+                }
+                return Some(unescaped);
+            }
+
+            if c != '\\' {
+                unescaped.push(c);
+                continue;
+            }
+
+            let c = match self.chars.next()? {
+                'b' => '\u{0008}',
+                'f' => '\u{000C}',
+                'n' => '\n',
+                'r' => '\r',
+                't' => '\t',
+                'u' => self.unescape_unicode_16()?,
+                'U' => self.unescape_unicode_32()?,
+                'x' => self.unescape_hex()?,
+                c if c.is_digit(8) => self.unescape_octal(c)?,
+                c => c,
+            };
+
+            unescaped.push(Self::check_null(c)?);
+        }
+
+        None
+    }
+
+    // Rejects NUL: `None` for '\0', otherwise the character unchanged.
+    #[inline]
+    fn check_null(c: char) -> Option<char> {
+        match c {
+            '\0' => None,
+            other => Some(other),
+        }
+    }
+
+    // Converts the digits in `s` (base RADIX) into a character.
+    // Returns None if `s` fails to parse or the masked value is above 127 (non-ASCII).
+    #[inline]
+    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
+        // Parse as u32 and keep only the low byte: an oversized value such as
+        // octal 450 wraps around, following Pg's overflow behavior, instead of
+        // being rejected outright.
+        let byte = u32::from_str_radix(s, RADIX).ok()? & 0xFF;
+        if byte > 127 {
+            return None;
+        }
+
+        // Always succeeds here: every value in 0..=127 is a valid char.
+        char::from_u32(byte)
+    }
+
+    // Hexadecimal byte value: \xh or \xhh, consuming at most two ASCII hex digits.
+    fn unescape_hex(&mut self) -> Option<char> {
+        let mut digits = String::with_capacity(2);
+        while digits.len() < 2 {
+            match self.next_hex_digit() {
+                Some(digit) => digits.push(digit),
+                None => break,
+            }
+        }
+
+        // `\x` with no hex digit after it is just a literal `x`.
+        if digits.is_empty() {
+            Some('x')
+        } else {
+            Self::byte_to_char::<16>(&digits)
+        }
+    }
+
+    #[inline]
+    fn next_hex_digit(&mut self) -> Option<char> {
+        if !self.chars.peek()?.is_ascii_hexdigit() {
+            return None; // not a hex digit: leave it unconsumed
+        }
+        self.chars.next() // consume and return the digit
+    }
+
+    // Octal byte value. \o, \oo, \ooo (o = 0–7)
+    // `c` is the first digit, which the caller has already consumed.
+    fn unescape_octal(&mut self, c: char) -> Option<char> {
+        let mut s = String::from(c);
+        for _ in 0..2 {
+            match self.next_octal_digit() {
+                Some(c) => s.push(c),
+                None => break,
+            }
+        }
+
+        Self::byte_to_char::<8>(&s)
+    }
+
+    // Consumes and returns the next character only if it is an octal digit.
+    #[inline]
+    fn next_octal_digit(&mut self) -> Option<char> {
+        match self.chars.peek() {
+            Some(c) if c.is_digit(8) => self.chars.next(),
+            _ => None,
+        }
+    }
+
+    // 16-bit hexadecimal Unicode character value: \uxxxx, reads exactly four characters (x = 0–9, a–f, A–F)
+    fn unescape_unicode_16(&mut self) -> Option<char> {
+        self.unescape_unicode::<4>()
+    }
+
+    // 32-bit hexadecimal Unicode character value: \Uxxxxxxxx, reads exactly eight characters (x = 0–9, a–f, A–F)
+    fn unescape_unicode_32(&mut self) -> Option<char> {
+        self.unescape_unicode::<8>()
+    }
+
+    // Reads exactly NUM characters, each of which must be a hex digit, and converts
+    // the value to a char; `None` if input runs out or the value is not a valid char.
+    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
+        let mut n: u32 = 0;
+        for _ in 0..NUM {
+            // to_digit rejects the `+` sign that u32::from_str_radix would accept.
+            n = n.checked_mul(16)? + self.chars.next()?.to_digit(16)?;
+        }
+        char::from_u32(n)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -2139,4 +2230,69 @@ mod tests {
         //println!("------------------------------");
         assert_eq!(expected, actual);
     }
+
+    fn check_unescape(s: &str, expected: Option<&str>) {
+        let s = format!("'{}'", s);
+        let mut unescape = Unescape {
+            chars: &mut State {
+                peekable: s.chars().peekable(),
+                line: 0,
+                col: 0,
+            },
+        };
+        assert_eq!(
+            unescape.unescape_single_quoted_string(),
+            expected.map(|s| s.to_string())
+        );
+    }
+
+    #[test]
+    fn test_unescape() {
+        check_unescape(r"\b", Some("\u{0008}"));
+        check_unescape(r"\f", Some("\u{000C}"));
+        check_unescape(r"\t", Some("\t"));
+        check_unescape(r"\r\n", Some("\r\n"));
+        check_unescape(r"\/", Some("/"));
+        check_unescape(r"/", Some("/"));
+        check_unescape(r"\\", Some("\\"));
+
+        // 16 and 32-bit hexadecimal Unicode character value
+        check_unescape(r"\u0001", Some("\u{0001}"));
+        check_unescape(r"\u4c91", Some("\u{4c91}"));
+        check_unescape(r"\u4c916", Some("\u{4c91}6"));
+        check_unescape(r"\u4c", None);
+        check_unescape(r"\u0000", None);
+        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
+        check_unescape(r"\U00110000", None);
+        check_unescape(r"\U00000000", None);
+        check_unescape(r"\u", None);
+        check_unescape(r"\U", None);
+        check_unescape(r"\U1010FFFF", None);
+
+        // hexadecimal byte value
+        check_unescape(r"\x4B", Some("\u{004b}"));
+        check_unescape(r"\x4", Some("\u{0004}"));
+        check_unescape(r"\x4L", Some("\u{0004}L"));
+        check_unescape(r"\x", Some("x"));
+        check_unescape(r"\xP", Some("xP"));
+        check_unescape(r"\x0", None);
+        check_unescape(r"\xCAD", None);
+        check_unescape(r"\xA9", None);
+
+        // octal byte value
+        check_unescape(r"\1", Some("\u{0001}"));
+        check_unescape(r"\12", Some("\u{000a}"));
+        check_unescape(r"\123", Some("\u{0053}"));
+        check_unescape(r"\1232", Some("\u{0053}2"));
+        check_unescape(r"\4", Some("\u{0004}"));
+        check_unescape(r"\45", Some("\u{0025}"));
+        check_unescape(r"\450", Some("\u{0028}"));
+        check_unescape(r"\603", None);
+        check_unescape(r"\0", None);
+        check_unescape(r"\080", None);
+
+        // others
+        check_unescape(r"\9", Some("9"));
+        check_unescape(r"''", Some("'"));
+    }
 }
diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs
index c987822b8..36fbfa2af 100644
--- a/tests/sqlparser_postgres.rs
+++ b/tests/sqlparser_postgres.rs
@@ -2528,6 +2528,59 @@ fn parse_escaped_literal_string() {
             .to_string(),
         "sql parser error: Unterminated encoded string literal at Line: 1, Column 8"
     );
+
+    let sql = r"SELECT E'\u0001', E'\U0010FFFF', E'\xC', E'\x25', E'\2', E'\45', E'\445'";
+    let canonical = "";
+    let select = pg_and_generic().verified_only_select_with_canonical(sql, canonical);
+    assert_eq!(7, select.projection.len());
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("\u{0001}".to_string())),
+        expr_from_projection(&select.projection[0])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("\u{10ffff}".to_string())),
+        expr_from_projection(&select.projection[1])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("\u{000c}".to_string())),
+        expr_from_projection(&select.projection[2])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("%".to_string())),
+        expr_from_projection(&select.projection[3])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("\u{0002}".to_string())),
+        expr_from_projection(&select.projection[4])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("%".to_string())),
+        expr_from_projection(&select.projection[5])
+    );
+    assert_eq!(
+        &Expr::Value(Value::EscapedStringLiteral("%".to_string())),
+        expr_from_projection(&select.projection[6])
+    );
+
+    fn negative_cast(sqls: &[&str]) {
+        for sql in sqls {
+            assert_eq!(
+                pg_and_generic()
+                    .parse_sql_statements(sql)
+                    .unwrap_err()
+                    .to_string(),
+                "sql parser error: Unterminated encoded string literal at Line: 1, Column 8"
+            );
+        }
+    }
+
+    negative_cast(&[
+        r"SELECT E'\u0000'",
+        r"SELECT E'\U00110000'",
+        r"SELECT E'\u{0001}'",
+        r"SELECT E'\xCAD'",
+        r"SELECT E'\080'",
+    ]);
 }
 
 #[test]

From 670ca62a8dd33115907d5e6ba1709e794640f229 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Thu, 29 Feb 2024 07:52:06 -0500
Subject: [PATCH 2/3] Cleanup interface

---
 src/tokenizer.rs | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index a447ac2cd..8fac233af 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1199,8 +1199,7 @@ impl<'a> Tokenizer<'a> {
         starting_loc: Location,
         chars: &mut State,
     ) -> Result<String, TokenizerError> {
-        let mut unescape = Unescape { chars };
-        if let Some(s) = unescape.unescape_single_quoted_string() {
+        if let Some(s) = unescape_single_quoted_string(chars) {
             return Ok(s);
         }
 
@@ -1356,12 +1355,19 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
     s
 }
 
+fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
+    Unescape::new(chars).unescape()
+}
+
 struct Unescape<'a: 'b, 'b> {
     chars: &'b mut State<'a>,
 }
 
 impl<'a: 'b, 'b> Unescape<'a, 'b> {
-    pub(crate) fn unescape_single_quoted_string(&mut self) -> Option<String> {
+    fn new(chars: &'b mut State<'a>) -> Self {
+        Self { chars }
+    }
+    fn unescape(mut self) -> Option<String> {
         let mut unescaped = String::new();
 
         self.chars.next();
@@ -2233,15 +2239,14 @@ mod tests {
 
     fn check_unescape(s: &str, expected: Option<&str>) {
         let s = format!("'{}'", s);
-        let mut unescape = Unescape {
-            chars: &mut State {
-                peekable: s.chars().peekable(),
-                line: 0,
-                col: 0,
-            },
+        let mut state = State {
+            peekable: s.chars().peekable(),
+            line: 0,
+            col: 0,
         };
+
         assert_eq!(
-            unescape.unescape_single_quoted_string(),
+            unescape_single_quoted_string(&mut state),
             expected.map(|s| s.to_string())
         );
     }

From 1a0a346db3b763181fd9a6baf1be1bffeab2dffc Mon Sep 17 00:00:00 2001
From: jasonnnli <jasonnnli@tencent.com>
Date: Thu, 29 Feb 2024 23:26:45 +0800
Subject: [PATCH 3/3] chore: add some test case

---
 src/tokenizer.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 8fac233af..ae0b022a7 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -2299,5 +2299,11 @@ mod tests {
         // others
         check_unescape(r"\9", Some("9"));
         check_unescape(r"''", Some("'"));
+        check_unescape(
+            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
+            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
+        );
+        check_unescape(r"Hello\0", None);
+        check_unescape(r"Hello\xCADRust", None);
     }
 }