Skip to content

Commit 4d1eecd

Browse files
JasonLi-cn, jasonnnli, and alamb
authored
Handle escape, unicode, and hex in tokenize_escaped_single_quoted_string (#1146)
Co-authored-by: jasonnnli <[email protected]> Co-authored-by: Andrew Lamb <[email protected]>
1 parent 0c5f6fb commit 4d1eecd

File tree

2 files changed

+274
-54
lines changed

2 files changed

+274
-54
lines changed

src/tokenizer.rs

Lines changed: 221 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1199,61 +1199,10 @@ impl<'a> Tokenizer<'a> {
11991199
starting_loc: Location,
12001200
chars: &mut State,
12011201
) -> Result<String, TokenizerError> {
1202-
let mut s = String::new();
1203-
1204-
// This case is a bit tricky
1205-
1206-
chars.next(); // consume the opening quote
1207-
1208-
// slash escaping
1209-
let mut is_escaped = false;
1210-
while let Some(&ch) = chars.peek() {
1211-
macro_rules! escape_control_character {
1212-
($ESCAPED:expr) => {{
1213-
if is_escaped {
1214-
s.push($ESCAPED);
1215-
is_escaped = false;
1216-
} else {
1217-
s.push(ch);
1218-
}
1219-
1220-
chars.next();
1221-
}};
1222-
}
1223-
1224-
match ch {
1225-
'\'' => {
1226-
chars.next(); // consume
1227-
if is_escaped {
1228-
s.push(ch);
1229-
is_escaped = false;
1230-
} else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
1231-
s.push(ch);
1232-
chars.next();
1233-
} else {
1234-
return Ok(s);
1235-
}
1236-
}
1237-
'\\' => {
1238-
if is_escaped {
1239-
s.push('\\');
1240-
is_escaped = false;
1241-
} else {
1242-
is_escaped = true;
1243-
}
1244-
1245-
chars.next();
1246-
}
1247-
'r' => escape_control_character!('\r'),
1248-
'n' => escape_control_character!('\n'),
1249-
't' => escape_control_character!('\t'),
1250-
_ => {
1251-
is_escaped = false;
1252-
chars.next(); // consume
1253-
s.push(ch);
1254-
}
1255-
}
1202+
if let Some(s) = unescape_single_quoted_string(chars) {
1203+
return Ok(s);
12561204
}
1205+
12571206
self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
12581207
}
12591208

@@ -1406,6 +1355,154 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
14061355
s
14071356
}
14081357

1358+
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
1359+
Unescape::new(chars).unescape()
1360+
}
1361+
1362+
struct Unescape<'a: 'b, 'b> {
1363+
chars: &'b mut State<'a>,
1364+
}
1365+
1366+
impl<'a: 'b, 'b> Unescape<'a, 'b> {
1367+
fn new(chars: &'b mut State<'a>) -> Self {
1368+
Self { chars }
1369+
}
1370+
fn unescape(mut self) -> Option<String> {
1371+
let mut unescaped = String::new();
1372+
1373+
self.chars.next();
1374+
1375+
while let Some(c) = self.chars.next() {
1376+
if c == '\'' {
1377+
// case: ''''
1378+
if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
1379+
self.chars.next();
1380+
unescaped.push('\'');
1381+
continue;
1382+
}
1383+
return Some(unescaped);
1384+
}
1385+
1386+
if c != '\\' {
1387+
unescaped.push(c);
1388+
continue;
1389+
}
1390+
1391+
let c = match self.chars.next()? {
1392+
'b' => '\u{0008}',
1393+
'f' => '\u{000C}',
1394+
'n' => '\n',
1395+
'r' => '\r',
1396+
't' => '\t',
1397+
'u' => self.unescape_unicode_16()?,
1398+
'U' => self.unescape_unicode_32()?,
1399+
'x' => self.unescape_hex()?,
1400+
c if c.is_digit(8) => self.unescape_octal(c)?,
1401+
c => c,
1402+
};
1403+
1404+
unescaped.push(Self::check_null(c)?);
1405+
}
1406+
1407+
None
1408+
}
1409+
1410+
#[inline]
1411+
fn check_null(c: char) -> Option<char> {
1412+
if c == '\0' {
1413+
None
1414+
} else {
1415+
Some(c)
1416+
}
1417+
}
1418+
1419+
#[inline]
1420+
fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
1421+
// u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
1422+
match u32::from_str_radix(s, RADIX) {
1423+
Err(_) => None,
1424+
Ok(n) => {
1425+
let n = n & 0xFF;
1426+
if n <= 127 {
1427+
char::from_u32(n)
1428+
} else {
1429+
None
1430+
}
1431+
}
1432+
}
1433+
}
1434+
1435+
// Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
1436+
fn unescape_hex(&mut self) -> Option<char> {
1437+
let mut s = String::new();
1438+
1439+
for _ in 0..2 {
1440+
match self.next_hex_digit() {
1441+
Some(c) => s.push(c),
1442+
None => break,
1443+
}
1444+
}
1445+
1446+
if s.is_empty() {
1447+
return Some('x');
1448+
}
1449+
1450+
Self::byte_to_char::<16>(&s)
1451+
}
1452+
1453+
#[inline]
1454+
fn next_hex_digit(&mut self) -> Option<char> {
1455+
match self.chars.peek() {
1456+
Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
1457+
_ => None,
1458+
}
1459+
}
1460+
1461+
// Octal byte value. \o, \oo, \ooo (o = 0–7)
1462+
fn unescape_octal(&mut self, c: char) -> Option<char> {
1463+
let mut s = String::new();
1464+
1465+
s.push(c);
1466+
for _ in 0..2 {
1467+
match self.next_octal_digest() {
1468+
Some(c) => s.push(c),
1469+
None => break,
1470+
}
1471+
}
1472+
1473+
Self::byte_to_char::<8>(&s)
1474+
}
1475+
1476+
#[inline]
1477+
fn next_octal_digest(&mut self) -> Option<char> {
1478+
match self.chars.peek() {
1479+
Some(c) if c.is_digit(8) => self.chars.next(),
1480+
_ => None,
1481+
}
1482+
}
1483+
1484+
// 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
1485+
fn unescape_unicode_16(&mut self) -> Option<char> {
1486+
self.unescape_unicode::<4>()
1487+
}
1488+
1489+
// 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
1490+
fn unescape_unicode_32(&mut self) -> Option<char> {
1491+
self.unescape_unicode::<8>()
1492+
}
1493+
1494+
fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
1495+
let mut s = String::new();
1496+
for _ in 0..NUM {
1497+
s.push(self.chars.next()?);
1498+
}
1499+
match u32::from_str_radix(&s, 16) {
1500+
Err(_) => None,
1501+
Ok(n) => char::from_u32(n),
1502+
}
1503+
}
1504+
}
1505+
14091506
#[cfg(test)]
14101507
mod tests {
14111508
use super::*;
@@ -2139,4 +2236,74 @@ mod tests {
21392236
//println!("------------------------------");
21402237
assert_eq!(expected, actual);
21412238
}
2239+
2240+
fn check_unescape(s: &str, expected: Option<&str>) {
2241+
let s = format!("'{}'", s);
2242+
let mut state = State {
2243+
peekable: s.chars().peekable(),
2244+
line: 0,
2245+
col: 0,
2246+
};
2247+
2248+
assert_eq!(
2249+
unescape_single_quoted_string(&mut state),
2250+
expected.map(|s| s.to_string())
2251+
);
2252+
}
2253+
2254+
#[test]
fn test_unescape() {
    // Each entry is (literal body, expected decoding); `None` means the
    // literal must be rejected.
    let cases: &[(&str, Option<&str>)] = &[
        // simple escapes
        (r"\b", Some("\u{0008}")),
        (r"\f", Some("\u{000C}")),
        (r"\t", Some("\t")),
        (r"\r\n", Some("\r\n")),
        (r"\/", Some("/")),
        (r"/", Some("/")),
        (r"\\", Some("\\")),
        // 16 and 32-bit hexadecimal Unicode character value
        (r"\u0001", Some("\u{0001}")),
        (r"\u4c91", Some("\u{4c91}")),
        (r"\u4c916", Some("\u{4c91}6")),
        (r"\u4c", None),
        (r"\u0000", None),
        (r"\U0010FFFF", Some("\u{10FFFF}")),
        (r"\U00110000", None),
        (r"\U00000000", None),
        (r"\u", None),
        (r"\U", None),
        (r"\U1010FFFF", None),
        // hexadecimal byte value
        (r"\x4B", Some("\u{004b}")),
        (r"\x4", Some("\u{0004}")),
        (r"\x4L", Some("\u{0004}L")),
        (r"\x", Some("x")),
        (r"\xP", Some("xP")),
        (r"\x0", None),
        (r"\xCAD", None),
        (r"\xA9", None),
        // octal byte value
        (r"\1", Some("\u{0001}")),
        (r"\12", Some("\u{000a}")),
        (r"\123", Some("\u{0053}")),
        (r"\1232", Some("\u{0053}2")),
        (r"\4", Some("\u{0004}")),
        (r"\45", Some("\u{0025}")),
        (r"\450", Some("\u{0028}")),
        (r"\603", None),
        (r"\0", None),
        (r"\080", None),
        // others
        (r"\9", Some("9")),
        (r"''", Some("'")),
        (
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        ),
        (r"Hello\0", None),
        (r"Hello\xCADRust", None),
    ];

    for &(input, expected) in cases {
        check_unescape(input, expected);
    }
}
21422309
}

tests/sqlparser_postgres.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2531,6 +2531,59 @@ fn parse_escaped_literal_string() {
25312531
.to_string(),
25322532
"sql parser error: Unterminated encoded string literal at Line: 1, Column 8"
25332533
);
2534+
2535+
let sql = r"SELECT E'\u0001', E'\U0010FFFF', E'\xC', E'\x25', E'\2', E'\45', E'\445'";
2536+
let canonical = "";
2537+
let select = pg_and_generic().verified_only_select_with_canonical(sql, canonical);
2538+
assert_eq!(7, select.projection.len());
2539+
assert_eq!(
2540+
&Expr::Value(Value::EscapedStringLiteral("\u{0001}".to_string())),
2541+
expr_from_projection(&select.projection[0])
2542+
);
2543+
assert_eq!(
2544+
&Expr::Value(Value::EscapedStringLiteral("\u{10ffff}".to_string())),
2545+
expr_from_projection(&select.projection[1])
2546+
);
2547+
assert_eq!(
2548+
&Expr::Value(Value::EscapedStringLiteral("\u{000c}".to_string())),
2549+
expr_from_projection(&select.projection[2])
2550+
);
2551+
assert_eq!(
2552+
&Expr::Value(Value::EscapedStringLiteral("%".to_string())),
2553+
expr_from_projection(&select.projection[3])
2554+
);
2555+
assert_eq!(
2556+
&Expr::Value(Value::EscapedStringLiteral("\u{0002}".to_string())),
2557+
expr_from_projection(&select.projection[4])
2558+
);
2559+
assert_eq!(
2560+
&Expr::Value(Value::EscapedStringLiteral("%".to_string())),
2561+
expr_from_projection(&select.projection[5])
2562+
);
2563+
assert_eq!(
2564+
&Expr::Value(Value::EscapedStringLiteral("%".to_string())),
2565+
expr_from_projection(&select.projection[6])
2566+
);
2567+
2568+
fn negative_cast(sqls: &[&str]) {
2569+
for sql in sqls {
2570+
assert_eq!(
2571+
pg_and_generic()
2572+
.parse_sql_statements(sql)
2573+
.unwrap_err()
2574+
.to_string(),
2575+
"sql parser error: Unterminated encoded string literal at Line: 1, Column 8"
2576+
);
2577+
}
2578+
}
2579+
2580+
negative_cast(&[
2581+
r"SELECT E'\u0000'",
2582+
r"SELECT E'\U00110000'",
2583+
r"SELECT E'\u{0001}'",
2584+
r"SELECT E'\xCAD'",
2585+
r"SELECT E'\080'",
2586+
]);
25342587
}
25352588

25362589
#[test]

0 commit comments

Comments
 (0)