Skip to content

Commit b7f265a

Browse files
authored
feat: Strings with Unicode Escapes (#43)
1 parent dca8d90 commit b7f265a

File tree

3 files changed

+65
-1
lines changed

3 files changed

+65
-1
lines changed

src/ast/value.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ pub enum Value {
4242
NationalStringLiteral(String),
4343
/// X'hex value'
4444
HexStringLiteral(String),
45+
/// U&'string value' - string literal with Unicode escapes, e.g. U&'d\0061t\+000061'
46+
UnicodeEscapedStringLiteral(String),
4547

4648
DoubleQuotedString(String),
4749
/// Boolean value true or false
@@ -78,6 +80,7 @@ impl fmt::Display for Value {
7880
Value::DoubleQuotedString(v) => write!(f, "\"{}\"", v),
7981
Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)),
8082
Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
83+
Value::UnicodeEscapedStringLiteral(v) => write!(f, "U&'{}'", escape_escaped_string(v)),
8184
Value::NationalStringLiteral(v) => write!(f, "N'{}'", v),
8285
Value::HexStringLiteral(v) => write!(f, "X'{}'", v),
8386
Value::Boolean(v) => write!(f, "{}", v),

src/parser.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ impl<'a> Parser<'a> {
539539
expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?),
540540
})
541541
}
542-
Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
542+
Token::EscapedStringLiteral(_) | Token::UnicodeEscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
543543
{
544544
self.prev_token();
545545
Ok(Expr::Value(self.parse_value()?))
@@ -971,6 +971,7 @@ impl<'a> Parser<'a> {
971971
Token::SingleQuotedString(_)
972972
| Token::EscapedStringLiteral(_)
973973
| Token::NationalStringLiteral(_)
974+
| Token::UnicodeEscapedStringLiteral(_)
974975
| Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
975976
unexpected => {
976977
self.expected("either filler, WITH, or WITHOUT in LISTAGG", unexpected)?
@@ -2893,6 +2894,9 @@ impl<'a> Parser<'a> {
28932894
Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())),
28942895
Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
28952896
Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
2897+
Token::UnicodeEscapedStringLiteral(ref s) => {
2898+
Ok(Value::UnicodeEscapedStringLiteral(s.to_string()))
2899+
}
28962900
Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
28972901
Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
28982902
unexpected => self.expected("a value", unexpected),

src/tokenizer.rs

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ pub enum Token {
5555
EscapedStringLiteral(String),
5656
/// Hexadecimal string literal: i.e.: X'deadbeef'
5757
HexStringLiteral(String),
58+
/// Unicode-escaped string literal, e.g. U&'d\0061t\+000061' (which decodes to "data")
59+
UnicodeEscapedStringLiteral(String),
5860
/// Comma
5961
Comma,
6062
/// Whitespace (space, tab, etc)
@@ -156,6 +158,7 @@ impl fmt::Display for Token {
156158
Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
157159
Token::EscapedStringLiteral(ref s) => write!(f, "E'{}'", s),
158160
Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
161+
Token::UnicodeEscapedStringLiteral(ref s) => write!(f, "U&'{}'", s),
159162
Token::Comma => f.write_str(","),
160163
Token::Whitespace(ws) => write!(f, "{}", ws),
161164
Token::DoubleEq => f.write_str("=="),
@@ -415,6 +418,28 @@ impl<'a> Tokenizer<'a> {
415418
}
416419
}
417420
}
421+
x @ 'u' | x @ 'U' => {
422+
chars.next(); // consume, to check the next char
423+
let mut look_ahead_chars = chars.clone();
424+
if look_ahead_chars.next_if_eq(&'&').is_some() {
425+
match look_ahead_chars.peek() {
426+
Some('\'') => {
427+
// Consume the '&' so that `chars` catches up with `look_ahead_chars`
428+
chars.next();
429+
// U&'...' - a <Unicode character string literal>
430+
let s = self.tokenize_single_quoted_string(chars)?;
431+
Ok(Some(Token::UnicodeEscapedStringLiteral(s)))
432+
}
433+
_ => {
434+
let s = self.tokenize_word(x, chars);
435+
Ok(Some(Token::make_word(&s, None)))
436+
}
437+
}
438+
} else {
439+
let s = self.tokenize_word(x, chars);
440+
Ok(Some(Token::make_word(&s, None)))
441+
}
442+
}
418443
// identifier or keyword
419444
ch if self.dialect.is_identifier_start(ch) => {
420445
chars.next(); // consume the first char
@@ -1417,4 +1442,36 @@ mod tests {
14171442
//println!("------------------------------");
14181443
assert_eq!(expected, actual);
14191444
}
1445+
#[test]
1446+
fn tokenize_unicode_escaped_literal() {
1447+
let sql = r#"U&'aaa'"#;
1448+
let dialect = GenericDialect {};
1449+
let mut tokenizer = Tokenizer::new(&dialect, sql);
1450+
let tokens = tokenizer.tokenize().unwrap();
1451+
let expected = vec![Token::UnicodeEscapedStringLiteral("aaa".to_string())];
1452+
compare(expected, tokens);
1453+
1454+
let sql = r#"U&a"#;
1455+
let dialect = GenericDialect {};
1456+
let mut tokenizer = Tokenizer::new(&dialect, sql);
1457+
let tokens = tokenizer.tokenize().unwrap();
1458+
let expected = vec![
1459+
Token::make_word("U", None),
1460+
Token::Ampersand,
1461+
Token::make_word("a", None),
1462+
];
1463+
compare(expected, tokens);
1464+
let sql = r#"U & 'aaa'"#;
1465+
let dialect = GenericDialect {};
1466+
let mut tokenizer = Tokenizer::new(&dialect, sql);
1467+
let tokens = tokenizer.tokenize().unwrap();
1468+
let expected = vec![
1469+
Token::make_word("U", None),
1470+
Token::Whitespace(Whitespace::Space),
1471+
Token::Ampersand,
1472+
Token::Whitespace(Whitespace::Space),
1473+
Token::SingleQuotedString("aaa".to_string()),
1474+
];
1475+
compare(expected, tokens);
1476+
}
14201477
}

0 commit comments

Comments
 (0)