Skip to content

Commit 5420201

Browse files
waralexrom authored and mcheshkov committed
feat: Strings with Unicode Escapes (#43)
Can drop this after rebase on commit bc15f7b "Support for postgres String Constants with Unicode Escapes (apache#1355)", first released in 0.50.0
1 parent 96d13ea commit 5420201

File tree

3 files changed

+65
-1
lines changed

3 files changed

+65
-1
lines changed

src/ast/value.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ pub enum Value {
3939
NationalStringLiteral(String),
4040
/// X'hex value'
4141
HexStringLiteral(String),
42+
/// U&'string with Unicode escapes', e.g. U&'d\0061t\+000061' (data)
43+
UnicodeEscapedStringLiteral(String),
4244

4345
DoubleQuotedString(String),
4446
/// Boolean value true or false
@@ -75,6 +77,7 @@ impl fmt::Display for Value {
7577
Value::DoubleQuotedString(v) => write!(f, "\"{}\"", v),
7678
Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)),
7779
Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
80+
Value::UnicodeEscapedStringLiteral(v) => write!(f, "U&'{}'", escape_escaped_string(v)),
7881
Value::NationalStringLiteral(v) => write!(f, "N'{}'", v),
7982
Value::HexStringLiteral(v) => write!(f, "X'{}'", v),
8083
Value::Boolean(v) => write!(f, "{}", v),

src/parser.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ impl<'a> Parser<'a> {
527527
expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?),
528528
})
529529
}
530-
Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
530+
Token::EscapedStringLiteral(_) | Token::UnicodeEscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
531531
{
532532
self.prev_token();
533533
Ok(Expr::Value(self.parse_value()?))
@@ -956,6 +956,7 @@ impl<'a> Parser<'a> {
956956
Token::SingleQuotedString(_)
957957
| Token::EscapedStringLiteral(_)
958958
| Token::NationalStringLiteral(_)
959+
| Token::UnicodeEscapedStringLiteral(_)
959960
| Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
960961
unexpected => {
961962
self.expected("either filler, WITH, or WITHOUT in LISTAGG", unexpected)?
@@ -2888,6 +2889,9 @@ impl<'a> Parser<'a> {
28882889
Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())),
28892890
Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
28902891
Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
2892+
Token::UnicodeEscapedStringLiteral(ref s) => {
2893+
Ok(Value::UnicodeEscapedStringLiteral(s.to_string()))
2894+
}
28912895
Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
28922896
Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
28932897
unexpected => self.expected("a value", unexpected),

src/tokenizer.rs

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ pub enum Token {
5555
EscapedStringLiteral(String),
5656
/// Hexadecimal string literal: i.e.: X'deadbeef'
5757
HexStringLiteral(String),
58+
/// Unicode escaped string: U&'d\0061t\+000061' (data)
59+
UnicodeEscapedStringLiteral(String),
5860
/// Comma
5961
Comma,
6062
/// Whitespace (space, tab, etc)
@@ -164,6 +166,7 @@ impl fmt::Display for Token {
164166
Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
165167
Token::EscapedStringLiteral(ref s) => write!(f, "E'{}'", s),
166168
Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
169+
Token::UnicodeEscapedStringLiteral(ref s) => write!(f, "U&'{}'", s),
167170
Token::Comma => f.write_str(","),
168171
Token::Whitespace(ws) => write!(f, "{}", ws),
169172
Token::DoubleEq => f.write_str("=="),
@@ -427,6 +430,28 @@ impl<'a> Tokenizer<'a> {
427430
}
428431
}
429432
}
433+
x @ 'u' | x @ 'U' => {
434+
chars.next(); // consume, to check the next char
435+
let mut look_ahead_chars = chars.clone();
436+
if look_ahead_chars.next_if_eq(&'&').is_some() {
437+
match look_ahead_chars.peek() {
438+
Some('\'') => {
439+
// Consume the '&' so that chars catches up with look_ahead_chars
440+
chars.next();
441+
// U&'...' - a string literal with Unicode escapes
442+
let s = self.tokenize_single_quoted_string(chars)?;
443+
Ok(Some(Token::UnicodeEscapedStringLiteral(s)))
444+
}
445+
_ => {
446+
let s = self.tokenize_word(x, chars);
447+
Ok(Some(Token::make_word(&s, None)))
448+
}
449+
}
450+
} else {
451+
let s = self.tokenize_word(x, chars);
452+
Ok(Some(Token::make_word(&s, None)))
453+
}
454+
}
430455
// identifier or keyword
431456
ch if self.dialect.is_identifier_start(ch) => {
432457
chars.next(); // consume the first char
@@ -1454,4 +1479,36 @@ mod tests {
14541479
//println!("------------------------------");
14551480
assert_eq!(expected, actual);
14561481
}
1482+
#[test]
1483+
fn tokenize_unicode_escaped_literal() {
1484+
let sql = r#"U&'aaa'"#;
1485+
let dialect = GenericDialect {};
1486+
let mut tokenizer = Tokenizer::new(&dialect, sql);
1487+
let tokens = tokenizer.tokenize().unwrap();
1488+
let expected = vec![Token::UnicodeEscapedStringLiteral("aaa".to_string())];
1489+
compare(expected, tokens);
1490+
1491+
let sql = r#"U&a"#;
1492+
let dialect = GenericDialect {};
1493+
let mut tokenizer = Tokenizer::new(&dialect, sql);
1494+
let tokens = tokenizer.tokenize().unwrap();
1495+
let expected = vec![
1496+
Token::make_word("U", None),
1497+
Token::Ampersand,
1498+
Token::make_word("a", None),
1499+
];
1500+
compare(expected, tokens);
1501+
let sql = r#"U & 'aaa'"#;
1502+
let dialect = GenericDialect {};
1503+
let mut tokenizer = Tokenizer::new(&dialect, sql);
1504+
let tokens = tokenizer.tokenize().unwrap();
1505+
let expected = vec![
1506+
Token::make_word("U", None),
1507+
Token::Whitespace(Whitespace::Space),
1508+
Token::Ampersand,
1509+
Token::Whitespace(Whitespace::Space),
1510+
Token::SingleQuotedString("aaa".to_string()),
1511+
];
1512+
compare(expected, tokens);
1513+
}
14571514
}

0 commit comments

Comments
 (0)