Skip to content

Support for postgres String Constants with Unicode Escapes #1355

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions src/ast/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ pub enum Value {
/// See [Postgres docs](https://www.postgresql.org/docs/8.3/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS)
/// for more details.
EscapedStringLiteral(String),
/// u&'string value' (postgres extension)
/// See [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE)
/// for more details.
UnicodeStringLiteral(String),
/// B'string value'
SingleQuotedByteStringLiteral(String),
/// B"string value"
Expand Down Expand Up @@ -102,6 +106,7 @@ impl fmt::Display for Value {
}
Value::DollarQuotedString(v) => write!(f, "{v}"),
Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
Value::UnicodeStringLiteral(v) => write!(f, "U&'{}'", escape_unicode_string(v)),
Value::NationalStringLiteral(v) => write!(f, "N'{v}'"),
Value::HexStringLiteral(v) => write!(f, "X'{v}'"),
Value::Boolean(v) => write!(f, "{v}"),
Expand Down Expand Up @@ -347,6 +352,41 @@ pub fn escape_escaped_string(s: &str) -> EscapeEscapedStringLiteral<'_> {
EscapeEscapedStringLiteral(s)
}

/// Display adapter that renders a string as the body of a `U&'...'` literal.
///
/// Only the text between the quotes is produced. A single quote is written as
/// `''`, a backslash as `\\`, any other ASCII character is copied unchanged,
/// and every non-ASCII character becomes a hexadecimal code point escape.
pub struct EscapeUnicodeStringLiteral<'a>(&'a str);

impl<'a> fmt::Display for EscapeUnicodeStringLiteral<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.0.chars().try_for_each(|ch| {
            if ch == '\'' {
                // A quote inside the literal is represented by doubling it.
                f.write_str("''")
            } else if ch == '\\' {
                // Backslash introduces an escape, so a literal one is doubled.
                f.write_str(r"\\")
            } else if ch.is_ascii() {
                write!(f, "{}", ch)
            } else {
                let code = u32::from(ch);
                // Code points that fit in 16 bits use the four-digit `\XXXX`
                // form; anything larger needs the six-digit `\+XXXXXX` form.
                if code > 0xFFFF {
                    write!(f, "\\+{:06X}", code)
                } else {
                    write!(f, "\\{:04X}", code)
                }
            }
        })
    }
}

/// Wraps `s` in an [`EscapeUnicodeStringLiteral`], whose `Display` impl writes
/// the escaped text that goes between the quotes of a `U&'...'` literal.
///
/// The `U&'` prefix and the closing `'` are not produced here; the caller is
/// expected to write them around the returned value.
pub fn escape_unicode_string(s: &str) -> EscapeUnicodeStringLiteral<'_> {
EscapeUnicodeStringLiteral(s)
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
Expand Down
4 changes: 4 additions & 0 deletions src/dialect/generic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ impl Dialect for GenericDialect {
|| ch == '_'
}

// The generic dialect opts in to `U&'...'` string literals, so the tokenizer
// will recognise the `U&'` / `u&'` prefix when this dialect is in use.
fn supports_unicode_string_literal(&self) -> bool {
true
}

// The generic dialect always reports `true` for this capability.
// NOTE(review): the exact syntax gated by this flag is defined by the
// `Dialect` trait and is not visible in this hunk — confirm there.
fn supports_group_by_expr(&self) -> bool {
true
}
Expand Down
15 changes: 15 additions & 0 deletions src/dialect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,21 @@ pub trait Dialect: Debug + Any {
fn supports_string_literal_backslash_escape(&self) -> bool {
false
}

/// Returns `true` if the dialect supports string literals with the `U&` prefix.
///
/// Such literals specify characters by their Unicode code points.
/// For example, in PostgreSQL, the following is a valid string literal:
/// ```sql
/// SELECT U&'\0061\0062\0063';
/// ```
/// This is equivalent to the string literal `'abc'`.
///
/// The default implementation returns `false`; dialects that accept this
/// syntax override it.
///
/// See
/// - [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE)
/// - [H2 docs](http://www.h2database.com/html/grammar.html#string)
fn supports_unicode_string_literal(&self) -> bool {
false
}

/// Does the dialect support `FILTER (WHERE expr)` for aggregate queries?
fn supports_filter_during_aggregation(&self) -> bool {
false
Expand Down
4 changes: 4 additions & 0 deletions src/dialect/postgresql.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ impl Dialect for PostgreSqlDialect {
ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_'
}

// PostgreSQL accepts `U&'...'` string constants with Unicode escapes, so the
// dialect opts in by returning `true`.
fn supports_unicode_string_literal(&self) -> bool {
true
}

/// See <https://www.postgresql.org/docs/current/sql-createoperator.html>
fn is_custom_operator_part(&self, ch: char) -> bool {
matches!(
Expand Down
7 changes: 7 additions & 0 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,10 @@ impl<'a> Parser<'a> {
self.prev_token();
Ok(Expr::Value(self.parse_value()?))
}
Token::UnicodeStringLiteral(_) => {
self.prev_token();
Ok(Expr::Value(self.parse_value()?))
}
Token::Number(_, _)
| Token::SingleQuotedString(_)
| Token::DoubleQuotedString(_)
Expand Down Expand Up @@ -1866,6 +1870,7 @@ impl<'a> Parser<'a> {
}
Token::SingleQuotedString(_)
| Token::EscapedStringLiteral(_)
| Token::UnicodeStringLiteral(_)
| Token::NationalStringLiteral(_)
| Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
_ => self.expected(
Expand Down Expand Up @@ -6963,6 +6968,7 @@ impl<'a> Parser<'a> {
}
Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
Token::UnicodeStringLiteral(ref s) => Ok(Value::UnicodeStringLiteral(s.to_string())),
Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
tok @ Token::Colon | tok @ Token::AtSign => {
Expand Down Expand Up @@ -7054,6 +7060,7 @@ impl<'a> Parser<'a> {
Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
Ok(s)
}
Token::UnicodeStringLiteral(s) => Ok(s),
_ => self.expected("literal string", next_token),
}
}
Expand Down
78 changes: 78 additions & 0 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ pub enum Token {
NationalStringLiteral(String),
/// "escaped" string literal, which is an extension to the SQL standard, e.g. e'first \n second' or E 'first \n second'
EscapedStringLiteral(String),
/// Unicode string literal, e.g. U&'first \000A second'
UnicodeStringLiteral(String),
/// Hexadecimal string literal: i.e.: X'deadbeef'
HexStringLiteral(String),
/// Comma
Expand Down Expand Up @@ -251,6 +253,7 @@ impl fmt::Display for Token {
Token::DollarQuotedString(ref s) => write!(f, "{s}"),
Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
Expand Down Expand Up @@ -794,6 +797,23 @@ impl<'a> Tokenizer<'a> {
}
}
}
// Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
chars.next(); // consume, to check the next char
if chars.peek() == Some(&'&') {
// we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
let mut chars_clone = chars.peekable.clone();
chars_clone.next(); // consume the '&' in the clone
if chars_clone.peek() == Some(&'\'') {
chars.next(); // consume the '&' in the original iterator
let s = unescape_unicode_single_quoted_string(chars)?;
return Ok(Some(Token::UnicodeStringLiteral(s)));
}
}
// regular identifier starting with an "U" or "u"
let s = self.tokenize_word(x, chars);
Ok(Some(Token::make_word(&s, None)))
}
// The spec only allows an uppercase 'X' to introduce a hex
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
x @ 'x' | x @ 'X' => {
Expand Down Expand Up @@ -1797,6 +1817,64 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
}
}

/// Reads the quoted part of a `U&'...'` literal from `chars` and returns its
/// decoded contents.
///
/// The first character taken from `chars` is discarded as the opening quote.
/// Inside the literal, `''` yields a single quote, `\\` yields a backslash,
/// `\+` is followed by six hex digits, and any other backslash is followed by
/// four hex digits. A lone `'` ends the literal.
///
/// Returns an error if the input ends before the closing quote, or if a hex
/// escape cannot be decoded.
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut decoded = String::new();
    chars.next(); // skip the opening quote
    while let Some(current) = chars.next() {
        if current == '\'' {
            // A quote not followed by another quote terminates the literal.
            if chars.peek() != Some(&'\'') {
                return Ok(decoded);
            }
            chars.next();
            decoded.push('\'');
        } else if current == '\\' {
            let escaped = match chars.peek() {
                Some('\\') => {
                    chars.next();
                    '\\'
                }
                Some('+') => {
                    chars.next();
                    take_char_from_hex_digits(chars, 6)?
                }
                _ => take_char_from_hex_digits(chars, 4)?,
            };
            decoded.push(escaped);
        } else {
            decoded.push(current);
        }
    }
    // Input ran out before a closing quote was seen.
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}

/// Consumes exactly `max_digits` characters from `chars`, interprets them as a
/// big-endian hexadecimal number, and converts that number to a `char`.
///
/// Returns an error if the input ends early, if any consumed character is not
/// a hexadecimal digit, or if the resulting value is not accepted by
/// `char::from_u32`.
fn take_char_from_hex_digits(
    chars: &mut State<'_>,
    max_digits: usize,
) -> Result<char, TokenizerError> {
    let mut code_point = 0u32;
    for _ in 0..max_digits {
        let ch = match chars.next() {
            Some(ch) => ch,
            None => {
                return Err(TokenizerError {
                    message: "Unexpected EOF while parsing hex digit in escaped unicode string."
                        .to_string(),
                    location: chars.location(),
                })
            }
        };
        let value = match ch.to_digit(16) {
            Some(value) => value,
            None => {
                return Err(TokenizerError {
                    message: format!("Invalid hex digit in escaped unicode string: {}", ch),
                    location: chars.location(),
                })
            }
        };
        // Shift the accumulated value one hex place and append the new digit.
        code_point = code_point * 16 + value;
    }
    match char::from_u32(code_point) {
        Some(decoded) => Ok(decoded),
        None => Err(TokenizerError {
            message: format!("Invalid unicode character: {:x}", code_point),
            location: chars.location(),
        }),
    }
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
32 changes: 32 additions & 0 deletions tests/sqlparser_postgres.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4441,3 +4441,35 @@ fn test_table_unnest_with_ordinality() {
_ => panic!("Expecting TableFactor::UNNEST with ordinality"),
}
}

/// An `E'...'` literal parses to `Value::EscapedStringLiteral` holding the
/// unescaped text, and round-trips through `verified_expr`.
#[test]
fn test_escaped_string_literal() {
    let parsed = pg().verified_expr(r#"E'\n'"#);
    match parsed {
        Expr::Value(Value::EscapedStringLiteral(s)) => assert_eq!("\n", s),
        _ => unreachable!(),
    }
}

/// `U&'...'` literals parse to `Value::UnicodeStringLiteral` holding the
/// decoded text, for both the PostgreSQL and generic dialects.
#[test]
fn test_unicode_string_literal() {
    // Each case is (SQL text, expected decoded value).
    let cases = [
        // Example from the postgres docs
        (r#"U&'\0441\043B\043E\043D'"#, "слон"),
        // Code point above 0xFFFF, using the six-digit `\+` form
        (r#"U&'\+01F418'"#, "🐘"),
        // Doubled backslash decodes to one backslash
        (r#"U&'\\'"#, r#"\"#),
        // Doubled single quote decodes to one quote
        (r#"U&''''"#, "'"),
    ];
    for (sql, want) in cases {
        let parsed = pg_and_generic().verified_expr(sql);
        match parsed {
            Expr::Value(Value::UnicodeStringLiteral(got)) => assert_eq!(want, got),
            _ => unreachable!(),
        }
    }
}
Loading