Skip to content

Commit 2c0886d

Browse files
ovr and alamb authored
feat: Support escaped string literals (PostgreSQL) (#502)
* feat: Support escaped string literals (PostgreSQL) Signed-off-by: Dmitry Patsura <[email protected]> * lint * escape ', \r, \t * Update src/ast/value.rs Co-authored-by: Andrew Lamb <[email protected]> * Update src/tokenizer.rs Co-authored-by: Andrew Lamb <[email protected]> * test: two slashes * remove dead code * test: parsing error * support generic dialect too (for DF) Co-authored-by: Andrew Lamb <[email protected]>
1 parent 4070f3e commit 2c0886d

File tree

4 files changed

+167
-0
lines changed

4 files changed

+167
-0
lines changed

src/ast/value.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ pub enum Value {
3030
Number(BigDecimal, bool),
3131
/// 'string value'
3232
SingleQuotedString(String),
33+
/// e'string value' (postgres extension)
34+
/// <https://www.postgresql.org/docs/8.3/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS>
35+
EscapedStringLiteral(String),
3336
/// N'string value'
3437
NationalStringLiteral(String),
3538
/// X'hex value'
@@ -69,6 +72,7 @@ impl fmt::Display for Value {
6972
Value::Number(v, l) => write!(f, "{}{long}", v, long = if *l { "L" } else { "" }),
7073
Value::DoubleQuotedString(v) => write!(f, "\"{}\"", v),
7174
Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)),
75+
Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
7276
Value::NationalStringLiteral(v) => write!(f, "N'{}'", v),
7377
Value::HexStringLiteral(v) => write!(f, "X'{}'", v),
7478
Value::Boolean(v) => write!(f, "{}", v),
@@ -193,6 +197,40 @@ pub fn escape_single_quote_string(s: &str) -> EscapeSingleQuoteString<'_> {
193197
EscapeSingleQuoteString(s)
194198
}
195199

200+
/// Display adapter around a borrowed string: formatting it writes the string
/// with backslash escapes applied, as used when printing an E-prefixed
/// string literal value.
pub struct EscapeEscapedStringLiteral<'a>(&'a str);
201+
202+
impl<'a> fmt::Display for EscapeEscapedStringLiteral<'a> {
203+
// Writes the wrapped string one character at a time, stopping at the first
// formatter error.
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
204+
for c in self.0.chars() {
205+
match c {
206+
// A single quote is written as backslash + quote.
'\'' => {
207+
write!(f, r#"\'"#)?;
208+
}
209+
// A backslash is doubled.
'\\' => {
210+
write!(f, r#"\\"#)?;
211+
}
212+
// Newline, tab and carriage return are written as the two-character
// sequences backslash-n, backslash-t and backslash-r.
'\n' => {
213+
write!(f, r#"\n"#)?;
214+
}
215+
'\t' => {
216+
write!(f, r#"\t"#)?;
217+
}
218+
'\r' => {
219+
write!(f, r#"\r"#)?;
220+
}
221+
// Every other character is written unchanged.
_ => {
222+
write!(f, "{}", c)?;
223+
}
224+
}
225+
}
226+
Ok(())
227+
}
228+
}
229+
230+
/// Wraps `s` in an [`EscapeEscapedStringLiteral`] without copying it; the
/// escaping itself is done when the returned value is formatted.
pub fn escape_escaped_string(s: &str) -> EscapeEscapedStringLiteral<'_> {
231+
EscapeEscapedStringLiteral(s)
232+
}
233+
196234
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
197235
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
198236
pub enum TrimWhereField {

src/parser.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,11 @@ impl<'a> Parser<'a> {
497497
expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?),
498498
})
499499
}
500+
Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
501+
{
502+
self.prev_token();
503+
Ok(Expr::Value(self.parse_value()?))
504+
}
500505
Token::Number(_, _)
501506
| Token::SingleQuotedString(_)
502507
| Token::NationalStringLiteral(_)
@@ -902,6 +907,7 @@ impl<'a> Parser<'a> {
902907
None
903908
}
904909
Token::SingleQuotedString(_)
910+
| Token::EscapedStringLiteral(_)
905911
| Token::NationalStringLiteral(_)
906912
| Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
907913
unexpected => {
@@ -2576,6 +2582,7 @@ impl<'a> Parser<'a> {
25762582
},
25772583
Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())),
25782584
Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
2585+
Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
25792586
Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
25802587
Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
25812588
unexpected => self.expected("a value", unexpected),
@@ -2607,6 +2614,9 @@ impl<'a> Parser<'a> {
26072614
match self.next_token() {
26082615
Token::Word(Word { value, keyword, .. }) if keyword == Keyword::NoKeyword => Ok(value),
26092616
Token::SingleQuotedString(s) => Ok(s),
2617+
Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
2618+
Ok(s)
2619+
}
26102620
unexpected => self.expected("literal string", unexpected),
26112621
}
26122622
}

src/tokenizer.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ pub enum Token {
5151
SingleQuotedString(String),
5252
/// "National" string literal: i.e: N'string'
5353
NationalStringLiteral(String),
54+
/// "Escaped" string literal, which is an extension to the SQL standard: i.e. e'first \n second' or E'first \n second'
55+
EscapedStringLiteral(String),
5456
/// Hexadecimal string literal: i.e.: X'deadbeef'
5557
HexStringLiteral(String),
5658
/// Comma
@@ -160,6 +162,7 @@ impl fmt::Display for Token {
160162
Token::Char(ref c) => write!(f, "{}", c),
161163
Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
162164
Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
165+
Token::EscapedStringLiteral(ref s) => write!(f, "E'{}'", s),
163166
Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
164167
Token::Comma => f.write_str(","),
165168
Token::Whitespace(ws) => write!(f, "{}", ws),
@@ -392,6 +395,21 @@ impl<'a> Tokenizer<'a> {
392395
}
393396
}
394397
}
398+
// PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
399+
x @ 'e' | x @ 'E' => {
400+
chars.next(); // consume, to check the next char
401+
match chars.peek() {
402+
Some('\'') => {
403+
let s = self.tokenize_escaped_single_quoted_string(chars)?;
404+
Ok(Some(Token::EscapedStringLiteral(s)))
405+
}
406+
_ => {
407+
// regular identifier starting with an "E" or "e"
408+
let s = self.tokenize_word(x, chars);
409+
Ok(Some(Token::make_word(&s, None)))
410+
}
411+
}
412+
}
395413
// The spec only allows an uppercase 'X' to introduce a hex
396414
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
397415
x @ 'x' | x @ 'X' => {
@@ -690,6 +708,66 @@ impl<'a> Tokenizer<'a> {
690708
s
691709
}
692710

711+
/// Read the quoted part of an escaped string literal, starting with the
/// opening quote, and return its contents with escapes resolved.
///
/// A backslash escapes the character after it: an escaped backslash or
/// single quote yields that character, and an escaped `r`, `n` or `t`
/// yields carriage return, newline or tab. Any other escaped character is
/// kept as-is and the backslash is dropped. Two consecutive single quotes
/// that are not escaped also yield one quote.
///
/// If the input ends before an unescaped closing quote is found, the result
/// of `tokenizer_error` is returned.
///
/// NOTE(review): other escape forms (backspace, form feed, octal, hex,
/// Unicode) are not decoded here; confirm whether they should be.
fn tokenize_escaped_single_quoted_string(
712+
&self,
713+
chars: &mut Peekable<Chars<'_>>,
714+
) -> Result<String, TokenizerError> {
715+
let mut s = String::new();
716+
chars.next(); // consume the opening quote
717+
718+
719+
// True while the previous character was a backslash that has not yet
// been paired with the character it escapes.
720+
let mut is_escaped = false;
721+
while let Some(&ch) = chars.peek() {
722+
// Shared arm body for the letters r, n and t: after a backslash push
// the given control character, otherwise push the letter itself; then
// consume the letter.
macro_rules! escape_control_character {
723+
($ESCAPED:expr) => {{
724+
if is_escaped {
725+
s.push($ESCAPED);
726+
is_escaped = false;
727+
} else {
728+
s.push(ch);
729+
}
730+
731+
chars.next();
732+
}};
733+
}
734+
735+
match ch {
736+
'\'' => {
737+
chars.next(); // consume
738+
if is_escaped {
// Escaped quote: part of the string.
739+
s.push(ch);
740+
is_escaped = false;
741+
} else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
// Doubled quote: push one quote and consume the second.
742+
s.push(ch);
743+
chars.next();
744+
} else {
// Lone unescaped quote: end of the literal.
745+
return Ok(s);
746+
}
747+
}
748+
'\\' => {
749+
if is_escaped {
// Second backslash of a pair: push a single backslash.
750+
s.push('\\');
751+
is_escaped = false;
752+
} else {
// First backslash: remember it and decide on the next character.
753+
is_escaped = true;
754+
}
755+
756+
chars.next();
757+
}
758+
'r' => escape_control_character!('\r'),
759+
'n' => escape_control_character!('\n'),
760+
't' => escape_control_character!('\t'),
761+
_ => {
// Any other character is taken literally; a pending backslash is dropped.
762+
is_escaped = false;
763+
chars.next(); // consume
764+
s.push(ch);
765+
}
766+
}
767+
}
// Input ran out without an unescaped closing quote.
768+
self.tokenizer_error("Unterminated encoded string literal")
770+
693771
/// Read a single quoted string, starting with the opening quote.
694772
fn tokenize_single_quoted_string(
695773
&self,

tests/sqlparser_postgres.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,3 +1467,44 @@ fn pg_and_generic() -> TestedDialects {
14671467
dialects: vec![Box::new(PostgreSqlDialect {}), Box::new(GenericDialect {})],
14681468
}
14691469
}
1470+
1471+
// Checks the values produced for E-prefixed string literals under the
// dialects returned by pg_and_generic(), plus the error for a literal that
// is never closed.
#[test]
1472+
fn parse_escaped_literal_string() {
1473+
// One projection per escape case; the raw string keeps the backslashes
// exactly as written.
let sql =
1474+
r#"SELECT E's1 \n s1', E's2 \\n s2', E's3 \\\n s3', E's4 \\\\n s4', E'\'', E'foo \\'"#;
1475+
let select = pg_and_generic().verified_only_select(sql);
1476+
assert_eq!(6, select.projection.len());
1477+
// Backslash-n becomes a newline.
assert_eq!(
1478+
&Expr::Value(Value::EscapedStringLiteral("s1 \n s1".to_string())),
1479+
expr_from_projection(&select.projection[0])
1480+
);
1481+
// An escaped backslash followed by a plain n.
assert_eq!(
1482+
&Expr::Value(Value::EscapedStringLiteral("s2 \\n s2".to_string())),
1483+
expr_from_projection(&select.projection[1])
1484+
);
1485+
// An escaped backslash followed by backslash-n (a newline).
assert_eq!(
1486+
&Expr::Value(Value::EscapedStringLiteral("s3 \\\n s3".to_string())),
1487+
expr_from_projection(&select.projection[2])
1488+
);
1489+
// Two escaped backslashes followed by a plain n.
assert_eq!(
1490+
&Expr::Value(Value::EscapedStringLiteral("s4 \\\\n s4".to_string())),
1491+
expr_from_projection(&select.projection[3])
1492+
);
1493+
// An escaped quote is the whole value.
assert_eq!(
1494+
&Expr::Value(Value::EscapedStringLiteral("'".to_string())),
1495+
expr_from_projection(&select.projection[4])
1496+
);
1497+
// A trailing escaped backslash does not escape the closing quote.
assert_eq!(
1498+
&Expr::Value(Value::EscapedStringLiteral("foo \\".to_string())),
1499+
expr_from_projection(&select.projection[5])
1500+
);
1501+
1502+
// The backslash escapes the only remaining quote, so the literal never
// closes and tokenizing fails.
let sql = r#"SELECT E'\'"#;
1503+
assert_eq!(
1504+
pg_and_generic()
1505+
.parse_sql_statements(sql)
1506+
.unwrap_err()
1507+
.to_string(),
1508+
"sql parser error: Unterminated encoded string literal at Line: 1, Column 8"
1509+
);
1510+
}

0 commit comments

Comments
 (0)