
Commit d2c2b15

Add support for quoted string backslash escaping (#1177)
1 parent 7b49c69 commit d2c2b15

18 files changed, +352 -996 lines changed

src/ast/mod.rs (+3 -3)

@@ -512,21 +512,21 @@ pub enum Expr {
         negated: bool,
         expr: Box<Expr>,
         pattern: Box<Expr>,
-        escape_char: Option<char>,
+        escape_char: Option<String>,
     },
     /// `ILIKE` (case-insensitive `LIKE`)
     ILike {
         negated: bool,
         expr: Box<Expr>,
         pattern: Box<Expr>,
-        escape_char: Option<char>,
+        escape_char: Option<String>,
     },
     /// SIMILAR TO regex
     SimilarTo {
         negated: bool,
         expr: Box<Expr>,
         pattern: Box<Expr>,
-        escape_char: Option<char>,
+        escape_char: Option<String>,
     },
     /// MySQL: RLIKE regex or REGEXP regex
     RLike {
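
Because `escape_char` on `Like`, `ILike`, and `SimilarTo` widens from `Option<char>` to `Option<String>`, downstream code that matches on these variants needs a small type adjustment. A minimal sketch of reading the new field (the helper name `escape_char_of` is hypothetical, not part of the crate or this commit):

use sqlparser::ast::Expr;

/// Return the ESCAPE string of a LIKE / ILIKE / SIMILAR TO expression, if any.
fn escape_char_of(expr: &Expr) -> Option<&str> {
    match expr {
        Expr::Like { escape_char, .. }
        | Expr::ILike { escape_char, .. }
        | Expr::SimilarTo { escape_char, .. } => escape_char.as_deref(),
        _ => None,
    }
}

With the old field type this would have returned `Option<char>`; `as_deref` now borrows the owned `String` as `&str`.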

src/dialect/bigquery.rs (+5)

@@ -29,4 +29,9 @@ impl Dialect for BigQueryDialect {
     fn is_identifier_part(&self, ch: char) -> bool {
         ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
     }
+
+    // See https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#escape_sequences
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
 }

src/dialect/clickhouse.rs (+4)

@@ -25,4 +25,8 @@ impl Dialect for ClickHouseDialect {
     fn is_identifier_part(&self, ch: char) -> bool {
         self.is_identifier_start(ch) || ch.is_ascii_digit()
     }
+
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
 }

src/dialect/mod.rs (+21)

@@ -120,6 +120,23 @@ pub trait Dialect: Debug + Any {
     fn is_identifier_start(&self, ch: char) -> bool;
     /// Determine if a character is a valid unquoted identifier character
     fn is_identifier_part(&self, ch: char) -> bool;
+    /// Determine if the dialect supports escaping characters via '\' in string literals.
+    ///
+    /// Some dialects like BigQuery and Snowflake support this while others like
+    /// Postgres do not. Such that the following is accepted by the former but
+    /// rejected by the latter.
+    /// ```sql
+    /// SELECT 'ab\'cd';
+    /// ```
+    ///
+    /// Conversely, such dialects reject the following statement which
+    /// otherwise would be valid in the other dialects.
+    /// ```sql
+    /// SELECT '\';
+    /// ```
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        false
+    }
     /// Does the dialect support `FILTER (WHERE expr)` for aggregate queries?
     fn supports_filter_during_aggregation(&self) -> bool {
         false
@@ -306,6 +323,10 @@ mod tests {
         self.0.identifier_quote_style(identifier)
     }

+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        self.0.supports_string_literal_backslash_escape()
+    }
+
     fn is_proper_identifier_inside_quotes(
         &self,
         chars: std::iter::Peekable<std::str::Chars<'_>>,
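
The new trait method defaults to `false`, so existing `Dialect` implementations keep their current behaviour and a dialect opts in by overriding it. A sketch of a hypothetical custom dialect (not part of this commit) that enables the escape handling, assuming only the trait items visible in this diff:

use sqlparser::dialect::Dialect;

#[derive(Debug)]
struct MyDialect;

impl Dialect for MyDialect {
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_ascii_alphabetic() || ch == '_'
    }

    fn is_identifier_part(&self, ch: char) -> bool {
        self.is_identifier_start(ch) || ch.is_ascii_digit()
    }

    // Opt in to backslash escapes inside quoted string literals.
    fn supports_string_literal_backslash_escape(&self) -> bool {
        true
    }
}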

src/dialect/mysql.rs (+5)

@@ -48,6 +48,11 @@ impl Dialect for MySqlDialect {
         Some('`')
     }

+    // See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
+
     fn parse_infix(
         &self,
         parser: &mut crate::parser::Parser,

src/dialect/snowflake.rs (+5)

@@ -46,6 +46,11 @@ impl Dialect for SnowflakeDialect {
             || ch == '_'
     }

+    // See https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#escape_sequences
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
+
     fn supports_within_after_array_aggregation(&self) -> bool {
         true
     }

src/parser/mod.rs (+2 -2)

@@ -2560,9 +2560,9 @@
     }

     /// parse the ESCAPE CHAR portion of LIKE, ILIKE, and SIMILAR TO
-    pub fn parse_escape_char(&mut self) -> Result<Option<char>, ParserError> {
+    pub fn parse_escape_char(&mut self) -> Result<Option<String>, ParserError> {
         if self.parse_keyword(Keyword::ESCAPE) {
-            Ok(Some(self.parse_literal_char()?))
+            Ok(Some(self.parse_literal_string()?))
         } else {
             Ok(None)
         }
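
Since the clause is now read with `parse_literal_string`, the escape character reaches the AST as an owned `String` rather than a single `char`. A small end-to-end sketch (not from the commit, assuming the crate's usual `Parser::parse_sql` entry point) with a dialect that enables backslash escaping:

use sqlparser::dialect::MySqlDialect;
use sqlparser::parser::{Parser, ParserError};

fn main() -> Result<(), ParserError> {
    // Both the `\%` in the pattern and the `'\\'` escape string rely on
    // backslash escaping being enabled for the chosen dialect.
    let sql = r"SELECT * FROM t WHERE name LIKE '10\%' ESCAPE '\\'";
    let statements = Parser::parse_sql(&MySqlDialect {}, sql)?;
    println!("{}", statements[0]); // Statement implements Display, so this prints it back as SQL
    Ok(())
}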

src/tokenizer.rs (+94 -36)

@@ -627,11 +627,11 @@
             chars.next(); // consume
             match chars.peek() {
                 Some('\'') => {
-                    let s = self.tokenize_quoted_string(chars, '\'')?;
+                    let s = self.tokenize_quoted_string(chars, '\'', false)?;
                     Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                 }
                 Some('\"') => {
-                    let s = self.tokenize_quoted_string(chars, '\"')?;
+                    let s = self.tokenize_quoted_string(chars, '\"', false)?;
                     Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                 }
                 _ => {
@@ -646,11 +646,11 @@
             chars.next(); // consume
             match chars.peek() {
                 Some('\'') => {
-                    let s = self.tokenize_quoted_string(chars, '\'')?;
+                    let s = self.tokenize_quoted_string(chars, '\'', false)?;
                     Ok(Some(Token::RawStringLiteral(s)))
                 }
                 Some('\"') => {
-                    let s = self.tokenize_quoted_string(chars, '\"')?;
+                    let s = self.tokenize_quoted_string(chars, '\"', false)?;
                     Ok(Some(Token::RawStringLiteral(s)))
                 }
                 _ => {
@@ -666,7 +666,7 @@
                 match chars.peek() {
                     Some('\'') => {
                         // N'...' - a <national character string literal>
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
+                        let s = self.tokenize_quoted_string(chars, '\'', true)?;
                         Ok(Some(Token::NationalStringLiteral(s)))
                     }
                     _ => {
@@ -700,7 +700,7 @@
                 match chars.peek() {
                     Some('\'') => {
                         // X'...' - a <binary string literal>
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
+                        let s = self.tokenize_quoted_string(chars, '\'', true)?;
                         Ok(Some(Token::HexStringLiteral(s)))
                     }
                     _ => {
@@ -712,15 +712,23 @@
             }
            // single quoted string
             '\'' => {
-                let s = self.tokenize_quoted_string(chars, '\'')?;
+                let s = self.tokenize_quoted_string(
+                    chars,
+                    '\'',
+                    self.dialect.supports_string_literal_backslash_escape(),
+                )?;

                 Ok(Some(Token::SingleQuotedString(s)))
             }
             // double quoted string
             '\"' if !self.dialect.is_delimited_identifier_start(ch)
                 && !self.dialect.is_identifier_start(ch) =>
             {
-                let s = self.tokenize_quoted_string(chars, '"')?;
+                let s = self.tokenize_quoted_string(
+                    chars,
+                    '"',
+                    self.dialect.supports_string_literal_backslash_escape(),
+                )?;

                 Ok(Some(Token::DoubleQuotedString(s)))
             }
@@ -1222,6 +1230,7 @@
         &self,
         chars: &mut State,
         quote_style: char,
+        allow_escape: bool,
     ) -> Result<String, TokenizerError> {
         let mut s = String::new();
         let error_loc = chars.location();
@@ -1243,35 +1252,31 @@
                         return Ok(s);
                     }
                 }
-                '\\' => {
-                    // consume
+                '\\' if allow_escape => {
+                    // consume backslash
                     chars.next();
-                    // slash escaping is specific to MySQL dialect.
-                    if dialect_of!(self is MySqlDialect) {
-                        if let Some(next) = chars.peek() {
-                            if !self.unescape {
-                                // In no-escape mode, the given query has to be saved completely including backslashes.
-                                s.push(ch);
-                                s.push(*next);
-                                chars.next(); // consume next
-                            } else {
-                                // See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
-                                let n = match next {
-                                    '\'' | '\"' | '\\' | '%' | '_' => *next,
-                                    '0' => '\0',
-                                    'b' => '\u{8}',
-                                    'n' => '\n',
-                                    'r' => '\r',
-                                    't' => '\t',
-                                    'Z' => '\u{1a}',
-                                    _ => *next,
-                                };
-                                s.push(n);
-                                chars.next(); // consume next
-                            }
+
+                    if let Some(next) = chars.peek() {
+                        if !self.unescape {
+                            // In no-escape mode, the given query has to be saved completely including backslashes.
+                            s.push(ch);
+                            s.push(*next);
+                            chars.next(); // consume next
+                        } else {
+                            let n = match next {
+                                '0' => '\0',
+                                'a' => '\u{7}',
+                                'b' => '\u{8}',
+                                'f' => '\u{c}',
+                                'n' => '\n',
+                                'r' => '\r',
+                                't' => '\t',
+                                'Z' => '\u{1a}',
+                                _ => *next,
+                            };
+                            s.push(n);
+                            chars.next(); // consume next
                         }
-                    } else {
-                        s.push(ch);
                     }
                 }
                 _ => {
@@ -1517,7 +1522,7 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::dialect::{ClickHouseDialect, MsSqlDialect};
+    use crate::dialect::{BigQueryDialect, ClickHouseDialect, MsSqlDialect};

     #[test]
     fn tokenizer_error_impl() {
@@ -2386,4 +2391,57 @@
         check_unescape(r"Hello\0", None);
         check_unescape(r"Hello\xCADRust", None);
     }
+
+    #[test]
+    fn tokenize_quoted_string_escape() {
+        for (sql, expected, expected_unescaped) in [
+            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
+            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
+            (r#"'\\'"#, r#"\\"#, r#"\"#),
+            (
+                r#"'\0\a\b\f\n\r\t\Z'"#,
+                r#"\0\a\b\f\n\r\t\Z"#,
+                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
+            ),
+            (r#"'\"'"#, r#"\""#, "\""),
+            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
+            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
+            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
+        ] {
+            let dialect = BigQueryDialect {};
+
+            let tokens = Tokenizer::new(&dialect, sql)
+                .with_unescape(false)
+                .tokenize()
+                .unwrap();
+            let expected = vec![Token::SingleQuotedString(expected.to_string())];
+            compare(expected, tokens);
+
+            let tokens = Tokenizer::new(&dialect, sql)
+                .with_unescape(true)
+                .tokenize()
+                .unwrap();
+            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
+            compare(expected, tokens);
+        }
+
+        for sql in [r#"'\'"#, r#"'ab\'"#] {
+            let dialect = BigQueryDialect {};
+            let mut tokenizer = Tokenizer::new(&dialect, sql);
+            assert_eq!(
+                "Unterminated string literal",
+                tokenizer.tokenize().unwrap_err().message.as_str(),
+            );
+        }
+
+        // Non-escape dialect
+        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
+            let dialect = GenericDialect {};
+            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
+
+            let expected = vec![Token::SingleQuotedString(expected.to_string())];
+
+            compare(expected, tokens);
+        }
+    }
 }
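
Beyond the bundled tests, the dialect-dependent behaviour is easiest to see at the tokenizer level. A short sketch (not part of the commit) using the same public `Tokenizer` API as the tests above:

use sqlparser::dialect::{BigQueryDialect, GenericDialect};
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let sql = r"SELECT 'ab\'cd'";

    // BigQuery enables backslash escapes, so `\'` stays inside the literal
    // and unescapes to a plain quote.
    let tokens = Tokenizer::new(&BigQueryDialect {}, sql).tokenize().unwrap();
    assert!(tokens.contains(&Token::SingleQuotedString("ab'cd".to_string())));

    // GenericDialect keeps the old behaviour: the second quote closes the
    // literal at `ab\`, and the trailing `'` opens an unterminated string.
    assert!(Tokenizer::new(&GenericDialect {}, sql).tokenize().is_err());
}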
