
Commit 0458e4b

Add support for backslash escape

This adds support for parsing string literals in dialects that treat the backslash character as an escape character. For example, the following previously failed to parse in dialects like BigQuery, where the syntax is valid:

```sql
SELECT 'a\'b';
```

It also moves the SQL `like` and `similar_to` tests from the individual dialects to the common tests, since the tests were identical.
1 parent 2f03fad commit 0458e4b

18 files changed: +352 -996 lines
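
To illustrate what the change enables at the library level, the snippet below parses the example from the commit message with a dialect that opts in to backslash escapes and contrasts it with one that does not. This is a minimal sketch against the public `Parser::parse_sql` API, not code from this commit.

```rust
use sqlparser::dialect::{BigQueryDialect, PostgreSqlDialect};
use sqlparser::parser::Parser;

fn main() {
    // BigQuery treats `\` as an escape character inside string literals,
    // so `\'` does not terminate the literal.
    let sql = r"SELECT 'a\'b'";
    assert!(Parser::parse_sql(&BigQueryDialect {}, sql).is_ok());

    // PostgreSQL does not, so the first literal ends at the second quote
    // and the trailing `'` is reported as unterminated.
    assert!(Parser::parse_sql(&PostgreSqlDialect {}, sql).is_err());
}
```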

src/ast/mod.rs (+3 -3)

```diff
@@ -492,21 +492,21 @@ pub enum Expr {
         negated: bool,
         expr: Box<Expr>,
         pattern: Box<Expr>,
-        escape_char: Option<char>,
+        escape_char: Option<String>,
     },
     /// `ILIKE` (case-insensitive `LIKE`)
     ILike {
         negated: bool,
         expr: Box<Expr>,
         pattern: Box<Expr>,
-        escape_char: Option<char>,
+        escape_char: Option<String>,
     },
     /// SIMILAR TO regex
     SimilarTo {
         negated: bool,
         expr: Box<Expr>,
         pattern: Box<Expr>,
-        escape_char: Option<char>,
+        escape_char: Option<String>,
     },
     /// MySQL: RLIKE regex or REGEXP regex
     RLike {
```

src/dialect/bigquery.rs (+5)

```diff
@@ -29,4 +29,9 @@ impl Dialect for BigQueryDialect {
     fn is_identifier_part(&self, ch: char) -> bool {
         ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
     }
+
+    // See https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#escape_sequences
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
 }
```

src/dialect/clickhouse.rs (+4)

```diff
@@ -25,4 +25,8 @@ impl Dialect for ClickHouseDialect {
     fn is_identifier_part(&self, ch: char) -> bool {
         self.is_identifier_start(ch) || ch.is_ascii_digit()
     }
+
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
 }
```

src/dialect/mod.rs (+21)

```diff
@@ -120,6 +120,23 @@ pub trait Dialect: Debug + Any {
     fn is_identifier_start(&self, ch: char) -> bool;
     /// Determine if a character is a valid unquoted identifier character
     fn is_identifier_part(&self, ch: char) -> bool;
+    /// Determine if the dialect supports escaping characters via '\' in string literals.
+    ///
+    /// Some dialects like BigQuery and Snowflake support this while others like
+    /// Postgres do not. Such that the following is accepted by the former but
+    /// rejected by the latter.
+    /// ```sql
+    /// SELECT 'ab\'cd';
+    /// ```
+    ///
+    /// Conversely, such dialects reject the following statement which
+    /// otherwise would be valid in the other dialects.
+    /// ```sql
+    /// SELECT '\';
+    /// ```
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        false
+    }
     /// Does the dialect support `FILTER (WHERE expr)` for aggregate queries?
     fn supports_filter_during_aggregation(&self) -> bool {
         false
@@ -306,6 +323,10 @@
         self.0.identifier_quote_style(identifier)
     }
 
+        fn supports_string_literal_backslash_escape(&self) -> bool {
+            self.0.supports_string_literal_backslash_escape()
+        }
+
         fn is_proper_identifier_inside_quotes(
             &self,
             chars: std::iter::Peekable<std::str::Chars<'_>>,
```
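
Dialects defined outside the crate can opt in the same way as the built-in dialects above. A minimal sketch; `MyDialect` and its identifier rules are illustrative, not part of this commit:

```rust
use sqlparser::dialect::Dialect;

// Hypothetical dialect that opts in to `\`-escapes in string literals.
#[derive(Debug)]
struct MyDialect;

impl Dialect for MyDialect {
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_ascii_alphabetic() || ch == '_'
    }

    fn is_identifier_part(&self, ch: char) -> bool {
        self.is_identifier_start(ch) || ch.is_ascii_digit()
    }

    // The trait default is `false`; returning `true` enables the new
    // backslash handling in the tokenizer for this dialect.
    fn supports_string_literal_backslash_escape(&self) -> bool {
        true
    }
}
```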

src/dialect/mysql.rs (+5)

```diff
@@ -48,6 +48,11 @@ impl Dialect for MySqlDialect {
         Some('`')
     }
 
+    // See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
+
     fn parse_infix(
         &self,
         parser: &mut crate::parser::Parser,
```

src/dialect/snowflake.rs (+5)

```diff
@@ -46,6 +46,11 @@ impl Dialect for SnowflakeDialect {
             || ch == '_'
     }
 
+    // See https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#escape_sequences
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
+
     fn supports_within_after_array_aggregation(&self) -> bool {
         true
     }
```

src/parser/mod.rs (+2 -2)

```diff
@@ -2584,9 +2584,9 @@ impl<'a> Parser<'a> {
     }
 
     /// parse the ESCAPE CHAR portion of LIKE, ILIKE, and SIMILAR TO
-    pub fn parse_escape_char(&mut self) -> Result<Option<char>, ParserError> {
+    pub fn parse_escape_char(&mut self) -> Result<Option<String>, ParserError> {
         if self.parse_keyword(Keyword::ESCAPE) {
-            Ok(Some(self.parse_literal_char()?))
+            Ok(Some(self.parse_literal_string()?))
         } else {
             Ok(None)
         }
```
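
The visible effect of moving from `Option<char>` to `Option<String>` is in the AST produced for `ESCAPE` clauses. A rough sketch of reading it back, assuming the `Expr::Like` fields shown in the `src/ast/mod.rs` hunk above and the public `try_with_sql`/`parse_expr` parser API:

```rust
use sqlparser::ast::Expr;
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::Parser;

fn main() {
    let dialect = GenericDialect {};
    let mut parser = Parser::new(&dialect)
        .try_with_sql("name LIKE '10!%' ESCAPE '!'")
        .unwrap();

    match parser.parse_expr().unwrap() {
        // `escape_char` is now an Option<String> rather than an Option<char>.
        Expr::Like { escape_char, .. } => assert_eq!(escape_char, Some("!".to_string())),
        other => panic!("expected a LIKE expression, got {:?}", other),
    }
}
```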

src/tokenizer.rs (+94 -36)

```diff
@@ -627,11 +627,11 @@
                 chars.next(); // consume
                 match chars.peek() {
                     Some('\'') => {
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
+                        let s = self.tokenize_quoted_string(chars, '\'', false)?;
                         Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                     }
                     Some('\"') => {
-                        let s = self.tokenize_quoted_string(chars, '\"')?;
+                        let s = self.tokenize_quoted_string(chars, '\"', false)?;
                         Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                     }
                     _ => {
@@ -646,11 +646,11 @@
                 chars.next(); // consume
                 match chars.peek() {
                     Some('\'') => {
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
+                        let s = self.tokenize_quoted_string(chars, '\'', false)?;
                         Ok(Some(Token::RawStringLiteral(s)))
                     }
                     Some('\"') => {
-                        let s = self.tokenize_quoted_string(chars, '\"')?;
+                        let s = self.tokenize_quoted_string(chars, '\"', false)?;
                         Ok(Some(Token::RawStringLiteral(s)))
                     }
                     _ => {
@@ -666,7 +666,7 @@
                 match chars.peek() {
                     Some('\'') => {
                         // N'...' - a <national character string literal>
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
+                        let s = self.tokenize_quoted_string(chars, '\'', true)?;
                         Ok(Some(Token::NationalStringLiteral(s)))
                     }
                     _ => {
@@ -700,7 +700,7 @@
                 match chars.peek() {
                     Some('\'') => {
                         // X'...' - a <binary string literal>
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
+                        let s = self.tokenize_quoted_string(chars, '\'', true)?;
                         Ok(Some(Token::HexStringLiteral(s)))
                     }
                     _ => {
@@ -712,15 +712,23 @@
             }
             // single quoted string
             '\'' => {
-                let s = self.tokenize_quoted_string(chars, '\'')?;
+                let s = self.tokenize_quoted_string(
+                    chars,
+                    '\'',
+                    self.dialect.supports_string_literal_backslash_escape(),
+                )?;
 
                 Ok(Some(Token::SingleQuotedString(s)))
             }
             // double quoted string
             '\"' if !self.dialect.is_delimited_identifier_start(ch)
                 && !self.dialect.is_identifier_start(ch) =>
             {
-                let s = self.tokenize_quoted_string(chars, '"')?;
+                let s = self.tokenize_quoted_string(
+                    chars,
+                    '"',
+                    self.dialect.supports_string_literal_backslash_escape(),
+                )?;
 
                 Ok(Some(Token::DoubleQuotedString(s)))
             }
@@ -1222,6 +1230,7 @@
         &self,
         chars: &mut State,
         quote_style: char,
+        allow_escape: bool,
     ) -> Result<String, TokenizerError> {
         let mut s = String::new();
         let error_loc = chars.location();
@@ -1243,35 +1252,31 @@
                         return Ok(s);
                     }
                 }
-                '\\' => {
-                    // consume
+                '\\' if allow_escape => {
+                    // consume backslash
                     chars.next();
-                    // slash escaping is specific to MySQL dialect.
-                    if dialect_of!(self is MySqlDialect) {
-                        if let Some(next) = chars.peek() {
-                            if !self.unescape {
-                                // In no-escape mode, the given query has to be saved completely including backslashes.
-                                s.push(ch);
-                                s.push(*next);
-                                chars.next(); // consume next
-                            } else {
-                                // See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
-                                let n = match next {
-                                    '\'' | '\"' | '\\' | '%' | '_' => *next,
-                                    '0' => '\0',
-                                    'b' => '\u{8}',
-                                    'n' => '\n',
-                                    'r' => '\r',
-                                    't' => '\t',
-                                    'Z' => '\u{1a}',
-                                    _ => *next,
-                                };
-                                s.push(n);
-                                chars.next(); // consume next
-                            }
-                        }
-                    } else {
-                        s.push(ch);
-                    }
+
+                    if let Some(next) = chars.peek() {
+                        if !self.unescape {
+                            // In no-escape mode, the given query has to be saved completely including backslashes.
+                            s.push(ch);
+                            s.push(*next);
+                            chars.next(); // consume next
+                        } else {
+                            let n = match next {
+                                '0' => '\0',
+                                'a' => '\u{7}',
+                                'b' => '\u{8}',
+                                'f' => '\u{c}',
+                                'n' => '\n',
+                                'r' => '\r',
+                                't' => '\t',
+                                'Z' => '\u{1a}',
+                                _ => *next,
+                            };
+                            s.push(n);
+                            chars.next(); // consume next
+                        }
+                    }
                 }
                 _ => {
@@ -1517,7 +1522,7 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::dialect::{ClickHouseDialect, MsSqlDialect};
+    use crate::dialect::{BigQueryDialect, ClickHouseDialect, MsSqlDialect};
 
     #[test]
     fn tokenizer_error_impl() {
@@ -2386,4 +2391,57 @@
         check_unescape(r"Hello\0", None);
         check_unescape(r"Hello\xCADRust", None);
     }
+
+    #[test]
+    fn tokenize_quoted_string_escape() {
+        for (sql, expected, expected_unescaped) in [
+            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
+            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
+            (r#"'\\'"#, r#"\\"#, r#"\"#),
+            (
+                r#"'\0\a\b\f\n\r\t\Z'"#,
+                r#"\0\a\b\f\n\r\t\Z"#,
+                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
+            ),
+            (r#"'\"'"#, r#"\""#, "\""),
+            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
+            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
+            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
+        ] {
+            let dialect = BigQueryDialect {};
+
+            let tokens = Tokenizer::new(&dialect, sql)
+                .with_unescape(false)
+                .tokenize()
+                .unwrap();
+            let expected = vec![Token::SingleQuotedString(expected.to_string())];
+            compare(expected, tokens);
+
+            let tokens = Tokenizer::new(&dialect, sql)
+                .with_unescape(true)
+                .tokenize()
+                .unwrap();
+            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
+            compare(expected, tokens);
+        }
+
+        for sql in [r#"'\'"#, r#"'ab\'"#] {
+            let dialect = BigQueryDialect {};
+            let mut tokenizer = Tokenizer::new(&dialect, sql);
+            assert_eq!(
+                "Unterminated string literal",
+                tokenizer.tokenize().unwrap_err().message.as_str(),
+            );
+        }
+
+        // Non-escape dialect
+        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
+            let dialect = GenericDialect {};
+            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
+
+            let expected = vec![Token::SingleQuotedString(expected.to_string())];
+
+            compare(expected, tokens);
+        }
+    }
 }
```
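
Outside the test module, the same dialect-dependent behaviour is observable through the public `Tokenizer` API. A small sketch (not part of the commit), using the default unescaping mode:

```rust
use sqlparser::dialect::{BigQueryDialect, GenericDialect};
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let sql = r"'a\'b'";

    // With an escaping dialect, `\'` stays inside the literal and is
    // unescaped to a plain quote by default.
    let tokens = Tokenizer::new(&BigQueryDialect {}, sql).tokenize().unwrap();
    assert_eq!(tokens, vec![Token::SingleQuotedString("a'b".to_string())]);

    // With a non-escaping dialect, the literal ends at the second quote and
    // the trailing `'` is reported as an unterminated string.
    assert!(Tokenizer::new(&GenericDialect {}, sql).tokenize().is_err());
}
```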
