@@ -627,11 +627,11 @@ impl<'a> Tokenizer<'a> {
                 chars.next(); // consume
                 match chars.peek() {
                     Some('\'') => {
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
+                        let s = self.tokenize_quoted_string(chars, '\'', false)?;
                         Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                     }
                     Some('\"') => {
-                        let s = self.tokenize_quoted_string(chars, '\"')?;
+                        let s = self.tokenize_quoted_string(chars, '\"', false)?;
                         Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                     }
                     _ => {
@@ -646,11 +646,11 @@ impl<'a> Tokenizer<'a> {
                 chars.next(); // consume
                 match chars.peek() {
                     Some('\'') => {
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
+                        let s = self.tokenize_quoted_string(chars, '\'', false)?;
                         Ok(Some(Token::RawStringLiteral(s)))
                     }
                     Some('\"') => {
-                        let s = self.tokenize_quoted_string(chars, '\"')?;
+                        let s = self.tokenize_quoted_string(chars, '\"', false)?;
                         Ok(Some(Token::RawStringLiteral(s)))
                     }
                     _ => {
@@ -666,7 +666,7 @@ impl<'a> Tokenizer<'a> {
                 match chars.peek() {
                     Some('\'') => {
                         // N'...' - a <national character string literal>
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
+                        let s = self.tokenize_quoted_string(chars, '\'', true)?;
                         Ok(Some(Token::NationalStringLiteral(s)))
                     }
                     _ => {
@@ -700,7 +700,7 @@ impl<'a> Tokenizer<'a> {
                 match chars.peek() {
                     Some('\'') => {
                         // X'...' - a <binary string literal>
-                        let s = self.tokenize_quoted_string(chars, '\'')?;
+                        let s = self.tokenize_quoted_string(chars, '\'', true)?;
                         Ok(Some(Token::HexStringLiteral(s)))
                     }
                     _ => {
@@ -712,15 +712,23 @@ impl<'a> Tokenizer<'a> {
             }
             // single quoted string
             '\'' => {
-                let s = self.tokenize_quoted_string(chars, '\'')?;
+                let s = self.tokenize_quoted_string(
+                    chars,
+                    '\'',
+                    self.dialect.supports_string_literal_backslash_escape(),
+                )?;
 
                 Ok(Some(Token::SingleQuotedString(s)))
             }
             // double quoted string
             '\"' if !self.dialect.is_delimited_identifier_start(ch)
                 && !self.dialect.is_identifier_start(ch) =>
             {
-                let s = self.tokenize_quoted_string(chars, '"')?;
+                let s = self.tokenize_quoted_string(
+                    chars,
+                    '"',
+                    self.dialect.supports_string_literal_backslash_escape(),
+                )?;
 
                 Ok(Some(Token::DoubleQuotedString(s)))
             }
@@ -1222,6 +1230,7 @@ impl<'a> Tokenizer<'a> {
         &self,
         chars: &mut State,
         quote_style: char,
+        allow_escape: bool,
     ) -> Result<String, TokenizerError> {
         let mut s = String::new();
         let error_loc = chars.location();
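Note on the API change above: `tokenize_quoted_string` now takes an explicit `allow_escape` flag, so each call site decides whether backslashes start escape sequences — always `false` for byte and raw string literals, always `true` for national and hex literals, and dialect-driven for plain quoted strings. Below is a minimal sketch of the dialect hook those call sites rely on, assuming it is a defaulted method on the `Dialect` trait; the trait shape and the `false` default are assumptions, while the method name and BigQuery's opt-in come from this diff and its tests.

```rust
// Sketch only, not the crate's actual trait definition.
pub trait Dialect {
    /// When true, `\n`, `\t`, and friends inside quoted string literals are
    /// interpreted as escape sequences; when false, a backslash is kept as an
    /// ordinary character. (Assumed to default to false so dialects without
    /// MySQL-style escaping keep their old behavior.)
    fn supports_string_literal_backslash_escape(&self) -> bool {
        false
    }
}

#[derive(Debug, Default)]
pub struct BigQueryDialect;

// BigQuery-style strings interpret backslash escapes, so the dialect opts in.
impl Dialect for BigQueryDialect {
    fn supports_string_literal_backslash_escape(&self) -> bool {
        true
    }
}
```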
@@ -1243,35 +1252,31 @@ impl<'a> Tokenizer<'a> {
                             return Ok(s);
                         }
                     }
-                    '\\' => {
-                        // consume
+                    '\\' if allow_escape => {
+                        // consume backslash
                         chars.next();
-                        // slash escaping is specific to MySQL dialect.
-                        if dialect_of!(self is MySqlDialect) {
-                            if let Some(next) = chars.peek() {
-                                if !self.unescape {
-                                    // In no-escape mode, the given query has to be saved completely including backslashes.
-                                    s.push(ch);
-                                    s.push(*next);
-                                    chars.next(); // consume next
-                                } else {
-                                    // See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
-                                    let n = match next {
-                                        '\'' | '\"' | '\\' | '%' | '_' => *next,
-                                        '0' => '\0',
-                                        'b' => '\u{8}',
-                                        'n' => '\n',
-                                        'r' => '\r',
-                                        't' => '\t',
-                                        'Z' => '\u{1a}',
-                                        _ => *next,
-                                    };
-                                    s.push(n);
-                                    chars.next(); // consume next
-                                }
-                            }
+
+                        if let Some(next) = chars.peek() {
+                            if !self.unescape {
+                                // In no-escape mode, the given query has to be saved completely including backslashes.
+                                s.push(ch);
+                                s.push(*next);
+                                chars.next(); // consume next
+                            } else {
+                                let n = match next {
+                                    '0' => '\0',
+                                    'a' => '\u{7}',
+                                    'b' => '\u{8}',
+                                    'f' => '\u{c}',
+                                    'n' => '\n',
+                                    'r' => '\r',
+                                    't' => '\t',
+                                    'Z' => '\u{1a}',
+                                    _ => *next,
+                                };
+                                s.push(n);
+                                chars.next(); // consume next
                             }
                         }
-                        } else {
-                            s.push(ch);
-                        }
                     }
                     _ => {
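Taken together, the rewritten arm above splits escape handling across two switches: `allow_escape` decides whether a backslash starts an escape at all, and `self.unescape` decides whether a recognized escape is translated or preserved verbatim. A short usage sketch, assuming the crate's usual public paths (`sqlparser::tokenizer::Tokenizer`, `sqlparser::dialect::BigQueryDialect`) and that `unescape` defaults to `true`:

```rust
use sqlparser::dialect::BigQueryDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = BigQueryDialect {};
    let sql = r"'a\tb'";

    // Default mode: the escape is translated, so `\t` becomes a real tab.
    let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
    assert_eq!(tokens[0], Token::SingleQuotedString("a\tb".into()));

    // No-escape mode: the backslash sequence is kept exactly as written.
    let tokens = Tokenizer::new(&dialect, sql)
        .with_unescape(false)
        .tokenize()
        .unwrap();
    assert_eq!(tokens[0], Token::SingleQuotedString(r"a\tb".into()));
}
```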
@@ -1517,7 +1522,7 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::dialect::{ClickHouseDialect, MsSqlDialect};
+    use crate::dialect::{BigQueryDialect, ClickHouseDialect, MsSqlDialect};
 
     #[test]
     fn tokenizer_error_impl() {
@@ -2386,4 +2391,57 @@ mod tests {
         check_unescape(r"Hello\0", None);
         check_unescape(r"Hello\xCADRust", None);
     }
+
+    #[test]
+    fn tokenize_quoted_string_escape() {
+        for (sql, expected, expected_unescaped) in [
+            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
+            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
+            (r#"'\\'"#, r#"\\"#, r#"\"#),
+            (
+                r#"'\0\a\b\f\n\r\t\Z'"#,
+                r#"\0\a\b\f\n\r\t\Z"#,
+                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
+            ),
+            (r#"'\"'"#, r#"\""#, "\""),
+            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
+            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
+            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
+        ] {
+            let dialect = BigQueryDialect {};
+
+            let tokens = Tokenizer::new(&dialect, sql)
+                .with_unescape(false)
+                .tokenize()
+                .unwrap();
+            let expected = vec![Token::SingleQuotedString(expected.to_string())];
+            compare(expected, tokens);
+
+            let tokens = Tokenizer::new(&dialect, sql)
+                .with_unescape(true)
+                .tokenize()
+                .unwrap();
+            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
+            compare(expected, tokens);
+        }
+
+        for sql in [r#"'\'"#, r#"'ab\'"#] {
+            let dialect = BigQueryDialect {};
+            let mut tokenizer = Tokenizer::new(&dialect, sql);
+            assert_eq!(
+                "Unterminated string literal",
+                tokenizer.tokenize().unwrap_err().message.as_str(),
+            );
+        }
+
+        // Non-escape dialect
+        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
+            let dialect = GenericDialect {};
+            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
+
+            let expected = vec![Token::SingleQuotedString(expected.to_string())];
+
+            compare(expected, tokens);
+        }
+    }
 }