@@ -1199,61 +1199,10 @@ impl<'a> Tokenizer<'a> {
1199
1199
starting_loc : Location ,
1200
1200
chars : & mut State ,
1201
1201
) -> Result < String , TokenizerError > {
1202
- let mut s = String :: new ( ) ;
1203
-
1204
- // This case is a bit tricky
1205
-
1206
- chars. next ( ) ; // consume the opening quote
1207
-
1208
- // slash escaping
1209
- let mut is_escaped = false ;
1210
- while let Some ( & ch) = chars. peek ( ) {
1211
- macro_rules! escape_control_character {
1212
- ( $ESCAPED: expr) => { {
1213
- if is_escaped {
1214
- s. push( $ESCAPED) ;
1215
- is_escaped = false ;
1216
- } else {
1217
- s. push( ch) ;
1218
- }
1219
-
1220
- chars. next( ) ;
1221
- } } ;
1222
- }
1223
-
1224
- match ch {
1225
- '\'' => {
1226
- chars. next ( ) ; // consume
1227
- if is_escaped {
1228
- s. push ( ch) ;
1229
- is_escaped = false ;
1230
- } else if chars. peek ( ) . map ( |c| * c == '\'' ) . unwrap_or ( false ) {
1231
- s. push ( ch) ;
1232
- chars. next ( ) ;
1233
- } else {
1234
- return Ok ( s) ;
1235
- }
1236
- }
1237
- '\\' => {
1238
- if is_escaped {
1239
- s. push ( '\\' ) ;
1240
- is_escaped = false ;
1241
- } else {
1242
- is_escaped = true ;
1243
- }
1244
-
1245
- chars. next ( ) ;
1246
- }
1247
- 'r' => escape_control_character ! ( '\r' ) ,
1248
- 'n' => escape_control_character ! ( '\n' ) ,
1249
- 't' => escape_control_character ! ( '\t' ) ,
1250
- _ => {
1251
- is_escaped = false ;
1252
- chars. next ( ) ; // consume
1253
- s. push ( ch) ;
1254
- }
1255
- }
1202
+ if let Some ( s) = unescape_single_quoted_string ( chars) {
1203
+ return Ok ( s) ;
1256
1204
}
1205
+
1257
1206
self . tokenizer_error ( starting_loc, "Unterminated encoded string literal" )
1258
1207
}
1259
1208
@@ -1406,6 +1355,154 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
1406
1355
s
1407
1356
}
1408
1357
1358
+ fn unescape_single_quoted_string ( chars : & mut State < ' _ > ) -> Option < String > {
1359
+ Unescape :: new ( chars) . unescape ( )
1360
+ }
1361
+
1362
+ struct Unescape < ' a : ' b , ' b > {
1363
+ chars : & ' b mut State < ' a > ,
1364
+ }
1365
+
1366
+ impl < ' a : ' b , ' b > Unescape < ' a , ' b > {
1367
+ fn new ( chars : & ' b mut State < ' a > ) -> Self {
1368
+ Self { chars }
1369
+ }
1370
+ fn unescape ( mut self ) -> Option < String > {
1371
+ let mut unescaped = String :: new ( ) ;
1372
+
1373
+ self . chars . next ( ) ;
1374
+
1375
+ while let Some ( c) = self . chars . next ( ) {
1376
+ if c == '\'' {
1377
+ // case: ''''
1378
+ if self . chars . peek ( ) . map ( |c| * c == '\'' ) . unwrap_or ( false ) {
1379
+ self . chars . next ( ) ;
1380
+ unescaped. push ( '\'' ) ;
1381
+ continue ;
1382
+ }
1383
+ return Some ( unescaped) ;
1384
+ }
1385
+
1386
+ if c != '\\' {
1387
+ unescaped. push ( c) ;
1388
+ continue ;
1389
+ }
1390
+
1391
+ let c = match self . chars . next ( ) ? {
1392
+ 'b' => '\u{0008}' ,
1393
+ 'f' => '\u{000C}' ,
1394
+ 'n' => '\n' ,
1395
+ 'r' => '\r' ,
1396
+ 't' => '\t' ,
1397
+ 'u' => self . unescape_unicode_16 ( ) ?,
1398
+ 'U' => self . unescape_unicode_32 ( ) ?,
1399
+ 'x' => self . unescape_hex ( ) ?,
1400
+ c if c. is_digit ( 8 ) => self . unescape_octal ( c) ?,
1401
+ c => c,
1402
+ } ;
1403
+
1404
+ unescaped. push ( Self :: check_null ( c) ?) ;
1405
+ }
1406
+
1407
+ None
1408
+ }
1409
+
1410
+ #[ inline]
1411
+ fn check_null ( c : char ) -> Option < char > {
1412
+ if c == '\0' {
1413
+ None
1414
+ } else {
1415
+ Some ( c)
1416
+ }
1417
+ }
1418
+
1419
+ #[ inline]
1420
+ fn byte_to_char < const RADIX : u32 > ( s : & str ) -> Option < char > {
1421
+ // u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
1422
+ match u32:: from_str_radix ( s, RADIX ) {
1423
+ Err ( _) => None ,
1424
+ Ok ( n) => {
1425
+ let n = n & 0xFF ;
1426
+ if n <= 127 {
1427
+ char:: from_u32 ( n)
1428
+ } else {
1429
+ None
1430
+ }
1431
+ }
1432
+ }
1433
+ }
1434
+
1435
+ // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
1436
+ fn unescape_hex ( & mut self ) -> Option < char > {
1437
+ let mut s = String :: new ( ) ;
1438
+
1439
+ for _ in 0 ..2 {
1440
+ match self . next_hex_digit ( ) {
1441
+ Some ( c) => s. push ( c) ,
1442
+ None => break ,
1443
+ }
1444
+ }
1445
+
1446
+ if s. is_empty ( ) {
1447
+ return Some ( 'x' ) ;
1448
+ }
1449
+
1450
+ Self :: byte_to_char :: < 16 > ( & s)
1451
+ }
1452
+
1453
+ #[ inline]
1454
+ fn next_hex_digit ( & mut self ) -> Option < char > {
1455
+ match self . chars . peek ( ) {
1456
+ Some ( c) if c. is_ascii_hexdigit ( ) => self . chars . next ( ) ,
1457
+ _ => None ,
1458
+ }
1459
+ }
1460
+
1461
+ // Octal byte value. \o, \oo, \ooo (o = 0–7)
1462
+ fn unescape_octal ( & mut self , c : char ) -> Option < char > {
1463
+ let mut s = String :: new ( ) ;
1464
+
1465
+ s. push ( c) ;
1466
+ for _ in 0 ..2 {
1467
+ match self . next_octal_digest ( ) {
1468
+ Some ( c) => s. push ( c) ,
1469
+ None => break ,
1470
+ }
1471
+ }
1472
+
1473
+ Self :: byte_to_char :: < 8 > ( & s)
1474
+ }
1475
+
1476
+ #[ inline]
1477
+ fn next_octal_digest ( & mut self ) -> Option < char > {
1478
+ match self . chars . peek ( ) {
1479
+ Some ( c) if c. is_digit ( 8 ) => self . chars . next ( ) ,
1480
+ _ => None ,
1481
+ }
1482
+ }
1483
+
1484
+ // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
1485
+ fn unescape_unicode_16 ( & mut self ) -> Option < char > {
1486
+ self . unescape_unicode :: < 4 > ( )
1487
+ }
1488
+
1489
+ // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
1490
+ fn unescape_unicode_32 ( & mut self ) -> Option < char > {
1491
+ self . unescape_unicode :: < 8 > ( )
1492
+ }
1493
+
1494
+ fn unescape_unicode < const NUM : usize > ( & mut self ) -> Option < char > {
1495
+ let mut s = String :: new ( ) ;
1496
+ for _ in 0 ..NUM {
1497
+ s. push ( self . chars . next ( ) ?) ;
1498
+ }
1499
+ match u32:: from_str_radix ( & s, 16 ) {
1500
+ Err ( _) => None ,
1501
+ Ok ( n) => char:: from_u32 ( n) ,
1502
+ }
1503
+ }
1504
+ }
1505
+
1409
1506
#[ cfg( test) ]
1410
1507
mod tests {
1411
1508
use super :: * ;
@@ -2139,4 +2236,74 @@ mod tests {
2139
2236
//println!("------------------------------");
2140
2237
assert_eq ! ( expected, actual) ;
2141
2238
}
2239
+
2240
+ fn check_unescape ( s : & str , expected : Option < & str > ) {
2241
+ let s = format ! ( "'{}'" , s) ;
2242
+ let mut state = State {
2243
+ peekable : s. chars ( ) . peekable ( ) ,
2244
+ line : 0 ,
2245
+ col : 0 ,
2246
+ } ;
2247
+
2248
+ assert_eq ! (
2249
+ unescape_single_quoted_string( & mut state) ,
2250
+ expected. map( |s| s. to_string( ) )
2251
+ ) ;
2252
+ }
2253
+
2254
/// Exercises every escape form handled by the single-quoted string decoder.
/// `None` marks inputs that must be rejected.
#[test]
fn test_unescape() {
    // single-character escapes
    check_unescape(r"\b", Some("\u{0008}"));
    check_unescape(r"\f", Some("\u{000C}"));
    check_unescape(r"\t", Some("\t"));
    check_unescape(r"\r\n", Some("\r\n"));
    check_unescape(r"\/", Some("/"));
    check_unescape(r"/", Some("/"));
    check_unescape(r"\\", Some("\\"));

    // 16 and 32-bit hexadecimal Unicode character value
    check_unescape(r"\u0001", Some("\u{0001}"));
    check_unescape(r"\u4c91", Some("\u{4c91}"));
    check_unescape(r"\u4c916", Some("\u{4c91}6"));
    check_unescape(r"\u4c", None);
    check_unescape(r"\u0000", None);
    check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
    check_unescape(r"\U00110000", None);
    check_unescape(r"\U00000000", None);
    check_unescape(r"\u", None);
    check_unescape(r"\U", None);
    check_unescape(r"\U1010FFFF", None);

    // hexadecimal byte value
    check_unescape(r"\x4B", Some("\u{004b}"));
    check_unescape(r"\x4", Some("\u{0004}"));
    check_unescape(r"\x4L", Some("\u{0004}L"));
    check_unescape(r"\x", Some("x"));
    check_unescape(r"\xP", Some("xP"));
    check_unescape(r"\x0", None);
    check_unescape(r"\xCAD", None);
    check_unescape(r"\xA9", None);

    // octal byte value
    check_unescape(r"\1", Some("\u{0001}"));
    check_unescape(r"\12", Some("\u{000a}"));
    check_unescape(r"\123", Some("\u{0053}"));
    check_unescape(r"\1232", Some("\u{0053}2"));
    check_unescape(r"\4", Some("\u{0004}"));
    check_unescape(r"\45", Some("\u{0025}"));
    // 0o450 wraps to its low byte, 0x28
    check_unescape(r"\450", Some("\u{0028}"));
    check_unescape(r"\603", None);
    check_unescape(r"\0", None);
    check_unescape(r"\080", None);

    // others
    check_unescape(r"\9", Some("9"));
    check_unescape(r"''", Some("'"));
    check_unescape(
        r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
        Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
    );
    check_unescape(r"Hello\0", None);
    check_unescape(r"Hello\xCADRust", None);
}
2142
2309
}
0 commit comments