@@ -55,6 +55,8 @@ pub enum Token {
55
55
EscapedStringLiteral ( String ) ,
56
56
/// Hexadecimal string literal: i.e.: X'deadbeef'
57
57
HexStringLiteral ( String ) ,
58
+ /// Unicode escaped string: U&'d\0061t\+000061' (data)
59
+ UnicodeEscapedStringLiteral ( String ) ,
58
60
/// Comma
59
61
Comma ,
60
62
/// Whitespace (space, tab, etc)
@@ -156,6 +158,7 @@ impl fmt::Display for Token {
156
158
Token :: NationalStringLiteral ( ref s) => write ! ( f, "N'{}'" , s) ,
157
159
Token :: EscapedStringLiteral ( ref s) => write ! ( f, "E'{}'" , s) ,
158
160
Token :: HexStringLiteral ( ref s) => write ! ( f, "X'{}'" , s) ,
161
+ Token :: UnicodeEscapedStringLiteral ( ref s) => write ! ( f, "U&'{}'" , s) ,
159
162
Token :: Comma => f. write_str ( "," ) ,
160
163
Token :: Whitespace ( ws) => write ! ( f, "{}" , ws) ,
161
164
Token :: DoubleEq => f. write_str ( "==" ) ,
@@ -415,6 +418,28 @@ impl<'a> Tokenizer<'a> {
415
418
}
416
419
}
417
420
}
421
+ x @ 'u' | x @ 'U' => {
422
+ chars. next ( ) ; // consume, to check the next char
423
+ let mut look_ahead_chars = chars. clone ( ) ;
424
+ if look_ahead_chars. next_if_eq ( & '&' ) . is_some ( ) {
425
+ match look_ahead_chars. peek ( ) {
426
+ Some ( '\'' ) => {
427
+ //Move chars to the position of look_ahead_chars
428
+ chars. next ( ) ;
429
+ // U&'...' - a <binary string literal>
430
+ let s = self . tokenize_single_quoted_string ( chars) ?;
431
+ Ok ( Some ( Token :: UnicodeEscapedStringLiteral ( s) ) )
432
+ }
433
+ _ => {
434
+ let s = self . tokenize_word ( x, chars) ;
435
+ Ok ( Some ( Token :: make_word ( & s, None ) ) )
436
+ }
437
+ }
438
+ } else {
439
+ let s = self . tokenize_word ( x, chars) ;
440
+ Ok ( Some ( Token :: make_word ( & s, None ) ) )
441
+ }
442
+ }
418
443
// identifier or keyword
419
444
ch if self . dialect . is_identifier_start ( ch) => {
420
445
chars. next ( ) ; // consume the first char
@@ -1417,4 +1442,36 @@ mod tests {
1417
1442
//println!("------------------------------");
1418
1443
assert_eq ! ( expected, actual) ;
1419
1444
}
1445
#[test]
fn tokenize_unicode_escaped_literal() {
    // Tokenize `sql` under the generic dialect and compare the result
    // against the expected token stream.
    let check = |sql: &str, expected: Vec<Token>| {
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        compare(expected, tokens);
    };

    // U&'...' is lexed as one Unicode-escaped string literal token.
    check(
        r#"U&'aaa'"#,
        vec![Token::UnicodeEscapedStringLiteral("aaa".to_string())],
    );

    // Without a following quote, "U", "&" and the identifier stay
    // separate tokens.
    check(
        r#"U&a"#,
        vec![
            Token::make_word("U", None),
            Token::Ampersand,
            Token::make_word("a", None),
        ],
    );

    // Whitespace between the pieces also prevents the literal form.
    check(
        r#"U & 'aaa'"#,
        vec![
            Token::make_word("U", None),
            Token::Whitespace(Whitespace::Space),
            Token::Ampersand,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("aaa".to_string()),
        ],
    );
}
1420
1477
}
0 commit comments