@@ -35,7 +35,9 @@ use serde::{Deserialize, Serialize};
 use sqlparser_derive::{Visit, VisitMut};
 
 use crate::ast::DollarQuotedString;
-use crate::dialect::{BigQueryDialect, DuckDbDialect, GenericDialect, SnowflakeDialect};
+use crate::dialect::{
+    BigQueryDialect, DuckDbDialect, GenericDialect, HiveDialect, SnowflakeDialect,
+};
 use crate::dialect::{Dialect, MySqlDialect};
 use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
 
@@ -495,9 +497,32 @@ impl<'a> Tokenizer<'a> {
         Ok(tokens)
     }
 
+    fn tokenize_identifier_or_keyword(
+        &self,
+        ch: String,
+        chars: &mut State,
+    ) -> Result<Option<Token>, TokenizerError> {
+        chars.next(); // consume the first char
+        let word = self.tokenize_word(ch, chars);
+
+        // TODO: implement parsing of exponent here
+        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
+            let mut inner_state = State {
+                peekable: word.chars().peekable(),
+                line: 0,
+                col: 0,
+            };
+            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
+            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
+            s += s2.as_str();
+            return Ok(Some(Token::Number(s, false)));
+        }
+
+        Ok(Some(Token::make_word(&word, None)))
+    }
+
     /// Get the next token or return None
     fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
-        //println!("next_token: {:?}", chars.peek());
         match chars.peek() {
             Some(&ch) => match ch {
                 ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -525,7 +550,7 @@
                         }
                         _ => {
                             // regular identifier starting with an "b" or "B"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -544,7 +569,7 @@
                         }
                         _ => {
                             // regular identifier starting with an "r" or "R"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -560,7 +585,7 @@
                         }
                         _ => {
                             // regular identifier starting with an "N"
-                            let s = self.tokenize_word(n, chars);
+                            let s = self.tokenize_word(n.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -577,7 +602,7 @@
                         }
                         _ => {
                             // regular identifier starting with an "E" or "e"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -594,33 +619,11 @@
                         }
                         _ => {
                             // regular identifier starting with an "X"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
                 }
-                // identifier or keyword
-                ch if self.dialect.is_identifier_start(ch) => {
-                    chars.next(); // consume the first char
-                    let word = self.tokenize_word(ch, chars);
-
-                    // TODO: implement parsing of exponent here
-                    if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
-                        let mut inner_state = State {
-                            peekable: word.chars().peekable(),
-                            line: 0,
-                            col: 0,
-                        };
-                        let mut s = peeking_take_while(&mut inner_state, |ch| {
-                            matches!(ch, '0'..='9' | '.')
-                        });
-                        let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
-                        s += s2.as_str();
-                        return Ok(Some(Token::Number(s, false)));
-                    }
-
-                    Ok(Some(Token::make_word(&word, None)))
-                }
                 // single quoted string
                 '\'' => {
                     let s = self.tokenize_quoted_string(chars, '\'')?;
@@ -714,7 +717,7 @@
 
                     // mysql dialect supports identifiers that start with a numeric prefix,
                     // as long as they aren't an exponent number.
-                    if dialect_of!(self is MySqlDialect) && exponent_part.is_empty() {
+                    if dialect_of!(self is MySqlDialect | HiveDialect) && exponent_part.is_empty() {
                         let word =
                             peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
 
@@ -786,7 +789,18 @@
                 }
                 '+' => self.consume_and_return(chars, Token::Plus),
                 '*' => self.consume_and_return(chars, Token::Mul),
-                '%' => self.consume_and_return(chars, Token::Mod),
+                '%' => {
+                    chars.next(); // advance past '%'
+                    match chars.peek() {
+                        Some(' ') => Ok(Some(Token::Mod)),
+                        Some(sch) if self.dialect.is_identifier_start('%') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
+                        _ => Ok(Some(Token::Mod)),
+                    }
+                }
                 '|' => {
                     chars.next(); // consume the '|'
                     match chars.peek() {
@@ -901,6 +915,12 @@
                                 _ => Ok(Some(Token::HashArrow)),
                             }
                         }
+                        Some(' ') => Ok(Some(Token::Sharp)),
+                        Some(sch) if self.dialect.is_identifier_start('#') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
                         _ => Ok(Some(Token::Sharp)),
                     }
                 }
@@ -909,7 +929,25 @@
                     match chars.peek() {
                         Some('>') => self.consume_and_return(chars, Token::AtArrow),
                         Some('?') => self.consume_and_return(chars, Token::AtQuestion),
-                        Some('@') => self.consume_and_return(chars, Token::AtAt),
+                        Some('@') => {
+                            chars.next();
+                            match chars.peek() {
+                                Some(' ') => Ok(Some(Token::AtAt)),
+                                Some(tch) if self.dialect.is_identifier_start('@') => {
+                                    let mut s = ch.to_string();
+                                    s.push('@');
+                                    s.push_str(&tch.to_string());
+                                    self.tokenize_identifier_or_keyword(s, chars)
+                                }
+                                _ => Ok(Some(Token::AtAt)),
+                            }
+                        }
+                        Some(' ') => Ok(Some(Token::AtSign)),
+                        Some(sch) if self.dialect.is_identifier_start('@') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
                         _ => Ok(Some(Token::AtSign)),
                     }
                 }
@@ -918,6 +956,11 @@
                     let s = peeking_take_while(chars, |ch| ch.is_numeric());
                     Ok(Some(Token::Placeholder(String::from("?") + &s)))
                 }
+
+                // identifier or keyword
+                ch if self.dialect.is_identifier_start(ch) => {
+                    self.tokenize_identifier_or_keyword(ch.to_string(), chars)
+                }
                 '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
 
                 //whitespace check (including unicode chars) should be last as it covers some of the chars above
@@ -1043,8 +1086,8 @@
     }
 
     /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_char: char, chars: &mut State) -> String {
-        let mut s = first_char.to_string();
+    fn tokenize_word(&self, first_chars: String, chars: &mut State) -> String {
+        let mut s = first_chars;
         s.push_str(&peeking_take_while(chars, |ch| {
             self.dialect.is_identifier_part(ch)
         }));
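
A quick sketch (not part of the diff) of how the new paths can be exercised through the crate's public tokenizer API. It assumes a dialect whose `is_identifier_start` accepts `'@'` (GenericDialect's does); the exact token stream depends on the dialect in use.

// sketch.rs -- illustrative only, assumes GenericDialect treats '@' as an identifier start
use sqlparser::dialect::GenericDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = GenericDialect {};

    // With this change, "@@version" reaches tokenize_identifier_or_keyword and
    // comes back as a single word token instead of Token::AtAt plus a word.
    let tokens = Tokenizer::new(&dialect, "SELECT @@version").tokenize().unwrap();
    println!("{tokens:?}");

    // '%' followed by whitespace is unaffected: it still tokenizes as Token::Mod.
    let tokens = Tokenizer::new(&dialect, "5 % 2").tokenize().unwrap();
    assert!(tokens.contains(&Token::Mod));
}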