Commit df45db1

fix: parsing JsonOperator (#913)
1 parent: c8b6e7f
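
The fix makes tokenization of `%`, `#`, and `@` dialect-aware: when the character that follows can start an identifier in the active dialect, the whole run is folded into an identifier through the new `tokenize_identifier_or_keyword` helper; otherwise the usual operator tokens (`Mod`, `Sharp`, `AtSign`, `AtAt`) are emitted, keeping PostgreSQL-style JSON operators such as `#>`, `@?`, and `@@` intact. A minimal sketch of the behavior the new test exercises, using the crate's public `Parser::parse_sql` entry point (the query string is illustrative):

    use sqlparser::dialect::PostgreSqlDialect;
    use sqlparser::parser::Parser;

    fn main() {
        // JSON path operators such as #> should tokenize as operators and
        // come back as Expr::JsonAccess in the parsed statement.
        let statements = Parser::parse_sql(&PostgreSqlDialect {}, "SELECT a #> b").unwrap();
        println!("{:?}", statements[0]);
    }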

File tree

2 files changed: 112 additions, 34 deletions

src/tokenizer.rs

Lines changed: 77 additions & 34 deletions
@@ -35,7 +35,9 @@ use serde::{Deserialize, Serialize};
 use sqlparser_derive::{Visit, VisitMut};

 use crate::ast::DollarQuotedString;
-use crate::dialect::{BigQueryDialect, DuckDbDialect, GenericDialect, SnowflakeDialect};
+use crate::dialect::{
+    BigQueryDialect, DuckDbDialect, GenericDialect, HiveDialect, SnowflakeDialect,
+};
 use crate::dialect::{Dialect, MySqlDialect};
 use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};

@@ -495,9 +497,32 @@ impl<'a> Tokenizer<'a> {
         Ok(tokens)
     }

+    fn tokenize_identifier_or_keyword(
+        &self,
+        ch: String,
+        chars: &mut State,
+    ) -> Result<Option<Token>, TokenizerError> {
+        chars.next(); // consume the first char
+        let word = self.tokenize_word(ch, chars);
+
+        // TODO: implement parsing of exponent here
+        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
+            let mut inner_state = State {
+                peekable: word.chars().peekable(),
+                line: 0,
+                col: 0,
+            };
+            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
+            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
+            s += s2.as_str();
+            return Ok(Some(Token::Number(s, false)));
+        }
+
+        Ok(Some(Token::make_word(&word, None)))
+    }
+
     /// Get the next token or return None
     fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
-        //println!("next_token: {:?}", chars.peek());
         match chars.peek() {
             Some(&ch) => match ch {
                 ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),

@@ -525,7 +550,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "b" or "B"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }

@@ -544,7 +569,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "r" or "R"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }

@@ -560,7 +585,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "N"
-                            let s = self.tokenize_word(n, chars);
+                            let s = self.tokenize_word(n.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }

@@ -577,7 +602,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "E" or "e"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }

@@ -594,33 +619,11 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "X"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
                 }
-                // identifier or keyword
-                ch if self.dialect.is_identifier_start(ch) => {
-                    chars.next(); // consume the first char
-                    let word = self.tokenize_word(ch, chars);
-
-                    // TODO: implement parsing of exponent here
-                    if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
-                        let mut inner_state = State {
-                            peekable: word.chars().peekable(),
-                            line: 0,
-                            col: 0,
-                        };
-                        let mut s = peeking_take_while(&mut inner_state, |ch| {
-                            matches!(ch, '0'..='9' | '.')
-                        });
-                        let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
-                        s += s2.as_str();
-                        return Ok(Some(Token::Number(s, false)));
-                    }
-
-                    Ok(Some(Token::make_word(&word, None)))
-                }
                 // single quoted string
                 '\'' => {
                     let s = self.tokenize_quoted_string(chars, '\'')?;

@@ -714,7 +717,7 @@ impl<'a> Tokenizer<'a> {

                     // mysql dialect supports identifiers that start with a numeric prefix,
                     // as long as they aren't an exponent number.
-                    if dialect_of!(self is MySqlDialect) && exponent_part.is_empty() {
+                    if dialect_of!(self is MySqlDialect | HiveDialect) && exponent_part.is_empty() {
                         let word =
                             peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

@@ -786,7 +789,18 @@ impl<'a> Tokenizer<'a> {
                 }
                 '+' => self.consume_and_return(chars, Token::Plus),
                 '*' => self.consume_and_return(chars, Token::Mul),
-                '%' => self.consume_and_return(chars, Token::Mod),
+                '%' => {
+                    chars.next();
+                    match chars.peek() {
+                        Some(' ') => self.consume_and_return(chars, Token::Mod),
+                        Some(sch) if self.dialect.is_identifier_start('%') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
+                        _ => self.consume_and_return(chars, Token::Mod),
+                    }
+                }
                 '|' => {
                     chars.next(); // consume the '|'
                     match chars.peek() {

@@ -901,6 +915,12 @@ impl<'a> Tokenizer<'a> {
                                 _ => Ok(Some(Token::HashArrow)),
                             }
                         }
+                        Some(' ') => Ok(Some(Token::Sharp)),
+                        Some(sch) if self.dialect.is_identifier_start('#') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
                         _ => Ok(Some(Token::Sharp)),
                     }
                 }

@@ -909,7 +929,25 @@ impl<'a> Tokenizer<'a> {
                     match chars.peek() {
                         Some('>') => self.consume_and_return(chars, Token::AtArrow),
                         Some('?') => self.consume_and_return(chars, Token::AtQuestion),
-                        Some('@') => self.consume_and_return(chars, Token::AtAt),
+                        Some('@') => {
+                            chars.next();
+                            match chars.peek() {
+                                Some(' ') => Ok(Some(Token::AtAt)),
+                                Some(tch) if self.dialect.is_identifier_start('@') => {
+                                    let mut s = ch.to_string();
+                                    s.push('@');
+                                    s.push_str(&tch.to_string());
+                                    self.tokenize_identifier_or_keyword(s, chars)
+                                }
+                                _ => Ok(Some(Token::AtAt)),
+                            }
+                        }
+                        Some(' ') => Ok(Some(Token::AtSign)),
+                        Some(sch) if self.dialect.is_identifier_start('@') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
                         _ => Ok(Some(Token::AtSign)),
                     }
                 }

@@ -918,6 +956,11 @@ impl<'a> Tokenizer<'a> {
                     let s = peeking_take_while(chars, |ch| ch.is_numeric());
                     Ok(Some(Token::Placeholder(String::from("?") + &s)))
                 }
+
+                // identifier or keyword
+                ch if self.dialect.is_identifier_start(ch) => {
+                    self.tokenize_identifier_or_keyword(ch.to_string(), chars)
+                }
                 '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                 //whitespace check (including unicode chars) should be last as it covers some of the chars above

@@ -1043,8 +1086,8 @@ impl<'a> Tokenizer<'a> {
     }

     /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_char: char, chars: &mut State) -> String {
-        let mut s = first_char.to_string();
+    fn tokenize_word(&self, first_chars: String, chars: &mut State) -> String {
+        let mut s = first_chars;
         s.push_str(&peeking_take_while(chars, |ch| {
             self.dialect.is_identifier_part(ch)
         }));
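
For the dialect-sensitive side of the new `%`/`#`/`@` handling, here is a sketch against the public `Tokenizer` API. It assumes `MsSqlDialect` treats `@` as an identifier-start character (T-SQL variables), so `@@VERSION` should tokenize as a single word there, while under `PostgreSqlDialect` an `@@` followed by a space stays the `AtAt` operator:

    use sqlparser::dialect::{MsSqlDialect, PostgreSqlDialect};
    use sqlparser::tokenizer::Tokenizer;

    fn main() {
        // Assumption: `@` can start an identifier in MsSqlDialect, so the new
        // code path folds `@@VERSION` into one word via tokenize_identifier_or_keyword.
        let mut mssql = Tokenizer::new(&MsSqlDialect {}, "SELECT @@VERSION");
        println!("{:?}", mssql.tokenize().unwrap());

        // In PostgreSqlDialect, `@` does not start identifiers, so `a @@ b`
        // still yields Token::AtAt between the two words.
        let mut pg = Tokenizer::new(&PostgreSqlDialect {}, "SELECT a @@ b");
        println!("{:?}", pg.tokenize().unwrap());
    }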

tests/sqlparser_common.rs

Lines changed: 35 additions & 0 deletions
@@ -1113,6 +1113,41 @@ fn parse_unary_math_with_multiply() {
     );
 }

+fn pg_and_generic() -> TestedDialects {
+    TestedDialects {
+        dialects: vec![Box::new(PostgreSqlDialect {}), Box::new(GenericDialect {})],
+        options: None,
+    }
+}
+
+#[test]
+fn parse_json_ops_without_colon() {
+    use self::JsonOperator;
+    let binary_ops = &[
+        ("->", JsonOperator::Arrow, all_dialects()),
+        ("->>", JsonOperator::LongArrow, all_dialects()),
+        ("#>", JsonOperator::HashArrow, pg_and_generic()),
+        ("#>>", JsonOperator::HashLongArrow, pg_and_generic()),
+        ("@>", JsonOperator::AtArrow, all_dialects()),
+        ("<@", JsonOperator::ArrowAt, all_dialects()),
+        ("#-", JsonOperator::HashMinus, pg_and_generic()),
+        ("@?", JsonOperator::AtQuestion, all_dialects()),
+        ("@@", JsonOperator::AtAt, all_dialects()),
+    ];
+
+    for (str_op, op, dialects) in binary_ops {
+        let select = dialects.verified_only_select(&format!("SELECT a {} b", &str_op));
+        assert_eq!(
+            SelectItem::UnnamedExpr(Expr::JsonAccess {
+                left: Box::new(Expr::Identifier(Ident::new("a"))),
+                operator: *op,
+                right: Box::new(Expr::Identifier(Ident::new("b"))),
+            }),
+            select.projection[0]
+        );
+    }
+}
+
 #[test]
 fn parse_is_null() {
     use self::Expr::*;
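
As a standalone cross-check of what the test asserts, parsing one of these operators should produce an `Expr::JsonAccess` with the matching `JsonOperator` variant (a sketch; it prints the whole statement rather than destructuring down to `select.projection[0]`):

    use sqlparser::dialect::GenericDialect;
    use sqlparser::parser::Parser;

    fn main() {
        // Expect the projection to contain
        // Expr::JsonAccess { operator: JsonOperator::AtAt, .. }.
        let statements = Parser::parse_sql(&GenericDialect {}, "SELECT a @@ b").unwrap();
        println!("{:?}", statements[0]);
    }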
