
Commit 7676257

alamb and serprex authored and committed
fix parsing of identifiers after % symbol (apache#927)
1 parent 1badab4 commit 7676257

File tree

3 files changed: +48 −8 lines changed


src/test_utils.rs

+11 −5
@@ -116,6 +116,16 @@ impl TestedDialects {
         only_statement
     }

+    /// Ensures that `sql` parses as an [`Expr`], and that
+    /// re-serializing the parse result produces canonical
+    pub fn expr_parses_to(&self, sql: &str, canonical: &str) -> Expr {
+        let ast = self
+            .run_parser_method(sql, |parser| parser.parse_expr())
+            .unwrap();
+        assert_eq!(canonical, &ast.to_string());
+        ast
+    }
+
     /// Ensures that `sql` parses as a single [Statement], and that
     /// re-serializing the parse result produces the same `sql`
     /// string (is not modified after a serialization round-trip).
@@ -147,11 +157,7 @@ impl TestedDialects {
     /// re-serializing the parse result produces the same `sql`
     /// string (is not modified after a serialization round-trip).
     pub fn verified_expr(&self, sql: &str) -> Expr {
-        let ast = self
-            .run_parser_method(sql, |parser| parser.parse_expr())
-            .unwrap();
-        assert_eq!(sql, &ast.to_string(), "round-tripping without changes");
-        ast
+        self.expr_parses_to(sql, sql)
     }
 }
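As a side note, here is a minimal sketch of how the new helper complements the existing one, written as a hypothetical test. The function name `modulo_expr_examples` is invented, and the imports are assumed to be the same ones tests/sqlparser_common.rs already uses (TestedDialects, the dialect types, Expr, and so on); `verified_expr` asserts an exact round-trip, while `expr_parses_to` lets the input differ from the canonical output.

#[test]
fn modulo_expr_examples() {
    // Built the same way as pg_and_generic() in tests/sqlparser_common.rs.
    let dialects = TestedDialects {
        dialects: vec![Box::new(PostgreSqlDialect {}), Box::new(GenericDialect {})],
    };

    // verified_expr: the SQL must already be canonical and survive a
    // serialization round-trip unchanged.
    dialects.verified_expr("a % b");

    // expr_parses_to: the SQL may be non-canonical; only the re-serialized
    // output is compared against the expected canonical string.
    dialects.expr_parses_to("a1%b1", "a1 % b1");
}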

src/tokenizer.rs

+5 −3
@@ -424,6 +424,7 @@ struct State<'a> {
 }

 impl<'a> State<'a> {
+    /// return the next character and advance the stream
     pub fn next(&mut self) -> Option<char> {
         match self.peekable.next() {
             None => None,
@@ -439,6 +440,7 @@ impl<'a> State<'a> {
         }
     }

+    /// return the next character but do not advance the stream
     pub fn peek(&mut self) -> Option<&char> {
         self.peekable.peek()
     }
@@ -849,13 +851,13 @@ impl<'a> Tokenizer<'a> {
             '+' => self.consume_and_return(chars, Token::Plus),
             '*' => self.consume_and_return(chars, Token::Mul),
             '%' => {
-                chars.next();
+                chars.next(); // advance past '%'
                 match chars.peek() {
-                    Some(' ') => self.consume_and_return(chars, Token::Mod),
+                    Some(' ') => Ok(Some(Token::Mod)),
                     Some(sch) if self.dialect.is_identifier_start('%') => {
                         self.tokenize_identifier_or_keyword([ch, *sch], chars)
                     }
-                    _ => self.consume_and_return(chars, Token::Mod),
+                    _ => Ok(Some(Token::Mod)),
                 }
             }
             '|' => {
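To illustrate what the '%' branch change buys, here is a hedged, self-contained sketch using the crate's public Tokenizer API (Tokenizer::new and tokenize do exist in sqlparser). The fn main wrapper and the input string are assumptions for illustration; the point is that, with the fix, the character after '%' is no longer consumed, so the identifier on the right of the operator survives tokenization.

use sqlparser::dialect::GenericDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = GenericDialect {};
    // With the fix, '%' emits Token::Mod without eating the next character,
    // so "a1%b1" keeps both identifiers intact.
    let tokens = Tokenizer::new(&dialect, "a1%b1").tokenize().unwrap();
    let significant: Vec<&Token> = tokens
        .iter()
        .filter(|t| !matches!(t, Token::Whitespace(_)))
        .collect();
    // Expected shape: a Word for "a1", Token::Mod, a Word for "b1".
    println!("{significant:?}");
}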

tests/sqlparser_common.rs

+32 −0
@@ -1143,6 +1143,20 @@ fn parse_unary_math_with_multiply() {
     );
 }

+#[test]
+fn parse_mod() {
+    use self::Expr::*;
+    let sql = "a % b";
+    assert_eq!(
+        BinaryOp {
+            left: Box::new(Identifier(Ident::new("a"))),
+            op: BinaryOperator::Modulo,
+            right: Box::new(Identifier(Ident::new("b"))),
+        },
+        verified_expr(sql)
+    );
+}
+
 fn pg_and_generic() -> TestedDialects {
     TestedDialects {
         dialects: vec![Box::new(PostgreSqlDialect {}), Box::new(GenericDialect {})],
@@ -1178,6 +1192,24 @@ fn parse_json_ops_without_colon() {
     }
 }

+#[test]
+fn parse_mod_no_spaces() {
+    use self::Expr::*;
+    let canonical = "a1 % b1";
+    let sqls = ["a1 % b1", "a1% b1", "a1 %b1", "a1%b1"];
+    for sql in sqls {
+        println!("Parsing {sql}");
+        assert_eq!(
+            BinaryOp {
+                left: Box::new(Identifier(Ident::new("a1"))),
+                op: BinaryOperator::Modulo,
+                right: Box::new(Identifier(Ident::new("b1"))),
+            },
+            pg_and_generic().expr_parses_to(sql, canonical)
+        );
+    }
+}
+
 #[test]
 fn parse_is_null() {
     use self::Expr::*;
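Finally, a hedged end-to-end sketch of the behavior these tests pin down, using the crate's public Parser::parse_sql entry point; the SELECT statement and its expected canonical form are assumptions chosen to line up with the canonical string in parse_mod_no_spaces above, not part of the commit itself.

use sqlparser::dialect::GenericDialect;
use sqlparser::parser::Parser;

fn main() {
    let dialect = GenericDialect {};
    // Before this commit the tokenizer swallowed the character after '%',
    // so "a1%b1" could not round-trip as `a1 % b1`.
    let statements = Parser::parse_sql(&dialect, "SELECT a1%b1").unwrap();
    // Re-serializing normalizes the spacing around the modulo operator.
    assert_eq!(statements[0].to_string(), "SELECT a1 % b1");
    println!("{}", statements[0]);
}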
