Skip to content

added parsing for PostgreSQL operations #267

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 30, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions src/ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,11 @@ pub enum Expr {
right: Box<Expr>,
},
/// Unary operation e.g. `NOT foo`
UnaryOp { op: UnaryOperator, expr: Box<Expr> },
UnaryOp {
op: UnaryOperator,
expr: Box<Expr>,
infix: bool,
},
/// CAST an expression to a different data type e.g. `CAST(foo AS VARCHAR(123))`
Cast {
expr: Box<Expr>,
Expand Down Expand Up @@ -282,7 +286,13 @@ impl fmt::Display for Expr {
high
),
Expr::BinaryOp { left, op, right } => write!(f, "{} {} {}", left, op, right),
Expr::UnaryOp { op, expr } => write!(f, "{} {}", op, expr),
Expr::UnaryOp { op, expr, infix } => {
if *infix {
write!(f, "{}{}", expr, op)
} else {
write!(f, "{} {}", op, expr)
}
}
Expr::Cast { expr, data_type } => write!(f, "CAST({} AS {})", expr, data_type),
Expr::Extract { field, expr } => write!(f, "EXTRACT({} FROM {})", field, expr),
Expr::Collate { expr, collation } => write!(f, "{} COLLATE {}", expr, collation),
Expand Down
18 changes: 18 additions & 0 deletions src/ast/operator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ pub enum UnaryOperator {
Plus,
Minus,
Not,
PGBitwiseNot,
PGSqrt,
PGCbrt,
PGFactorial,
PGInfixFactorial,
PGAbs,
}

impl fmt::Display for UnaryOperator {
Expand All @@ -29,6 +35,12 @@ impl fmt::Display for UnaryOperator {
UnaryOperator::Plus => "+",
UnaryOperator::Minus => "-",
UnaryOperator::Not => "NOT",
UnaryOperator::PGBitwiseNot => "~",
UnaryOperator::PGSqrt => "|/",
UnaryOperator::PGCbrt => "||/",
UnaryOperator::PGFactorial => "!",
UnaryOperator::PGInfixFactorial => "!!",
UnaryOperator::PGAbs => "@",
})
}
}
Expand Down Expand Up @@ -56,6 +68,9 @@ pub enum BinaryOperator {
BitwiseOr,
BitwiseAnd,
BitwiseXor,
PGBitwiseXor,
PGBitwiseShiftLeft,
PGBitwiseShiftRight,
}

impl fmt::Display for BinaryOperator {
Expand All @@ -80,6 +95,9 @@ impl fmt::Display for BinaryOperator {
BinaryOperator::BitwiseOr => "|",
BinaryOperator::BitwiseAnd => "&",
BinaryOperator::BitwiseXor => "^",
BinaryOperator::PGBitwiseXor => "#",
BinaryOperator::PGBitwiseShiftLeft => "<<",
BinaryOperator::PGBitwiseShiftRight => ">>",
})
}
}
40 changes: 39 additions & 1 deletion src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ impl<'a> Parser<'a> {
Keyword::NOT => Ok(Expr::UnaryOp {
op: UnaryOperator::Not,
expr: Box::new(self.parse_subexpr(Self::UNARY_NOT_PREC)?),
infix: false,
}),
// Here `w` is a word, check if it's a part of a multi-part
// identifier, a function call, or a simple identifier:
Expand Down Expand Up @@ -283,6 +284,31 @@ impl<'a> Parser<'a> {
},
}, // End of Token::Word
Token::Mult => Ok(Expr::Wildcard),
Token::Tilde => Ok(Expr::UnaryOp {
op: UnaryOperator::PGBitwiseNot,
expr: Box::new(self.parse_subexpr(0)?),
infix: false,
}),
Token::DoubleExclamationMark => Ok(Expr::UnaryOp {
op: UnaryOperator::PGInfixFactorial,
expr: Box::new(self.parse_subexpr(0)?),
infix: false,
}),
Token::SquareRoot => Ok(Expr::UnaryOp {
op: UnaryOperator::PGSqrt,
expr: Box::new(self.parse_subexpr(0)?),
infix: false,
}),
Token::CubeRoot => Ok(Expr::UnaryOp {
op: UnaryOperator::PGCbrt,
expr: Box::new(self.parse_subexpr(0)?),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably not right, as this parses WHERE ||/27 = 3 as ||/ ( 27=3 ), which is not a valid expression. I don't see precedence mentioned in the documentation you linked to, and I think a better guess would be to default all the unary operators to the PLUS_MINUS_PREC (moving the new ops to the tok @ case below).

infix: false,
}),
Token::Ampersat => Ok(Expr::UnaryOp {
op: UnaryOperator::PGAbs,
expr: Box::new(self.parse_subexpr(0)?),
infix: false,
}),
tok @ Token::Minus | tok @ Token::Plus => {
let op = if tok == Token::Plus {
UnaryOperator::Plus
Expand All @@ -292,6 +318,7 @@ impl<'a> Parser<'a> {
Ok(Expr::UnaryOp {
op,
expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?),
infix: false,
})
}
Token::Number(_)
Expand Down Expand Up @@ -658,6 +685,9 @@ impl<'a> Parser<'a> {
Token::Caret => Some(BinaryOperator::BitwiseXor),
Token::Ampersand => Some(BinaryOperator::BitwiseAnd),
Token::Div => Some(BinaryOperator::Divide),
Token::ShiftLeft => Some(BinaryOperator::PGBitwiseShiftLeft),
Token::ShiftRight => Some(BinaryOperator::PGBitwiseShiftRight),
Token::Sharp => Some(BinaryOperator::PGBitwiseXor),
Token::Word(w) => match w.keyword {
Keyword::AND => Some(BinaryOperator::And),
Keyword::OR => Some(BinaryOperator::Or),
Expand Down Expand Up @@ -707,6 +737,13 @@ impl<'a> Parser<'a> {
}
} else if Token::DoubleColon == tok {
self.parse_pg_cast(expr)
} else if Token::ExclamationMark == tok {
// PostgreSQL factorial operation
Ok(Expr::UnaryOp {
Comment on lines 737 to +741
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally we'd consider how postfix factorial interacted with other operators (the postfix ::type, another factorial, and prefix operators), but I don't think this has to block merging this PR.

op: UnaryOperator::PGFactorial,
expr: Box::new(expr),
infix: true,
})
} else {
// Can only happen if `get_next_precedence` got out of sync with this function
panic!("No infix parser for token {:?}", tok)
Expand Down Expand Up @@ -785,11 +822,12 @@ impl<'a> Parser<'a> {
Token::Word(w) if w.keyword == Keyword::LIKE => Ok(Self::BETWEEN_PREC),
Token::Eq | Token::Lt | Token::LtEq | Token::Neq | Token::Gt | Token::GtEq => Ok(20),
Token::Pipe => Ok(21),
Token::Caret => Ok(22),
Token::Caret | Token::Sharp | Token::ShiftRight | Token::ShiftLeft => Ok(22),
Token::Ampersand => Ok(23),
Token::Plus | Token::Minus => Ok(Self::PLUS_MINUS_PREC),
Token::Mult | Token::Div | Token::Mod | Token::StringConcat => Ok(40),
Token::DoubleColon => Ok(50),
Token::ExclamationMark => Ok(50),
_ => Ok(0),
}
}
Expand Down
121 changes: 118 additions & 3 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use std::str::Chars;

use super::dialect::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use super::dialect::Dialect;
use super::dialect::PostgreSqlDialect;
use super::dialect::SnowflakeDialect;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -54,7 +55,7 @@ pub enum Token {
Neq,
/// Less Than operator `<`
Lt,
/// Greater han operator `>`
/// Greater Than operator `>`
Gt,
/// Less Than Or Equals operator `<=`
LtEq,
Expand Down Expand Up @@ -102,6 +103,24 @@ pub enum Token {
RBrace,
/// Right Arrow `=>`
RArrow,
/// Sharp `#` use for PostgreSQL Bitwise XOR operator
Sharp,
/// Tilde `~` use for PostgreSQL Bitwise NOT operator
Tilde,
/// Bitwise left operator `<<` use for PostgreSQL
ShiftLeft,
/// Bitwise right operator `>>` use for PostgreSQL
ShiftRight,
/// Exclamation Mark `!` use for PostgreSQL factorial operator
ExclamationMark,
/// Exclamation Mark `!!` use for PostgreSQL prefix factorial operator
DoubleExclamationMark,
/// Ampersat `@` use for PostgreSQL abs operator
Ampersat,
/// PostgreSQL square root math operator
SquareRoot,
/// PostgreSQL cube root math operator
CubeRoot,
}

impl fmt::Display for Token {
Expand Down Expand Up @@ -143,6 +162,15 @@ impl fmt::Display for Token {
Token::LBrace => f.write_str("{"),
Token::RBrace => f.write_str("}"),
Token::RArrow => f.write_str("=>"),
Token::Sharp => f.write_str("#"),
Token::ExclamationMark => f.write_str("!"),
Token::DoubleExclamationMark => f.write_str("!!"),
Token::Tilde => f.write_str("~"),
Token::Ampersat => f.write_str("@"),
Token::ShiftLeft => f.write_str("<<"),
Token::ShiftRight => f.write_str(">>"),
Token::SquareRoot => f.write_str("|/"),
Token::CubeRoot => f.write_str("||/"),
}
}
}
Expand Down Expand Up @@ -406,7 +434,18 @@ impl<'a> Tokenizer<'a> {
'|' => {
chars.next(); // consume the '|'
match chars.peek() {
Some('|') => self.consume_and_return(chars, Token::StringConcat),
Some('/') if dialect_of!(self is PostgreSqlDialect) => {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nickolay, am I right that this is not going to work with a custom dialect, as you suggest here #243 (comment)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sorry, but I don't follow. In the comment you mention I said I didn't think that simply allowing $ to start an identifier matched what the PostgreSQL documentation said about this.

In this PR you make |/ conditionally parse as SquareRoot, which makes sense to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have custom Dialect https://github.com/alex-dukhno/database/blob/master/src/parser/src/lib.rs#L221-L231, is it true that it wouldn't parse |/ into SquareRoot because of if dialect_of!(self is PostgreSqlDialect)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see now. Yes, I assumed |/ is something very Postgres-specific, so limiting it to PostgreSqlDialect (and GenericDialect, which I didn't mention explicitly) seemed to make sense.

In the older comment you're referring to I described a stop-gap measure that would be used until someone figured out the proper way to deal with $... in the parser.

We could make the new operators parse unconditionally for now if that makes your life easier, but without a clear use-case (and accompanying testcase) this will likely break in the future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's better to have dialect_of checks and have a solution for dealing with $.... Having my knowledge of crate I'd suggest to add Variable(Ident) or something similar to ast::Expr enum. It has to be done in a separate PR of course. WDYT?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have @-mentioned you in #265, where this discussion can continue.

The hard part is describing what specifically we're trying to implement and why. Your suggestion, for example, solves the problem of representing $ (but not another leader) followed by an identifier (quoted or not; consisting of characters from an unspecified set) anywhere an arbitrary expression is allowed (but not in more restricted contexts, like OFFSET __ ROWS). I'm not sure if this matches Postgres or the "common denominator" of the usual dialects; answering that requires some research. From the examples I've seen so far, a new Value variant with {leader, ident} fields might be more appropriate.

self.consume_and_return(chars, Token::SquareRoot)
}
Some('|') => {
chars.next(); // consume the second '|'
match chars.peek() {
Some('/') if dialect_of!(self is PostgreSqlDialect) => {
self.consume_and_return(chars, Token::CubeRoot)
}
_ => Ok(Some(Token::StringConcat)),
}
}
// Bitshift '|' operator
_ => Ok(Some(Token::Pipe)),
}
Expand All @@ -423,21 +462,28 @@ impl<'a> Tokenizer<'a> {
chars.next(); // consume
match chars.peek() {
Some('=') => self.consume_and_return(chars, Token::Neq),
_ => self.tokenizer_error("Expected to see '=' after '!' character"),
Some('!') if dialect_of!(self is PostgreSqlDialect) => {
self.consume_and_return(chars, Token::DoubleExclamationMark)
}
_ => Ok(Some(Token::ExclamationMark)),
}
}
'<' => {
chars.next(); // consume
match chars.peek() {
Some('=') => self.consume_and_return(chars, Token::LtEq),
Some('>') => self.consume_and_return(chars, Token::Neq),
Some('<') if dialect_of!(self is PostgreSqlDialect) => {
self.consume_and_return(chars, Token::ShiftLeft)
}
_ => Ok(Some(Token::Lt)),
}
}
'>' => {
chars.next(); // consume
match chars.peek() {
Some('=') => self.consume_and_return(chars, Token::GtEq),
Some('>') => self.consume_and_return(chars, Token::ShiftRight),
_ => Ok(Some(Token::Gt)),
}
}
Expand All @@ -464,6 +510,15 @@ impl<'a> Tokenizer<'a> {
comment,
})))
}
'~' if dialect_of!(self is PostgreSqlDialect) => {
self.consume_and_return(chars, Token::Tilde)
}
'#' if dialect_of!(self is PostgreSqlDialect) => {
self.consume_and_return(chars, Token::Sharp)
}
'@' if dialect_of!(self is PostgreSqlDialect) => {
self.consume_and_return(chars, Token::Ampersat)
}
other => self.consume_and_return(chars, Token::Char(other)),
},
None => Ok(None),
Expand Down Expand Up @@ -586,6 +641,7 @@ mod tests {
use super::super::dialect::GenericDialect;
use super::super::dialect::MsSqlDialect;
use super::*;
use crate::dialect::PostgreSqlDialect;

#[test]
fn tokenize_select_1() {
Expand Down Expand Up @@ -958,6 +1014,65 @@ mod tests {
compare(expected, tokens);
}

#[test]
fn tokenize_postgresql_bitwise_operations() {
let sql = String::from("SELECT ~one << two # three >> four");
let dialect = PostgreSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();

let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Tilde,
Token::make_word("one", None),
Token::Whitespace(Whitespace::Space),
Token::ShiftLeft,
Token::Whitespace(Whitespace::Space),
Token::make_word("two", None),
Token::Whitespace(Whitespace::Space),
Token::Sharp,
Token::Whitespace(Whitespace::Space),
Token::make_word("three", None),
Token::Whitespace(Whitespace::Space),
Token::ShiftRight,
Token::Whitespace(Whitespace::Space),
Token::make_word("four", None),
];

compare(expected, tokens);
}

#[test]
fn tokenize_postgresql_math_operations() {
let sql = String::from("SELECT !!5 5! @-6 |/4 ||/8");
let dialect = PostgreSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();

let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::DoubleExclamationMark,
Token::Number("5".to_string()),
Token::Whitespace(Whitespace::Space),
Token::Number("5".to_string()),
Token::ExclamationMark,
Token::Whitespace(Whitespace::Space),
Token::Ampersat,
Token::Minus,
Token::Number("6".to_string()),
Token::Whitespace(Whitespace::Space),
Token::SquareRoot,
Token::Number("4".to_string()),
Token::Whitespace(Whitespace::Space),
Token::CubeRoot,
Token::Number("8".to_string()),
];

compare(expected, tokens);
}

fn compare(expected: Vec<Token>, actual: Vec<Token>) {
//println!("------------------------------");
//println!("tokens = {:?}", actual);
Expand Down
Loading