-
Notifications
You must be signed in to change notification settings - Fork 605
Support Dialect
level precedence, update Postgres Dialect
to match Postgres
#1360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,12 +24,13 @@ mod redshift; | |
mod snowflake; | ||
mod sqlite; | ||
|
||
use crate::ast::{Expr, Statement}; | ||
use core::any::{Any, TypeId}; | ||
use core::fmt::Debug; | ||
use core::iter::Peekable; | ||
use core::str::Chars; | ||
|
||
use log::debug; | ||
|
||
pub use self::ansi::AnsiDialect; | ||
pub use self::bigquery::BigQueryDialect; | ||
pub use self::clickhouse::ClickHouseDialect; | ||
|
@@ -44,7 +45,10 @@ pub use self::redshift::RedshiftSqlDialect; | |
pub use self::snowflake::SnowflakeDialect; | ||
pub use self::sqlite::SQLiteDialect; | ||
pub use crate::keywords; | ||
use crate::ast::{Expr, Statement}; | ||
use crate::parser::{Parser, ParserError}; | ||
use crate::keywords::Keyword; | ||
use crate::tokenizer::Token; | ||
|
||
#[cfg(not(feature = "std"))] | ||
use alloc::boxed::Box; | ||
|
@@ -300,13 +304,172 @@ pub trait Dialect: Debug + Any { | |
// return None to fall back to the default behavior | ||
None | ||
} | ||
|
||
/// Get the precedence of the next token. This "full" method means all precedence logic and remain | ||
/// in the dialect. while still allowing overriding the `get_next_precedence` method with the option to | ||
/// fallback to the default behavior. | ||
/// | ||
/// Higher number => higher precedence | ||
fn get_next_precedence_full(&self, parser: &Parser) -> Result<u8, ParserError> { | ||
if let Some(precedence) = self.get_next_precedence(parser) { | ||
return precedence; | ||
} | ||
|
||
let token = parser.peek_token(); | ||
debug!("get_next_precedence() {:?}", token); | ||
match token.token { | ||
Token::Word(w) if w.keyword == Keyword::OR => Ok(OR_PREC), | ||
Token::Word(w) if w.keyword == Keyword::AND => Ok(AND_PREC), | ||
Token::Word(w) if w.keyword == Keyword::XOR => Ok(XOR_PREC), | ||
|
||
Token::Word(w) if w.keyword == Keyword::AT => { | ||
match ( | ||
parser.peek_nth_token(1).token, | ||
parser.peek_nth_token(2).token, | ||
) { | ||
(Token::Word(w), Token::Word(w2)) | ||
if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE => | ||
{ | ||
Ok(AT_TZ_PREC) | ||
} | ||
_ => Ok(UNKNOWN_PREC), | ||
} | ||
} | ||
|
||
Token::Word(w) if w.keyword == Keyword::NOT => match parser.peek_nth_token(1).token { | ||
// The precedence of NOT varies depending on keyword that | ||
// follows it. If it is followed by IN, BETWEEN, or LIKE, | ||
// it takes on the precedence of those tokens. Otherwise, it | ||
// is not an infix operator, and therefore has zero | ||
// precedence. | ||
Token::Word(w) if w.keyword == Keyword::IN => Ok(BETWEEN_PREC), | ||
Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(BETWEEN_PREC), | ||
Token::Word(w) if w.keyword == Keyword::LIKE => Ok(LIKE_PREC), | ||
Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(LIKE_PREC), | ||
Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(LIKE_PREC), | ||
Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(LIKE_PREC), | ||
Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(LIKE_PREC), | ||
_ => Ok(UNKNOWN_PREC), | ||
}, | ||
Token::Word(w) if w.keyword == Keyword::IS => Ok(IS_PREC), | ||
Token::Word(w) if w.keyword == Keyword::IN => Ok(BETWEEN_PREC), | ||
Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(BETWEEN_PREC), | ||
Token::Word(w) if w.keyword == Keyword::LIKE => Ok(LIKE_PREC), | ||
Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(LIKE_PREC), | ||
Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(LIKE_PREC), | ||
Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(LIKE_PREC), | ||
Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(LIKE_PREC), | ||
Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(BETWEEN_PREC), | ||
Token::Word(w) if w.keyword == Keyword::DIV => Ok(MUL_DIV_MOD_OP_PREC), | ||
Token::Eq | ||
| Token::Lt | ||
| Token::LtEq | ||
| Token::Neq | ||
| Token::Gt | ||
| Token::GtEq | ||
| Token::DoubleEq | ||
| Token::Tilde | ||
| Token::TildeAsterisk | ||
| Token::ExclamationMarkTilde | ||
| Token::ExclamationMarkTildeAsterisk | ||
| Token::DoubleTilde | ||
| Token::DoubleTildeAsterisk | ||
| Token::ExclamationMarkDoubleTilde | ||
| Token::ExclamationMarkDoubleTildeAsterisk | ||
| Token::Spaceship => Ok(EQ_PREC), | ||
Token::Pipe => Ok(PIPE_PREC), | ||
Token::Caret | Token::Sharp | Token::ShiftRight | Token::ShiftLeft => Ok(CARET_PREC), | ||
Token::Ampersand => Ok(AMPERSAND_PREC), | ||
Token::Plus | Token::Minus => Ok(PLUS_MINUS_PREC), | ||
Token::Mul | Token::Div | Token::DuckIntDiv | Token::Mod | Token::StringConcat => { | ||
Ok(MUL_DIV_MOD_OP_PREC) | ||
} | ||
Token::DoubleColon | ||
| Token::ExclamationMark | ||
| Token::LBracket | ||
| Token::Overlap | ||
| Token::CaretAt => Ok(DOUBLE_COLON_PREC), | ||
// Token::Colon if (self as dyn Dialect).is::<SnowflakeDialect>() => Ok(DOUBLE_COLON_PREC), | ||
Token::Arrow | ||
| Token::LongArrow | ||
| Token::HashArrow | ||
| Token::HashLongArrow | ||
| Token::AtArrow | ||
| Token::ArrowAt | ||
| Token::HashMinus | ||
| Token::AtQuestion | ||
| Token::AtAt | ||
| Token::Question | ||
| Token::QuestionAnd | ||
| Token::QuestionPipe | ||
| Token::CustomBinaryOperator(_) => Ok(PG_OTHER_PREC), | ||
_ => Ok(UNKNOWN_PREC), | ||
} | ||
} | ||
|
||
/// Dialect-specific statement parser override | ||
fn parse_statement(&self, _parser: &mut Parser) -> Option<Result<Statement, ParserError>> { | ||
// return None to fall back to the default behavior | ||
None | ||
} | ||
|
||
/// The following precedence values are used directly by `Parse` or in dialects, | ||
/// so have to be made public by the dialect. | ||
fn prec_double_colon(&self) -> u8 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these are the precedence values that either |
||
DOUBLE_COLON_PREC | ||
} | ||
|
||
fn prec_mul_div_mod_op(&self) -> u8 { | ||
MUL_DIV_MOD_OP_PREC | ||
} | ||
|
||
fn prec_plus_minus(&self) -> u8 { | ||
PLUS_MINUS_PREC | ||
} | ||
|
||
fn prec_between(&self) -> u8 { | ||
BETWEEN_PREC | ||
} | ||
|
||
fn prec_like(&self) -> u8 { | ||
LIKE_PREC | ||
} | ||
|
||
fn prec_unary_not(&self) -> u8 { | ||
UNARY_NOT_PREC | ||
} | ||
|
||
fn prec_unknown(&self) -> u8 { | ||
UNKNOWN_PREC | ||
} | ||
} | ||
|
||
// Define the lexical Precedence of operators. | ||
// | ||
// Uses (APPROXIMATELY) <https://www.postgresql.org/docs/7.0/operators.htm#AEN2026> as a reference | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This statement really isn't true, hence I added We could rewrite to "was originally inspired by" or something? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is fine |
||
// higher number = higher precedence | ||
// | ||
// NOTE: The pg documentation is incomplete, e.g. the AT TIME ZONE operator | ||
// actually has higher precedence than addition. | ||
// See <https://postgrespro.com/list/thread-id/2673331>. | ||
const DOUBLE_COLON_PREC: u8 = 50; | ||
const AT_TZ_PREC: u8 = 41; | ||
const MUL_DIV_MOD_OP_PREC: u8 = 40; | ||
const PLUS_MINUS_PREC: u8 = 30; | ||
const XOR_PREC: u8 = 24; | ||
const AMPERSAND_PREC: u8 = 23; | ||
const CARET_PREC: u8 = 22; | ||
const PIPE_PREC: u8 = 21; | ||
const BETWEEN_PREC: u8 = 20; | ||
const EQ_PREC: u8 = 20; | ||
const LIKE_PREC: u8 = 19; | ||
const IS_PREC: u8 = 17; | ||
const PG_OTHER_PREC: u8 = 16; | ||
const UNARY_NOT_PREC: u8 = 15; | ||
const AND_PREC: u8 = 10; | ||
const OR_PREC: u8 = 5; | ||
const UNKNOWN_PREC: u8 = 0; | ||
|
||
impl dyn Dialect { | ||
#[inline] | ||
pub fn is<T: Dialect>(&self) -> bool { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
use log::debug; | ||
|
||
use crate::ast::{CommentObject, Statement}; | ||
use crate::dialect::Dialect; | ||
|
@@ -20,6 +21,23 @@ use crate::tokenizer::Token; | |
#[derive(Debug)] | ||
pub struct PostgreSqlDialect {} | ||
|
||
const DOUBLE_COLON_PREC: u8 = 140; | ||
const BRACKET_PREC: u8 = 130; | ||
const COLLATE_PREC: u8 = 120; | ||
const AT_TZ_PREC: u8 = 110; | ||
const CARET_PREC: u8 = 100; | ||
const MUL_DIV_MOD_OP_PREC: u8 = 90; | ||
const PLUS_MINUS_PREC: u8 = 80; | ||
// there's no XOR operator in PostgreSQL, but support it here to avoid breaking tests | ||
const XOR_PREC: u8 = 75; | ||
Comment on lines
+31
to
+32
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure what to do about this, if we remove There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah I think we should just leave it in unless there is a compelling reason to take it out |
||
const PG_OTHER_PREC: u8 = 70; | ||
const BETWEEN_LIKE_PREC: u8 = 60; | ||
const EQ_PREC: u8 = 50; | ||
const IS_PREC: u8 = 40; | ||
const NOT_PREC: u8 = 30; | ||
const AND_PREC: u8 = 20; | ||
const OR_PREC: u8 = 10; | ||
|
||
impl Dialect for PostgreSqlDialect { | ||
fn identifier_quote_style(&self, _identifier: &str) -> Option<char> { | ||
Some('"') | ||
|
@@ -67,6 +85,102 @@ impl Dialect for PostgreSqlDialect { | |
) | ||
} | ||
|
||
fn get_next_precedence(&self, parser: &Parser) -> Option<Result<u8, ParserError>> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alamb @samuelcolvin : wouldn't it be better to have a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that would:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I also think the https://doc.rust-lang.org/std/option/enum.Option.html#method.transpose I don't have any particular preference for one over the other |
||
let token = parser.peek_token(); | ||
debug!("get_next_precedence() {:?}", token); | ||
|
||
let precedence = match token.token { | ||
Token::Word(w) if w.keyword == Keyword::OR => OR_PREC, | ||
Token::Word(w) if w.keyword == Keyword::XOR => XOR_PREC, | ||
Token::Word(w) if w.keyword == Keyword::AND => AND_PREC, | ||
Token::Word(w) if w.keyword == Keyword::AT => { | ||
match ( | ||
parser.peek_nth_token(1).token, | ||
parser.peek_nth_token(2).token, | ||
) { | ||
(Token::Word(w), Token::Word(w2)) | ||
if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE => | ||
{ | ||
AT_TZ_PREC | ||
} | ||
_ => self.prec_unknown(), | ||
} | ||
} | ||
|
||
Token::Word(w) if w.keyword == Keyword::NOT => match parser.peek_nth_token(1).token { | ||
// The precedence of NOT varies depending on keyword that | ||
// follows it. If it is followed by IN, BETWEEN, or LIKE, | ||
// it takes on the precedence of those tokens. Otherwise, it | ||
// is not an infix operator, and therefore has zero | ||
// precedence. | ||
Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC, | ||
_ => self.prec_unknown(), | ||
}, | ||
Token::Word(w) if w.keyword == Keyword::IS => IS_PREC, | ||
Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::OPERATOR => BETWEEN_LIKE_PREC, | ||
Token::Word(w) if w.keyword == Keyword::DIV => MUL_DIV_MOD_OP_PREC, | ||
Token::Word(w) if w.keyword == Keyword::COLLATE => COLLATE_PREC, | ||
Token::Eq | ||
| Token::Lt | ||
| Token::LtEq | ||
| Token::Neq | ||
| Token::Gt | ||
| Token::GtEq | ||
| Token::DoubleEq | ||
| Token::Tilde | ||
| Token::TildeAsterisk | ||
| Token::ExclamationMarkTilde | ||
| Token::ExclamationMarkTildeAsterisk | ||
| Token::DoubleTilde | ||
| Token::DoubleTildeAsterisk | ||
| Token::ExclamationMarkDoubleTilde | ||
| Token::ExclamationMarkDoubleTildeAsterisk | ||
| Token::Spaceship => EQ_PREC, | ||
Token::Caret => CARET_PREC, | ||
Token::Plus | Token::Minus => PLUS_MINUS_PREC, | ||
Token::Mul | Token::Div | Token::Mod => MUL_DIV_MOD_OP_PREC, | ||
Token::DoubleColon => DOUBLE_COLON_PREC, | ||
Token::LBracket => BRACKET_PREC, | ||
Token::Arrow | ||
| Token::LongArrow | ||
| Token::HashArrow | ||
| Token::HashLongArrow | ||
| Token::AtArrow | ||
| Token::ArrowAt | ||
| Token::HashMinus | ||
| Token::AtQuestion | ||
| Token::AtAt | ||
| Token::Question | ||
| Token::QuestionAnd | ||
| Token::QuestionPipe | ||
| Token::ExclamationMark | ||
| Token::Overlap | ||
| Token::CaretAt | ||
| Token::StringConcat | ||
| Token::Sharp | ||
| Token::ShiftRight | ||
| Token::ShiftLeft | ||
| Token::Pipe | ||
| Token::Ampersand | ||
| Token::CustomBinaryOperator(_) => PG_OTHER_PREC, | ||
_ => self.prec_unknown(), | ||
}; | ||
Some(Ok(precedence)) | ||
Comment on lines
+88
to
+181
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I fear a little bit for the added maintenance burden here. What do you think @alamb ? There are still many SQL tokens to add to sqlparser, and there is a chance people will add them only in the generic dialect, leaving postgres silently broken... Isn't there ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do think dialect divergence is a possibly and likely made more so by this PR In fact, in some ways the point of this PR is to start to diverge the dialects with more accurate precedence rules 🤔 The only way I know to guard against this divergence is with test coverage for all new features (which we are pretty good about in general). Do you have any other thoughts about how to improve the situation @lovasoa ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we could change postgres' There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Something like fn get_next_precedence(&self, parser: &Parser) -> Option<Result<u8, ParserError>> {
if some_particular_case { return some_particular_precedence }
GenericDialect.get_next_precedence(parser)
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @lovasoa I agree it's unfortunate, there's so much duplication in the the postgres dialect. However your The reason the Postgres dialect is completely reimplementing the logic is that it has to change the precedence values for virtually every token to match Postgres. We can either:
These would avoid the need to duplicate the logic in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Implemented in #1379. |
||
} | ||
|
||
fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> { | ||
if parser.parse_keyword(Keyword::COMMENT) { | ||
Some(parse_comment(parser)) | ||
|
@@ -82,6 +196,26 @@ impl Dialect for PostgreSqlDialect { | |
fn supports_group_by_expr(&self) -> bool { | ||
true | ||
} | ||
|
||
fn prec_mul_div_mod_op(&self) -> u8 { | ||
MUL_DIV_MOD_OP_PREC | ||
} | ||
|
||
fn prec_plus_minus(&self) -> u8 { | ||
PLUS_MINUS_PREC | ||
} | ||
|
||
fn prec_between(&self) -> u8 { | ||
BETWEEN_LIKE_PREC | ||
} | ||
|
||
fn prec_like(&self) -> u8 { | ||
BETWEEN_LIKE_PREC | ||
} | ||
|
||
fn prec_unary_not(&self) -> u8 { | ||
NOT_PREC | ||
} | ||
} | ||
|
||
pub fn parse_comment(parser: &mut Parser) -> Result<Statement, ParserError> { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is moved to
Dialect
so dialects can change the precedence without all the numeric constants being shared between modules.it's implemented as a separate function from
get_next_precedence
:Snowflake
.