Skip to content

Simple custom lexical precedence in PostgreSQL dialect #1379

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 81 additions & 108 deletions src/dialect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -354,13 +354,18 @@ pub trait Dialect: Debug + Any {
if let Some(precedence) = self.get_next_precedence(parser) {
return precedence;
}
macro_rules! p {
($precedence:ident) => {
self.prec_value(Precedence::$precedence)
};
}

let token = parser.peek_token();
debug!("get_next_precedence_full() {:?}", token);
match token.token {
Token::Word(w) if w.keyword == Keyword::OR => Ok(OR_PREC),
Token::Word(w) if w.keyword == Keyword::AND => Ok(AND_PREC),
Token::Word(w) if w.keyword == Keyword::XOR => Ok(XOR_PREC),
Token::Word(w) if w.keyword == Keyword::OR => Ok(p!(Or)),
Token::Word(w) if w.keyword == Keyword::AND => Ok(p!(And)),
Token::Word(w) if w.keyword == Keyword::XOR => Ok(p!(Xor)),

Token::Word(w) if w.keyword == Keyword::AT => {
match (
Expand All @@ -370,9 +375,9 @@ pub trait Dialect: Debug + Any {
(Token::Word(w), Token::Word(w2))
if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE =>
{
Ok(AT_TZ_PREC)
Ok(p!(AtTz))
}
_ => Ok(UNKNOWN_PREC),
_ => Ok(self.prec_unknown()),
}
}

Expand All @@ -382,25 +387,25 @@ pub trait Dialect: Debug + Any {
// it takes on the precedence of those tokens. Otherwise, it
// is not an infix operator, and therefore has zero
// precedence.
Token::Word(w) if w.keyword == Keyword::IN => Ok(BETWEEN_PREC),
Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(BETWEEN_PREC),
Token::Word(w) if w.keyword == Keyword::LIKE => Ok(LIKE_PREC),
Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(LIKE_PREC),
Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(LIKE_PREC),
Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(LIKE_PREC),
Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(LIKE_PREC),
_ => Ok(UNKNOWN_PREC),
Token::Word(w) if w.keyword == Keyword::IN => Ok(p!(Between)),
Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(p!(Between)),
Token::Word(w) if w.keyword == Keyword::LIKE => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)),
_ => Ok(self.prec_unknown()),
},
Token::Word(w) if w.keyword == Keyword::IS => Ok(IS_PREC),
Token::Word(w) if w.keyword == Keyword::IN => Ok(BETWEEN_PREC),
Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(BETWEEN_PREC),
Token::Word(w) if w.keyword == Keyword::LIKE => Ok(LIKE_PREC),
Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(LIKE_PREC),
Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(LIKE_PREC),
Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(LIKE_PREC),
Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(LIKE_PREC),
Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(BETWEEN_PREC),
Token::Word(w) if w.keyword == Keyword::DIV => Ok(MUL_DIV_MOD_OP_PREC),
Token::Word(w) if w.keyword == Keyword::IS => Ok(p!(Is)),
Token::Word(w) if w.keyword == Keyword::IN => Ok(p!(Between)),
Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(p!(Between)),
Token::Word(w) if w.keyword == Keyword::LIKE => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(p!(Between)),
Token::Word(w) if w.keyword == Keyword::DIV => Ok(p!(MulDivModOp)),
Token::Eq
| Token::Lt
| Token::LtEq
Expand All @@ -416,20 +421,19 @@ pub trait Dialect: Debug + Any {
| Token::DoubleTildeAsterisk
| Token::ExclamationMarkDoubleTilde
| Token::ExclamationMarkDoubleTildeAsterisk
| Token::Spaceship => Ok(EQ_PREC),
Token::Pipe => Ok(PIPE_PREC),
Token::Caret | Token::Sharp | Token::ShiftRight | Token::ShiftLeft => Ok(CARET_PREC),
Token::Ampersand => Ok(AMPERSAND_PREC),
Token::Plus | Token::Minus => Ok(PLUS_MINUS_PREC),
| Token::Spaceship => Ok(p!(Eq)),
Token::Pipe => Ok(p!(Pipe)),
Token::Caret | Token::Sharp | Token::ShiftRight | Token::ShiftLeft => Ok(p!(Caret)),
Token::Ampersand => Ok(p!(Ampersand)),
Token::Plus | Token::Minus => Ok(p!(PlusMinus)),
Token::Mul | Token::Div | Token::DuckIntDiv | Token::Mod | Token::StringConcat => {
Ok(MUL_DIV_MOD_OP_PREC)
Ok(p!(MulDivModOp))
}
Token::DoubleColon
| Token::ExclamationMark
| Token::LBracket
| Token::Overlap
| Token::CaretAt => Ok(DOUBLE_COLON_PREC),
// Token::Colon if (self as dyn Dialect).is::<SnowflakeDialect>() => Ok(DOUBLE_COLON_PREC),
| Token::CaretAt => Ok(p!(DoubleColon)),
Token::Arrow
| Token::LongArrow
| Token::HashArrow
Expand All @@ -442,8 +446,8 @@ pub trait Dialect: Debug + Any {
| Token::Question
| Token::QuestionAnd
| Token::QuestionPipe
| Token::CustomBinaryOperator(_) => Ok(PG_OTHER_PREC),
_ => Ok(UNKNOWN_PREC),
| Token::CustomBinaryOperator(_) => Ok(p!(PgOther)),
_ => Ok(self.prec_unknown()),
}
}

Expand All @@ -457,88 +461,57 @@ pub trait Dialect: Debug + Any {
None
}

// The following precedence values are used directly by `Parse` or in dialects,
// so have to be made public by the dialect.

/// Return the precedence of the `::` operator.
/// Decide the lexical Precedence of operators.
///
/// Default is 50.
fn prec_double_colon(&self) -> u8 {
DOUBLE_COLON_PREC
}

/// Return the precedence of `*`, `/`, and `%` operators.
///
/// Default is 40.
fn prec_mul_div_mod_op(&self) -> u8 {
MUL_DIV_MOD_OP_PREC
}

/// Return the precedence of the `+` and `-` operators.
///
/// Default is 30.
fn prec_plus_minus(&self) -> u8 {
PLUS_MINUS_PREC
}

/// Return the precedence of the `BETWEEN` operator.
///
/// For example `BETWEEN <low> AND <high>`
///
/// Default is 22.
fn prec_between(&self) -> u8 {
BETWEEN_PREC
}

/// Return the precedence of the `LIKE` operator.
///
/// Default is 19.
fn prec_like(&self) -> u8 {
LIKE_PREC
}

/// Return the precedence of the unary `NOT` operator.
///
/// For example `NOT (a OR b)`
///
/// Default is 15.
fn prec_unary_not(&self) -> u8 {
UNARY_NOT_PREC
/// Uses (APPROXIMATELY) <https://www.postgresql.org/docs/7.0/operators.htm#AEN2026> as a reference
fn prec_value(&self, prec: Precedence) -> u8 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a good idea.

My only potential concern with this approach is that it could be potentially slower, as it now has many more function calls, however I don't actually think we have a sqlparser benchmark so at this point so trying to make such a change would likely be a pre-mature optimization.

If anyone needs additional performance, we can do that as follow on PRs

match prec {
Precedence::DoubleColon => 50,
Precedence::AtTz => 41,
Precedence::MulDivModOp => 40,
Precedence::PlusMinus => 30,
Precedence::Xor => 24,
Precedence::Ampersand => 23,
Precedence::Caret => 22,
Precedence::Pipe => 21,
Precedence::Between => 20,
Precedence::Eq => 20,
Precedence::Like => 19,
Precedence::Is => 17,
Precedence::PgOther => 16,
Precedence::UnaryNot => 15,
Precedence::And => 10,
Precedence::Or => 5,
}
}

/// Return the default (unknown) precedence.
///
/// Default is 0.
fn prec_unknown(&self) -> u8 {
UNKNOWN_PREC
0
}
}

// Define the lexical Precedence of operators.
//
// Uses (APPROXIMATELY) <https://www.postgresql.org/docs/7.0/operators.htm#AEN2026> as a reference
// higher number = higher precedence
//
// NOTE: The pg documentation is incomplete, e.g. the AT TIME ZONE operator
// actually has higher precedence than addition.
// See <https://postgrespro.com/list/thread-id/2673331>.
const DOUBLE_COLON_PREC: u8 = 50;
const AT_TZ_PREC: u8 = 41;
const MUL_DIV_MOD_OP_PREC: u8 = 40;
const PLUS_MINUS_PREC: u8 = 30;
const XOR_PREC: u8 = 24;
const AMPERSAND_PREC: u8 = 23;
const CARET_PREC: u8 = 22;
const PIPE_PREC: u8 = 21;
const BETWEEN_PREC: u8 = 20;
const EQ_PREC: u8 = 20;
const LIKE_PREC: u8 = 19;
const IS_PREC: u8 = 17;
const PG_OTHER_PREC: u8 = 16;
const UNARY_NOT_PREC: u8 = 15;
const AND_PREC: u8 = 10;
const OR_PREC: u8 = 5;
const UNKNOWN_PREC: u8 = 0;
/// This represents the operators for which precedence must be defined
///
/// higher number -> higher precedence
#[derive(Debug, Clone, Copy)]
pub enum Precedence {
DoubleColon,
AtTz,
MulDivModOp,
PlusMinus,
Xor,
Ampersand,
Caret,
Pipe,
Between,
Eq,
Like,
Is,
PgOther,
UnaryNot,
And,
Or,
}

impl dyn Dialect {
#[inline]
Expand Down
118 changes: 28 additions & 90 deletions src/dialect/postgresql.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
use log::debug;

use crate::ast::{CommentObject, Statement};
use crate::dialect::Dialect;
use crate::dialect::{Dialect, Precedence};
use crate::keywords::Keyword;
use crate::parser::{Parser, ParserError};
use crate::tokenizer::Token;
Expand Down Expand Up @@ -89,71 +89,11 @@ impl Dialect for PostgreSqlDialect {
let token = parser.peek_token();
debug!("get_next_precedence() {:?}", token);

let precedence = match token.token {
Token::Word(w) if w.keyword == Keyword::OR => OR_PREC,
Token::Word(w) if w.keyword == Keyword::XOR => XOR_PREC,
Token::Word(w) if w.keyword == Keyword::AND => AND_PREC,
Token::Word(w) if w.keyword == Keyword::AT => {
match (
parser.peek_nth_token(1).token,
parser.peek_nth_token(2).token,
) {
(Token::Word(w), Token::Word(w2))
if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE =>
{
AT_TZ_PREC
}
_ => self.prec_unknown(),
}
}

Token::Word(w) if w.keyword == Keyword::NOT => match parser.peek_nth_token(1).token {
// The precedence of NOT varies depending on keyword that
// follows it. If it is followed by IN, BETWEEN, or LIKE,
// it takes on the precedence of those tokens. Otherwise, it
// is not an infix operator, and therefore has zero
// precedence.
Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC,
_ => self.prec_unknown(),
},
Token::Word(w) if w.keyword == Keyword::IS => IS_PREC,
Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::OPERATOR => BETWEEN_LIKE_PREC,
Token::Word(w) if w.keyword == Keyword::DIV => MUL_DIV_MOD_OP_PREC,
Token::Word(w) if w.keyword == Keyword::COLLATE => COLLATE_PREC,
Token::Eq
| Token::Lt
| Token::LtEq
| Token::Neq
| Token::Gt
| Token::GtEq
| Token::DoubleEq
| Token::Tilde
| Token::TildeAsterisk
| Token::ExclamationMarkTilde
| Token::ExclamationMarkTildeAsterisk
| Token::DoubleTilde
| Token::DoubleTildeAsterisk
| Token::ExclamationMarkDoubleTilde
| Token::ExclamationMarkDoubleTildeAsterisk
| Token::Spaceship => EQ_PREC,
Token::Caret => CARET_PREC,
Token::Plus | Token::Minus => PLUS_MINUS_PREC,
Token::Mul | Token::Div | Token::Mod => MUL_DIV_MOD_OP_PREC,
Token::DoubleColon => DOUBLE_COLON_PREC,
Token::LBracket => BRACKET_PREC,
// we only return some custom value here when the behaviour (not merely the numeric value) differs
// from the default implementation
match token.token {
Token::Word(w) if w.keyword == Keyword::COLLATE => Some(Ok(COLLATE_PREC)),
Token::LBracket => Some(Ok(BRACKET_PREC)),
Token::Arrow
| Token::LongArrow
| Token::HashArrow
Expand All @@ -173,12 +113,9 @@ impl Dialect for PostgreSqlDialect {
| Token::Sharp
| Token::ShiftRight
| Token::ShiftLeft
| Token::Pipe
| Token::Ampersand
| Token::CustomBinaryOperator(_) => PG_OTHER_PREC,
_ => self.prec_unknown(),
};
Some(Ok(precedence))
| Token::CustomBinaryOperator(_) => Some(Ok(PG_OTHER_PREC)),
_ => None,
}
}

fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
Expand All @@ -197,24 +134,25 @@ impl Dialect for PostgreSqlDialect {
true
}

fn prec_mul_div_mod_op(&self) -> u8 {
MUL_DIV_MOD_OP_PREC
}

fn prec_plus_minus(&self) -> u8 {
PLUS_MINUS_PREC
}

fn prec_between(&self) -> u8 {
BETWEEN_LIKE_PREC
}

fn prec_like(&self) -> u8 {
BETWEEN_LIKE_PREC
}

fn prec_unary_not(&self) -> u8 {
NOT_PREC
fn prec_value(&self, prec: Precedence) -> u8 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❤️

match prec {
Precedence::DoubleColon => DOUBLE_COLON_PREC,
Precedence::AtTz => AT_TZ_PREC,
Precedence::MulDivModOp => MUL_DIV_MOD_OP_PREC,
Precedence::PlusMinus => PLUS_MINUS_PREC,
Precedence::Xor => XOR_PREC,
Precedence::Ampersand => PG_OTHER_PREC,
Precedence::Caret => CARET_PREC,
Precedence::Pipe => PG_OTHER_PREC,
Precedence::Between => BETWEEN_LIKE_PREC,
Precedence::Eq => EQ_PREC,
Precedence::Like => BETWEEN_LIKE_PREC,
Precedence::Is => IS_PREC,
Precedence::PgOther => PG_OTHER_PREC,
Precedence::UnaryNot => NOT_PREC,
Precedence::And => AND_PREC,
Precedence::Or => OR_PREC,
}
}
}

Expand Down
Loading
Loading