Skip to content

Add support for first, last aggregate function parsing #880

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions src/ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,10 @@ pub enum Expr {
ListAgg(ListAgg),
/// The `ARRAY_AGG` function `SELECT ARRAY_AGG(... ORDER BY ...)`
ArrayAgg(ArrayAgg),
/// The `FIRST` function `SELECT FIRST(... ORDER BY ...)`
FIRST(FirstAgg),
/// The `LAST` function `SELECT LAST(... ORDER BY ...)`
LAST(LastAgg),
/// The `GROUPING SETS` expr.
GroupingSets(Vec<Vec<Expr>>),
/// The `CUBE` expr.
Expand Down Expand Up @@ -787,6 +791,8 @@ impl fmt::Display for Expr {
Expr::ArraySubquery(s) => write!(f, "ARRAY({s})"),
Expr::ListAgg(listagg) => write!(f, "{listagg}"),
Expr::ArrayAgg(arrayagg) => write!(f, "{arrayagg}"),
Expr::FIRST(first) => write!(f, "{first}"),
Expr::LAST(last) => write!(f, "{last}"),
Expr::GroupingSets(sets) => {
write!(f, "GROUPING SETS (")?;
let mut sep = "";
Expand Down Expand Up @@ -3549,6 +3555,66 @@ impl fmt::Display for ArrayAgg {
}
}

/// An `FIRST` invocation `FIRST( [ DISTINCT ] <expr> [ORDER BY <expr>] [LIMIT <n>] )`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these new structs follow the existing patterns in this file but they seem very specific to me (as in do we really need two new structs that are almost identical to each other as well as to Function.

Did you consider adding an order_by clause to Function and adjusting the parse_function function?

So something like

pub struct Function {
    pub name: ObjectName,
    pub args: Vec<FunctionArg>,
    pub over: Option<WindowSpec>,
    // aggregate functions may specify eg `COUNT(DISTINCT x)`
    pub distinct: bool,
    // Some functions must be called without trailing parentheses, for example Postgres
    // do it for current_catalog, current_schema, etc. This flags is used for formatting.
    pub special: bool,
    // optional ORDER BY expression <----------- Proposed Addition
    pub order_by: Option<Box<OrderByExpr>>,
}

I think this would both reduce the code required for this PR and make the codebase easier to maintain, as well as would extend naturally to other aggregate functions (e.g. databricks offers the first_value function in addition to first)

Adding an order_by to Function would allow users to parse any function with an order_by clause which I think would be quite valuable

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't consider, this. However, it seems like a better approach to me. I will experiment with it, then let you know about result. Thanks for the suggestion.

/// Or `FIRST( [ DISTINCT ] <expr> ) [ WITHIN GROUP ( ORDER BY <expr> ) ]`
/// ORDER BY position is defined differently for BigQuery, Postgres and Snowflake.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct FirstAgg {
pub expr: Box<Expr>,
pub order_by: Option<Box<OrderByExpr>>,
pub within_group: bool, // order by is used inside a within group or not
}

impl fmt::Display for FirstAgg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "FIRST({}", self.expr)?;
if !self.within_group {
if let Some(order_by) = &self.order_by {
write!(f, " ORDER BY {order_by}")?;
}
}
write!(f, ")")?;
if self.within_group {
if let Some(order_by) = &self.order_by {
write!(f, " WITHIN GROUP (ORDER BY {order_by})")?;
}
}
Ok(())
}
}

/// An `LAST` invocation `LAST( [ DISTINCT ] <expr> [ORDER BY <expr>] [LIMIT <n>] )`
/// Or `LAST( [ DISTINCT ] <expr> ) [ WITHIN GROUP ( ORDER BY <expr> ) ]`
/// ORDER BY position is defined differently for BigQuery, Postgres and Snowflake.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct LastAgg {
pub expr: Box<Expr>,
pub order_by: Option<Box<OrderByExpr>>,
pub within_group: bool, // order by is used inside a within group or not
}

impl fmt::Display for LastAgg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "LAST({}", self.expr)?;
if !self.within_group {
if let Some(order_by) = &self.order_by {
write!(f, " ORDER BY {order_by}")?;
}
}
write!(f, ")")?;
if self.within_group {
if let Some(order_by) = &self.order_by {
write!(f, " WITHIN GROUP (ORDER BY {order_by})")?;
}
}
Ok(())
}
}

#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
Expand Down
78 changes: 78 additions & 0 deletions src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,8 @@ impl<'a> Parser<'a> {
self.parse_array_subquery()
}
Keyword::ARRAY_AGG => self.parse_array_agg_expr(),
Keyword::FIRST => self.parse_first_agg_expr(),
Keyword::LAST => self.parse_last_agg_expr(),
Keyword::NOT => self.parse_not(),
Keyword::MATCH if dialect_of!(self is MySqlDialect | GenericDialect) => {
self.parse_match_against()
Expand Down Expand Up @@ -1410,6 +1412,82 @@ impl<'a> Parser<'a> {
}))
}

pub fn parse_first_agg_expr(&mut self) -> Result<Expr, ParserError> {
self.expect_token(&Token::LParen)?;
let expr = Box::new(self.parse_expr()?);
// ANSI SQL and BigQuery define ORDER BY inside function.
if !self.dialect.supports_within_after_array_aggregation() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand the call to supports_within_after_array_aggregation here as this is not an array aggregate.

let order_by = if self.parse_keywords(&[Keyword::ORDER, Keyword::BY]) {
let order_by_expr = self.parse_order_by_expr()?;
Some(Box::new(order_by_expr))
} else {
None
};
self.expect_token(&Token::RParen)?;
return Ok(Expr::FIRST(FirstAgg {
expr,
order_by,
within_group: false,
}));
}
// Snowflake defines ORDERY BY in within group instead of inside the function like
// ANSI SQL.
self.expect_token(&Token::RParen)?;
let within_group = if self.parse_keywords(&[Keyword::WITHIN, Keyword::GROUP]) {
self.expect_token(&Token::LParen)?;
self.expect_keywords(&[Keyword::ORDER, Keyword::BY])?;
let order_by_expr = self.parse_order_by_expr()?;
self.expect_token(&Token::RParen)?;
Some(Box::new(order_by_expr))
} else {
None
};

Ok(Expr::FIRST(FirstAgg {
expr,
order_by: within_group,
within_group: true,
}))
}

pub fn parse_last_agg_expr(&mut self) -> Result<Expr, ParserError> {
self.expect_token(&Token::LParen)?;
let expr = Box::new(self.parse_expr()?);
// ANSI SQL and BigQuery define ORDER BY inside function.
if !self.dialect.supports_within_after_array_aggregation() {
let order_by = if self.parse_keywords(&[Keyword::ORDER, Keyword::BY]) {
let order_by_expr = self.parse_order_by_expr()?;
Some(Box::new(order_by_expr))
} else {
None
};
self.expect_token(&Token::RParen)?;
return Ok(Expr::LAST(LastAgg {
expr,
order_by,
within_group: false,
}));
}
// Snowflake defines ORDERY BY in within group instead of inside the function like
// ANSI SQL.
self.expect_token(&Token::RParen)?;
let within_group = if self.parse_keywords(&[Keyword::WITHIN, Keyword::GROUP]) {
self.expect_token(&Token::LParen)?;
self.expect_keywords(&[Keyword::ORDER, Keyword::BY])?;
let order_by_expr = self.parse_order_by_expr()?;
self.expect_token(&Token::RParen)?;
Some(Box::new(order_by_expr))
} else {
None
};

Ok(Expr::LAST(LastAgg {
expr,
order_by: within_group,
within_group: true,
}))
}

// This function parses date/time fields for the EXTRACT function-like
// operator, interval qualifiers, and the ceil/floor operations.
// EXTRACT supports a wider set of date/time fields than interval qualifiers,
Expand Down
21 changes: 21 additions & 0 deletions tests/sqlparser_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2070,6 +2070,27 @@ fn parse_array_agg_func() {
}
}

#[test]
fn parse_first_last_func() {
let supported_dialects = TestedDialects {
dialects: vec![
Box::new(GenericDialect {}),
Box::new(PostgreSqlDialect {}),
Box::new(MsSqlDialect {}),
Box::new(AnsiDialect {}),
Box::new(HiveDialect {}),
],
options: None,
};

for sql in [
"SELECT FIRST(x ORDER BY x) AS a FROM T",
"SELECT LAST(x ORDER BY x) AS a FROM T",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

given there is code above to handle WITHIN GROUP during parsing, can you either: 1) Remove that code, or 2) add test coverage for it?

        // Snowflake defines ORDERY BY in within group instead of inside the function like
        // ANSI SQL.
        self.expect_token(&Token::RParen)?;
        let within_group = if self.parse_keywords(&[Keyword::WITHIN, Keyword::GROUP]) {
            self.expect_token(&Token::LParen)?;
            self.expect_keywords(&[Keyword::ORDER, Keyword::BY])?;
            let order_by_expr = self.parse_order_by_expr()?;
            self.expect_token(&Token::RParen)?;
            Some(Box::new(order_by_expr))
        } else {
            None
        };

] {
supported_dialects.verified_stmt(sql);
}
}

#[test]
fn parse_create_table() {
let sql = "CREATE TABLE uk_cities (\
Expand Down