diff --git a/src/ast/ddl.rs b/src/ast/ddl.rs
index 31e216a9f..2d1778c7b 100644
--- a/src/ast/ddl.rs
+++ b/src/ast/ddl.rs
@@ -26,7 +26,7 @@ use sqlparser_derive::{Visit, VisitMut};
 use crate::ast::value::escape_single_quote_string;
 use crate::ast::{
     display_comma_separated, display_separated, DataType, Expr, Ident, MySQLColumnPosition,
-    ObjectName, ProjectionSelect, SequenceOptions, SqlOption,
+    ObjectName, OrderByExpr, ProjectionSelect, SequenceOptions, SqlOption, Value,
 };
 
 use crate::tokenizer::Token;
@@ -1417,3 +1417,30 @@ impl fmt::Display for Deduplicate {
         }
     }
 }
+
+/// Hive supports `CLUSTERED BY` statement in `CREATE TABLE`.
+/// Syntax: `CLUSTERED BY (col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS`
+///
+/// [Hive](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable)
+#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
+pub struct ClusteredBy {
+    pub columns: Vec<Ident>,
+    pub sorted_by: Option<Vec<OrderByExpr>>,
+    pub num_buckets: Value,
+}
+
+impl fmt::Display for ClusteredBy {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(
+            f,
+            "CLUSTERED BY ({})",
+            display_comma_separated(&self.columns)
+        )?;
+        if let Some(ref sorted_by) = self.sorted_by {
+            write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
+        }
+        write!(f, " INTO {} BUCKETS", self.num_buckets)
+    }
+}
diff --git a/src/ast/dml.rs b/src/ast/dml.rs
index aad7d2e22..95ed9f00e 100644
--- a/src/ast/dml.rs
+++ b/src/ast/dml.rs
@@ -22,11 +22,11 @@ use sqlparser_derive::{Visit, VisitMut};
 pub use super::ddl::{ColumnDef, TableConstraint};
 
 use super::{
-    display_comma_separated, display_separated, CommentDef, Expr, FileFormat, FromTable,
-    HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident, InsertAliases,
-    MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens, OrderByExpr, Query,
-    RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine, TableWithJoins, Tag,
-    WrappedCollection,
+    display_comma_separated, display_separated, ClusteredBy, CommentDef, Expr, FileFormat,
+    FromTable, HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident,
+    InsertAliases, MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens,
+    OrderByExpr, Query, RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine,
+    TableWithJoins, Tag, WrappedCollection,
 };
 
 /// CREATE INDEX statement.
@@ -140,6 +140,9 @@ pub struct CreateTable {
     /// BigQuery: Table clustering column list.
     /// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
     pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
+    /// Hive: Table clustering column list.
+    /// <https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable>
+    pub clustered_by: Option<ClusteredBy>,
     /// BigQuery: Table options list.
     /// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
     pub options: Option<Vec<SqlOption>>,
@@ -236,19 +239,6 @@ impl Display for CreateTable {
             HiveDistributionStyle::PARTITIONED { columns } => {
                 write!(f, " PARTITIONED BY ({})", display_comma_separated(columns))?;
             }
-            HiveDistributionStyle::CLUSTERED {
-                columns,
-                sorted_by,
-                num_buckets,
-            } => {
-                write!(f, " CLUSTERED BY ({})", display_comma_separated(columns))?;
-                if !sorted_by.is_empty() {
-                    write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
-                }
-                if *num_buckets > 0 {
-                    write!(f, " INTO {num_buckets} BUCKETS")?;
-                }
-            }
             HiveDistributionStyle::SKEWED {
                 columns,
                 on,
@@ -267,6 +257,10 @@ impl Display for CreateTable {
             _ => (),
         }
 
+        if let Some(clustered_by) = &self.clustered_by {
+            write!(f, " {clustered_by}")?;
+        }
+
         if let Some(HiveFormat {
             row_format,
             serde_properties,
diff --git a/src/ast/helpers/stmt_create_table.rs b/src/ast/helpers/stmt_create_table.rs
index 19efaeece..82532b291 100644
--- a/src/ast/helpers/stmt_create_table.rs
+++ b/src/ast/helpers/stmt_create_table.rs
@@ -9,9 +9,9 @@ use sqlparser_derive::{Visit, VisitMut};
 
 use super::super::dml::CreateTable;
 use crate::ast::{
-    ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident, ObjectName,
-    OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement, TableConstraint,
-    TableEngine, Tag, WrappedCollection,
+    ClusteredBy, ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident,
+    ObjectName, OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement,
+    TableConstraint, TableEngine, Tag, WrappedCollection,
 };
 use crate::parser::ParserError;
@@ -78,6 +78,7 @@ pub struct CreateTableBuilder {
     pub order_by: Option<OneOrManyWithParens<Expr>>,
     pub partition_by: Option<Box<Expr>>,
     pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
+    pub clustered_by: Option<ClusteredBy>,
    pub options: Option<Vec<SqlOption>>,
     pub strict: bool,
     pub copy_grants: bool,
@@ -125,6 +126,7 @@ impl CreateTableBuilder {
             order_by: None,
             partition_by: None,
             cluster_by: None,
+            clustered_by: None,
             options: None,
             strict: false,
             copy_grants: false,
@@ -286,6 +288,11 @@ impl CreateTableBuilder {
         self
     }
 
+    pub fn clustered_by(mut self, clustered_by: Option<ClusteredBy>) -> Self {
+        self.clustered_by = clustered_by;
+        self
+    }
+
     pub fn options(mut self, options: Option<Vec<SqlOption>>) -> Self {
         self.options = options;
         self
@@ -380,6 +387,7 @@ impl CreateTableBuilder {
             order_by: self.order_by,
             partition_by: self.partition_by,
             cluster_by: self.cluster_by,
+            clustered_by: self.clustered_by,
             options: self.options,
             strict: self.strict,
             copy_grants: self.copy_grants,
@@ -434,6 +442,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
             order_by,
             partition_by,
             cluster_by,
+            clustered_by,
             options,
             strict,
             copy_grants,
@@ -476,6 +485,7 @@
             order_by,
             partition_by,
             cluster_by,
+            clustered_by,
             options,
             strict,
             copy_grants,
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 8a56f3158..0e6357cbc 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -33,11 +33,11 @@ pub use self::data_type::{
 };
 pub use self::dcl::{AlterRoleOperation, ResetConfig, RoleOption, SetConfigValue, Use};
 pub use self::ddl::{
-    AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ColumnDef, ColumnOption,
-    ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial, GeneratedAs,
-    GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner, Partition,
-    ProcedureParam, ReferentialAction, TableConstraint, UserDefinedTypeCompositeAttributeDef,
-    UserDefinedTypeRepresentation, ViewColumnDef,
+    AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ClusteredBy, ColumnDef,
+    ColumnOption, ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial,
+    GeneratedAs, GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner,
+    Partition, ProcedureParam, ReferentialAction, TableConstraint,
+    UserDefinedTypeCompositeAttributeDef, UserDefinedTypeRepresentation, ViewColumnDef,
 };
 pub use self::dml::{CreateIndex, CreateTable, Delete, Insert};
 pub use self::operator::{BinaryOperator, UnaryOperator};
@@ -5398,11 +5398,6 @@ pub enum HiveDistributionStyle {
     PARTITIONED {
         columns: Vec<ColumnDef>,
     },
-    CLUSTERED {
-        columns: Vec<Ident>,
-        sorted_by: Vec<ColumnDef>,
-        num_buckets: i32,
-    },
     SKEWED {
         columns: Vec<ColumnDef>,
         on: Vec<ColumnDef>,
diff --git a/src/keywords.rs b/src/keywords.rs
index d2dcc57d1..ce4972f98 100644
--- a/src/keywords.rs
+++ b/src/keywords.rs
@@ -125,6 +125,7 @@ define_keywords!(
     BOTH,
     BROWSE,
     BTREE,
+    BUCKETS,
     BY,
     BYPASSRLS,
     BYTEA,
@@ -156,6 +157,7 @@ define_keywords!(
     CLONE,
     CLOSE,
     CLUSTER,
+    CLUSTERED,
     COALESCE,
     COLLATE,
     COLLATION,
@@ -675,6 +677,7 @@ define_keywords!(
     SNAPSHOT,
     SOME,
     SORT,
+    SORTED,
     SOURCE,
     SPATIAL,
     SPECIFIC,
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 8f8c3f050..564671708 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -5378,7 +5378,7 @@ impl<'a> Parser<'a> {
         })
     }
 
-    //TODO: Implement parsing for Skewed and Clustered
+    //TODO: Implement parsing for Skewed
     pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
         if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
             self.expect_token(&Token::LParen)?;
@@ -5575,6 +5575,7 @@ impl<'a> Parser<'a> {
         let without_rowid = self.parse_keywords(&[Keyword::WITHOUT, Keyword::ROWID]);
 
         let hive_distribution = self.parse_hive_distribution()?;
+        let clustered_by = self.parse_optional_clustered_by()?;
         let hive_formats = self.parse_hive_formats()?;
         // PostgreSQL supports `WITH ( options )`, before `AS`
         let with_options = self.parse_options(Keyword::WITH)?;
@@ -5721,6 +5722,7 @@ impl<'a> Parser<'a> {
             .collation(collation)
             .on_commit(on_commit)
             .on_cluster(on_cluster)
+            .clustered_by(clustered_by)
             .partition_by(create_table_config.partition_by)
             .cluster_by(create_table_config.cluster_by)
             .options(create_table_config.options)
@@ -6100,6 +6102,35 @@ impl<'a> Parser<'a> {
         }))
     }
 
+    pub fn parse_optional_clustered_by(&mut self) -> Result<Option<ClusteredBy>, ParserError> {
+        let clustered_by = if dialect_of!(self is HiveDialect|GenericDialect)
+            && self.parse_keywords(&[Keyword::CLUSTERED, Keyword::BY])
+        {
+            let columns = self.parse_parenthesized_column_list(Mandatory, false)?;
+
+            let sorted_by = if self.parse_keywords(&[Keyword::SORTED, Keyword::BY]) {
+                self.expect_token(&Token::LParen)?;
+                let sorted_by_columns = self.parse_comma_separated(|p| p.parse_order_by_expr())?;
+                self.expect_token(&Token::RParen)?;
+                Some(sorted_by_columns)
+            } else {
+                None
+            };
+
+            self.expect_keyword(Keyword::INTO)?;
+            let num_buckets = self.parse_number_value()?;
+            self.expect_keyword(Keyword::BUCKETS)?;
+            Some(ClusteredBy {
+                columns,
+                sorted_by,
+                num_buckets,
+            })
+        } else {
+            None
+        };
+        Ok(clustered_by)
+    }
+
     pub fn parse_referential_action(&mut self) -> Result<ReferentialAction, ParserError> {
         if self.parse_keyword(Keyword::RESTRICT) {
             Ok(ReferentialAction::Restrict)
diff --git a/tests/sqlparser_duckdb.rs b/tests/sqlparser_duckdb.rs
index 488fddfd3..12368a88c 100644
--- a/tests/sqlparser_duckdb.rs
+++ b/tests/sqlparser_duckdb.rs
@@ -741,6 +741,7 @@ fn test_duckdb_union_datatype() {
             order_by: Default::default(),
             partition_by: Default::default(),
             cluster_by: Default::default(),
+            clustered_by: Default::default(),
             options: Default::default(),
             strict: Default::default(),
             copy_grants: Default::default(),
diff --git a/tests/sqlparser_hive.rs b/tests/sqlparser_hive.rs
index bd242035e..1bb4229e1 100644
--- a/tests/sqlparser_hive.rs
+++ b/tests/sqlparser_hive.rs
@@ -16,9 +16,9 @@
 //! is also tested (on the inputs it can handle).
 
 use sqlparser::ast::{
-    CreateFunctionBody, CreateFunctionUsing, Expr, Function, FunctionArgumentList,
-    FunctionArguments, Ident, ObjectName, OneOrManyWithParens, SelectItem, Statement, TableFactor,
-    UnaryOperator, Use, Value,
+    ClusteredBy, CreateFunctionBody, CreateFunctionUsing, CreateTable, Expr, Function,
+    FunctionArgumentList, FunctionArguments, Ident, ObjectName, OneOrManyWithParens, OrderByExpr,
+    SelectItem, Statement, TableFactor, UnaryOperator, Use, Value,
 };
 use sqlparser::dialect::{GenericDialect, HiveDialect, MsSqlDialect};
 use sqlparser::parser::ParserError;
@@ -115,6 +115,74 @@ fn create_table_like() {
     hive().verified_stmt(like);
 }
 
+#[test]
+fn create_table_with_clustered_by() {
+    let sql = concat!(
+        "CREATE TABLE db.table_name (a INT, b STRING)",
+        " PARTITIONED BY (a INT, b STRING)",
+        " CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC)",
+        " INTO 4 BUCKETS"
+    );
+    match hive_and_generic().verified_stmt(sql) {
+        Statement::CreateTable(CreateTable { clustered_by, .. }) => {
+            assert_eq!(
+                clustered_by.unwrap(),
+                ClusteredBy {
+                    columns: vec![Ident::new("a"), Ident::new("b")],
+                    sorted_by: Some(vec![
+                        OrderByExpr {
+                            expr: Expr::Identifier(Ident::new("a")),
+                            asc: Some(true),
+                            nulls_first: None,
+                            with_fill: None,
+                        },
+                        OrderByExpr {
+                            expr: Expr::Identifier(Ident::new("b")),
+                            asc: Some(false),
+                            nulls_first: None,
+                            with_fill: None,
+                        },
+                    ]),
+                    num_buckets: Value::Number("4".parse().unwrap(), false),
+                }
+            )
+        }
+        _ => unreachable!(),
+    }
+
+    // SORTED BY is optional
+    hive_and_generic().verified_stmt("CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) INTO 4 BUCKETS");
+
+    // missing INTO BUCKETS
+    assert_eq!(
+        hive_and_generic().parse_sql_statements(
+            "CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b)"
+        ).unwrap_err(),
+        ParserError::ParserError("Expected: INTO, found: EOF".to_string())
+    );
+    // missing CLUSTER BY columns
+    assert_eq!(
+        hive_and_generic().parse_sql_statements(
+            "CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY () INTO 4 BUCKETS"
+        ).unwrap_err(),
+        ParserError::ParserError("Expected: identifier, found: )".to_string())
+    );
+    // missing SORT BY columns
+    assert_eq!(
+        hive_and_generic().parse_sql_statements(
+            "CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY INTO 4 BUCKETS"
+        ).unwrap_err(),
+        ParserError::ParserError("Expected: (, found: INTO".to_string())
+    );
+    // missing number BUCKETS
+    assert_eq!(
+        hive_and_generic().parse_sql_statements(
+            "CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC) INTO"
+        ).unwrap_err(),
+        ParserError::ParserError("Expected: a value, found: EOF".to_string())
+    );
+}
+
 // Turning off this test until we can parse identifiers starting with numbers :(
 #[test]
 fn test_identifier() {
diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs
index 2f9fe86c9..59afd7402 100644
--- a/tests/sqlparser_postgres.rs
+++ b/tests/sqlparser_postgres.rs
@@ -4838,6 +4838,7 @@ fn parse_trigger_related_functions() {
                 order_by: None,
                 partition_by: None,
                 cluster_by: None,
+                clustered_by: None,
                 options: None,
                 strict: false,
                 copy_grants: false,