Skip to content

Commit b990684

Browse files
committed
Add support for parsing the CLUSTERED BY clause for Hive
This PR adds support for the `CLUSTERED BY` clause in `CREATE TABLE` for the Hive dialect, which is used to group data into buckets by the `CLUSTERED BY` columns. For more information, please refer to: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable It also introduces the following keywords: CLUSTERED, SORTED, and BUCKETS.
1 parent 19e694a commit b990684

File tree

9 files changed

+166
-36
lines changed

9 files changed

+166
-36
lines changed

src/ast/ddl.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use sqlparser_derive::{Visit, VisitMut};
2626
use crate::ast::value::escape_single_quote_string;
2727
use crate::ast::{
2828
display_comma_separated, display_separated, DataType, Expr, Ident, MySQLColumnPosition,
29-
ObjectName, ProjectionSelect, SequenceOptions, SqlOption,
29+
ObjectName, OrderByExpr, ProjectionSelect, SequenceOptions, SqlOption, Value,
3030
};
3131
use crate::tokenizer::Token;
3232

@@ -1417,3 +1417,30 @@ impl fmt::Display for Deduplicate {
14171417
}
14181418
}
14191419
}
1420+
1421+
/// Hive supports `CLUSTERED BY` statement in `CREATE TABLE`.
1422+
/// Syntax: `CLUSTERED BY (col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS`
1423+
///
1424+
/// [Hive](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable)
1425+
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
1426+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
1427+
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
1428+
pub struct ClusteredBy {
1429+
pub columns: Vec<Ident>,
1430+
pub sorted_by: Option<Vec<OrderByExpr>>,
1431+
pub num_buckets: Value,
1432+
}
1433+
1434+
impl fmt::Display for ClusteredBy {
1435+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1436+
write!(
1437+
f,
1438+
"CLUSTERED BY ({})",
1439+
display_comma_separated(&self.columns)
1440+
)?;
1441+
if let Some(ref sorted_by) = self.sorted_by {
1442+
write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
1443+
}
1444+
write!(f, " INTO {} BUCKETS", self.num_buckets)
1445+
}
1446+
}

src/ast/dml.rs

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,11 @@ use sqlparser_derive::{Visit, VisitMut};
2222
pub use super::ddl::{ColumnDef, TableConstraint};
2323

2424
use super::{
25-
display_comma_separated, display_separated, CommentDef, Expr, FileFormat, FromTable,
26-
HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident, InsertAliases,
27-
MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens, OrderByExpr, Query,
28-
RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine, TableWithJoins, Tag,
29-
WrappedCollection,
25+
display_comma_separated, display_separated, ClusteredBy, CommentDef, Expr, FileFormat,
26+
FromTable, HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident,
27+
InsertAliases, MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens,
28+
OrderByExpr, Query, RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine,
29+
TableWithJoins, Tag, WrappedCollection,
3030
};
3131

3232
/// CREATE INDEX statement.
@@ -140,6 +140,9 @@ pub struct CreateTable {
140140
/// BigQuery: Table clustering column list.
141141
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
142142
pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
143+
/// Hive: Table clustering column list.
144+
/// <https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable>
145+
pub clustered_by: Option<ClusteredBy>,
143146
/// BigQuery: Table options list.
144147
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
145148
pub options: Option<Vec<SqlOption>>,
@@ -236,19 +239,6 @@ impl Display for CreateTable {
236239
HiveDistributionStyle::PARTITIONED { columns } => {
237240
write!(f, " PARTITIONED BY ({})", display_comma_separated(columns))?;
238241
}
239-
HiveDistributionStyle::CLUSTERED {
240-
columns,
241-
sorted_by,
242-
num_buckets,
243-
} => {
244-
write!(f, " CLUSTERED BY ({})", display_comma_separated(columns))?;
245-
if !sorted_by.is_empty() {
246-
write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
247-
}
248-
if *num_buckets > 0 {
249-
write!(f, " INTO {num_buckets} BUCKETS")?;
250-
}
251-
}
252242
HiveDistributionStyle::SKEWED {
253243
columns,
254244
on,
@@ -267,6 +257,10 @@ impl Display for CreateTable {
267257
_ => (),
268258
}
269259

260+
if let Some(clustered_by) = &self.clustered_by {
261+
write!(f, " {clustered_by}")?;
262+
}
263+
270264
if let Some(HiveFormat {
271265
row_format,
272266
serde_properties,

src/ast/helpers/stmt_create_table.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ use sqlparser_derive::{Visit, VisitMut};
99

1010
use super::super::dml::CreateTable;
1111
use crate::ast::{
12-
ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident, ObjectName,
13-
OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement, TableConstraint,
14-
TableEngine, Tag, WrappedCollection,
12+
ClusteredBy, ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident,
13+
ObjectName, OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement,
14+
TableConstraint, TableEngine, Tag, WrappedCollection,
1515
};
1616
use crate::parser::ParserError;
1717

@@ -78,6 +78,7 @@ pub struct CreateTableBuilder {
7878
pub order_by: Option<OneOrManyWithParens<Expr>>,
7979
pub partition_by: Option<Box<Expr>>,
8080
pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
81+
pub clustered_by: Option<ClusteredBy>,
8182
pub options: Option<Vec<SqlOption>>,
8283
pub strict: bool,
8384
pub copy_grants: bool,
@@ -125,6 +126,7 @@ impl CreateTableBuilder {
125126
order_by: None,
126127
partition_by: None,
127128
cluster_by: None,
129+
clustered_by: None,
128130
options: None,
129131
strict: false,
130132
copy_grants: false,
@@ -286,6 +288,11 @@ impl CreateTableBuilder {
286288
self
287289
}
288290

291+
pub fn clustered_by(mut self, clustered_by: Option<ClusteredBy>) -> Self {
292+
self.clustered_by = clustered_by;
293+
self
294+
}
295+
289296
pub fn options(mut self, options: Option<Vec<SqlOption>>) -> Self {
290297
self.options = options;
291298
self
@@ -380,6 +387,7 @@ impl CreateTableBuilder {
380387
order_by: self.order_by,
381388
partition_by: self.partition_by,
382389
cluster_by: self.cluster_by,
390+
clustered_by: self.clustered_by,
383391
options: self.options,
384392
strict: self.strict,
385393
copy_grants: self.copy_grants,
@@ -434,6 +442,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
434442
order_by,
435443
partition_by,
436444
cluster_by,
445+
clustered_by,
437446
options,
438447
strict,
439448
copy_grants,
@@ -476,6 +485,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
476485
order_by,
477486
partition_by,
478487
cluster_by,
488+
clustered_by,
479489
options,
480490
strict,
481491
copy_grants,

src/ast/mod.rs

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,11 @@ pub use self::data_type::{
3333
};
3434
pub use self::dcl::{AlterRoleOperation, ResetConfig, RoleOption, SetConfigValue};
3535
pub use self::ddl::{
36-
AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ColumnDef, ColumnOption,
37-
ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial, GeneratedAs,
38-
GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner, Partition,
39-
ProcedureParam, ReferentialAction, TableConstraint, UserDefinedTypeCompositeAttributeDef,
40-
UserDefinedTypeRepresentation, ViewColumnDef,
36+
AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ClusteredBy, ColumnDef,
37+
ColumnOption, ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial,
38+
GeneratedAs, GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner,
39+
Partition, ProcedureParam, ReferentialAction, TableConstraint,
40+
UserDefinedTypeCompositeAttributeDef, UserDefinedTypeRepresentation, ViewColumnDef,
4141
};
4242
pub use self::dml::{CreateIndex, CreateTable, Delete, Insert};
4343
pub use self::operator::{BinaryOperator, UnaryOperator};
@@ -5403,11 +5403,6 @@ pub enum HiveDistributionStyle {
54035403
PARTITIONED {
54045404
columns: Vec<ColumnDef>,
54055405
},
5406-
CLUSTERED {
5407-
columns: Vec<Ident>,
5408-
sorted_by: Vec<ColumnDef>,
5409-
num_buckets: i32,
5410-
},
54115406
SKEWED {
54125407
columns: Vec<ColumnDef>,
54135408
on: Vec<ColumnDef>,

src/keywords.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ define_keywords!(
125125
BOTH,
126126
BROWSE,
127127
BTREE,
128+
BUCKETS,
128129
BY,
129130
BYPASSRLS,
130131
BYTEA,
@@ -155,6 +156,7 @@ define_keywords!(
155156
CLONE,
156157
CLOSE,
157158
CLUSTER,
159+
CLUSTERED,
158160
COALESCE,
159161
COLLATE,
160162
COLLATION,
@@ -674,6 +676,7 @@ define_keywords!(
674676
SNAPSHOT,
675677
SOME,
676678
SORT,
679+
SORTED,
677680
SOURCE,
678681
SPATIAL,
679682
SPECIFIC,

src/parser/mod.rs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5378,7 +5378,7 @@ impl<'a> Parser<'a> {
53785378
})
53795379
}
53805380

5381-
//TODO: Implement parsing for Skewed and Clustered
5381+
//TODO: Implement parsing for Skewed
53825382
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
53835383
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
53845384
self.expect_token(&Token::LParen)?;
@@ -5575,6 +5575,7 @@ impl<'a> Parser<'a> {
55755575
let without_rowid = self.parse_keywords(&[Keyword::WITHOUT, Keyword::ROWID]);
55765576

55775577
let hive_distribution = self.parse_hive_distribution()?;
5578+
let clustered_by = self.parse_optional_clustered_by()?;
55785579
let hive_formats = self.parse_hive_formats()?;
55795580
// PostgreSQL supports `WITH ( options )`, before `AS`
55805581
let with_options = self.parse_options(Keyword::WITH)?;
@@ -5721,6 +5722,7 @@ impl<'a> Parser<'a> {
57215722
.collation(collation)
57225723
.on_commit(on_commit)
57235724
.on_cluster(on_cluster)
5725+
.clustered_by(clustered_by)
57245726
.partition_by(create_table_config.partition_by)
57255727
.cluster_by(create_table_config.cluster_by)
57265728
.options(create_table_config.options)
@@ -6100,6 +6102,35 @@ impl<'a> Parser<'a> {
61006102
}))
61016103
}
61026104

6105+
pub fn parse_optional_clustered_by(&mut self) -> Result<Option<ClusteredBy>, ParserError> {
6106+
let clustered_by = if dialect_of!(self is HiveDialect|GenericDialect)
6107+
&& self.parse_keywords(&[Keyword::CLUSTERED, Keyword::BY])
6108+
{
6109+
let columns = self.parse_parenthesized_column_list(Mandatory, false)?;
6110+
6111+
let sorted_by = if self.parse_keywords(&[Keyword::SORTED, Keyword::BY]) {
6112+
self.expect_token(&Token::LParen)?;
6113+
let sorted_by_columns = self.parse_comma_separated(|p| p.parse_order_by_expr())?;
6114+
self.expect_token(&Token::RParen)?;
6115+
Some(sorted_by_columns)
6116+
} else {
6117+
None
6118+
};
6119+
6120+
self.expect_keyword(Keyword::INTO)?;
6121+
let num_buckets = self.parse_number_value()?;
6122+
self.expect_keyword(Keyword::BUCKETS)?;
6123+
Some(ClusteredBy {
6124+
columns,
6125+
sorted_by,
6126+
num_buckets,
6127+
})
6128+
} else {
6129+
None
6130+
};
6131+
Ok(clustered_by)
6132+
}
6133+
61036134
pub fn parse_referential_action(&mut self) -> Result<ReferentialAction, ParserError> {
61046135
if self.parse_keyword(Keyword::RESTRICT) {
61056136
Ok(ReferentialAction::Restrict)

tests/sqlparser_duckdb.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,7 @@ fn test_duckdb_union_datatype() {
741741
order_by: Default::default(),
742742
partition_by: Default::default(),
743743
cluster_by: Default::default(),
744+
clustered_by: Default::default(),
744745
options: Default::default(),
745746
strict: Default::default(),
746747
copy_grants: Default::default(),

tests/sqlparser_hive.rs

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@
1616
//! is also tested (on the inputs it can handle).
1717
1818
use sqlparser::ast::{
19-
CreateFunctionBody, CreateFunctionUsing, Expr, Function, FunctionArgumentList,
20-
FunctionArguments, Ident, ObjectName, OneOrManyWithParens, SelectItem, Statement, TableFactor,
21-
UnaryOperator, Value,
19+
ClusteredBy, CreateFunctionBody, CreateFunctionUsing, CreateTable, Expr, Function,
20+
FunctionArgumentList, FunctionArguments, Ident, ObjectName, OneOrManyWithParens, OrderByExpr,
21+
SelectItem, Statement, TableFactor, UnaryOperator, Value,
2222
};
2323
use sqlparser::dialect::{GenericDialect, HiveDialect, MsSqlDialect};
2424
use sqlparser::parser::ParserError;
@@ -115,6 +115,74 @@ fn create_table_like() {
115115
hive().verified_stmt(like);
116116
}
117117

118+
#[test]
119+
fn create_table_with_clustered_by() {
120+
let sql = concat!(
121+
"CREATE TABLE db.table_name (a INT, b STRING)",
122+
" PARTITIONED BY (a INT, b STRING)",
123+
" CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC)",
124+
" INTO 4 BUCKETS"
125+
);
126+
match hive_and_generic().verified_stmt(sql) {
127+
Statement::CreateTable(CreateTable { clustered_by, .. }) => {
128+
assert_eq!(
129+
clustered_by.unwrap(),
130+
ClusteredBy {
131+
columns: vec![Ident::new("a"), Ident::new("b")],
132+
sorted_by: Some(vec![
133+
OrderByExpr {
134+
expr: Expr::Identifier(Ident::new("a")),
135+
asc: Some(true),
136+
nulls_first: None,
137+
with_fill: None,
138+
},
139+
OrderByExpr {
140+
expr: Expr::Identifier(Ident::new("b")),
141+
asc: Some(false),
142+
nulls_first: None,
143+
with_fill: None,
144+
},
145+
]),
146+
num_buckets: Value::Number("4".parse().unwrap(), false),
147+
}
148+
)
149+
}
150+
_ => unreachable!(),
151+
}
152+
153+
// SORTED BY is optional
154+
hive_and_generic().verified_stmt("CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) INTO 4 BUCKETS");
155+
156+
// missing INTO BUCKETS
157+
assert_eq!(
158+
hive_and_generic().parse_sql_statements(
159+
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b)"
160+
).unwrap_err(),
161+
ParserError::ParserError("Expected: INTO, found: EOF".to_string())
162+
);
163+
// missing CLUSTER BY columns
164+
assert_eq!(
165+
hive_and_generic().parse_sql_statements(
166+
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY () INTO 4 BUCKETS"
167+
).unwrap_err(),
168+
ParserError::ParserError("Expected: identifier, found: )".to_string())
169+
);
170+
// missing SORT BY columns
171+
assert_eq!(
172+
hive_and_generic().parse_sql_statements(
173+
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY INTO 4 BUCKETS"
174+
).unwrap_err(),
175+
ParserError::ParserError("Expected: (, found: INTO".to_string())
176+
);
177+
// missing number BUCKETS
178+
assert_eq!(
179+
hive_and_generic().parse_sql_statements(
180+
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC) INTO"
181+
).unwrap_err(),
182+
ParserError::ParserError("Expected: a value, found: EOF".to_string())
183+
);
184+
}
185+
118186
// Turning off this test until we can parse identifiers starting with numbers :(
119187
#[test]
120188
fn test_identifier() {

tests/sqlparser_postgres.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4838,6 +4838,7 @@ fn parse_trigger_related_functions() {
48384838
order_by: None,
48394839
partition_by: None,
48404840
cluster_by: None,
4841+
clustered_by: None,
48414842
options: None,
48424843
strict: false,
48434844
copy_grants: false,

0 commit comments

Comments
 (0)