Add support for parsing the CLUSTERED BY clause for Hive #1397

Merged (1 commit, Sep 1, 2024)
29 changes: 28 additions & 1 deletion src/ast/ddl.rs
@@ -26,7 +26,7 @@ use sqlparser_derive::{Visit, VisitMut};
use crate::ast::value::escape_single_quote_string;
use crate::ast::{
display_comma_separated, display_separated, DataType, Expr, Ident, MySQLColumnPosition,
ObjectName, ProjectionSelect, SequenceOptions, SqlOption,
ObjectName, OrderByExpr, ProjectionSelect, SequenceOptions, SqlOption, Value,
};
use crate::tokenizer::Token;

@@ -1417,3 +1417,30 @@ impl fmt::Display for Deduplicate {
}
}
}

/// Hive supports the `CLUSTERED BY` clause in `CREATE TABLE` statements.
/// Syntax: `CLUSTERED BY (col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS`
///
/// [Hive](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable)
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct ClusteredBy {
pub columns: Vec<Ident>,
pub sorted_by: Option<Vec<OrderByExpr>>,
pub num_buckets: Value,
}

impl fmt::Display for ClusteredBy {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"CLUSTERED BY ({})",
display_comma_separated(&self.columns)
)?;
if let Some(ref sorted_by) = self.sorted_by {
write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
}
write!(f, " INTO {} BUCKETS", self.num_buckets)
}
}
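For illustration, a minimal sketch of how the new ClusteredBy node renders (assumptions: `Value::Number` wraps a string as in the crate's default feature set, and the column name is made up; the `.parse()` style mirrors the PR's tests):

use sqlparser::ast::{ClusteredBy, Ident, Value};

fn main() {
    let clustered_by = ClusteredBy {
        columns: vec![Ident::new("user_id")],
        sorted_by: None,
        num_buckets: Value::Number("8".parse().unwrap(), false),
    };
    // Display round-trips to the Hive syntax this PR adds.
    assert_eq!(
        clustered_by.to_string(),
        "CLUSTERED BY (user_id) INTO 8 BUCKETS"
    );
}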
30 changes: 12 additions & 18 deletions src/ast/dml.rs
@@ -22,11 +22,11 @@ use sqlparser_derive::{Visit, VisitMut};
pub use super::ddl::{ColumnDef, TableConstraint};

use super::{
display_comma_separated, display_separated, CommentDef, Expr, FileFormat, FromTable,
HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident, InsertAliases,
MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens, OrderByExpr, Query,
RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine, TableWithJoins, Tag,
WrappedCollection,
display_comma_separated, display_separated, ClusteredBy, CommentDef, Expr, FileFormat,
FromTable, HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident,
InsertAliases, MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens,
OrderByExpr, Query, RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine,
TableWithJoins, Tag, WrappedCollection,
};

/// CREATE INDEX statement.
@@ -140,6 +140,9 @@ pub struct CreateTable {
/// BigQuery: Table clustering column list.
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
/// Hive: Table clustering column list.
/// <https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable>
pub clustered_by: Option<ClusteredBy>,
/// BigQuery: Table options list.
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
pub options: Option<Vec<SqlOption>>,
@@ -236,19 +239,6 @@ impl Display for CreateTable {
HiveDistributionStyle::PARTITIONED { columns } => {
write!(f, " PARTITIONED BY ({})", display_comma_separated(columns))?;
}
Member Author: PARTITIONED BY and CLUSTERED BY can appear at the same time, so we cannot use an enum to put them together. I removed CLUSTERED from HiveDistributionStyle here, but it won't affect users, since we did not support parsing CLUSTERED BY before this PR. cc @iffyio

HiveDistributionStyle::CLUSTERED {
columns,
sorted_by,
num_buckets,
} => {
write!(f, " CLUSTERED BY ({})", display_comma_separated(columns))?;
if !sorted_by.is_empty() {
write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
}
if *num_buckets > 0 {
write!(f, " INTO {num_buckets} BUCKETS")?;
}
}
HiveDistributionStyle::SKEWED {
columns,
on,
@@ -267,6 +257,10 @@
_ => (),
}

if let Some(clustered_by) = &self.clustered_by {
write!(f, " {clustered_by}")?;
}

if let Some(HiveFormat {
row_format,
serde_properties,
16 changes: 13 additions & 3 deletions src/ast/helpers/stmt_create_table.rs
@@ -9,9 +9,9 @@ use sqlparser_derive::{Visit, VisitMut};

use super::super::dml::CreateTable;
use crate::ast::{
ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident, ObjectName,
OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement, TableConstraint,
TableEngine, Tag, WrappedCollection,
ClusteredBy, ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident,
ObjectName, OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement,
TableConstraint, TableEngine, Tag, WrappedCollection,
};
use crate::parser::ParserError;

@@ -78,6 +78,7 @@ pub struct CreateTableBuilder {
pub order_by: Option<OneOrManyWithParens<Expr>>,
pub partition_by: Option<Box<Expr>>,
pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
pub clustered_by: Option<ClusteredBy>,
pub options: Option<Vec<SqlOption>>,
pub strict: bool,
pub copy_grants: bool,
@@ -125,6 +126,7 @@ impl CreateTableBuilder {
order_by: None,
partition_by: None,
cluster_by: None,
clustered_by: None,
options: None,
strict: false,
copy_grants: false,
@@ -286,6 +288,11 @@ impl CreateTableBuilder {
self
}

pub fn clustered_by(mut self, clustered_by: Option<ClusteredBy>) -> Self {
self.clustered_by = clustered_by;
self
}

pub fn options(mut self, options: Option<Vec<SqlOption>>) -> Self {
self.options = options;
self
@@ -380,6 +387,7 @@ impl CreateTableBuilder {
order_by: self.order_by,
partition_by: self.partition_by,
cluster_by: self.cluster_by,
clustered_by: self.clustered_by,
options: self.options,
strict: self.strict,
copy_grants: self.copy_grants,
@@ -434,6 +442,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
order_by,
partition_by,
cluster_by,
clustered_by,
options,
strict,
copy_grants,
@@ -476,6 +485,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
order_by,
partition_by,
cluster_by,
clustered_by,
options,
strict,
copy_grants,
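A hedged sketch of the new builder hook, assuming `CreateTableBuilder::new` takes the table's `ObjectName` and `build()` returns a `Statement`, as in the existing helper; the table and column names are illustrative:

use sqlparser::ast::helpers::stmt_create_table::CreateTableBuilder;
use sqlparser::ast::{ClusteredBy, Ident, ObjectName, Value};

fn main() {
    let stmt = CreateTableBuilder::new(ObjectName(vec![Ident::new("t")]))
        .clustered_by(Some(ClusteredBy {
            columns: vec![Ident::new("id")],
            sorted_by: None,
            num_buckets: Value::Number("4".parse().unwrap(), false),
        }))
        .build();
    // Renders the table together with its CLUSTERED BY clause.
    println!("{stmt}");
}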
15 changes: 5 additions & 10 deletions src/ast/mod.rs
@@ -33,11 +33,11 @@ pub use self::data_type::{
};
pub use self::dcl::{AlterRoleOperation, ResetConfig, RoleOption, SetConfigValue, Use};
pub use self::ddl::{
AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ColumnDef, ColumnOption,
ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial, GeneratedAs,
GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner, Partition,
ProcedureParam, ReferentialAction, TableConstraint, UserDefinedTypeCompositeAttributeDef,
UserDefinedTypeRepresentation, ViewColumnDef,
AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ClusteredBy, ColumnDef,
ColumnOption, ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial,
GeneratedAs, GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner,
Partition, ProcedureParam, ReferentialAction, TableConstraint,
UserDefinedTypeCompositeAttributeDef, UserDefinedTypeRepresentation, ViewColumnDef,
};
pub use self::dml::{CreateIndex, CreateTable, Delete, Insert};
pub use self::operator::{BinaryOperator, UnaryOperator};
@@ -5398,11 +5398,6 @@ pub enum HiveDistributionStyle {
PARTITIONED {
columns: Vec<ColumnDef>,
},
CLUSTERED {
columns: Vec<Ident>,
sorted_by: Vec<ColumnDef>,
num_buckets: i32,
},
SKEWED {
columns: Vec<ColumnDef>,
on: Vec<ColumnDef>,
3 changes: 3 additions & 0 deletions src/keywords.rs
@@ -125,6 +125,7 @@ define_keywords!(
BOTH,
BROWSE,
BTREE,
BUCKETS,
BY,
BYPASSRLS,
BYTEA,
@@ -156,6 +157,7 @@ define_keywords!(
CLONE,
CLOSE,
CLUSTER,
CLUSTERED,
COALESCE,
COLLATE,
COLLATION,
@@ -675,6 +677,7 @@ define_keywords!(
SNAPSHOT,
SOME,
SORT,
SORTED,
SOURCE,
SPATIAL,
SPECIFIC,
33 changes: 32 additions & 1 deletion src/parser/mod.rs
@@ -5378,7 +5378,7 @@ impl<'a> Parser<'a> {
})
}

//TODO: Implement parsing for Skewed and Clustered
//TODO: Implement parsing for Skewed
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
@@ -5575,6 +5575,7 @@ impl<'a> Parser<'a> {
let without_rowid = self.parse_keywords(&[Keyword::WITHOUT, Keyword::ROWID]);

let hive_distribution = self.parse_hive_distribution()?;
let clustered_by = self.parse_optional_clustered_by()?;
let hive_formats = self.parse_hive_formats()?;
// PostgreSQL supports `WITH ( options )`, before `AS`
let with_options = self.parse_options(Keyword::WITH)?;
@@ -5721,6 +5722,7 @@ impl<'a> Parser<'a> {
.collation(collation)
.on_commit(on_commit)
.on_cluster(on_cluster)
.clustered_by(clustered_by)
.partition_by(create_table_config.partition_by)
.cluster_by(create_table_config.cluster_by)
.options(create_table_config.options)
@@ -6100,6 +6102,35 @@ impl<'a> Parser<'a> {
}))
}

pub fn parse_optional_clustered_by(&mut self) -> Result<Option<ClusteredBy>, ParserError> {
let clustered_by = if dialect_of!(self is HiveDialect|GenericDialect)
&& self.parse_keywords(&[Keyword::CLUSTERED, Keyword::BY])
{
let columns = self.parse_parenthesized_column_list(Mandatory, false)?;

let sorted_by = if self.parse_keywords(&[Keyword::SORTED, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
let sorted_by_columns = self.parse_comma_separated(|p| p.parse_order_by_expr())?;
self.expect_token(&Token::RParen)?;
Some(sorted_by_columns)
} else {
None
};

self.expect_keyword(Keyword::INTO)?;
let num_buckets = self.parse_number_value()?;
self.expect_keyword(Keyword::BUCKETS)?;
Some(ClusteredBy {
columns,
sorted_by,
num_buckets,
})
} else {
None
};
Ok(clustered_by)
}

pub fn parse_referential_action(&mut self) -> Result<ReferentialAction, ParserError> {
if self.parse_keyword(Keyword::RESTRICT) {
Ok(ReferentialAction::Restrict)
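End to end, the new parsing path can be exercised like this (a sketch; the table, columns, and bucket count are illustrative):

use sqlparser::ast::Statement;
use sqlparser::dialect::HiveDialect;
use sqlparser::parser::Parser;

fn main() {
    let sql = "CREATE TABLE t (id INT, name STRING) \
               CLUSTERED BY (id) SORTED BY (id ASC) INTO 16 BUCKETS";
    let statements = Parser::parse_sql(&HiveDialect {}, sql).unwrap();
    if let Statement::CreateTable(create_table) = &statements[0] {
        // The clause now lands on its own `clustered_by` field instead of
        // `HiveDistributionStyle`, so it can coexist with PARTITIONED BY.
        assert!(create_table.clustered_by.is_some());
    }
}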
1 change: 1 addition & 0 deletions tests/sqlparser_duckdb.rs
@@ -741,6 +741,7 @@ fn test_duckdb_union_datatype() {
order_by: Default::default(),
partition_by: Default::default(),
cluster_by: Default::default(),
clustered_by: Default::default(),
options: Default::default(),
strict: Default::default(),
copy_grants: Default::default(),
74 changes: 71 additions & 3 deletions tests/sqlparser_hive.rs
@@ -16,9 +16,9 @@
//! is also tested (on the inputs it can handle).

use sqlparser::ast::{
CreateFunctionBody, CreateFunctionUsing, Expr, Function, FunctionArgumentList,
FunctionArguments, Ident, ObjectName, OneOrManyWithParens, SelectItem, Statement, TableFactor,
UnaryOperator, Use, Value,
ClusteredBy, CreateFunctionBody, CreateFunctionUsing, CreateTable, Expr, Function,
FunctionArgumentList, FunctionArguments, Ident, ObjectName, OneOrManyWithParens, OrderByExpr,
SelectItem, Statement, TableFactor, UnaryOperator, Use, Value,
};
use sqlparser::dialect::{GenericDialect, HiveDialect, MsSqlDialect};
use sqlparser::parser::ParserError;
@@ -115,6 +115,74 @@ fn create_table_like() {
hive().verified_stmt(like);
}

#[test]
fn create_table_with_clustered_by() {
let sql = concat!(
"CREATE TABLE db.table_name (a INT, b STRING)",
" PARTITIONED BY (a INT, b STRING)",
" CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC)",
" INTO 4 BUCKETS"
);
match hive_and_generic().verified_stmt(sql) {
Statement::CreateTable(CreateTable { clustered_by, .. }) => {
assert_eq!(
clustered_by.unwrap(),
ClusteredBy {
columns: vec![Ident::new("a"), Ident::new("b")],
sorted_by: Some(vec![
OrderByExpr {
expr: Expr::Identifier(Ident::new("a")),
asc: Some(true),
nulls_first: None,
with_fill: None,
},
OrderByExpr {
expr: Expr::Identifier(Ident::new("b")),
asc: Some(false),
nulls_first: None,
with_fill: None,
},
]),
num_buckets: Value::Number("4".parse().unwrap(), false),
}
)
}
_ => unreachable!(),
}

// SORTED BY is optional
hive_and_generic().verified_stmt("CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) INTO 4 BUCKETS");

// missing INTO BUCKETS
assert_eq!(
hive_and_generic().parse_sql_statements(
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b)"
).unwrap_err(),
ParserError::ParserError("Expected: INTO, found: EOF".to_string())
);
// missing CLUSTERED BY columns
assert_eq!(
hive_and_generic().parse_sql_statements(
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY () INTO 4 BUCKETS"
).unwrap_err(),
ParserError::ParserError("Expected: identifier, found: )".to_string())
);
// missing SORTED BY columns
assert_eq!(
hive_and_generic().parse_sql_statements(
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY INTO 4 BUCKETS"
).unwrap_err(),
ParserError::ParserError("Expected: (, found: INTO".to_string())
);
// missing the number of BUCKETS after INTO
assert_eq!(
hive_and_generic().parse_sql_statements(
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC) INTO"
).unwrap_err(),
ParserError::ParserError("Expected: a value, found: EOF".to_string())
);
}

// Turning off this test until we can parse identifiers starting with numbers :(
#[test]
fn test_identifier() {
Expand Down
1 change: 1 addition & 0 deletions tests/sqlparser_postgres.rs
@@ -4838,6 +4838,7 @@ fn parse_trigger_related_functions() {
order_by: None,
partition_by: None,
cluster_by: None,
clustered_by: None,
options: None,
strict: false,
copy_grants: false,