Skip to content

Commit b990684

Browse files
committed
Add support for parsing the CLUSTERED BY clause for Hive
This PR adds support for the `CLUSTERED BY` clause in `CREATE TABLE` for the Hive dialect, which is used to group data into buckets by the `CLUSTERED BY` columns. For more information, please refer to: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable It also introduces the following keywords: CLUSTERED, SORTED, and BUCKETS.
1 parent 19e694a commit b990684

File tree

9 files changed

+166
-36
lines changed

9 files changed

+166
-36
lines changed

src/ast/ddl.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use sqlparser_derive::{Visit, VisitMut};
2626
use crate::ast::value::escape_single_quote_string;
2727
use crate::ast::{
2828
display_comma_separated, display_separated, DataType, Expr, Ident, MySQLColumnPosition,
29-
ObjectName, ProjectionSelect, SequenceOptions, SqlOption,
29+
ObjectName, OrderByExpr, ProjectionSelect, SequenceOptions, SqlOption, Value,
3030
};
3131
use crate::tokenizer::Token;
3232

@@ -1417,3 +1417,30 @@ impl fmt::Display for Deduplicate {
14171417
}
14181418
}
14191419
}
1420+
1421+
/// Hive supports `CLUSTERED BY` statement in `CREATE TABLE`.
1422+
/// Syntax: `CLUSTERED BY (col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS`
1423+
///
1424+
/// [Hive](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable)
1425+
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
1426+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
1427+
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
1428+
pub struct ClusteredBy {
1429+
pub columns: Vec<Ident>,
1430+
pub sorted_by: Option<Vec<OrderByExpr>>,
1431+
pub num_buckets: Value,
1432+
}
1433+
1434+
impl fmt::Display for ClusteredBy {
1435+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1436+
write!(
1437+
f,
1438+
"CLUSTERED BY ({})",
1439+
display_comma_separated(&self.columns)
1440+
)?;
1441+
if let Some(ref sorted_by) = self.sorted_by {
1442+
write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
1443+
}
1444+
write!(f, " INTO {} BUCKETS", self.num_buckets)
1445+
}
1446+
}

src/ast/dml.rs

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,11 @@ use sqlparser_derive::{Visit, VisitMut};
2222
pub use super::ddl::{ColumnDef, TableConstraint};
2323

2424
use super::{
25-
display_comma_separated, display_separated, CommentDef, Expr, FileFormat, FromTable,
26-
HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident, InsertAliases,
27-
MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens, OrderByExpr, Query,
28-
RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine, TableWithJoins, Tag,
29-
WrappedCollection,
25+
display_comma_separated, display_separated, ClusteredBy, CommentDef, Expr, FileFormat,
26+
FromTable, HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident,
27+
InsertAliases, MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens,
28+
OrderByExpr, Query, RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine,
29+
TableWithJoins, Tag, WrappedCollection,
3030
};
3131

3232
/// CREATE INDEX statement.
@@ -140,6 +140,9 @@ pub struct CreateTable {
140140
/// BigQuery: Table clustering column list.
141141
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
142142
pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
143+
/// Hive: Table clustering column list.
144+
/// <https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable>
145+
pub clustered_by: Option<ClusteredBy>,
143146
/// BigQuery: Table options list.
144147
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
145148
pub options: Option<Vec<SqlOption>>,
@@ -236,19 +239,6 @@ impl Display for CreateTable {
236239
HiveDistributionStyle::PARTITIONED { columns } => {
237240
write!(f, " PARTITIONED BY ({})", display_comma_separated(columns))?;
238241
}
239-
HiveDistributionStyle::CLUSTERED {
240-
columns,
241-
sorted_by,
242-
num_buckets,
243-
} => {
244-
write!(f, " CLUSTERED BY ({})", display_comma_separated(columns))?;
245-
if !sorted_by.is_empty() {
246-
write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
247-
}
248-
if *num_buckets > 0 {
249-
write!(f, " INTO {num_buckets} BUCKETS")?;
250-
}
251-
}
252242
HiveDistributionStyle::SKEWED {
253243
columns,
254244
on,
@@ -267,6 +257,10 @@ impl Display for CreateTable {
267257
_ => (),
268258
}
269259

260+
if let Some(clustered_by) = &self.clustered_by {
261+
write!(f, " {clustered_by}")?;
262+
}
263+
270264
if let Some(HiveFormat {
271265
row_format,
272266
serde_properties,

src/ast/helpers/stmt_create_table.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ use sqlparser_derive::{Visit, VisitMut};
99

1010
use super::super::dml::CreateTable;
1111
use crate::ast::{
12-
ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident, ObjectName,
13-
OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement, TableConstraint,
14-
TableEngine, Tag, WrappedCollection,
12+
ClusteredBy, ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident,
13+
ObjectName, OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement,
14+
TableConstraint, TableEngine, Tag, WrappedCollection,
1515
};
1616
use crate::parser::ParserError;
1717

@@ -78,6 +78,7 @@ pub struct CreateTableBuilder {
7878
pub order_by: Option<OneOrManyWithParens<Expr>>,
7979
pub partition_by: Option<Box<Expr>>,
8080
pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
81+
pub clustered_by: Option<ClusteredBy>,
8182
pub options: Option<Vec<SqlOption>>,
8283
pub strict: bool,
8384
pub copy_grants: bool,
@@ -125,6 +126,7 @@ impl CreateTableBuilder {
125126
order_by: None,
126127
partition_by: None,
127128
cluster_by: None,
129+
clustered_by: None,
128130
options: None,
129131
strict: false,
130132
copy_grants: false,
@@ -286,6 +288,11 @@ impl CreateTableBuilder {
286288
self
287289
}
288290

291+
pub fn clustered_by(mut self, clustered_by: Option<ClusteredBy>) -> Self {
292+
self.clustered_by = clustered_by;
293+
self
294+
}
295+
289296
pub fn options(mut self, options: Option<Vec<SqlOption>>) -> Self {
290297
self.options = options;
291298
self
@@ -380,6 +387,7 @@ impl CreateTableBuilder {
380387
order_by: self.order_by,
381388
partition_by: self.partition_by,
382389
cluster_by: self.cluster_by,
390+
clustered_by: self.clustered_by,
383391
options: self.options,
384392
strict: self.strict,
385393
copy_grants: self.copy_grants,
@@ -434,6 +442,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
434442
order_by,
435443
partition_by,
436444
cluster_by,
445+
clustered_by,
437446
options,
438447
strict,
439448
copy_grants,
@@ -476,6 +485,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
476485
order_by,
477486
partition_by,
478487
cluster_by,
488+
clustered_by,
479489
options,
480490
strict,
481491
copy_grants,

src/ast/mod.rs

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,11 @@ pub use self::data_type::{
3333
};
3434
pub use self::dcl::{AlterRoleOperation, ResetConfig, RoleOption, SetConfigValue};
3535
pub use self::ddl::{
36-
AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ColumnDef, ColumnOption,
37-
ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial, GeneratedAs,
38-
GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner, Partition,
39-
ProcedureParam, ReferentialAction, TableConstraint, UserDefinedTypeCompositeAttributeDef,
40-
UserDefinedTypeRepresentation, ViewColumnDef,
36+
AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ClusteredBy, ColumnDef,
37+
ColumnOption, ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial,
38+
GeneratedAs, GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner,
39+
Partition, ProcedureParam, ReferentialAction, TableConstraint,
40+
UserDefinedTypeCompositeAttributeDef, UserDefinedTypeRepresentation, ViewColumnDef,
4141
};
4242
pub use self::dml::{CreateIndex, CreateTable, Delete, Insert};
4343
pub use self::operator::{BinaryOperator, UnaryOperator};
@@ -5403,11 +5403,6 @@ pub enum HiveDistributionStyle {
54035403
PARTITIONED {
54045404
columns: Vec<ColumnDef>,
54055405
},
5406-
CLUSTERED {
5407-
columns: Vec<Ident>,
5408-
sorted_by: Vec<ColumnDef>,
5409-
num_buckets: i32,
5410-
},
54115406
SKEWED {
54125407
columns: Vec<ColumnDef>,
54135408
on: Vec<ColumnDef>,

src/keywords.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ define_keywords!(
125125
BOTH,
126126
BROWSE,
127127
BTREE,
128+
BUCKETS,
128129
BY,
129130
BYPASSRLS,
130131
BYTEA,
@@ -155,6 +156,7 @@ define_keywords!(
155156
CLONE,
156157
CLOSE,
157158
CLUSTER,
159+
CLUSTERED,
158160
COALESCE,
159161
COLLATE,
160162
COLLATION,
@@ -674,6 +676,7 @@ define_keywords!(
674676
SNAPSHOT,
675677
SOME,
676678
SORT,
679+
SORTED,
677680
SOURCE,
678681
SPATIAL,
679682
SPECIFIC,

src/parser/mod.rs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5378,7 +5378,7 @@ impl<'a> Parser<'a> {
53785378
})
53795379
}
53805380

5381-
//TODO: Implement parsing for Skewed and Clustered
5381+
//TODO: Implement parsing for Skewed
53825382
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
53835383
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
53845384
self.expect_token(&Token::LParen)?;
@@ -5575,6 +5575,7 @@ impl<'a> Parser<'a> {
55755575
let without_rowid = self.parse_keywords(&[Keyword::WITHOUT, Keyword::ROWID]);
55765576

55775577
let hive_distribution = self.parse_hive_distribution()?;
5578+
let clustered_by = self.parse_optional_clustered_by()?;
55785579
let hive_formats = self.parse_hive_formats()?;
55795580
// PostgreSQL supports `WITH ( options )`, before `AS`
55805581
let with_options = self.parse_options(Keyword::WITH)?;
@@ -5721,6 +5722,7 @@ impl<'a> Parser<'a> {
57215722
.collation(collation)
57225723
.on_commit(on_commit)
57235724
.on_cluster(on_cluster)
5725+
.clustered_by(clustered_by)
57245726
.partition_by(create_table_config.partition_by)
57255727
.cluster_by(create_table_config.cluster_by)
57265728
.options(create_table_config.options)
@@ -6100,6 +6102,35 @@ impl<'a> Parser<'a> {
61006102
}))
61016103
}
61026104

6105+
pub fn parse_optional_clustered_by(&mut self) -> Result<Option<ClusteredBy>, ParserError> {
6106+
let clustered_by = if dialect_of!(self is HiveDialect|GenericDialect)
6107+
&& self.parse_keywords(&[Keyword::CLUSTERED, Keyword::BY])
6108+
{
6109+
let columns = self.parse_parenthesized_column_list(Mandatory, false)?;
6110+
6111+
let sorted_by = if self.parse_keywords(&[Keyword::SORTED, Keyword::BY]) {
6112+
self.expect_token(&Token::LParen)?;
6113+
let sorted_by_columns = self.parse_comma_separated(|p| p.parse_order_by_expr())?;
6114+
self.expect_token(&Token::RParen)?;
6115+
Some(sorted_by_columns)
6116+
} else {
6117+
None
6118+
};
6119+
6120+
self.expect_keyword(Keyword::INTO)?;
6121+
let num_buckets = self.parse_number_value()?;
6122+
self.expect_keyword(Keyword::BUCKETS)?;
6123+
Some(ClusteredBy {
6124+
columns,
6125+
sorted_by,
6126+
num_buckets,
6127+
})
6128+
} else {
6129+
None
6130+
};
6131+
Ok(clustered_by)
6132+
}
6133+
61036134
pub fn parse_referential_action(&mut self) -> Result<ReferentialAction, ParserError> {
61046135
if self.parse_keyword(Keyword::RESTRICT) {
61056136
Ok(ReferentialAction::Restrict)

tests/sqlparser_duckdb.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,7 @@ fn test_duckdb_union_datatype() {
741741
order_by: Default::default(),
742742
partition_by: Default::default(),
743743
cluster_by: Default::default(),
744+
clustered_by: Default::default(),
744745
options: Default::default(),
745746
strict: Default::default(),
746747
copy_grants: Default::default(),

tests/sqlparser_hive.rs

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@
1616
//! is also tested (on the inputs it can handle).
1717
1818
use sqlparser::ast::{
19-
CreateFunctionBody, CreateFunctionUsing, Expr, Function, FunctionArgumentList,
20-
FunctionArguments, Ident, ObjectName, OneOrManyWithParens, SelectItem, Statement, TableFactor,
21-
UnaryOperator, Value,
19+
ClusteredBy, CreateFunctionBody, CreateFunctionUsing, CreateTable, Expr, Function,
20+
FunctionArgumentList, FunctionArguments, Ident, ObjectName, OneOrManyWithParens, OrderByExpr,
21+
SelectItem, Statement, TableFactor, UnaryOperator, Value,
2222
};
2323
use sqlparser::dialect::{GenericDialect, HiveDialect, MsSqlDialect};
2424
use sqlparser::parser::ParserError;
@@ -115,6 +115,74 @@ fn create_table_like() {
115115
hive().verified_stmt(like);
116116
}
117117

118+
#[test]
119+
fn create_table_with_clustered_by() {
120+
let sql = concat!(
121+
"CREATE TABLE db.table_name (a INT, b STRING)",
122+
" PARTITIONED BY (a INT, b STRING)",
123+
" CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC)",
124+
" INTO 4 BUCKETS"
125+
);
126+
match hive_and_generic().verified_stmt(sql) {
127+
Statement::CreateTable(CreateTable { clustered_by, .. }) => {
128+
assert_eq!(
129+
clustered_by.unwrap(),
130+
ClusteredBy {
131+
columns: vec![Ident::new("a"), Ident::new("b")],
132+
sorted_by: Some(vec![
133+
OrderByExpr {
134+
expr: Expr::Identifier(Ident::new("a")),
135+
asc: Some(true),
136+
nulls_first: None,
137+
with_fill: None,
138+
},
139+
OrderByExpr {
140+
expr: Expr::Identifier(Ident::new("b")),
141+
asc: Some(false),
142+
nulls_first: None,
143+
with_fill: None,
144+
},
145+
]),
146+
num_buckets: Value::Number("4".parse().unwrap(), false),
147+
}
148+
)
149+
}
150+
_ => unreachable!(),
151+
}
152+
153+
// SORTED BY is optional
154+
hive_and_generic().verified_stmt("CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) INTO 4 BUCKETS");
155+
156+
// missing INTO BUCKETS
157+
assert_eq!(
158+
hive_and_generic().parse_sql_statements(
159+
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b)"
160+
).unwrap_err(),
161+
ParserError::ParserError("Expected: INTO, found: EOF".to_string())
162+
);
163+
// missing CLUSTER BY columns
164+
assert_eq!(
165+
hive_and_generic().parse_sql_statements(
166+
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY () INTO 4 BUCKETS"
167+
).unwrap_err(),
168+
ParserError::ParserError("Expected: identifier, found: )".to_string())
169+
);
170+
// missing SORT BY columns
171+
assert_eq!(
172+
hive_and_generic().parse_sql_statements(
173+
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY INTO 4 BUCKETS"
174+
).unwrap_err(),
175+
ParserError::ParserError("Expected: (, found: INTO".to_string())
176+
);
177+
// missing number BUCKETS
178+
assert_eq!(
179+
hive_and_generic().parse_sql_statements(
180+
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC) INTO"
181+
).unwrap_err(),
182+
ParserError::ParserError("Expected: a value, found: EOF".to_string())
183+
);
184+
}
185+
118186
// Turning off this test until we can parse identifiers starting with numbers :(
119187
#[test]
120188
fn test_identifier() {

tests/sqlparser_postgres.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4838,6 +4838,7 @@ fn parse_trigger_related_functions() {
48384838
order_by: None,
48394839
partition_by: None,
48404840
cluster_by: None,
4841+
clustered_by: None,
48414842
options: None,
48424843
strict: false,
48434844
copy_grants: false,

0 commit comments

Comments
 (0)