Skip to content

Commit 724a1d1

Browse files
wugeer and iffyio authored
Add support for Hive's LOAD DATA expr (apache#1520)
Co-authored-by: Ifeanyi Ubah <[email protected]>
1 parent 62eaee6 commit 724a1d1

File tree

8 files changed

+323
-11
lines changed

8 files changed

+323
-11
lines changed

src/ast/mod.rs

+54
Original file line numberDiff line numberDiff line change
@@ -3347,6 +3347,22 @@ pub enum Statement {
33473347
channel: Ident,
33483348
payload: Option<String>,
33493349
},
3350+
/// ```sql
3351+
/// LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename
3352+
/// [PARTITION (partcol1=val1, partcol2=val2 ...)]
3353+
/// [INPUTFORMAT 'inputformat' SERDE 'serde']
3354+
/// ```
3355+
/// Loading files into tables
3356+
///
3357+
/// See Hive <https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=27362036#LanguageManualDML-Loadingfilesintotables>
3358+
LoadData {
3359+
/// `true` when the `LOCAL` keyword is present.
local: bool,
3360+
/// File path given after `INPATH`; it is displayed wrapped in single quotes.
inpath: String,
3361+
/// `true` when the `OVERWRITE` keyword is present.
overwrite: bool,
3362+
/// Target table named after `INTO TABLE`.
table_name: ObjectName,
3363+
/// Expressions displayed inside `PARTITION (...)`; nothing is displayed
/// when this is `None` or an empty list.
partitioned: Option<Vec<Expr>>,
3364+
/// The optional `INPUTFORMAT ... SERDE ...` clause.
table_format: Option<HiveLoadDataFormat>,
3365+
},
33503366
}
33513367

33523368
impl fmt::Display for Statement {
@@ -3949,6 +3965,36 @@ impl fmt::Display for Statement {
39493965
Ok(())
39503966
}
39513967
Statement::CreateTable(create_table) => create_table.fmt(f),
3968+
Statement::LoadData {
3969+
local,
3970+
inpath,
3971+
overwrite,
3972+
table_name,
3973+
partitioned,
3974+
table_format,
3975+
} => {
3976+
write!(
3977+
f,
3978+
"LOAD DATA {local}INPATH '{inpath}' {overwrite}INTO TABLE {table_name}",
3979+
local = if *local { "LOCAL " } else { "" },
3980+
inpath = inpath,
3981+
overwrite = if *overwrite { "OVERWRITE " } else { "" },
3982+
table_name = table_name,
3983+
)?;
3984+
if let Some(ref parts) = &partitioned {
3985+
if !parts.is_empty() {
3986+
write!(f, " PARTITION ({})", display_comma_separated(parts))?;
3987+
}
3988+
}
3989+
if let Some(HiveLoadDataFormat {
3990+
serde,
3991+
input_format,
3992+
}) = &table_format
3993+
{
3994+
write!(f, " INPUTFORMAT {input_format} SERDE {serde}")?;
3995+
}
3996+
Ok(())
3997+
}
39523998
Statement::CreateVirtualTable {
39533999
name,
39544000
if_not_exists,
@@ -5855,6 +5901,14 @@ pub enum HiveRowFormat {
58555901
DELIMITED { delimiters: Vec<HiveRowDelimiter> },
58565902
}
58575903

5904+
/// The optional `INPUTFORMAT <expr> SERDE <expr>` clause of a Hive
/// `LOAD DATA` statement.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
5905+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
5906+
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
5907+
pub struct HiveLoadDataFormat {
5908+
/// Expression that follows the `SERDE` keyword.
pub serde: Expr,
5909+
/// Expression that follows the `INPUTFORMAT` keyword.
pub input_format: Expr,
5910+
}
5911+
58585912
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
58595913
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
58605914
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]

src/dialect/duckdb.rs

+5
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,9 @@ impl Dialect for DuckDbDialect {
6666
fn supports_explain_with_utility_options(&self) -> bool {
6767
true
6868
}
69+
70+
/// DuckDB supports the `LOAD extension` statement, so this returns `true`.
///
/// See DuckDB <https://duckdb.org/docs/sql/statements/load_and_install.html#load>
71+
fn supports_load_extension(&self) -> bool {
72+
true
73+
}
6974
}

src/dialect/generic.rs

+4
Original file line numberDiff line numberDiff line change
@@ -115,4 +115,8 @@ impl Dialect for GenericDialect {
115115
fn supports_comment_on(&self) -> bool {
116116
true
117117
}
118+
119+
/// The generic dialect accepts the `LOAD extension` statement, so this
/// returns `true`.
fn supports_load_extension(&self) -> bool {
120+
true
121+
}
118122
}

src/dialect/hive.rs

+5
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,9 @@ impl Dialect for HiveDialect {
5656
fn supports_bang_not_operator(&self) -> bool {
5757
true
5858
}
59+
60+
/// Hive supports the `LOAD DATA` statement, so this returns `true`.
///
/// See Hive <https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=27362036#LanguageManualDML-Loadingfilesintotables>
61+
fn supports_load_data(&self) -> bool {
62+
true
63+
}
5964
}

src/dialect/mod.rs

+10
Original file line numberDiff line numberDiff line change
@@ -620,6 +620,16 @@ pub trait Dialect: Debug + Any {
620620
false
621621
}
622622

623+
/// Returns true if the dialect supports the `LOAD DATA` statement
///
/// The default implementation returns `false`; a dialect opts in by
/// overriding this method.
624+
fn supports_load_data(&self) -> bool {
625+
false
626+
}
627+
628+
/// Returns true if the dialect supports the `LOAD extension` statement
///
/// The default implementation returns `false`; a dialect opts in by
/// overriding this method.
629+
fn supports_load_extension(&self) -> bool {
630+
false
631+
}
632+
623633
/// Returns true if this dialect expects the `TOP` option
624634
/// before the `ALL`/`DISTINCT` options in a `SELECT` statement.
625635
fn supports_top_before_distinct(&self) -> bool {

src/keywords.rs

+1
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,7 @@ define_keywords!(
389389
INITIALLY,
390390
INNER,
391391
INOUT,
392+
INPATH,
392393
INPUT,
393394
INPUTFORMAT,
394395
INSENSITIVE,

src/parser/mod.rs

+45-7
Original file line numberDiff line numberDiff line change
@@ -543,10 +543,7 @@ impl<'a> Parser<'a> {
543543
Keyword::INSTALL if dialect_of!(self is DuckDbDialect | GenericDialect) => {
544544
self.parse_install()
545545
}
546-
// `LOAD` is duckdb specific https://duckdb.org/docs/extensions/overview
547-
Keyword::LOAD if dialect_of!(self is DuckDbDialect | GenericDialect) => {
548-
self.parse_load()
549-
}
546+
Keyword::LOAD => self.parse_load(),
550547
// `OPTIMIZE` is clickhouse specific https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
551548
Keyword::OPTIMIZE if dialect_of!(self is ClickHouseDialect | GenericDialect) => {
552549
self.parse_optimize_table()
@@ -11222,6 +11219,22 @@ impl<'a> Parser<'a> {
1122211219
}
1122311220
}
1122411221

11222+
pub fn parse_load_data_table_format(
11223+
&mut self,
11224+
) -> Result<Option<HiveLoadDataFormat>, ParserError> {
11225+
if self.parse_keyword(Keyword::INPUTFORMAT) {
11226+
let input_format = self.parse_expr()?;
11227+
self.expect_keyword(Keyword::SERDE)?;
11228+
let serde = self.parse_expr()?;
11229+
Ok(Some(HiveLoadDataFormat {
11230+
input_format,
11231+
serde,
11232+
}))
11233+
} else {
11234+
Ok(None)
11235+
}
11236+
}
11237+
1122511238
/// Parse an UPDATE statement, returning a `Box`ed SetExpr
1122611239
///
1122711240
/// This is used to reduce the size of the stack frames in debug builds
@@ -12224,10 +12237,35 @@ impl<'a> Parser<'a> {
1222412237
Ok(Statement::Install { extension_name })
1222512238
}
1222612239

12227-
/// `LOAD [extension_name]`
12240+
/// Parse a SQL LOAD statement
1222812241
pub fn parse_load(&mut self) -> Result<Statement, ParserError> {
12229-
let extension_name = self.parse_identifier(false)?;
12230-
Ok(Statement::Load { extension_name })
12242+
if self.dialect.supports_load_extension() {
12243+
let extension_name = self.parse_identifier(false)?;
12244+
Ok(Statement::Load { extension_name })
12245+
} else if self.parse_keyword(Keyword::DATA) && self.dialect.supports_load_data() {
12246+
let local = self.parse_one_of_keywords(&[Keyword::LOCAL]).is_some();
12247+
self.expect_keyword(Keyword::INPATH)?;
12248+
let inpath = self.parse_literal_string()?;
12249+
let overwrite = self.parse_one_of_keywords(&[Keyword::OVERWRITE]).is_some();
12250+
self.expect_keyword(Keyword::INTO)?;
12251+
self.expect_keyword(Keyword::TABLE)?;
12252+
let table_name = self.parse_object_name(false)?;
12253+
let partitioned = self.parse_insert_partition()?;
12254+
let table_format = self.parse_load_data_table_format()?;
12255+
Ok(Statement::LoadData {
12256+
local,
12257+
inpath,
12258+
overwrite,
12259+
table_name,
12260+
partitioned,
12261+
table_format,
12262+
})
12263+
} else {
12264+
self.expected(
12265+
"`DATA` or an extension name after `LOAD`",
12266+
self.peek_token(),
12267+
)
12268+
}
1223112269
}
1223212270

1223312271
/// ```sql

0 commit comments

Comments
 (0)