From b24b958ddf6ffdfb0ef56dac290074c555f21177 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 14 Jan 2025 14:29:03 +0400 Subject: [PATCH 1/4] Add support for `IS [NOT] [form] NORMALIZED` --- src/ast/mod.rs | 30 ++++++++++++-- src/ast/query.rs | 4 +- src/ast/spans.rs | 7 +++- src/ast/value.rs | 21 ++++++++++ src/keywords.rs | 1 + src/parser/mod.rs | 54 ++++++++++++++++++++++--- tests/sqlparser_common.rs | 84 +++++++++++++++++++++++++++++++++++++-- tests/sqlparser_mysql.rs | 2 +- 8 files changed, 186 insertions(+), 17 deletions(-) diff --git a/src/ast/mod.rs b/src/ast/mod.rs index 2d79f7d6b..e59e5b5ca 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -83,7 +83,7 @@ pub use self::trigger::{ pub use self::value::{ escape_double_quote_string, escape_quoted_string, DateTimeField, DollarQuotedString, - TrimWhereField, Value, + NormalizationForm, TrimWhereField, Value, }; use crate::ast::helpers::stmt_data_loading::{ @@ -653,6 +653,12 @@ pub enum Expr { IsDistinctFrom(Box, Box), /// `IS NOT DISTINCT FROM` operator IsNotDistinctFrom(Box, Box), + /// ` IS [ NOT ] [ form ] NORMALIZED` + IsNormalized { + expr: Box, + form: Option, + negated: bool, + }, /// `[ NOT ] IN (val1, val2, ...)` InList { expr: Box, @@ -1118,7 +1124,7 @@ impl fmt::Display for LambdaFunction { /// `OneOrManyWithParens` implements `Deref` and `IntoIterator`, /// so you can call slice methods on it and iterate over items /// # Examples -/// Acessing as a slice: +/// Accessing as a slice: /// ``` /// # use sqlparser::ast::OneOrManyWithParens; /// let one = OneOrManyWithParens::One("a"); @@ -1419,6 +1425,24 @@ impl fmt::Display for Expr { if *regexp { "REGEXP" } else { "RLIKE" }, pattern ), + Expr::IsNormalized { + expr, + form, + negated, + } => { + let not_ = if *negated { "NOT " } else { "" }; + if form.is_none() { + write!(f, "{} IS {}NORMALIZED", expr, not_) + } else { + write!( + f, + "{} IS {}{} NORMALIZED", + expr, + not_, + form.as_ref().unwrap() + ) + } + } Expr::SimilarTo { negated, 
expr, @@ -7749,7 +7773,7 @@ where /// ```sql /// EXPLAIN (ANALYZE, VERBOSE TRUE, FORMAT TEXT) SELECT * FROM my_table; /// -/// VACCUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table; +/// VACUUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table; /// ``` #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/ast/query.rs b/src/ast/query.rs index e7020ae23..9bcdc2e74 100644 --- a/src/ast/query.rs +++ b/src/ast/query.rs @@ -2821,10 +2821,10 @@ impl fmt::Display for ValueTableMode { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] pub enum UpdateTableFromKind { - /// Update Statment where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake) + /// Update Statement where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake) /// For Example: `UPDATE FROM t1 SET t1.name='aaa'` BeforeSet(TableWithJoins), - /// Update Statment where the 'FROM' clause is after the 'SET' keyword (Which is the standard way) + /// Update Statement where the 'FROM' clause is after the 'SET' keyword (Which is the standard way) /// For Example: `UPDATE SET t1.name='aaa' FROM t1` AfterSet(TableWithJoins), } diff --git a/src/ast/spans.rs b/src/ast/spans.rs index 183bebf8c..fca2865bd 100644 --- a/src/ast/spans.rs +++ b/src/ast/spans.rs @@ -1324,6 +1324,12 @@ impl Spanned for Expr { escape_char: _, any: _, } => expr.span().union(&pattern.span()), + Expr::RLike { .. } => Span::empty(), + Expr::IsNormalized { + expr, + form: _, + negated: _, + } => expr.span(), Expr::SimilarTo { negated: _, expr, @@ -1359,7 +1365,6 @@ impl Spanned for Expr { Expr::Array(array) => array.span(), Expr::MatchAgainst { .. } => Span::empty(), Expr::JsonAccess { value, path } => value.span().union(&path.span()), - Expr::RLike { .. 
} => Span::empty(), Expr::AnyOp { left, compare_op: _, diff --git a/src/ast/value.rs b/src/ast/value.rs index 45cc06a07..92eaa0228 100644 --- a/src/ast/value.rs +++ b/src/ast/value.rs @@ -270,6 +270,27 @@ impl fmt::Display for DateTimeField { } } +#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] +pub enum NormalizationForm { + NFC, + NFD, + NFKC, + NFKD, +} + +impl fmt::Display for NormalizationForm { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + NormalizationForm::NFC => write!(f, "NFC"), + NormalizationForm::NFD => write!(f, "NFD"), + NormalizationForm::NFKC => write!(f, "NFKC"), + NormalizationForm::NFKD => write!(f, "NFKD"), + } + } +} + pub struct EscapeQuotedString<'a> { string: &'a str, quote: char, diff --git a/src/keywords.rs b/src/keywords.rs index eb9e3ea6f..2a88bf345 100644 --- a/src/keywords.rs +++ b/src/keywords.rs @@ -539,6 +539,7 @@ define_keywords!( NOORDER, NOREPLICATION, NORMALIZE, + NORMALIZED, NOSCAN, NOSUPERUSER, NOT, diff --git a/src/parser/mod.rs b/src/parser/mod.rs index ac764a535..c87e221fb 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3183,9 +3183,15 @@ impl<'a> Parser<'a> { { let expr2 = self.parse_expr()?; Ok(Expr::IsNotDistinctFrom(Box::new(expr), Box::new(expr2))) + } else if let Ok((form, negated)) = self.parse_unicode_is_normalized() { + Ok(Expr::IsNormalized { + expr: Box::new(expr), + form, + negated, + }) } else { self.expected( - "[NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS", + "[NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS", self.peek_token(), ) } @@ -3850,7 +3856,7 @@ impl<'a> Parser<'a> { /// If the current token is the `expected` keyword, consume the token. /// Otherwise, return an error. 
/// - // todo deprecate infavor of expected_keyword_is + // todo deprecate in favor of expected_keyword_is pub fn expect_keyword(&mut self, expected: Keyword) -> Result { if self.parse_keyword(expected) { Ok(self.get_current_token().clone()) @@ -8452,6 +8458,42 @@ impl<'a> Parser<'a> { } } + /// Parse a literal unicode normalization clause + pub fn parse_unicode_is_normalized( + &mut self, + ) -> Result<(Option, bool), ParserError> { + let neg = self.parse_keyword(Keyword::NOT); + if self.parse_keyword(Keyword::NORMALIZED) { + return Ok((None, neg)); + } + let index = self.index; + let next_token = self.next_token(); + let normalized_form = if let Token::Word(Word { + value: ref s, + quote_style: None, + keyword: Keyword::NoKeyword, + }) = next_token.token + { + match s.to_uppercase().as_str() { + "NFC" => Some(NormalizationForm::NFC), + "NFD" => Some(NormalizationForm::NFD), + "NFKC" => Some(NormalizationForm::NFKC), + "NFKD" => Some(NormalizationForm::NFKD), + _ => { + self.index = index; + return self.expected("unicode normalization", next_token); + } + } + } else { + None + }; + if self.parse_keyword(Keyword::NORMALIZED) { + return Ok((normalized_form, neg)); + } + self.index = index; + self.expected("unicode normalization", self.peek_token()) + } + pub fn parse_enum_values(&mut self) -> Result, ParserError> { self.expect_token(&Token::LParen)?; let values = self.parse_comma_separated(|parser| { @@ -8957,7 +8999,7 @@ impl<'a> Parser<'a> { } } - /// Parse a table object for insetion + /// Parse a table object for insertion /// e.g. 
`some_database.some_table` or `FUNCTION some_table_func(...)` pub fn parse_table_object(&mut self) -> Result { if self.dialect.supports_insert_table_function() && self.parse_keyword(Keyword::FUNCTION) { @@ -11867,7 +11909,7 @@ impl<'a> Parser<'a> { } else { let mut name = self.parse_grantee_name()?; if self.consume_token(&Token::Colon) { - // Redshift supports namespace prefix for extenrnal users and groups: + // Redshift supports namespace prefix for external users and groups: // : or : // https://docs.aws.amazon.com/redshift/latest/mgmt/redshift-iam-access-control-native-idp.html let ident = self.parse_identifier()?; @@ -12863,7 +12905,7 @@ impl<'a> Parser<'a> { Ok(WithFill { from, to, step }) } - // Parse a set of comma seperated INTERPOLATE expressions (ClickHouse dialect) + // Parse a set of comma separated INTERPOLATE expressions (ClickHouse dialect) // that follow the INTERPOLATE keyword in an ORDER BY clause with the WITH FILL modifier pub fn parse_interpolations(&mut self) -> Result, ParserError> { if !self.parse_keyword(Keyword::INTERPOLATE) { @@ -14372,7 +14414,7 @@ mod tests { assert_eq!( ast, Err(ParserError::ParserError( - "Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: a at Line: 1, Column: 16" + "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: a at Line: 1, Column: 16" .to_string() )) ); diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index 07a30bc08..cf10a70ee 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -4579,7 +4579,7 @@ fn run_explain_analyze( expected_verbose: bool, expected_analyze: bool, expected_format: Option, - exepcted_options: Option>, + expected_options: Option>, ) { match dialect.verified_stmt(query) { Statement::Explain { @@ -4595,7 +4595,7 @@ fn run_explain_analyze( assert_eq!(verbose, expected_verbose); assert_eq!(analyze, expected_analyze); assert_eq!(format, expected_format); - assert_eq!(options, exepcted_options); 
+ assert_eq!(options, expected_options); assert!(!query_plan); assert!(!estimate); assert_eq!("SELECT sqrt(id) FROM foo", statement.to_string()); @@ -9296,6 +9296,46 @@ fn parse_is_boolean() { verified_expr(sql) ); + let sql = "a IS NORMALIZED"; + assert_eq!( + IsNormalized { + expr: Box::new(Identifier(Ident::new("a"))), + form: None, + negated: false, + }, + verified_expr(sql) + ); + + let sql = "a IS NOT NORMALIZED"; + assert_eq!( + IsNormalized { + expr: Box::new(Identifier(Ident::new("a"))), + form: None, + negated: true, + }, + verified_expr(sql) + ); + + let sql = "a IS NFKC NORMALIZED"; + assert_eq!( + IsNormalized { + expr: Box::new(Identifier(Ident::new("a"))), + form: Some(NormalizationForm::NFKC), + negated: false, + }, + verified_expr(sql) + ); + + let sql = "a IS NOT NFKD NORMALIZED"; + assert_eq!( + IsNormalized { + expr: Box::new(Identifier(Ident::new("a"))), + form: Some(NormalizationForm::NFKD), + negated: true, + }, + verified_expr(sql) + ); + let sql = "a IS UNKNOWN"; assert_eq!( IsUnknown(Box::new(Identifier(Ident::new("a")))), @@ -9314,6 +9354,12 @@ fn parse_is_boolean() { verified_stmt("SELECT f FROM foo WHERE field IS FALSE"); verified_stmt("SELECT f FROM foo WHERE field IS NOT FALSE"); + verified_stmt("SELECT f FROM foo WHERE field IS NORMALIZED"); + verified_stmt("SELECT f FROM foo WHERE field IS NFC NORMALIZED"); + verified_stmt("SELECT f FROM foo WHERE field IS NFD NORMALIZED"); + verified_stmt("SELECT f FROM foo WHERE field IS NOT NORMALIZED"); + verified_stmt("SELECT f FROM foo WHERE field IS NOT NFKC NORMALIZED"); + verified_stmt("SELECT f FROM foo WHERE field IS UNKNOWN"); verified_stmt("SELECT f FROM foo WHERE field IS NOT UNKNOWN"); @@ -9321,7 +9367,37 @@ fn parse_is_boolean() { let res = parse_sql_statements(sql); assert_eq!( ParserError::ParserError( - "Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: 0" + "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: 0" + 
.to_string() + ), + res.unwrap_err() + ); + + let sql = "SELECT s, s IS XYZ NORMALIZED FROM foo"; + let res = parse_sql_statements(sql); + assert_eq!( + ParserError::ParserError( + "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: XYZ" + .to_string() + ), + res.unwrap_err() + ); + + let sql = "SELECT s, s IS NFKC FROM foo"; + let res = parse_sql_statements(sql); + assert_eq!( + ParserError::ParserError( + "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: NFKC" + .to_string() + ), + res.unwrap_err() + ); + + let sql = "SELECT s, s IS TRIM(' NFKC ') FROM foo"; + let res = parse_sql_statements(sql); + assert_eq!( + ParserError::ParserError( + "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: TRIM" .to_string() ), res.unwrap_err() @@ -12982,7 +13058,7 @@ fn test_trailing_commas_in_from() { let sql = "SELECT a FROM b, WHERE c = 1"; let _ = dialects.parse_sql_statements(sql).unwrap(); - // nasted + // nested let sql = "SELECT 1, 2 FROM (SELECT * FROM t,),"; let _ = dialects.parse_sql_statements(sql).unwrap(); diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs index dcf3f57fe..e93ac5695 100644 --- a/tests/sqlparser_mysql.rs +++ b/tests/sqlparser_mysql.rs @@ -2572,7 +2572,7 @@ fn parse_kill() { } #[test] -fn parse_table_colum_option_on_update() { +fn parse_table_column_option_on_update() { let sql1 = "CREATE TABLE foo (`modification_time` DATETIME ON UPDATE CURRENT_TIMESTAMP())"; match mysql().verified_stmt(sql1) { Statement::CreateTable(CreateTable { name, columns, .. 
}) => { From 458729e9c61550436e5a4eed19c4b7788438a841 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Thu, 16 Jan 2025 16:04:24 +0400 Subject: [PATCH 2/4] add inline docs/explanation for NormalizationForm enum values --- src/ast/value.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/ast/value.rs b/src/ast/value.rs index 92eaa0228..1b16646be 100644 --- a/src/ast/value.rs +++ b/src/ast/value.rs @@ -273,10 +273,18 @@ impl fmt::Display for DateTimeField { #[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] +/// The Unicode Standard defines four normalization forms, which are intended to eliminate +/// certain distinctions between visually or functionally identical characters. +/// +/// See [Unicode Normalization Forms](https://unicode.org/reports/tr15/) for details. pub enum NormalizationForm { + /// Canonical Decomposition, followed by Canonical Composition. NFC, + /// Canonical Decomposition. NFD, + /// Compatibility Decomposition, followed by Canonical Composition. NFKC, + /// Compatibility Decomposition. 
NFKD, } From 8f632db61175190a36d23245c8581a0712df11c6 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 17 Jan 2025 12:26:43 +0400 Subject: [PATCH 3/4] take advantage of `maybe_parse`, make normalization forms keywords --- src/keywords.rs | 4 ++++ src/parser/mod.rs | 39 ++++++++++++++------------------------- tests/sqlparser_common.rs | 2 +- 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/src/keywords.rs b/src/keywords.rs index 2a88bf345..6a5677b54 100644 --- a/src/keywords.rs +++ b/src/keywords.rs @@ -529,6 +529,10 @@ define_keywords!( NESTED, NEW, NEXT, + NFC, + NFD, + NFKC, + NFKD, NO, NOBYPASSRLS, NOCREATEDB, diff --git a/src/parser/mod.rs b/src/parser/mod.rs index c87e221fb..fb0dadd56 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -8463,35 +8463,24 @@ impl<'a> Parser<'a> { &mut self, ) -> Result<(Option<NormalizationForm>, bool), ParserError> { let neg = self.parse_keyword(Keyword::NOT); - if self.parse_keyword(Keyword::NORMALIZED) { - return Ok((None, neg)); - } - let index = self.index; - let next_token = self.next_token(); - let normalized_form = if let Token::Word(Word { - value: ref s, - quote_style: None, - keyword: Keyword::NoKeyword, - }) = next_token.token - { - match s.to_uppercase().as_str() { - "NFC" => Some(NormalizationForm::NFC), - "NFD" => Some(NormalizationForm::NFD), - "NFKC" => Some(NormalizationForm::NFKC), - "NFKD" => Some(NormalizationForm::NFKD), - _ => { - self.index = index; - return self.expected("unicode normalization", next_token); - } + let normalized_form = self.maybe_parse(|parser| { + match parser.parse_one_of_keywords(&[ + Keyword::NFC, + Keyword::NFD, + Keyword::NFKC, + Keyword::NFKD, + ]) { + Some(Keyword::NFC) => Ok(NormalizationForm::NFC), + Some(Keyword::NFD) => Ok(NormalizationForm::NFD), + Some(Keyword::NFKC) => Ok(NormalizationForm::NFKC), + Some(Keyword::NFKD) => Ok(NormalizationForm::NFKD), + _ => parser.expected("unicode normalization form", parser.peek_token()), } - } else { - None - }; + })?; if
self.parse_keyword(Keyword::NORMALIZED) { return Ok((normalized_form, neg)); } - self.index = index; - self.expected("unicode normalization", self.peek_token()) + self.expected("unicode normalization form", self.peek_token()) } pub fn parse_enum_values(&mut self) -> Result, ParserError> { diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index cf10a70ee..bbc252429 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -9387,7 +9387,7 @@ fn parse_is_boolean() { let res = parse_sql_statements(sql); assert_eq!( ParserError::ParserError( - "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: NFKC" + "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: FROM" .to_string() ), res.unwrap_err() From 99bb1b79af1111ddca256eb4583c68de9fe43436 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 17 Jan 2025 13:52:28 +0400 Subject: [PATCH 4/4] update return type as `Result` --- src/parser/mod.rs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index fb0dadd56..786a0701a 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3183,12 +3183,8 @@ impl<'a> Parser<'a> { { let expr2 = self.parse_expr()?; Ok(Expr::IsNotDistinctFrom(Box::new(expr), Box::new(expr2))) - } else if let Ok((form, negated)) = self.parse_unicode_is_normalized() { - Ok(Expr::IsNormalized { - expr: Box::new(expr), - form, - negated, - }) + } else if let Ok(is_normalized) = self.parse_unicode_is_normalized(expr) { + Ok(is_normalized) } else { self.expected( "[NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS", @@ -8459,9 +8455,7 @@ impl<'a> Parser<'a> { } /// Parse a literal unicode normalization clause - pub fn parse_unicode_is_normalized( - &mut self, - ) -> Result<(Option<NormalizationForm>, bool), ParserError> { + pub fn parse_unicode_is_normalized(&mut self, expr: Expr) -> Result<Expr, ParserError> { let neg =
self.parse_keyword(Keyword::NOT); let normalized_form = self.maybe_parse(|parser| { match parser.parse_one_of_keywords(&[ @@ -8478,7 +8472,11 @@ impl<'a> Parser<'a> { } })?; if self.parse_keyword(Keyword::NORMALIZED) { - return Ok((normalized_form, neg)); + return Ok(Expr::IsNormalized { + expr: Box::new(expr), + form: normalized_form, + negated: neg, + }); } self.expected("unicode normalization form", self.peek_token()) }