Add support for IS [NOT] [form] NORMALIZED (apache#1655)

alexander-beedie · Alexander Beedie · Vedin · commit 8ba087e65a65 · 2025-02-03T21:13:26.000+02:00
Co-authored-by: Alexander Beedie <alexander.beedie@adia.ae>
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
@@ -83,7 +83,7 @@ pub use self::trigger::{
 
 pub use self::value::{
     escape_double_quote_string, escape_quoted_string, DateTimeField, DollarQuotedString,
-    TrimWhereField, Value,
+    NormalizationForm, TrimWhereField, Value,
 };
 
 use crate::ast::helpers::stmt_data_loading::{
@@ -653,6 +653,12 @@ pub enum Expr {
     IsDistinctFrom(Box<Expr>, Box<Expr>),
     /// `IS NOT DISTINCT FROM` operator
     IsNotDistinctFrom(Box<Expr>, Box<Expr>),
+    /// `<expr> IS [ NOT ] [ form ] NORMALIZED`
+    IsNormalized {
+        expr: Box<Expr>,
+        form: Option<NormalizationForm>,
+        negated: bool,
+    },
     /// `[ NOT ] IN (val1, val2, ...)`
     InList {
         expr: Box<Expr>,
@@ -1118,7 +1124,7 @@ impl fmt::Display for LambdaFunction {
 /// `OneOrManyWithParens` implements `Deref<Target = [T]>` and `IntoIterator`,
 /// so you can call slice methods on it and iterate over items
 /// # Examples
-/// Acessing as a slice:
+/// Accessing as a slice:
 /// ```
 /// # use sqlparser::ast::OneOrManyWithParens;
 /// let one = OneOrManyWithParens::One("a");
@@ -1419,6 +1425,24 @@ impl fmt::Display for Expr {
                 if *regexp { "REGEXP" } else { "RLIKE" },
                 pattern
             ),
+            // Renders `<expr> IS [NOT ][<form> ]NORMALIZED`: the form and its trailing
+            // space are written only when a form was specified.
+            Expr::IsNormalized {
+                expr,
+                form,
+                negated,
+            } => {
+                let not_ = if *negated { "NOT " } else { "" };
+                // Match on the option directly instead of `is_none()` + `unwrap()`.
+                match form {
+                    Some(form) => write!(f, "{} IS {}{} NORMALIZED", expr, not_, form),
+                    None => write!(f, "{} IS {}NORMALIZED", expr, not_),
+                }
+            }
             Expr::SimilarTo {
                 negated,
                 expr,
@@ -7979,7 +8003,7 @@ where
 /// ```sql
 /// EXPLAIN (ANALYZE, VERBOSE TRUE, FORMAT TEXT) SELECT * FROM my_table;
 ///
-/// VACCUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table;
+/// VACUUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table;
 /// ```
 #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
diff --git a/src/ast/query.rs b/src/ast/query.rs
@@ -2821,10 +2821,10 @@ impl fmt::Display for ValueTableMode {
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
 pub enum UpdateTableFromKind {
-    /// Update Statment where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake)
+    /// Update Statement where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake)
     /// For Example: `UPDATE FROM t1 SET t1.name='aaa'`
     BeforeSet(TableWithJoins),
-    /// Update Statment where the 'FROM' clause is after the 'SET' keyword (Which is the standard way)
+    /// Update Statement where the 'FROM' clause is after the 'SET' keyword (Which is the standard way)
     /// For Example: `UPDATE SET t1.name='aaa' FROM t1`
     AfterSet(TableWithJoins),
 }
diff --git a/src/ast/spans.rs b/src/ast/spans.rs
@@ -1352,6 +1352,12 @@ impl Spanned for Expr {
                 escape_char: _,
                 any: _,
             } => expr.span().union(&pattern.span()),
+            Expr::RLike { .. } => Span::empty(),
+            Expr::IsNormalized {
+                expr,
+                form: _,
+                negated: _,
+            } => expr.span(),
             Expr::SimilarTo {
                 negated: _,
                 expr,
@@ -1387,7 +1393,6 @@ impl Spanned for Expr {
             Expr::Array(array) => array.span(),
             Expr::MatchAgainst { .. } => Span::empty(),
             Expr::JsonAccess { value, path } => value.span().union(&path.span()),
-            Expr::RLike { .. } => Span::empty(),
             Expr::AnyOp {
                 left,
                 compare_op: _,
diff --git a/src/ast/value.rs b/src/ast/value.rs
@@ -270,6 +270,35 @@ impl fmt::Display for DateTimeField {
     }
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
+/// A Unicode normalization form: one of the four forms defined by the Unicode Standard to
+/// eliminate certain distinctions between visually or functionally identical characters.
+///
+/// See [Unicode Normalization Forms](https://unicode.org/reports/tr15/) for details.
+pub enum NormalizationForm {
+    /// Normalization Form C: Canonical Decomposition, followed by Canonical Composition.
+    NFC,
+    /// Normalization Form D: Canonical Decomposition.
+    NFD,
+    /// Normalization Form KC: Compatibility Decomposition, followed by Canonical Composition.
+    NFKC,
+    /// Normalization Form KD: Compatibility Decomposition.
+    NFKD,
+}
+
+impl fmt::Display for NormalizationForm {
+    /// Writes the form as its bare keyword: `NFC`, `NFD`, `NFKC` or `NFKD`.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let keyword = match self {
+            NormalizationForm::NFC => "NFC",
+            NormalizationForm::NFD => "NFD",
+            NormalizationForm::NFKC => "NFKC",
+            NormalizationForm::NFKD => "NFKD",
+        };
+        f.write_str(keyword)
+    }
+}
+
 pub struct EscapeQuotedString<'a> {
     string: &'a str,
     quote: char,
diff --git a/src/keywords.rs b/src/keywords.rs
@@ -535,6 +535,10 @@ define_keywords!(
     NESTED,
     NEW,
     NEXT,
+    NFC,
+    NFD,
+    NFKC,
+    NFKD,
     NO,
     NOBYPASSRLS,
     NOCREATEDB,
@@ -545,6 +549,7 @@ define_keywords!(
     NOORDER,
     NOREPLICATION,
     NORMALIZE,
+    NORMALIZED,
     NOSCAN,
     NOSUPERUSER,
     NOT,
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
@@ -3184,9 +3184,11 @@ impl<'a> Parser<'a> {
                     {
                         let expr2 = self.parse_expr()?;
                         Ok(Expr::IsNotDistinctFrom(Box::new(expr), Box::new(expr2)))
+                    } else if let Ok(is_normalized) = self.parse_unicode_is_normalized(expr) {
+                        Ok(is_normalized)
                     } else {
                         self.expected(
-                            "[NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS",
+                            "[NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS",
                             self.peek_token(),
                         )
                     }
@@ -3851,7 +3853,7 @@ impl<'a> Parser<'a> {
     /// If the current token is the `expected` keyword, consume the token.
     /// Otherwise, return an error.
     ///
-    // todo deprecate infavor of expected_keyword_is
+    // todo deprecate in favor of expected_keyword_is
     pub fn expect_keyword(&mut self, expected: Keyword) -> Result<TokenWithSpan, ParserError> {
         if self.parse_keyword(expected) {
             Ok(self.get_current_token().clone())
@@ -8453,6 +8455,33 @@ impl<'a> Parser<'a> {
         }
     }
 
+    /// Parse the `[ NOT ] [ form ] NORMALIZED` tail of an
+    /// `<expr> IS [ NOT ] [ form ] NORMALIZED` predicate.
+    ///
+    /// `expr` is the already-parsed left-hand operand. `form`, when present, is one of
+    /// the keywords `NFC`, `NFD`, `NFKC` or `NFKD`; when absent the resulting
+    /// [`Expr::IsNormalized`] carries `form: None`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a [`ParserError`] when the `NORMALIZED` keyword does not follow. This
+    /// function does not itself put back a `NOT` or form keyword it matched before
+    /// failing.
+    pub fn parse_unicode_is_normalized(&mut self, expr: Expr) -> Result<Expr, ParserError> {
+        let negated = self.parse_keyword(Keyword::NOT);
+        // The form is optional: when no form keyword matches, the closure's error
+        // surfaces as `None` rather than aborting the parse.
+        let form = self.maybe_parse(|parser| {
+            match parser.parse_one_of_keywords(&[
+                Keyword::NFC,
+                Keyword::NFD,
+                Keyword::NFKC,
+                Keyword::NFKD,
+            ]) {
+                Some(Keyword::NFC) => Ok(NormalizationForm::NFC),
+                Some(Keyword::NFD) => Ok(NormalizationForm::NFD),
+                Some(Keyword::NFKC) => Ok(NormalizationForm::NFKC),
+                Some(Keyword::NFKD) => Ok(NormalizationForm::NFKD),
+                _ => parser.expected("unicode normalization form", parser.peek_token()),
+            }
+        })?;
+        if self.parse_keyword(Keyword::NORMALIZED) {
+            return Ok(Expr::IsNormalized {
+                expr: Box::new(expr),
+                form,
+                negated,
+            });
+        }
+        // Name what is actually missing: once a form has been read, only the
+        // `NORMALIZED` keyword can follow.
+        let expected = if form.is_some() {
+            "NORMALIZED"
+        } else {
+            "unicode normalization form or NORMALIZED"
+        };
+        self.expected(expected, self.peek_token())
+    }
+
     pub fn parse_enum_values(&mut self) -> Result<Vec<EnumMember>, ParserError> {
         self.expect_token(&Token::LParen)?;
         let values = self.parse_comma_separated(|parser| {
@@ -8979,7 +9008,7 @@ impl<'a> Parser<'a> {
         }
     }
 
-    /// Parse a table object for insetion
+    /// Parse a table object for insertion
     /// e.g. `some_database.some_table` or `FUNCTION some_table_func(...)`
     pub fn parse_table_object(&mut self) -> Result<TableObject, ParserError> {
         if self.dialect.supports_insert_table_function() && self.parse_keyword(Keyword::FUNCTION) {
@@ -11887,7 +11916,7 @@ impl<'a> Parser<'a> {
             } else {
                 let mut name = self.parse_grantee_name()?;
                 if self.consume_token(&Token::Colon) {
-                    // Redshift supports namespace prefix for extenrnal users and groups:
+                    // Redshift supports namespace prefix for external users and groups:
                     // <Namespace>:<GroupName> or <Namespace>:<UserName>
                     // https://docs.aws.amazon.com/redshift/latest/mgmt/redshift-iam-access-control-native-idp.html
                     let ident = self.parse_identifier()?;
@@ -12883,7 +12912,7 @@ impl<'a> Parser<'a> {
         Ok(WithFill { from, to, step })
     }
 
-    // Parse a set of comma seperated INTERPOLATE expressions (ClickHouse dialect)
+    // Parse a set of comma separated INTERPOLATE expressions (ClickHouse dialect)
     // that follow the INTERPOLATE keyword in an ORDER BY clause with the WITH FILL modifier
     pub fn parse_interpolations(&mut self) -> Result<Option<Interpolate>, ParserError> {
         if !self.parse_keyword(Keyword::INTERPOLATE) {
@@ -14432,7 +14461,7 @@ mod tests {
         assert_eq!(
             ast,
             Err(ParserError::ParserError(
-                "Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: a at Line: 1, Column: 16"
+                "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: a at Line: 1, Column: 16"
                     .to_string()
             ))
         );
diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs
@@ -4600,7 +4600,7 @@ fn run_explain_analyze(
     expected_verbose: bool,
     expected_analyze: bool,
     expected_format: Option<AnalyzeFormat>,
-    exepcted_options: Option<Vec<UtilityOption>>,
+    expected_options: Option<Vec<UtilityOption>>,
 ) {
     match dialect.verified_stmt(query) {
         Statement::Explain {
@@ -4616,7 +4616,7 @@ fn run_explain_analyze(
             assert_eq!(verbose, expected_verbose);
             assert_eq!(analyze, expected_analyze);
             assert_eq!(format, expected_format);
-            assert_eq!(options, exepcted_options);
+            assert_eq!(options, expected_options);
             assert!(!query_plan);
             assert!(!estimate);
             assert_eq!("SELECT sqrt(id) FROM foo", statement.to_string());
@@ -9317,6 +9317,46 @@ fn parse_is_boolean() {
         verified_expr(sql)
     );
 
+    let sql = "a IS NORMALIZED";
+    assert_eq!(
+        IsNormalized {
+            expr: Box::new(Identifier(Ident::new("a"))),
+            form: None,
+            negated: false,
+        },
+        verified_expr(sql)
+    );
+
+    let sql = "a IS NOT NORMALIZED";
+    assert_eq!(
+        IsNormalized {
+            expr: Box::new(Identifier(Ident::new("a"))),
+            form: None,
+            negated: true,
+        },
+        verified_expr(sql)
+    );
+
+    let sql = "a IS NFKC NORMALIZED";
+    assert_eq!(
+        IsNormalized {
+            expr: Box::new(Identifier(Ident::new("a"))),
+            form: Some(NormalizationForm::NFKC),
+            negated: false,
+        },
+        verified_expr(sql)
+    );
+
+    let sql = "a IS NOT NFKD NORMALIZED";
+    assert_eq!(
+        IsNormalized {
+            expr: Box::new(Identifier(Ident::new("a"))),
+            form: Some(NormalizationForm::NFKD),
+            negated: true,
+        },
+        verified_expr(sql)
+    );
+
     let sql = "a IS UNKNOWN";
     assert_eq!(
         IsUnknown(Box::new(Identifier(Ident::new("a")))),
@@ -9335,14 +9375,50 @@ fn parse_is_boolean() {
     verified_stmt("SELECT f FROM foo WHERE field IS FALSE");
     verified_stmt("SELECT f FROM foo WHERE field IS NOT FALSE");
 
+    verified_stmt("SELECT f FROM foo WHERE field IS NORMALIZED");
+    verified_stmt("SELECT f FROM foo WHERE field IS NFC NORMALIZED");
+    verified_stmt("SELECT f FROM foo WHERE field IS NFD NORMALIZED");
+    verified_stmt("SELECT f FROM foo WHERE field IS NOT NORMALIZED");
+    verified_stmt("SELECT f FROM foo WHERE field IS NOT NFKC NORMALIZED");
+
     verified_stmt("SELECT f FROM foo WHERE field IS UNKNOWN");
     verified_stmt("SELECT f FROM foo WHERE field IS NOT UNKNOWN");
 
     let sql = "SELECT f from foo where field is 0";
     let res = parse_sql_statements(sql);
     assert_eq!(
         ParserError::ParserError(
-            "Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: 0"
+            "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: 0"
+                .to_string()
+        ),
+        res.unwrap_err()
+    );
+
+    let sql = "SELECT s, s IS XYZ NORMALIZED FROM foo";
+    let res = parse_sql_statements(sql);
+    assert_eq!(
+        ParserError::ParserError(
+            "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: XYZ"
+                .to_string()
+        ),
+        res.unwrap_err()
+    );
+
+    let sql = "SELECT s, s IS NFKC FROM foo";
+    let res = parse_sql_statements(sql);
+    assert_eq!(
+        ParserError::ParserError(
+            "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: FROM"
+                .to_string()
+        ),
+        res.unwrap_err()
+    );
+
+    let sql = "SELECT s, s IS TRIM(' NFKC ') FROM foo";
+    let res = parse_sql_statements(sql);
+    assert_eq!(
+        ParserError::ParserError(
+            "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: TRIM"
                 .to_string()
         ),
         res.unwrap_err()
@@ -13003,7 +13079,7 @@ fn test_trailing_commas_in_from() {
     let sql = "SELECT a FROM b, WHERE c = 1";
     let _ = dialects.parse_sql_statements(sql).unwrap();
 
-    // nasted
+    // nested
     let sql = "SELECT 1, 2 FROM (SELECT * FROM t,),";
     let _ = dialects.parse_sql_statements(sql).unwrap();
 
diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs
@@ -2572,7 +2572,7 @@ fn parse_kill() {
 }
 
 #[test]
-fn parse_table_colum_option_on_update() {
+fn parse_table_column_option_on_update() {
     let sql1 = "CREATE TABLE foo (`modification_time` DATETIME ON UPDATE CURRENT_TIMESTAMP())";
     match mysql().verified_stmt(sql1) {
         Statement::CreateTable(CreateTable { name, columns, .. }) => {

Original file line number	Diff line number	Diff line change
`@@ -2572,7 +2572,7 @@ fn parse_kill() {`
`2572`	`2572`	`}`
`2573`	`2573`
`2574`	`2574`	`#[test]`
`2575`		`-fn parse_table_colum_option_on_update() {`
	`2575`	`+fn parse_table_column_option_on_update() {`
`2576`	`2576`	let sql1 = "CREATE TABLE foo (`modification_time` DATETIME ON UPDATE CURRENT_TIMESTAMP())";
`2577`	`2577`	`match mysql().verified_stmt(sql1) {`
`2578`	`2578`	`Statement::CreateTable(CreateTable { name, columns, .. }) => {`