Skip to content

Commit 8ba087e

Browse files
alexander-beedie (Alexander Beedie)
authored and committed
Add support for IS [NOT] [form] NORMALIZED (apache#1655)
Co-authored-by: Alexander Beedie <[email protected]>
1 parent 501697b commit 8ba087e

File tree

8 files changed

+185
-17
lines changed

8 files changed

+185
-17
lines changed

src/ast/mod.rs

+27-3
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ pub use self::trigger::{
8383

8484
pub use self::value::{
8585
escape_double_quote_string, escape_quoted_string, DateTimeField, DollarQuotedString,
86-
TrimWhereField, Value,
86+
NormalizationForm, TrimWhereField, Value,
8787
};
8888

8989
use crate::ast::helpers::stmt_data_loading::{
@@ -653,6 +653,12 @@ pub enum Expr {
653653
IsDistinctFrom(Box<Expr>, Box<Expr>),
654654
/// `IS NOT DISTINCT FROM` operator
655655
IsNotDistinctFrom(Box<Expr>, Box<Expr>),
656+
/// `<expr> IS [ NOT ] [ form ] NORMALIZED`
657+
IsNormalized {
658+
expr: Box<Expr>,
659+
form: Option<NormalizationForm>,
660+
negated: bool,
661+
},
656662
/// `[ NOT ] IN (val1, val2, ...)`
657663
InList {
658664
expr: Box<Expr>,
@@ -1118,7 +1124,7 @@ impl fmt::Display for LambdaFunction {
11181124
/// `OneOrManyWithParens` implements `Deref<Target = [T]>` and `IntoIterator`,
11191125
/// so you can call slice methods on it and iterate over items
11201126
/// # Examples
1121-
/// Acessing as a slice:
1127+
/// Accessing as a slice:
11221128
/// ```
11231129
/// # use sqlparser::ast::OneOrManyWithParens;
11241130
/// let one = OneOrManyWithParens::One("a");
@@ -1419,6 +1425,24 @@ impl fmt::Display for Expr {
14191425
if *regexp { "REGEXP" } else { "RLIKE" },
14201426
pattern
14211427
),
1428+
Expr::IsNormalized {
1429+
expr,
1430+
form,
1431+
negated,
1432+
} => {
1433+
let not_ = if *negated { "NOT " } else { "" };
1434+
if form.is_none() {
1435+
write!(f, "{} IS {}NORMALIZED", expr, not_)
1436+
} else {
1437+
write!(
1438+
f,
1439+
"{} IS {}{} NORMALIZED",
1440+
expr,
1441+
not_,
1442+
form.as_ref().unwrap()
1443+
)
1444+
}
1445+
}
14221446
Expr::SimilarTo {
14231447
negated,
14241448
expr,
@@ -7979,7 +8003,7 @@ where
79798003
/// ```sql
79808004
/// EXPLAIN (ANALYZE, VERBOSE TRUE, FORMAT TEXT) SELECT * FROM my_table;
79818005
///
7982-
/// VACCUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table;
8006+
/// VACUUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table;
79838007
/// ```
79848008
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
79858009
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]

src/ast/query.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -2821,10 +2821,10 @@ impl fmt::Display for ValueTableMode {
28212821
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
28222822
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
28232823
pub enum UpdateTableFromKind {
2824-
/// Update Statment where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake)
2824+
/// Update Statement where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake)
28252825
/// For Example: `UPDATE FROM t1 SET t1.name='aaa'`
28262826
BeforeSet(TableWithJoins),
2827-
/// Update Statment where the 'FROM' clause is after the 'SET' keyword (Which is the standard way)
2827+
/// Update Statement where the 'FROM' clause is after the 'SET' keyword (Which is the standard way)
28282828
/// For Example: `UPDATE SET t1.name='aaa' FROM t1`
28292829
AfterSet(TableWithJoins),
28302830
}

src/ast/spans.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -1352,6 +1352,12 @@ impl Spanned for Expr {
13521352
escape_char: _,
13531353
any: _,
13541354
} => expr.span().union(&pattern.span()),
1355+
Expr::RLike { .. } => Span::empty(),
1356+
Expr::IsNormalized {
1357+
expr,
1358+
form: _,
1359+
negated: _,
1360+
} => expr.span(),
13551361
Expr::SimilarTo {
13561362
negated: _,
13571363
expr,
@@ -1387,7 +1393,6 @@ impl Spanned for Expr {
13871393
Expr::Array(array) => array.span(),
13881394
Expr::MatchAgainst { .. } => Span::empty(),
13891395
Expr::JsonAccess { value, path } => value.span().union(&path.span()),
1390-
Expr::RLike { .. } => Span::empty(),
13911396
Expr::AnyOp {
13921397
left,
13931398
compare_op: _,

src/ast/value.rs

+29
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,35 @@ impl fmt::Display for DateTimeField {
270270
}
271271
}
272272

273+
/// A Unicode normalization form, as optionally named in
/// `<expr> IS [ NOT ] [ form ] NORMALIZED`.
///
/// The Unicode Standard defines four normalization forms, which are intended to eliminate
/// certain distinctions between visually or functionally identical characters.
///
/// See [Unicode Normalization Forms](https://unicode.org/reports/tr15/) for details.
// `Copy` is derived because every variant is a plain, field-less tag.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum NormalizationForm {
    /// Canonical Decomposition, followed by Canonical Composition.
    NFC,
    /// Canonical Decomposition.
    NFD,
    /// Compatibility Decomposition, followed by Canonical Composition.
    NFKC,
    /// Compatibility Decomposition.
    NFKD,
}

/// Renders the form as its upper-case SQL keyword: `NFC`, `NFD`, `NFKC` or `NFKD`.
impl fmt::Display for NormalizationForm {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The keyword is a fixed string, so no format machinery is needed.
        f.write_str(match self {
            NormalizationForm::NFC => "NFC",
            NormalizationForm::NFD => "NFD",
            NormalizationForm::NFKC => "NFKC",
            NormalizationForm::NFKD => "NFKD",
        })
    }
}
301+
273302
pub struct EscapeQuotedString<'a> {
274303
string: &'a str,
275304
quote: char,

src/keywords.rs

+5
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,10 @@ define_keywords!(
535535
NESTED,
536536
NEW,
537537
NEXT,
538+
NFC,
539+
NFD,
540+
NFKC,
541+
NFKD,
538542
NO,
539543
NOBYPASSRLS,
540544
NOCREATEDB,
@@ -545,6 +549,7 @@ define_keywords!(
545549
NOORDER,
546550
NOREPLICATION,
547551
NORMALIZE,
552+
NORMALIZED,
548553
NOSCAN,
549554
NOSUPERUSER,
550555
NOT,

src/parser/mod.rs

+35-6
Original file line numberDiff line numberDiff line change
@@ -3184,9 +3184,11 @@ impl<'a> Parser<'a> {
31843184
{
31853185
let expr2 = self.parse_expr()?;
31863186
Ok(Expr::IsNotDistinctFrom(Box::new(expr), Box::new(expr2)))
3187+
} else if let Ok(is_normalized) = self.parse_unicode_is_normalized(expr) {
3188+
Ok(is_normalized)
31873189
} else {
31883190
self.expected(
3189-
"[NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS",
3191+
"[NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS",
31903192
self.peek_token(),
31913193
)
31923194
}
@@ -3851,7 +3853,7 @@ impl<'a> Parser<'a> {
38513853
/// If the current token is the `expected` keyword, consume the token.
38523854
/// Otherwise, return an error.
38533855
///
3854-
// todo deprecate infavor of expected_keyword_is
3856+
// todo deprecate in favor of expected_keyword_is
38553857
pub fn expect_keyword(&mut self, expected: Keyword) -> Result<TokenWithSpan, ParserError> {
38563858
if self.parse_keyword(expected) {
38573859
Ok(self.get_current_token().clone())
@@ -8453,6 +8455,33 @@ impl<'a> Parser<'a> {
84538455
}
84548456
}
84558457

8458+
/// Parse a literal unicode normalization clause
8459+
pub fn parse_unicode_is_normalized(&mut self, expr: Expr) -> Result<Expr, ParserError> {
8460+
let neg = self.parse_keyword(Keyword::NOT);
8461+
let normalized_form = self.maybe_parse(|parser| {
8462+
match parser.parse_one_of_keywords(&[
8463+
Keyword::NFC,
8464+
Keyword::NFD,
8465+
Keyword::NFKC,
8466+
Keyword::NFKD,
8467+
]) {
8468+
Some(Keyword::NFC) => Ok(NormalizationForm::NFC),
8469+
Some(Keyword::NFD) => Ok(NormalizationForm::NFD),
8470+
Some(Keyword::NFKC) => Ok(NormalizationForm::NFKC),
8471+
Some(Keyword::NFKD) => Ok(NormalizationForm::NFKD),
8472+
_ => parser.expected("unicode normalization form", parser.peek_token()),
8473+
}
8474+
})?;
8475+
if self.parse_keyword(Keyword::NORMALIZED) {
8476+
return Ok(Expr::IsNormalized {
8477+
expr: Box::new(expr),
8478+
form: normalized_form,
8479+
negated: neg,
8480+
});
8481+
}
8482+
self.expected("unicode normalization form", self.peek_token())
8483+
}
8484+
84568485
pub fn parse_enum_values(&mut self) -> Result<Vec<EnumMember>, ParserError> {
84578486
self.expect_token(&Token::LParen)?;
84588487
let values = self.parse_comma_separated(|parser| {
@@ -8979,7 +9008,7 @@ impl<'a> Parser<'a> {
89799008
}
89809009
}
89819010

8982-
/// Parse a table object for insetion
9011+
/// Parse a table object for insertion
89839012
/// e.g. `some_database.some_table` or `FUNCTION some_table_func(...)`
89849013
pub fn parse_table_object(&mut self) -> Result<TableObject, ParserError> {
89859014
if self.dialect.supports_insert_table_function() && self.parse_keyword(Keyword::FUNCTION) {
@@ -11887,7 +11916,7 @@ impl<'a> Parser<'a> {
1188711916
} else {
1188811917
let mut name = self.parse_grantee_name()?;
1188911918
if self.consume_token(&Token::Colon) {
11890-
// Redshift supports namespace prefix for extenrnal users and groups:
11919+
// Redshift supports namespace prefix for external users and groups:
1189111920
// <Namespace>:<GroupName> or <Namespace>:<UserName>
1189211921
// https://docs.aws.amazon.com/redshift/latest/mgmt/redshift-iam-access-control-native-idp.html
1189311922
let ident = self.parse_identifier()?;
@@ -12883,7 +12912,7 @@ impl<'a> Parser<'a> {
1288312912
Ok(WithFill { from, to, step })
1288412913
}
1288512914

12886-
// Parse a set of comma seperated INTERPOLATE expressions (ClickHouse dialect)
12915+
// Parse a set of comma separated INTERPOLATE expressions (ClickHouse dialect)
1288712916
// that follow the INTERPOLATE keyword in an ORDER BY clause with the WITH FILL modifier
1288812917
pub fn parse_interpolations(&mut self) -> Result<Option<Interpolate>, ParserError> {
1288912918
if !self.parse_keyword(Keyword::INTERPOLATE) {
@@ -14432,7 +14461,7 @@ mod tests {
1443214461
assert_eq!(
1443314462
ast,
1443414463
Err(ParserError::ParserError(
14435-
"Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: a at Line: 1, Column: 16"
14464+
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: a at Line: 1, Column: 16"
1443614465
.to_string()
1443714466
))
1443814467
);

tests/sqlparser_common.rs

+80-4
Original file line numberDiff line numberDiff line change
@@ -4600,7 +4600,7 @@ fn run_explain_analyze(
46004600
expected_verbose: bool,
46014601
expected_analyze: bool,
46024602
expected_format: Option<AnalyzeFormat>,
4603-
exepcted_options: Option<Vec<UtilityOption>>,
4603+
expected_options: Option<Vec<UtilityOption>>,
46044604
) {
46054605
match dialect.verified_stmt(query) {
46064606
Statement::Explain {
@@ -4616,7 +4616,7 @@ fn run_explain_analyze(
46164616
assert_eq!(verbose, expected_verbose);
46174617
assert_eq!(analyze, expected_analyze);
46184618
assert_eq!(format, expected_format);
4619-
assert_eq!(options, exepcted_options);
4619+
assert_eq!(options, expected_options);
46204620
assert!(!query_plan);
46214621
assert!(!estimate);
46224622
assert_eq!("SELECT sqrt(id) FROM foo", statement.to_string());
@@ -9317,6 +9317,46 @@ fn parse_is_boolean() {
93179317
verified_expr(sql)
93189318
);
93199319

9320+
let sql = "a IS NORMALIZED";
9321+
assert_eq!(
9322+
IsNormalized {
9323+
expr: Box::new(Identifier(Ident::new("a"))),
9324+
form: None,
9325+
negated: false,
9326+
},
9327+
verified_expr(sql)
9328+
);
9329+
9330+
let sql = "a IS NOT NORMALIZED";
9331+
assert_eq!(
9332+
IsNormalized {
9333+
expr: Box::new(Identifier(Ident::new("a"))),
9334+
form: None,
9335+
negated: true,
9336+
},
9337+
verified_expr(sql)
9338+
);
9339+
9340+
let sql = "a IS NFKC NORMALIZED";
9341+
assert_eq!(
9342+
IsNormalized {
9343+
expr: Box::new(Identifier(Ident::new("a"))),
9344+
form: Some(NormalizationForm::NFKC),
9345+
negated: false,
9346+
},
9347+
verified_expr(sql)
9348+
);
9349+
9350+
let sql = "a IS NOT NFKD NORMALIZED";
9351+
assert_eq!(
9352+
IsNormalized {
9353+
expr: Box::new(Identifier(Ident::new("a"))),
9354+
form: Some(NormalizationForm::NFKD),
9355+
negated: true,
9356+
},
9357+
verified_expr(sql)
9358+
);
9359+
93209360
let sql = "a IS UNKNOWN";
93219361
assert_eq!(
93229362
IsUnknown(Box::new(Identifier(Ident::new("a")))),
@@ -9335,14 +9375,50 @@ fn parse_is_boolean() {
93359375
verified_stmt("SELECT f FROM foo WHERE field IS FALSE");
93369376
verified_stmt("SELECT f FROM foo WHERE field IS NOT FALSE");
93379377

9378+
verified_stmt("SELECT f FROM foo WHERE field IS NORMALIZED");
9379+
verified_stmt("SELECT f FROM foo WHERE field IS NFC NORMALIZED");
9380+
verified_stmt("SELECT f FROM foo WHERE field IS NFD NORMALIZED");
9381+
verified_stmt("SELECT f FROM foo WHERE field IS NOT NORMALIZED");
9382+
verified_stmt("SELECT f FROM foo WHERE field IS NOT NFKC NORMALIZED");
9383+
93389384
verified_stmt("SELECT f FROM foo WHERE field IS UNKNOWN");
93399385
verified_stmt("SELECT f FROM foo WHERE field IS NOT UNKNOWN");
93409386

93419387
let sql = "SELECT f from foo where field is 0";
93429388
let res = parse_sql_statements(sql);
93439389
assert_eq!(
93449390
ParserError::ParserError(
9345-
"Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: 0"
9391+
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: 0"
9392+
.to_string()
9393+
),
9394+
res.unwrap_err()
9395+
);
9396+
9397+
let sql = "SELECT s, s IS XYZ NORMALIZED FROM foo";
9398+
let res = parse_sql_statements(sql);
9399+
assert_eq!(
9400+
ParserError::ParserError(
9401+
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: XYZ"
9402+
.to_string()
9403+
),
9404+
res.unwrap_err()
9405+
);
9406+
9407+
let sql = "SELECT s, s IS NFKC FROM foo";
9408+
let res = parse_sql_statements(sql);
9409+
assert_eq!(
9410+
ParserError::ParserError(
9411+
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: FROM"
9412+
.to_string()
9413+
),
9414+
res.unwrap_err()
9415+
);
9416+
9417+
let sql = "SELECT s, s IS TRIM(' NFKC ') FROM foo";
9418+
let res = parse_sql_statements(sql);
9419+
assert_eq!(
9420+
ParserError::ParserError(
9421+
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: TRIM"
93469422
.to_string()
93479423
),
93489424
res.unwrap_err()
@@ -13003,7 +13079,7 @@ fn test_trailing_commas_in_from() {
1300313079
let sql = "SELECT a FROM b, WHERE c = 1";
1300413080
let _ = dialects.parse_sql_statements(sql).unwrap();
1300513081

13006-
// nasted
13082+
// nested
1300713083
let sql = "SELECT 1, 2 FROM (SELECT * FROM t,),";
1300813084
let _ = dialects.parse_sql_statements(sql).unwrap();
1300913085

tests/sqlparser_mysql.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -2572,7 +2572,7 @@ fn parse_kill() {
25722572
}
25732573

25742574
#[test]
2575-
fn parse_table_colum_option_on_update() {
2575+
fn parse_table_column_option_on_update() {
25762576
let sql1 = "CREATE TABLE foo (`modification_time` DATETIME ON UPDATE CURRENT_TIMESTAMP())";
25772577
match mysql().verified_stmt(sql1) {
25782578
Statement::CreateTable(CreateTable { name, columns, .. }) => {

0 commit comments

Comments
 (0)