PostgreSQL: GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY and GENERATED ALWAYS AS ( generation_expr ) support (apache#832)

samjay000 · web-flow · commit a8a8e65b7c7d · 2023-03-16T05:54:00.000-04:00
* GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( sequence_options ) ] basic impl - tests are failing.

* PostgreSQL GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( sequence_options ) ] and GENERATED ALWAYS AS ( generation_expr ) STORED implementation.
diff --git a/src/ast/ddl.rs b/src/ast/ddl.rs
@@ -24,7 +24,9 @@ use serde::{Deserialize, Serialize};
 use sqlparser_derive::{Visit, VisitMut};
 
 use crate::ast::value::escape_single_quote_string;
-use crate::ast::{display_comma_separated, display_separated, DataType, Expr, Ident, ObjectName};
+use crate::ast::{
+    display_comma_separated, display_separated, DataType, Expr, Ident, ObjectName, SequenceOptions,
+};
 use crate::tokenizer::Token;
 
 /// An `ALTER TABLE` (`Statement::AlterTable`) operation
@@ -575,6 +577,13 @@ pub enum ColumnOption {
     CharacterSet(ObjectName),
     Comment(String),
     OnUpdate(Expr),
+    /// `Generated`s are modifiers that follow a column definition in a `CREATE
+    /// TABLE` statement.
+    Generated {
+        generated_as: GeneratedAs,
+        sequence_options: Option<Vec<SequenceOptions>>,
+        generation_expr: Option<Expr>,
+    },
 }
 
 impl fmt::Display for ColumnOption {
@@ -610,10 +619,63 @@ impl fmt::Display for ColumnOption {
             CharacterSet(n) => write!(f, "CHARACTER SET {n}"),
             Comment(v) => write!(f, "COMMENT '{}'", escape_single_quote_string(v)),
             OnUpdate(expr) => write!(f, "ON UPDATE {expr}"),
+            Generated {
+                generated_as,
+                sequence_options,
+                generation_expr,
+            } => match generated_as {
+                GeneratedAs::Always => {
+                    write!(f, "GENERATED ALWAYS AS IDENTITY")?;
+                    if sequence_options.is_some() {
+                        let so = sequence_options.as_ref().unwrap();
+                        if !so.is_empty() {
+                            write!(f, " (")?;
+                        }
+                        for sequence_option in so {
+                            write!(f, "{sequence_option}")?;
+                        }
+                        if !so.is_empty() {
+                            write!(f, " )")?;
+                        }
+                    }
+                    Ok(())
+                }
+                GeneratedAs::ByDefault => {
+                    write!(f, "GENERATED BY DEFAULT AS IDENTITY")?;
+                    if sequence_options.is_some() {
+                        let so = sequence_options.as_ref().unwrap();
+                        if !so.is_empty() {
+                            write!(f, " (")?;
+                        }
+                        for sequence_option in so {
+                            write!(f, "{sequence_option}")?;
+                        }
+                        if !so.is_empty() {
+                            write!(f, " )")?;
+                        }
+                    }
+                    Ok(())
+                }
+                GeneratedAs::ExpStored => {
+                    let expr = generation_expr.as_ref().unwrap();
+                    write!(f, "GENERATED ALWAYS AS ({expr}) STORED")
+                }
+            },
         }
     }
 }
 
+/// `GeneratedAs` selects the form of a column's `GENERATED` clause: `Always` and `ByDefault`
+/// are the `AS IDENTITY` forms; `ExpStored` (`ALWAYS AS (expr) STORED`) is PostgreSQL-specific.
+#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
+pub enum GeneratedAs {
+    Always,
+    ByDefault,
+    ExpStored,
+}
+
 fn display_constraint_name(name: &'_ Option<Ident>) -> impl fmt::Display + '_ {
     struct ConstraintName<'a>(&'a Option<Ident>);
     impl<'a> fmt::Display for ConstraintName<'a> {
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
@@ -30,7 +30,7 @@ pub use self::data_type::{
 };
 pub use self::ddl::{
     AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ColumnDef, ColumnOption,
-    ColumnOptionDef, IndexType, KeyOrIndexDisplay, ReferentialAction, TableConstraint,
+    ColumnOptionDef, GeneratedAs, IndexType, KeyOrIndexDisplay, ReferentialAction, TableConstraint,
 };
 pub use self::operator::{BinaryOperator, UnaryOperator};
 pub use self::query::{
diff --git a/src/dialect/ansi.rs b/src/dialect/ansi.rs
@@ -17,13 +17,10 @@ pub struct AnsiDialect {}
 
 impl Dialect for AnsiDialect {
     fn is_identifier_start(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch)
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase()
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
-            || ('0'..='9').contains(&ch)
-            || ch == '_'
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
     }
 }
diff --git a/src/dialect/bigquery.rs b/src/dialect/bigquery.rs
@@ -22,13 +22,13 @@ impl Dialect for BigQueryDialect {
     }
 
     fn is_identifier_start(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) || ch == '_'
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_'
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
-            || ('0'..='9').contains(&ch)
+        ch.is_ascii_lowercase()
+            || ch.is_ascii_uppercase()
+            || ch.is_ascii_digit()
             || ch == '_'
             || ch == '-'
     }
diff --git a/src/dialect/clickhouse.rs b/src/dialect/clickhouse.rs
@@ -18,10 +18,10 @@ pub struct ClickHouseDialect {}
 impl Dialect for ClickHouseDialect {
     fn is_identifier_start(&self, ch: char) -> bool {
         // See https://clickhouse.com/docs/en/sql-reference/syntax/#syntax-identifiers
-        ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) || ch == '_'
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_'
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
-        self.is_identifier_start(ch) || ('0'..='9').contains(&ch)
+        self.is_identifier_start(ch) || ch.is_ascii_digit()
     }
 }
diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
@@ -17,17 +17,13 @@ pub struct GenericDialect;
 
 impl Dialect for GenericDialect {
     fn is_identifier_start(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
-            || ch == '_'
-            || ch == '#'
-            || ch == '@'
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_' || ch == '#' || ch == '@'
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
-            || ('0'..='9').contains(&ch)
+        ch.is_ascii_lowercase()
+            || ch.is_ascii_uppercase()
+            || ch.is_ascii_digit()
             || ch == '@'
             || ch == '$'
             || ch == '#'
diff --git a/src/dialect/hive.rs b/src/dialect/hive.rs
@@ -21,16 +21,13 @@ impl Dialect for HiveDialect {
     }
 
     fn is_identifier_start(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
-            || ('0'..='9').contains(&ch)
-            || ch == '$'
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '$'
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
-            || ('0'..='9').contains(&ch)
+        ch.is_ascii_lowercase()
+            || ch.is_ascii_uppercase()
+            || ch.is_ascii_digit()
             || ch == '_'
             || ch == '$'
             || ch == '{'
diff --git a/src/dialect/mssql.rs b/src/dialect/mssql.rs
@@ -23,17 +23,13 @@ impl Dialect for MsSqlDialect {
     fn is_identifier_start(&self, ch: char) -> bool {
         // See https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers?view=sql-server-2017#rules-for-regular-identifiers
         // We don't support non-latin "letters" currently.
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
-            || ch == '_'
-            || ch == '#'
-            || ch == '@'
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_' || ch == '#' || ch == '@'
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
-            || ('0'..='9').contains(&ch)
+        ch.is_ascii_lowercase()
+            || ch.is_ascii_uppercase()
+            || ch.is_ascii_digit()
             || ch == '@'
             || ch == '$'
             || ch == '#'
diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs
@@ -20,16 +20,16 @@ impl Dialect for MySqlDialect {
         // See https://dev.mysql.com/doc/refman/8.0/en/identifiers.html.
         // We don't yet support identifiers beginning with numbers, as that
         // makes it hard to distinguish numeric literals.
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
+        ch.is_ascii_lowercase()
+            || ch.is_ascii_uppercase()
             || ch == '_'
             || ch == '$'
             || ch == '@'
             || ('\u{0080}'..='\u{ffff}').contains(&ch)
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
-        self.is_identifier_start(ch) || ('0'..='9').contains(&ch)
+        self.is_identifier_start(ch) || ch.is_ascii_digit()
     }
 
     fn is_delimited_identifier_start(&self, ch: char) -> bool {
diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs
@@ -24,13 +24,13 @@ impl Dialect for PostgreSqlDialect {
         // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
         // We don't yet support identifiers beginning with "letters with
         // diacritical marks and non-Latin letters"
-        ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) || ch == '_'
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_'
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
-            || ('0'..='9').contains(&ch)
+        ch.is_ascii_lowercase()
+            || ch.is_ascii_uppercase()
+            || ch.is_ascii_digit()
             || ch == '$'
             || ch == '_'
     }
diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs
@@ -18,13 +18,13 @@ pub struct SnowflakeDialect;
 impl Dialect for SnowflakeDialect {
     // see https://docs.snowflake.com/en/sql-reference/identifiers-syntax.html
     fn is_identifier_start(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) || ch == '_'
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_'
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
-            || ('0'..='9').contains(&ch)
+        ch.is_ascii_lowercase()
+            || ch.is_ascii_uppercase()
+            || ch.is_ascii_digit()
             || ch == '$'
             || ch == '_'
     }
diff --git a/src/dialect/sqlite.rs b/src/dialect/sqlite.rs
@@ -28,15 +28,15 @@ impl Dialect for SQLiteDialect {
 
     fn is_identifier_start(&self, ch: char) -> bool {
         // See https://www.sqlite.org/draft/tokenreq.html
-        ('a'..='z').contains(&ch)
-            || ('A'..='Z').contains(&ch)
+        ch.is_ascii_lowercase()
+            || ch.is_ascii_uppercase()
             || ch == '_'
             || ch == '$'
             || ('\u{007f}'..='\u{ffff}').contains(&ch)
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
-        self.is_identifier_start(ch) || ('0'..='9').contains(&ch)
+        self.is_identifier_start(ch) || ch.is_ascii_digit()
     }
 
     fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
diff --git a/src/keywords.rs b/src/keywords.rs
@@ -77,6 +77,7 @@ define_keywords!(
     ALL,
     ALLOCATE,
     ALTER,
+    ALWAYS,
     ANALYZE,
     AND,
     ANTI,
@@ -270,6 +271,7 @@ define_keywords!(
     FUNCTION,
     FUNCTIONS,
     FUSION,
+    GENERATED,
     GET,
     GLOBAL,
     GRANT,
diff --git a/src/parser.rs b/src/parser.rs
@@ -3567,6 +3567,55 @@ impl<'a> Parser<'a> {
         {
             let expr = self.parse_expr()?;
             Ok(Some(ColumnOption::OnUpdate(expr)))
+        } else if self.parse_keyword(Keyword::GENERATED) {
+            self.parse_optional_column_option_generated()
+        } else {
+            Ok(None)
+        }
+    }
+    fn parse_optional_column_option_generated(
+        &mut self,
+    ) -> Result<Option<ColumnOption>, ParserError> {
+        if self.parse_keywords(&[Keyword::ALWAYS, Keyword::AS, Keyword::IDENTITY]) {
+            let mut sequence_options = vec![];
+            if self.expect_token(&Token::LParen).is_ok() {
+                sequence_options = self.parse_create_sequence_options()?;
+                self.expect_token(&Token::RParen)?;
+            }
+            Ok(Some(ColumnOption::Generated {
+                generated_as: GeneratedAs::Always,
+                sequence_options: Some(sequence_options),
+                generation_expr: None,
+            }))
+        } else if self.parse_keywords(&[
+            Keyword::BY,
+            Keyword::DEFAULT,
+            Keyword::AS,
+            Keyword::IDENTITY,
+        ]) {
+            let mut sequence_options = vec![];
+            if self.expect_token(&Token::LParen).is_ok() {
+                sequence_options = self.parse_create_sequence_options()?;
+                self.expect_token(&Token::RParen)?;
+            }
+            Ok(Some(ColumnOption::Generated {
+                generated_as: GeneratedAs::ByDefault,
+                sequence_options: Some(sequence_options),
+                generation_expr: None,
+            }))
+        } else if self.parse_keywords(&[Keyword::ALWAYS, Keyword::AS]) {
+            if self.expect_token(&Token::LParen).is_ok() {
+                let expr = self.parse_expr()?;
+                self.expect_token(&Token::RParen)?;
+                let _ = self.parse_keywords(&[Keyword::STORED]);
+                Ok(Some(ColumnOption::Generated {
+                    generated_as: GeneratedAs::ExpStored,
+                    sequence_options: None,
+                    generation_expr: Some(expr),
+                }))
+            } else {
+                Ok(None)
+            }
         } else {
             Ok(None)
         }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
@@ -596,7 +596,7 @@ impl<'a> Tokenizer<'a> {
                     let word = self.tokenize_word(ch, chars);
 
                     // TODO: implement parsing of exponent here
-                    if word.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
+                    if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
                         let mut inner_state = State {
                             peekable: word.chars().peekable(),
                             line: 0,
diff --git a/tests/sqlparser_custom_dialect.rs b/tests/sqlparser_custom_dialect.rs
@@ -126,13 +126,13 @@ fn custom_statement_parser() -> Result<(), ParserError> {
 }
 
 fn is_identifier_start(ch: char) -> bool {
-    ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) || ch == '_'
+    ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_'
 }
 
 fn is_identifier_part(ch: char) -> bool {
-    ('a'..='z').contains(&ch)
-        || ('A'..='Z').contains(&ch)
-        || ('0'..='9').contains(&ch)
+    ch.is_ascii_lowercase()
+        || ch.is_ascii_uppercase()
+        || ch.is_ascii_digit()
         || ch == '$'
         || ch == '_'
 }
diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs

Original file line number	Diff line number	Diff line change
`@@ -17,13 +17,10 @@ pub struct AnsiDialect {}`
`17`	`17`
`18`	`18`	`impl Dialect for AnsiDialect {`
`19`	`19`	`fn is_identifier_start(&self, ch: char) -> bool {`
`20`		`- ('a'..='z').contains(&ch) \|\| ('A'..='Z').contains(&ch)`
	`20`	`+ ch.is_ascii_lowercase() \|\| ch.is_ascii_uppercase()`
`21`	`21`	`}`
`22`	`22`
`23`	`23`	`fn is_identifier_part(&self, ch: char) -> bool {`
`24`		`- ('a'..='z').contains(&ch)`
`25`		`- \|\| ('A'..='Z').contains(&ch)`
`26`		`- \|\| ('0'..='9').contains(&ch)`
`27`		`- \|\| ch == '_'`
	`24`	`+ ch.is_ascii_lowercase() \|\| ch.is_ascii_uppercase() \|\| ch.is_ascii_digit() \|\| ch == '_'`
`28`	`25`	`}`
`29`	`26`	`}`
Original file line number	Diff line number	Diff line change
`@@ -22,13 +22,13 @@ impl Dialect for BigQueryDialect {`
`22`	`22`	`}`
`23`	`23`
`24`	`24`	`fn is_identifier_start(&self, ch: char) -> bool {`
`25`		`- ('a'..='z').contains(&ch) \|\| ('A'..='Z').contains(&ch) \|\| ch == '_'`
	`25`	`+ ch.is_ascii_lowercase() \|\| ch.is_ascii_uppercase() \|\| ch == '_'`
`26`	`26`	`}`
`27`	`27`
`28`	`28`	`fn is_identifier_part(&self, ch: char) -> bool {`
`29`		`- ('a'..='z').contains(&ch)`
`30`		`- \|\| ('A'..='Z').contains(&ch)`
`31`		`- \|\| ('0'..='9').contains(&ch)`
	`29`	`+ ch.is_ascii_lowercase()`
	`30`	`+ \|\| ch.is_ascii_uppercase()`
	`31`	`+ \|\| ch.is_ascii_digit()`
`32`	`32`	`\|\| ch == '_'`
`33`	`33`	`\|\| ch == '-'`
`34`	`34`	`}`
Original file line number	Diff line number	Diff line change
`@@ -18,10 +18,10 @@ pub struct ClickHouseDialect {}`
`18`	`18`	`impl Dialect for ClickHouseDialect {`
`19`	`19`	`fn is_identifier_start(&self, ch: char) -> bool {`
`20`	`20`	`// See https://clickhouse.com/docs/en/sql-reference/syntax/#syntax-identifiers`
`21`		`- ('a'..='z').contains(&ch) \|\| ('A'..='Z').contains(&ch) \|\| ch == '_'`
	`21`	`+ ch.is_ascii_lowercase() \|\| ch.is_ascii_uppercase() \|\| ch == '_'`
`22`	`22`	`}`
`23`	`23`
`24`	`24`	`fn is_identifier_part(&self, ch: char) -> bool {`
`25`		`- self.is_identifier_start(ch) \|\| ('0'..='9').contains(&ch)`
	`25`	`+ self.is_identifier_start(ch) \|\| ch.is_ascii_digit()`
`26`	`26`	`}`
`27`	`27`	`}`
Original file line number	Diff line number	Diff line change
`@@ -126,13 +126,13 @@ fn custom_statement_parser() -> Result<(), ParserError> {`
`126`	`126`	`}`
`127`	`127`
`128`	`128`	`fn is_identifier_start(ch: char) -> bool {`
`129`		`- ('a'..='z').contains(&ch) \|\| ('A'..='Z').contains(&ch) \|\| ch == '_'`
	`129`	`+ ch.is_ascii_lowercase() \|\| ch.is_ascii_uppercase() \|\| ch == '_'`
`130`	`130`	`}`
`131`	`131`
`132`	`132`	`fn is_identifier_part(ch: char) -> bool {`
`133`		`- ('a'..='z').contains(&ch)`
`134`		`- \|\| ('A'..='Z').contains(&ch)`
`135`		`- \|\| ('0'..='9').contains(&ch)`
	`133`	`+ ch.is_ascii_lowercase()`
	`134`	`+ \|\| ch.is_ascii_uppercase()`
	`135`	`+ \|\| ch.is_ascii_digit()`
`136`	`136`	`\|\| ch == '$'`
`137`	`137`	`\|\| ch == '_'`
`138`	`138`	`}`