
Commit 34bb424

groobyming authored and alamb committed
Extended dialect trait to support numeric prefixed identifiers (apache#1188)
Co-authored-by: Andrew Lamb <[email protected]>
1 parent 2bf39af commit 34bb424
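A quick usage sketch of the behavior this commit enables (not part of the diff below; it assumes the crate's public Parser::parse_sql entry point and the MySQL dialect, which opts in via supports_numeric_prefix): a table whose name starts with digits now parses as a single identifier.

use sqlparser::dialect::MySqlDialect;
use sqlparser::parser::Parser;

fn main() {
    // `59901_user_login` starts with digits; with `supports_numeric_prefix()`
    // returning true, the tokenizer emits it as one Word token, so the table
    // reference parses instead of failing after the leading number.
    let sql = "SELECT * FROM 59901_user_login";
    let statements = Parser::parse_sql(&MySqlDialect {}, sql).unwrap();
    println!("{statements:?}");
}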

File tree

4 files changed, +68 -5 lines changed


src/dialect/hive.rs

+4
@@ -38,4 +38,8 @@ impl Dialect for HiveDialect {
     fn supports_filter_during_aggregation(&self) -> bool {
         true
     }
+
+    fn supports_numeric_prefix(&self) -> bool {
+        true
+    }
 }

src/dialect/mod.rs

+5
@@ -185,6 +185,11 @@ pub trait Dialect: Debug + Any {
     fn supports_named_fn_args_with_eq_operator(&self) -> bool {
         false
     }
+    /// Returns true if the dialect supports identifiers starting with a numeric
+    /// prefix such as tables named: `59901_user_login`
+    fn supports_numeric_prefix(&self) -> bool {
+        false
+    }
     /// Returns true if the dialects supports specifying null treatment
     /// as part of a window function's parameter list. As opposed
     /// to after the parameter list.

src/dialect/mysql.rs

+4
@@ -53,6 +53,10 @@ impl Dialect for MySqlDialect {
         true
     }
 
+    fn supports_numeric_prefix(&self) -> bool {
+        true
+    }
+
     fn parse_infix(
         &self,
         parser: &mut crate::parser::Parser,

src/tokenizer.rs

+55 -5

@@ -35,11 +35,10 @@ use serde::{Deserialize, Serialize};
 use sqlparser_derive::{Visit, VisitMut};
 
 use crate::ast::DollarQuotedString;
+use crate::dialect::Dialect;
 use crate::dialect::{
-    BigQueryDialect, DuckDbDialect, GenericDialect, HiveDialect, PostgreSqlDialect,
-    SnowflakeDialect,
+    BigQueryDialect, DuckDbDialect, GenericDialect, PostgreSqlDialect, SnowflakeDialect,
 };
-use crate::dialect::{Dialect, MySqlDialect};
 use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
 
 /// SQL Token enumeration
@@ -821,7 +820,7 @@ impl<'a> Tokenizer<'a> {
 
                 // mysql dialect supports identifiers that start with a numeric prefix,
                 // as long as they aren't an exponent number.
-                if dialect_of!(self is MySqlDialect | HiveDialect) && exponent_part.is_empty() {
+                if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
                     let word =
                         peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
 
@@ -1544,7 +1543,10 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::dialect::{BigQueryDialect, ClickHouseDialect, MsSqlDialect};
+    use crate::dialect::{
+        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect,
+    };
+    use core::fmt::Debug;
 
     #[test]
     fn tokenizer_error_impl() {
@@ -2414,6 +2416,54 @@ mod tests {
         check_unescape(r"Hello\xCADRust", None);
     }
 
+    #[test]
+    fn tokenize_numeric_prefix_trait() {
+        #[derive(Debug)]
+        struct NumericPrefixDialect;
+
+        impl Dialect for NumericPrefixDialect {
+            fn is_identifier_start(&self, ch: char) -> bool {
+                ch.is_ascii_lowercase()
+                    || ch.is_ascii_uppercase()
+                    || ch.is_ascii_digit()
+                    || ch == '$'
+            }
+
+            fn is_identifier_part(&self, ch: char) -> bool {
+                ch.is_ascii_lowercase()
+                    || ch.is_ascii_uppercase()
+                    || ch.is_ascii_digit()
+                    || ch == '_'
+                    || ch == '$'
+                    || ch == '{'
+                    || ch == '}'
+            }
+
+            fn supports_numeric_prefix(&self) -> bool {
+                true
+            }
+        }
+
+        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
+        tokenize_numeric_prefix_inner(&HiveDialect {});
+        tokenize_numeric_prefix_inner(&MySqlDialect {});
+    }
+
+    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
+        let sql = r#"SELECT * FROM 1"#;
+        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
+        let expected = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Mul,
+            Token::Whitespace(Whitespace::Space),
+            Token::make_keyword("FROM"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number(String::from("1"), false),
+        ];
+        compare(expected, tokens);
+    }
+
     #[test]
     fn tokenize_quoted_string_escape() {
         for (sql, expected, expected_unescaped) in [
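A short usage sketch of the tokenizer behavior described by the comment in the hunk above (not part of this commit; it uses the public Tokenizer API exercised by the tests): a digit followed by identifier characters collapses into a single Word token, while exponent literals are left alone because the numeric-prefix path is skipped when the exponent part is non-empty.

use sqlparser::dialect::MySqlDialect;
use sqlparser::tokenizer::Tokenizer;

fn main() {
    let dialect = MySqlDialect {};

    // Exponent numbers keep tokenizing as numbers.
    let number = Tokenizer::new(&dialect, "1e10").tokenize().unwrap();
    println!("{number:?}"); // e.g. [Number("1e10", false)]

    // A numeric prefix followed by identifier characters becomes one Word token.
    let word = Tokenizer::new(&dialect, "1a").tokenize().unwrap();
    println!("{word:?}"); // e.g. [Word(Word { value: "1a", .. })]
}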
