diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d33a7d8af..13bce0c0d 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> { }; let mut location = state.location(); - while let Some(token) = self.next_token(&mut state)? { + while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? { let span = location.span_to(state.location()); buf.push(TokenWithSpan { token, span }); @@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> { } /// Get the next token or return None - fn next_token(&self, chars: &mut State) -> Result, TokenizerError> { + fn next_token( + &self, + chars: &mut State, + prev_token: Option<&Token>, + ) -> Result, TokenizerError> { match chars.peek() { Some(&ch) => match ch { ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), @@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> { chars.next(); } + // If the dialect supports identifiers that start with a numeric prefix + // and we have now consumed a dot, check if the previous token was a Word. + // If so, what follows is definitely not part of a decimal number and + // we should yield the dot as a dedicated token so compound identifiers + // starting with digits can be parsed correctly. + if s == "." && self.dialect.supports_numeric_prefix() { + if let Some(Token::Word(_)) = prev_token { + return Ok(Some(Token::Period)); + } + } + + // Consume fractional digits. s += &peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // No number -> Token::Period + // No fraction -> Token::Period if s == "." { return Ok(Some(Token::Period)); } - let mut exponent_part = String::new(); // Parse exponent as number + let mut exponent_part = String::new(); if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { let mut char_clone = chars.peekable.clone(); exponent_part.push(char_clone.next().unwrap()); @@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> { } } - // mysql dialect supports identifiers that start with a numeric prefix, - // as long as they aren't an exponent number. - if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() { - let word = - peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); - - if !word.is_empty() { - s += word.as_str(); + // If the dialect supports identifiers that start with a numeric prefix, + // we need to check if the value is in fact an identifier and must thus + // be tokenized as a word. + if self.dialect.supports_numeric_prefix() { + if exponent_part.is_empty() { + // If it is not a number with an exponent, it may be + // an identifier starting with digits. + let word = + peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); + + if !word.is_empty() { + s += word.as_str(); + return Ok(Some(Token::make_word(s.as_str(), None))); + } + } else if prev_token == Some(&Token::Period) { + // If the previous token was a period, thus not belonging to a number, + // the value we have is part of an identifier. return Ok(Some(Token::make_word(s.as_str(), None))); } } @@ -3960,4 +3985,31 @@ mod tests { ], ); } + + #[test] + fn test_tokenize_identifiers_numeric_prefix() { + all_dialects_where(|dialect| dialect.supports_numeric_prefix()) + .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]); + + all_dialects_where(|dialect| dialect.supports_numeric_prefix()) + .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]); + + all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to( + "t.12e34", + vec![ + Token::make_word("t", None), + Token::Period, + Token::make_word("12e34", None), + ], + ); + + all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to( + "t.1two3", + vec![ + Token::make_word("t", None), + Token::Period, + Token::make_word("1two3", None), + ], + ); + } } diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs index 1d4fd6a0d..ceb642418 100644 --- a/tests/sqlparser_mysql.rs +++ b/tests/sqlparser_mysql.rs @@ -1926,6 +1926,128 @@ fn parse_select_with_numeric_prefix_column_name() { } } +#[test] +fn parse_qualified_identifiers_with_numeric_prefix() { + // Case 1: Qualified column name that starts with digits. + match mysql().verified_stmt("SELECT t.15to29 FROM my_table AS t") { + Statement::Query(q) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!(&[Ident::new("t"), Ident::new("15to29")], &parts[..]); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 2: Qualified column name that starts with digits and on its own represents a number. + match mysql().verified_stmt("SELECT t.15e29 FROM my_table AS t") { + Statement::Query(q) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!(&[Ident::new("t"), Ident::new("15e29")], &parts[..]); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 3: Unqualified, the same token is parsed as a number. + match mysql() + .parse_sql_statements("SELECT 15e29 FROM my_table") + .unwrap() + .pop() + { + Some(Statement::Query(q)) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::Value(ValueWithSpan { value, .. }))) => { + assert_eq!(&number("15e29"), value); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 4: Quoted simple identifier. + match mysql().verified_stmt("SELECT `15e29` FROM my_table") { + Statement::Query(q) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::Identifier(name))) => { + assert_eq!(&Ident::with_quote('`', "15e29"), name); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 5: Quoted compound identifier. + match mysql().verified_stmt("SELECT t.`15e29` FROM my_table AS t") { + Statement::Query(q) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!( + &[Ident::new("t"), Ident::with_quote('`', "15e29")], + &parts[..] + ); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 6: Multi-level compound identifiers. + match mysql().verified_stmt("SELECT 1db.1table.1column") { + Statement::Query(q) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!( + &[ + Ident::new("1db"), + Ident::new("1table"), + Ident::new("1column") + ], + &parts[..] + ); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 7: Multi-level compound quoted identifiers. + match mysql().verified_stmt("SELECT `1`.`2`.`3`") { + Statement::Query(q) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!( + &[ + Ident::with_quote('`', "1"), + Ident::with_quote('`', "2"), + Ident::with_quote('`', "3") + ], + &parts[..] + ); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } +} + // Don't run with bigdecimal as it fails like this on rust beta: // // 'parse_select_with_concatenation_of_exp_number_and_numeric_prefix_column'