Skip to content

Commit bbc80d7

Browse files
romanb (Roman Borschel) and Roman Borschel authored
Fix tokenization of qualified identifiers with numeric prefix. (#1803)
Co-authored-by: Roman Borschel <[email protected]>
1 parent d090ad4 commit bbc80d7

File tree

2 files changed

+186
-12
lines changed

2 files changed

+186
-12
lines changed

src/tokenizer.rs

+64 -12
Original file line number | Diff line number | Diff line change
@@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> {
895895
};
896896

897897
let mut location = state.location();
898-
while let Some(token) = self.next_token(&mut state)? {
898+
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
899899
let span = location.span_to(state.location());
900900

901901
buf.push(TokenWithSpan { token, span });
@@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> {
932932
}
933933

934934
/// Get the next token or return None
935-
fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
935+
fn next_token(
936+
&self,
937+
chars: &mut State,
938+
prev_token: Option<&Token>,
939+
) -> Result<Option<Token>, TokenizerError> {
936940
match chars.peek() {
937941
Some(&ch) => match ch {
938942
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> {
12111215
chars.next();
12121216
}
12131217

1218+
// If the dialect supports identifiers that start with a numeric prefix
1219+
// and we have now consumed a dot, check if the previous token was a Word.
1220+
// If so, what follows is definitely not part of a decimal number and
1221+
// we should yield the dot as a dedicated token so compound identifiers
1222+
// starting with digits can be parsed correctly.
1223+
if s == "." && self.dialect.supports_numeric_prefix() {
1224+
if let Some(Token::Word(_)) = prev_token {
1225+
return Ok(Some(Token::Period));
1226+
}
1227+
}
1228+
1229+
// Consume fractional digits.
12141230
s += &peeking_next_take_while(chars, |ch, next_ch| {
12151231
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
12161232
});
12171233

1218-
// No number -> Token::Period
1234+
// No fraction -> Token::Period
12191235
if s == "." {
12201236
return Ok(Some(Token::Period));
12211237
}
12221238

1223-
let mut exponent_part = String::new();
12241239
// Parse exponent as number
1240+
let mut exponent_part = String::new();
12251241
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
12261242
let mut char_clone = chars.peekable.clone();
12271243
exponent_part.push(char_clone.next().unwrap());
@@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> {
12501266
}
12511267
}
12521268

1253-
// mysql dialect supports identifiers that start with a numeric prefix,
1254-
// as long as they aren't an exponent number.
1255-
if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
1256-
let word =
1257-
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1258-
1259-
if !word.is_empty() {
1260-
s += word.as_str();
1269+
// If the dialect supports identifiers that start with a numeric prefix,
1270+
// we need to check if the value is in fact an identifier and must thus
1271+
// be tokenized as a word.
1272+
if self.dialect.supports_numeric_prefix() {
1273+
if exponent_part.is_empty() {
1274+
// If it is not a number with an exponent, it may be
1275+
// an identifier starting with digits.
1276+
let word =
1277+
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1278+
1279+
if !word.is_empty() {
1280+
s += word.as_str();
1281+
return Ok(Some(Token::make_word(s.as_str(), None)));
1282+
}
1283+
} else if prev_token == Some(&Token::Period) {
1284+
// If the previous token was a period, thus not belonging to a number,
1285+
// the value we have is part of an identifier.
12611286
return Ok(Some(Token::make_word(s.as_str(), None)));
12621287
}
12631288
}
@@ -3960,4 +3985,31 @@ mod tests {
39603985
],
39613986
);
39623987
}
3988+
3989+
#[test]
3990+
fn test_tokenize_identifiers_numeric_prefix() {
3991+
all_dialects_where(|dialect| dialect.supports_numeric_prefix())
3992+
.tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
3993+
3994+
all_dialects_where(|dialect| dialect.supports_numeric_prefix())
3995+
.tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
3996+
3997+
all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
3998+
"t.12e34",
3999+
vec![
4000+
Token::make_word("t", None),
4001+
Token::Period,
4002+
Token::make_word("12e34", None),
4003+
],
4004+
);
4005+
4006+
all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
4007+
"t.1two3",
4008+
vec![
4009+
Token::make_word("t", None),
4010+
Token::Period,
4011+
Token::make_word("1two3", None),
4012+
],
4013+
);
4014+
}
39634015
}

tests/sqlparser_mysql.rs

+122
Original file line number | Diff line number | Diff line change
@@ -1926,6 +1926,128 @@ fn parse_select_with_numeric_prefix_column_name() {
19261926
}
19271927
}
19281928

1929+
#[test]
1930+
fn parse_qualified_identifiers_with_numeric_prefix() {
1931+
// Case 1: Qualified column name that starts with digits.
1932+
match mysql().verified_stmt("SELECT t.15to29 FROM my_table AS t") {
1933+
Statement::Query(q) => match *q.body {
1934+
SetExpr::Select(s) => match s.projection.last() {
1935+
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
1936+
assert_eq!(&[Ident::new("t"), Ident::new("15to29")], &parts[..]);
1937+
}
1938+
proj => panic!("Unexpected projection: {:?}", proj),
1939+
},
1940+
body => panic!("Unexpected statement body: {:?}", body),
1941+
},
1942+
stmt => panic!("Unexpected statement: {:?}", stmt),
1943+
}
1944+
1945+
// Case 2: Qualified column name that starts with digits and on its own represents a number.
1946+
match mysql().verified_stmt("SELECT t.15e29 FROM my_table AS t") {
1947+
Statement::Query(q) => match *q.body {
1948+
SetExpr::Select(s) => match s.projection.last() {
1949+
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
1950+
assert_eq!(&[Ident::new("t"), Ident::new("15e29")], &parts[..]);
1951+
}
1952+
proj => panic!("Unexpected projection: {:?}", proj),
1953+
},
1954+
body => panic!("Unexpected statement body: {:?}", body),
1955+
},
1956+
stmt => panic!("Unexpected statement: {:?}", stmt),
1957+
}
1958+
1959+
// Case 3: Unqualified, the same token is parsed as a number.
1960+
match mysql()
1961+
.parse_sql_statements("SELECT 15e29 FROM my_table")
1962+
.unwrap()
1963+
.pop()
1964+
{
1965+
Some(Statement::Query(q)) => match *q.body {
1966+
SetExpr::Select(s) => match s.projection.last() {
1967+
Some(SelectItem::UnnamedExpr(Expr::Value(ValueWithSpan { value, .. }))) => {
1968+
assert_eq!(&number("15e29"), value);
1969+
}
1970+
proj => panic!("Unexpected projection: {:?}", proj),
1971+
},
1972+
body => panic!("Unexpected statement body: {:?}", body),
1973+
},
1974+
stmt => panic!("Unexpected statement: {:?}", stmt),
1975+
}
1976+
1977+
// Case 4: Quoted simple identifier.
1978+
match mysql().verified_stmt("SELECT `15e29` FROM my_table") {
1979+
Statement::Query(q) => match *q.body {
1980+
SetExpr::Select(s) => match s.projection.last() {
1981+
Some(SelectItem::UnnamedExpr(Expr::Identifier(name))) => {
1982+
assert_eq!(&Ident::with_quote('`', "15e29"), name);
1983+
}
1984+
proj => panic!("Unexpected projection: {:?}", proj),
1985+
},
1986+
body => panic!("Unexpected statement body: {:?}", body),
1987+
},
1988+
stmt => panic!("Unexpected statement: {:?}", stmt),
1989+
}
1990+
1991+
// Case 5: Quoted compound identifier.
1992+
match mysql().verified_stmt("SELECT t.`15e29` FROM my_table AS t") {
1993+
Statement::Query(q) => match *q.body {
1994+
SetExpr::Select(s) => match s.projection.last() {
1995+
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
1996+
assert_eq!(
1997+
&[Ident::new("t"), Ident::with_quote('`', "15e29")],
1998+
&parts[..]
1999+
);
2000+
}
2001+
proj => panic!("Unexpected projection: {:?}", proj),
2002+
},
2003+
body => panic!("Unexpected statement body: {:?}", body),
2004+
},
2005+
stmt => panic!("Unexpected statement: {:?}", stmt),
2006+
}
2007+
2008+
// Case 6: Multi-level compound identifiers.
2009+
match mysql().verified_stmt("SELECT 1db.1table.1column") {
2010+
Statement::Query(q) => match *q.body {
2011+
SetExpr::Select(s) => match s.projection.last() {
2012+
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
2013+
assert_eq!(
2014+
&[
2015+
Ident::new("1db"),
2016+
Ident::new("1table"),
2017+
Ident::new("1column")
2018+
],
2019+
&parts[..]
2020+
);
2021+
}
2022+
proj => panic!("Unexpected projection: {:?}", proj),
2023+
},
2024+
body => panic!("Unexpected statement body: {:?}", body),
2025+
},
2026+
stmt => panic!("Unexpected statement: {:?}", stmt),
2027+
}
2028+
2029+
// Case 7: Multi-level compound quoted identifiers.
2030+
match mysql().verified_stmt("SELECT `1`.`2`.`3`") {
2031+
Statement::Query(q) => match *q.body {
2032+
SetExpr::Select(s) => match s.projection.last() {
2033+
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
2034+
assert_eq!(
2035+
&[
2036+
Ident::with_quote('`', "1"),
2037+
Ident::with_quote('`', "2"),
2038+
Ident::with_quote('`', "3")
2039+
],
2040+
&parts[..]
2041+
);
2042+
}
2043+
proj => panic!("Unexpected projection: {:?}", proj),
2044+
},
2045+
body => panic!("Unexpected statement body: {:?}", body),
2046+
},
2047+
stmt => panic!("Unexpected statement: {:?}", stmt),
2048+
}
2049+
}
2050+
19292051
// Don't run with bigdecimal as it fails like this on rust beta:
19302052
//
19312053
// 'parse_select_with_concatenation_of_exp_number_and_numeric_prefix_column'

0 commit comments

Comments
 (0)