Skip to content

Commit 3e33640

Browse files
committed
Ignore escaped LIKE wildcards in MySQL
MySQL has a special case for escaped LIKE wildcards (%, _) appearing in string literals: the escaping is ignored, whereas normally for any other (non-special) character, the backslash would be stripped. This is to allow them to be used in LIKE patterns without double-escaping as is needed in other escaping dialects (e.g. Snowflake), like so: MySQL matching a literal _ character: ```sql SELECT * FROM users WHERE name LIKE '%\_%'; ``` Snowflake doing the same thing: ```sql SELECT * FROM users WHERE name LIKE '%\\_%'; ``` Note that in MySQL, this escaping rule does not just affect LIKE patterns, but all string literals: ``` mysql> select '\_', hex('\\'), hex('_'), hex('\_'); +----+-----------+----------+-----------+ | \_ | hex('\\') | hex('_') | hex('\_') | +----+-----------+----------+-----------+ | \_ | 5C | 5F | 5C5F | +----+-----------+----------+-----------+ 1 row in set (0.00 sec) ``` This is implemented with a new flag onq dialect which gets passed into the tokenizer, because I don't know if any other dialects have similar behavior and wanted to make it easy to add them if they do. I can't currently test Snowflake, BigQuery, or Clickhouse on this point, so I'm just going off my best guess based on docs and examples. [MySQL docs](https://dev.mysql.com/doc/refman/8.4/en/string-literals.html)
1 parent 97f0be6 commit 3e33640

File tree

5 files changed

+101
-15
lines changed

5 files changed

+101
-15
lines changed

src/dialect/mod.rs

+27
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,33 @@ pub trait Dialect: Debug + Any {
201201
false
202202
}
203203

204+
/// Determine whether the dialect strips the backslash when escaping LIKE wildcards (%, _).
205+
///
206+
/// [MySQL] has a special case when escaping single quoted strings which leaves these unescaped
207+
/// so they can be used in LIKE patterns without double-escaping (as is necessary in other
208+
/// escaping dialects, such as [Snowflake]). Generally, special characters have escaping rules
209+
/// causing them to be replaced with a different byte sequences (e.g. `'\0'` becoming the zero
210+
/// byte), and the default if an escaped character does not have a specific escaping rule is to
211+
/// strip the backslash (e.g. there is no rule for `h`, so `'\h' = 'h'`). MySQL's special case
212+
/// for ignoring LIKE wildcard escapes is to *not* strip the backslash, so that `'\%' = '\\%'`.
213+
/// This applies to all string literals though, not just those used in LIKE patterns.
214+
///
215+
/// ```text
216+
/// mysql> select '\_', hex('\\'), hex('_'), hex('\_');
217+
/// +----+-----------+----------+-----------+
218+
/// | \_ | hex('\\') | hex('_') | hex('\_') |
219+
/// +----+-----------+----------+-----------+
220+
/// | \_ | 5C | 5F | 5C5F |
221+
/// +----+-----------+----------+-----------+
222+
/// 1 row in set (0.00 sec)
223+
/// ```
224+
///
225+
/// [MySQL]: https://dev.mysql.com/doc/refman/8.4/en/string-literals.html
226+
/// [Snowflake]: https://docs.snowflake.com/en/sql-reference/functions/like#usage-notes
227+
fn ignores_like_wildcard_escapes(&self) -> bool {
228+
false
229+
}
230+
204231
/// Determine if the dialect supports string literals with `U&` prefix.
205232
/// This is used to specify Unicode code points in string literals.
206233
/// For example, in PostgreSQL, the following is a valid string literal:

src/dialect/mysql.rs

+4
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ impl Dialect for MySqlDialect {
6262
true
6363
}
6464

65+
fn ignores_like_wildcard_escapes(&self) -> bool {
66+
true
67+
}
68+
6569
fn supports_numeric_prefix(&self) -> bool {
6670
true
6771
}

src/tokenizer.rs

+30-2
Original file line numberDiff line numberDiff line change
@@ -759,6 +759,9 @@ pub struct Tokenizer<'a> {
759759
/// If true (the default), the tokenizer will un-escape literal
760760
/// SQL strings See [`Tokenizer::with_unescape`] for more details.
761761
unescape: bool,
762+
/// If true, the tokenizer will not escape % and _, for use in in LIKE patterns. See
763+
/// [`Dialect::ignores_like_wildcard_escapes`] for more details.
764+
ignore_like_wildcard_escapes: bool,
762765
}
763766

764767
impl<'a> Tokenizer<'a> {
@@ -783,6 +786,7 @@ impl<'a> Tokenizer<'a> {
783786
dialect,
784787
query,
785788
unescape: true,
789+
ignore_like_wildcard_escapes: dialect.ignores_like_wildcard_escapes(),
786790
}
787791
}
788792

@@ -821,6 +825,13 @@ impl<'a> Tokenizer<'a> {
821825
self
822826
}
823827

828+
/// If true, the tokenizer will ignore escapes of LIKE wildcards. See
829+
/// [`Dialect::ignores_like_wildcard_escapes`] for more details.
830+
pub fn with_ignore_like_wildcard_escapes(mut self, ignore_like_wildcard_escapes: bool) -> Self {
831+
self.ignore_like_wildcard_escapes = ignore_like_wildcard_escapes;
832+
self
833+
}
834+
824835
/// Tokenize the statement and produce a vector of tokens
825836
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
826837
let twl = self.tokenize_with_location()?;
@@ -1847,8 +1858,12 @@ impl<'a> Tokenizer<'a> {
18471858
num_consecutive_quotes = 0;
18481859

18491860
if let Some(next) = chars.peek() {
1850-
if !self.unescape {
1851-
// In no-escape mode, the given query has to be saved completely including backslashes.
1861+
if !self.unescape
1862+
|| (self.ignore_like_wildcard_escapes && (*next == '%' || *next == '_'))
1863+
{
1864+
// In no-escape mode, the given query has to be saved completely
1865+
// including backslashes. Similarly, with ignore_like_wildcard_escapes,
1866+
// the backslash is not stripped.
18521867
s.push(ch);
18531868
s.push(*next);
18541869
chars.next(); // consume next
@@ -3421,6 +3436,9 @@ mod tests {
34213436
(r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
34223437
(r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
34233438
(r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
3439+
(r#"'\q'"#, r#"\q"#, r#"q"#),
3440+
(r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
3441+
(r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
34243442
] {
34253443
let tokens = Tokenizer::new(&dialect, sql)
34263444
.with_unescape(false)
@@ -3454,6 +3472,16 @@ mod tests {
34543472

34553473
compare(expected, tokens);
34563474
}
3475+
3476+
// MySQL special case for LIKE escapes
3477+
for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
3478+
let dialect = MySqlDialect {};
3479+
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3480+
3481+
let expected = vec![Token::SingleQuotedString(expected.to_string())];
3482+
3483+
compare(expected, tokens);
3484+
}
34573485
}
34583486

34593487
#[test]

tests/sqlparser_common.rs

+29-13
Original file line numberDiff line numberDiff line change
@@ -10102,15 +10102,8 @@ fn parse_with_recursion_limit() {
1010210102

1010310103
#[test]
1010410104
fn parse_escaped_string_with_unescape() {
10105-
fn assert_mysql_query_value(sql: &str, quoted: &str) {
10106-
let stmt = TestedDialects::new(vec![
10107-
Box::new(MySqlDialect {}),
10108-
Box::new(BigQueryDialect {}),
10109-
Box::new(SnowflakeDialect {}),
10110-
])
10111-
.one_statement_parses_to(sql, "");
10112-
10113-
match stmt {
10105+
fn assert_mysql_query_value(dialects: &TestedDialects, sql: &str, quoted: &str) {
10106+
match dialects.one_statement_parses_to(sql, "") {
1011410107
Statement::Query(query) => match *query.body {
1011510108
SetExpr::Select(value) => {
1011610109
let expr = expr_from_projection(only(&value.projection));
@@ -10124,17 +10117,40 @@ fn parse_escaped_string_with_unescape() {
1012410117
_ => unreachable!(),
1012510118
};
1012610119
}
10120+
10121+
let escaping_dialects =
10122+
&all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape());
10123+
let no_wildcard_exception = &all_dialects_where(|dialect| {
10124+
dialect.supports_string_literal_backslash_escape()
10125+
&& !dialect.ignores_like_wildcard_escapes()
10126+
});
10127+
let with_wildcard_exception = &all_dialects_where(|dialect| {
10128+
dialect.supports_string_literal_backslash_escape()
10129+
&& dialect.ignores_like_wildcard_escapes()
10130+
});
10131+
1012710132
let sql = r"SELECT 'I\'m fine'";
10128-
assert_mysql_query_value(sql, "I'm fine");
10133+
assert_mysql_query_value(escaping_dialects, sql, "I'm fine");
1012910134

1013010135
let sql = r#"SELECT 'I''m fine'"#;
10131-
assert_mysql_query_value(sql, "I'm fine");
10136+
assert_mysql_query_value(escaping_dialects, sql, "I'm fine");
1013210137

1013310138
let sql = r#"SELECT 'I\"m fine'"#;
10134-
assert_mysql_query_value(sql, "I\"m fine");
10139+
assert_mysql_query_value(escaping_dialects, sql, "I\"m fine");
1013510140

1013610141
let sql = r"SELECT 'Testing: \0 \\ \% \_ \b \n \r \t \Z \a \h \ '";
10137-
assert_mysql_query_value(sql, "Testing: \0 \\ % _ \u{8} \n \r \t \u{1a} \u{7} h ");
10142+
assert_mysql_query_value(
10143+
no_wildcard_exception,
10144+
sql,
10145+
"Testing: \0 \\ % _ \u{8} \n \r \t \u{1a} \u{7} h ",
10146+
);
10147+
10148+
// check MySQL doesn't remove backslash from escaped LIKE wildcards
10149+
assert_mysql_query_value(
10150+
with_wildcard_exception,
10151+
sql,
10152+
"Testing: \0 \\ \\% \\_ \u{8} \n \r \t \u{1a} \u{7} h ",
10153+
);
1013810154
}
1013910155

1014010156
#[test]

tests/sqlparser_mysql.rs

+11
Original file line numberDiff line numberDiff line change
@@ -2530,6 +2530,17 @@ fn parse_rlike_and_regexp() {
25302530
}
25312531
}
25322532

2533+
#[test]
2534+
fn parse_like_with_escape() {
2535+
// verify backslash is not stripped for escaped wildcards
2536+
mysql().verified_only_select(r#"SELECT 'a\%c' LIKE 'a\%c'"#);
2537+
mysql().verified_only_select(r#"SELECT 'a\_c' LIKE 'a\_c'"#);
2538+
mysql().verified_only_select(r#"SELECT '%\_\%' LIKE '%\_\%'"#);
2539+
mysql().verified_only_select(r#"SELECT '\_\%' LIKE CONCAT('\_', '\%')"#);
2540+
mysql().verified_only_select(r#"SELECT 'a%c' LIKE 'a$%c' ESCAPE '$'"#);
2541+
mysql().verified_only_select(r#"SELECT 'a_c' LIKE 'a#_c' ESCAPE '#'"#);
2542+
}
2543+
25332544
#[test]
25342545
fn parse_kill() {
25352546
let stmt = mysql_and_generic().verified_stmt("KILL CONNECTION 5");

0 commit comments

Comments
 (0)