From 49d5b8551664f44d67cd613acf7a8dbf525b01ae Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Fri, 26 Aug 2022 09:27:41 -0500 Subject: [PATCH 01/10] Refactor so we can unit test `inject_parameters` Signed-off-by: Jesse Whitehouse --- src/databricks/sql/client.py | 6 ++++-- src/databricks/sql/utils.py | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/databricks/sql/client.py b/src/databricks/sql/client.py index e3190d45..b03ff6c2 100644 --- a/src/databricks/sql/client.py +++ b/src/databricks/sql/client.py @@ -7,7 +7,7 @@ from databricks.sql import * from databricks.sql.exc import OperationalError from databricks.sql.thrift_backend import ThriftBackend -from databricks.sql.utils import ExecuteResponse, ParamEscaper +from databricks.sql.utils import ExecuteResponse, ParamEscaper, inject_parameters from databricks.sql.types import Row from databricks.sql.auth.auth import get_python_sql_connector_auth_provider from databricks.sql.experimental.oauth_persistence import OAuthPersistence @@ -309,7 +309,9 @@ def execute( :returns self """ if parameters is not None: - operation = operation % self.escaper.escape_args(parameters) + operation = inject_parameters( + operation, self.escaper.escape_args(parameters) + ) self._check_not_closed() self._close_and_clear_active_result_set() diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py index 2961a1f5..d8a7de91 100644 --- a/src/databricks/sql/utils.py +++ b/src/databricks/sql/utils.py @@ -2,7 +2,7 @@ from collections.abc import Iterable import datetime from enum import Enum - +from typing import Dict import pyarrow @@ -172,3 +172,7 @@ def escape_item(self, item): return self.escape_datetime(item, self._DATE_FORMAT) else: raise exc.ProgrammingError("Unsupported object {}".format(item)) + + +def inject_parameters(operation: str, parameters: Dict[str, str]): + return operation % parameters From bce159835c3ba67498aaa7405b8f44da70de05c2 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Fri, 26 Aug 2022 09:27:51 -0500 Subject: [PATCH 02/10] Add unit tests for inject_parameters Signed-off-by: Jesse Whitehouse --- tests/unit/test_param_escaper.py | 108 +++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 tests/unit/test_param_escaper.py diff --git a/tests/unit/test_param_escaper.py b/tests/unit/test_param_escaper.py new file mode 100644 index 00000000..8066f202 --- /dev/null +++ b/tests/unit/test_param_escaper.py @@ -0,0 +1,108 @@ +from datetime import date, datetime +import unittest, pytest + +from databricks.sql.utils import ParamEscaper, inject_parameters + +pe = ParamEscaper() + +class TestIndividualFormatters(object): + + # Test individual type escapers + def test_escape_number_integer(self): + """This behaviour falls back to Python's default string formatting of numbers + """ + assert pe.escape_number(100) == 100 + + def test_escape_number_float(self): + """This behaviour falls back to Python's default string formatting of numbers + """ + assert pe.escape_number(100.1234) == 100.1234 + + def test_escape_string_normal(self): + """ + """ + + assert pe.escape_string("golly bob howdy") == "'golly bob howdy'" + + def test_escape_string_that_includes_quotes(self): + # Databricks queries support just one special character: a single quote mark + # These are escaped by doubling: + # e.g. INPUT: his name was 'robert palmer' + # e.g. OUTPUT: 'his name was ''robert palmer''' + + assert pe.escape_string("his name was 'robert palmer'") == "'his name was ''robert palmer'''" + + def test_escape_date_time(self): + INPUT = datetime(1991,8,3,21,55) + OUTPUT = "1991-08-03 21:55:00" + assert pe.escape_datetime(INPUT, OUTPUT) + + def test_escape_date(self): + INPUT = date(1991,8,3) + OUTPUT = "1991-08-03" + assert pe.escape_datetime(INPUT, OUTPUT) + + def test_escape_sequence_integer(self): + assert pe.escape_sequence([1,2,3,4]) == "(1,2,3,4)" + + def test_escape_sequence_float(self): + assert pe.escape_sequence([1.1,2.2,3.3,4.4]) == "(1.1,2.2,3.3,4.4)" + + def test_escape_sequence_string(self): + assert pe.escape_sequence( + ["his", "name", "was", "robert", "palmer"]) == \ + "('his','name','was','robert','palmer')" + + def test_escape_sequence_sequence_of_strings(self): + # This is not valid SQL. + INPUT = [["his", "name"], ["was", "robert"], ["palmer"]] + OUTPUT = "(('his','name'),('was','robert'),('palmer'))" + + assert pe.escape_sequence(INPUT) == OUTPUT + + +class TestFullQueryEscaping(object): + + def test_simple(self): + + INPUT = """ + SELECT + field1, + field2, + field3 + FROM + table + WHERE + field1 = %(param1)s + """ + + OUTPUT = """ + SELECT + field1, + field2, + field3 + FROM + table + WHERE + field1 = ';DROP ALL TABLES' + """ + + args = {"param1": ";DROP ALL TABLES"} + + assert inject_parameters(INPUT, pe.escape_args(args)) == OUTPUT + + @unittest.skipUnless(False, "Thrift server supports native parameter binding.") + def test_only_bind_in_where_clause(self): + + INPUT = """ + SELECT + %(field)s, + field2, + field3 + FROM table + """ + + args = {"field": "Some Value"} + + with pytest.raises(Exception): + inject_parameters(INPUT, pe.escape_args(args)) From dd25478c4e94fddb33884f6d206fd62427e5076c Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 10 Oct 2022 14:55:57 -0500 Subject: [PATCH 03/10] Remove inaccurate description. Per #51, spark sql does not support escaping a single quote with a second single quote. Signed-off-by: Jesse Whitehouse --- tests/unit/test_param_escaper.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_param_escaper.py b/tests/unit/test_param_escaper.py index 8066f202..ec549c57 100644 --- a/tests/unit/test_param_escaper.py +++ b/tests/unit/test_param_escaper.py @@ -24,14 +24,16 @@ def test_escape_string_normal(self): assert pe.escape_string("golly bob howdy") == "'golly bob howdy'" - def test_escape_string_that_includes_quotes(self): - # Databricks queries support just one special character: a single quote mark - # These are escaped by doubling: + def test_escape_string_that_includes_single_quotes(self): # e.g. INPUT: his name was 'robert palmer' # e.g. OUTPUT: 'his name was ''robert palmer''' assert pe.escape_string("his name was 'robert palmer'") == "'his name was ''robert palmer'''" + def test_escape_string_that_includes_special_characters(self): + + assert pe.escape_string("his name was 'robert palmer'") == "'his name was ''robert palmer'''" + def test_escape_date_time(self): INPUT = datetime(1991,8,3,21,55) OUTPUT = "1991-08-03 21:55:00" From 29bab86fd0573bd6934372f3cb4b0caf94dab684 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 10 Oct 2022 15:16:37 -0500 Subject: [PATCH 04/10] Fixes #51 and adds unit tests. Signed-off-by: Jesse Whitehouse --- src/databricks/sql/utils.py | 2 +- tests/unit/test_param_escaper.py | 38 +++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py index d8a7de91..f28584a2 100644 --- a/src/databricks/sql/utils.py +++ b/src/databricks/sql/utils.py @@ -146,7 +146,7 @@ def escape_string(self, item): # This is good enough when backslashes are literal, newlines are just followed, and the way # to escape a single quote is to put two single quotes. # (i.e. only special character is single quote) - return "'{}'".format(item.replace("'", "''")) + return "'{}'".format(item.replace("'", "\'")) def escape_sequence(self, item): l = map(str, map(self.escape_item, item)) diff --git a/tests/unit/test_param_escaper.py b/tests/unit/test_param_escaper.py index ec549c57..1465142b 100644 --- a/tests/unit/test_param_escaper.py +++ b/tests/unit/test_param_escaper.py @@ -24,15 +24,41 @@ def test_escape_string_normal(self): assert pe.escape_string("golly bob howdy") == "'golly bob howdy'" - def test_escape_string_that_includes_single_quotes(self): - # e.g. INPUT: his name was 'robert palmer' - # e.g. OUTPUT: 'his name was ''robert palmer''' + def test_escape_string_that_includes_special_characters(self): + """Tests for how special characters are treated. - assert pe.escape_string("his name was 'robert palmer'") == "'his name was ''robert palmer'''" + When passed a string, the `escape_string` method wraps it in single quotes + and escapes any special characters with a back stroke (\) - def test_escape_string_that_includes_special_characters(self): + Example: + + IN : his name was 'robert palmer' + OUT: 'his name was \'robert palmer\'' + """ + + # Testing for the presence of these characters: '"/\πŸ˜‚ + + assert pe.escape_string("his name was 'robert palmer'") == "'his name was \'robert palmer\''" + + # These two tests represent the same user input in the several ways it can be written in Python + # Each argument to `escape_string` evaluates to the same bytes. But Python lets us write it differently. + assert pe.escape_string("his name was \"robert palmer\"") == "'his name was \"robert palmer\"'" + assert pe.escape_string('his name was "robert palmer"') == "'his name was \"robert palmer\"'" + assert pe.escape_string('his name was {}'.format('"robert palmer"')) == "'his name was \"robert palmer\"'" + + assert pe.escape_string("his name was robert / palmer") == "'his name was robert / palmer'" + + # If you need to include a single backslash, use an r-string to prevent Python from raising a + # DeprecationWarning for an invalid escape sequence + assert pe.escape_string(r"his name was robert \/ palmer") == "'his name was robert \\/ palmer'" + assert pe.escape_string("his name was robert \\ palmer") == "'his name was robert \\ palmer'" + assert pe.escape_string("his name was robert \\\\ palmer") == "'his name was robert \\\\ palmer'" + + assert pe.escape_string("his name was robert palmer πŸ˜‚") == "'his name was robert palmer πŸ˜‚'" + + # Adding the test from PR #56 to prove escape behaviour - assert pe.escape_string("his name was 'robert palmer'") == "'his name was ''robert palmer'''" + assert pe.escape_string("you're") == "'you\'re'" def test_escape_date_time(self): INPUT = datetime(1991,8,3,21,55) From aafbdfa3b279343e2d894faa5c6b4b1f0f3f27aa Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 10 Oct 2022 15:32:30 -0500 Subject: [PATCH 05/10] Working change which passes all written tests and most from #51 Signed-off-by: Jesse Whitehouse --- src/databricks/sql/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py index f28584a2..8b7caf48 100644 --- a/src/databricks/sql/utils.py +++ b/src/databricks/sql/utils.py @@ -146,7 +146,7 @@ def escape_string(self, item): # This is good enough when backslashes are literal, newlines are just followed, and the way # to escape a single quote is to put two single quotes. # (i.e. only special character is single quote) - return "'{}'".format(item.replace("'", "\'")) + return "'{}'".format(item.replace("'", "\\'")) def escape_sequence(self, item): l = map(str, map(self.escape_item, item)) From 6750633cfb6747a005629f134ea0a742b1e1b681 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 10 Oct 2022 16:23:38 -0500 Subject: [PATCH 06/10] Implement suggest fix from #51 which now passes the integration test provided Signed-off-by: Jesse Whitehouse --- src/databricks/sql/utils.py | 2 +- tests/unit/test_param_escaper.py | 26 +++++++++++++++++++------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py index 8b7caf48..92bf9114 100644 --- a/src/databricks/sql/utils.py +++ b/src/databricks/sql/utils.py @@ -146,7 +146,7 @@ def escape_string(self, item): # This is good enough when backslashes are literal, newlines are just followed, and the way # to escape a single quote is to put two single quotes. # (i.e. only special character is single quote) - return "'{}'".format(item.replace("'", "\\'")) + return "'{}'".format(item.replace('\\','\\\\' ).replace("'", "\\'")) def escape_sequence(self, item): l = map(str, map(self.escape_item, item)) diff --git a/tests/unit/test_param_escaper.py b/tests/unit/test_param_escaper.py index 1465142b..01584d19 100644 --- a/tests/unit/test_param_escaper.py +++ b/tests/unit/test_param_escaper.py @@ -38,7 +38,7 @@ def test_escape_string_that_includes_special_characters(self): # Testing for the presence of these characters: '"/\πŸ˜‚ - assert pe.escape_string("his name was 'robert palmer'") == "'his name was \'robert palmer\''" + assert pe.escape_string("his name was 'robert palmer'") == r"'his name was \'robert palmer\''" # These two tests represent the same user input in the several ways it can be written in Python # Each argument to `escape_string` evaluates to the same bytes. But Python lets us write it differently. @@ -46,19 +46,31 @@ def test_escape_string_that_includes_special_characters(self): assert pe.escape_string('his name was "robert palmer"') == "'his name was \"robert palmer\"'" assert pe.escape_string('his name was {}'.format('"robert palmer"')) == "'his name was \"robert palmer\"'" - assert pe.escape_string("his name was robert / palmer") == "'his name was robert / palmer'" + assert pe.escape_string("his name was robert / palmer") == r"'his name was robert / palmer'" # If you need to include a single backslash, use an r-string to prevent Python from raising a # DeprecationWarning for an invalid escape sequence - assert pe.escape_string(r"his name was robert \/ palmer") == "'his name was robert \\/ palmer'" - assert pe.escape_string("his name was robert \\ palmer") == "'his name was robert \\ palmer'" - assert pe.escape_string("his name was robert \\\\ palmer") == "'his name was robert \\\\ palmer'" + assert pe.escape_string("his name was robert \\/ palmer") == r"'his name was robert \\/ palmer'" + assert pe.escape_string("his name was robert \\ palmer") == r"'his name was robert \\ palmer'" + assert pe.escape_string("his name was robert \\\\ palmer") == r"'his name was robert \\\\ palmer'" - assert pe.escape_string("his name was robert palmer πŸ˜‚") == "'his name was robert palmer πŸ˜‚'" + assert pe.escape_string("his name was robert palmer πŸ˜‚") == r"'his name was robert palmer πŸ˜‚'" # Adding the test from PR #56 to prove escape behaviour - assert pe.escape_string("you're") == "'you\'re'" + assert pe.escape_string("you're") == r"'you\'re'" + + # Adding this test from #51 to prove escape behaviour when the target string involves repeated SQL escape chars + assert pe.escape_string("cat\\'s meow") == r"'cat\\\'s meow'" + + # Tests from the docs: https://docs.databricks.com/sql/language-manual/data-types/string-type.html + + assert pe.escape_string('Spark') == "'Spark'" + assert pe.escape_string("O'Connell") == r"'O\'Connell'" + assert pe.escape_string("Some\\nText") == r"'Some\\nText'" + assert pe.escape_string("Some\\\\nText") == r"'Some\\\\nText'" + assert pe.escape_string("μ„œμšΈμ‹œ") == "'μ„œμšΈμ‹œ'" + assert pe.escape_string("\\\\") == r"'\\\\'" def test_escape_date_time(self): INPUT = datetime(1991,8,3,21,55) From 82556df458d1383463a5608105de525b7b6ad223 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 10 Oct 2022 16:25:26 -0500 Subject: [PATCH 07/10] Incorporate e2e test suggested by #56 Signed-off-by: Jesse Whitehouse --- tests/e2e/driver_tests.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/e2e/driver_tests.py b/tests/e2e/driver_tests.py index 9e400770..43973724 100644 --- a/tests/e2e/driver_tests.py +++ b/tests/e2e/driver_tests.py @@ -288,6 +288,20 @@ def test_get_columns(self): for table in table_names: cursor.execute('DROP TABLE IF EXISTS {}'.format(table)) + def test_escape_single_quotes(self): + with self.cursor({}) as cursor: + table_name = 'table_{uuid}'.format(uuid=str(uuid4()).replace('-', '_')) + # Test escape syntax directly + cursor.execute("CREATE TABLE IF NOT EXISTS {} AS (SELECT 'you\\'re' AS col_1)".format(table_name)) + cursor.execute("SELECT * FROM {} WHERE col_1 LIKE 'you\\'re'".format(table_name)) + rows = cursor.fetchall() + assert rows[0]["col_1"] == "you're" + + # Test escape syntax in parameter + cursor.execute("SELECT * FROM {} WHERE {}.col_1 LIKE %(var)s".format(table_name, table_name), parameters={"var": "you're"}) + rows = cursor.fetchall() + assert rows[0]["col_1"] == "you're" + def test_get_schemas(self): with self.cursor({}) as cursor: database_name = 'db_{uuid}'.format(uuid=str(uuid4()).replace('-', '_')) From 1f9b78c98c1bface4ffc6fa95bdf64e1db894465 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 10 Oct 2022 16:28:31 -0500 Subject: [PATCH 08/10] Black utils.py Signed-off-by: Jesse Whitehouse --- src/databricks/sql/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py index 92bf9114..1ffac8b1 100644 --- a/src/databricks/sql/utils.py +++ b/src/databricks/sql/utils.py @@ -146,7 +146,7 @@ def escape_string(self, item): # This is good enough when backslashes are literal, newlines are just followed, and the way # to escape a single quote is to put two single quotes. # (i.e. only special character is single quote) - return "'{}'".format(item.replace('\\','\\\\' ).replace("'", "\\'")) + return "'{}'".format(item.replace("\\", "\\\\").replace("'", "\\'")) def escape_sequence(self, item): l = map(str, map(self.escape_item, item)) From ba4fed6a894aed427bf6931a577df919f546c94f Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Tue, 11 Oct 2022 11:43:18 -0500 Subject: [PATCH 09/10] Fix one typo Signed-off-by: Jesse Whitehouse --- tests/unit/test_param_escaper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_param_escaper.py b/tests/unit/test_param_escaper.py index 01584d19..d33d548a 100644 --- a/tests/unit/test_param_escaper.py +++ b/tests/unit/test_param_escaper.py @@ -40,7 +40,7 @@ def test_escape_string_that_includes_special_characters(self): assert pe.escape_string("his name was 'robert palmer'") == r"'his name was \'robert palmer\''" - # These two tests represent the same user input in the several ways it can be written in Python + # These tests represent the same user input in the several ways it can be written in Python # Each argument to `escape_string` evaluates to the same bytes. But Python lets us write it differently. assert pe.escape_string("his name was \"robert palmer\"") == "'his name was \"robert palmer\"'" assert pe.escape_string('his name was "robert palmer"') == "'his name was \"robert palmer\"'" From 9c8eb6e8b59f9d70ff0ad75859c842b6a2441810 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Fri, 14 Oct 2022 11:47:28 -0500 Subject: [PATCH 10/10] Fixed test. This was originally written in such a way that it always passed. Signed-off-by: Jesse Whitehouse --- tests/unit/test_param_escaper.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_param_escaper.py b/tests/unit/test_param_escaper.py index d33d548a..cb5758aa 100644 --- a/tests/unit/test_param_escaper.py +++ b/tests/unit/test_param_escaper.py @@ -74,13 +74,15 @@ def test_escape_string_that_includes_special_characters(self): def test_escape_date_time(self): INPUT = datetime(1991,8,3,21,55) - OUTPUT = "1991-08-03 21:55:00" - assert pe.escape_datetime(INPUT, OUTPUT) + FORMAT = "%Y-%m-%d %H:%M:%S" + OUTPUT = "'1991-08-03 21:55:00'" + assert pe.escape_datetime(INPUT, FORMAT) == OUTPUT def test_escape_date(self): INPUT = date(1991,8,3) - OUTPUT = "1991-08-03" - assert pe.escape_datetime(INPUT, OUTPUT) + FORMAT = "%Y-%m-%d" + OUTPUT = "'1991-08-03'" + assert pe.escape_datetime(INPUT, FORMAT) == OUTPUT def test_escape_sequence_integer(self): assert pe.escape_sequence([1,2,3,4]) == "(1,2,3,4)"