SI: Implement put operations #67

Merged (41 commits) on Dec 30, 2022.

Changes shown below are from 27 of the 41 commits.

Commits:
28c3a59
Basic PUT operation. Currently this never executes because the server
Nov 15, 2022
1b245b1
Bump Spark CLI service protocol version being used.
Nov 15, 2022
1239def
Log when attempting a staging operation
Nov 15, 2022
b605cce
Fix failing unit tests since function signature for ExecuteResponse c…
Nov 16, 2022
3ed84d8
Add e2e test for put.
Nov 16, 2022
57b8a34
Bail on tests if staging_ingestion_user is not set
Nov 16, 2022
7812278
Black client.py
Nov 16, 2022
6b76439
Add unit test that sanity checks _handle_staging_operation is called
Nov 17, 2022
3df7c89
Fix imports so that this module can be run independently:
Nov 17, 2022
8f0a02e
Implement GET operation
Nov 17, 2022
55525cb
Refactor client.py into distinct methods for each ingestion command type
Nov 23, 2022
157ac3d
Update pypoetry so I can develop on Python 3.10
Nov 23, 2022
0739ccc
Applied PR feedback around explicit response codes.
Nov 23, 2022
d3a3651
Applying PR feedback
Nov 23, 2022
72f917e
PR feedback
Nov 23, 2022
fba64b7
Black client.py
Nov 23, 2022
c27a3d6
Refactor e2e test to use a single test for PUT, GET, and REMOVE
Nov 23, 2022
19ca706
Make REMOVE command work
Nov 23, 2022
0167bd9
These methods don't need to know the `operation`
Nov 23, 2022
85e4d7c
Remove single quote that broke query
Nov 23, 2022
713002d
Remove unneeded argument
Nov 23, 2022
fc06ef8
Expect operation to succeed
Nov 23, 2022
cafa17d
Black PySQLStagingIngestionTestSuite only
Nov 23, 2022
a508a1c
Tidy up comments in e2e test
Nov 23, 2022
ce80df0
Basic e2e test scaffolded in. Currently fails.
Nov 23, 2022
36885a4
Only allow ingestion commands when base_uploads_path is specified
Nov 23, 2022
c0c09d4
Restrict local file operations to descendants of uploads_base_path
Nov 23, 2022
f612795
Remove per PR feedback
Dec 20, 2022
e609ef3
Add check for null local_file per PR feedback
Dec 20, 2022
cdbe2d6
Open output stream _after_ successful HTTP request
Dec 20, 2022
34a0362
Resolve relative paths before comparing row.localFile to uploads_base…
Dec 20, 2022
c8a64c7
Add test that PUT fails if file exists in staging location and OVERWR…
Dec 20, 2022
d48d3f3
Add tests: operations fail to modify another user's staging location
Dec 20, 2022
e0037e0
Add test that ingestion command fails if local file is blank
Dec 20, 2022
3fa5d84
Add test that invalid staging path will fail at server
Dec 20, 2022
4824b68
Basic usage example (needs tweaking)
Dec 22, 2022
469f35f
Add samples of GET and REMOVE
Dec 22, 2022
bdb948a
Refactor to allow uploads_base_path to be either a single string object
Dec 28, 2022
0261b7a
Refactor uploads_base_path to staging_allowed_local_path
Dec 29, 2022
00d8a49
Fix mypy static type failures
Dec 30, 2022
7a602e6
Black src files
Dec 30, 2022
7 changes: 7 additions & 0 deletions CONTRIBUTING.md
@@ -112,6 +112,7 @@ export access_token=""
There are several e2e test suites available:
- `PySQLCoreTestSuite`
- `PySQLLargeQueriesSuite`
- `PySQLStagingIngestionTestSuite`
- `PySQLRetryTestSuite.HTTP503Suite` **[not documented]**
- `PySQLRetryTestSuite.HTTP429Suite` **[not documented]**
- `PySQLUnityCatalogTestSuite` **[not documented]**
@@ -122,6 +123,12 @@ To execute the core test suite:
poetry run python -m pytest tests/e2e/driver_tests.py::PySQLCoreTestSuite
```

The `PySQLCoreTestSuite` namespace contains tests for all of the connector's basic features and behaviours. This is the default namespace where tests should be written unless they require specially configured clusters or take an especially long time to execute by design.

The `PySQLLargeQueriesSuite` namespace contains long-running query tests and is kept separate. In general, if the `PySQLCoreTestSuite` passes then these tests will as well.

The `PySQLStagingIngestionTestSuite` namespace requires a cluster running DBR version > 12.x which supports staging ingestion commands.

The suites marked `[not documented]` require additional configuration which will be documented at a later time.
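Following the core-suite command above, the staging ingestion suite can be run the same way. This is a sketch: the `staging_ingestion_user` variable comes from the commit log ("Bail on tests if staging_ingestion_user is not set"), and the exact set of required variables is an assumption based on this PR rather than final documentation.

```shell
# Assumed environment, mirroring the core-suite setup plus the
# staging_ingestion_user referenced in this PR's commit log.
export host=""
export http_path=""
export access_token=""
export staging_ingestion_user=""

poetry run python -m pytest tests/e2e/driver_tests.py::PySQLStagingIngestionTestSuite
```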
### Code formatting

48 changes: 42 additions & 6 deletions poetry.lock


4 changes: 4 additions & 0 deletions pyproject.toml
@@ -16,6 +16,10 @@ pyarrow = "^9.0.0"
lz4 = "^4.0.2"
requests=">2.18.1"
oauthlib=">=3.1.0"
numpy = [
{version = "1.21.1", python = ">=3.7,<3.8"},
{version = "1.23.4", python = ">=3.8"}
]

[tool.poetry.dev-dependencies]
pytest = "^7.1.2"
124 changes: 122 additions & 2 deletions src/databricks/sql/client.py
@@ -2,6 +2,9 @@

import pandas
import pyarrow
import requests
import json
import os

from databricks.sql import __version__
from databricks.sql import *
@@ -28,7 +31,7 @@ def __init__(
session_configuration: Dict[str, Any] = None,
catalog: Optional[str] = None,
schema: Optional[str] = None,
**kwargs
**kwargs,
) -> None:
"""
Connect to a Databricks SQL endpoint or a Databricks cluster.
@@ -173,7 +176,7 @@ def read(self) -> Optional[OAuthToken]:
http_path,
(http_headers or []) + base_headers,
auth_provider,
**kwargs
**kwargs,
)

self._session_handle = self.thrift_backend.open_session(
@@ -297,6 +300,117 @@ def _check_not_closed(self):
if not self.open:
raise Error("Attempting operation on closed cursor")

def _handle_staging_operation(self, uploads_base_path: str):
"""Fetch the HTTP request instruction from a staging ingestion command
and call the designated handler.

Raise an exception if localFile is specified by the server but the localFile
is not descended from uploads_base_path.
"""

if uploads_base_path is None:
raise Error(
"You must provide an uploads_base_path when initialising a connection to perform ingestion commands"
)

row = self.active_result_set.fetchone()
Collaborator:

I know self.active_result_set is introduced in this PR.
so this is merely a generic question rather than one specific to staging.

if we are using a field member self.active_result_set for keeping a state that means we won't be able to support multi threading in an application which concurrently uses pysql. is this understanding correct?

Contributor Author:

I'm confused by the first part of your question:

I know self.active_result_set is introduced in this PR.

I don't believe this is correct. active_result_set has been present since the first version of this library. It's present on main right now.

if we are using a field member self.active_result_set for keeping a state that means we won't be able to support multi threading in an application which concurrently uses pysql

You're pulling on a valid thread. But I disagree with this assessment. In general pysql works fine with multi-threading. In fact, multi-threading is required if you want to cancel a running query (which is reflected in PySQLCoreTestSuite.test_cancel_during_execute).

The specific scenario where active_result_set state would affect multi-threaded applications is if multiple threads are working with the same cursor. Is that a desirable usage pattern? I think there is usually one cursor per thread, in which case there's no issue with shared state.


if getattr(row, "localFile", None):
if os.path.commonpath([row.localFile, uploads_base_path]) != uploads_base_path:
Collaborator:

What happens if uploads_base_path = /Users/user1 and row.localFile = /Users/user1/../user2? What does this method return?

Collaborator:

could you please add some tests for that?

Contributor Author:

Good catch.

tl;dr I updated the code in 34a0362 so that it resolves any relative paths before checking for their common_path. I added a test to prove this.

Before

/Users/user1 and /Users/user1/../user2 show a common path of /Users/user1 which is wrong.

After

/Users/user1 and /Users/user1/../user2 show a common path of /Users which is correct.

Contributor Author:

@moderakh Are there other cases we should consider?

raise Error("Local file operations are restricted to paths within the configured uploads_base_path")
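The traversal case from the review thread can be reproduced directly. This is a standalone sketch, not the driver's code: `is_descendant` is an illustrative name, but the resolve-then-compare approach matches what commit 34a0362 describes.

```python
import os

def is_descendant(local_file: str, base_path: str) -> bool:
    # Resolve ".." segments (and symlinks, for path components that exist)
    # before comparing; an unresolved comparison is fooled by "..".
    local_file = os.path.realpath(local_file)
    base_path = os.path.realpath(base_path)
    return os.path.commonpath([local_file, base_path]) == base_path

# Unresolved comparison: commonpath reports /Users/user1, which looks safe
# but is wrong, because the ".." has not been collapsed yet.
naive = os.path.commonpath(["/Users/user1/../user2/f.csv", "/Users/user1"])
print(naive)  # /Users/user1

# Resolved comparison: /Users/user1/../user2 collapses to /Users/user2,
# so the traversal attempt is rejected.
print(is_descendant("/Users/user1/../user2/f.csv", "/Users/user1"))  # False
print(is_descendant("/Users/user1/data/f.csv", "/Users/user1"))      # True
```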

# TODO: Experiment with DBR sending real headers.
# The specification says headers will be in JSON format but the current null value is actually an empty list []
handler_args = {
"presigned_url": row.presignedUrl,
"local_file": getattr(row, "localFile", None),
"headers": json.loads(row.headers or "{}"),
}

logger.debug(
f"Attempting staging operation indicated by server: {row.operation} - {getattr(row, 'localFile', '')}"
)

# TODO: Create a retry loop here to re-attempt if the request times out or fails
if row.operation == "GET":
return self._handle_staging_get(**handler_args)
elif row.operation == "PUT":
return self._handle_staging_put(**handler_args)
elif row.operation == "REMOVE":
# Local file isn't needed to remove a remote resource
handler_args.pop("local_file")
return self._handle_staging_remove(**handler_args)
else:
raise Error(
f"Operation {row.operation} is not supported. "
+ "Supported operations are GET, PUT, and REMOVE"
)
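The headers TODO above can be made concrete. The PR's code calls `json.loads(row.headers or "{}")` directly; this sketch adds a hypothetical `parse_staging_headers` helper (not in the PR) with an extra guard, to show what each server value would produce, including the empty-list placeholder the TODO mentions.

```python
import json

def parse_staging_headers(raw):
    # The spec says headers arrive as a JSON object, but per the TODO the
    # current placeholder value is "[]" (or None). The `or "{}"` fallback
    # handles None; the isinstance guard (an addition here, not in the PR)
    # also normalizes the "[]" placeholder to an empty dict.
    parsed = json.loads(raw or "{}")
    return parsed if isinstance(parsed, dict) else {}

print(parse_staging_headers(None))                        # {}
print(parse_staging_headers("[]"))                        # {}
print(parse_staging_headers('{"x-amz-acl": "private"}'))  # {'x-amz-acl': 'private'}
```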

def _handle_staging_put(
self, presigned_url: str, local_file: str, headers: dict = None
):
"""Make an HTTP PUT request

Raise an exception if request fails. Returns no data.
"""

if local_file is None:
raise Error("Cannot perform PUT without specifying a local_file")

with open(local_file, "rb") as fh:
r = requests.put(url=presigned_url, data=fh, headers=headers)

# fmt: off
# Design borrowed from: https://stackoverflow.com/a/2342589/5093960

OK = requests.codes.ok # 200
CREATED = requests.codes.created # 201
ACCEPTED = requests.codes.accepted # 202
NO_CONTENT = requests.codes.no_content # 204

# fmt: on

if r.status_code not in [OK, CREATED, NO_CONTENT, ACCEPTED]:
raise Error(
f"Staging operation over HTTP was unsuccessful: {r.status_code}-{r.text}"
)

if r.status_code == ACCEPTED:
logger.debug(
f"Response code {ACCEPTED} from server indicates ingestion command was accepted "
+ "but not yet applied on the server. It's possible this command may fail later."
)
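The success set the PUT handler accepts maps to four numeric codes. This stdlib-only sketch uses `http.HTTPStatus`, which exposes the same values as the `requests.codes` constants above; `put_succeeded` is an illustrative name, not a function in the PR.

```python
from http import HTTPStatus

# Same success set the PUT handler accepts: 200, 201, 202, 204.
ACCEPTED_PUT_CODES = {
    HTTPStatus.OK,          # 200 - applied
    HTTPStatus.CREATED,     # 201 - resource created
    HTTPStatus.ACCEPTED,    # 202 - accepted, but possibly not yet applied
    HTTPStatus.NO_CONTENT,  # 204 - applied, empty response body
}

def put_succeeded(status_code: int) -> bool:
    # HTTPStatus members are IntEnums, so plain ints compare and hash equal.
    return status_code in ACCEPTED_PUT_CODES

print(put_succeeded(200))  # True
print(put_succeeded(202))  # True (may still fail later, per the debug log)
print(put_succeeded(403))  # False
```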

def _handle_staging_get(
self, local_file: str, presigned_url: str, headers: dict = None
):
"""Make an HTTP GET request, create a local file with the received data

Raise an exception if request fails. Returns no data.
"""

with open(local_file, "wb") as fp:
r = requests.get(url=presigned_url, headers=headers)

# response.ok is True when the status code is below 400,
# so any 2xx or 3xx response will evaluate r.ok == True
if not r.ok:
raise Error(
f"Staging operation over HTTP was unsuccessful: {r.status_code}-{r.text}"
)

fp.write(r.content)

def _handle_staging_remove(self, presigned_url: str, headers: dict = None):
"""Make an HTTP DELETE request to the presigned_url"""

r = requests.delete(url=presigned_url, headers=headers)

if not r.ok:
raise Error(
f"Staging operation over HTTP was unsuccessful: {r.status_code}-{r.text}"
)

def execute(
self, operation: str, parameters: Optional[Dict[str, str]] = None
) -> "Cursor":
@@ -331,6 +445,12 @@ def execute(
self.buffer_size_bytes,
self.arraysize,
)

if execute_response.is_staging_operation:
Contributor Author:

Question for reviewers: is there any specifically desired end-state for the cursor after a staging operation? Maybe we return a new NamedTuple StagingOperationResult with properties of .successful:boolean and perhaps a copy of the operation and localFile that were used?

Reply:

I don't quite get this question, but the cursor for now will return just one row and we should have reached the end of this cursor.

Collaborator:

@susodapop could you please explain, with some sample code, how this will provide a different experience to the end user?

self._handle_staging_operation(
uploads_base_path=self.thrift_backend.uploads_base_path
)

return self

def executemany(self, operation, seq_of_parameters):
7 changes: 6 additions & 1 deletion src/databricks/sql/thrift_backend.py
@@ -61,6 +61,7 @@ def __init__(
http_path: str,
http_headers,
auth_provider: AuthProvider,
uploads_base_path: str = None,
**kwargs,
):
# Internal arguments in **kwargs:
@@ -110,6 +111,7 @@
else:
raise ValueError("No valid connection settings.")

self.uploads_base_path = uploads_base_path
self._initialize_retry_args(kwargs)
self._use_arrow_native_complex_types = kwargs.get(
"_use_arrow_native_complex_types", True
@@ -452,7 +454,7 @@ def open_session(self, session_configuration, catalog, schema):
initial_namespace = None

open_session_req = ttypes.TOpenSessionReq(
client_protocol_i64=ttypes.TProtocolVersion.SPARK_CLI_SERVICE_PROTOCOL_V6,
client_protocol_i64=ttypes.TProtocolVersion.SPARK_CLI_SERVICE_PROTOCOL_V7,
client_protocol=None,
initialNamespace=initial_namespace,
canUseMultipleCatalogs=True,
@@ -733,6 +735,8 @@ def _results_message_to_execute_response(self, resp, operation_state):
.to_pybytes()
)
lz4_compressed = t_result_set_metadata_resp.lz4Compressed
# TODO: will this fail if metadata doesn't include `isStagingOperation`?
is_staging_operation = t_result_set_metadata_resp.isStagingOperation
if direct_results and direct_results.resultSet:
assert direct_results.resultSet.results.startRowOffset == 0
assert direct_results.resultSetMetadata
@@ -752,6 +756,7 @@
has_been_closed_server_side=has_been_closed_server_side,
has_more_rows=has_more_rows,
lz4_compressed=lz4_compressed,
is_staging_operation=is_staging_operation,
command_handle=resp.operationHandle,
description=description,
arrow_schema_bytes=schema_bytes,
2 changes: 1 addition & 1 deletion src/databricks/sql/utils.py
@@ -40,7 +40,7 @@ def remaining_rows(self) -> pyarrow.Table:

ExecuteResponse = namedtuple(
"ExecuteResponse",
"status has_been_closed_server_side has_more_rows description lz4_compressed "
"status has_been_closed_server_side has_more_rows description lz4_compressed is_staging_operation "
"command_handle arrow_queue arrow_schema_bytes",
)
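Adding `is_staging_operation` to the middle of the field string shifts the positional slot of every later field, which is why the construction site in thrift_backend.py passes fields by keyword. This miniature uses abbreviated, hypothetical field names rather than the real ExecuteResponse:

```python
from collections import namedtuple

# Before and after the change, with a field inserted mid-string.
Old = namedtuple("Old", "status lz4_compressed command_handle")
New = namedtuple("New", "status lz4_compressed is_staging_operation command_handle")

print(Old._fields.index("command_handle"))  # 2
print(New._fields.index("command_handle"))  # 3 (shifted by the new field)

# Keyword construction, as used in _results_message_to_execute_response,
# is unaffected by the reordering of positional slots.
resp = New(
    status="FINISHED",
    lz4_compressed=True,
    is_staging_operation=False,
    command_handle="op-handle",
)
print(resp.is_staging_operation)  # False
```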
