
Retry attempts that fail due to a connection timeout #24

Merged: 23 commits, Aug 5, 2022
Commits (23; the diff below shows changes from 20 commits):
fa13d75  Explicitly catch OSError and socket.timeout errors, with automatic retry (susodapop, Jul 28, 2022)
b44d0b8  Add unit and e2e tests for retries on timeout behaviour. (susodapop, Jul 28, 2022)
22d2ac0  Add default delay to the retry policy (susodapop, Jul 28, 2022)
44f12b7  Don't make unit test take 5+ seconds (susodapop, Jul 28, 2022)
09cefec  Fix broken unit tests: test_retry_args_bounding (susodapop, Jul 28, 2022)
de96fce  Black thrift_backend.py (susodapop, Jul 28, 2022)
21c06d4  Only retry OSError's that mention Errno 110 (susodapop, Jul 28, 2022)
befdf3d  Fix unnecessary indentation (susodapop, Jul 28, 2022)
fb1275b  Clarify docstrings after review feedback (susodapop, Jul 29, 2022)
c76ee65  Default retry_delay = None (not retryable) (susodapop, Jul 29, 2022)
7474834  This approach passes the e2e tests, but they take exactly 4 mins 51 secs (susodapop, Aug 2, 2022)
fa1fd50  Revert all changes since `main` (susodapop, Aug 4, 2022)
bf65b81  isolate delay bounding (susodapop, Aug 2, 2022)
a0d340e  Move error details scope up one-level. (susodapop, Aug 2, 2022)
a55cf9d  Retry GetOperationStatus if an OSError was raised during execution (susodapop, Aug 2, 2022)
38411a8  Log when a request is retried due to an OSError. (susodapop, Aug 4, 2022)
5096ef0  Update docstring for make_request (susodapop, Aug 4, 2022)
5c1ee79  Nit: unit tests show the .warn message is deprecated. (susodapop, Aug 4, 2022)
1f87a38  Black thrift_backend.py (susodapop, Aug 4, 2022)
baff3d5  Rerun black on thrift_backend.py (susodapop, Aug 4, 2022)
10016ea  Add comment about manual tests (susodapop, Aug 5, 2022)
4db4ad0  Bump to v2.0.3 (susodapop, Aug 5, 2022)
767e34c  Revert "Bump to v2.0.3" (susodapop, Aug 5, 2022)
72 changes: 57 additions & 15 deletions src/databricks/sql/thrift_backend.py
@@ -1,4 +1,5 @@
from decimal import Decimal
import errno
import logging
import math
import time
@@ -15,6 +16,9 @@

from databricks.sql.thrift_api.TCLIService import TCLIService, ttypes
from databricks.sql import *
from databricks.sql.thrift_api.TCLIService.TCLIService import (
Client as TCLIServiceClient,
)
from databricks.sql.utils import (
ArrowQueue,
ExecuteResponse,
@@ -39,6 +43,7 @@
"_retry_delay_max": (float, 60, 5, 3600),
"_retry_stop_after_attempts_count": (int, 30, 1, 60),
"_retry_stop_after_attempts_duration": (float, 900, 1, 86400),
"_retry_delay_default": (float, 5, 1, 60),
}


@@ -71,6 +76,8 @@ def __init__(
# _retry_delay_min (default: 1)
# _retry_delay_max (default: 60)
# {min,max} pre-retry delay bounds
# _retry_delay_default (default: 5)
# Only used when GetOperationStatus fails due to a TCP/OS Error.
# _retry_stop_after_attempts_count (default: 30)
# total max attempts during retry sequence
# _retry_stop_after_attempts_duration (default: 900)
@@ -158,7 +165,7 @@ def _initialize_retry_args(self, kwargs):
"retry parameter: {} given_or_default {}".format(key, given_or_default)
)
if bound != given_or_default:
logger.warn(
logger.warning(
"Override out of policy retry parameter: "
+ "{} given {}, restricted to {}".format(
key, given_or_default, bound
@@ -243,7 +250,9 @@ def _handle_request_error(self, error_info, attempt, elapsed):
# FUTURE: Consider moving to https://github.com/litl/backoff or
# https://github.com/jd/tenacity for retry logic.
def make_request(self, method, request):
"""Execute given request, attempting retries when receiving HTTP 429/503.
"""Execute given request, attempting retries when
1. Receiving HTTP 429/503 from server
2. OSError is raised during a GetOperationStatus

For delay between attempts, honor the given Retry-After header, but with bounds.
Use lower bound of exponential-backoff based on _retry_delay_min,
@@ -260,17 +269,21 @@ def make_request(self, method, request):
def get_elapsed():
return time.time() - t0

def bound_retry_delay(attempt, proposed_delay):
"""bound delay (seconds) by [min_delay*1.5^(attempt-1), max_delay]"""
delay = int(proposed_delay)
delay = max(delay, self._retry_delay_min * math.pow(1.5, attempt - 1))
delay = min(delay, self._retry_delay_max)
return delay

def extract_retry_delay(attempt):
# encapsulate retry checks, returns None || delay-in-secs
# Retry IFF 429/503 code + Retry-After header set
http_code = getattr(self._transport, "code", None)
retry_after = getattr(self._transport, "headers", {}).get("Retry-After")
if http_code in [429, 503] and retry_after:
# bound delay (seconds) by [min_delay*1.5^(attempt-1), max_delay]
delay = int(retry_after)
delay = max(delay, self._retry_delay_min * math.pow(1.5, attempt - 1))
delay = min(delay, self._retry_delay_max)
return delay
return bound_retry_delay(attempt, int(retry_after))
return None

def attempt_request(attempt):
@@ -279,24 +292,53 @@ def attempt_request(attempt):
# - non-None method_return -> success, return and be done
# - non-None retry_delay -> sleep delay before retry
# - error, error_message always set when available

error, error_message, retry_delay = None, None, None
try:
logger.debug("Sending request: {}".format(request))
response = method(request)
logger.debug("Received response: {}".format(response))
return response
except Exception as error:
except OSError as err:
error = err
error_message = str(err)

gos_name = TCLIServiceClient.GetOperationStatus.__name__
if method.__name__ == gos_name:
retry_delay = bound_retry_delay(attempt, self._retry_delay_default)

# fmt: off
# The built-in errno package encapsulates OSError codes, which are OS-specific.
# log.info for errors we believe are not unusual or unexpected. log.warn for
# others like EEXIST, EBADF, ERANGE, which are not expected in this context.
# | Debian | Darwin |
info_errs = [ # |--------|--------|
errno.ESHUTDOWN, # | 32 | 32 |
errno.EAFNOSUPPORT, # | 97 | 47 |
errno.ECONNRESET, # | 104 | 54 |
errno.ETIMEDOUT, # | 110 | 60 |
]

# fmt: on
log_string = f"{gos_name} failed with code {err.errno} and will attempt to retry"
if err.errno in info_errs:
logger.info(log_string)
else:
logger.warning(log_string)
except Exception as err:
error = err
retry_delay = extract_retry_delay(attempt)
error_message = ThriftBackend._extract_error_message_from_headers(
getattr(self._transport, "headers", {})
)
return RequestErrorInfo(
error=error,
error_message=error_message,
retry_delay=retry_delay,
http_code=getattr(self._transport, "code", None),
method=method.__name__,
request=request,
)
return RequestErrorInfo(
error=error,
error_message=error_message,
retry_delay=retry_delay,
http_code=getattr(self._transport, "code", None),
method=method.__name__,
request=request,
)

# The real work:
# - for each available attempt:
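For context on the delay policy described in the make_request docstring above, the following is a minimal standalone sketch of the bound_retry_delay rule, assuming the default policy values from the _retry_policy table in this diff (_retry_delay_min=1, _retry_delay_max=60, _retry_delay_default=5). It is an illustration only, not part of the change itself.

import math

RETRY_DELAY_MIN = 1.0      # assumed default for _retry_delay_min
RETRY_DELAY_MAX = 60.0     # assumed default for _retry_delay_max
RETRY_DELAY_DEFAULT = 5.0  # assumed default for _retry_delay_default

def bound_retry_delay(attempt: int, proposed_delay: float) -> float:
    # Bound delay (seconds) by [min_delay * 1.5^(attempt - 1), max_delay].
    delay = int(proposed_delay)
    delay = max(delay, RETRY_DELAY_MIN * math.pow(1.5, attempt - 1))
    delay = min(delay, RETRY_DELAY_MAX)
    return delay

# With the 5-second default proposal, the exponential lower bound overtakes it
# around attempt 5, and the 60-second cap takes over around attempt 12.
for attempt in range(1, 13):
    print(attempt, round(bound_retry_delay(attempt, RETRY_DELAY_DEFAULT), 2))
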
57 changes: 57 additions & 0 deletions tests/unit/test_thrift_backend.py
@@ -19,6 +19,7 @@ def retry_policy_factory():
"_retry_delay_max": (float, 60, None, None),
"_retry_stop_after_attempts_count": (int, 30, None, None),
"_retry_stop_after_attempts_duration": (float, 900, None, None),
"_retry_delay_default": (float, 5, 1, 60)
}


@@ -968,6 +969,62 @@ def test_handle_execute_response_sets_active_op_handle(self):

self.assertEqual(mock_resp.operationHandle, mock_cursor.active_op_handle)

@patch("thrift.transport.THttpClient.THttpClient")
@patch("databricks.sql.thrift_api.TCLIService.TCLIService.Client.GetOperationStatus")
@patch("databricks.sql.thrift_backend._retry_policy", new_callable=retry_policy_factory)
def test_make_request_will_retry_GetOperationStatus(
self, mock_retry_policy, mock_GetOperationStatus, t_transport_class):

import thrift, errno
from databricks.sql.thrift_api.TCLIService.TCLIService import Client
from databricks.sql.exc import RequestError
from databricks.sql.utils import NoRetryReason

this_gos_name = "GetOperationStatus"
mock_GetOperationStatus.__name__ = this_gos_name
mock_GetOperationStatus.side_effect = OSError(errno.ETIMEDOUT, "Connection timed out")

protocol = thrift.protocol.TBinaryProtocol.TBinaryProtocol(t_transport_class)
client = Client(protocol)

req = ttypes.TGetOperationStatusReq(
operationHandle=self.operation_handle,
getProgressUpdate=False,
)

EXPECTED_RETRIES = 2

thrift_backend = ThriftBackend(
"foobar",
443,
"path", [],
_retry_stop_after_attempts_count=EXPECTED_RETRIES,
_retry_delay_default=1)


with self.assertRaises(RequestError) as cm:
thrift_backend.make_request(client.GetOperationStatus, req)

self.assertEqual(NoRetryReason.OUT_OF_ATTEMPTS.value, cm.exception.context["no-retry-reason"])
self.assertEqual(f'{EXPECTED_RETRIES}/{EXPECTED_RETRIES}', cm.exception.context["attempt"])

# Unusual OSError code
mock_GetOperationStatus.side_effect = OSError(errno.EEXIST, "File does not exist")

with self.assertLogs("databricks.sql.thrift_backend", level=logging.WARNING) as cm:
with self.assertRaises(RequestError):
thrift_backend.make_request(client.GetOperationStatus, req)

# There should be two warning log messages: one for each retry
self.assertEqual(len(cm.output), EXPECTED_RETRIES)

# The warnings should be identical
self.assertEqual(cm.output[1], cm.output[0])

# The warnings should include this text
self.assertIn(f"{this_gos_name} failed with code {errno.EEXIST} and will attempt to retry", cm.output[0])


@patch("thrift.transport.THttpClient.THttpClient")
def test_make_request_wont_retry_if_headers_not_present(self, t_transport_class):
t_transport_instance = t_transport_class.return_value
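As a reading aid for the logging assertions in the test above, here is a compressed sketch of the log-level selection that the except OSError branch in thrift_backend.py applies. The helper name log_retry is hypothetical and exists only for this illustration; the errno values are taken from the info_errs list in the diff.

import errno
import logging

logger = logging.getLogger("databricks.sql.thrift_backend")

# Codes treated as routine network failures and logged at INFO;
# any other OSError code is logged at WARNING instead.
INFO_ERRS = [errno.ESHUTDOWN, errno.EAFNOSUPPORT, errno.ECONNRESET, errno.ETIMEDOUT]

def log_retry(err: OSError, method_name: str = "GetOperationStatus") -> None:
    log_string = f"{method_name} failed with code {err.errno} and will attempt to retry"
    if err.errno in INFO_ERRS:
        logger.info(log_string)
    else:
        logger.warning(log_string)  # e.g. EEXIST, as exercised by the unit test

log_retry(OSError(errno.ETIMEDOUT, "Connection timed out"))  # logged at INFO
log_retry(OSError(errno.EEXIST, "File exists"))              # logged at WARNING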