diff --git a/src/databricks/sql/auth/retry.py b/src/databricks/sql/auth/retry.py index 0c6547cb..ec321bed 100755 --- a/src/databricks/sql/auth/retry.py +++ b/src/databricks/sql/auth/retry.py @@ -1,4 +1,5 @@ import logging +import random import time import typing from enum import Enum @@ -285,25 +286,30 @@ def sleep_for_retry(self, response: BaseHTTPResponse) -> bool: """ retry_after = self.get_retry_after(response) if retry_after: - backoff = self.get_backoff_time() - proposed_wait = max(backoff, retry_after) - self.check_proposed_wait(proposed_wait) - time.sleep(proposed_wait) - return True + proposed_wait = retry_after + else: + proposed_wait = self.get_backoff_time() - return False + proposed_wait = min(proposed_wait, self.delay_max) + self.check_proposed_wait(proposed_wait) + time.sleep(proposed_wait) + return True def get_backoff_time(self) -> float: - """Calls urllib3's built-in get_backoff_time. + """ + This method implements the exponential backoff algorithm to calculate the delay between retries. Never returns a value larger than self.delay_max A MaxRetryDurationError will be raised if the calculated backoff would exceed self.max_attempts_duration - Note: within urllib3, a backoff is only calculated in cases where a Retry-After header is not present - in the previous unsuccessful request and `self.respect_retry_after_header` is True (which is always true) + :return: """ - proposed_backoff = super().get_backoff_time() + current_attempt = self.stop_after_attempts_count - self.total + proposed_backoff = (2**current_attempt) * self.delay_min + if self.backoff_jitter != 0.0: + proposed_backoff += random.random() * self.backoff_jitter + proposed_backoff = min(proposed_backoff, self.delay_max) self.check_proposed_wait(proposed_backoff) diff --git a/src/databricks/sql/thrift_backend.py b/src/databricks/sql/thrift_backend.py index cf5cd906..29be5482 100644 --- a/src/databricks/sql/thrift_backend.py +++ b/src/databricks/sql/thrift_backend.py @@ -64,8 +64,8 @@ # - 900s attempts-duration lines up w ODBC/JDBC drivers (for cluster startup > 10 mins) _retry_policy = { # (type, default, min, max) "_retry_delay_min": (float, 1, 0.1, 60), - "_retry_delay_max": (float, 60, 5, 3600), - "_retry_stop_after_attempts_count": (int, 30, 1, 60), + "_retry_delay_max": (float, 30, 5, 3600), + "_retry_stop_after_attempts_count": (int, 5, 1, 60), "_retry_stop_after_attempts_duration": (float, 900, 1, 86400), "_retry_delay_default": (float, 5, 1, 60), } diff --git a/tests/e2e/common/retry_test_mixins.py b/tests/e2e/common/retry_test_mixins.py index 7dd5f745..942955ca 100755 --- a/tests/e2e/common/retry_test_mixins.py +++ b/tests/e2e/common/retry_test_mixins.py @@ -174,7 +174,7 @@ def test_retry_max_count_not_exceeded(self): def test_retry_exponential_backoff(self): """GIVEN the retry policy is configured for reasonable exponential backoff WHEN the server sends nothing but 429 responses with retry-afters - THEN the connector will use those retry-afters as a floor + THEN the connector will use those retry-afters values as delay """ retry_policy = self._retry_policy.copy() retry_policy["_retry_delay_min"] = 1 @@ -191,10 +191,10 @@ def test_retry_exponential_backoff(self): assert isinstance(cm.value.args[1], MaxRetryDurationError) # With setting delay_min to 1, the expected retry delays should be: - # 3, 3, 4 - # The first 2 retries are allowed, the 3rd retry puts the total duration over the limit + # 3, 3, 3, 3 + # The first 3 retries are allowed, the 4th retry puts the total duration over the limit # of 10 seconds - assert mock_obj.return_value.getresponse.call_count == 3 + assert mock_obj.return_value.getresponse.call_count == 4 assert duration > 6 # Should be less than 7, but this is a safe margin for CI/CD slowness diff --git a/tests/unit/test_retry.py b/tests/unit/test_retry.py index 2108af4f..b7648ffb 100644 --- a/tests/unit/test_retry.py +++ b/tests/unit/test_retry.py @@ -1,11 +1,9 @@ -from os import error import time -from unittest.mock import Mock, patch +from unittest.mock import patch, call import pytest -from requests import Request from urllib3 import HTTPResponse -from databricks.sql.auth.retry import DatabricksRetryPolicy, RequestHistory - +from databricks.sql.auth.retry import DatabricksRetryPolicy, RequestHistory, CommandType +from urllib3.exceptions import MaxRetryError class TestRetry: @pytest.fixture() @@ -25,32 +23,55 @@ def error_history(self) -> RequestHistory: method="POST", url=None, error=None, status=503, redirect_location=None ) + def calculate_backoff_time(self, attempt, delay_min, delay_max): + exponential_backoff_time = (2**attempt) * delay_min + return min(exponential_backoff_time, delay_max) + @patch("time.sleep") def test_sleep__no_retry_after(self, t_mock, retry_policy, error_history): retry_policy._retry_start_time = time.time() retry_policy.history = [error_history, error_history] retry_policy.sleep(HTTPResponse(status=503)) - t_mock.assert_called_with(2) + + expected_backoff_time = self.calculate_backoff_time(0, retry_policy.delay_min, retry_policy.delay_max) + t_mock.assert_called_with(expected_backoff_time) @patch("time.sleep") - def test_sleep__retry_after_is_binding(self, t_mock, retry_policy, error_history): + def test_sleep__no_retry_after_header__multiple_retries(self, t_mock, retry_policy): + num_attempts = retry_policy.stop_after_attempts_count + retry_policy._retry_start_time = time.time() - retry_policy.history = [error_history, error_history] - retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "3"})) - t_mock.assert_called_with(3) + retry_policy.command_type = CommandType.OTHER + + for attempt in range(num_attempts): + retry_policy.sleep(HTTPResponse(status=503)) + # Internally urllib3 calls the increment function generating a new instance for every retry + retry_policy = retry_policy.increment() + + expected_backoff_times = [] + for attempt in range(num_attempts): + expected_backoff_times.append(self.calculate_backoff_time(attempt, retry_policy.delay_min, retry_policy.delay_max)) + + # Asserts if the sleep value was called in the expected order + t_mock.assert_has_calls([call(expected_time) for expected_time in expected_backoff_times]) @patch("time.sleep") - def test_sleep__retry_after_present_but_not_binding( - self, t_mock, retry_policy, error_history - ): + def test_excessive_retry_attempts_error(self, t_mock, retry_policy): + # Attempting more than stop_after_attempt_count + num_attempts = retry_policy.stop_after_attempts_count + 1 + retry_policy._retry_start_time = time.time() - retry_policy.history = [error_history, error_history] - retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "1"})) - t_mock.assert_called_with(2) + retry_policy.command_type = CommandType.OTHER + + with pytest.raises(MaxRetryError): + for attempt in range(num_attempts): + retry_policy.sleep(HTTPResponse(status=503)) + # Internally urllib3 calls the increment function generating a new instance for every retry + retry_policy = retry_policy.increment() @patch("time.sleep") - def test_sleep__retry_after_surpassed(self, t_mock, retry_policy, error_history): + def test_sleep__retry_after_present(self, t_mock, retry_policy, error_history): retry_policy._retry_start_time = time.time() retry_policy.history = [error_history, error_history, error_history] retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "3"})) - t_mock.assert_called_with(4) + t_mock.assert_called_with(3)