Skip to content

[ PECO - 1768 ] PySQL: adjust HTTP retry logic to align with Go and Nodejs drivers #467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 16 additions & 10 deletions src/databricks/sql/auth/retry.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import random
import time
import typing
from enum import Enum
Expand Down Expand Up @@ -285,25 +286,30 @@ def sleep_for_retry(self, response: BaseHTTPResponse) -> bool:
"""
retry_after = self.get_retry_after(response)
if retry_after:
backoff = self.get_backoff_time()
proposed_wait = max(backoff, retry_after)
self.check_proposed_wait(proposed_wait)
time.sleep(proposed_wait)
return True
proposed_wait = retry_after
else:
proposed_wait = self.get_backoff_time()

return False
proposed_wait = min(proposed_wait, self.delay_max)
self.check_proposed_wait(proposed_wait)
time.sleep(proposed_wait)
return True

def get_backoff_time(self) -> float:
"""Calls urllib3's built-in get_backoff_time.
"""
This method implements the exponential backoff algorithm to calculate the delay between retries.

Never returns a value larger than self.delay_max
A MaxRetryDurationError will be raised if the calculated backoff would exceed self.max_attempts_duration

Note: within urllib3, a backoff is only calculated in cases where a Retry-After header is not present
in the previous unsuccessful request and `self.respect_retry_after_header` is True (which is always true)
:return:
"""

proposed_backoff = super().get_backoff_time()
current_attempt = self.stop_after_attempts_count - self.total
proposed_backoff = (2**current_attempt) * self.delay_min
if self.backoff_jitter != 0.0:
proposed_backoff += random.random() * self.backoff_jitter

proposed_backoff = min(proposed_backoff, self.delay_max)
self.check_proposed_wait(proposed_backoff)

Expand Down
4 changes: 2 additions & 2 deletions src/databricks/sql/thrift_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@
# - 900s attempts-duration lines up w ODBC/JDBC drivers (for cluster startup > 10 mins)
_retry_policy = { # (type, default, min, max)
"_retry_delay_min": (float, 1, 0.1, 60),
"_retry_delay_max": (float, 60, 5, 3600),
"_retry_stop_after_attempts_count": (int, 30, 1, 60),
"_retry_delay_max": (float, 30, 5, 3600),
"_retry_stop_after_attempts_count": (int, 5, 1, 60),
"_retry_stop_after_attempts_duration": (float, 900, 1, 86400),
"_retry_delay_default": (float, 5, 1, 60),
}
Expand Down
8 changes: 4 additions & 4 deletions tests/e2e/common/retry_test_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def test_retry_max_count_not_exceeded(self):
def test_retry_exponential_backoff(self):
"""GIVEN the retry policy is configured for reasonable exponential backoff
WHEN the server sends nothing but 429 responses with retry-afters
THEN the connector will use those retry-afters as a floor
THEN the connector will use those retry-afters values as delay
"""
retry_policy = self._retry_policy.copy()
retry_policy["_retry_delay_min"] = 1
Expand All @@ -191,10 +191,10 @@ def test_retry_exponential_backoff(self):
assert isinstance(cm.value.args[1], MaxRetryDurationError)

# With setting delay_min to 1, the expected retry delays should be:
# 3, 3, 4
# The first 2 retries are allowed, the 3rd retry puts the total duration over the limit
# 3, 3, 3, 3
# The first 3 retries are allowed, the 4th retry puts the total duration over the limit
# of 10 seconds
assert mock_obj.return_value.getresponse.call_count == 3
assert mock_obj.return_value.getresponse.call_count == 4
assert duration > 6

# Should be less than 7, but this is a safe margin for CI/CD slowness
Expand Down
57 changes: 39 additions & 18 deletions tests/unit/test_retry.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from os import error
import time
from unittest.mock import Mock, patch
from unittest.mock import patch, call
import pytest
from requests import Request
from urllib3 import HTTPResponse
from databricks.sql.auth.retry import DatabricksRetryPolicy, RequestHistory

from databricks.sql.auth.retry import DatabricksRetryPolicy, RequestHistory, CommandType
from urllib3.exceptions import MaxRetryError

class TestRetry:
@pytest.fixture()
Expand All @@ -25,32 +23,55 @@ def error_history(self) -> RequestHistory:
method="POST", url=None, error=None, status=503, redirect_location=None
)

def calculate_backoff_time(self, attempt, delay_min, delay_max):
exponential_backoff_time = (2**attempt) * delay_min
return min(exponential_backoff_time, delay_max)

@patch("time.sleep")
def test_sleep__no_retry_after(self, t_mock, retry_policy, error_history):
retry_policy._retry_start_time = time.time()
retry_policy.history = [error_history, error_history]
retry_policy.sleep(HTTPResponse(status=503))
t_mock.assert_called_with(2)

expected_backoff_time = self.calculate_backoff_time(0, retry_policy.delay_min, retry_policy.delay_max)
t_mock.assert_called_with(expected_backoff_time)

@patch("time.sleep")
def test_sleep__retry_after_is_binding(self, t_mock, retry_policy, error_history):
def test_sleep__no_retry_after_header__multiple_retries(self, t_mock, retry_policy):
num_attempts = retry_policy.stop_after_attempts_count

retry_policy._retry_start_time = time.time()
retry_policy.history = [error_history, error_history]
retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "3"}))
t_mock.assert_called_with(3)
retry_policy.command_type = CommandType.OTHER

for attempt in range(num_attempts):
retry_policy.sleep(HTTPResponse(status=503))
# Internally urllib3 calls the increment function generating a new instance for every retry
retry_policy = retry_policy.increment()

expected_backoff_times = []
for attempt in range(num_attempts):
expected_backoff_times.append(self.calculate_backoff_time(attempt, retry_policy.delay_min, retry_policy.delay_max))

# Asserts if the sleep value was called in the expected order
t_mock.assert_has_calls([call(expected_time) for expected_time in expected_backoff_times])

@patch("time.sleep")
def test_sleep__retry_after_present_but_not_binding(
self, t_mock, retry_policy, error_history
):
def test_excessive_retry_attempts_error(self, t_mock, retry_policy):
# Attempting more than stop_after_attempt_count
num_attempts = retry_policy.stop_after_attempts_count + 1

retry_policy._retry_start_time = time.time()
retry_policy.history = [error_history, error_history]
retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "1"}))
t_mock.assert_called_with(2)
retry_policy.command_type = CommandType.OTHER

with pytest.raises(MaxRetryError):
for attempt in range(num_attempts):
retry_policy.sleep(HTTPResponse(status=503))
# Internally urllib3 calls the increment function generating a new instance for every retry
retry_policy = retry_policy.increment()

@patch("time.sleep")
def test_sleep__retry_after_surpassed(self, t_mock, retry_policy, error_history):
def test_sleep__retry_after_present(self, t_mock, retry_policy, error_history):
retry_policy._retry_start_time = time.time()
retry_policy.history = [error_history, error_history, error_history]
retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "3"}))
t_mock.assert_called_with(4)
t_mock.assert_called_with(3)
Loading