databricks · jprakash-db · Nov 20, 2024 · Nov 11, 2024 · Nov 12, 2024 · Nov 19, 2024
@@ -1,4 +1,5 @@
 import logging
+import random
 import time
 import typing
 from enum import Enum
@@ -285,25 +286,30 @@ def sleep_for_retry(self, response: BaseHTTPResponse) -> bool:
         """
         retry_after = self.get_retry_after(response)
         if retry_after:
-            backoff = self.get_backoff_time()
-            proposed_wait = max(backoff, retry_after)
-            self.check_proposed_wait(proposed_wait)
-            time.sleep(proposed_wait)
-            return True
+            proposed_wait = retry_after
+        else:
+            proposed_wait = self.get_backoff_time()
 
-        return False
+        proposed_wait = min(proposed_wait, self.delay_max)
+        self.check_proposed_wait(proposed_wait)
+        time.sleep(proposed_wait)
+        return True
 
     def get_backoff_time(self) -> float:
-        """Calls urllib3's built-in get_backoff_time.
+        """
+        This method implements the exponential backoff algorithm to calculate the delay between retries.
 
         Never returns a value larger than self.delay_max
         A MaxRetryDurationError will be raised if the calculated backoff would exceed self.max_attempts_duration
 
-        Note: within urllib3, a backoff is only calculated in cases where a Retry-After header is not present
-            in the previous unsuccessful request and `self.respect_retry_after_header` is True (which is always true)
+        :return:
         """
 
-        proposed_backoff = super().get_backoff_time()
+        current_attempt = self.stop_after_attempts_count - self.total
+        proposed_backoff = (2**current_attempt) * self.delay_min
+        if self.backoff_jitter != 0.0:
+            proposed_backoff += random.random() * self.backoff_jitter
+
         proposed_backoff = min(proposed_backoff, self.delay_max)
         self.check_proposed_wait(proposed_backoff)
 

@@ -64,8 +64,8 @@
 # - 900s attempts-duration lines up w ODBC/JDBC drivers (for cluster startup > 10 mins)
 _retry_policy = {  # (type, default, min, max)
     "_retry_delay_min": (float, 1, 0.1, 60),
-    "_retry_delay_max": (float, 60, 5, 3600),
-    "_retry_stop_after_attempts_count": (int, 30, 1, 60),
+    "_retry_delay_max": (float, 30, 5, 3600),
+    "_retry_stop_after_attempts_count": (int, 5, 1, 60),
     "_retry_stop_after_attempts_duration": (float, 900, 1, 86400),
     "_retry_delay_default": (float, 5, 1, 60),
 }

@@ -174,7 +174,7 @@ def test_retry_max_count_not_exceeded(self):
     def test_retry_exponential_backoff(self):
         """GIVEN the retry policy is configured for reasonable exponential backoff
         WHEN the server sends nothing but 429 responses with retry-afters
-        THEN the connector will use those retry-afters as a floor
+        THEN the connector will use those retry-afters values as delay
         """
         retry_policy = self._retry_policy.copy()
         retry_policy["_retry_delay_min"] = 1
@@ -191,10 +191,10 @@ def test_retry_exponential_backoff(self):
             assert isinstance(cm.value.args[1], MaxRetryDurationError)
 
             # With setting delay_min to 1, the expected retry delays should be:
-            # 3, 3, 4
-            # The first 2 retries are allowed, the 3rd retry puts the total duration over the limit
+            # 3, 3, 3, 3
+            # The first 3 retries are allowed, the 4th retry puts the total duration over the limit
             # of 10 seconds
-            assert mock_obj.return_value.getresponse.call_count == 3
+            assert mock_obj.return_value.getresponse.call_count == 4
             assert duration > 6
 
             # Should be less than 7, but this is a safe margin for CI/CD slowness

@@ -1,11 +1,9 @@
-from os import error
 import time
-from unittest.mock import Mock, patch
+from unittest.mock import patch, call
 import pytest
-from requests import Request
 from urllib3 import HTTPResponse
-from databricks.sql.auth.retry import DatabricksRetryPolicy, RequestHistory
-
+from databricks.sql.auth.retry import DatabricksRetryPolicy, RequestHistory, CommandType
+from urllib3.exceptions import MaxRetryError
 
 class TestRetry:
     @pytest.fixture()
@@ -25,32 +23,55 @@ def error_history(self) -> RequestHistory:
             method="POST", url=None, error=None, status=503, redirect_location=None
         )
 
+    def calculate_backoff_time(self, attempt, delay_min, delay_max):
+        exponential_backoff_time = (2**attempt) * delay_min
+        return min(exponential_backoff_time, delay_max)
+
     @patch("time.sleep")
     def test_sleep__no_retry_after(self, t_mock, retry_policy, error_history):
         retry_policy._retry_start_time = time.time()
         retry_policy.history = [error_history, error_history]
         retry_policy.sleep(HTTPResponse(status=503))
-        t_mock.assert_called_with(2)
+
+        expected_backoff_time = self.calculate_backoff_time(0, retry_policy.delay_min, retry_policy.delay_max)
+        t_mock.assert_called_with(expected_backoff_time)
 
     @patch("time.sleep")
-    def test_sleep__retry_after_is_binding(self, t_mock, retry_policy, error_history):
+    def test_sleep__no_retry_after_header__multiple_retries(self, t_mock, retry_policy):
+        num_attempts = retry_policy.stop_after_attempts_count
+
         retry_policy._retry_start_time = time.time()
-        retry_policy.history = [error_history, error_history]
-        retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "3"}))
-        t_mock.assert_called_with(3)
+        retry_policy.command_type = CommandType.OTHER
+
+        for attempt in range(num_attempts):
+            retry_policy.sleep(HTTPResponse(status=503))
+            # Internally urllib3 calls the increment function generating a new instance for every retry
+            retry_policy = retry_policy.increment()
+
+        expected_backoff_times = []
+        for attempt in range(num_attempts):
+            expected_backoff_times.append(self.calculate_backoff_time(attempt, retry_policy.delay_min, retry_policy.delay_max))
+
+        # Asserts if the sleep value was called in the expected order
+        t_mock.assert_has_calls([call(expected_time) for expected_time in expected_backoff_times])
 
     @patch("time.sleep")
-    def test_sleep__retry_after_present_but_not_binding(
-        self, t_mock, retry_policy, error_history
-    ):
+    def test_excessive_retry_attempts_error(self, t_mock, retry_policy):
+        # Attempting more than stop_after_attempt_count
+        num_attempts = retry_policy.stop_after_attempts_count + 1
+
         retry_policy._retry_start_time = time.time()
-        retry_policy.history = [error_history, error_history]
-        retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "1"}))
-        t_mock.assert_called_with(2)
+        retry_policy.command_type = CommandType.OTHER
+
+        with pytest.raises(MaxRetryError):
+            for attempt in range(num_attempts):
+                retry_policy.sleep(HTTPResponse(status=503))
+                # Internally urllib3 calls the increment function generating a new instance for every retry
+                retry_policy = retry_policy.increment()
 
     @patch("time.sleep")
-    def test_sleep__retry_after_surpassed(self, t_mock, retry_policy, error_history):
+    def test_sleep__retry_after_present(self, t_mock, retry_policy, error_history):
         retry_policy._retry_start_time = time.time()
         retry_policy.history = [error_history, error_history, error_history]
         retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "3"}))
-        t_mock.assert_called_with(4)
+        t_mock.assert_called_with(3)