Skip to content

Commit 0c7ba3c

Browse files
author
Lambros Petrou
committed
Add aggressive retries and adjust timeouts for the AWS SDK
It's better to make the handler a bit slower and attempt many retries than to fail customers' CloudFormation deployments: a failed deployment causes a far bigger delay than waiting a few more seconds for the resource to be created.
1 parent e5a34cd commit 0c7ba3c

File tree

5 files changed

+101
-8
lines changed

5 files changed

+101
-8
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
package software.amazon.codeguruprofiler.profilinggroup;
2+
3+
import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration;
4+
import software.amazon.awssdk.core.internal.retry.SdkDefaultRetrySetting;
5+
import software.amazon.awssdk.core.retry.RetryPolicy;
6+
import software.amazon.awssdk.core.retry.backoff.BackoffStrategy;
7+
import software.amazon.awssdk.core.retry.backoff.EqualJitterBackoffStrategy;
8+
import software.amazon.awssdk.http.SdkHttpClient;
9+
import software.amazon.awssdk.http.apache.ApacheHttpClient;
10+
import software.amazon.awssdk.services.codeguruprofiler.CodeGuruProfilerClient;
11+
12+
import java.time.Duration;
13+
14+
public class CodeGuruProfilerClientBuilder {
15+
// Following what we do at https://tiny.amazon.com/c1ma38jp/codeamazpackSkySblobmainsrc
16+
17+
/**
18+
* We use an equal-jitter exponential backoff strategy, with a base delay of 100 ms. And, the max backoff time is
19+
* set to 1 s. So, the retry pattern would be like this: 50 - 100 ms, 100 - 200 ms, 200 - 400 ms, 400 - 800 ms, 500
20+
* - 1000 ms, 500 - 1000 ms ... until the request succeeds or until the overall execution timeout or until we run
21+
* out of retries.
22+
*
23+
* The default in the SDK is a full-jitter backoff strategy in which the minimum backoff delay is 0ms (allows instant retries),
24+
* and uses a max backoff of 20 seconds.
25+
*/
26+
private static final Duration BASE_DELAY_MS = Duration.ofMillis(100); // Default: 100ms
27+
private static final Duration MAX_BACKOFF_MS = Duration.ofMillis(1000); // Default: 20 seconds
28+
29+
/**
30+
* Setting this to the max retries supported by the SDK. Note that we would timeout well before we do these many
31+
* retries since we are bound by the overall request timeout.
32+
*/
33+
private static final int MAX_ERROR_RETRY = 30; // Default: 3 (for most services)
34+
35+
/**
36+
* See https://tiny.amazon.com/vlswgwgb/codeamazpackSkySbloba96fsrc
37+
* for more details on the individual call timeouts.
38+
*/
39+
private static final Duration OVERALL_TIMEOUT = Duration.ofMillis(10000); // We can handle more here compared to our API.
40+
private static final Duration ATTEMPT_TIMEOUT = Duration.ofMillis(500);
41+
42+
/**
43+
* Maximum amount of time that the client waits for the underlying HTTP client to establish a TCP connection.
44+
* We want connection issues to time out quickly so that they can be retried like other failures. We can rely
45+
* on fast network since our calls are intra-region.
46+
*/
47+
private static final Duration CONNECTION_TIMEOUT = Duration.ofMillis(500); // Default: 10 seconds
48+
49+
/**
50+
* The maximum amount of time that the HTTP client waits to read data from an already-established TCP connection.
51+
* This is the time between when an HTTP POST ends and the entire response of the request is received, and it
52+
* includes the service and network round-trip times.
53+
*
54+
* The general recommendation is to set this value a little higher than the ATTEMPT_TIMEOUT setting if they are used together.
55+
*/
56+
private static final Duration SOCKET_TIMEOUT = Duration.ofMillis(600); // Default: 30s
57+
58+
59+
private static RetryPolicy getRetryPolicy() {
60+
BackoffStrategy failureBackoffStrategy = EqualJitterBackoffStrategy.builder()
61+
.baseDelay(BASE_DELAY_MS)
62+
.maxBackoffTime(MAX_BACKOFF_MS)
63+
.build();
64+
BackoffStrategy throttlingBackoffStrategy = EqualJitterBackoffStrategy.builder()
65+
.baseDelay(SdkDefaultRetrySetting.THROTTLED_BASE_DELAY) // 500ms
66+
.maxBackoffTime(MAX_BACKOFF_MS)
67+
.build();
68+
69+
return RetryPolicy.defaultRetryPolicy().toBuilder()
70+
.backoffStrategy(failureBackoffStrategy)
71+
.throttlingBackoffStrategy(throttlingBackoffStrategy)
72+
.numRetries(MAX_ERROR_RETRY) // We can be a bit slower in CloudFormation for the sake of not failing the deployment!
73+
.build();
74+
}
75+
76+
private static SdkHttpClient getHttpClient() {
77+
return ApacheHttpClient.builder()
78+
.connectionTimeout(CONNECTION_TIMEOUT)
79+
.socketTimeout(SOCKET_TIMEOUT)
80+
.build();
81+
}
82+
83+
public static CodeGuruProfilerClient create() {
84+
return CodeGuruProfilerClient.builder()
85+
.overrideConfiguration(ClientOverrideConfiguration.builder()
86+
.retryPolicy(getRetryPolicy())
87+
.apiCallTimeout(OVERALL_TIMEOUT)
88+
.apiCallAttemptTimeout(ATTEMPT_TIMEOUT)
89+
.build())
90+
.httpClient(getHttpClient())
91+
.build();
92+
}
93+
}

aws-codeguruprofiler-profilinggroup/src/main/java/software/amazon/codeguruprofiler/profilinggroup/CreateHandler.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
public class CreateHandler extends BaseHandler<CallbackContext> {
2121

22+
private final CodeGuruProfilerClient profilerClient = CodeGuruProfilerClientBuilder.create();
23+
2224
@Override
2325
public ProgressEvent<ResourceModel, CallbackContext> handleRequest(
2426
final AmazonWebServicesClientProxy proxy,
@@ -29,8 +31,6 @@ public ProgressEvent<ResourceModel, CallbackContext> handleRequest(
2931
final ResourceModel model = request.getDesiredResourceState();
3032

3133
try {
32-
CodeGuruProfilerClient profilerClient = CodeGuruProfilerClient.create();
33-
3434
CreateProfilingGroupRequest createProfilingGroupRequest = CreateProfilingGroupRequest.builder()
3535
.profilingGroupName(model.getProfilingGroupName())
3636
.build();

aws-codeguruprofiler-profilinggroup/src/main/java/software/amazon/codeguruprofiler/profilinggroup/DeleteHandler.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
public class DeleteHandler extends BaseHandler<CallbackContext> {
1919

20+
private final CodeGuruProfilerClient profilerClient = CodeGuruProfilerClientBuilder.create();
21+
2022
@Override
2123
public ProgressEvent<ResourceModel, CallbackContext> handleRequest(
2224
final AmazonWebServicesClientProxy proxy,
@@ -27,8 +29,6 @@ public ProgressEvent<ResourceModel, CallbackContext> handleRequest(
2729
final ResourceModel model = request.getDesiredResourceState();
2830

2931
try {
30-
CodeGuruProfilerClient profilerClient = CodeGuruProfilerClient.create();
31-
3232
DeleteProfilingGroupRequest deleteProfilingGroupRequest = DeleteProfilingGroupRequest.builder()
3333
.profilingGroupName(model.getProfilingGroupName())
3434
.build();

aws-codeguruprofiler-profilinggroup/src/main/java/software/amazon/codeguruprofiler/profilinggroup/ListHandler.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818

1919
public class ListHandler extends BaseHandler<CallbackContext> {
2020

21+
private final CodeGuruProfilerClient profilerClient = CodeGuruProfilerClientBuilder.create();
22+
2123
@Override
2224
public ProgressEvent<ResourceModel, CallbackContext> handleRequest(
2325
final AmazonWebServicesClientProxy proxy,
@@ -28,8 +30,6 @@ public ProgressEvent<ResourceModel, CallbackContext> handleRequest(
2830
final List<ResourceModel> models = new ArrayList<>();
2931

3032
try {
31-
CodeGuruProfilerClient profilerClient = CodeGuruProfilerClient.create();
32-
3333
ListProfilingGroupsRequest listProfilingGroupsRequest = ListProfilingGroupsRequest.builder()
3434
.includeDescription(true)
3535
.maxResults(100)

aws-codeguruprofiler-profilinggroup/src/main/java/software/amazon/codeguruprofiler/profilinggroup/ReadHandler.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818

1919
public class ReadHandler extends BaseHandler<CallbackContext> {
2020

21+
private final CodeGuruProfilerClient profilerClient = CodeGuruProfilerClientBuilder.create();
22+
2123
@Override
2224
public ProgressEvent<ResourceModel, CallbackContext> handleRequest(
2325
final AmazonWebServicesClientProxy proxy,
@@ -28,8 +30,6 @@ public ProgressEvent<ResourceModel, CallbackContext> handleRequest(
2830
final ResourceModel model = request.getDesiredResourceState();
2931

3032
try {
31-
CodeGuruProfilerClient profilerClient = CodeGuruProfilerClient.create();
32-
3333
DescribeProfilingGroupRequest describeProfilingGroupRequest = DescribeProfilingGroupRequest.builder()
3434
.profilingGroupName(model.getProfilingGroupName())
3535
.build();

0 commit comments

Comments
 (0)