
Commit b60e6ef

fix(logs): LogRetention resources fail with rate exceeded errors (#26858)
The LogRetention Custom Resource used to be able to handle server-side throttling when a lot of requests to the CloudWatch Logs service are made at the same time. Handling of this error case got lost during the migration to SDK v3.

If we have a lot of `LogRetention` Custom Resources in a _single_ Stack, CloudFormation apparently applies some internal brakes to the amount of parallelism: resources appear to be batched into smaller groups that need to be completed before the next group is provisioned, and within each group there appears to be an ever-so-slight delay between individual resources. Together this is enough to avoid rate limiting in most circumstances. **Therefore, in practice this issue only occurs when multiple stacks are deployed in parallel.**

To test this scenario, I have added support for `integ-runner` to deploy all stacks of a test case concurrently. Support for arbitrary command args already existed, but needed to explicitly include the `concurrency` option. I then created an integration test that deploys 3 stacks with 25 LogRetention resources each. This triggers the error cases described in #26837.

The fix itself is twofold:

- Pass the `maxRetries` prop value to the SDK client to increase the number of attempts of the SDK's internal throttling handling. But also enforce a minimum for these retries, since they might catch additional retryable failures that our custom outer loop does not account for.
- Explicitly catch `ThrottlingException` errors in the outer retry loop.

Closes #26837

----

*By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license*
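Condensed, the two changes amount to the following minimal sketch (based on the handler source in the diff below; `putRetentionWithRetries` is a hypothetical wrapper, and in the real handler the values come from the custom resource event):

import * as Logs from '@aws-sdk/client-cloudwatch-logs';

async function putRetentionWithRetries(logGroupName: string, retentionInDays: number, maxRetries: number) {
  // (1) Forward maxRetries to the SDK, with a floor of 5, so the SDK's built-in
  // retry handling still covers retryable failures the outer loop doesn't check for.
  const client = new Logs.CloudWatchLogsClient({ maxAttempts: Math.max(5, maxRetries) });
  for (let attempt = 0; ; attempt++) {
    try {
      return await client.send(new Logs.PutRetentionPolicyCommand({ logGroupName, retentionInDays }));
    } catch (error: any) {
      // (2) Explicitly treat throttling as retryable in the outer loop. SDK v3 exposes no
      // exception class to use with instanceof here (aws-sdk-js-v3#5140), so match by name.
      if (error.name !== 'ThrottlingException' || attempt >= maxRetries) {
        throw error;
      }
      // capped exponential backoff with jitter, as in makeWithDelay in the diff below
      await new Promise(r => setTimeout(r, Math.random() * Math.min(10_000, 100 * 2 ** attempt)));
    }
  }
}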
1 parent ad9d764 commit b60e6ef

22 files changed (+9771 -4 lines)
@@ -0,0 +1,19 @@
+{
+  "version": "33.0.0",
+  "files": {
+    "21fbb51d7b23f6a6c262b46a9caee79d744a3ac019fd45422d988b96d44b2a22": {
+      "source": {
+        "path": "LogRetentionIntegRetriesDefaultTestDeployAssert6D1A1A1C.template.json",
+        "packaging": "file"
+      },
+      "destinations": {
+        "current_account-current_region": {
+          "bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}",
+          "objectKey": "21fbb51d7b23f6a6c262b46a9caee79d744a3ac019fd45422d988b96d44b2a22.json",
+          "assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}"
+        }
+      }
+    }
+  },
+  "dockerImages": {}
+}
@@ -0,0 +1,36 @@
+{
+  "Parameters": {
+    "BootstrapVersion": {
+      "Type": "AWS::SSM::Parameter::Value<String>",
+      "Default": "/cdk-bootstrap/hnb659fds/version",
+      "Description": "Version of the CDK Bootstrap resources in this environment, automatically retrieved from SSM Parameter Store. [cdk:skip]"
+    }
+  },
+  "Rules": {
+    "CheckBootstrapVersion": {
+      "Assertions": [
+        {
+          "Assert": {
+            "Fn::Not": [
+              {
+                "Fn::Contains": [
+                  [
+                    "1",
+                    "2",
+                    "3",
+                    "4",
+                    "5"
+                  ],
+                  {
+                    "Ref": "BootstrapVersion"
+                  }
+                ]
+              }
+            ]
+          },
+          "AssertDescription": "CDK bootstrap stack version 6 required. Please run 'cdk bootstrap' with a recent version of the CDK CLI."
+        }
+      ]
+    }
+  }
+}
@@ -0,0 +1,14 @@
+interface LogRetentionEvent extends Omit<AWSLambda.CloudFormationCustomResourceEvent, 'ResourceProperties'> {
+    ResourceProperties: {
+        ServiceToken: string;
+        LogGroupName: string;
+        LogGroupRegion?: string;
+        RetentionInDays?: string;
+        SdkRetry?: {
+            maxRetries?: string;
+        };
+        RemovalPolicy?: string;
+    };
+}
+export declare function handler(event: LogRetentionEvent, context: AWSLambda.Context): Promise<void>;
+export {};
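Worth noting: `RetentionInDays` and `SdkRetry.maxRetries` are declared as strings because CloudFormation delivers all custom resource properties as strings; the handler converts them with a small `parseIntOptional` helper (see the full source further down). A sketch with hypothetical values:

// CloudFormation passes numbers as strings in ResourceProperties.
const props = { RetentionInDays: '30', SdkRetry: { maxRetries: '7' } }; // hypothetical payload
const retentionInDays = parseInt(props.RetentionInDays, 10);            // 30
const maxRetries = parseInt(props.SdkRetry?.maxRetries ?? '5', 10);     // 7; the handler defaults to 5 when absent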

packages/@aws-cdk-testing/framework-integ/test/aws-logs/test/integ.log-retention-retries.js.snapshot/asset.a8515c042d9c942705087943220417be929ac44f968d8fcef2681681b400c0c0/index.js

+192
Some generated files are not rendered by default.
@@ -0,0 +1,229 @@
+/* eslint-disable no-console */
+// eslint-disable-next-line import/no-extraneous-dependencies
+import * as Logs from '@aws-sdk/client-cloudwatch-logs';
+
+interface LogRetentionEvent extends Omit<AWSLambda.CloudFormationCustomResourceEvent, 'ResourceProperties'> {
+  ResourceProperties: {
+    ServiceToken: string;
+    LogGroupName: string;
+    LogGroupRegion?: string;
+    RetentionInDays?: string;
+    SdkRetry?: {
+      maxRetries?: string;
+    };
+    RemovalPolicy?: string
+  };
+}
+
+/**
+ * Creates a log group and doesn't throw if it exists.
+ */
+async function createLogGroupSafe(logGroupName: string, client: Logs.CloudWatchLogsClient, withDelay: (block: () => Promise<void>) => Promise<void>) {
+  await withDelay(async () => {
+    try {
+      const params = { logGroupName };
+      const command = new Logs.CreateLogGroupCommand(params);
+      await client.send(command);
+
+    } catch (error: any) {
+      if (error instanceof Logs.ResourceAlreadyExistsException || error.name === 'ResourceAlreadyExistsException') {
+        // The log group is already created by the lambda execution
+        return;
+      }
+
+      throw error;
+    }
+  });
+}
+
+/**
+ * Deletes a log group and doesn't throw if it does not exist.
+ */
+async function deleteLogGroup(logGroupName: string, client: Logs.CloudWatchLogsClient, withDelay: (block: () => Promise<void>) => Promise<void>) {
+  await withDelay(async () => {
+    try {
+      const params = { logGroupName };
+      const command = new Logs.DeleteLogGroupCommand(params);
+      await client.send(command);
+
+    } catch (error: any) {
+      if (error instanceof Logs.ResourceNotFoundException || error.name === 'ResourceNotFoundException') {
+        // The log group doesn't exist
+        return;
+      }
+
+      throw error;
+    }
+  });
+}
+
+/**
+ * Puts or deletes a retention policy on a log group.
+ */
+async function setRetentionPolicy(
+  logGroupName: string,
+  client: Logs.CloudWatchLogsClient,
+  withDelay: (block: () => Promise<void>) => Promise<void>,
+  retentionInDays?: number,
+) {
+
+  await withDelay(async () => {
+    if (!retentionInDays) {
+      const params = { logGroupName };
+      const deleteCommand = new Logs.DeleteRetentionPolicyCommand(params);
+      await client.send(deleteCommand);
+    } else {
+      const params = { logGroupName, retentionInDays };
+      const putCommand = new Logs.PutRetentionPolicyCommand(params);
+      await client.send(putCommand);
+    }
+  });
+}
+
+export async function handler(event: LogRetentionEvent, context: AWSLambda.Context) {
+  try {
+    console.log(JSON.stringify({ ...event, ResponseURL: '...' }));
+
+    // The target log group
+    const logGroupName = event.ResourceProperties.LogGroupName;
+
+    // The region of the target log group
+    const logGroupRegion = event.ResourceProperties.LogGroupRegion;
+
+    // Parse to AWS SDK retry options
+    const maxRetries = parseIntOptional(event.ResourceProperties.SdkRetry?.maxRetries) ?? 5;
+    const withDelay = makeWithDelay(maxRetries);
+
+    const sdkConfig: Logs.CloudWatchLogsClientConfig = {
+      logger: console,
+      region: logGroupRegion,
+      maxAttempts: Math.max(5, maxRetries), // Use a minimum for SDK level retries, because it might include retryable failures that withDelay isn't checking for
+    };
+    const client = new Logs.CloudWatchLogsClient(sdkConfig);
+
+    if (event.RequestType === 'Create' || event.RequestType === 'Update') {
+      // Act on the target log group
+      await createLogGroupSafe(logGroupName, client, withDelay);
+      await setRetentionPolicy(logGroupName, client, withDelay, parseIntOptional(event.ResourceProperties.RetentionInDays));
+
+      // Configure the Log Group for the Custom Resource function itself
+      if (event.RequestType === 'Create') {
+        const clientForCustomResourceFunction = new Logs.CloudWatchLogsClient({
+          logger: console,
+          region: process.env.AWS_REGION,
+        });
+        // Set a retention policy of 1 day on the logs of this very function.
+        // Due to the async nature of the log group creation, the log group for this function might
+        // still be not created yet at this point. Therefore we attempt to create it.
+        // In case it is being created, createLogGroupSafe will handle the conflict.
+        await createLogGroupSafe(`/aws/lambda/${context.functionName}`, clientForCustomResourceFunction, withDelay);
+        // If createLogGroupSafe fails, the log group is not created even after multiple attempts.
+        // In this case we have nothing to set the retention policy on but an exception will skip
+        // the next line.
+        await setRetentionPolicy(`/aws/lambda/${context.functionName}`, clientForCustomResourceFunction, withDelay, 1);
+      }
+    }
+
+    // When the requestType is delete, delete the log group if the removal policy is delete
+    if (event.RequestType === 'Delete' && event.ResourceProperties.RemovalPolicy === 'destroy') {
+      await deleteLogGroup(logGroupName, client, withDelay);
+      // else retain the log group
+    }
+
+    await respond('SUCCESS', 'OK', logGroupName);
+  } catch (e: any) {
+    console.log(e);
+    await respond('FAILED', e.message, event.ResourceProperties.LogGroupName);
+  }
+
+  function respond(responseStatus: string, reason: string, physicalResourceId: string) {
+    const responseBody = JSON.stringify({
+      Status: responseStatus,
+      Reason: reason,
+      PhysicalResourceId: physicalResourceId,
+      StackId: event.StackId,
+      RequestId: event.RequestId,
+      LogicalResourceId: event.LogicalResourceId,
+      Data: {
+        // Add log group name as part of the response so that it's available via Fn::GetAtt
+        LogGroupName: event.ResourceProperties.LogGroupName,
+      },
+    });
+
+    console.log('Responding', responseBody);
+
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    const parsedUrl = require('url').parse(event.ResponseURL);
+    const requestOptions = {
+      hostname: parsedUrl.hostname,
+      path: parsedUrl.path,
+      method: 'PUT',
+      headers: {
+        'content-type': '',
+        'content-length': Buffer.byteLength(responseBody, 'utf8'),
+      },
+    };
+
+    return new Promise((resolve, reject) => {
+      try {
+        // eslint-disable-next-line @typescript-eslint/no-require-imports
+        const request = require('https').request(requestOptions, resolve);
+        request.on('error', reject);
+        request.write(responseBody);
+        request.end();
+      } catch (e) {
+        reject(e);
+      }
+    });
+  }
+}
+
+function parseIntOptional(value?: string, base = 10): number | undefined {
+  if (value === undefined) {
+    return undefined;
+  }
+
+  return parseInt(value, base);
+}
+
+function makeWithDelay(
+  maxRetries: number,
+  delayBase: number = 100,
+  delayCap = 10 * 1000, // 10s
+): (block: () => Promise<void>) => Promise<void> {
+  // If we try to update the log group, then due to the async nature of
+  // Lambda logging there could be a race condition when the same log group is
+  // already being created by the lambda execution. This can sometime result in
+  // an error "OperationAbortedException: A conflicting operation is currently
+  // in progress...Please try again."
+  // To avoid an error, we do as requested and try again.
+
+  return async (block: () => Promise<void>) => {
+    let attempts = 0;
+    do {
+      try {
+        return await block();
+      } catch (error: any) {
+        if (
+          error instanceof Logs.OperationAbortedException
+          || error.name === 'OperationAbortedException'
+          || error.name === 'ThrottlingException' // There is no class to check with instanceof, see https://github.com/aws/aws-sdk-js-v3/issues/5140
+        ) {
+          if (attempts < maxRetries ) {
+            attempts++;
+            await new Promise(resolve => setTimeout(resolve, calculateDelay(attempts, delayBase, delayCap)));
+            continue;
+          } else {
+            // The log group is still being changed by another execution but we are out of retries
+            throw new Error('Out of attempts to change log group');
+          }
+        }
+        throw error;
+      }
+    } while (true); // exit happens on retry count check
+  };
+}
+
+function calculateDelay(attempt: number, base: number, cap: number): number {
+  return Math.round(Math.random() * Math.min(cap, base * 2 ** attempt));
+}
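For reference, `calculateDelay` above implements capped exponential backoff with full jitter: the sleep before attempt n is drawn uniformly from [0, min(cap, base * 2^n)]. With the defaults (base 100 ms, cap 10 s) the upper bounds grow as follows (an illustration only, not part of the commit):

const base = 100;      // delayBase default, in ms
const cap = 10 * 1000; // delayCap default, 10s
for (let attempt = 1; attempt <= 8; attempt++) {
  // attempt 1 -> 200 ms, 2 -> 400 ms, 3 -> 800 ms, ... attempts 7+ are capped at 10000 ms
  console.log(`attempt ${attempt}: sleep up to ${Math.min(cap, base * 2 ** attempt)} ms`);
}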
@@ -0,0 +1,32 @@
+{
+  "version": "33.0.0",
+  "files": {
+    "a8515c042d9c942705087943220417be929ac44f968d8fcef2681681b400c0c0": {
+      "source": {
+        "path": "asset.a8515c042d9c942705087943220417be929ac44f968d8fcef2681681b400c0c0",
+        "packaging": "zip"
+      },
+      "destinations": {
+        "current_account-current_region": {
+          "bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}",
+          "objectKey": "a8515c042d9c942705087943220417be929ac44f968d8fcef2681681b400c0c0.zip",
+          "assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}"
+        }
+      }
+    },
+    "9990a29f03d0c5431a972aeb27fc605359cf0093ddd08bfbdf611189e8116726": {
+      "source": {
+        "path": "aws-cdk-log-retention-integ-retries0.template.json",
+        "packaging": "file"
+      },
+      "destinations": {
+        "current_account-current_region": {
+          "bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}",
+          "objectKey": "9990a29f03d0c5431a972aeb27fc605359cf0093ddd08bfbdf611189e8116726.json",
+          "assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}"
+        }
+      }
+    }
+  },
+  "dockerImages": {}
+}
