Skip to content

Commit 2667fbf

Browse files
committed
feat: Add spot termination handler
1 parent 88665e2 commit 2667fbf

File tree

13 files changed

+253
-212
lines changed

13 files changed

+253
-212
lines changed

docs/configuration.md

+18-4
Original file line numberDiff line numberDiff line change
@@ -215,21 +215,35 @@ In case the setup does not work as intended, trace the events through this seque
215215

216216
### Termination watcher
217217

218-
This feature is in early stage and therefore disabled by default.
218+
This feature is in an early stage and is therefore disabled by default. To enable the watcher, set `instance_termination_watcher.enable = true`.
219219

220-
The termination watcher is currently watching for spot termination notifications. The module is only taken events into account for instances tagged with `ghr:environment` by default when deployment the module as part of one of the main modules (root or multi-runner). The module can also be deployed stand-alone, in that case the tag filter needs to be tunned.
220+
The termination watcher currently watches for spot terminations. By default, the module only takes events into account for instances tagged with `ghr:environment` when the module is deployed as part of one of the main modules (root or multi-runner). The module can also be deployed stand-alone; in that case the tag filter needs to be tuned.
221+
222+
### Termination notification
223+
224+
The watcher listens for spot termination warnings and creates a log message and, optionally, a metric. The watcher is disabled by default. The feature is enabled once the watcher is enabled; it can be disabled explicitly by setting `instance_termination_watcher.features.enable_spot_termination_handler = false`.
221225

222226
- Logs: The module will log all termination notifications. For each warning it will look up the instance details and log the environment, the instance type, and the time the instance has been running, as well as some other details.
223227
- Metrics: Metrics are disabled by default to avoid costs. Once enabled, a metric will be created for each warning with at least dimensions for the environment and instance type. The metric namespace can be configured via the variables. The metric name used is `SpotInterruptionWarning`.
224228

225-
#### Log example
229+
### Termination handler
230+
231+
!!! warning
232+
This feature will only work once CloudTrail is enabled.
233+
234+
The termination handler listens for spot terminations by capturing the `BidEvictedEvent` via CloudTrail. The handler will log and optionally create a metric for each termination. The intent is to enhance the logic to inform the user about the termination via the GitHub Job or Workflow run. The feature is disabled by default. The feature is enabled once the watcher is enabled; it can be disabled explicitly by setting `instance_termination_watcher.features.enable_spot_termination_handler = false`.
235+
236+
- Logs: The module will log all termination notifications. For each termination it will look up the instance details and log the environment, the instance type, and the time the instance has been running, as well as some other details.
237+
- Metrics: Metrics are disabled by default to avoid costs. Once enabled, a metric will be created for each termination with at least dimensions for the environment and instance type. The metric namespace can be configured via the variables. The metric name used is `SpotTermination`.
238+
239+
### Log example (both warnings and terminations)
226240

227241
Below is an example of the log messages created.
228242

229243
```
230244
{
231245
"level": "INFO",
232-
"message": "Received spot notification warning:",
246+
"message": "Received spot notification for ${metricName}",
233247
"environment": "default",
234248
"instanceId": "i-0039b8826b3dcea55",
235249
"instanceType": "c5.large",

examples/default/main.tf

+10-10
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ module "runners" {
7878
# Let the module manage the service linked role
7979
# create_service_linked_role_spot = true
8080

81-
instance_types = ["m5.large", "c5.large"]
81+
instance_types = ["m7a.large", "m5.large"]
8282

8383
# override delay of events in seconds
8484
delay_webhook_event = 5
@@ -98,7 +98,7 @@ module "runners" {
9898
runner_name_prefix = "${local.environment}_"
9999

100100
# Enable debug logging for the lambda functions
101-
log_level = "info"
101+
log_level = "debug"
102102

103103
enable_ami_housekeeper = true
104104
ami_housekeeper_cleanup_config = {
@@ -117,14 +117,14 @@ module "runners" {
117117
}
118118

119119
# enable metric creation (experimental)
120-
# metrics = {
121-
# enable = true
122-
# metric = {
123-
# enable_spot_termination_warning = true
124-
# enable_job_retry = false
125-
# enable_github_app_rate_limit = true
126-
# }
127-
# }
120+
metrics = {
121+
enable = true
122+
metric = {
123+
enable_spot_termination_warning = true
124+
enable_job_retry = false
125+
enable_github_app_rate_limit = false
126+
}
127+
}
128128

129129
# enable job_retry feature. Be careful with this feature, it can lead to you hitting API rate limits.
130130
# job_retry = {

lambdas/functions/termination-watcher/src/ConfigResolver.ts

+2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
22

33
export class Config {
44
createSpotWarningMetric: boolean;
5+
createSpotTerminationMetric: boolean;
56
tagFilters: Record<string, string>;
67
prefix: string;
78

@@ -11,6 +12,7 @@ export class Config {
1112
logger.debug('Loading config from environment variables', { env: process.env });
1213

1314
this.createSpotWarningMetric = process.env.ENABLE_METRICS_SPOT_WARNING === 'true';
15+
this.createSpotTerminationMetric = process.env.ENABLE_METRICS_SPOT_TERMINATION === 'true';
1416
this.prefix = process.env.PREFIX ?? '';
1517
this.tagFilters = { 'ghr:environment': this.prefix };
1618

lambdas/functions/termination-watcher/src/lambda.test.ts

+73-5
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,16 @@ import { Context } from 'aws-lambda';
33
import { mocked } from 'jest-mock';
44

55
import { handle as interruptionWarningHandlerImpl } from './termination-warning';
6-
import { interruptionWarning } from './lambda';
7-
import { SpotInterruptionWarning, SpotTerminationDetail } from './types';
6+
import { handle as terminationHandlerImpl } from './termination';
7+
import { interruptionWarning, termination } from './lambda';
8+
import { BidEvictedDetail, BidEvictedEvent, SpotInterruptionWarning, SpotTerminationDetail } from './types';
89

910
jest.mock('./termination-warning');
11+
jest.mock('./termination');
1012

1113
process.env.POWERTOOLS_METRICS_NAMESPACE = 'test';
1214
process.env.POWERTOOLS_TRACE_ENABLED = 'true';
13-
const event: SpotInterruptionWarning<SpotTerminationDetail> = {
15+
const spotInstanceInterruptionEvent: SpotInterruptionWarning<SpotTerminationDetail> = {
1416
version: '0',
1517
id: '1',
1618
'detail-type': 'EC2 Spot Instance Interruption Warning',
@@ -25,6 +27,42 @@ const event: SpotInterruptionWarning<SpotTerminationDetail> = {
2527
},
2628
};
2729

30+
const bidEvictedEvent: BidEvictedEvent<BidEvictedDetail> = {
31+
version: '0',
32+
id: '186d7999-3121-e749-23f3-c7caec1084e1',
33+
'detail-type': 'AWS Service Event via CloudTrail',
34+
source: 'aws.ec2',
35+
account: '123456789012',
36+
time: '2024-10-09T11:48:46Z',
37+
region: 'eu-west-1',
38+
resources: [],
39+
detail: {
40+
eventVersion: '1.10',
41+
userIdentity: {
42+
accountId: '123456789012',
43+
invokedBy: 'sec2.amazonaws.com',
44+
},
45+
eventTime: '2024-10-09T11:48:46Z',
46+
eventSource: 'ec2.amazonaws.com',
47+
eventName: 'BidEvictedEvent',
48+
awsRegion: 'eu-west-1',
49+
sourceIPAddress: 'ec2.amazonaws.com',
50+
userAgent: 'ec2.amazonaws.com',
51+
requestParameters: null,
52+
responseElements: null,
53+
requestID: 'ebf032e3-5009-3484-aae8-b4946ab2e2eb',
54+
eventID: '3a15843b-96c2-41b1-aac1-7d62dc754547',
55+
readOnly: false,
56+
eventType: 'AwsServiceEvent',
57+
managementEvent: true,
58+
recipientAccountId: '123456789012',
59+
serviceEventDetails: {
60+
instanceIdSet: ['i-12345678901234567'],
61+
},
62+
eventCategory: 'Management',
63+
},
64+
};
65+
2866
const context: Context = {
2967
awsRequestId: '1',
3068
callbackWaitsForEmptyEventLoop: false,
@@ -48,22 +86,52 @@ const context: Context = {
4886

4987
// Docs for testing async with jest: https://jestjs.io/docs/tutorial-async
5088
describe('Handle sport termination interruption warning', () => {
89+
beforeEach(() => {
90+
jest.clearAllMocks();
91+
});
92+
5193
it('should not throw or log in error.', async () => {
5294
const mock = mocked(interruptionWarningHandlerImpl);
5395
mock.mockImplementation(() => {
5496
return new Promise((resolve) => {
5597
resolve();
5698
});
5799
});
58-
await expect(interruptionWarning(event, context)).resolves.not.toThrow();
100+
await expect(interruptionWarning(spotInstanceInterruptionEvent, context)).resolves.not.toThrow();
59101
});
60102

61103
it('should not throw only log in error in case of an exception.', async () => {
62104
const logSpy = jest.spyOn(logger, 'error');
63105
const error = new Error('An error.');
64106
const mock = mocked(interruptionWarningHandlerImpl);
65107
mock.mockRejectedValue(error);
66-
await expect(interruptionWarning(event, context)).resolves.toBeUndefined();
108+
await expect(interruptionWarning(spotInstanceInterruptionEvent, context)).resolves.toBeUndefined();
109+
110+
expect(logSpy).toHaveBeenCalledTimes(1);
111+
});
112+
});
113+
114+
describe('Handle sport termination (BidEvictEvent', () => {
115+
beforeEach(() => {
116+
jest.clearAllMocks();
117+
});
118+
119+
it('should not throw or log in error.', async () => {
120+
const mock = mocked(terminationHandlerImpl);
121+
mock.mockImplementation(() => {
122+
return new Promise((resolve) => {
123+
resolve();
124+
});
125+
});
126+
await expect(termination(bidEvictedEvent, context)).resolves.not.toThrow();
127+
});
128+
129+
it('should not throw only log in error in case of an exception.', async () => {
130+
const logSpy = jest.spyOn(logger, 'error');
131+
const error = new Error('An error.');
132+
const mock = mocked(terminationHandlerImpl);
133+
mock.mockRejectedValue(error);
134+
await expect(termination(bidEvictedEvent, context)).resolves.toBeUndefined();
67135

68136
expect(logSpy).toHaveBeenCalledTimes(1);
69137
});

lambdas/functions/termination-watcher/src/lambda.ts

+14-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ import { logMetrics } from '@aws-lambda-powertools/metrics/middleware';
44
import { Context } from 'aws-lambda';
55

66
import { handle as handleTerminationWarning } from './termination-warning';
7-
import { SpotInterruptionWarning, SpotTerminationDetail } from './types';
7+
import { handle as handleTermination } from './termination';
8+
import { BidEvictedDetail, BidEvictedEvent, SpotInterruptionWarning, SpotTerminationDetail } from './types';
89
import { Config } from './ConfigResolver';
910

1011
const config = new Config();
@@ -24,6 +25,18 @@ export async function interruptionWarning(
2425
}
2526
}
2627

28+
export async function termination(event: BidEvictedEvent<BidEvictedDetail>, context: Context): Promise<void> {
29+
setContext(context, 'lambda.ts');
30+
logger.logEventIfEnabled(event);
31+
logger.debug('Configuration of the lambda', { config });
32+
33+
try {
34+
await handleTermination(event, config);
35+
} catch (e) {
36+
logger.error(`${(e as Error).message}`, { error: e as Error });
37+
}
38+
}
39+
2740
const addMiddleware = () => {
2841
const middleware = middy(interruptionWarning);
2942

0 commit comments

Comments
 (0)