Skip to content

Commit c2602d0

Browse files
committed
feat: Add metric to track GitHub app rate limit
1 parent 6ed654f commit c2602d0

File tree

30 files changed

+347
-132
lines changed

30 files changed

+347
-132
lines changed

docs/configuration.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,15 @@ This feature has been disabled by default.
191191

192192
The watcher will act on all spot termination notifications and log the ones relevant to the runner module. Therefore we suggest deploying the watcher only once. You can either deploy the watcher by enabling it in one of your deployments or deploy the watcher as a stand-alone module.
193193

194+
## Metrics
195+
196+
The module supports metrics (experimental feature) to monitor the system. The metrics are disabled by default. To enable the metrics, set `metrics.enable = true`. Once set to true, all module-managed metrics are enabled; you can configure them one by one via the `metrics` object. The metrics are created in the namespace `GitHub Runners`.
197+
198+
### Supported metrics
199+
200+
- **GitHubAppRateLimitRemaining**: Remaining rate limit for the GitHub App.
201+
- **JobRetry**: Number of job retries, only relevant when job retry is enabled.
202+
- **SpotInterruptionWarning**: Number of spot interruption warnings received by the termination watcher, only relevant when the termination watcher is enabled.
194203

195204
## Debugging
196205

examples/default/main.tf

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -114,20 +114,24 @@ module "runners" {
114114

115115
instance_termination_watcher = {
116116
enable = true
117-
enable_metric = {
118-
spot_warning = true
119-
}
120117
}
121118

122-
# enable job_retry feature. Be careful with this feature, it can lead to API rate limits.
123-
# job_retry = {
124-
# enable = true
125-
# max_attempts = 1
126-
# delay_in_seconds = 180
119+
# enable metric creation (experimental)
120+
# metrics = {
121+
# enable = true
122+
# metric = {
123+
# enable_spot_termination_warning = true
124+
# enable_job_retry = false
125+
# enable_github_app_rate_limit = true
126+
# }
127127
# }
128128

129-
# enable metric creation by the control plane (experimental)
130-
# enable_metrics_control_plane = true
129+
# enable job_retry feature. Be careful with this feature, it can lead to API rate limits.
130+
job_retry = {
131+
enable = true
132+
max_attempts = 1
133+
delay_in_seconds = 180
134+
}
131135

132136
# enable CMK instead of aws managed key for encryptions
133137
# kms_key_arn = aws_kms_key.github.arn

examples/multi-runner/main.tf

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,15 @@ module "runners" {
103103
# Enable to track the spot instance termination warning
104104
# instance_termination_watcher = {
105105
# enable = true
106-
# enable_metric = {
107-
# spot_warning = true
106+
# }
107+
108+
# Enable metrics
109+
# metrics = {
110+
# enable = true
111+
# metric = {
112+
# enable_github_app_rate_limit = true
113+
# enable_job_retry = false
114+
# enable_spot_termination_warning = true
108115
# }
109116
# }
110117
}

lambdas/functions/control-plane/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
"ts-node-dev": "^2.0.0"
3939
},
4040
"dependencies": {
41+
"@aws-lambda-powertools/parameters": "^2.7.0",
4142
"@aws-sdk/client-ec2": "^3.637.0",
4243
"@aws-sdk/client-sqs": "^3.637.0",
4344
"@aws-sdk/types": "^3.609.0",
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import { ResponseHeaders } from '@octokit/types';
2+
import { createSingleMetric } from '@terraform-aws-github-runner/aws-powertools-util';
3+
import { MetricUnit } from '@aws-lambda-powertools/metrics';
4+
import { metricGitHubAppRateLimit } from './rate-limit';
5+
6+
process.env.PARAMETER_GITHUB_APP_ID_NAME = 'test';
7+
jest.mock('@terraform-aws-github-runner/aws-ssm-util', () => ({
8+
...jest.requireActual('@terraform-aws-github-runner/aws-ssm-util'),
9+
// return '1234' when the requested parameter name equals process.env.PARAMETER_GITHUB_APP_ID_NAME, otherwise ''
10+
getParameter: jest.fn((name: string) => {
11+
if (name === process.env.PARAMETER_GITHUB_APP_ID_NAME) {
12+
return '1234';
13+
} else {
14+
return '';
15+
}
16+
}),
17+
}));
18+
19+
jest.mock('@terraform-aws-github-runner/aws-powertools-util', () => ({
20+
...jest.requireActual('@terraform-aws-github-runner/aws-powertools-util'),
21+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
22+
createSingleMetric: jest.fn((name: string, unit: string, value: number, dimensions?: Record<string, string>) => {
23+
return {
24+
addMetadata: jest.fn(),
25+
};
26+
}),
27+
}));
28+
29+
describe('metricGitHubAppRateLimit', () => {
30+
beforeEach(() => {
31+
jest.clearAllMocks();
32+
});
33+
34+
it('should update rate limit metric', async () => {
35+
// set process.env.ENABLE_METRIC_GITHUB_APP_RATE_LIMIT to true
36+
process.env.ENABLE_METRIC_GITHUB_APP_RATE_LIMIT = 'true';
37+
const headers: ResponseHeaders = {
38+
'x-ratelimit-remaining': '10',
39+
'x-ratelimit-limit': '60',
40+
};
41+
42+
await metricGitHubAppRateLimit(headers);
43+
44+
expect(createSingleMetric).toHaveBeenCalledWith('GitHubAppRateLimitRemaining', MetricUnit.Count, 10, {
45+
AppId: '1234',
46+
});
47+
});
48+
49+
it('should not update rate limit metric', async () => {
50+
// set process.env.ENABLE_METRIC_GITHUB_APP_RATE_LIMIT to false
51+
process.env.ENABLE_METRIC_GITHUB_APP_RATE_LIMIT = 'false';
52+
const headers: ResponseHeaders = {
53+
'x-ratelimit-remaining': '10',
54+
'x-ratelimit-limit': '60',
55+
};
56+
57+
await metricGitHubAppRateLimit(headers);
58+
59+
expect(createSingleMetric).not.toHaveBeenCalled();
60+
});
61+
62+
it('should not update rate limit metric if headers are undefined', async () => {
63+
// set process.env.ENABLE_METRIC_GITHUB_APP_RATE_LIMIT to true
64+
process.env.ENABLE_METRIC_GITHUB_APP_RATE_LIMIT = 'true';
65+
66+
await metricGitHubAppRateLimit(undefined as unknown as ResponseHeaders);
67+
68+
expect(createSingleMetric).not.toHaveBeenCalled();
69+
});
70+
});
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import { ResponseHeaders } from '@octokit/types';
2+
import { createSingleMetric, logger } from '@terraform-aws-github-runner/aws-powertools-util';
3+
import { MetricUnit } from '@aws-lambda-powertools/metrics';
4+
import yn from 'yn';
5+
import { getParameter } from '@terraform-aws-github-runner/aws-ssm-util';
6+
7+
export async function metricGitHubAppRateLimit(headers: ResponseHeaders): Promise<void> {
8+
try {
9+
const remaining = parseInt(headers['x-ratelimit-remaining'] as string);
10+
const limit = parseInt(headers['x-ratelimit-limit'] as string);
11+
12+
logger.debug(`Rate limit remaining: ${remaining}, limit: ${limit}`);
13+
14+
const updateMetric = yn(process.env.ENABLE_METRIC_GITHUB_APP_RATE_LIMIT);
15+
if (updateMetric) {
16+
const appId = await getParameter(process.env.PARAMETER_GITHUB_APP_ID_NAME);
17+
const metric = createSingleMetric('GitHubAppRateLimitRemaining', MetricUnit.Count, remaining, {
18+
AppId: appId,
19+
});
20+
metric.addMetadata('AppId', appId);
21+
}
22+
} catch (e) {
23+
logger.debug(`Error updating rate limit metric`, { error: e });
24+
}
25+
}

lambdas/functions/control-plane/src/modules.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
declare namespace NodeJS {
22
export interface ProcessEnv {
33
AWS_REGION: string;
4+
ENABLE_METRIC_GITHUB_APP_RATE_LIMIT: string;
45
ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS: string;
56
ENVIRONMENT: string;
67
GHES_URL: string;

lambdas/functions/control-plane/src/scale-runners/job-retry.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ describe(`Test job retry check`, () => {
179179
process.env.ENABLE_ORGANIZATION_RUNNERS = 'true';
180180
process.env.ENVIRONMENT = 'test';
181181
process.env.RUNNER_NAME_PREFIX = 'test';
182-
process.env.ENABLE_METRICS = 'true';
182+
process.env.ENABLE_METRIC_JOB_RETRY = 'true';
183183
process.env.JOB_QUEUE_SCALE_UP_URL =
184184
'https://sqs.eu-west-1.amazonaws.com/123456789/webhook_events_workflow_job_queue';
185185

lambdas/functions/control-plane/src/scale-runners/job-retry.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ export async function checkAndRetryJob(payload: ActionRequestMessageRetry): Prom
4646
const runnerOwner = enableOrgLevel ? payload.repositoryOwner : `${payload.repositoryOwner}/${payload.repositoryName}`;
4747
const runnerNamePrefix = process.env.RUNNER_NAME_PREFIX ?? '';
4848
const jobQueueUrl = process.env.JOB_QUEUE_SCALE_UP_URL ?? '';
49-
const enableMetrics = yn(process.env.ENABLE_METRICS, { default: false });
49+
const enableMetrics = yn(process.env.ENABLE_METRIC_JOB_RETRY, { default: false });
5050
const environment = process.env.ENVIRONMENT;
5151

5252
addPersistentContextToChildLogger({

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws
77
import { RunnerInfo, RunnerList } from './../aws/runners.d';
88
import { GhRunners, githubCache } from './cache';
99
import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config';
10+
import { metricGitHubAppRateLimit } from '../gh-auth/rate-limit';
1011

1112
const logger = createChildLogger('scale-down');
1213

@@ -63,6 +64,8 @@ async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo,
6364

6465
logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.data.busy}`);
6566

67+
metricGitHubAppRateLimit(state.headers);
68+
6669
return state.data.busy;
6770
}
6871

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { GetParameterCommand, PutParameterCommand, SSMClient } from '@aws-sdk/client-ssm';
1+
import { PutParameterCommand, SSMClient } from '@aws-sdk/client-ssm';
22
import { Octokit } from '@octokit/rest';
33
import { mockClient } from 'aws-sdk-client-mock';
44
import 'aws-sdk-client-mock-jest';
@@ -11,6 +11,7 @@ import { createRunner, listEC2Runners } from './../aws/runners';
1111
import { RunnerInputParameters } from './../aws/runners.d';
1212
import ScaleError from './ScaleError';
1313
import * as scaleUpModule from './scale-up';
14+
import { getParameter } from '@terraform-aws-github-runner/aws-ssm-util';
1415

1516
const mockOctokit = {
1617
paginate: jest.fn(),
@@ -30,13 +31,20 @@ const mockOctokit = {
3031
const mockCreateRunner = mocked(createRunner);
3132
const mockListRunners = mocked(listEC2Runners);
3233
const mockSSMClient = mockClient(SSMClient);
34+
const mockSSM = mocked(getParameter);
3335

3436
jest.mock('@octokit/rest', () => ({
3537
Octokit: jest.fn().mockImplementation(() => mockOctokit),
3638
}));
3739

3840
jest.mock('./../aws/runners');
3941
jest.mock('./../gh-auth/gh-auth');
42+
43+
jest.mock('@terraform-aws-github-runner/aws-ssm-util', () => ({
44+
...jest.requireActual('@terraform-aws-github-runner/aws-ssm-util'),
45+
getParameter: jest.fn(),
46+
}));
47+
4048
export type RunnerType = 'ephemeral' | 'non-ephemeral';
4149

4250
// for ephemeral and non-ephemeral runners
@@ -172,6 +180,10 @@ beforeEach(() => {
172180
});
173181

174182
mockCreateClient.mockResolvedValue(new mocktokit());
183+
184+
mockSSM.mockImplementation(async () => {
185+
return '1';
186+
});
175187
});
176188

177189
describe('scaleUp with GHES', () => {
@@ -213,7 +225,6 @@ describe('scaleUp with GHES', () => {
213225

214226
expectedRunnerParams = { ...EXPECTED_RUNNER_PARAMS };
215227
mockSSMClient.reset();
216-
mockSSMClient.on(GetParameterCommand).resolves({ Parameter: { Value: '1' } });
217228
});
218229

219230
it('gets the current org level runners', async () => {
@@ -270,7 +281,9 @@ describe('scaleUp with GHES', () => {
270281

271282
it('Throws an error if runner group doesnt exist for ephemeral runners', async () => {
272283
process.env.RUNNER_GROUP_NAME = 'test-runner-group';
273-
mockSSMClient.on(GetParameterCommand).rejects();
284+
mockSSM.mockImplementation(async () => {
285+
throw new Error('ParameterNotFound');
286+
});
274287
await expect(scaleUpModule.scaleUp('aws:sqs', TEST_DATA)).rejects.toBeInstanceOf(Error);
275288
expect(mockOctokit.paginate).toHaveBeenCalledTimes(1);
276289
});
@@ -284,7 +297,9 @@ describe('scaleUp with GHES', () => {
284297
});
285298

286299
it('create SSM parameter for runner group id if it doesnt exist', async () => {
287-
mockSSMClient.on(GetParameterCommand).rejects();
300+
mockSSM.mockImplementation(async () => {
301+
throw new Error('ParameterNotFound');
302+
});
288303
await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
289304
expect(mockOctokit.paginate).toHaveBeenCalledTimes(1);
290305
expect(mockSSMClient).toHaveReceivedCommandTimes(PutParameterCommand, 2);
@@ -295,16 +310,14 @@ describe('scaleUp with GHES', () => {
295310
});
296311
});
297312

298-
it('Doesnt create SSM parameter for runner group id if it exists', async () => {
299-
mockSSMClient.on(GetParameterCommand).resolves({ Parameter: { Value: '1' } });
313+
it('Does not create SSM parameter for runner group id if it exists', async () => {
300314
await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
301315
expect(mockOctokit.paginate).toHaveBeenCalledTimes(0);
302316
expect(mockSSMClient).toHaveReceivedCommandTimes(PutParameterCommand, 1);
303317
});
304318

305319
it('create start runner config for ephemeral runners ', async () => {
306320
process.env.RUNNERS_MAXIMUM_COUNT = '2';
307-
mockSSMClient.on(GetParameterCommand).resolves({ Parameter: { Value: '1' } });
308321
await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
309322
expect(mockOctokit.actions.generateRunnerJitconfigForOrg).toBeCalledWith({
310323
org: TEST_DATA.repositoryOwner,
@@ -356,7 +369,6 @@ describe('scaleUp with GHES', () => {
356369
mockListRunners.mockImplementation(async () => {
357370
return [];
358371
});
359-
mockSSMClient.on(GetParameterCommand).resolves({ Parameter: { Value: '1' } });
360372
const startTime = performance.now();
361373
const instances = [
362374
'i-1234',

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { createRunner, listEC2Runners } from './../aws/runners';
88
import { RunnerInputParameters } from './../aws/runners.d';
99
import ScaleError from './ScaleError';
1010
import { publishRetryMessage } from './job-retry';
11+
import { metricGitHubAppRateLimit } from '../gh-auth/rate-limit';
1112

1213
const logger = createChildLogger('scale-up');
1314

@@ -94,6 +95,9 @@ async function getGithubRunnerRegistrationToken(githubRunnerConfig: CreateGitHub
9495
owner: githubRunnerConfig.runnerOwner.split('/')[0],
9596
repo: githubRunnerConfig.runnerOwner.split('/')[1],
9697
});
98+
99+
const appId = parseInt(await getParameter(process.env.PARAMETER_GITHUB_APP_ID_NAME));
100+
logger.info('App id from SSM', { appId: appId });
97101
return registrationToken.data.token;
98102
}
99103

@@ -142,6 +146,7 @@ export async function isJobQueued(githubInstallationClient: Octokit, payload: Ac
142146
owner: payload.repositoryOwner,
143147
repo: payload.repositoryName,
144148
});
149+
metricGitHubAppRateLimit(jobForWorkflowRun.headers);
145150
isQueued = jobForWorkflowRun.data.status === 'queued';
146151
} else {
147152
throw Error(`Event ${payload.eventType} is not supported`);
@@ -169,7 +174,7 @@ async function getRunnerGroupId(githubRunnerConfig: CreateGitHubRunnerConfig, gh
169174
}
170175
if (runnerGroup === undefined) {
171176
// get runner group id from GitHub
172-
runnerGroupId = await GetRunnerGroupByName(ghClient, githubRunnerConfig);
177+
runnerGroupId = await getRunnerGroupByName(ghClient, githubRunnerConfig);
173178
// store runner group id in SSM
174179
try {
175180
await putParameter(
@@ -188,7 +193,7 @@ async function getRunnerGroupId(githubRunnerConfig: CreateGitHubRunnerConfig, gh
188193
return runnerGroupId;
189194
}
190195

191-
async function GetRunnerGroupByName(ghClient: Octokit, githubRunnerConfig: CreateGitHubRunnerConfig): Promise<number> {
196+
async function getRunnerGroupByName(ghClient: Octokit, githubRunnerConfig: CreateGitHubRunnerConfig): Promise<number> {
192197
const runnerGroups: RunnerGroup[] = await ghClient.paginate(`GET /orgs/{org}/actions/runner-groups`, {
193198
org: githubRunnerConfig.runnerOwner,
194199
per_page: 100,
@@ -432,6 +437,8 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins
432437
labels: ephemeralRunnerConfig.runnerLabels,
433438
});
434439

440+
metricGitHubAppRateLimit(runnerConfig.headers);
441+
435442
// store jit config in ssm parameter store
436443
logger.debug('Runner JIT config for ephemeral runner generated.', {
437444
instance: instance,

lambdas/libs/aws-ssm-util/src/index.test.ts

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,6 @@ describe('Test getParameter and putParameter', () => {
136136
mockSSMClient.on(GetParameterCommand).resolves(output);
137137

138138
// Act
139-
const result = await getParameter(parameterName);
140-
141-
// Assert
142-
expect(result).toBe(undefined);
139+
await expect(getParameter(parameterName)).rejects.toThrow(`Parameter ${parameterName} not found`);
143140
});
144141
});

0 commit comments

Comments
 (0)