Skip to content
This repository was archived by the owner on Jan 16, 2025. It is now read-only.

Commit 9f841f7

Browse files
npalmM1kepmckern
authored
fix: Honnor booting instance in runner pool (#2801)
Since the pool was not respecting booting instances it could happen you end up with a too large pool. Applying this change could mean that you need a larger pool than before. Co-authored-by: Michael Poutre <[email protected]> Co-authored-by: Ryan McKern <[email protected]>
1 parent fca2ddf commit 9f841f7

File tree

5 files changed

+161
-57
lines changed

5 files changed

+161
-57
lines changed

Diff for: modules/runners/lambdas/runners/src/pool/pool.test.ts

+112-44
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import { Octokit } from '@octokit/rest';
22
import { mocked } from 'jest-mock';
3+
import moment from 'moment-timezone';
34
import nock from 'nock';
45

56
import { listEC2Runners } from '../aws/runners';
67
import * as ghAuth from '../gh-auth/gh-auth';
7-
import * as scale from '../scale-runners/scale-up';
8+
import { createRunners } from '../scale-runners/scale-up';
89
import { adjust } from './pool';
910

1011
const mockOctokit = {
@@ -24,6 +25,7 @@ jest.mock('@octokit/rest', () => ({
2425

2526
jest.mock('./../aws/runners');
2627
jest.mock('./../gh-auth/gh-auth');
28+
jest.mock('./../scale-runners/scale-up');
2729

2830
const mocktokit = Octokit as jest.MockedClass<typeof Octokit>;
2931
const mockedAppAuth = mocked(ghAuth.createGithubAppAuth, {
@@ -36,6 +38,36 @@ const mockListRunners = mocked(listEC2Runners);
3638
const cleanEnv = process.env;
3739

3840
const ORG = 'my-org';
41+
const MINIMUM_TIME_RUNNING = 15;
42+
43+
const ec2InstancesRegistered = [
44+
{
45+
instanceId: 'i-1-idle',
46+
launchTime: new Date(),
47+
type: 'Org',
48+
owner: ORG,
49+
},
50+
{
51+
instanceId: 'i-2-busy',
52+
launchTime: new Date(),
53+
type: 'Org',
54+
owner: ORG,
55+
},
56+
{
57+
instanceId: 'i-3-offline',
58+
launchTime: new Date(),
59+
type: 'Org',
60+
owner: ORG,
61+
},
62+
{
63+
instanceId: 'i-4-idle-older-than-minimum-time-running',
64+
launchTime: moment(new Date())
65+
.subtract(MINIMUM_TIME_RUNNING + 3, 'minutes')
66+
.toDate(),
67+
type: 'Org',
68+
owner: ORG,
69+
},
70+
];
3971

4072
beforeEach(() => {
4173
nock.disableNetConnect();
@@ -55,6 +87,7 @@ beforeEach(() => {
5587
process.env.INSTANCE_TYPES = 'm5.large';
5688
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
5789
process.env.RUNNER_OWNER = ORG;
90+
process.env.RUNNER_BOOT_TIME_IN_MINUTES = MINIMUM_TIME_RUNNING.toString();
5891

5992
const mockTokenReturnValue = {
6093
data: {
@@ -66,66 +99,39 @@ beforeEach(() => {
6699
mockOctokit.paginate.mockImplementation(() => [
67100
{
68101
id: 1,
69-
name: 'i-1',
102+
name: 'i-1-idle',
70103
os: 'linux',
71104
status: 'online',
72105
busy: false,
73106
labels: [],
74107
},
75108
{
76109
id: 2,
77-
name: 'i-2',
110+
name: 'i-2-busy',
78111
os: 'linux',
79112
status: 'online',
80113
busy: true,
81114
labels: [],
82115
},
83116
{
84117
id: 3,
85-
name: 'i-3',
118+
name: 'i-3-offline',
86119
os: 'linux',
87120
status: 'offline',
88121
busy: false,
89122
labels: [],
90123
},
91124
{
92-
id: 11,
93-
name: 'j-1', // some runner of another env
125+
id: 3,
126+
name: 'i-4-idle-older-than-minimum-time-running',
94127
os: 'linux',
95128
status: 'online',
96129
busy: false,
97130
labels: [],
98131
},
99-
{
100-
id: 12,
101-
name: 'j-2', // some runner of another env
102-
os: 'linux',
103-
status: 'online',
104-
busy: true,
105-
labels: [],
106-
},
107132
]);
108133

109-
mockListRunners.mockImplementation(async () => [
110-
{
111-
instanceId: 'i-1',
112-
launchTime: new Date(),
113-
type: 'Org',
114-
owner: ORG,
115-
},
116-
{
117-
instanceId: 'i-2',
118-
launchTime: new Date(),
119-
type: 'Org',
120-
owner: ORG,
121-
},
122-
{
123-
instanceId: 'i-3',
124-
launchTime: new Date(),
125-
type: 'Org',
126-
owner: ORG,
127-
},
128-
]);
134+
mockListRunners.mockImplementation(async () => ec2InstancesRegistered);
129135

130136
const mockInstallationIdReturnValueOrgs = {
131137
data: {
@@ -156,16 +162,74 @@ beforeEach(() => {
156162

157163
describe('Test simple pool.', () => {
158164
describe('With GitHub Cloud', () => {
159-
it('Top up pool with pool size 2.', async () => {
160-
const spy = jest.spyOn(scale, 'createRunners');
161-
await expect(adjust({ poolSize: 2 })).resolves;
162-
expect(spy).toBeCalled;
165+
it('Top up pool with pool size 2 registered.', async () => {
166+
await expect(await adjust({ poolSize: 3 })).resolves;
167+
expect(createRunners).toHaveBeenCalledTimes(1);
168+
expect(createRunners).toHaveBeenCalledWith(
169+
expect.anything(),
170+
expect.objectContaining({ numberOfRunners: 1 }),
171+
expect.anything(),
172+
);
163173
});
164174

165175
it('Should not top up if pool size is reached.', async () => {
166-
const spy = jest.spyOn(scale, 'createRunners');
167-
await expect(adjust({ poolSize: 1 })).resolves;
168-
expect(spy).not.toHaveBeenCalled;
176+
await expect(await adjust({ poolSize: 1 })).resolves;
177+
expect(createRunners).not.toHaveBeenCalled();
178+
});
179+
180+
it('Should top up if pool size is not reached including a booting instance.', async () => {
181+
mockListRunners.mockImplementation(async () => [
182+
...ec2InstancesRegistered,
183+
{
184+
instanceId: 'i-4-still-booting',
185+
launchTime: moment(new Date())
186+
.subtract(MINIMUM_TIME_RUNNING - 3, 'minutes')
187+
.toDate(),
188+
type: 'Org',
189+
owner: ORG,
190+
},
191+
{
192+
instanceId: 'i-5-orphan',
193+
launchTime: moment(new Date())
194+
.subtract(MINIMUM_TIME_RUNNING + 3, 'minutes')
195+
.toDate(),
196+
type: 'Org',
197+
owner: ORG,
198+
},
199+
]);
200+
201+
// 2 idle + 1 booting = 3, top up with 2 to match a pool of 5
202+
await expect(await adjust({ poolSize: 5 })).resolves;
203+
expect(createRunners).toHaveBeenCalledWith(
204+
expect.anything(),
205+
expect.objectContaining({ numberOfRunners: 2 }),
206+
expect.anything(),
207+
);
208+
});
209+
210+
it('Should not top up if pool size is reached including a booting instance.', async () => {
211+
mockListRunners.mockImplementation(async () => [
212+
...ec2InstancesRegistered,
213+
{
214+
instanceId: 'i-4-still-booting',
215+
launchTime: moment(new Date())
216+
.subtract(MINIMUM_TIME_RUNNING - 3, 'minutes')
217+
.toDate(),
218+
type: 'Org',
219+
owner: ORG,
220+
},
221+
{
222+
instanceId: 'i-5-orphan',
223+
launchTime: moment(new Date())
224+
.subtract(MINIMUM_TIME_RUNNING + 3, 'minutes')
225+
.toDate(),
226+
type: 'Org',
227+
owner: ORG,
228+
},
229+
]);
230+
231+
await expect(await adjust({ poolSize: 2 })).resolves;
232+
expect(createRunners).not.toHaveBeenCalled();
169233
});
170234
});
171235

@@ -175,9 +239,13 @@ describe('Test simple pool.', () => {
175239
});
176240

177241
it('Top up if the pool size is set to 5', async () => {
178-
const spy = jest.spyOn(scale, 'createRunners');
179-
await expect(adjust({ poolSize: 5 })).resolves;
180-
expect(spy).toBeCalled;
242+
await expect(await adjust({ poolSize: 5 })).resolves;
243+
// 2 idle, top up with 3 to match a pool of 5
244+
expect(createRunners).toHaveBeenCalledWith(
245+
expect.anything(),
246+
expect.objectContaining({ numberOfRunners: 3 }),
247+
expect.anything(),
248+
);
181249
});
182250
});
183251
});

Diff for: modules/runners/lambdas/runners/src/pool/pool.ts

+46-13
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
import moment from 'moment';
12
import yn from 'yn';
23

3-
import { listEC2Runners } from '../aws/runners';
4+
import { RunnerList, listEC2Runners } from '../aws/runners';
45
import { createGithubAppAuth, createGithubInstallationAuth, createOctoClient } from '../gh-auth/gh-auth';
56
import { logger as rootLogger } from '../logger';
67
import { createRunners } from '../scale-runners/scale-up';
@@ -11,6 +12,11 @@ export interface PoolEvent {
1112
poolSize: number;
1213
}
1314

15+
interface RunnerStatus {
16+
busy: boolean;
17+
status: string;
18+
}
19+
1420
export async function adjust(event: PoolEvent): Promise<void> {
1521
logger.info(`Checking current pool size against pool of size: ${event.poolSize}`);
1622
const runnerExtraLabels = process.env.RUNNER_EXTRA_LABELS;
@@ -45,20 +51,41 @@ export async function adjust(event: PoolEvent): Promise<void> {
4551
per_page: 100,
4652
},
4753
);
48-
const idleRunners = runners.filter((r) => !r.busy && r.status === 'online').map((r) => r.name);
54+
const runnerStatus = new Map<string, RunnerStatus>();
55+
for (const runner of runners) {
56+
runnerStatus.set(runner.name, { busy: runner.busy, status: runner.status });
57+
}
4958

5059
// Look up the managed ec2 runners in AWS, but running does not mean idle
51-
const ec2runners = (
52-
await listEC2Runners({
53-
environment,
54-
runnerOwner,
55-
runnerType: 'Org',
56-
statuses: ['running'],
57-
})
58-
).map((r) => r.instanceId);
60+
const ec2runners = await listEC2Runners({
61+
environment,
62+
runnerOwner,
63+
runnerType: 'Org',
64+
statuses: ['running'],
65+
});
5966

60-
const managedIdleRunners = ec2runners.filter((r) => idleRunners.includes(r));
61-
const topUp = event.poolSize - managedIdleRunners.length;
67+
// Runner should be considered idle if it is still booting, or is idle in GitHub
68+
let numberOfRunnersInPool = 0;
69+
for (const ec2Instance of ec2runners) {
70+
if (
71+
runnerStatus.get(ec2Instance.instanceId)?.busy === false &&
72+
runnerStatus.get(ec2Instance.instanceId)?.status === 'online'
73+
) {
74+
numberOfRunnersInPool++;
75+
logger.debug(`Runner ${ec2Instance.instanceId} is idle in GitHub and counted as part of the pool`);
76+
} else if (runnerStatus.get(ec2Instance.instanceId) != null) {
77+
logger.debug(`Runner ${ec2Instance.instanceId} is not idle in GitHub and NOT counted as part of the pool`);
78+
} else if (!bootTimeExceeded(ec2Instance)) {
79+
numberOfRunnersInPool++;
80+
logger.info(`Runner ${ec2Instance.instanceId} is still booting and counted as part of the pool`);
81+
} else {
82+
logger.debug(
83+
`Runner ${ec2Instance.instanceId} is not idle in GitHub nor booting and not counted as part of the pool`,
84+
);
85+
}
86+
}
87+
88+
const topUp = event.poolSize - numberOfRunnersInPool;
6289
if (topUp > 0) {
6390
logger.info(`The pool will be topped up with ${topUp} runners.`);
6491
await createRunners(
@@ -87,7 +114,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
87114
githubInstallationClient,
88115
);
89116
} else {
90-
logger.info(`Pool will not be topped up. Find ${managedIdleRunners} managed idle runners.`);
117+
logger.info(`Pool will not be topped up. Found ${numberOfRunnersInPool} managed idle runners.`);
91118
}
92119
}
93120

@@ -101,3 +128,9 @@ async function getInstallationId(ghesApiUrl: string, org: string): Promise<numbe
101128
})
102129
).data.id;
103130
}
131+
132+
function bootTimeExceeded(ec2Runner: RunnerList): boolean {
133+
const runnerBootTimeInMinutes = process.env.RUNNER_BOOT_TIME_IN_MINUTES;
134+
const launchTimePlusBootTime = moment(ec2Runner.launchTime).utc().add(runnerBootTimeInMinutes, 'minutes');
135+
return launchTimePlusBootTime < moment(new Date()).utc();
136+
}

Diff for: modules/runners/pool.tf

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ module "pool" {
3737
runner = {
3838
disable_runner_autoupdate = var.disable_runner_autoupdate
3939
ephemeral = var.enable_ephemeral_runners
40+
boot_time_in_minutes = var.runner_boot_time_in_minutes
4041
extra_labels = var.runner_extra_labels
4142
launch_template = aws_launch_template.runner
4243
group_name = var.runner_group_name

Diff for: modules/runners/pool/main.tf

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ resource "aws_lambda_function" "pool" {
3131
NODE_TLS_REJECT_UNAUTHORIZED = var.config.ghes.url != null && !var.config.ghes.ssl_verify ? 0 : 1
3232
PARAMETER_GITHUB_APP_ID_NAME = var.config.github_app_parameters.id.name
3333
PARAMETER_GITHUB_APP_KEY_BASE64_NAME = var.config.github_app_parameters.key_base64.name
34+
RUNNER_BOOT_TIME_IN_MINUTES = var.config.runner.boot_time_in_minutes
3435
RUNNER_EXTRA_LABELS = var.config.runner.extra_labels
3536
RUNNER_GROUP_NAME = var.config.runner.group_name
3637
RUNNER_OWNER = var.config.runner.pool_owner

Diff for: modules/runners/pool/variables.tf

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ variable "config" {
2929
runner = object({
3030
disable_runner_autoupdate = bool
3131
ephemeral = bool
32+
boot_time_in_minutes = number
3233
extra_labels = string
3334
launch_template = object({
3435
name = string

0 commit comments

Comments
 (0)