feat(runners): Add support for looking up runner AMI ID from an SSM parameter at instance launch time (#2520)

jpalomaki · web-flow · commit 68e238196877 · 2022-10-31T19:00:40.000+01:00
* Add support for looking up a pre-built runner AMI ID from an SSM parameter

* Format terraform code

* Format TS code

* Add tests

* Make error message more helpful

* Fixes per comments

* Re-format terraform code

* Sync var description

* Add scale up test for overridden AMI ID

* Fix iam role policy name

* Add example
diff --git a/README.md b/README.md
@@ -329,6 +329,8 @@ export interface GithubWorkflowEvent {
 This extendible format allows to add more fields to be added if needed.
 You can configure the queue by setting properties to `workflow_job_events_queue_config`
 
+NOTE: By default, a runner AMI update requires a re-apply of this terraform config (the runner AMI ID is looked up by a terraform data source). To avoid this, you can use `ami_id_ssm_parameter_name` to have the scale-up lambda dynamically lookup the runner AMI ID from an SSM parameter at instance launch time. Said SSM parameter is managed outside of this module (e.g. by a runner AMI build workflow).
+
 ## Examples
 
 Examples are located in the [examples](./examples) directory. The following examples are provided:
@@ -419,6 +421,7 @@ We welcome any improvement to the standard module to make the default as secure
 |------|-------------|------|---------|:--------:|
 | <a name="input_ami_filter"></a> [ami\_filter](#input\_ami\_filter) | List of maps used to create the AMI filter for the action runner AMI. By default amazon linux 2 is used. | `map(list(string))` | `null` | no |
 | <a name="input_ami_owners"></a> [ami\_owners](#input\_ami\_owners) | The list of owners used to select the AMI of action runner instances. | `list(string)` | <pre>[<br>  "amazon"<br>]</pre> | no |
+| <a name="input_ami_id_ssm_parameter_name"></a> [ami\_id\_ssm\_parameter\_name](#input\_ami\_id\_ssm\_parameter\_name) | Optional SSM parameter that contains the runner AMI ID to launch instances from. Overrides `ami_filter`. The parameter value is managed outside of this module (e.g. in a runner AMI build workflow). This allows for AMI updates without having to re-apply this terraform config. | `string` | `null` | no |
 | <a name="input_aws_partition"></a> [aws\_partition](#input\_aws\_partition) | (optiona) partition in the arn namespace to use if not 'aws' | `string` | `"aws"` | no |
 | <a name="input_aws_region"></a> [aws\_region](#input\_aws\_region) | AWS region. | `string` | n/a | yes |
 | <a name="input_block_device_mappings"></a> [block\_device\_mappings](#input\_block\_device\_mappings) | The EC2 instance block device configuration. Takes the following keys: `device_name`, `delete_on_termination`, `volume_type`, `volume_size`, `encrypted`, `iops`, `throughput`, `kms_key_id`, `snapshot_id`. | <pre>list(object({<br>    delete_on_termination = bool<br>    device_name           = string<br>    encrypted             = bool<br>    iops                  = number<br>    kms_key_id            = string<br>    snapshot_id           = string<br>    throughput            = number<br>    volume_size           = number<br>    volume_type           = string<br>  }))</pre> | <pre>[<br>  {<br>    "delete_on_termination": true,<br>    "device_name": "/dev/xvda",<br>    "encrypted": true,<br>    "iops": null,<br>    "kms_key_id": null,<br>    "snapshot_id": null,<br>    "throughput": null,<br>    "volume_size": 30,<br>    "volume_type": "gp3"<br>  }<br>]</pre> | no |
diff --git a/examples/prebuilt/main.tf b/examples/prebuilt/main.tf
@@ -45,6 +45,10 @@ module "runners" {
   ami_filter       = { name = [var.ami_name_filter] }
   ami_owners       = [data.aws_caller_identity.current.account_id]
 
+  # Look up runner AMI ID from an AWS SSM parameter (overrides ami_filter at instance launch time)
+  # NOTE: the parameter must be managed outside of this module (e.g. in a runner AMI build workflow)
+  # ami_id_ssm_parameter_name = "my-runner-ami-id"
+
   # disable binary syncer since github agent is already installed in the AMI.
   enable_runner_binaries_syncer = false
 
diff --git a/main.tf b/main.tf
@@ -178,9 +178,10 @@ module "runners" {
   instance_max_spot_price       = var.instance_max_spot_price
   block_device_mappings         = var.block_device_mappings
 
-  runner_architecture = var.runner_architecture
-  ami_filter          = var.ami_filter
-  ami_owners          = var.ami_owners
+  runner_architecture       = var.runner_architecture
+  ami_filter                = var.ami_filter
+  ami_owners                = var.ami_owners
+  ami_id_ssm_parameter_name = var.ami_id_ssm_parameter_name
 
   sqs_build_queue                      = aws_sqs_queue.queued_builds
   github_app_parameters                = local.github_app_parameters
diff --git a/modules/runners/lambdas/runners/src/aws/runners.test.ts b/modules/runners/lambdas/runners/src/aws/runners.test.ts
@@ -4,7 +4,7 @@ import ScaleError from './../scale-runners/ScaleError';
 import { RunnerInfo, RunnerInputParameters, createRunner, listEC2Runners, terminateRunner } from './runners';
 
 const mockEC2 = { describeInstances: jest.fn(), createFleet: jest.fn(), terminateInstances: jest.fn() };
-const mockSSM = { putParameter: jest.fn() };
+const mockSSM = { putParameter: jest.fn(), getParameter: jest.fn() };
 jest.mock('aws-sdk', () => ({
   EC2: jest.fn().mockImplementation(() => mockEC2),
   SSM: jest.fn().mockImplementation(() => mockSSM),
@@ -170,6 +170,8 @@ describe('terminate runner', () => {
 describe('create runner', () => {
   const mockCreateFleet = { promise: jest.fn() };
   const mockPutParameter = { promise: jest.fn() };
+  const mockGetParameter = { promise: jest.fn() };
+
   const defaultRunnerConfig: RunnerConfig = {
     allocationStrategy: 'capacity-optimized',
     capacityType: 'spot',
@@ -191,6 +193,8 @@ describe('create runner', () => {
       Instances: [{ InstanceIds: ['i-1234'] }],
     });
     mockSSM.putParameter.mockImplementation(() => mockPutParameter);
+
+    mockSSM.getParameter.mockImplementation(() => mockGetParameter);
   });
 
   it('calls create fleet of 1 instance with the correct config for repo', async () => {
@@ -259,6 +263,21 @@ describe('create runner', () => {
     await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).rejects.toThrowError(Error);
     expect(mockSSM.putParameter).not.toBeCalled();
   });
+
+  it('uses ami id from ssm parameter when ami id ssm param is specified', async () => {
+    const paramValue: AWS.SSM.GetParameterResult = {
+      Parameter: {
+        Value: 'ami-123',
+      },
+    };
+    mockGetParameter.promise.mockReturnValue(paramValue);
+    await createRunner(createRunnerConfig({ ...defaultRunnerConfig, amiIdSsmParameterName: 'my-ami-id-param' }));
+    const expectedRequest = expectedCreateFleetRequest({ ...defaultExpectedFleetRequestValues, imageId: 'ami-123' });
+    expect(mockEC2.createFleet).toBeCalledWith(expectedRequest);
+    expect(mockSSM.getParameter).toBeCalledWith({
+      Name: 'my-ami-id-param',
+    });
+  });
 });
 
 describe('create runner with errors', () => {
@@ -279,6 +298,10 @@ describe('create runner with errors', () => {
     const mockPutParameter = { promise: jest.fn() };
 
     mockSSM.putParameter.mockImplementation(() => mockPutParameter);
+
+    const mockGetParameter = { promise: jest.fn() };
+
+    mockSSM.getParameter.mockImplementation(() => mockGetParameter);
   });
 
   it('test ScaleError with one error.', async () => {
@@ -326,6 +349,22 @@ describe('create runner with errors', () => {
     expect(mockEC2.createFleet).toBeCalledWith(expectedCreateFleetRequest(defaultExpectedFleetRequestValues));
     expect(mockSSM.putParameter).not.toBeCalled();
   });
+
+  it('test error in ami id lookup from ssm parameter', async () => {
+    mockSSM.getParameter.mockImplementation(() => {
+      return {
+        promise: jest.fn().mockImplementation(() => {
+          throw Error('Wow, such transient');
+        }),
+      };
+    });
+
+    await expect(
+      createRunner(createRunnerConfig({ ...defaultRunnerConfig, amiIdSsmParameterName: 'my-ami-id-param' })),
+    ).rejects.toBeInstanceOf(Error);
+    expect(mockEC2.createFleet).not.toBeCalled();
+    expect(mockSSM.putParameter).not.toBeCalled();
+  });
 });
 
 function createFleetMockWithErrors(errors: string[], instances?: string[]) {
@@ -354,6 +393,7 @@ interface RunnerConfig {
   capacityType: EC2.DefaultTargetCapacityType;
   allocationStrategy: EC2.AllocationStrategy;
   maxSpotPrice?: string;
+  amiIdSsmParameterName?: string;
 }
 
 function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -370,6 +410,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
       instanceAllocationStrategy: runnerConfig.allocationStrategy,
     },
     subnets: ['subnet-123', 'subnet-456'],
+    amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
   };
 }
 
@@ -379,10 +420,11 @@ interface ExpectedFleetRequestValues {
   allocationStrategy: EC2.AllocationStrategy;
   maxSpotPrice?: string;
   totalTargetCapacity: number;
+  imageId?: string;
 }
 
 function expectedCreateFleetRequest(expectedValues: ExpectedFleetRequestValues): AWS.EC2.CreateFleetRequest {
-  return {
+  const request: AWS.EC2.CreateFleetRequest = {
     LaunchTemplateConfigs: [
       {
         LaunchTemplateSpecification: {
@@ -429,4 +471,16 @@ function expectedCreateFleetRequest(expectedValues: ExpectedFleetRequestValues):
     },
     Type: 'instant',
   };
+
+  if (expectedValues.imageId) {
+    for (const config of request.LaunchTemplateConfigs) {
+      if (config.Overrides) {
+        for (const override of config.Overrides) {
+          override.ImageId = expectedValues.imageId;
+        }
+      }
+    }
+  }
+
+  return request;
 }
diff --git a/modules/runners/lambdas/runners/src/aws/runners.ts b/modules/runners/lambdas/runners/src/aws/runners.ts
@@ -42,6 +42,7 @@ export interface RunnerInputParameters {
     instanceAllocationStrategy: EC2.SpotAllocationStrategy;
   };
   numberOfRunners?: number;
+  amiIdSsmParameterName?: string;
 }
 
 export async function listEC2Runners(filters: ListRunnerFilters | undefined = undefined): Promise<RunnerList[]> {
@@ -107,17 +108,27 @@ export async function terminateRunner(instanceId: string): Promise<void> {
   logger.info(`Runner ${instanceId} has been terminated.`, LogFields.print());
 }
 
-function generateFleeOverrides(
+function generateFleetOverrides(
   subnetIds: string[],
   instancesTypes: string[],
+  amiId?: string,
 ): EC2.FleetLaunchTemplateOverridesListRequest {
+  type Override = {
+    SubnetId: string;
+    InstanceType: string;
+    ImageId?: string;
+  };
   const result: EC2.FleetLaunchTemplateOverridesListRequest = [];
   subnetIds.forEach((s) => {
     instancesTypes.forEach((i) => {
-      result.push({
+      const item: Override = {
         SubnetId: s,
         InstanceType: i,
-      });
+      };
+      if (amiId) {
+        item.ImageId = amiId;
+      }
+      result.push(item);
     });
   });
   return result;
@@ -127,6 +138,27 @@ export async function createRunner(runnerParameters: RunnerInputParameters): Pro
   logger.debug('Runner configuration: ' + JSON.stringify(runnerParameters), LogFields.print());
 
   const ec2 = new EC2();
+  const ssm = new SSM();
+
+  let amiIdOverride = undefined;
+
+  if (runnerParameters.amiIdSsmParameterName) {
+    logger.debug(`Looking up runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}`);
+    try {
+      const result: AWS.SSM.GetParameterResult = await ssm
+        .getParameter({ Name: runnerParameters.amiIdSsmParameterName })
+        .promise();
+      amiIdOverride = result.Parameter?.Value;
+    } catch (e) {
+      logger.error(
+        `Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}. ` +
+          'Please ensure that the given parameter exists on this region and contains a valid runner AMI ID',
+        e,
+      );
+      throw e;
+    }
+  }
+
   const numberOfRunners = runnerParameters.numberOfRunners ? runnerParameters.numberOfRunners : 1;
 
   let fleet: AWS.EC2.CreateFleetResult;
@@ -140,9 +172,10 @@ export async function createRunner(runnerParameters: RunnerInputParameters): Pro
               LaunchTemplateName: runnerParameters.launchTemplateName,
               Version: '$Default',
             },
-            Overrides: generateFleeOverrides(
+            Overrides: generateFleetOverrides(
               runnerParameters.subnets,
               runnerParameters.ec2instanceCriteria.instanceTypes,
+              amiIdOverride,
             ),
           },
         ],
@@ -202,7 +235,6 @@ export async function createRunner(runnerParameters: RunnerInputParameters): Pro
 
   logger.info('Created instance(s): ', instances.join(','), LogFields.print());
 
-  const ssm = new SSM();
   for (const instance of instances) {
     await ssm
       .putParameter({
diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts
@@ -243,6 +243,12 @@ describe('scaleUp with GHES', () => {
       ];
       expect(createRunner).toBeCalledWith(expectedRunnerParams);
     });
+
+    it('creates a runner with ami id override from ssm parameter', async () => {
+      process.env.AMI_ID_SSM_PARAMETER_NAME = 'my-ami-id-param';
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+      expect(createRunner).toBeCalledWith({ ...expectedRunnerParams, amiIdSsmParameterName: 'my-ami-id-param' });
+    });
   });
 
   describe('on repo level', () => {
diff --git a/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
@@ -32,6 +32,7 @@ interface CreateEC2RunnerConfig {
   launchTemplateName: string;
   ec2instanceCriteria: RunnerInputParameters['ec2instanceCriteria'];
   numberOfRunners?: number;
+  amiIdSsmParameterName?: string;
 }
 
 function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) {
@@ -159,6 +160,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
   const instanceMaxSpotPrice = process.env.INSTANCE_MAX_SPOT_PRICE;
   const instanceAllocationStrategy = process.env.INSTANCE_ALLOCATION_STRATEGY || 'lowest-price'; // same as AWS default
   const enableJobQueuedCheck = yn(process.env.ENABLE_JOB_QUEUED_CHECK, { default: true });
+  const amiIdSsmParameterName = process.env.AMI_ID_SSM_PARAMETER_NAME;
 
   if (ephemeralEnabled && payload.eventType !== 'workflow_job') {
     logger.warn(
@@ -222,6 +224,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
           environment,
           launchTemplateName,
           subnets,
+          amiIdSsmParameterName,
         },
         githubInstallationClient,
       );
diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf
@@ -36,6 +36,7 @@ resource "aws_lambda_function" "scale_up" {
       RUNNER_GROUP_NAME                    = var.runner_group_name
       RUNNERS_MAXIMUM_COUNT                = var.runners_maximum_count
       SUBNET_IDS                           = join(",", var.subnet_ids)
+      AMI_ID_SSM_PARAMETER_NAME            = var.ami_id_ssm_parameter_name
     }
   }
 
@@ -110,3 +111,25 @@ resource "aws_iam_role_policy_attachment" "scale_up_vpc_execution_role" {
   role       = aws_iam_role.scale_up.name
   policy_arn = "arn:${var.aws_partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
 }
+
+resource "aws_iam_role_policy" "ami_id_ssm_parameter_read" {
+  count  = var.ami_id_ssm_parameter_name != null ? 1 : 0
+  name   = "${var.prefix}-ami-id-ssm-parameter-read"
+  role   = aws_iam_role.scale_up.name
+  policy = <<-JSON
+    {
+      "Version": "2012-10-17",
+      "Statement": [
+        {
+          "Effect": "Allow",
+          "Action": [
+            "ssm:GetParameter"
+          ],
+          "Resource": [
+            "arn:aws:ssm:${var.aws_region}:${data.aws_caller_identity.current.account_id}:parameter/${trimprefix(var.ami_id_ssm_parameter_name, "/")}"
+          ]
+        }
+      ]
+    }
+  JSON
+}
diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf
@@ -155,6 +155,12 @@ variable "ami_owners" {
   default     = ["amazon"]
 }
 
+variable "ami_id_ssm_parameter_name" {
+  description = "Externally managed SSM parameter (of data type aws:ec2:image) that contains the AMI ID to launch runner instances from. Overrides ami_filter"
+  type        = string
+  default     = null
+}
+
 variable "enabled_userdata" {
   description = "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI"
   type        = bool
diff --git a/variables.tf b/variables.tf
@@ -302,11 +302,19 @@ variable "ami_filter" {
   type        = map(list(string))
   default     = null
 }
+
 variable "ami_owners" {
   description = "The list of owners used to select the AMI of action runner instances."
   type        = list(string)
   default     = ["amazon"]
 }
+
+variable "ami_id_ssm_parameter_name" {
+  description = "Externally managed SSM parameter (of data type aws:ec2:image) that contains the AMI ID to launch runner instances from. Overrides ami_filter"
+  type        = string
+  default     = null
+}
+
 variable "lambda_s3_bucket" {
   description = "S3 bucket from which to specify lambda functions. This is an alternative to providing local files directly."
   default     = null