github-aws-runners
diff --git a/‎.github/workflows/terraform.yml
+23-2 b/‎.github/workflows/terraform.yml
+23-2
diff --git a/‎.github/workflows/update-docs.yml
+22 b/‎.github/workflows/update-docs.yml
+22
diff --git a/‎README.md
+4 b/‎README.md
+4
diff --git a/‎docs/configuration.md
+37 b/‎docs/configuration.md
+37
diff --git a/‎docs/examples/termination-watcher.md
+1 b/‎docs/examples/termination-watcher.md
+1
diff --git a/‎docs/index.md
+5 b/‎docs/index.md
+5
diff --git a/‎examples/default/README.md
+1 b/‎examples/default/README.md
+1
diff --git a/‎examples/default/main.tf
+9-2 b/‎examples/default/main.tf
+9-2
diff --git a/‎examples/default/variables.tf
+7 b/‎examples/default/variables.tf
+7
diff --git a/‎examples/lambdas-download/main.tf
+8 b/‎examples/lambdas-download/main.tf
+8
diff --git a/‎examples/multi-runner/.terraform.lock.hcl
+16-16 b/‎examples/multi-runner/.terraform.lock.hcl
+16-16
diff --git a/‎examples/multi-runner/README.md
+1 b/‎examples/multi-runner/README.md
+1
diff --git a/‎examples/multi-runner/main.tf
+14-1 b/‎examples/multi-runner/main.tf
+14-1
diff --git a/‎examples/multi-runner/variables.tf
+7 b/‎examples/multi-runner/variables.tf
+7
diff --git a/‎examples/termination-watcher/.terraform.lock.hcl
+25 b/‎examples/termination-watcher/.terraform.lock.hcl
+25
@@ -30,6 +30,7 @@ jobs:
           touch lambdas/functions/control-plane/runners.zip
           touch lambdas/functions/gh-agent-syncer/runner-binaries-syncer.zip
           touch lambdas/functions/ami-housekeeper/ami-housekeeper.zip
+          touch lambdas/functions/termination-watcher/termination-watcher.zip
       - name: terraform init
         run: terraform init -get -backend=false -input=false
       - if: contains(matrix.terraform, '1.5.')
@@ -69,7 +70,18 @@ jobs:
       matrix:
         terraform: [1.5.6, "latest"]
         module:
-          ["ami-housekeeper", "download-lambda", "multi-runner", "runner-binaries-syncer", "runners", "setup-iam-permissions", "ssm", "webhook"]
+          [
+            "ami-housekeeper",
+            "download-lambda",
+            "lambda",
+            "multi-runner",
+            "runner-binaries-syncer",
+            "runners",
+            "setup-iam-permissions",
+            "ssm",
+            "termination-watcher",
+            "webhook",
+          ]
     defaults:
       run:
         working-directory: modules/${{ matrix.module }}
@@ -118,7 +130,16 @@ jobs:
       matrix:
         terraform: [1.5.6, "latest"]
         example:
-          ["default", "ubuntu", "prebuilt", "arm64", "ephemeral", "windows", "multi-runner"]
+          [
+            "default",
+            "ubuntu",
+            "prebuilt",
+            "arm64",
+            "ephemeral",
+            "termination-watcher",
+            "windows",
+            "multi-runner",
+          ]
     defaults:
       run:
         working-directory: examples/${{ matrix.example }}
 
@@ -16,10 +16,32 @@ jobs:
     name: Auto update terraform docs
     runs-on: ubuntu-latest
     steps:
+      # The bot app credentials only exist in the upstream organisation; skip this
+      # step elsewhere so the fork flow further down can still run.
+      - uses: philips-software/app-token-action@9f5d57062c9f2beaffafaa9a34f66f824ead63a9 # v2.0.0
+        id: app
+        if: github.repository_owner == 'philips-labs'
+        with:
+          app_id: ${{ vars.FOREST_PR_BOT_APP_ID }}
+          app_base64_private_key: ${{ secrets.FOREST_PR_BOT_APP_KEY_BASE64 }}
+          auth_type: installation
+          org: philips-labs
+
       - name: Checkout with GITHUB Action token
         uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # ratchet:actions/checkout@v4
+        with:
+          # Fall back to the workflow token when the app step was skipped (forks).
+          token: ${{ steps.app.outputs.token || github.token }}
 
+      # use an app to ensure CI is triggered
       - name: Generate TF docs
+        if: github.repository_owner == 'philips-labs'
+        uses: terraform-docs/gh-actions@f6d59f89a280fa0a3febf55ef68f146784b20ba0 # ratchet:terraform-docs/[email protected]
+        with:
+          find-dir: .
+          git-commit-message: "docs: auto update terraform docs"
+          git-push: ${{ github.ref != 'refs/heads/main' || github.repository_owner != 'philips-labs' }}
+          git-push-user-name: forest-pr|bot
+          git-push-user-email: "forest-pr[bot]@users.noreply.github.com"
+
+      # Fork variant: runs the same action, but without any git-push settings.
+      - name: Generate TF docs (forks)
+        if: github.repository_owner != 'philips-labs'
         uses: terraform-docs/gh-actions@f6d59f89a280fa0a3febf55ef68f146784b20ba0 # ratchet:terraform-docs/[email protected]
         with:
           find-dir: .
 
@@ -98,6 +98,7 @@ Talk to the forestkeepers in the `runners-channel` on Slack.
 | Name | Source | Version |
 |------|--------|---------|
 | <a name="module_ami_housekeeper"></a> [ami\_housekeeper](#module\_ami\_housekeeper) | ./modules/ami-housekeeper | n/a |
+| <a name="module_instance_termination_watcher"></a> [instance\_termination\_watcher](#module\_instance\_termination\_watcher) | ./modules/termination-watcher | n/a |
 | <a name="module_runner_binaries"></a> [runner\_binaries](#module\_runner\_binaries) | ./modules/runner-binaries-syncer | n/a |
 | <a name="module_runners"></a> [runners](#module\_runners) | ./modules/runners | n/a |
 | <a name="module_ssm"></a> [ssm](#module\_ssm) | ./modules/ssm | n/a |
@@ -163,6 +164,7 @@ Talk to the forestkeepers in the `runners-channel` on Slack.
 | <a name="input_instance_max_spot_price"></a> [instance\_max\_spot\_price](#input\_instance\_max\_spot\_price) | Max price for spot instances per hour. This variable will be passed to the create fleet as max spot price for the fleet. | `string` | `null` | no |
 | <a name="input_instance_profile_path"></a> [instance\_profile\_path](#input\_instance\_profile\_path) | The path that will be added to the instance\_profile, if not set the environment name will be used. | `string` | `null` | no |
 | <a name="input_instance_target_capacity_type"></a> [instance\_target\_capacity\_type](#input\_instance\_target\_capacity\_type) | Default lifecycle used for runner instances, can be either `spot` or `on-demand`. | `string` | `"spot"` | no |
+| <a name="input_instance_termination_watcher"></a> [instance\_termination\_watcher](#input\_instance\_termination\_watcher) | Configuration for the instance termination watcher. This feature is Beta, changes will not trigger a major release as long as it is in beta.<br><br>`enable`: Enable or disable the spot termination watcher.<br>`enable_metric`: Enable or disable the metrics for the spot termination watcher.<br>`memory_size`: Memory size limit in MB of the lambda.<br>`s3_key`: S3 key for syncer lambda function. Required if using S3 bucket to specify lambdas.<br>`s3_object_version`: S3 object version for syncer lambda function. Useful if S3 versioning is enabled on source bucket.<br>`timeout`: Time out of the lambda in seconds.<br>`zip`: File location of the lambda zip file. | <pre>object({<br>    enable = optional(bool, false)<br>    enable_metric = optional(object({<br>      spot_warning = optional(bool, false)<br>    }))<br>    memory_size       = optional(number, null)<br>    s3_key            = optional(string, null)<br>    s3_object_version = optional(string, null)<br>    timeout           = optional(number, null)<br>    zip               = optional(string, null)<br>  })</pre> | `{}` | no |
 | <a name="input_instance_types"></a> [instance\_types](#input\_instance\_types) | List of instance types for the action runner. Defaults are based on runner\_os (al2023 for linux and Windows Server Core for win). | `list(string)` | <pre>[<br>  "m5.large",<br>  "c5.large"<br>]</pre> | no |
 | <a name="input_job_queue_retention_in_seconds"></a> [job\_queue\_retention\_in\_seconds](#input\_job\_queue\_retention\_in\_seconds) | The number of seconds the job is held in the queue before it is purged. | `number` | `86400` | no |
 | <a name="input_key_name"></a> [key\_name](#input\_key\_name) | Key pair name | `string` | `null` | no |
@@ -177,6 +179,7 @@ Talk to the forestkeepers in the `runners-channel` on Slack.
 | <a name="input_log_level"></a> [log\_level](#input\_log\_level) | Logging level for lambda logging. Valid values are  'silly', 'trace', 'debug', 'info', 'warn', 'error', 'fatal'. | `string` | `"info"` | no |
 | <a name="input_logging_kms_key_id"></a> [logging\_kms\_key\_id](#input\_logging\_kms\_key\_id) | Specifies the kms key id to encrypt the logs with. | `string` | `null` | no |
 | <a name="input_logging_retention_in_days"></a> [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no |
+| <a name="input_metrics_namespace"></a> [metrics\_namespace](#input\_metrics\_namespace) | The namespace for the metrics created by the module. Metrics will only be created if explicitly enabled. | `string` | `"GitHub Runners"` | no |
 | <a name="input_minimum_running_time_in_minutes"></a> [minimum\_running\_time\_in\_minutes](#input\_minimum\_running\_time\_in\_minutes) | The time an ec2 action runner should be running at minimum before terminated, if not busy. | `number` | `null` | no |
 | <a name="input_pool_config"></a> [pool\_config](#input\_pool\_config) | The configuration for updating the pool. The `pool_size` to adjust to by the events triggered by the `schedule_expression`. For example you can configure a cron expression for weekdays to adjust the pool to 10 and another expression for the weekend to adjust the pool to 1. | <pre>list(object({<br>    schedule_expression = string<br>    size                = number<br>  }))</pre> | `[]` | no |
 | <a name="input_pool_lambda_memory_size"></a> [pool\_lambda\_memory\_size](#input\_pool\_lambda\_memory\_size) | Memory size limit for scale-up lambda. | `number` | `512` | no |
@@ -248,6 +251,7 @@ Talk to the forestkeepers in the `runners-channel` on Slack.
 | Name | Description |
 |------|-------------|
 | <a name="output_binaries_syncer"></a> [binaries\_syncer](#output\_binaries\_syncer) | n/a |
+| <a name="output_instance_termination_watcher"></a> [instance\_termination\_watcher](#output\_instance\_termination\_watcher) | n/a |
 | <a name="output_queues"></a> [queues](#output\_queues) | SQS queues. |
 | <a name="output_runners"></a> [runners](#output\_runners) | n/a |
 | <a name="output_ssm_parameters"></a> [ssm\_parameters](#output\_ssm\_parameters) | n/a |
 
@@ -175,6 +175,11 @@ This tracing config generates timelines for following events:
 
 This feature has been disabled by default.
 
+### Multiple runner modules in your AWS account
+
+The watcher will act on all spot termination notifications and log all the ones relevant to the runner module. Therefore we suggest deploying the watcher only once. You can either deploy the watcher by enabling it in one of your deployments or deploy the watcher as a stand-alone module.
+
+
 ## Debugging
 
 In case the setup does not work as intended, trace the events through this sequence:
@@ -187,6 +192,38 @@ In case the setup does not work as intended, trace the events through this seque
 
 ## Experimental features
 
+### Termination watcher
+
+This feature is in an early stage and therefore disabled by default.
+
+The termination watcher currently watches for spot termination notifications. When deployed as part of one of the main modules (root or multi-runner), the module by default only takes events into account for instances tagged with `ghr:environment`. The module can also be deployed stand-alone; in that case the tag filter needs to be tuned.
+
+- Logs: The module will log all termination notifications. For each warning it will look up the instance details and log the environment, the instance type and the time the instance has been running, as well as some other details.
+- Metrics: Metrics are disabled by default to avoid costs. Once enabled, a metric will be created for each warning with at least dimensions for the environment and instance type. The metric namespace can be configured via the variables. The metric name used is `SpotInterruptionWarning`.
+
+#### Log example
+
+Below is an example of the log messages created.
+
+```
+{
+    "level": "INFO",
+    "message": "Received spot notification warning:",
+    "environment": "default",
+    "instanceId": "i-0039b8826b3dcea55",
+    "instanceType": "c5.large",
+    "instanceLaunchTime": "2024-03-15T08:10:34.000Z",
+    "instanceRunningTimeInSeconds": 68,
+    "tags": [
+        {
+            "Key": "ghr:environment",
+            "Value": "default"
+        }
+        ... all tags ...
+    ]
+}
+```
+
 ### Queue to publish workflow job events
 
 This queue is an experimental feature to allow you to receive a copy of the `workflow_job` events sent by the GitHub App. This can be used to calculate a matrix or monitor the system.
 
@@ -0,0 +1 @@
+--8<-- "examples/termination-watcher/README.md"
@@ -64,6 +64,11 @@ The control plane (scale up lambda) will store the runner registration configura
 
 The AMI cleaner is a lambda that will clean up AMIs that are older than a configurable amount of days. This is useful when using the AMI builder to create AMIs. The cleaner will also check which AMIs are used in the latest version of the launch template. And you can provide SSM config paths pointing to AMI IDs. The cleaner will not delete these AMIs. The AMI cleaner is opt-in; it will not be created by default.
 
+### Instance Termination Watcher
+
+> This feature is in beta; changes will not trigger a major release as long as it is in beta.
+
+The Instance Termination Watcher creates logs and optional metrics for the termination of instances. Currently only spot termination warnings are watched. See [configuration](configuration/) for more details.
 
 ### Security
 
 
@@ -62,6 +62,7 @@ terraform output -raw webhook_secret
 
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
+| <a name="input_aws_region"></a> [aws\_region](#input\_aws\_region) | AWS region. | `string` | `"eu-west-1"` | no |
 | <a name="input_environment"></a> [environment](#input\_environment) | Environment name, used as prefix. | `string` | `null` | no |
 | <a name="input_github_app"></a> [github\_app](#input\_github\_app) | GitHub for API usages. | <pre>object({<br>    id         = string<br>    key_base64 = string<br>  })</pre> | n/a | yes |
 
 
@@ -1,6 +1,6 @@
 locals {
   environment = var.environment != null ? var.environment : "default"
-  aws_region  = "eu-west-1"
+  aws_region  = var.aws_region
 }
 
 resource "random_id" "random" {
@@ -79,7 +79,7 @@ module "runners" {
 
   # override delay of events in seconds
   delay_webhook_event   = 5
-  runners_maximum_count = 1
+  runners_maximum_count = 2
 
   # set up a fifo queue to remain order
   enable_fifo_build_queue = true
@@ -109,6 +109,13 @@ module "runners" {
     ]
   }
 
+  instance_termination_watcher = {
+    enable = true
+    enable_metric = {
+      spot_warning = true
+    }
+  }
+
 }
 
 module "webhook_github_app" {
 
@@ -13,3 +13,10 @@ variable "environment" {
   type    = string
   default = null
 }
+
+# Region used by this example; the default keeps the value that was previously
+# hard-coded in the example's `locals` block.
+variable "aws_region" {
+  description = "AWS region."
+
+  type    = string
+  default = "eu-west-1"
+}
@@ -12,6 +12,14 @@ module "lambdas" {
     {
       name = "runner-binaries-syncer"
       tag  = var.module_version
+    },
+    {
+      name = "ami-housekeeper"
+      tag  = var.module_version
+    },
+    {
+      name = "termination-watcher"
+      tag  = var.module_version
     }
   ]
 }
 
@@ -80,6 +80,7 @@ terraform output -raw webhook_secret
 
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
+| <a name="input_aws_region"></a> [aws\_region](#input\_aws\_region) | AWS region to deploy to | `string` | `"eu-west-1"` | no |
 | <a name="input_environment"></a> [environment](#input\_environment) | Environment name, used as prefix | `string` | `null` | no |
 | <a name="input_github_app"></a> [github\_app](#input\_github\_app) | GitHub for API usages. | <pre>object({<br>    id         = string<br>    key_base64 = string<br>  })</pre> | n/a | yes |
 
 
@@ -1,6 +1,6 @@
 locals {
   environment = var.environment != null ? var.environment : "multi-runner"
-  aws_region  = "eu-west-1"
+  aws_region  = var.aws_region
 
   # Load runner configurations from Yaml files
   multi_runner_config_files = {
@@ -94,6 +94,19 @@ module "runners" {
 
   # Enable debug logging for the lambda functions
   # log_level = "debug"
+
+  # Enable the instance termination watcher (logging only)
+  # instance_termination_watcher = {
+  #   enable = true
+  # }
+
+  # Enable the watcher and also create a metric for each spot termination warning
+  # instance_termination_watcher = {
+  #   enable        = true
+  #   enable_metric = {
+  #     spot_warning = true
+  #   }
+  # }
 }
 
 module "webhook_github_app" {
 
@@ -13,3 +13,10 @@ variable "environment" {
   type    = string
   default = null
 }
+
+# Region used by this example; the default keeps the value that was previously
+# hard-coded in the example's `locals` block.
+variable "aws_region" {
+  description = "AWS region to deploy to"
+
+  type    = string
+  default = "eu-west-1"
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+--8<-- "examples/termination-watcher/README.md"`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`locals {`
`2`	`2`	`environment = var.environment != null ? var.environment : "default"`
`3`		`- aws_region = "eu-west-1"`
	`3`	`+ aws_region = var.aws_region`
`4`	`4`	`}`
`5`	`5`
`6`	`6`	`resource "random_id" "random" {`
`@@ -79,7 +79,7 @@ module "runners" {`
`79`	`79`
`80`	`80`	`# override delay of events in seconds`
`81`	`81`	`delay_webhook_event = 5`
`82`		`- runners_maximum_count = 1`
	`82`	`+ runners_maximum_count = 2`
`83`	`83`
`84`	`84`	`# set up a fifo queue to remain order`
`85`	`85`	`enable_fifo_build_queue = true`
`@@ -109,6 +109,13 @@ module "runners" {`
`109`	`109`	`]`
`110`	`110`	`}`
`111`	`111`
	`112`	`+ instance_termination_watcher = {`
	`113`	`+ enable = true`
	`114`	`+ enable_metric = {`
	`115`	`+ spot_warning = true`
	`116`	`+ }`
	`117`	`+ }`
	`118`	`+`
`112`	`119`	`}`
`113`	`120`
`114`	`121`	`module "webhook_github_app" {`
Original file line number	Diff line number	Diff line change
`@@ -12,6 +12,14 @@ module "lambdas" {`
`12`	`12`	`{`
`13`	`13`	`name = "runner-binaries-syncer"`
`14`	`14`	`tag = var.module_version`
	`15`	`+ },`
	`16`	`+ {`
	`17`	`+ name = "ami-housekeeper"`
	`18`	`+ tag = var.module_version`
	`19`	`+ },`
	`20`	`+ {`
	`21`	`+ name = "termination-watcher"`
	`22`	`+ tag = var.module_version`
`15`	`23`	`}`
`16`	`24`	`]`
`17`	`25`	`}`