From 413aabe8ca38ff8562a510213d5bc7095330709f Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 18 Apr 2025 14:03:47 +0000 Subject: [PATCH] Add CoderdIneligiblePrebuilds alert Signed-off-by: Danny Kopping --- README.md | 2 +- coder-observability/runbooks/coderd.md | 7 ++++++ .../configmap-prometheus-alerts.yaml | 23 ++++++++++++++++++- coder-observability/values.yaml | 5 ++++ compiled/resources.yaml | 17 ++++++++++---- 5 files changed, 47 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 645119f..d33b398 100644 --- a/README.md +++ b/README.md @@ -228,7 +228,7 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | Key | Type | Default | Description | |-----|------|---------|-------------| -| global.coder.alerts | object | `{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various aspects of Coder | +| global.coder.alerts | object | `{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"IneligiblePrebuilds":{"delay":"10m","enabled":true,"thresholds":{"notify":1}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various aspects of Coder | | global.coder.coderdSelector | string | `"pod=~`coder.*`, pod!~`.*provisioner.*`"` | series selector for Prometheus/Loki to locate provisioner pods. ensure this uses backticks for quotes! | | global.coder.controlPlaneNamespace | string | `"coder"` | the namespace into which the control plane has been deployed. | | global.coder.externalProvisionersNamespace | string | `"coder"` | the namespace into which any external provisioners have been deployed. | diff --git a/coder-observability/runbooks/coderd.md b/coder-observability/runbooks/coderd.md index 62c80f5..ef5eb71 100644 --- a/coder-observability/runbooks/coderd.md +++ b/coder-observability/runbooks/coderd.md @@ -76,3 +76,10 @@ Terraform plugin. Your Enterprise license is approaching or has exceeded the number of seats purchased. Please contact your Coder sales contact, or visit https://coder.com/contact/sales. + +## CoderdIneligiblePrebuilds + +Prebuilds only become eligible to be claimed by users once the workspace's agent is a) running and b) all of its startup +scripts have completed. + +If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem. \ No newline at end of file diff --git a/coder-observability/templates/configmap-prometheus-alerts.yaml b/coder-observability/templates/configmap-prometheus-alerts.yaml index eec7171..3311be7 100644 --- a/coder-observability/templates/configmap-prometheus-alerts.yaml +++ b/coder-observability/templates/configmap-prometheus-alerts.yaml @@ -4,7 +4,7 @@ metadata: name: metrics-alerts namespace: {{ .Release.Namespace }} data: - {{- $service := dict "service" "coder" -}} + {{- $service := dict "service" "coderd" -}} {{- with .Values.global.coder.alerts.coderd }} {{/* start-section */}} coderd.yaml: |- @@ -104,6 +104,27 @@ data: {{- end }} {{- end }} + {{- with .groups.IneligiblePrebuilds }} + {{- $group := . }} + {{- if .enabled }} + - name: Coderd Ineligible Prebuilds + rules: + {{ $alert := "CoderdIneligiblePrebuilds" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: max by (template_name, preset_name) (coderd_prebuilds_running - coderd_prebuilds_eligible) > 0 + for: {{ $group.delay }} + annotations: + summary: > + {{ `{{ $value }}` }} prebuilt workspace(s) are currently ineligible for claiming for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. + This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} {{/* end-section */}} diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index d035dd8..c2298fb 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -76,6 +76,11 @@ global: notify: 2 warning: 5 critical: 10 + IneligiblePrebuilds: + enabled: true + delay: 10m + thresholds: + notify: 1 provisionerd: groups: Replicas: diff --git a/compiled/resources.yaml b/compiled/resources.yaml index ccdf33b..9583951 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -777,8 +777,8 @@ metadata: name: metrics-alerts namespace: coder-observability data: - coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdworkspacebuildfailures " - provisionerd.yaml: "groups:\n- name: Provisionerd Replicas\n rules:\n \n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#provisionerdreplicas " + coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n- name: Coderd Ineligible Prebuilds\n rules:\n \n - alert: CoderdIneligiblePrebuilds\n expr: max by (template_name, preset_name) (coderd_prebuilds_running - coderd_prebuilds_eligible) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) are currently ineligible for claiming for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdineligibleprebuilds " + provisionerd.yaml: "groups:\n- name: Provisionerd Replicas\n rules:\n \n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas " enterprise.yaml: "groups:\n - name: Licences\n rules:\n \n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=1'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats\n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=0.9'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats " postgres.yaml: "groups:\n- name: Notifications\n rules:\n \n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.9\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.5\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.8\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n\n- name: Liveness\n rules:\n \n - alert: PostgresDown\n expr: pg_up == 0\n for: 1m\n annotations:\n summary: The postgres instance {{ $labels.instance }} is down!\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresdown\n\n\n- name: Connections\n rules:\n \n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.9)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.5)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.8)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow" --- @@ -789,9 +789,9 @@ metadata: name: runbooks namespace: coder-observability annotations: - checksum/config: 7ab4d06d5b454cc584880f9b58fb50fe85f2803bbd0021edb957c5a2d73b640e + checksum/config: c7f0ed6a8142ac725babf921fbbc4275738149cf4da49b3d4311b37f61994267 data: - coderd.md: | + coderd.md: |- # Coderd Runbooks ## CoderdCPUUsage @@ -870,6 +870,13 @@ data: Your Enterprise license is approaching or has exceeded the number of seats purchased. Please contact your Coder sales contact, or visit https://coder.com/contact/sales. + + ## CoderdIneligiblePrebuilds + + Prebuilds only become eligible to be claimed by users once the workspace's agent is a) running and b) all of its startup + scripts have completed. + + If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem. postgres.md: | # Postgres Runbooks @@ -11114,7 +11121,7 @@ spec: template: metadata: annotations: - checksum/config: 7ab4d06d5b454cc584880f9b58fb50fe85f2803bbd0021edb957c5a2d73b640e + checksum/config: c7f0ed6a8142ac725babf921fbbc4275738149cf4da49b3d4311b37f61994267 labels: app: runbook-viewer spec: