Skip to content

Commit cb30274

Browse files
geroplroboquat
authored and committed
[ops] WebApp: Alert on services crashlooping
1 parent ddf5651 commit cb30274

File tree

1 file changed

+15
-0
lines changed

1 file changed

+15
-0
lines changed

operations/observability/mixins/meta/rules/components/server/alerts.libsonnet

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,21 @@
148148
description: 'WebApp Services execcisve CPU USAGE',
149149
},
150150
},
151+
{
152+
// Warns when a WebApp pod's container keeps restarting (crash looping).
alert: 'WebAppServicesCrashlooping',
153+
// Counts container restarts over the last 5 minutes per (namespace, pod, container).
// No aggregation on purpose: the description template below reads the namespace,
// pod and container labels, and $value is then "restarts / 5 minutes".
// NOTE(review): assumes kube-state-metrics is scraped by this Prometheus — confirm.
expr: 'increase(kube_pod_container_status_restarts_total{pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 0',
154+
// Only fire once restarts have been observed continuously for 15 minutes.
'for': '15m',
155+
labels: {
156+
// sent to the team-internal channel until we have fine-tuned it
157+
severity: 'warning',
158+
team: 'webapp',
159+
},
160+
annotations: {
161+
runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md',
162+
summary: 'Pod is crash looping.',
163+
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes',
164+
},
165+
},
151166
],
152167
},
153168
],

0 commit comments

Comments
 (0)