Skip to content

Commit 077a5d1

Browse files
author
Muhammad Shahzeb
authored
Merge pull request prometheus-community#21 from grafana/shahzeb/add-alerts-postgres
Add alerts
2 parents f59d4af + 5324eaa commit 077a5d1

File tree

3 files changed

+139
-8
lines changed

3 files changed

+139
-8
lines changed

postgres_mixin/alerts/postgres.libsonnet

+137-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
prometheusAlerts+:: {
2+
prometheusAlerts+: {
33
groups+: [
44
{
55
name: 'PostgreSQL',
@@ -63,7 +63,7 @@
6363
expr: |||
6464
avg by (datname) (
6565
rate (
66-
pg_stat_activity_max_tx_duration{datname!~"template.*",%(postgresExporterSelector)s}[2m]
66+
pg_stat_activity_max_tx_duration{%(dbNameFilter)s,%(postgresExporterSelector)s}[2m]
6767
)
6868
) > 2 * 60
6969
||| % $._config,
@@ -81,11 +81,11 @@
8181
expr: |||
8282
avg by (datname) (
8383
irate(
84-
pg_stat_database_xact_commit{datname!~"template.*",%(postgresExporterSelector)s}[5m]
84+
pg_stat_database_xact_commit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
8585
)
8686
+
8787
irate(
88-
pg_stat_database_xact_rollback{datname!~"template.*",%(postgresExporterSelector)s}[5m]
88+
pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
8989
)
9090
) > 10000
9191
||| % $._config,
@@ -102,15 +102,15 @@
102102
},
103103
expr: |||
104104
avg by (datname) (
105-
rate(pg_stat_database_blks_hit{datname!~"template.*",%(postgresExporterSelector)s}[5m])
105+
rate(pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m])
106106
/
107107
(
108108
rate(
109-
pg_stat_database_blks_hit{datname!~"template.*",%(postgresExporterSelector)s}[5m]
109+
pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
110110
)
111111
+
112112
rate(
113-
pg_stat_database_blks_read{datname!~"template.*",%(postgresExporterSelector)s}[5m]
113+
pg_stat_database_blks_read{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
114114
)
115115
)
116116
) < 0.98
@@ -120,6 +120,136 @@
120120
severity: 'warning',
121121
},
122122
},
123+
{
124+
alert: 'PostgresHasTooManyRollbacks',
125+
annotations: {
126+
description: 'PostgreSQL has too many rollbacks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
127+
summary: 'PostgreSQL has too many rollbacks.',
128+
},
129+
expr: |||
130+
avg without(pod, instance)
131+
(rate(pg_stat_database_xact_rollback{%(dbNameFilter)s}[5m]) /
132+
(rate(pg_stat_database_xact_commit{%(dbNameFilter)s}[5m]) + rate(pg_stat_database_xact_rollback{%(dbNameFilter)s}[5m]))) > 0.10
133+
||| % $._config,
134+
'for': '5m',
135+
labels: {
136+
severity: 'warning',
137+
},
138+
},
139+
{
140+
alert: 'PostgresHasHighDeadLocks',
141+
annotations: {
142+
description: 'PostgreSQL has too many deadlocks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
143+
summary: 'PostgreSQL has a high number of deadlocks.',
144+
},
145+
expr: |||
146+
max without(pod, instance) (rate(pg_stat_database_deadlocks{%(dbNameFilter)s}[5m]) * 60) > 5
147+
||| % $._config,
148+
'for': '5m',
149+
labels: {
150+
severity: 'warning',
151+
},
152+
},
153+
{
154+
alert: 'PostgresAcquiredTooManyLocks',
155+
annotations: {
156+
description: 'PostgreSQL has acquired too many locks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
157+
summary: 'PostgreSQL has a high number of acquired locks.',
158+
},
159+
expr: |||
160+
max by( server, job, datname, namespace) ((pg_locks_count{%(dbNameFilter)s}) /
161+
on(instance, namespace) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20
162+
||| % $._config,
163+
'for': '5m',
164+
labels: {
165+
severity: 'warning',
166+
},
167+
},
168+
{
169+
alert: 'PostgresReplicationLaggingMore1Hour',
170+
annotations: {
171+
description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.',
172+
summary: 'PostgreSQL replication lagging more than 1 hour.',
173+
},
174+
expr: '(pg_replication_lag{} > 3600) and on (instance) (pg_replication_is_replica{} == 1)',
175+
'for': '5m',
176+
labels: {
177+
severity: 'warning',
178+
},
179+
},
180+
{
181+
alert: 'PostgresHasReplicationSlotUnused',
182+
annotations: {
183+
description: '{{ $labels.instance }} has replication slots that are not used, which might lead to replication lag or data inconsistency.',
184+
summary: 'PostgreSQL has unused replication slots.',
185+
},
186+
expr: 'pg_replication_slots_active{} == 0',
187+
'for': '30m',
188+
labels: {
189+
severity: 'critical',
190+
},
191+
},
192+
{
193+
alert: 'PostgresReplicationRoleChanged',
194+
annotations: {
195+
description: '{{ $labels.instance }} replication role has changed. Verify if this is expected or if it indicates a failover.',
196+
summary: 'PostgreSQL replication role change detected.',
197+
},
198+
expr: 'pg_replication_is_replica{} and changes(pg_replication_is_replica{}[1m]) > 0',
199+
labels: {
200+
severity: 'warning',
201+
},
202+
},
203+
{
204+
alert: 'PostgresHasExporterErrors',
205+
annotations: {
206+
description: '{{ $labels.instance }} exporter is experiencing errors. Verify exporter health and configuration.',
207+
summary: 'PostgreSQL exporter errors detected.',
208+
},
209+
expr: 'pg_exporter_last_scrape_error{} > 0',
210+
'for': '30m',
211+
labels: {
212+
severity: 'critical',
213+
},
214+
},
215+
{
216+
alert: 'PostgresTablesNotVacuumed',
217+
annotations: {
218+
description: '{{ $labels.instance }} tables have not been vacuumed within the last 10 hours, which may lead to performance degradation.',
219+
summary: 'PostgreSQL tables not vacuumed.',
220+
},
221+
expr: |||
222+
group without(pod, instance)(
223+
timestamp(
224+
pg_stat_user_tables_n_dead_tup{} >
225+
pg_stat_user_tables_n_live_tup{}
226+
* on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{}
227+
+ on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{}
228+
)
229+
< time() - 36000
230+
)
231+
|||,
232+
'for': '30m',
233+
labels: {
234+
severity: 'critical',
235+
},
236+
},
237+
{
238+
alert: 'PostgresTooManyCheckpointsRequested',
239+
annotations: {
240+
description: '{{ $labels.instance }} is requesting too many checkpoints, which may lead to performance degradation.',
241+
summary: 'PostgreSQL too many checkpoints requested.',
242+
},
243+
expr: |||
244+
rate(pg_stat_bgwriter_checkpoints_timed_total{}[5m]) /
245+
(rate(pg_stat_bgwriter_checkpoints_timed_total{}[5m]) + rate(pg_stat_bgwriter_checkpoints_req_total{}[5m]))
246+
< 0.5
247+
|||,
248+
'for': '5m',
249+
labels: {
250+
severity: 'warning',
251+
},
252+
},
123253
],
124254
},
125255
],

postgres_mixin/config.libsonnet

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
{
22
_config+:: {
3+
dbNameFilter: 'datname!~"template.*"',
34
postgresExporterSelector: '',
45
},
56
}

postgres_mixin/dashboards/dashboards.libsonnet

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
grafanaDashboards+:: {
33
'postgres-overview.json': (import 'postgres-overview.json'),
44
},
5-
}
5+
}

0 commit comments

Comments
 (0)