|
1 | 1 | {
|
2 |
| - prometheusAlerts+:: { |
| 2 | + prometheusAlerts+: { |
3 | 3 | groups+: [
|
4 | 4 | {
|
5 | 5 | name: 'PostgreSQL',
|
|
63 | 63 | expr: |||
|
64 | 64 | avg by (datname) (
|
65 | 65 | rate (
|
66 |
| - pg_stat_activity_max_tx_duration{datname!~"template.*",%(postgresExporterSelector)s}[2m] |
| 66 | + pg_stat_activity_max_tx_duration{%(dbNameFilter)s,%(postgresExporterSelector)s}[2m] |
67 | 67 | )
|
68 | 68 | ) > 2 * 60
|
69 | 69 | ||| % $._config,
|
|
81 | 81 | expr: |||
|
82 | 82 | avg by (datname) (
|
83 | 83 | irate(
|
84 |
| - pg_stat_database_xact_commit{datname!~"template.*",%(postgresExporterSelector)s}[5m] |
| 84 | + pg_stat_database_xact_commit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
85 | 85 | )
|
86 | 86 | +
|
87 | 87 | irate(
|
88 |
| - pg_stat_database_xact_rollback{datname!~"template.*",%(postgresExporterSelector)s}[5m] |
| 88 | + pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
89 | 89 | )
|
90 | 90 | ) > 10000
|
91 | 91 | ||| % $._config,
|
|
102 | 102 | },
|
103 | 103 | expr: |||
|
104 | 104 | avg by (datname) (
|
105 |
| - rate(pg_stat_database_blks_hit{datname!~"template.*",%(postgresExporterSelector)s}[5m]) |
| 105 | + rate(pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]) |
106 | 106 | /
|
107 | 107 | (
|
108 | 108 | rate(
|
109 |
| - pg_stat_database_blks_hit{datname!~"template.*",%(postgresExporterSelector)s}[5m] |
| 109 | + pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
110 | 110 | )
|
111 | 111 | +
|
112 | 112 | rate(
|
113 |
| - pg_stat_database_blks_read{datname!~"template.*",%(postgresExporterSelector)s}[5m] |
| 113 | + pg_stat_database_blks_read{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
114 | 114 | )
|
115 | 115 | )
|
116 | 116 | ) < 0.98
|
|
120 | 120 | severity: 'warning',
|
121 | 121 | },
|
122 | 122 | },
|
| 123 | + { |
| 124 | + alert: 'PostgresHasTooManyRollbacks', |
| 125 | + annotations: { |
| 126 | + description: 'PostgreSQL has too many rollbacks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}', |
| 127 | + summary: 'PostgreSQL has too many rollbacks.', |
| 128 | + }, |
| 129 | + expr: ||| |
| 130 | + avg without(pod, instance) |
| 131 | + (rate(pg_stat_database_xact_rollback{%(dbNameFilter)s}[5m]) / |
| 132 | + (rate(pg_stat_database_xact_commit{%(dbNameFilter)s}[5m]) + rate(pg_stat_database_xact_rollback{%(dbNameFilter)s}[5m]))) > 0.10 |
| 133 | + ||| % $._config, |
| 134 | + 'for': '5m', |
| 135 | + labels: { |
| 136 | + severity: 'warning', |
| 137 | + }, |
| 138 | + }, |
| 139 | + { |
| 140 | + alert: 'PostgresHasHighDeadLocks', |
| 141 | + annotations: { |
| 142 | + description: 'PostgreSQL has too many deadlocks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}', |
| 143 | + summary: 'PostgreSQL has a high number of deadlocks.', |
| 144 | + }, |
| 145 | + expr: ||| |
| 146 | + max without(pod, instance) (rate(pg_stat_database_deadlocks{%(dbNameFilter)s}[5m]) * 60) > 5 |
| 147 | + ||| % $._config, |
| 148 | + 'for': '5m', |
| 149 | + labels: { |
| 150 | + severity: 'warning', |
| 151 | + }, |
| 152 | + }, |
| 153 | + { |
| 154 | + alert: 'PostgresAcquiredTooManyLocks', |
| 155 | + annotations: { |
| 156 | + description: 'PostgreSQL has acquired too many locks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}', |
| 157 | + summary: 'PostgreSQL has a high number of acquired locks.', |
| 158 | + }, |
| 159 | + expr: ||| |
| 160 | + max by( server, job, datname, namespace) ((pg_locks_count{%(dbNameFilter)s}) / |
| 161 | + on(instance, namespace) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20 |
| 162 | + ||| % $._config, |
| 163 | + 'for': '5m', |
| 164 | + labels: { |
| 165 | + severity: 'warning', |
| 166 | + }, |
| 167 | + }, |
| 168 | + { |
| 169 | + alert: 'PostgresReplicationLaggingMore1Hour', |
| 170 | + annotations: { |
| 171 | + description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.', |
| 172 | + summary: 'PostgreSQL replication lagging more than 1 hour.', |
| 173 | + }, |
| 174 | + expr: '(pg_replication_lag{} > 3600) and on (instance) (pg_replication_is_replica{} == 1)', |
| 175 | + 'for': '5m', |
| 176 | + labels: { |
| 177 | + severity: 'warning', |
| 178 | + }, |
| 179 | + }, |
| 180 | + { |
| 181 | + alert: 'PostgresHasReplicationSlotUsed', |
| 182 | + annotations: { |
| 183 | + description: '{{ $labels.instance }} has replication slots that are not used, which might lead to replication lag or data inconsistency.', |
| 184 | + summary: 'PostgreSQL has unused replication slots.', |
| 185 | + }, |
| 186 | + expr: 'pg_replication_slots_active{} == 0', |
| 187 | + 'for': '30m', |
| 188 | + labels: { |
| 189 | + severity: 'critical', |
| 190 | + }, |
| 191 | + }, |
| 192 | + { |
| 193 | + alert: 'PostgresReplicationRoleChanged', |
| 194 | + annotations: { |
| 195 | + description: '{{ $labels.instance }} replication role has changed. Verify if this is expected or if it indicates a failover.', |
| 196 | + summary: 'PostgreSQL replication role change detected.', |
| 197 | + }, |
| 198 | + expr: 'pg_replication_is_replica{} and changes(pg_replication_is_replica{}[1m]) > 0', |
| 199 | + labels: { |
| 200 | + severity: 'warning', |
| 201 | + }, |
| 202 | + }, |
| 203 | + { |
| 204 | + alert: 'PostgresHasExporterErrors', |
| 205 | + annotations: { |
| 206 | + description: '{{ $labels.instance }} exporter is experiencing errors. Verify exporter health and configuration.', |
| 207 | + summary: 'PostgreSQL exporter errors detected.', |
| 208 | + }, |
| 209 | + expr: 'pg_exporter_last_scrape_error{} > 0', |
| 210 | + 'for': '30m', |
| 211 | + labels: { |
| 212 | + severity: 'critical', |
| 213 | + }, |
| 214 | + }, |
| 215 | + { |
| 216 | + alert: 'PostgresTablesNotVaccumed', |
| 217 | + annotations: { |
| 218 | + description: '{{ $labels.instance }} tables have not been vacuumed within the last 10 hours, which may lead to performance degradation.', |
| 219 | + summary: 'PostgreSQL tables not vacuumed.', |
| 220 | + }, |
| 221 | + expr: ||| |
| 222 | + group without(pod, instance)( |
| 223 | + timestamp( |
| 224 | + pg_stat_user_tables_n_dead_tup{} > |
| 225 | + pg_stat_user_tables_n_live_tup{} |
| 226 | + * on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{} |
| 227 | + + on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{} |
| 228 | + ) |
| 229 | + < time() - 36000 |
| 230 | + ) |
| 231 | + |||, |
| 232 | + 'for': '30m', |
| 233 | + labels: { |
| 234 | + severity: 'critical', |
| 235 | + }, |
| 236 | + }, |
| 237 | + { |
| 238 | + alert: 'PostgresTooManyCheckpointsRequested', |
| 239 | + annotations: { |
| 240 | + description: '{{ $labels.instance }} is requesting too many checkpoints, which may lead to performance degradation.', |
| 241 | + summary: 'PostgreSQL is requesting too many checkpoints.', |
| 242 | + }, |
| 243 | + expr: ||| |
| 244 | + rate(pg_stat_bgwriter_checkpoints_timed_total{}[5m]) / |
| 245 | + (rate(pg_stat_bgwriter_checkpoints_timed_total{}[5m]) + rate(pg_stat_bgwriter_checkpoints_req_total{}[5m])) |
| 246 | + < 0.5 |
| 247 | + |||, |
| 248 | + 'for': '5m', |
| 249 | + labels: { |
| 250 | + severity: 'warning', |
| 251 | + }, |
| 252 | + }, |
123 | 253 | ],
|
124 | 254 | },
|
125 | 255 | ],
|
|
0 commit comments