|
7 | 7 | {
|
8 | 8 | alert: 'PostgreSQLMaxConnectionsReached',
|
9 | 9 | annotations: {
|
10 |
| - description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy.', |
| 10 | + description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy).', // $value is a connection count, so it carries no unit suffix |
11 | 11 | summary: 'Postgres connections count is over the maximum amount.',
|
12 | 12 | },
|
13 | 13 | expr: |||
|
14 |
| - sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
| 14 | + sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
15 | 15 | >=
|
16 |
| - sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
| 16 | + sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
17 | 17 | -
|
18 |
| - sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
19 |
| - ||| % $._config, |
| 18 | + sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
| 19 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
20 | 20 | 'for': '1m',
|
21 | 21 | labels: {
|
22 | 22 | severity: 'warning',
|
|
29 | 29 | summary: 'Postgres connections count is over 80% of maximum amount.',
|
30 | 30 | },
|
31 | 31 | expr: |||
|
32 |
| - sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
| 32 | + sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
33 | 33 | >
|
34 | 34 | (
|
35 |
| - sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
| 35 | + sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
36 | 36 | -
|
37 |
| - sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
| 37 | + sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
38 | 38 | ) * 0.8
|
39 |
| - ||| % $._config, |
| 39 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
40 | 40 | 'for': '10m',
|
41 | 41 | labels: {
|
42 | 42 | severity: 'warning',
|
|
61 | 61 | summary: 'PostgreSQL high number of slow queries.',
|
62 | 62 | },
|
63 | 63 | expr: |||
|
64 |
| - avg by (datname) ( |
| 64 | + avg by (datname, %(agg)s) ( |
65 | 65 | rate (
|
66 |
| - pg_stat_activity_max_tx_duration{%(dbNameFilter)s,%(postgresExporterSelector)s}[2m] |
| 66 | + pg_stat_activity_max_tx_duration{%(dbNameFilter)s, %(postgresExporterSelector)s}[2m] |
67 | 67 | )
|
68 | 68 | ) > 2 * 60
|
69 |
| - ||| % $._config, |
| 69 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
70 | 70 | 'for': '2m',
|
71 | 71 | labels: {
|
72 | 72 | severity: 'warning',
|
|
79 | 79 | summary: 'PostgreSQL high number of queries per second.',
|
80 | 80 | },
|
81 | 81 | expr: |||
|
82 |
| - avg by (datname) ( |
| 82 | + avg by (datname, %(agg)s) ( |
83 | 83 | irate(
|
84 |
| - pg_stat_database_xact_commit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
| 84 | + pg_stat_database_xact_commit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] |
85 | 85 | )
|
86 | 86 | +
|
87 | 87 | irate(
|
88 |
| - pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
| 88 | + pg_stat_database_xact_rollback{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] |
89 | 89 | )
|
90 | 90 | ) > 10000
|
91 |
| - ||| % $._config, |
| 91 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
92 | 92 | 'for': '5m',
|
93 | 93 | labels: {
|
94 | 94 | severity: 'warning',
|
|
101 | 101 | summary: 'PostgreSQL low cache hit rate.',
|
102 | 102 | },
|
103 | 103 | expr: |||
|
104 |
| - avg by (datname) ( |
105 |
| - rate(pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]) |
| 104 | + avg by (datname, %(agg)s) ( |
| 105 | + rate(pg_stat_database_blks_hit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]) |
106 | 106 | /
|
107 | 107 | (
|
108 | 108 | rate(
|
109 |
| - pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
| 109 | + pg_stat_database_blks_hit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] |
110 | 110 | )
|
111 | 111 | +
|
112 | 112 | rate(
|
113 |
| - pg_stat_database_blks_read{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
| 113 | + pg_stat_database_blks_read{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] |
114 | 114 | )
|
115 | 115 | )
|
116 | 116 | ) < 0.98
|
117 |
| - ||| % $._config, |
| 117 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
118 | 118 | 'for': '5m',
|
119 | 119 | labels: {
|
120 | 120 | severity: 'warning',
|
|
157 | 157 | summary: 'PostgreSQL has high number of acquired locks.',
|
158 | 158 | },
|
159 | 159 | expr: |||
|
160 |
| - max by( server, job, datname, namespace) ((pg_locks_count{%(dbNameFilter)s}) / |
161 |
| - on(instance, namespace) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20 |
162 |
| - ||| % $._config, |
| 160 | + max by(datname, %(agg)s) ( |
| 161 | + (pg_locks_count{%(dbNameFilter)s}) |
| 162 | + / |
| 163 | + on(%(aggWithoutServer)s) group_left(server) ( |
| 164 | + pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{} |
| 165 | + ) |
| 166 | + ) > 0.20 |
| 167 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels), aggWithoutServer: std.join(', ', std.filter(function(x) x != 'server', $._config.groupLabels + $._config.instanceLabels)) }, // 'server' is filtered out of the on(...) labels because it is the label that group_left(server) carries over |
163 | 168 | 'for': '5m',
|
164 | 169 | labels: {
|
165 | 170 | severity: 'warning',
|
|
171 | 176 | description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.',
|
172 | 177 | summary: 'PostgreSQL replication lagging more than 1 hour.',
|
173 | 178 | },
|
174 |
| - expr: '(pg_replication_lag{} > 3600) and on (instance) (pg_replication_is_replica{} == 1)', |
| 179 | + expr: ||| |
| 180 | + (pg_replication_lag{} > 3600) and on (%(agg)s) (pg_replication_is_replica{} == 1) |
| 181 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
175 | 182 | 'for': '5m',
|
176 | 183 | labels: {
|
177 | 184 | severity: 'warning',
|
|
223 | 230 | timestamp(
|
224 | 231 | pg_stat_user_tables_n_dead_tup{} >
|
225 | 232 | pg_stat_user_tables_n_live_tup{}
|
226 |
| - * on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{} |
227 |
| - + on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{} |
| 233 | + * on(%(agg)s) group_left pg_settings_autovacuum_vacuum_scale_factor{} |
| 234 | + + on(%(agg)s) group_left pg_settings_autovacuum_vacuum_threshold{} |
228 | 235 | )
|
229 | 236 | < time() - 36000
|
230 | 237 | )
|
231 |
| - |||, |
| 238 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
232 | 239 | 'for': '30m',
|
233 | 240 | labels: {
|
234 | 241 | severity: 'critical',
|
|
0 commit comments