Skip to content

PMM-12650 vacuum monitoring #166

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions cmd/postgres_exporter/postgres_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,59 @@ var builtinMetricMaps = map[string]intermediateMetricMap{
true,
0,
},
"pg_autovacuum_workers": {
map[string]ColumnMapping{
"duration": {GAUGE, "Duration in seconds that autovacuum is running for one table", nil, nil},
"mode": {LABEL, "Type of vacuum", nil, nil},
"database": {LABEL, "Name of database", nil, nil},
"relation": {LABEL, "Vacuumed relation", nil, nil},
"phase": {LABEL, "Vacuum phase", nil, nil},
"table_size": {GAUGE, "Table size", nil, nil},
"total_size": {GAUGE, "Total relation size", nil, nil},
"scanned": {GAUGE, "Bytes scanned by vacuum", nil, nil},
"vacuumed": {GAUGE, "Bytes vacuumed by vacuum", nil, nil},
"scanned_pct": {GAUGE, "Percentage scanned by vacuum", nil, nil},
"vacuumed_pct": {GAUGE, "Percentage vacuumed by vacuum", nil, nil},
"index_vacuum_count": {GAUGE, "Count of vacuumed indexes", nil, nil},
},
true,
0,
},
"pg_table_size": {
map[string]ColumnMapping{
"table_name": {LABEL, "Table name", nil, nil},
"bytes": {GAUGE, "Number of dead rows", nil, nil},
"xid_age": {GAUGE, "Relation xid age", nil, nil},
},
true,
0,
},
"pg_wraparound": {
map[string]ColumnMapping{
"oldest_current_xid": {GAUGE, "Oldest txid", nil, nil},
"percent_towards_wraparound": {GAUGE, "Percentage towards wraparound", nil, nil},
"percent_towards_emergency_autovacuum": {GAUGE, "Percentage towards emergency autovacuum", nil, nil},
},
true,
0,
},
"pg_autovacuum_disabled": {
map[string]ColumnMapping{
"relname": {LABEL, "Table name", nil, nil},
"xid_age": {GAUGE, "Relation age", nil, nil},
},
true,
0,
},
"pg_index_size": {
map[string]ColumnMapping{
"schema": {LABEL, "Table schema", nil, nil},
"index_name": {LABEL, "Table name", nil, nil},
"bytes": {GAUGE, "Number of dead rows", nil, nil},
},
true,
0,
},
"pg_stat_replication": {
map[string]ColumnMapping{
"procpid": {DISCARD, "Process ID of a WAL sender process", nil, semver.MustParseRange("<9.2.0")},
Expand Down Expand Up @@ -254,6 +307,7 @@ var builtinMetricMaps = map[string]intermediateMetricMap{
},
"pg_stat_activity": {
map[string]ColumnMapping{
"pid": {LABEL, "Process ID", nil, nil},
"datname": {LABEL, "Name of this database", nil, nil},
"state": {LABEL, "connection state", nil, semver.MustParseRange(">=9.2.0")},
"usename": {LABEL, "Name of the user logged into this backend", nil, nil},
Expand Down
84 changes: 82 additions & 2 deletions cmd/postgres_exporter/queries.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,85 @@ var queryOverrides = map[string][]OverrideQuery{
`,
},
},

"pg_autovacuum_workers": {
{
semver.MustParseRange(">=11.0.0"),
`
SELECT
EXTRACT(EPOCH FROM (now() - a.xact_start))::int AS duration,
CASE WHEN a.query ~*'^autovacuum.*to prevent wraparound' THEN 'wraparound' WHEN a.query ~*'^vacuum' THEN 'user' ELSE 'regular' END AS mode,
p.datname AS database, p.relid::regclass AS relation,p.phase,
p.heap_blks_total * current_setting('block_size')::int AS table_size,
pg_total_relation_size(relid) AS total_size,
p.heap_blks_scanned * current_setting('block_size')::int AS scanned,
p.heap_blks_vacuumed * current_setting('block_size')::int AS vacuumed,
round(100.0 * p.heap_blks_scanned / p.heap_blks_total, 0) AS scanned_pct,
round(100.0 * p.heap_blks_vacuumed / p.heap_blks_total, 0) AS vacuumed_pct,
p.index_vacuum_count
FROM pg_stat_progress_vacuum p JOIN pg_stat_activity a using (pid)
`,
},
},
"pg_wraparound": {
{
semver.MustParseRange(">=9.4.0"),
`
WITH max_age AS (
SELECT 2000000000 as max_old_xid
, setting AS autovacuum_freeze_max_age
FROM pg_catalog.pg_settings
WHERE name = 'autovacuum_freeze_max_age' )
, per_database_stats AS (
SELECT datname
, m.max_old_xid::int
, m.autovacuum_freeze_max_age::int
, age(d.datfrozenxid) AS oldest_current_xid
FROM pg_catalog.pg_database d
JOIN max_age m ON (true)
WHERE d.datallowconn )
SELECT max(oldest_current_xid) AS oldest_current_xid
, max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound
, max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovacuum
FROM per_database_stats
`,
},
},
"pg_table_size": {
{
semver.MustParseRange(">=9.4.0"),
`
SELECT nspname||'.'||relname AS table_name,pg_relation_size (C.oid) AS bytes,age(C.relfrozenxid) as xid_age
FROM pg_class C
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
WHERE nspname NOT IN ('pg_catalog','information_schema')
AND C.relkind not in ('i','v')
AND nspname !~ '^pg_toast'
`,
},
},
"pg_autovacuum_disabled": {
{
semver.MustParseRange(">=9.4.0"),
`
select n.nspname||'.'||c.relname,age(c.relfrozenxid) as xid_age from pg_class c
LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace)
where c.relkind = 'r' and 'autovacuum_enabled=off'=ANY(c.reloptions)
`,
},
},
"pg_index_size": {
{
semver.MustParseRange(">=9.4.0"),
`
SELECT relnamespace as schema,relname AS index_name,pg_relation_size (C .oid) AS bytes
FROM pg_class C
LEFT JOIN pg_namespace N ON (N.oid = C .relnamespace)
WHERE nspname NOT IN ('pg_catalog','information_schema')
AND C .relkind = 'i'
AND nspname !~ '^pg_toast'
`,
},
},
"pg_replication_slots": {
{
semver.MustParseRange(">=9.4.0 <10.0.0"),
Expand Down Expand Up @@ -125,6 +203,7 @@ var queryOverrides = map[string][]OverrideQuery{
SELECT
pg_database.datname,
tmp.state,
tmp2.pid,
tmp2.usename,
tmp2.application_name,
COALESCE(count,0) as count,
Expand All @@ -142,12 +221,13 @@ var queryOverrides = map[string][]OverrideQuery{
(
SELECT
datname,
pid,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I remember, this pid label creates high cardinality, which causes poor performance and high disk space usage.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@BupycHuk Are we going to try it and, if needed, discard it based on QA's results? Or discard it before merge?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's discard it before merging, because we've already faced this problem in the past.

state,
usename,
application_name,
count(*) AS count,
MAX(EXTRACT(EPOCH FROM now() - xact_start))::float AS max_tx_duration
FROM pg_stat_activity GROUP BY datname,state,usename,application_name) AS tmp2
FROM pg_stat_activity GROUP BY datname,state,usename,application_name,pid) AS tmp2
ON tmp.state = tmp2.state AND pg_database.datname = tmp2.datname
`,
},
Expand Down