Skip to content

Commit 8648fe2

Browse files
committed
add metric for per-cpu, per-stage timing
1 parent e34e276 commit 8648fe2

File tree

6 files changed

+151
-112
lines changed

6 files changed

+151
-112
lines changed

config/system-stats-monitor.json

Lines changed: 100 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -1,99 +1,102 @@
11
{
2-
"cpu": {
3-
"metricsConfigs": {
4-
"cpu/runnable_task_count": {
5-
"displayName": "cpu/runnable_task_count"
6-
},
7-
"cpu/usage_time": {
8-
"displayName": "cpu/usage_time"
9-
},
10-
"cpu/load_1m": {
11-
"displayName": "cpu/load_1m"
12-
},
13-
"cpu/load_5m": {
14-
"displayName": "cpu/load_5m"
15-
},
16-
"cpu/load_15m": {
17-
"displayName": "cpu/load_15m"
18-
},
19-
"system/processes_total": {
20-
"displayName": "system/processes_total"
21-
},
22-
"system/procs_running": {
23-
"displayName": "system/procs_running"
24-
},
25-
"system/procs_blocked": {
26-
"displayName": "system/procs_blocked"
27-
},
28-
"system/interrupts_total": {
29-
"displayName": "system/interrupts_total"
30-
}
31-
}
32-
},
33-
"disk": {
34-
"metricsConfigs": {
35-
"disk/io_time": {
36-
"displayName": "disk/io_time"
37-
},
38-
"disk/weighted_io": {
39-
"displayName": "disk/weighted_io"
40-
},
41-
"disk/avg_queue_len": {
42-
"displayName": "disk/avg_queue_len"
43-
},
44-
"disk/operation_count": {
45-
"displayName": "disk/operation_count"
46-
},
47-
"disk/merged_operation_count": {
48-
"displayName": "disk/merged_operation_count"
49-
},
50-
"disk/operation_bytes_count": {
51-
"displayName": "disk/operation_bytes_count"
52-
},
53-
"disk/operation_time": {
54-
"displayName": "disk/operation_time"
55-
},
56-
"disk/bytes_used": {
57-
"displayName": "disk/bytes_used"
58-
}
59-
},
60-
"includeRootBlk": true,
61-
"includeAllAttachedBlk": true,
62-
"lsblkTimeout": "5s"
63-
},
64-
"host": {
65-
"metricsConfigs": {
66-
"host/uptime": {
67-
"displayName": "host/uptime"
68-
}
69-
}
70-
},
71-
"memory": {
72-
"metricsConfigs": {
73-
"memory/bytes_used": {
74-
"displayName": "memory/bytes_used"
75-
},
76-
"memory/anonymous_used": {
77-
"displayName": "memory/anonymous_used"
78-
},
79-
"memory/page_cache_used": {
80-
"displayName": "memory/page_cache_used"
81-
},
82-
"memory/unevictable_used": {
83-
"displayName": "memory/unevictable_used"
84-
},
85-
"memory/dirty_used": {
86-
"displayName": "memory/dirty_used"
87-
}
88-
}
89-
},
90-
"osFeature": {
91-
"metricsConfigs": {
92-
"system/os_feature": {
93-
"displayName": "system/os_feature"
94-
}
95-
},
96-
"KnownModulesConfigPath": "config/guestosconfig/known-modules.json"
97-
},
98-
"invokeInterval": "60s"
2+
"cpu": {
3+
"metricsConfigs": {
4+
"cpu/load_15m": {
5+
"displayName": "cpu/load_15m"
6+
},
7+
"cpu/load_1m": {
8+
"displayName": "cpu/load_1m"
9+
},
10+
"cpu/load_5m": {
11+
"displayName": "cpu/load_5m"
12+
},
13+
"cpu/runnable_task_count": {
14+
"displayName": "cpu/runnable_task_count"
15+
},
16+
"cpu/usage_time": {
17+
"displayName": "cpu/usage_time"
18+
},
19+
"system/cpu_stat": {
20+
"displayName": "system/cpu_stat"
21+
},
22+
"system/interrupts_total": {
23+
"displayName": "system/interrupts_total"
24+
},
25+
"system/processes_total": {
26+
"displayName": "system/processes_total"
27+
},
28+
"system/procs_blocked": {
29+
"displayName": "system/procs_blocked"
30+
},
31+
"system/procs_running": {
32+
"displayName": "system/procs_running"
33+
}
34+
}
35+
},
36+
"disk": {
37+
"includeAllAttachedBlk": true,
38+
"includeRootBlk": true,
39+
"lsblkTimeout": "5s",
40+
"metricsConfigs": {
41+
"disk/avg_queue_len": {
42+
"displayName": "disk/avg_queue_len"
43+
},
44+
"disk/bytes_used": {
45+
"displayName": "disk/bytes_used"
46+
},
47+
"disk/io_time": {
48+
"displayName": "disk/io_time"
49+
},
50+
"disk/merged_operation_count": {
51+
"displayName": "disk/merged_operation_count"
52+
},
53+
"disk/operation_bytes_count": {
54+
"displayName": "disk/operation_bytes_count"
55+
},
56+
"disk/operation_count": {
57+
"displayName": "disk/operation_count"
58+
},
59+
"disk/operation_time": {
60+
"displayName": "disk/operation_time"
61+
},
62+
"disk/weighted_io": {
63+
"displayName": "disk/weighted_io"
64+
}
65+
}
66+
},
67+
"host": {
68+
"metricsConfigs": {
69+
"host/uptime": {
70+
"displayName": "host/uptime"
71+
}
72+
}
73+
},
74+
"invokeInterval": "60s",
75+
"memory": {
76+
"metricsConfigs": {
77+
"memory/anonymous_used": {
78+
"displayName": "memory/anonymous_used"
79+
},
80+
"memory/bytes_used": {
81+
"displayName": "memory/bytes_used"
82+
},
83+
"memory/dirty_used": {
84+
"displayName": "memory/dirty_used"
85+
},
86+
"memory/page_cache_used": {
87+
"displayName": "memory/page_cache_used"
88+
},
89+
"memory/unevictable_used": {
90+
"displayName": "memory/unevictable_used"
91+
}
92+
}
93+
},
94+
"osFeature": {
95+
"KnownModulesConfigPath": "config/guestosconfig/known-modules.json",
96+
"metricsConfigs": {
97+
"system/os_feature": {
98+
"displayName": "system/os_feature"
99+
}
100+
}
101+
}
99102
}

pkg/exporters/stackdriver/stackdriver_exporter.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
7373
metrics.SystemProcsRunning: "kubernetes.io/internal/node/guest/system/procs_running",
7474
metrics.SystemProcsBlocked: "kubernetes.io/internal/node/guest/system/procs_blocked",
7575
metrics.SystemInterruptsTotal: "kubernetes.io/internal/node/guest/system/interrupts_total",
76+
metrics.SystemCPUStat: "kubernetes.io/internal/node/guest/system/cpu_stat",
7677
metrics.NetDevRxBytes: "kubernetes.io/internal/node/guest/net/rx_bytes",
7778
metrics.NetDevRxPackets: "kubernetes.io/internal/node/guest/net/rx_packets",
7879
metrics.NetDevRxErrors: "kubernetes.io/internal/node/guest/net/rx_errors",

pkg/systemstatsmonitor/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Below metrics are collected from `cpu` component:
3232
* `system/procs_running`: Number of processes currently running.
3333
* `system/procs_blocked`: Number of processes currently blocked.
3434
* `system/interrupts_total`: Total number of interrupts serviced (cumulative).
35+
* `system/cpu_stat`: Cumulative time each CPU spent in various stages. Collected from `/proc/stat`. Has the labels `cpu` and `stage`.
3536

3637
[/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html
3738

pkg/systemstatsmonitor/cpu_collector.go

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ limitations under the License.
1717
package systemstatsmonitor
1818

1919
import (
20+
"fmt"
21+
2022
"github.com/golang/glog"
2123
"github.com/prometheus/procfs"
2224
"github.com/shirou/gopsutil/cpu"
@@ -50,6 +52,7 @@ type cpuCollector struct {
5052
mSystemProcsRunning *metrics.Int64Metric
5153
mSystemProcsBlocked *metrics.Int64Metric
5254
mSystemInterruptsTotal *metrics.Int64Metric
55+
mSystemCPUStat *metrics.Float64Metric // per-cpu time from /proc/stat
5356

5457
config *ssmtypes.CPUStatsConfig
5558

@@ -63,13 +66,13 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
6366
if err != nil {
6467
glog.Fatalf("Failed to retrieve kernel version: %v", err)
6568
}
66-
cc.tags["kernel_version"] = kernelVersion
69+
cc.tags[kernelVersionLabel] = kernelVersion
6770

6871
osVersion, err := util.GetOSVersion()
6972
if err != nil {
7073
glog.Fatalf("Failed to retrieve OS version: %v", err)
7174
}
72-
cc.tags["os_version"] = osVersion
75+
cc.tags[osVersionLabel] = osVersion
7376

7477
cc.mRunnableTaskCount, err = metrics.NewFloat64Metric(
7578
metrics.CPURunnableTaskCountID,
@@ -170,6 +173,17 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
170173
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemInterruptsTotal, err)
171174
}
172175

176+
cc.mSystemCPUStat, err = metrics.NewFloat64Metric(
177+
metrics.SystemCPUStat,
178+
cpuConfig.MetricsConfigs[string(metrics.SystemCPUStat)].DisplayName,
179+
"Cumulative time each cpu spent in various stages.",
180+
"ns",
181+
metrics.Sum,
182+
[]string{osVersionLabel, kernelVersionLabel, cpuLabel, stageLabel})
183+
if err != nil {
184+
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemCPUStat, err)
185+
}
186+
173187
cc.lastUsageTime = make(map[string]float64)
174188

175189
return &cc
@@ -238,19 +252,6 @@ func (cc *cpuCollector) recordUsage() {
238252
}
239253

240254
func (cc *cpuCollector) recordSystemStats() {
241-
if cc.mSystemProcessesTotal == nil {
242-
return
243-
}
244-
if cc.mSystemProcsRunning == nil {
245-
return
246-
}
247-
if cc.mSystemProcsBlocked == nil {
248-
return
249-
}
250-
if cc.mSystemInterruptsTotal == nil {
251-
return
252-
}
253-
254255
fs, err := procfs.NewFS("/proc")
255256
stats, err := fs.Stat()
256257
if err != nil {
@@ -262,6 +263,32 @@ func (cc *cpuCollector) recordSystemStats() {
262263
cc.mSystemProcsRunning.Record(cc.tags, int64(stats.ProcessesRunning))
263264
cc.mSystemProcsBlocked.Record(cc.tags, int64(stats.ProcessesBlocked))
264265
cc.mSystemInterruptsTotal.Record(cc.tags, int64(stats.IRQTotal))
266+
267+
for i, c := range stats.CPU {
268+
tags := cc.tags
269+
tags[cpuLabel] = fmt.Sprintf("cpu%d", i)
270+
271+
tags[stageLabel] = "user"
272+
cc.mSystemCPUStat.Record(tags, c.User)
273+
tags[stageLabel] = "nice"
274+
cc.mSystemCPUStat.Record(tags, c.Nice)
275+
tags[stageLabel] = "system"
276+
cc.mSystemCPUStat.Record(tags, c.System)
277+
tags[stageLabel] = "idle"
278+
cc.mSystemCPUStat.Record(tags, c.Idle)
279+
tags[stageLabel] = "iowait"
280+
cc.mSystemCPUStat.Record(tags, c.Iowait)
281+
tags[stageLabel] = "iRQ"
282+
cc.mSystemCPUStat.Record(tags, c.IRQ)
283+
tags[stageLabel] = "softIRQ"
284+
cc.mSystemCPUStat.Record(tags, c.SoftIRQ)
285+
tags[stageLabel] = "steal"
286+
cc.mSystemCPUStat.Record(tags, c.Steal)
287+
tags[stageLabel] = "guest"
288+
cc.mSystemCPUStat.Record(tags, c.Guest)
289+
tags[stageLabel] = "guestNice"
290+
cc.mSystemCPUStat.Record(tags, c.GuestNice)
291+
}
265292
}
266293

267294
func (cc *cpuCollector) collect() {

pkg/systemstatsmonitor/labels.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,9 @@ const kernelVersionLabel = "kernel_version"
4545

4646
// interfaceNameLabel labels the network interface name
4747
const interfaceNameLabel = "interface_name"
48+
49+
// cpuLabel labels the CPU (e.g. "cpu0")
50+
const cpuLabel = "cpu"
51+
52+
// stageLabel labels the kernel-reported stage (e.g. "user", "idle") in which the CPU time was spent
53+
const stageLabel = "stage"

pkg/util/metrics/metric.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ const (
4646
SystemProcsRunning MetricID = "system/procs_running"
4747
SystemProcsBlocked MetricID = "system/procs_blocked"
4848
SystemInterruptsTotal MetricID = "system/interrupts_total"
49+
SystemCPUStat MetricID = "system/cpu_stat"
4950
NetDevRxBytes MetricID = "net/rx_bytes"
5051
NetDevRxPackets MetricID = "net/rx_packets"
5152
NetDevRxErrors MetricID = "net/rx_errors"

0 commit comments

Comments
 (0)