Skip to content

Commit 30babe9

Browse files
authored
Merge pull request #303 from xueweiz/self
Implement host collector as part of system-stats-monitor
2 parents 146dfd7 + 4944ac3 commit 30babe9

File tree

122 files changed

+21553
-37
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

122 files changed

+21553
-37
lines changed

config/system-stats-monitor.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,12 @@
1515
"includeAllAttachedBlk": true,
1616
"lsblkTimeout": "5s"
1717
},
18+
"host": {
19+
"metricsConfigs": {
20+
"host/uptime": {
21+
"displayName": "host/uptime"
22+
}
23+
}
24+
},
1825
"invokeInterval": "60s"
1926
}

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ require (
99
github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2 // indirect
1010
github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705 // indirect
1111
github.com/beorn7/perks v1.0.0 // indirect
12+
github.com/cobaugh/osrelease v0.0.0-20181218015638-a93a0a55a249
1213
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e
1314
github.com/coreos/pkg v0.0.0-20160620232715-fa29b1d70f0b // indirect
1415
github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633 // indirect
@@ -39,6 +40,7 @@ require (
3940
github.com/prometheus/common v0.3.0 // indirect
4041
github.com/prometheus/procfs v0.0.0-20190425082905-87a4384529e0 // indirect
4142
github.com/shirou/gopsutil v2.18.12+incompatible
43+
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 // indirect
4244
github.com/sigma/go-inotify v0.0.0-20181102212354-c87b6cf5033d // indirect
4345
github.com/spf13/pflag v1.0.3
4446
github.com/stretchr/testify v1.3.0

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24
1616
github.com/beorn7/perks v1.0.0 h1:HWo1m869IqiPhD389kmkxeTalrjNbbJTC8LXupb+sl0=
1717
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
1818
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
19+
github.com/cobaugh/osrelease v0.0.0-20181218015638-a93a0a55a249 h1:R0IDH8daQ3lODvu8YtxnIqqth5qMGCJyADoUQvmLx4o=
20+
github.com/cobaugh/osrelease v0.0.0-20181218015638-a93a0a55a249/go.mod h1:EHKW9yNEYSBpTKzuu7Y9oOrft/UlzH57rMIB03oev6M=
1921
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e h1:Wf6HqHfScWJN9/ZjdUKyjop4mf3Qdd+1TvvltAvM3m8=
2022
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
2123
github.com/coreos/pkg v0.0.0-20160620232715-fa29b1d70f0b h1:IqgHacj6F3QnV+0H9PXFWAmML5HdxkZakBQgZgfD+FU=
@@ -118,6 +120,8 @@ github.com/prometheus/procfs v0.0.0-20190425082905-87a4384529e0 h1:c8R11WC8m7KNM
118120
github.com/prometheus/procfs v0.0.0-20190425082905-87a4384529e0/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
119121
github.com/shirou/gopsutil v2.18.12+incompatible h1:1eaJvGomDnH74/5cF4CTmTbLHAriGFsTZppLXDX93OM=
120122
github.com/shirou/gopsutil v2.18.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
123+
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 h1:udFKJ0aHUL60LboW/A+DfgoHVedieIzIXE8uylPue0U=
124+
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc=
121125
github.com/sigma/go-inotify v0.0.0-20181102212354-c87b6cf5033d h1:G1nNtZVTzcCvVKMwcG0Vispo3bhc15EbjO5uamiLikI=
122126
github.com/sigma/go-inotify v0.0.0-20181102212354-c87b6cf5033d/go.mod h1:stlh9OsqBQSdwxTxX73mu41BBtRbIpZLQ7flcAoxAfo=
123127
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=

pkg/systemstatsmonitor/disk_collector.go

Lines changed: 58 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ import (
2727
"go.opencensus.io/stats"
2828
"go.opencensus.io/stats/view"
2929
"go.opencensus.io/tag"
30+
3031
ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
32+
"k8s.io/node-problem-detector/pkg/util/metrics"
3133
)
3234

3335
type diskCollector struct {
@@ -44,28 +46,39 @@ type diskCollector struct {
4446

4547
func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector {
4648
dc := diskCollector{config: diskConfig}
47-
dc.keyDevice, _ = tag.NewKey("device")
48-
49-
dc.mIOTime = newInt64Metric(
50-
diskConfig.MetricsConfigs["disk/io_time"].DisplayName,
51-
"The IO time spent on the disk",
52-
"second",
53-
view.LastValue(),
54-
[]tag.Key{dc.keyDevice})
55-
56-
dc.mWeightedIO = newInt64Metric(
57-
diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName,
58-
"The weighted IO on the disk",
59-
"second",
60-
view.LastValue(),
61-
[]tag.Key{dc.keyDevice})
62-
63-
dc.mAvgQueueLen = newFloat64Metric(
64-
diskConfig.MetricsConfigs["disk/avg_queue_len"].DisplayName,
65-
"The average queue length on the disk",
66-
"second",
67-
view.LastValue(),
68-
[]tag.Key{dc.keyDevice})
49+
50+
var err error
51+
dc.keyDevice, err = tag.NewKey("device")
52+
if err != nil {
53+
glog.Fatalf("Failed to create device tag during initializing disk collector: %v", err)
54+
}
55+
56+
if diskConfig.MetricsConfigs["disk/io_time"].DisplayName != "" {
57+
dc.mIOTime = metrics.NewInt64Metric(
58+
diskConfig.MetricsConfigs["disk/io_time"].DisplayName,
59+
"The IO time spent on the disk",
60+
"second",
61+
view.LastValue(),
62+
[]tag.Key{dc.keyDevice})
63+
}
64+
65+
if diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName != "" {
66+
dc.mWeightedIO = metrics.NewInt64Metric(
67+
diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName,
68+
"The weighted IO on the disk",
69+
"second",
70+
view.LastValue(),
71+
[]tag.Key{dc.keyDevice})
72+
}
73+
74+
if diskConfig.MetricsConfigs["disk/avg_queue_len"].DisplayName != "" {
75+
dc.mAvgQueueLen = metrics.NewFloat64Metric(
76+
diskConfig.MetricsConfigs["disk/avg_queue_len"].DisplayName,
77+
"The average queue length on the disk",
78+
"second",
79+
view.LastValue(),
80+
[]tag.Key{dc.keyDevice})
81+
}
6982

7083
dc.historyIOTime = make(map[string]uint64)
7184
dc.historyWeightedIO = make(map[string]uint64)
@@ -86,7 +99,11 @@ func (dc *diskCollector) collect() {
8699
blks = append(blks, listAttachedBlockDevices()...)
87100
}
88101

89-
ioCountersStats, _ := disk.IOCounters(blks...)
102+
ioCountersStats, err := disk.IOCounters(blks...)
103+
if err != nil {
104+
glog.Errorf("Failed to retrieve disk IO counters: %v", err)
105+
return
106+
}
90107

91108
for deviceName, ioCountersStat := range ioCountersStats {
92109
// Calculate average IO queue length since last measurement.
@@ -96,21 +113,26 @@ func (dc *diskCollector) collect() {
96113
dc.historyIOTime[deviceName] = ioCountersStat.IoTime
97114
dc.historyWeightedIO[deviceName] = ioCountersStat.WeightedIO
98115

99-
avg_queue_len := float64(0.0)
116+
avgQueueLen := float64(0.0)
100117
if lastIOTime != ioCountersStat.IoTime {
101-
avg_queue_len = float64(ioCountersStat.WeightedIO-lastWeightedIO) / float64(ioCountersStat.IoTime-lastIOTime)
118+
avgQueueLen = float64(ioCountersStat.WeightedIO-lastWeightedIO) / float64(ioCountersStat.IoTime-lastIOTime)
102119
}
103120

104121
// Attach label {"device": deviceName} to the metrics.
105-
device_ctx, _ := tag.New(context.Background(), tag.Upsert(dc.keyDevice, deviceName))
122+
deviceCtx, err := tag.New(context.Background(), tag.Upsert(dc.keyDevice, deviceName))
123+
if err != nil {
124+
glog.Errorf("Failed to create context with device tag: %v", err)
125+
deviceCtx = context.Background()
126+
}
127+
106128
if dc.mIOTime != nil {
107-
stats.Record(device_ctx, dc.mIOTime.M(int64(ioCountersStat.IoTime)))
129+
stats.Record(deviceCtx, dc.mIOTime.M(int64(ioCountersStat.IoTime)))
108130
}
109131
if dc.mWeightedIO != nil {
110-
stats.Record(device_ctx, dc.mWeightedIO.M(int64(ioCountersStat.WeightedIO)))
132+
stats.Record(deviceCtx, dc.mWeightedIO.M(int64(ioCountersStat.WeightedIO)))
111133
}
112134
if dc.mAvgQueueLen != nil {
113-
stats.Record(device_ctx, dc.mAvgQueueLen.M(avg_queue_len))
135+
stats.Record(deviceCtx, dc.mAvgQueueLen.M(avgQueueLen))
114136
}
115137
}
116138
}
@@ -133,8 +155,14 @@ func listRootBlockDevices(timeout time.Duration) []string {
133155

134156
// listAttachedBlockDevices lists all currently attached block devices.
135157
func listAttachedBlockDevices() []string {
136-
partitions, _ := disk.Partitions(false)
137158
blks := []string{}
159+
160+
partitions, err := disk.Partitions(false)
161+
if err != nil {
162+
glog.Errorf("Failed to retrieve the list of disk partitions: %v", err)
163+
return blks
164+
}
165+
138166
for _, partition := range partitions {
139167
blks = append(blks, partition.Device)
140168
}
pkg/systemstatsmonitor/host_collector.go

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/*
2+
Copyright 2019 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package systemstatsmonitor
18+
19+
import (
20+
"context"
21+
22+
"github.com/golang/glog"
23+
"github.com/shirou/gopsutil/host"
24+
"go.opencensus.io/stats"
25+
"go.opencensus.io/stats/view"
26+
"go.opencensus.io/tag"
27+
28+
ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
29+
"k8s.io/node-problem-detector/pkg/util"
30+
"k8s.io/node-problem-detector/pkg/util/metrics"
31+
)
32+
33+
type hostCollector struct {
34+
tags []tag.Mutator
35+
uptime *stats.Int64Measure
36+
}
37+
38+
func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector {
39+
hc := hostCollector{}
40+
41+
keyKernelVersion, err := tag.NewKey("kernel_version")
42+
if err != nil {
43+
glog.Fatalf("Failed to create kernel_version tag during initializing host collector: %v", err)
44+
}
45+
kernelVersion, err := host.KernelVersion()
46+
if err != nil {
47+
glog.Fatalf("Failed to retrieve kernel version: %v", err)
48+
}
49+
hc.tags = append(hc.tags, tag.Upsert(keyKernelVersion, kernelVersion))
50+
51+
keyOSVersion, err := tag.NewKey("os_version")
52+
if err != nil {
53+
glog.Fatalf("Failed to create os_version tag during initializing host collector: %v", err)
54+
}
55+
osVersion, err := util.GetOSVersion()
56+
if err != nil {
57+
glog.Fatalf("Failed to retrieve OS version: %v", err)
58+
}
59+
hc.tags = append(hc.tags, tag.Upsert(keyOSVersion, osVersion))
60+
61+
if hostConfig.MetricsConfigs["host/uptime"].DisplayName != "" {
62+
hc.uptime = metrics.NewInt64Metric(
63+
hostConfig.MetricsConfigs["host/uptime"].DisplayName,
64+
"The uptime of the operating system",
65+
"second",
66+
view.LastValue(),
67+
[]tag.Key{keyKernelVersion, keyOSVersion})
68+
}
69+
70+
return &hc
71+
}
72+
73+
func (hc *hostCollector) collect() {
74+
if hc == nil {
75+
return
76+
}
77+
78+
uptime, err := host.Uptime()
79+
if err != nil {
80+
glog.Errorf("Failed to retrieve uptime of the host: %v", err)
81+
return
82+
}
83+
84+
if hc.uptime != nil {
85+
err := stats.RecordWithTags(context.Background(), hc.tags, hc.uptime.M(int64(uptime)))
86+
if err != nil {
87+
glog.Errorf("Failed to record current uptime (%d seconds) of the host: %v", uptime, err)
88+
}
89+
}
90+
}

pkg/systemstatsmonitor/system_stats_monitor.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"time"
2323

2424
"github.com/golang/glog"
25+
2526
"k8s.io/node-problem-detector/pkg/problemdaemon"
2627
ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
2728
"k8s.io/node-problem-detector/pkg/types"
@@ -39,6 +40,7 @@ func init() {
3940
type systemStatsMonitor struct {
4041
config ssmtypes.SystemStatsConfig
4142
diskCollector *diskCollector
43+
hostCollector *hostCollector
4244
tomb *tomb.Tomb
4345
}
4446

@@ -68,10 +70,12 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor {
6870
glog.Fatalf("Failed to validate configuration %+v: %v", ssm.config, err)
6971
}
7072

71-
// Initialize diskCollector if needed.
7273
if len(ssm.config.DiskConfig.MetricsConfigs) > 0 {
7374
ssm.diskCollector = NewDiskCollectorOrDie(&ssm.config.DiskConfig)
7475
}
76+
if len(ssm.config.HostConfig.MetricsConfigs) > 0 {
77+
ssm.hostCollector = NewHostCollectorOrDie(&ssm.config.HostConfig)
78+
}
7579
return &ssm
7680
}
7781

@@ -93,12 +97,14 @@ func (ssm *systemStatsMonitor) monitorLoop() {
9397
return
9498
default:
9599
ssm.diskCollector.collect()
100+
ssm.hostCollector.collect()
96101
}
97102

98103
for {
99104
select {
100105
case <-runTicker.C:
101106
ssm.diskCollector.collect()
107+
ssm.hostCollector.collect()
102108
case <-ssm.tomb.Stopping():
103109
glog.Infof("System stats monitor stopped")
104110
return

pkg/systemstatsmonitor/types/config.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,13 @@ type DiskStatsConfig struct {
3838
LsblkTimeout time.Duration `json:"-"`
3939
}
4040

41+
type HostStatsConfig struct {
42+
MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"`
43+
}
44+
4145
type SystemStatsConfig struct {
4246
DiskConfig DiskStatsConfig `json:"disk"`
47+
HostConfig HostStatsConfig `json:"host"`
4348
InvokeIntervalString string `json:"invokeInterval"`
4449
InvokeInterval time.Duration `json:"-"`
4550
}

pkg/util/helpers.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,13 @@ import (
2020
"syscall"
2121
"time"
2222

23+
"github.com/cobaugh/osrelease"
24+
2325
"k8s.io/node-problem-detector/pkg/types"
2426
)
2527

28+
var osReleasePath = "/etc/os-release"
29+
2630
// GenerateConditionChangeEvent generates an event for condition change.
2731
func GenerateConditionChangeEvent(t string, status types.ConditionStatus, reason string, timestamp time.Time) types.Event {
2832
return types.Event{
@@ -70,3 +74,34 @@ func GetStartTime(now time.Time, uptimeDuration time.Duration, lookbackStr strin
7074

7175
return startTime, nil
7276
}
77+
78+
// GetOSVersion retrieves the version of the current operating system.
79+
// For example: "cos 77-12293.0.0", "ubuntu 16.04.6 LTS (Xenial Xerus)".
80+
func GetOSVersion() (string, error) {
81+
osReleaseMap, err := osrelease.ReadFile(osReleasePath)
82+
if err != nil {
83+
return "", err
84+
}
85+
switch osReleaseMap["ID"] {
86+
case "cos":
87+
return getCOSVersion(osReleaseMap), nil
88+
case "debian":
89+
return getDebianVersion(osReleaseMap), nil
90+
case "ubuntu":
91+
return getDebianVersion(osReleaseMap), nil
92+
default:
93+
return "", fmt.Errorf("Unsupported ID in /etc/os-release: %q", osReleaseMap["ID"])
94+
}
95+
}
96+
97+
// getCOSVersion formats the OS version of Container-Optimized OS as
// "<ID> <VERSION>-<BUILD_ID>" from the parsed os-release fields. Missing
// fields are rendered as empty strings.
func getCOSVersion(osReleaseMap map[string]string) string {
	// /etc/os-release syntax for COS is defined here:
	// https://chromium.googlesource.com/chromiumos/docs/+/8edec95a297edfd8f1290f0f03a8aa35795b516b/os_config.md
	id := osReleaseMap["ID"]
	release := osReleaseMap["VERSION"] + "-" + osReleaseMap["BUILD_ID"]
	return fmt.Sprintf("%s %s", id, release)
}
102+
103+
// getDebianVersion formats the OS version of Debian-style distributions as
// "<ID> <VERSION>" from the parsed os-release fields. Missing fields are
// rendered as empty strings.
func getDebianVersion(osReleaseMap map[string]string) string {
	// /etc/os-release syntax for Debian is defined here:
	// https://manpages.debian.org/testing/systemd/os-release.5.en.html
	id, version := osReleaseMap["ID"], osReleaseMap["VERSION"]
	return fmt.Sprintf("%s %s", id, version)
}

0 commit comments

Comments
 (0)