Skip to content

Commit 4944ac3

Browse files
author
Xuewei Zhang
committed
Implement host collector as part of system-stats-monitor
Host collector report three things today: 1. Host OS uptime (in seconds) 2. Host kernel version (as a metric label) 3. Host OS version (as a metric label)
1 parent ed16a29 commit 4944ac3

File tree

12 files changed

+292
-31
lines changed

12 files changed

+292
-31
lines changed

config/system-stats-monitor.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,12 @@
1515
"includeAllAttachedBlk": true,
1616
"lsblkTimeout": "5s"
1717
},
18+
"host": {
19+
"metricsConfigs": {
20+
"host/uptime": {
21+
"displayName": "host/uptime"
22+
}
23+
}
24+
},
1825
"invokeInterval": "60s"
1926
}

pkg/systemstatsmonitor/disk_collector.go

Lines changed: 56 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -46,28 +46,39 @@ type diskCollector struct {
4646

4747
func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector {
4848
dc := diskCollector{config: diskConfig}
49-
dc.keyDevice, _ = tag.NewKey("device")
50-
51-
dc.mIOTime = metrics.NewInt64Metric(
52-
diskConfig.MetricsConfigs["disk/io_time"].DisplayName,
53-
"The IO time spent on the disk",
54-
"second",
55-
view.LastValue(),
56-
[]tag.Key{dc.keyDevice})
57-
58-
dc.mWeightedIO = metrics.NewInt64Metric(
59-
diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName,
60-
"The weighted IO on the disk",
61-
"second",
62-
view.LastValue(),
63-
[]tag.Key{dc.keyDevice})
64-
65-
dc.mAvgQueueLen = metrics.NewFloat64Metric(
66-
diskConfig.MetricsConfigs["disk/avg_queue_len"].DisplayName,
67-
"The average queue length on the disk",
68-
"second",
69-
view.LastValue(),
70-
[]tag.Key{dc.keyDevice})
49+
50+
var err error
51+
dc.keyDevice, err = tag.NewKey("device")
52+
if err != nil {
53+
glog.Fatalf("Failed to create device tag during initializing disk collector: %v", err)
54+
}
55+
56+
if diskConfig.MetricsConfigs["disk/io_time"].DisplayName != "" {
57+
dc.mIOTime = metrics.NewInt64Metric(
58+
diskConfig.MetricsConfigs["disk/io_time"].DisplayName,
59+
"The IO time spent on the disk",
60+
"second",
61+
view.LastValue(),
62+
[]tag.Key{dc.keyDevice})
63+
}
64+
65+
if diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName != "" {
66+
dc.mWeightedIO = metrics.NewInt64Metric(
67+
diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName,
68+
"The weighted IO on the disk",
69+
"second",
70+
view.LastValue(),
71+
[]tag.Key{dc.keyDevice})
72+
}
73+
74+
if diskConfig.MetricsConfigs["disk/avg_queue_len"].DisplayName != "" {
75+
dc.mAvgQueueLen = metrics.NewFloat64Metric(
76+
diskConfig.MetricsConfigs["disk/avg_queue_len"].DisplayName,
77+
"The average queue length on the disk",
78+
"second",
79+
view.LastValue(),
80+
[]tag.Key{dc.keyDevice})
81+
}
7182

7283
dc.historyIOTime = make(map[string]uint64)
7384
dc.historyWeightedIO = make(map[string]uint64)
@@ -88,7 +99,11 @@ func (dc *diskCollector) collect() {
8899
blks = append(blks, listAttachedBlockDevices()...)
89100
}
90101

91-
ioCountersStats, _ := disk.IOCounters(blks...)
102+
ioCountersStats, err := disk.IOCounters(blks...)
103+
if err != nil {
104+
glog.Errorf("Failed to retrieve disk IO counters: %v", err)
105+
return
106+
}
92107

93108
for deviceName, ioCountersStat := range ioCountersStats {
94109
// Calculate average IO queue length since last measurement.
@@ -98,21 +113,26 @@ func (dc *diskCollector) collect() {
98113
dc.historyIOTime[deviceName] = ioCountersStat.IoTime
99114
dc.historyWeightedIO[deviceName] = ioCountersStat.WeightedIO
100115

101-
avg_queue_len := float64(0.0)
116+
avgQueueLen := float64(0.0)
102117
if lastIOTime != ioCountersStat.IoTime {
103-
avg_queue_len = float64(ioCountersStat.WeightedIO-lastWeightedIO) / float64(ioCountersStat.IoTime-lastIOTime)
118+
avgQueueLen = float64(ioCountersStat.WeightedIO-lastWeightedIO) / float64(ioCountersStat.IoTime-lastIOTime)
104119
}
105120

106121
// Attach label {"device": deviceName} to the metrics.
107-
device_ctx, _ := tag.New(context.Background(), tag.Upsert(dc.keyDevice, deviceName))
122+
deviceCtx, err := tag.New(context.Background(), tag.Upsert(dc.keyDevice, deviceName))
123+
if err != nil {
124+
glog.Errorf("Failed to create context with device tag: %v", err)
125+
deviceCtx = context.Background()
126+
}
127+
108128
if dc.mIOTime != nil {
109-
stats.Record(device_ctx, dc.mIOTime.M(int64(ioCountersStat.IoTime)))
129+
stats.Record(deviceCtx, dc.mIOTime.M(int64(ioCountersStat.IoTime)))
110130
}
111131
if dc.mWeightedIO != nil {
112-
stats.Record(device_ctx, dc.mWeightedIO.M(int64(ioCountersStat.WeightedIO)))
132+
stats.Record(deviceCtx, dc.mWeightedIO.M(int64(ioCountersStat.WeightedIO)))
113133
}
114134
if dc.mAvgQueueLen != nil {
115-
stats.Record(device_ctx, dc.mAvgQueueLen.M(avg_queue_len))
135+
stats.Record(deviceCtx, dc.mAvgQueueLen.M(avgQueueLen))
116136
}
117137
}
118138
}
@@ -135,8 +155,14 @@ func listRootBlockDevices(timeout time.Duration) []string {
135155

136156
// listAttachedBlockDevices lists all currently attached block devices.
137157
func listAttachedBlockDevices() []string {
138-
partitions, _ := disk.Partitions(false)
139158
blks := []string{}
159+
160+
partitions, err := disk.Partitions(false)
161+
if err != nil {
162+
glog.Errorf("Failed to retrieve the list of disk partitions: %v", err)
163+
return blks
164+
}
165+
140166
for _, partition := range partitions {
141167
blks = append(blks, partition.Device)
142168
}
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/*
2+
Copyright 2019 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package systemstatsmonitor
18+
19+
import (
20+
"context"
21+
22+
"github.com/golang/glog"
23+
"github.com/shirou/gopsutil/host"
24+
"go.opencensus.io/stats"
25+
"go.opencensus.io/stats/view"
26+
"go.opencensus.io/tag"
27+
28+
ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
29+
"k8s.io/node-problem-detector/pkg/util"
30+
"k8s.io/node-problem-detector/pkg/util/metrics"
31+
)
32+
33+
type hostCollector struct {
34+
tags []tag.Mutator
35+
uptime *stats.Int64Measure
36+
}
37+
38+
func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector {
39+
hc := hostCollector{}
40+
41+
keyKernelVersion, err := tag.NewKey("kernel_version")
42+
if err != nil {
43+
glog.Fatalf("Failed to create kernel_version tag during initializing host collector: %v", err)
44+
}
45+
kernelVersion, err := host.KernelVersion()
46+
if err != nil {
47+
glog.Fatalf("Failed to retrieve kernel version: %v", err)
48+
}
49+
hc.tags = append(hc.tags, tag.Upsert(keyKernelVersion, kernelVersion))
50+
51+
keyOSVersion, err := tag.NewKey("os_version")
52+
if err != nil {
53+
glog.Fatalf("Failed to create os_version tag during initializing host collector: %v", err)
54+
}
55+
osVersion, err := util.GetOSVersion()
56+
if err != nil {
57+
glog.Fatalf("Failed to retrieve OS version: %v", err)
58+
}
59+
hc.tags = append(hc.tags, tag.Upsert(keyOSVersion, osVersion))
60+
61+
if hostConfig.MetricsConfigs["host/uptime"].DisplayName != "" {
62+
hc.uptime = metrics.NewInt64Metric(
63+
hostConfig.MetricsConfigs["host/uptime"].DisplayName,
64+
"The uptime of the operating system",
65+
"second",
66+
view.LastValue(),
67+
[]tag.Key{keyKernelVersion, keyOSVersion})
68+
}
69+
70+
return &hc
71+
}
72+
73+
func (hc *hostCollector) collect() {
74+
if hc == nil {
75+
return
76+
}
77+
78+
uptime, err := host.Uptime()
79+
if err != nil {
80+
glog.Errorf("Failed to retrieve uptime of the host: %v", err)
81+
return
82+
}
83+
84+
if hc.uptime != nil {
85+
err := stats.RecordWithTags(context.Background(), hc.tags, hc.uptime.M(int64(uptime)))
86+
if err != nil {
87+
glog.Errorf("Failed to record current uptime (%d seconds) of the host: %v", uptime, err)
88+
}
89+
}
90+
}

pkg/systemstatsmonitor/system_stats_monitor.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ func init() {
4040
type systemStatsMonitor struct {
4141
config ssmtypes.SystemStatsConfig
4242
diskCollector *diskCollector
43+
hostCollector *hostCollector
4344
tomb *tomb.Tomb
4445
}
4546

@@ -69,10 +70,12 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor {
6970
glog.Fatalf("Failed to validate configuration %+v: %v", ssm.config, err)
7071
}
7172

72-
// Initialize diskCollector if needed.
7373
if len(ssm.config.DiskConfig.MetricsConfigs) > 0 {
7474
ssm.diskCollector = NewDiskCollectorOrDie(&ssm.config.DiskConfig)
7575
}
76+
if len(ssm.config.HostConfig.MetricsConfigs) > 0 {
77+
ssm.hostCollector = NewHostCollectorOrDie(&ssm.config.HostConfig)
78+
}
7679
return &ssm
7780
}
7881

@@ -94,12 +97,14 @@ func (ssm *systemStatsMonitor) monitorLoop() {
9497
return
9598
default:
9699
ssm.diskCollector.collect()
100+
ssm.hostCollector.collect()
97101
}
98102

99103
for {
100104
select {
101105
case <-runTicker.C:
102106
ssm.diskCollector.collect()
107+
ssm.hostCollector.collect()
103108
case <-ssm.tomb.Stopping():
104109
glog.Infof("System stats monitor stopped")
105110
return

pkg/systemstatsmonitor/types/config.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,13 @@ type DiskStatsConfig struct {
3838
LsblkTimeout time.Duration `json:"-"`
3939
}
4040

41+
type HostStatsConfig struct {
42+
MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"`
43+
}
44+
4145
type SystemStatsConfig struct {
4246
DiskConfig DiskStatsConfig `json:"disk"`
47+
HostConfig HostStatsConfig `json:"host"`
4348
InvokeIntervalString string `json:"invokeInterval"`
4449
InvokeInterval time.Duration `json:"-"`
4550
}

pkg/util/helpers.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,13 @@ import (
2020
"syscall"
2121
"time"
2222

23+
"github.com/cobaugh/osrelease"
24+
2325
"k8s.io/node-problem-detector/pkg/types"
2426
)
2527

28+
var osReleasePath = "/etc/os-release"
29+
2630
// GenerateConditionChangeEvent generates an event for condition change.
2731
func GenerateConditionChangeEvent(t string, status types.ConditionStatus, reason string, timestamp time.Time) types.Event {
2832
return types.Event{
@@ -70,3 +74,34 @@ func GetStartTime(now time.Time, uptimeDuration time.Duration, lookbackStr strin
7074

7175
return startTime, nil
7276
}
77+
78+
// GetOSVersion retrieves the version of the current operating system.
79+
// For example: "cos 77-12293.0.0", "ubuntu 16.04.6 LTS (Xenial Xerus)".
80+
func GetOSVersion() (string, error) {
81+
osReleaseMap, err := osrelease.ReadFile(osReleasePath)
82+
if err != nil {
83+
return "", err
84+
}
85+
switch osReleaseMap["ID"] {
86+
case "cos":
87+
return getCOSVersion(osReleaseMap), nil
88+
case "debian":
89+
return getDebianVersion(osReleaseMap), nil
90+
case "ubuntu":
91+
return getDebianVersion(osReleaseMap), nil
92+
default:
93+
return "", fmt.Errorf("Unsupported ID in /etc/os-release: %q", osReleaseMap["ID"])
94+
}
95+
}
96+
97+
func getCOSVersion(osReleaseMap map[string]string) string {
98+
// /etc/os-release syntax for COS is defined here:
99+
// https://chromium.git.corp.google.com/chromiumos/docs/+/8edec95a297edfd8f1290f0f03a8aa35795b516b/os_config.md
100+
return fmt.Sprintf("%s %s-%s", osReleaseMap["ID"], osReleaseMap["VERSION"], osReleaseMap["BUILD_ID"])
101+
}
102+
103+
func getDebianVersion(osReleaseMap map[string]string) string {
104+
// /etc/os-release syntax for Debian is defined here:
105+
// https://manpages.debian.org/testing/systemd/os-release.5.en.html
106+
return fmt.Sprintf("%s %s", osReleaseMap["ID"], osReleaseMap["VERSION"])
107+
}

pkg/util/helpers_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,3 +135,65 @@ func TestGetStartTime(t *testing.T) {
135135
})
136136
}
137137
}
138+
139+
func TestGetOSVersion(t *testing.T) {
140+
testCases := []struct {
141+
name string
142+
fakeOSReleasePath string
143+
expectedOSVersion string
144+
expectErr bool
145+
}{
146+
{
147+
name: "COS",
148+
fakeOSReleasePath: "testdata/os-release-cos",
149+
expectedOSVersion: "cos 77-12293.0.0",
150+
expectErr: false,
151+
},
152+
{
153+
name: "Debian",
154+
fakeOSReleasePath: "testdata/os-release-debian",
155+
expectedOSVersion: "debian 9 (stretch)",
156+
expectErr: false,
157+
},
158+
{
159+
name: "Ubuntu",
160+
fakeOSReleasePath: "testdata/os-release-ubuntu",
161+
expectedOSVersion: "ubuntu 16.04.6 LTS (Xenial Xerus)",
162+
expectErr: false,
163+
},
164+
{
165+
name: "Unknown",
166+
fakeOSReleasePath: "testdata/os-release-unknown",
167+
expectedOSVersion: "",
168+
expectErr: true,
169+
},
170+
{
171+
name: "Empty",
172+
fakeOSReleasePath: "testdata/os-release-empty",
173+
expectedOSVersion: "",
174+
expectErr: true,
175+
},
176+
}
177+
178+
for _, test := range testCases {
179+
t.Run(test.name, func(t *testing.T) {
180+
originalOSReleasePath := osReleasePath
181+
defer func() {
182+
osReleasePath = originalOSReleasePath
183+
}()
184+
185+
osReleasePath = test.fakeOSReleasePath
186+
osVersion, err := GetOSVersion()
187+
188+
if test.expectErr && err == nil {
189+
t.Errorf("Expect to get error, but got no returned error.")
190+
}
191+
if !test.expectErr && err != nil {
192+
t.Errorf("Expect to get no error, but got returned error: %v", err)
193+
}
194+
if !test.expectErr && osVersion != test.expectedOSVersion {
195+
t.Errorf("Wanted: %+v. \nGot: %+v", test.expectedOSVersion, osVersion)
196+
}
197+
})
198+
}
199+
}

0 commit comments

Comments
 (0)