Skip to content

Commit d6d20e4

Browse files
authored
Merge pull request #505 from vteratipally/retrieve_os_features
added a new metric to retrieve os features like unknown modules, KTD
2 parents 989a15b + f89f620 commit d6d20e4

File tree

12 files changed

+308
-15
lines changed

12 files changed

+308
-15
lines changed
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
[
2+
{ "moduleName": "xt_MASQUERADE"},
3+
{ "moduleName": "xt_addrtype"},
4+
{ "moduleName": "iptable_nat"},
5+
{ "moduleName": "nf_nat"},
6+
{ "moduleName": "br_netfilter"},
7+
{ "moduleName": "ip6table_filter"},
8+
{ "moduleName": "ip6_tables"},
9+
{ "moduleName": "aesni_intel"},
10+
{ "moduleName": "glue_helper"},
11+
{ "moduleName": "crypto_simd"},
12+
{ "moduleName": "cryptd"},
13+
{ "moduleName": "virtio_balloon"},
14+
{ "moduleName": "loadpin_trigger"},
15+
{ "moduleName":"ip6table_filter"},
16+
{ "moduleName":"ip6_tables"},
17+
{ "moduleName":"iptable_filter"},
18+
{ "moduleName":"bpfilter"},
19+
{ "moduleName":"nls_iso8859_1"},
20+
{ "moduleName":"intel_rapl_msr"},
21+
{ "moduleName":"intel_rapl_common"},
22+
{ "moduleName":"sb_edac"},
23+
{ "moduleName":"rapl"},
24+
{ "moduleName":"input_leds"},
25+
{ "moduleName":"serio_raw"},
26+
{ "moduleName":"pvpanic"},
27+
{ "moduleName":"mac_hid"},
28+
{ "moduleName":"sch_fq_codel"},
29+
{ "moduleName":"ib_iser"},
30+
{ "moduleName":"rdma_cm"},
31+
{ "moduleName":"iw_cm"},
32+
{ "moduleName":"ib_cm"},
33+
{ "moduleName":"ib_core"},
34+
{ "moduleName":"iscsi_tcp"},
35+
{ "moduleName":"libiscsi_tcp"},
36+
{ "moduleName":"libiscsi"},
37+
{ "moduleName":"scsi_transport_iscsi"},
38+
{ "moduleName":"virtio_rng"},
39+
{ "moduleName":"ip_tables"},
40+
{ "moduleName":"x_tables"},
41+
{ "moduleName":"autofs4"},
42+
{ "moduleName":"btrfs"},
43+
{ "moduleName":"zstd_compress"},
44+
{ "moduleName":"raid10"},
45+
{ "moduleName":"raid456"},
46+
{ "moduleName":"async_raid6_recov"},
47+
{ "moduleName":"async_memcpy"},
48+
{ "moduleName":"async_pq"},
49+
{ "moduleName":"async_xor"},
50+
{ "moduleName":"async_tx"},
51+
{ "moduleName":"xor"},
52+
{ "moduleName":"raid6_pq"},
53+
{ "moduleName":"raid1"},
54+
{ "moduleName":"raid0"},
55+
{ "moduleName":"multipath"},
56+
{ "moduleName":"linear"},
57+
{ "moduleName":"crct10dif_pclmul"},
58+
{ "moduleName":"crc32_pclmul"},
59+
{ "moduleName":"ghash_clmulni_intel"},
60+
{ "moduleName":"aesni_intel"},
61+
{ "moduleName":"crypto_simd"},
62+
{ "moduleName":"cryptd"},
63+
{ "moduleName":"glue_helper"},
64+
{ "moduleName":"psmouse"},
65+
{ "moduleName":"virtio_net"},
66+
{ "moduleName":"net_failover"},
67+
{ "moduleName": "failover"},
68+
{ "moduleName":"i2c_piix4"}
69+
]

config/system-stats-monitor.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,5 +75,13 @@
7575
}
7676
}
7777
},
78+
"osFeature": {
79+
"metricsConfigs": {
80+
"system/os_feature": {
81+
"displayName": "system/os_feature"
82+
}
83+
},
84+
"knownModulesConfigPath": "config/guestosconfig/known-modules.json"
85+
},
7886
"invokeInterval": "60s"
7987
}

pkg/exporters/stackdriver/stackdriver_exporter.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
6868
metrics.MemoryUnevictableUsedID: "compute.googleapis.com/guest/memory/unevictable_used",
6969
metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count",
7070
metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state",
71+
metrics.OSFeatureID: "compute.googleapis.com/guest/system/os_feature_enabled",
7172
}
7273

7374
func getMetricTypeConversionFunction(customMetricPrefix string) func(*view.View) string {

pkg/systemstatsmonitor/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,19 @@ Below metrics are collected from `memory` component:
7272
* `memory_page_cache_used`: Page cache memory usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `active`, `inactive`). `active` means the memory has been used more recently and usually not reclaimed until needed. Summing values of all states yields the total page cache memory used.
7373
* `memory_unevictable_used`: [Unevictable memory][/proc doc] usage, in Bytes.
7474
* `memory_dirty_used`: Dirty pages usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `dirty`, `writeback`). `dirty` means the memory is waiting to be written back to disk, and `writeback` means the memory is actively being written back to disk.
75+
76+
### OS features
77+
78+
The guest OS features such as KTD kernel, GPU support are collected. Below are the OS
79+
features collected:
80+
81+
* `KTD`: Enabled, if KTD feature is enabled on OS
82+
* `UnifiedCgroupHierarchy`: Enabled, if Unified hierarchy is enabled on OS.
83+
* `KernelModuleIntegrity`: Enabled, if load pin security is enabled and modules are signed.
84+
* `GPUSupport`: Enabled, if OS has GPU drivers installed like nvidia.
85+
* `UnknownModules`: Enabled, if the OS has third party kernel modules installed.
86+
UnknownModules are derived from the /proc/modules compared with the known-modules.json.
87+
88+
And an option:
89+
`knownModulesConfigPath`: The path to the file that contains the known modules(default
90+
modules) can be set. By default, the path is set to `config/guestosconfig/known-modules.json`

pkg/systemstatsmonitor/labels.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,9 @@ const fsTypeLabel = "fs_type"
3030

3131
// mountOptionLabel labels the mount_options of the monitored disk device
3232
const mountOptionLabel = "mount_option"
33+
34+
// featureLabel is the metric label key that carries the name of the guest OS
// feature being reported.
const featureLabel = "os_feature"

// valueLabel is the metric label key that carries an optional value for the
// reported guest OS feature (for example the names of the unknown modules).
const valueLabel = "value"
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
/*
2+
Copyright 2020 The Kubernetes Authors All rights reserved.
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
Unless required by applicable law or agreed to in writing, software
8+
distributed under the License is distributed on an "AS IS" BASIS,
9+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10+
See the License for the specific language governing permissions and
11+
limitations under the License.
12+
*/
13+
14+
package systemstatsmonitor
15+
16+
import (
17+
"encoding/json"
18+
"io/ioutil"
19+
"strconv"
20+
"strings"
21+
22+
"github.com/golang/glog"
23+
ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
24+
"k8s.io/node-problem-detector/pkg/util/metrics"
25+
"k8s.io/node-problem-detector/pkg/util/metrics/system"
26+
)
27+
28+
type osFeatureCollector struct {
29+
config *ssmtypes.OSFeatureStatsConfig
30+
osFeature *metrics.Int64Metric
31+
}
32+
33+
func NewOsFeatureCollectorOrDie(osFeatureConfig *ssmtypes.OSFeatureStatsConfig) *osFeatureCollector {
34+
oc := osFeatureCollector{config: osFeatureConfig}
35+
var err error
36+
// Use metrics.Last aggregation method to ensure the metric is a guage metric.
37+
if osFeatureConfig.MetricsConfigs["system/os_feature"].DisplayName != "" {
38+
oc.osFeature, err = metrics.NewInt64Metric(
39+
metrics.OSFeatureID,
40+
osFeatureConfig.MetricsConfigs[string(metrics.OSFeatureID)].DisplayName,
41+
"OS Features like GPU support, KTD kernel, third party modules as unknown modules. 1 if the feature is enabled and 0, if disabled.",
42+
"1",
43+
metrics.LastValue,
44+
[]string{featureLabel, valueLabel})
45+
if err != nil {
46+
glog.Fatalf("Error initializing metric for system/os_feature: %v", err)
47+
}
48+
}
49+
return &oc
50+
}
51+
52+
// recordFeaturesFromCmdline records the guest OS features that can be derived
53+
// from the /proc/cmdline
54+
// The following features are recorded:
55+
// 1. KTD kernel based on csm.enabled value
56+
// 2. UnifiedCgroupHierarchy based on systemd.unified_cgroup_hierarchy
57+
// 3. KernelModuleIntegrity based on the loadpin enabled and a module signed.
58+
func (ofc *osFeatureCollector) recordFeaturesFromCmdline(cmdlineArgs []system.CmdlineArg) {
59+
var featuresMap = map[string]int64{
60+
"KTD": 0,
61+
"UnifiedCgroupHierarchy": 0,
62+
"ModuleSigned": 0,
63+
"LoadPinEnabled": 0,
64+
}
65+
for _, cmdlineArg := range cmdlineArgs {
66+
// record KTD feature.
67+
if cmdlineArg.Key == "csm.enabled" {
68+
featuresMap["KTD"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64)
69+
}
70+
// record UnifiedCgroupHierarchy feature.
71+
if cmdlineArg.Key == "systemd.unified_cgroup_hierarchy" {
72+
featuresMap["UnifiedCgroupHierarchy"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64)
73+
}
74+
// record KernelModuleIntegrity feature.
75+
if cmdlineArg.Key == "module.sig_enforce" {
76+
featuresMap["ModuleSigned"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64)
77+
}
78+
if cmdlineArg.Key == "loadpin.enabled" {
79+
featuresMap["LoadPinEnabled"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64)
80+
}
81+
}
82+
// Record the feature values.
83+
ofc.osFeature.Record(map[string]string{featureLabel: "KTD"}, featuresMap["KTD"])
84+
ofc.osFeature.Record(map[string]string{featureLabel: "UnifiedCgroupHierarchy"}, featuresMap["UnifiedCgroupHierarchy"])
85+
if featuresMap["ModuleSigned"] == 1 && featuresMap["LoadPinEnabled"] == 1 {
86+
ofc.osFeature.Record(map[string]string{featureLabel: "KernelModuleIntegrity"}, 1)
87+
} else {
88+
ofc.osFeature.Record(map[string]string{featureLabel: "KernelModuleIntegrity"}, 0)
89+
}
90+
}
91+
92+
// recordFeaturesFromModules records the guest OS features that can be derived
93+
// from the /proc/modules
94+
// The following features are recorded:
95+
// 1. GPUSupport based on the presence of nvidia module
96+
// 2. UnknownModules are tracked based on the presence of thirdparty kernel modules.
97+
func (ofc *osFeatureCollector) recordFeaturesFromModules(modules []system.Module) {
98+
// Collect known modules (default modules based on guest OS present in known-modules.json)
99+
var knownModules []system.Module
100+
f, err := ioutil.ReadFile(ofc.config.KnownModulesConfigPath)
101+
if err != nil {
102+
glog.Warningf("Failed to read configuration file %s: %v",
103+
ofc.config.KnownModulesConfigPath, err)
104+
}
105+
// When the knownModulesConfigPath is not set
106+
// it should assume all the metrics are assumed to be default modules.
107+
if f != nil {
108+
err = json.Unmarshal(f, &knownModules)
109+
if err != nil {
110+
glog.Warningf("Failed to retrieve known modules %v", err)
111+
}
112+
} else {
113+
knownModules = []system.Module{}
114+
}
115+
116+
var hasGPUSupport = 0
117+
unknownModules := []string{}
118+
119+
// Collect UnknownModules and check GPUSupport
120+
for _, module := range modules {
121+
// if the module has nvidia modules, then the hasGPUSupport is set.
122+
if strings.Contains(module.ModuleName, "nvidia") {
123+
hasGPUSupport = 1
124+
} else {
125+
if module.OutOfTree || module.Proprietary {
126+
if !system.ContainsModule(module.ModuleName, knownModules) {
127+
unknownModules = append(unknownModules, module.ModuleName)
128+
}
129+
}
130+
}
131+
}
132+
// record the UnknownModules and GPUSupport
133+
if len(unknownModules) > 0 {
134+
ofc.osFeature.Record(map[string]string{featureLabel: "UnknownModules",
135+
valueLabel: strings.Join(unknownModules, ",")}, 1)
136+
} else {
137+
ofc.osFeature.Record(map[string]string{featureLabel: "UnknownModules"},
138+
0)
139+
}
140+
ofc.osFeature.Record(map[string]string{featureLabel: "GPUSupport"},
141+
int64(hasGPUSupport))
142+
}
143+
144+
func (ofc *osFeatureCollector) collect() {
145+
cmdlineArgs, err := system.CmdlineArgs()
146+
if err != nil {
147+
glog.Fatalf("Error retrieving cmdline args: %v", err)
148+
}
149+
ofc.recordFeaturesFromCmdline(cmdlineArgs)
150+
modules, err := system.Modules()
151+
if err != nil {
152+
glog.Fatalf("Error retrieving kernel modules: %v", err)
153+
}
154+
ofc.recordFeaturesFromModules(modules)
155+
}

pkg/systemstatsmonitor/system_stats_monitor.go

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,14 @@ func init() {
3838
}
3939

4040
type systemStatsMonitor struct {
41-
configPath string
42-
config ssmtypes.SystemStatsConfig
43-
cpuCollector *cpuCollector
44-
diskCollector *diskCollector
45-
hostCollector *hostCollector
46-
memoryCollector *memoryCollector
47-
tomb *tomb.Tomb
41+
configPath string
42+
config ssmtypes.SystemStatsConfig
43+
cpuCollector *cpuCollector
44+
diskCollector *diskCollector
45+
hostCollector *hostCollector
46+
memoryCollector *memoryCollector
47+
osFeatureCollector *osFeatureCollector
48+
tomb *tomb.Tomb
4849
}
4950

5051
// NewSystemStatsMonitorOrDie creates a system stats monitor.
@@ -69,6 +70,8 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor {
6970
glog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
7071
}
7172

73+
glog.Infof("Error: %v", ssm.config)
74+
7275
err = ssm.config.Validate()
7376
if err != nil {
7477
glog.Fatalf("Failed to validate %s configuration %+v: %v", ssm.configPath, ssm.config, err)
@@ -86,6 +89,9 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor {
8689
if len(ssm.config.MemoryConfig.MetricsConfigs) > 0 {
8790
ssm.memoryCollector = NewMemoryCollectorOrDie(&ssm.config.MemoryConfig)
8891
}
92+
if len(ssm.config.OsFeatureConfig.MetricsConfigs) > 0 {
93+
ssm.osFeatureCollector = NewOsFeatureCollectorOrDie(&ssm.config.OsFeatureConfig)
94+
}
8995
return &ssm
9096
}
9197

@@ -110,6 +116,7 @@ func (ssm *systemStatsMonitor) monitorLoop() {
110116
ssm.diskCollector.collect()
111117
ssm.hostCollector.collect()
112118
ssm.memoryCollector.collect()
119+
ssm.osFeatureCollector.collect()
113120
}
114121

115122
for {
@@ -119,6 +126,7 @@ func (ssm *systemStatsMonitor) monitorLoop() {
119126
ssm.diskCollector.collect()
120127
ssm.hostCollector.collect()
121128
ssm.memoryCollector.collect()
129+
ssm.osFeatureCollector.collect()
122130
case <-ssm.tomb.Stopping():
123131
glog.Infof("System stats monitor stopped: %s", ssm.configPath)
124132
return

pkg/systemstatsmonitor/types/config.go

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@ import (
2222
)
2323

2424
var (
25-
defaultInvokeIntervalString = (60 * time.Second).String()
26-
defaultlsblkTimeoutString = (5 * time.Second).String()
25+
defaultInvokeIntervalString = (60 * time.Second).String()
26+
defaultlsblkTimeoutString = (5 * time.Second).String()
27+
defaultKnownModulesConfigPath = "config/guestosconfig/known-modules.json"
2728
)
2829

2930
type MetricConfig struct {
@@ -50,13 +51,19 @@ type MemoryStatsConfig struct {
5051
MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"`
5152
}
5253

54+
type OSFeatureStatsConfig struct {
55+
MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"`
56+
KnownModulesConfigPath string `json:"knownModulesConfigPath"`
57+
}
58+
5359
type SystemStatsConfig struct {
54-
CPUConfig CPUStatsConfig `json:"cpu"`
55-
DiskConfig DiskStatsConfig `json:"disk"`
56-
HostConfig HostStatsConfig `json:"host"`
57-
MemoryConfig MemoryStatsConfig `json:"memory"`
58-
InvokeIntervalString string `json:"invokeInterval"`
59-
InvokeInterval time.Duration `json:"-"`
60+
CPUConfig CPUStatsConfig `json:"cpu"`
61+
DiskConfig DiskStatsConfig `json:"disk"`
62+
HostConfig HostStatsConfig `json:"host"`
63+
MemoryConfig MemoryStatsConfig `json:"memory"`
64+
OsFeatureConfig OSFeatureStatsConfig `json:"osFeature"`
65+
InvokeIntervalString string `json:"invokeInterval"`
66+
InvokeInterval time.Duration `json:"-"`
6067
}
6168

6269
// ApplyConfiguration applies default configurations.
@@ -67,6 +74,9 @@ func (ssc *SystemStatsConfig) ApplyConfiguration() error {
6774
if ssc.DiskConfig.LsblkTimeoutString == "" {
6875
ssc.DiskConfig.LsblkTimeoutString = defaultlsblkTimeoutString
6976
}
77+
if ssc.OsFeatureConfig.KnownModulesConfigPath == "" {
78+
ssc.OsFeatureConfig.KnownModulesConfigPath = defaultKnownModulesConfigPath
79+
}
7080

7181
var err error
7282
ssc.InvokeInterval, err = time.ParseDuration(ssc.InvokeIntervalString)

0 commit comments

Comments
 (0)