Skip to content

Commit 1fda257

Browse files
committed
Fix kernel monitor issues:
* Remove `unregister_netdevice` rule to fix kubernetes#47. * Change `KernelPanic` to `KernelOops` because we can't handle kernel panic currently. * Use system boot time instead of "StartPattern" to fix kubernetes#48.
1 parent b66c4df commit 1fda257

File tree

7 files changed

+54
-37
lines changed

7 files changed

+54
-37
lines changed

config/kernel-monitor.json

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
{
22
"plugin": "journald",
33
"logPath": "/var/log/journal",
4-
"lookback": "10m",
5-
"startPattern": "Initializing cgroup subsys cpuset",
4+
"lookback": "5m",
65
"bufferSize": 10,
76
"source": "kernel-monitor",
87
"conditions": [
@@ -25,12 +24,17 @@
2524
},
2625
{
2726
"type": "temporary",
28-
"reason": "KernelPanic",
27+
"reason": "UnregisterNetDevice",
28+
"pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
29+
},
30+
{
31+
"type": "temporary",
32+
"reason": "KernelOops",
2933
"pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
3034
},
3135
{
3236
"type": "temporary",
33-
"reason": "KernelPanic",
37+
"reason": "KernelOops",
3438
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
3539
},
3640
{
@@ -44,12 +48,6 @@
4448
"condition": "KernelDeadlock",
4549
"reason": "DockerHung",
4650
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
47-
},
48-
{
49-
"type": "permanent",
50-
"condition": "KernelDeadlock",
51-
"reason": "UnregisterNetDeviceIssue",
52-
"pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
5351
}
5452
]
5553
}

pkg/kernelmonitor/config.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ type MonitorConfig struct {
3434
DefaultConditions []types.Condition `json:"conditions"`
3535
// Rules are the rules kernel monitor will follow to parse the log file.
3636
Rules []kerntypes.Rule `json:"rules"`
37-
// StartPattern is the pattern of the start line
38-
StartPattern string `json:"startPattern, omitempty"`
3937
}
4038

4139
// applyDefaultConfiguration applies default configurations.

pkg/kernelmonitor/kernel_monitor.go

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,7 @@ func NewKernelMonitorOrDie(configPath string) KernelMonitor {
7070
glog.Fatalf("Failed to validate matching rules %#v: %v", k.config.Rules, err)
7171
}
7272
glog.Infof("Finish parsing log file: %+v", k.config)
73-
k.watcher, err = logwatchers.GetLogWatcher(k.config.WatcherConfig)
74-
if err != nil {
75-
glog.Fatalf("Failed to create log watcher with watcher config %#v: %v", k.config.WatcherConfig, err)
76-
}
73+
k.watcher = logwatchers.GetLogWatcherOrDie(k.config.WatcherConfig)
7774
k.buffer = NewLogBuffer(k.config.BufferSize)
7875
// A 1000 size channel should be big enough.
7976
k.output = make(chan *types.Status, 1000)
@@ -117,12 +114,6 @@ func (k *kernelMonitor) parseLog(log *kerntypes.KernelLog) {
117114
// Once there is new log, kernel monitor will push it into the log buffer and try
118115
// to match each rule. If any rule is matched, kernel monitor will report a status.
119116
k.buffer.Push(log)
120-
if matched := k.buffer.Match(k.config.StartPattern); len(matched) != 0 {
121-
// Reset the condition if a start log shows up.
122-
glog.Infof("Found start log %q, re-initialize the status", generateMessage(matched))
123-
k.initializeStatus()
124-
return
125-
}
126117
for _, rule := range k.config.Rules {
127118
matched := k.buffer.Match(rule.Pattern)
128119
if len(matched) == 0 {

pkg/kernelmonitor/logwatchers/log_watchers.go

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ limitations under the License.
1717
package logwatchers
1818

1919
import (
20-
"fmt"
21-
2220
"k8s.io/node-problem-detector/pkg/kernelmonitor/logwatchers/types"
2321

2422
"github.com/golang/glog"
@@ -32,12 +30,13 @@ func registerLogWatcher(name string, create types.WatcherCreateFunc) {
3230
createFuncs[name] = create
3331
}
3432

35-
// GetLogWatcher get a log watcher based on the passed in configuration.
36-
func GetLogWatcher(config types.WatcherConfig) (types.LogWatcher, error) {
33+
// GetLogWatcherOrDie gets a log watcher based on the passed in configuration.
34+
// The function panics when it encounters an error.
35+
func GetLogWatcherOrDie(config types.WatcherConfig) types.LogWatcher {
3736
create, ok := createFuncs[config.Plugin]
3837
if !ok {
39-
return nil, fmt.Errorf("no create function found for plugin %q", config.Plugin)
38+
glog.Fatalf("No create function found for plugin %q", config.Plugin)
4039
}
4140
glog.Infof("Use log watcher of plugin %q", config.Plugin)
42-
return create(config), nil
41+
return create(config)
4342
}

pkg/kernelmonitor/logwatchers/register_syslog.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,5 @@ const syslogPluginName = "syslog"
2424

2525
func init() {
2626
// Register the syslog plugin.
27-
registerLogWatcher(syslogPluginName, syslog.NewSyslogWatcher)
27+
registerLogWatcher(syslogPluginName, syslog.NewSyslogWatcherOrDie)
2828
}

pkg/kernelmonitor/logwatchers/syslog/log_watcher.go

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"bufio"
2121
"bytes"
2222
"io"
23+
"syscall"
2324
"time"
2425

2526
utilclock "code.cloudfoundry.org/clock"
@@ -35,23 +36,30 @@ type syslogWatcher struct {
3536
reader *bufio.Reader
3637
closer io.Closer
3738
logCh chan *kerntypes.KernelLog
39+
uptime time.Time
3840
tomb *util.Tomb
3941
clock utilclock.Clock
4042
}
4143

42-
// NewSyslogWatcher creates a new kernel log watcher.
43-
func NewSyslogWatcher(cfg types.WatcherConfig) types.LogWatcher {
44+
// NewSyslogWatcherOrDie creates a new kernel log watcher. The function panics
45+
// when it encounters an error.
46+
func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher {
47+
var info syscall.Sysinfo_t
48+
if err := syscall.Sysinfo(&info); err != nil {
49+
glog.Fatalf("Failed to get system info: %v", err)
50+
}
4451
return &syslogWatcher{
45-
cfg: cfg,
46-
tomb: util.NewTomb(),
52+
cfg: cfg,
53+
uptime: time.Now().Add(time.Duration(-info.Uptime * int64(time.Second))),
54+
tomb: util.NewTomb(),
4755
// A capacity 1000 buffer should be enough
4856
logCh: make(chan *kerntypes.KernelLog, 1000),
4957
clock: utilclock.NewClock(),
5058
}
5159
}
5260

5361
// Make sure NewSyslogWatcherOrDie is types.WatcherCreateFunc.
54-
var _ types.WatcherCreateFunc = NewSyslogWatcher
62+
var _ types.WatcherCreateFunc = NewSyslogWatcherOrDie
5563

5664
// Watch starts the syslog watcher.
5765
func (s *syslogWatcher) Watch() (<-chan *kerntypes.KernelLog, error) {
@@ -113,8 +121,8 @@ func (s *syslogWatcher) watchLoop() {
113121
glog.Warningf("Unable to parse line: %q, %v", line, err)
114122
continue
115123
}
116-
// If the log is older than look back duration, discard it.
117-
if s.clock.Since(log.Timestamp) > lookback {
124+
// If the log is older than look back duration or system boot time, discard it.
125+
if s.clock.Since(log.Timestamp) > lookback || log.Timestamp.Before(s.uptime) {
118126
continue
119127
}
120128
s.logCh <- log

pkg/kernelmonitor/logwatchers/syslog/log_watcher_test.go

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ func TestWatch(t *testing.T) {
3636
testCases := []struct {
3737
log string
3838
logs []kerntypes.KernelLog
39+
uptime time.Time
3940
lookback string
4041
}{
4142
{
@@ -96,6 +97,26 @@ func TestWatch(t *testing.T) {
9697
},
9798
},
9899
},
100+
{
101+
// The start point is at the end of the log file, we look back, but
102+
// the system rebooted in the middle of the log file.
103+
log: `Jan 2 03:04:03 kernel: [0.000000] 1
104+
Jan 2 03:04:04 kernel: [1.000000] 2
105+
Jan 2 03:04:05 kernel: [2.000000] 3
106+
`,
107+
uptime: time.Date(time.Now().Year(), time.January, 2, 3, 4, 4, 0, time.Local),
108+
lookback: "2s",
109+
logs: []kerntypes.KernelLog{
110+
{
111+
Timestamp: now.Add(-time.Second),
112+
Message: "2",
113+
},
114+
{
115+
Timestamp: now,
116+
Message: "3",
117+
},
118+
},
119+
},
99120
}
100121
for c, test := range testCases {
101122
t.Logf("TestCase #%d: %#v", c+1, test)
@@ -108,11 +129,13 @@ func TestWatch(t *testing.T) {
108129
_, err = f.Write([]byte(test.log))
109130
assert.NoError(t, err)
110131

111-
w := NewSyslogWatcher(types.WatcherConfig{
132+
w := NewSyslogWatcherOrDie(types.WatcherConfig{
112133
Plugin: "syslog",
113134
LogPath: f.Name(),
114135
Lookback: test.lookback,
115136
})
137+
// Set the uptime.
138+
w.(*syslogWatcher).uptime = test.uptime
116139
// Set the fake clock.
117140
w.(*syslogWatcher).clock = fakeClock
118141
logCh, err := w.Watch()

0 commit comments

Comments
 (0)