diff --git a/config/kernel-monitor.json b/config/kernel-monitor.json index 9b9674403..4b6e656d2 100644 --- a/config/kernel-monitor.json +++ b/config/kernel-monitor.json @@ -1,8 +1,7 @@ { "plugin": "journald", "logPath": "/var/log/journal", - "lookback": "10m", - "startPattern": "Initializing cgroup subsys cpuset", + "lookback": "5m", "bufferSize": 10, "source": "kernel-monitor", "conditions": [ @@ -25,12 +24,17 @@ }, { "type": "temporary", - "reason": "KernelPanic", + "reason": "UnregisterNetDevice", + "pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+" + }, + { + "type": "temporary", + "reason": "KernelOops", "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*" }, { "type": "temporary", - "reason": "KernelPanic", + "reason": "KernelOops", "pattern": "divide error: 0000 \\[#\\d+\\] SMP" }, { @@ -44,12 +48,6 @@ "condition": "KernelDeadlock", "reason": "DockerHung", "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\." - }, - { - "type": "permanent", - "condition": "KernelDeadlock", - "reason": "UnregisterNetDeviceIssue", - "pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+" } ] } diff --git a/pkg/kernelmonitor/config.go b/pkg/kernelmonitor/config.go index cc14671e5..79fc27a97 100644 --- a/pkg/kernelmonitor/config.go +++ b/pkg/kernelmonitor/config.go @@ -34,8 +34,6 @@ type MonitorConfig struct { DefaultConditions []types.Condition `json:"conditions"` // Rules are the rules kernel monitor will follow to parse the log file. Rules []kerntypes.Rule `json:"rules"` - // StartPattern is the pattern of the start line - StartPattern string `json:"startPattern, omitempty"` } // applyDefaultConfiguration applies default configurations. diff --git a/pkg/kernelmonitor/kernel_monitor.go b/pkg/kernelmonitor/kernel_monitor.go index 4c0c5f110..5d1baafe5 100644 --- a/pkg/kernelmonitor/kernel_monitor.go +++ b/pkg/kernelmonitor/kernel_monitor.go @@ -70,10 +70,7 @@ func NewKernelMonitorOrDie(configPath string) KernelMonitor { glog.Fatalf("Failed to validate matching rules %#v: %v", k.config.Rules, err) } glog.Infof("Finish parsing log file: %+v", k.config) - k.watcher, err = logwatchers.GetLogWatcher(k.config.WatcherConfig) - if err != nil { - glog.Fatalf("Failed to create log watcher with watcher config %#v: %v", k.config.WatcherConfig, err) - } + k.watcher = logwatchers.GetLogWatcherOrDie(k.config.WatcherConfig) k.buffer = NewLogBuffer(k.config.BufferSize) // A 1000 size channel should be big enough. k.output = make(chan *types.Status, 1000) @@ -117,12 +114,6 @@ func (k *kernelMonitor) parseLog(log *kerntypes.KernelLog) { // Once there is new log, kernel monitor will push it into the log buffer and try // to match each rule. If any rule is matched, kernel monitor will report a status. k.buffer.Push(log) - if matched := k.buffer.Match(k.config.StartPattern); len(matched) != 0 { - // Reset the condition if a start log shows up. - glog.Infof("Found start log %q, re-initialize the status", generateMessage(matched)) - k.initializeStatus() - return - } for _, rule := range k.config.Rules { matched := k.buffer.Match(rule.Pattern) if len(matched) == 0 { diff --git a/pkg/kernelmonitor/logwatchers/log_watchers.go b/pkg/kernelmonitor/logwatchers/log_watchers.go index 198be47f2..fd47baf37 100644 --- a/pkg/kernelmonitor/logwatchers/log_watchers.go +++ b/pkg/kernelmonitor/logwatchers/log_watchers.go @@ -17,8 +17,6 @@ limitations under the License. package logwatchers import ( - "fmt" - "k8s.io/node-problem-detector/pkg/kernelmonitor/logwatchers/types" "github.com/golang/glog" @@ -32,12 +30,13 @@ func registerLogWatcher(name string, create types.WatcherCreateFunc) { createFuncs[name] = create } -// GetLogWatcher get a log watcher based on the passed in configuration. -func GetLogWatcher(config types.WatcherConfig) (types.LogWatcher, error) { +// GetLogWatcherOrDie get a log watcher based on the passed in configuration. +// The function panics when encounts an error. +func GetLogWatcherOrDie(config types.WatcherConfig) types.LogWatcher { create, ok := createFuncs[config.Plugin] if !ok { - return nil, fmt.Errorf("no create function found for plugin %q", config.Plugin) + glog.Fatalf("No create function found for plugin %q", config.Plugin) } glog.Infof("Use log watcher of plugin %q", config.Plugin) - return create(config), nil + return create(config) } diff --git a/pkg/kernelmonitor/logwatchers/register_syslog.go b/pkg/kernelmonitor/logwatchers/register_syslog.go index c48923bc1..398b63981 100644 --- a/pkg/kernelmonitor/logwatchers/register_syslog.go +++ b/pkg/kernelmonitor/logwatchers/register_syslog.go @@ -24,5 +24,5 @@ const syslogPluginName = "syslog" func init() { // Register the syslog plugin. - registerLogWatcher(syslogPluginName, syslog.NewSyslogWatcher) + registerLogWatcher(syslogPluginName, syslog.NewSyslogWatcherOrDie) } diff --git a/pkg/kernelmonitor/logwatchers/syslog/log_watcher.go b/pkg/kernelmonitor/logwatchers/syslog/log_watcher.go index deb3ccff9..845457905 100644 --- a/pkg/kernelmonitor/logwatchers/syslog/log_watcher.go +++ b/pkg/kernelmonitor/logwatchers/syslog/log_watcher.go @@ -20,6 +20,7 @@ import ( "bufio" "bytes" "io" + "syscall" "time" utilclock "code.cloudfoundry.org/clock" @@ -35,15 +36,22 @@ type syslogWatcher struct { reader *bufio.Reader closer io.Closer logCh chan *kerntypes.KernelLog + uptime time.Time tomb *util.Tomb clock utilclock.Clock } -// NewSyslogWatcher creates a new kernel log watcher. -func NewSyslogWatcher(cfg types.WatcherConfig) types.LogWatcher { +// NewSyslogWatcherOrDie creates a new kernel log watcher. The function panics +// when encounters an error. +func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher { + var info syscall.Sysinfo_t + if err := syscall.Sysinfo(&info); err != nil { + glog.Fatalf("Failed to get system info: %v", err) + } return &syslogWatcher{ - cfg: cfg, - tomb: util.NewTomb(), + cfg: cfg, + uptime: time.Now().Add(time.Duration(-info.Uptime * int64(time.Second))), + tomb: util.NewTomb(), // A capacity 1000 buffer should be enough logCh: make(chan *kerntypes.KernelLog, 1000), clock: utilclock.NewClock(), @@ -51,7 +59,7 @@ func NewSyslogWatcher(cfg types.WatcherConfig) types.LogWatcher { } // Make sure NewSyslogWathcer is types.WatcherCreateFunc. -var _ types.WatcherCreateFunc = NewSyslogWatcher +var _ types.WatcherCreateFunc = NewSyslogWatcherOrDie // Watch starts the syslog watcher. func (s *syslogWatcher) Watch() (<-chan *kerntypes.KernelLog, error) { @@ -113,8 +121,8 @@ func (s *syslogWatcher) watchLoop() { glog.Warningf("Unable to parse line: %q, %v", line, err) continue } - // If the log is older than look back duration, discard it. - if s.clock.Since(log.Timestamp) > lookback { + // If the log is older than look back duration or system boot time, discard it. + if s.clock.Since(log.Timestamp) > lookback || log.Timestamp.Before(s.uptime) { continue } s.logCh <- log diff --git a/pkg/kernelmonitor/logwatchers/syslog/log_watcher_test.go b/pkg/kernelmonitor/logwatchers/syslog/log_watcher_test.go index 5b1617406..2886a021d 100644 --- a/pkg/kernelmonitor/logwatchers/syslog/log_watcher_test.go +++ b/pkg/kernelmonitor/logwatchers/syslog/log_watcher_test.go @@ -36,6 +36,7 @@ func TestWatch(t *testing.T) { testCases := []struct { log string logs []kerntypes.KernelLog + uptime time.Time lookback string }{ { @@ -96,6 +97,26 @@ func TestWatch(t *testing.T) { }, }, }, + { + // The start point is at the end of the log file, we look back, but + // system rebooted at in the middle of the log file. + log: `Jan 2 03:04:03 kernel: [0.000000] 1 + Jan 2 03:04:04 kernel: [1.000000] 2 + Jan 2 03:04:05 kernel: [2.000000] 3 + `, + uptime: time.Date(time.Now().Year(), time.January, 2, 3, 4, 4, 0, time.Local), + lookback: "2s", + logs: []kerntypes.KernelLog{ + { + Timestamp: now.Add(-time.Second), + Message: "2", + }, + { + Timestamp: now, + Message: "3", + }, + }, + }, } for c, test := range testCases { t.Logf("TestCase #%d: %#v", c+1, test) @@ -108,11 +129,13 @@ func TestWatch(t *testing.T) { _, err = f.Write([]byte(test.log)) assert.NoError(t, err) - w := NewSyslogWatcher(types.WatcherConfig{ + w := NewSyslogWatcherOrDie(types.WatcherConfig{ Plugin: "syslog", LogPath: f.Name(), Lookback: test.lookback, }) + // Set the uptime. + w.(*syslogWatcher).uptime = test.uptime // Set the fake clock. w.(*syslogWatcher).clock = fakeClock logCh, err := w.Watch()