diff --git a/cmd/healthchecker/options/options.go b/cmd/healthchecker/options/options.go
index 53940182f..128c44d44 100644
--- a/cmd/healthchecker/options/options.go
+++ b/cmd/healthchecker/options/options.go
@@ -40,6 +40,7 @@ type HealthCheckerOptions struct {
 	CriCtlPath         string
 	CriSocketPath      string
 	CoolDownTime       time.Duration
+	LoopBackTime       time.Duration
 	HealthCheckTimeout time.Duration
 	LogPatterns        types.LogPatternFlag
 }
@@ -63,6 +64,8 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
 		"The path to the cri socket. Used with crictl to specify the socket path.")
 	fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
 		"The duration to wait for the service to be up before attempting repair.")
+	fs.DurationVar(&hco.LoopBackTime, "loopback-time", types.DefaultLoopBackTime,
+		"The duration to loop back, if it is 0, health-check will check from start time.")
 	fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
 		"The time to wait before marking the component as unhealthy.")
 	fs.Var(&hco.LogPatterns, "log-pattern",
diff --git a/config/health-checker-kubelet.json b/config/health-checker-kubelet.json
index 3d641b4fd..994fc11d9 100644
--- a/config/health-checker-kubelet.json
+++ b/config/health-checker-kubelet.json
@@ -25,6 +25,7 @@
         "--component=kubelet",
         "--enable-repair=true",
         "--cooldown-time=1m",
+        "--loopback-time=0",
         "--health-check-timeout=10s"
       ],
       "timeout": "3m"
diff --git a/deployment/node-problem-detector-healthchecker.yaml b/deployment/node-problem-detector-healthchecker.yaml
new file mode 100644
index 000000000..327815256
--- /dev/null
+++ b/deployment/node-problem-detector-healthchecker.yaml
@@ -0,0 +1,104 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: node-problem-detector
+  namespace: kube-system
+  labels:
+    app: node-problem-detector
+spec:
+  selector:
+    matchLabels:
+      app: node-problem-detector
+  template:
+    metadata:
+      labels:
+        app: node-problem-detector
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: kubernetes.io/os
+                operator: In
+                values:
+                - linux
+      containers:
+      - name: node-problem-detector
+        command:
+        - /node-problem-detector
+        - --logtostderr
+        - --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
+        - --config.custom-plugin-monitor=/config/health-checker-kubelet.json
+        image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.6
+        resources:
+          limits:
+            cpu: 10m
+            memory: 80Mi
+          requests:
+            cpu: 10m
+            memory: 80Mi
+        imagePullPolicy: Always
+        securityContext:
+          privileged: true
+        env:
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        volumeMounts:
+        - name: log
+          mountPath: /var/log
+          readOnly: true
+        - name: kmsg
+          mountPath: /dev/kmsg
+          readOnly: true
+        # Make sure node problem detector is in the same timezone
+        # with the host.
+        - name: localtime
+          mountPath: /etc/localtime
+          readOnly: true
+        - name: config
+          mountPath: /config
+          readOnly: true
+        - mountPath: /etc/machine-id
+          name: machine-id
+          readOnly: true
+        - mountPath: /run/systemd/system
+          name: systemd
+        - mountPath: /var/run/dbus/
+          name: dbus
+          mountPropagation: Bidirectional
+      volumes:
+      - name: log
+        # Config `log` to your system log directory
+        hostPath:
+          path: /var/log/
+      - name: kmsg
+        hostPath:
+          path: /dev/kmsg
+      - name: localtime
+        hostPath:
+          path: /etc/localtime
+      - name: config
+        configMap:
+          name: node-problem-detector-config
+          items:
+          - key: kernel-monitor.json
+            path: kernel-monitor.json
+          - key: docker-monitor.json
+            path: docker-monitor.json
+          # The container command references /config/health-checker-kubelet.json;
+          # with an explicit `items` list, only listed keys are projected, so the
+          # health-checker config must be listed here too.
+          - key: health-checker-kubelet.json
+            path: health-checker-kubelet.json
+      - name: machine-id
+        hostPath:
+          path: /etc/machine-id
+          type: "File"
+      - name: systemd
+        hostPath:
+          path: /run/systemd/system/
+          type: ""
+      - name: dbus
+        hostPath:
+          path: /var/run/dbus/
+          type: ""
diff --git a/pkg/healthchecker/health_checker.go b/pkg/healthchecker/health_checker.go
index 77a96d29a..9ab5c1b83 100644
--- a/pkg/healthchecker/health_checker.go
+++ b/pkg/healthchecker/health_checker.go
@@ -36,6 +36,7 @@ type healthChecker struct {
 	crictlPath         string
 	healthCheckTimeout time.Duration
 	coolDownTime       time.Duration
+	loopBackTime       time.Duration
 	logPatternsToCheck map[string]int
 }
 
@@ -48,6 +49,7 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
 		healthCheckTimeout: hco.HealthCheckTimeout,
 		coolDownTime:       hco.CoolDownTime,
 		service:            hco.Service,
+		loopBackTime:       hco.LoopBackTime,
 		logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
 	}
 	hc.healthCheckFunc = getHealthCheckFunc(hco)
@@ -59,11 +61,22 @@
 // CheckHealth checks for the health of the component and tries to repair if enabled.
 // Returns true if healthy, false otherwise.
 func (hc *healthChecker) CheckHealth() (bool, error) {
+	var logStartTime string
 	healthy, err := hc.healthCheckFunc()
 	if err != nil {
 		return healthy, err
 	}
-	logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.logPatternsToCheck)
+	uptime, err := hc.uptimeFunc()
+	if err != nil {
+		glog.Warningf("Failed to get the uptime: %+v", err)
+		return true, err
+	}
+	if hc.loopBackTime > 0 && uptime > hc.loopBackTime {
+		logStartTime = time.Now().Add(-hc.loopBackTime).Format(types.LogParsingTimeLayout)
+	} else {
+		logStartTime = time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
+	}
+	logPatternHealthy, err := logPatternHealthCheck(hc.service, logStartTime, hc.logPatternsToCheck)
 	if err != nil {
 		return logPatternHealthy, err
 	}
@@ -74,10 +87,6 @@
 	// Attempt repair based on flag.
 	if hc.enableRepair {
 		// repair if the service has been up for the cool down period.
-		uptime, err := hc.uptimeFunc()
-		if err != nil {
-			glog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
-		}
 		glog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
 		if uptime > hc.coolDownTime {
 			glog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
@@ -89,19 +98,10 @@
 
 // logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
 // Returns true if the pattern is empty or does not exist logThresholdCount times since start of service, false otherwise.
-func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
+func logPatternHealthCheck(service, logStartTime string, logPatternsToCheck map[string]int) (bool, error) {
 	if len(logPatternsToCheck) == 0 {
 		return true, nil
 	}
-	uptimeFunc := getUptimeFunc(service)
-	uptime, err := uptimeFunc()
-	if err != nil {
-		return true, err
-	}
-	logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
-	if err != nil {
-		return true, err
-	}
 	for pattern, count := range logPatternsToCheck {
 		healthy, err := checkForPattern(service, logStartTime, pattern, count)
 		if err != nil || !healthy {
diff --git a/pkg/healthchecker/types/types.go b/pkg/healthchecker/types/types.go
index a8585ff8c..02523f6b3 100644
--- a/pkg/healthchecker/types/types.go
+++ b/pkg/healthchecker/types/types.go
@@ -25,6 +25,7 @@ import (
 )
 
 const (
+	DefaultLoopBackTime       = 0 * time.Minute
 	DefaultCoolDownTime       = 2 * time.Minute
 	DefaultHealthCheckTimeout = 10 * time.Second
 	CmdTimeout                = 10 * time.Second