Merge pull request #539 from smileusd/health_check

k8s-ci-robot · web-flow · commit e349323507a8 · 2021-06-25T09:48:45.000-07:00
Improve health-checker
diff --git a/cmd/healthchecker/options/options.go b/cmd/healthchecker/options/options.go
@@ -40,6 +40,7 @@ type HealthCheckerOptions struct {
 	CriCtlPath         string
 	CriSocketPath      string
 	CoolDownTime       time.Duration
+	LoopBackTime       time.Duration
 	HealthCheckTimeout time.Duration
 	LogPatterns        types.LogPatternFlag
 }
@@ -63,6 +64,8 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
 		"The path to the cri socket. Used with crictl to specify the socket path.")
 	fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
 		"The duration to wait for the service to be up before attempting repair.")
+	fs.DurationVar(&hco.LoopBackTime, "loopback-time", types.DefaultLoopBackTime,
+		"The duration to loop back, if it is 0, health-check will check from start time.")
 	fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
 		"The time to wait before marking the component as unhealthy.")
 	fs.Var(&hco.LogPatterns, "log-pattern",
diff --git a/config/health-checker-kubelet.json b/config/health-checker-kubelet.json
@@ -25,6 +25,7 @@
         "--component=kubelet",
         "--enable-repair=true",
         "--cooldown-time=1m",
+        "--loopback-time=0",
         "--health-check-timeout=10s"
       ],
       "timeout": "3m"
diff --git a/deployment/node-problem-detector-healthchecker.yaml b/deployment/node-problem-detector-healthchecker.yaml
@@ -0,0 +1,102 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: node-problem-detector
+  namespace: kube-system
+  labels:
+    app: node-problem-detector
+spec:
+  selector:
+    matchLabels:
+      app: node-problem-detector
+  template:
+    metadata:
+      labels:
+        app: node-problem-detector
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/os
+                    operator: In
+                    values:
+                      - linux
+      containers:
+      - name: node-problem-detector
+        command:
+        - /node-problem-detector
+        - --logtostderr
+        - --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
+        - --config.custom-plugin-monitor=/config/health-checker-kubelet.json
+        image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.6
+        resources:
+          limits:
+            cpu: 10m
+            memory: 80Mi
+          requests:
+            cpu: 10m
+            memory: 80Mi
+        imagePullPolicy: Always
+        securityContext:
+          privileged: true
+        env:
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        volumeMounts:
+        - name: log
+          mountPath: /var/log
+          readOnly: true
+        - name: kmsg
+          mountPath: /dev/kmsg
+          readOnly: true
+        # Make sure node problem detector is in the same timezone
+        # as the host.
+        - name: localtime
+          mountPath: /etc/localtime
+          readOnly: true
+        - name: config
+          mountPath: /config
+          readOnly: true
+        - mountPath: /etc/machine-id
+          name: machine-id
+          readOnly: true
+        - mountPath: /run/systemd/system
+          name: systemd
+        - mountPath: /var/run/dbus/
+          name: dbus
+          mountPropagation: Bidirectional
+      volumes:
+      - name: log
+        # Config `log` to your system log directory
+        hostPath:
+          path: /var/log/
+      - name: kmsg
+        hostPath:
+          path: /dev/kmsg
+      - name: localtime
+        hostPath:
+          path: /etc/localtime
+      - name: config
+        configMap:
+          name: node-problem-detector-config
+          items:
+          - key: kernel-monitor.json
+            path: kernel-monitor.json
+          - key: docker-monitor.json
+            path: docker-monitor.json
+      - name: machine-id
+        hostPath:
+          path: /etc/machine-id
+          type: "File"
+      - name: systemd
+        hostPath:
+          path: /run/systemd/system/
+          type: ""
+      - name: dbus
+        hostPath:
+          path: /var/run/dbus/
+          type: ""
diff --git a/pkg/healthchecker/health_checker.go b/pkg/healthchecker/health_checker.go
@@ -36,6 +36,7 @@ type healthChecker struct {
 	crictlPath         string
 	healthCheckTimeout time.Duration
 	coolDownTime       time.Duration
+	loopBackTime       time.Duration
 	logPatternsToCheck map[string]int
 }
 
@@ -48,6 +49,7 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
 		healthCheckTimeout: hco.HealthCheckTimeout,
 		coolDownTime:       hco.CoolDownTime,
 		service:            hco.Service,
+		loopBackTime:       hco.LoopBackTime,
 		logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
 	}
 	hc.healthCheckFunc = getHealthCheckFunc(hco)
@@ -59,11 +61,22 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
 // CheckHealth checks for the health of the component and tries to repair if enabled.
 // Returns true if healthy, false otherwise.
 func (hc *healthChecker) CheckHealth() (bool, error) {
+	var logStartTime string
 	healthy, err := hc.healthCheckFunc()
 	if err != nil {
 		return healthy, err
 	}
-	logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.logPatternsToCheck)
+	uptime, err := hc.uptimeFunc()
+	if err != nil {
+		glog.Warningf("Failed to get the uptime: %+v", err)
+		return true, err
+	}
+	if hc.loopBackTime > 0 && uptime > hc.loopBackTime {
+		logStartTime = time.Now().Add(-hc.loopBackTime).Format(types.LogParsingTimeLayout)
+	} else {
+		logStartTime = time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
+	}
+	logPatternHealthy, err := logPatternHealthCheck(hc.service, logStartTime, hc.logPatternsToCheck)
 	if err != nil {
 		return logPatternHealthy, err
 	}
@@ -74,10 +87,6 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
 	// Attempt repair based on flag.
 	if hc.enableRepair {
 		// repair if the service has been up for the cool down period.
-		uptime, err := hc.uptimeFunc()
-		if err != nil {
-			glog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
-		}
 		glog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
 		if uptime > hc.coolDownTime {
 			glog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
@@ -89,19 +98,10 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
 
 // logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
 // Returns true if no patterns are provided or no pattern occurs its threshold count of times since logStartTime, false otherwise.
-func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
+func logPatternHealthCheck(service, logStartTime string, logPatternsToCheck map[string]int) (bool, error) {
 	if len(logPatternsToCheck) == 0 {
 		return true, nil
 	}
-	uptimeFunc := getUptimeFunc(service)
-	uptime, err := uptimeFunc()
-	if err != nil {
-		return true, err
-	}
-	logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
-	if err != nil {
-		return true, err
-	}
 	for pattern, count := range logPatternsToCheck {
 		healthy, err := checkForPattern(service, logStartTime, pattern, count)
 		if err != nil || !healthy {
diff --git a/pkg/healthchecker/types/types.go b/pkg/healthchecker/types/types.go
@@ -25,6 +25,7 @@ import (
 )
 
 const (
+	DefaultLoopBackTime       = 0 * time.Minute
 	DefaultCoolDownTime       = 2 * time.Minute
 	DefaultHealthCheckTimeout = 10 * time.Second
 	CmdTimeout                = 10 * time.Second

Original file line number	Diff line number	Diff line change
`@@ -25,6 +25,7 @@ import (`
`25`	`25`	`)`
`26`	`26`
`27`	`27`	`const (`
	`28`	`+ DefaultLoopBackTime = 0 * time.Minute`
`28`	`29`	`DefaultCoolDownTime = 2 * time.Minute`
`29`	`30`	`DefaultHealthCheckTimeout = 10 * time.Second`
`30`	`31`	`CmdTimeout = 10 * time.Second`