Skip to content

Commit e349323

Browse files
authored
Merge pull request #539 from smileusd/health_check
Improve health-checker
2 parents 93badb2 + b409875 commit e349323

File tree

5 files changed

+122
-15
lines changed

5 files changed

+122
-15
lines changed

cmd/healthchecker/options/options.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ type HealthCheckerOptions struct {
4040
CriCtlPath string
4141
CriSocketPath string
4242
CoolDownTime time.Duration
43+
LoopBackTime time.Duration
4344
HealthCheckTimeout time.Duration
4445
LogPatterns types.LogPatternFlag
4546
}
@@ -63,6 +64,8 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
6364
"The path to the cri socket. Used with crictl to specify the socket path.")
6465
fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
6566
"The duration to wait for the service to be up before attempting repair.")
67+
fs.DurationVar(&hco.LoopBackTime, "loopback-time", types.DefaultLoopBackTime,
68+
"The duration to loop back, if it is 0, health-check will check from start time.")
6669
fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
6770
"The time to wait before marking the component as unhealthy.")
6871
fs.Var(&hco.LogPatterns, "log-pattern",

config/health-checker-kubelet.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
"--component=kubelet",
2626
"--enable-repair=true",
2727
"--cooldown-time=1m",
28+
"--loopback-time=0",
2829
"--health-check-timeout=10s"
2930
],
3031
"timeout": "3m"
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
apiVersion: apps/v1
2+
kind: DaemonSet
3+
metadata:
4+
name: node-problem-detector
5+
namespace: kube-system
6+
labels:
7+
app: node-problem-detector
8+
spec:
9+
selector:
10+
matchLabels:
11+
app: node-problem-detector
12+
template:
13+
metadata:
14+
labels:
15+
app: node-problem-detector
16+
spec:
17+
affinity:
18+
nodeAffinity:
19+
requiredDuringSchedulingIgnoredDuringExecution:
20+
nodeSelectorTerms:
21+
- matchExpressions:
22+
- key: kubernetes.io/os
23+
operator: In
24+
values:
25+
- linux
26+
containers:
27+
- name: node-problem-detector
28+
command:
29+
- /node-problem-detector
30+
- --logtostderr
31+
- --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
32+
- --config.custom-plugin-monitor=/config/health-checker-kubelet.json
33+
image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.6
34+
resources:
35+
limits:
36+
cpu: 10m
37+
memory: 80Mi
38+
requests:
39+
cpu: 10m
40+
memory: 80Mi
41+
imagePullPolicy: Always
42+
securityContext:
43+
privileged: true
44+
env:
45+
- name: NODE_NAME
46+
valueFrom:
47+
fieldRef:
48+
fieldPath: spec.nodeName
49+
volumeMounts:
50+
- name: log
51+
mountPath: /var/log
52+
readOnly: true
53+
- name: kmsg
54+
mountPath: /dev/kmsg
55+
readOnly: true
56+
# Make sure node problem detector is in the same timezone
57+
# with the host.
58+
- name: localtime
59+
mountPath: /etc/localtime
60+
readOnly: true
61+
- name: config
62+
mountPath: /config
63+
readOnly: true
64+
- mountPath: /etc/machine-id
65+
name: machine-id
66+
readOnly: true
67+
- mountPath: /run/systemd/system
68+
name: systemd
69+
- mountPath: /var/run/dbus/
70+
name: dbus
71+
mountPropagation: Bidirectional
72+
volumes:
73+
- name: log
74+
# Config `log` to your system log directory
75+
hostPath:
76+
path: /var/log/
77+
- name: kmsg
78+
hostPath:
79+
path: /dev/kmsg
80+
- name: localtime
81+
hostPath:
82+
path: /etc/localtime
83+
- name: config
84+
configMap:
85+
name: node-problem-detector-config
86+
items:
87+
- key: kernel-monitor.json
88+
path: kernel-monitor.json
89+
- key: docker-monitor.json
90+
path: docker-monitor.json
91+
- name: machine-id
92+
hostPath:
93+
path: /etc/machine-id
94+
type: "File"
95+
- name: systemd
96+
hostPath:
97+
path: /run/systemd/system/
98+
type: ""
99+
- name: dbus
100+
hostPath:
101+
path: /var/run/dbus/
102+
type: ""

pkg/healthchecker/health_checker.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ type healthChecker struct {
3636
crictlPath string
3737
healthCheckTimeout time.Duration
3838
coolDownTime time.Duration
39+
loopBackTime time.Duration
3940
logPatternsToCheck map[string]int
4041
}
4142

@@ -48,6 +49,7 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
4849
healthCheckTimeout: hco.HealthCheckTimeout,
4950
coolDownTime: hco.CoolDownTime,
5051
service: hco.Service,
52+
loopBackTime: hco.LoopBackTime,
5153
logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
5254
}
5355
hc.healthCheckFunc = getHealthCheckFunc(hco)
@@ -59,11 +61,22 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
5961
// CheckHealth checks for the health of the component and tries to repair if enabled.
6062
// Returns true if healthy, false otherwise.
6163
func (hc *healthChecker) CheckHealth() (bool, error) {
64+
var logStartTime string
6265
healthy, err := hc.healthCheckFunc()
6366
if err != nil {
6467
return healthy, err
6568
}
66-
logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.logPatternsToCheck)
69+
uptime, err := hc.uptimeFunc()
70+
if err != nil {
71+
glog.Warningf("Failed to get the uptime: %+v", err)
72+
return true, err
73+
}
74+
if hc.loopBackTime > 0 && uptime > hc.loopBackTime {
75+
logStartTime = time.Now().Add(-hc.loopBackTime).Format(types.LogParsingTimeLayout)
76+
} else {
77+
logStartTime = time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
78+
}
79+
logPatternHealthy, err := logPatternHealthCheck(hc.service, logStartTime, hc.logPatternsToCheck)
6780
if err != nil {
6881
return logPatternHealthy, err
6982
}
@@ -74,10 +87,6 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
7487
// Attempt repair based on flag.
7588
if hc.enableRepair {
7689
// Repair only if the service has been up for longer than the cool down period.
77-
uptime, err := hc.uptimeFunc()
78-
if err != nil {
79-
glog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
80-
}
8190
glog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
8291
if uptime > hc.coolDownTime {
8392
glog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
@@ -89,19 +98,10 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
8998

9099
// logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
91100
// Returns true if no patterns are provided or if no pattern occurs its threshold count of times in the service logs since logStartTime, false otherwise.
92-
func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
101+
func logPatternHealthCheck(service, logStartTime string, logPatternsToCheck map[string]int) (bool, error) {
93102
if len(logPatternsToCheck) == 0 {
94103
return true, nil
95104
}
96-
uptimeFunc := getUptimeFunc(service)
97-
uptime, err := uptimeFunc()
98-
if err != nil {
99-
return true, err
100-
}
101-
logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
102-
if err != nil {
103-
return true, err
104-
}
105105
for pattern, count := range logPatternsToCheck {
106106
healthy, err := checkForPattern(service, logStartTime, pattern, count)
107107
if err != nil || !healthy {

pkg/healthchecker/types/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
)
2626

2727
const (
28+
DefaultLoopBackTime = 0 * time.Minute
2829
DefaultCoolDownTime = 2 * time.Minute
2930
DefaultHealthCheckTimeout = 10 * time.Second
3031
CmdTimeout = 10 * time.Second

0 commit comments

Comments
 (0)