improvement health-checker #539

Merged · 2 commits · Jun 25, 2021

3 changes: 3 additions & 0 deletions cmd/healthchecker/options/options.go
@@ -40,6 +40,7 @@ type HealthCheckerOptions struct {
 	CriCtlPath         string
 	CriSocketPath      string
 	CoolDownTime       time.Duration
+	LoopBackTime       time.Duration
 	HealthCheckTimeout time.Duration
 	LogPatterns        types.LogPatternFlag
 }
@@ -63,6 +64,8 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
"The path to the cri socket. Used with crictl to specify the socket path.")
fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
"The duration to wait for the service to be up before attempting repair.")
fs.DurationVar(&hco.LoopBackTime, "loopback-time", types.DefaultLoopBackTime,
"The duration to loop back, if it is 0, health-check will check from start time.")
fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
"The time to wait before marking the component as unhealthy.")
fs.Var(&hco.LogPatterns, "log-pattern",
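
For context, a minimal self-contained sketch of how a --loopback-time duration flag like the one above is parsed with spf13/pflag (the same flag library the AddFlags signature uses). The standalone flag set is illustrative only, not code from this PR:

	package main

	import (
		"fmt"
		"os"
		"time"

		"github.com/spf13/pflag"
	)

	func main() {
		// Illustrative stand-in for the HealthCheckerOptions.AddFlags wiring.
		fs := pflag.NewFlagSet("health-checker", pflag.ExitOnError)
		var loopBackTime time.Duration
		fs.DurationVar(&loopBackTime, "loopback-time", 0,
			"The duration to loop back. If 0, scan logs from the service start time.")
		fs.Parse(os.Args[1:])
		fmt.Println("loopback window:", loopBackTime)
	}
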
1 change: 1 addition & 0 deletions config/health-checker-kubelet.json
@@ -25,6 +25,7 @@
"--component=kubelet",
"--enable-repair=true",
"--cooldown-time=1m",
"--loopback-time=0",
"--health-check-timeout=10s"
],
"timeout": "3m"
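
A quick check (not from the PR) that the flag values used in this plugin config parse as Go durations, including the bare "0" passed to --loopback-time, which is the sentinel for "scan from the service start time":

	package main

	import (
		"fmt"
		"time"
	)

	func main() {
		// "0" is a valid duration string and parses to the zero duration.
		for _, v := range []string{"1m", "0", "10s", "3m"} {
			d, err := time.ParseDuration(v)
			fmt.Println(v, "->", d, err)
		}
	}
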
102 changes: 102 additions & 0 deletions deployment/node-problem-detector-healthchecker.yaml
@@ -0,0 +1,102 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-problem-detector
  namespace: kube-system
  labels:
    app: node-problem-detector
spec:
  selector:
    matchLabels:
      app: node-problem-detector
  template:
    metadata:
      labels:
        app: node-problem-detector
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: kubernetes.io/os
                operator: In
                values:
                - linux
      containers:
      - name: node-problem-detector
        command:
        - /node-problem-detector
        - --logtostderr
        - --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
        - --config.custom-plugin-monitor=/config/health-checker-kubelet.json
        image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.6
        resources:
          limits:
            cpu: 10m
            memory: 80Mi
          requests:
            cpu: 10m
            memory: 80Mi
        imagePullPolicy: Always
        securityContext:
          privileged: true
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        volumeMounts:
        - name: log
          mountPath: /var/log
          readOnly: true
        - name: kmsg
          mountPath: /dev/kmsg
          readOnly: true
        # Make sure node problem detector is in the same timezone
        # with the host.
        - name: localtime
          mountPath: /etc/localtime
          readOnly: true
        - name: config
          mountPath: /config
          readOnly: true
        - mountPath: /etc/machine-id
          name: machine-id
          readOnly: true
        - mountPath: /run/systemd/system
          name: systemd

Contributor: Were you able to run the deployment and get the health signals without errors using this configuration?

Contributor Author: Yes, this deployment has been running in our production environment.

Contributor: Ack.

        - mountPath: /var/run/dbus/
          name: dbus
          mountPropagation: Bidirectional
      volumes:
      - name: log
        # Config `log` to your system log directory
        hostPath:
          path: /var/log/
      - name: kmsg
        hostPath:
          path: /dev/kmsg
      - name: localtime
        hostPath:
          path: /etc/localtime
      - name: config
        configMap:
          name: node-problem-detector-config
          items:
          - key: kernel-monitor.json
            path: kernel-monitor.json
          - key: docker-monitor.json
            path: docker-monitor.json
      - name: machine-id
        hostPath:
          path: /etc/machine-id
          type: "File"
      - name: systemd
        hostPath:
          path: /run/systemd/system/
          type: ""
      - name: dbus
        hostPath:
          path: /var/run/dbus/
          type: ""
30 changes: 15 additions & 15 deletions pkg/healthchecker/health_checker.go
@@ -36,6 +36,7 @@ type healthChecker struct {
 	crictlPath         string
 	healthCheckTimeout time.Duration
 	coolDownTime       time.Duration
+	loopBackTime       time.Duration
 	logPatternsToCheck map[string]int
 }

@@ -48,6 +49,7 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, error) {
 		healthCheckTimeout: hco.HealthCheckTimeout,
 		coolDownTime:       hco.CoolDownTime,
 		service:            hco.Service,
+		loopBackTime:       hco.LoopBackTime,
 		logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
 	}
 	hc.healthCheckFunc = getHealthCheckFunc(hco)
@@ -59,11 +61,22 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, error) {
 // CheckHealth checks for the health of the component and tries to repair if enabled.
 // Returns true if healthy, false otherwise.
 func (hc *healthChecker) CheckHealth() (bool, error) {
+	var logStartTime string
 	healthy, err := hc.healthCheckFunc()
 	if err != nil {
 		return healthy, err
 	}
-	logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.logPatternsToCheck)
+	uptime, err := hc.uptimeFunc()
+	if err != nil {
+		glog.Warningf("Failed to get the uptime: %+v", err)
+		return true, err
+	}
+	if hc.loopBackTime > 0 && uptime > hc.loopBackTime {
+		logStartTime = time.Now().Add(-hc.loopBackTime).Format(types.LogParsingTimeLayout)
+	} else {
+		logStartTime = time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
+	}
+	logPatternHealthy, err := logPatternHealthCheck(hc.service, logStartTime, hc.logPatternsToCheck)
 	if err != nil {
 		return logPatternHealthy, err
 	}
@@ -74,10 +87,6 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
 	// Attempt repair based on flag.
 	if hc.enableRepair {
 		// repair if the service has been up for the cool down period.
-		uptime, err := hc.uptimeFunc()
-		if err != nil {
-			glog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
-		}
 		glog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
 		if uptime > hc.coolDownTime {
 			glog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
@@ -89,19 +98,10 @@ func (hc *healthChecker) CheckHealth() (bool, error) {

 // logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
 // Returns true if the pattern is empty or does not exist logThresholdCount times since start of service, false otherwise.
-func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
+func logPatternHealthCheck(service, logStartTime string, logPatternsToCheck map[string]int) (bool, error) {
 	if len(logPatternsToCheck) == 0 {
 		return true, nil
 	}
-	uptimeFunc := getUptimeFunc(service)
-	uptime, err := uptimeFunc()
-	if err != nil {
-		return true, err
-	}
-	logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
-	if err != nil {
-		return true, err
-	}
 	for pattern, count := range logPatternsToCheck {
 		healthy, err := checkForPattern(service, logStartTime, pattern, count)
 		if err != nil || !healthy {
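
To make the new control flow easier to follow: CheckHealth now computes the log scan start once and caps it at the loopback window, and logPatternHealthCheck simply receives that start time instead of recomputing uptime itself. A standalone sketch of the selection logic (the helper name logWindowStart is mine, not the PR's):

	package main

	import (
		"fmt"
		"time"
	)

	// logWindowStart mirrors the selection added to CheckHealth: with a
	// positive loopback window shorter than the uptime, scan only the
	// most recent loopBackTime of logs; otherwise scan from the service
	// start time (now - uptime).
	func logWindowStart(now time.Time, uptime, loopBackTime time.Duration) time.Time {
		if loopBackTime > 0 && uptime > loopBackTime {
			return now.Add(-loopBackTime)
		}
		return now.Add(-uptime)
	}

	func main() {
		now := time.Now()
		// kubelet up for 3h with --loopback-time=10m: scan the last 10 minutes.
		fmt.Println(logWindowStart(now, 3*time.Hour, 10*time.Minute))
		// --loopback-time=0 (the default): scan since the service started.
		fmt.Println(logWindowStart(now, 3*time.Hour, 0))
	}
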
1 change: 1 addition & 0 deletions pkg/healthchecker/types/types.go
@@ -25,6 +25,7 @@
 )
 
 const (
+	DefaultLoopBackTime       = 0 * time.Minute
 	DefaultCoolDownTime       = 2 * time.Minute
 	DefaultHealthCheckTimeout = 10 * time.Second
 	CmdTimeout                = 10 * time.Second
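
The docstring on logPatternHealthCheck says a component is unhealthy once a pattern occurs logThresholdCount times in the scanned window. A rough, hypothetical stand-in for checkForPattern that mirrors only that counting rule (the real function reads the service logs rather than an in-memory string):

	package main

	import (
		"bufio"
		"fmt"
		"strings"
	)

	// patternHealthy reports false (unhealthy) once pattern occurs at
	// least threshold times in the scanned logs.
	func patternHealthy(logs, pattern string, threshold int) bool {
		count := 0
		sc := bufio.NewScanner(strings.NewReader(logs))
		for sc.Scan() {
			if strings.Contains(sc.Text(), pattern) {
				count++
			}
		}
		return count < threshold
	}

	func main() {
		logs := "kubelet started\nfailed to sync pod\nfailed to sync pod\n"
		fmt.Println(patternHealthy(logs, "failed to sync pod", 3)) // true: two hits, below threshold
		fmt.Println(patternHealthy(logs, "failed to sync pod", 2)) // false: threshold reached
	}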