Skip to content

Commit 68c5fc6

Browse files
committed
Add loopback-time option to reduce the time taken by the journalctl call
1 parent cb8534b commit 68c5fc6

File tree

4 files changed

+19
-15
lines changed

4 files changed

+19
-15
lines changed

cmd/healthchecker/options/options.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ type HealthCheckerOptions struct {
3939
CriCtlPath string
4040
CriSocketPath string
4141
CoolDownTime time.Duration
42+
LoopBackTime time.Duration
4243
HealthCheckTimeout time.Duration
4344
LogPatterns types.LogPatternFlag
4445
}
@@ -56,6 +57,8 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
5657
"The path to the cri socket. Used with crictl to specify the socket path.")
5758
fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
5859
"The duration to wait for the service to be up before attempting repair.")
60+
fs.DurationVar(&hco.LoopBackTime, "loopback-time", types.DefaultLoopBackTime,
61+
"The duration to loop back, if it is 0, health-check will check from start time.")
5962
fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
6063
"The time to wait before marking the component as unhealthy.")
6164
fs.Var(&hco.LogPatterns, "log-pattern",

config/health-checker-kubelet.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
"--component=kubelet",
2626
"--enable-repair=true",
2727
"--cooldown-time=1m",
28+
"--loopback-time=5m",
2829
"--health-check-timeout=10s"
2930
],
3031
"timeout": "3m"

pkg/healthchecker/health_checker.go

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ type healthChecker struct {
4343
crictlPath string
4444
healthCheckTimeout time.Duration
4545
coolDownTime time.Duration
46+
loopBackTime time.Duration
4647
logPatternsToCheck map[string]int
4748
}
4849

@@ -54,6 +55,7 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
5455
crictlPath: hco.CriCtlPath,
5556
healthCheckTimeout: hco.HealthCheckTimeout,
5657
coolDownTime: hco.CoolDownTime,
58+
loopBackTime: hco.LoopBackTime,
5759
systemdService: hco.SystemdService,
5860
logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
5961
}
@@ -139,11 +141,21 @@ func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error)
139141
// CheckHealth checks for the health of the component and tries to repair if enabled.
140142
// Returns true if healthy, false otherwise.
141143
func (hc *healthChecker) CheckHealth() (bool, error) {
144+
var logStartTime string
142145
healthy, err := hc.healthCheckFunc()
143146
if err != nil {
144147
return healthy, err
145148
}
146-
logPatternHealthy, err := logPatternHealthCheck(hc.systemdService, hc.logPatternsToCheck)
149+
uptime, err := hc.uptimeFunc()
150+
if err != nil {
151+
return false, err
152+
}
153+
if hc.loopBackTime > 0 && uptime > hc.loopBackTime {
154+
logStartTime = time.Now().Add(-hc.loopBackTime).Format(types.LogParsingTimeLayout)
155+
} else {
156+
logStartTime = time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
157+
}
158+
logPatternHealthy, err := logPatternHealthCheck(hc.systemdService, logStartTime, hc.logPatternsToCheck)
147159
if err != nil {
148160
return logPatternHealthy, err
149161
}
@@ -154,10 +166,6 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
154166
// Attempt repair based on flag.
155167
if hc.enableRepair {
156168
// repair if the service has been up for the cool down period.
157-
uptime, err := hc.uptimeFunc()
158-
if err != nil {
159-
glog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
160-
}
161169
glog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
162170
if uptime > hc.coolDownTime {
163171
glog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
@@ -182,19 +190,10 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
182190

183191
// logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
184192
// Returns true if logPatternsToCheck is empty or none of the patterns exists its threshold count of times in the logs since logStartTime, false otherwise.
185-
func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
193+
func logPatternHealthCheck(service, logStartTime string, logPatternsToCheck map[string]int) (bool, error) {
186194
if len(logPatternsToCheck) == 0 {
187195
return true, nil
188196
}
189-
uptimeFunc := getUptimeFunc(service)
190-
uptime, err := uptimeFunc()
191-
if err != nil {
192-
return true, err
193-
}
194-
logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
195-
if err != nil {
196-
return true, err
197-
}
198197
for pattern, count := range logPatternsToCheck {
199198
healthy, err := checkForPattern(service, logStartTime, pattern, count)
200199
if err != nil || !healthy {

pkg/healthchecker/types/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
)
2626

2727
const (
28+
DefaultLoopBackTime = 0 * time.Minute
2829
DefaultCoolDownTime = 2 * time.Minute
2930
DefaultHealthCheckTimeout = 10 * time.Second
3031
CmdTimeout = 10 * time.Second

0 commit comments

Comments
 (0)