@@ -33,6 +33,7 @@ import (
33
33
34
34
type healthChecker struct {
35
35
component string
36
+ systemdService string
36
37
enableRepair bool
37
38
healthCheckFunc func () (bool , error )
38
39
// The repair is "best-effort" and ignores the error from the underlying actions.
@@ -42,6 +43,7 @@ type healthChecker struct {
42
43
crictlPath string
43
44
healthCheckTimeout time.Duration
44
45
coolDownTime time.Duration
46
+ logPatternsToCheck map [string ]int
45
47
}
46
48
47
49
// NewHealthChecker returns a new health checker configured with the given options.
@@ -52,6 +54,8 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
52
54
crictlPath : hco .CriCtlPath ,
53
55
healthCheckTimeout : hco .HealthCheckTimeout ,
54
56
coolDownTime : hco .CoolDownTime ,
57
+ systemdService : hco .SystemdService ,
58
+ logPatternsToCheck : hco .LogPatterns .GetLogPatternCountMap (),
55
59
}
56
60
hc .healthCheckFunc = getHealthCheckFunc (hco )
57
61
hc .repairFunc = getRepairFunc (hco )
@@ -106,7 +110,14 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
106
110
func getHealthCheckFunc (hco * options.HealthCheckerOptions ) func () (bool , error ) {
107
111
switch hco .Component {
108
112
case types .KubeletComponent :
109
- return getKubeletHealthCheckFunc (hco .HealthCheckTimeout )
113
+ return func () (bool , error ) {
114
+ httpClient := http.Client {Timeout : hco .HealthCheckTimeout }
115
+ response , err := httpClient .Get (types .KubeletHealthCheckEndpoint )
116
+ if err != nil || response .StatusCode != http .StatusOK {
117
+ return false , nil
118
+ }
119
+ return true , nil
120
+ }
110
121
case types .DockerComponent :
111
122
return func () (bool , error ) {
112
123
if _ , err := execCommand (hco .HealthCheckTimeout , "docker" , "ps" ); err != nil {
@@ -132,7 +143,11 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
132
143
if err != nil {
133
144
return healthy , err
134
145
}
135
- if healthy {
146
+ logPatternHealthy , err := logPatternHealthCheck (hc .systemdService , hc .logPatternsToCheck )
147
+ if err != nil {
148
+ return logPatternHealthy , err
149
+ }
150
+ if healthy && logPatternHealthy {
136
151
return true , nil
137
152
}
138
153
// The service is unhealthy.
@@ -165,36 +180,38 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
165
180
return strings .TrimSuffix (string (out ), "\n " ), nil
166
181
}
167
182
168
- // kubeletHttpHealthCheck checks the health api response on kubelet.
169
- // Returns true for healthy, false otherwise.
170
- func kubeletHttpHealthCheck (healthCheckTimeout time.Duration ) bool {
171
- httpClient := http.Client {Timeout : healthCheckTimeout }
172
- response , err := httpClient .Get (types .KubeletHealthCheckEndpoint )
173
- if err != nil || response .StatusCode != http .StatusOK {
174
- glog .Info ("kubelet failed http health check" )
175
- return false
183
+ // logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
184
+ // Returns true if no patterns are provided or if no pattern occurs at least its threshold count of times since start of service, false otherwise.
185
+ func logPatternHealthCheck (service string , logPatternsToCheck map [string ]int ) (bool , error ) {
186
+ if len (logPatternsToCheck ) == 0 {
187
+ return true , nil
176
188
}
177
- return true
178
- }
179
-
180
- // kubeletConnectionHealthCheck checks for the kubelet-apiserver connection issue
181
- // by checking repeated occurrences of log "use of closed network connection" in kubelet logs.
182
- // Returns true if the pattern does not exist 10 times since start of service or the last 10 min, false otherwise.
183
- func kubeletConnectionHealthCheck () (bool , error ) {
184
- kubeletUptimeFunc := getUptimeFunc (types .KubeletComponent )
185
- uptime , err := kubeletUptimeFunc ()
189
+ uptimeFunc := getUptimeFunc (service )
190
+ uptime , err := uptimeFunc ()
186
191
if err != nil {
187
192
return true , err
188
193
}
189
194
logStartTime := time .Now ().Add (- uptime ).Format (types .LogParsingTimeLayout )
190
195
if err != nil {
191
196
return true , err
192
197
}
198
+ for pattern , count := range logPatternsToCheck {
199
+ healthy , err := checkForPattern (service , logStartTime , pattern , count )
200
+ if err != nil || ! healthy {
201
+ return healthy , err
202
+ }
203
+ }
204
+ return true , nil
205
+ }
206
+
207
+ // checkForPattern returns (true, nil) if logPattern occurs less than logCountThreshold number of times since last
208
+ // service restart. (false, nil) otherwise.
209
+ func checkForPattern (service , logStartTime , logPattern string , logCountThreshold int ) (bool , error ) {
193
210
out , err := execCommand (types .CmdTimeout , "/bin/sh" , "-c" ,
194
- // Query kubelet logs since the logStartTime
195
- `journalctl --unit kubelet --since "` + logStartTime +
196
- // Grep the pattern for lost connection
197
- `" | grep -i "` + types . KubeletClosedConnectionLogPattern +
211
+ // Query service logs since the logStartTime
212
+ `journalctl --unit "` + service + `" --since "`+ logStartTime +
213
+ // Grep the pattern
214
+ `" | grep -i "` + logPattern +
198
215
// Get the count of occurrences
199
216
`" | wc -l` )
200
217
if err != nil {
@@ -204,26 +221,9 @@ func kubeletConnectionHealthCheck() (bool, error) {
204
221
if err != nil {
205
222
return true , err
206
223
}
207
- if occurrences >= types . KubeletClosedConnectionLogPatternThresholdCount {
208
- glog .Infof ("kubelet failed apiserver connection check, log pattern occurrences: %v" , occurrences )
224
+ if occurrences >= logCountThreshold {
225
+ glog .Infof ("%s failed log pattern check, %s occurrences: %v" , service , logPattern , occurrences )
209
226
return false , nil
210
227
}
211
228
return true , nil
212
229
}
213
-
214
- // getKubeletHealthCheckFunc returns a function that checks for kubelet health and
215
- // return false if identified as unhealthy, true otherwise.
216
- func getKubeletHealthCheckFunc (healthCheckTimeout time.Duration ) func () (bool , error ) {
217
- return func () (bool , error ) {
218
- httpHealthy := kubeletHttpHealthCheck (healthCheckTimeout )
219
- connectionHealthy , err := kubeletConnectionHealthCheck ()
220
- // The plugin will return Unknown status code in case there is any error in
221
- // checking kubelet health.
222
- if err != nil {
223
- glog .Infof ("Error in determining apiserver connection health: %v" , err )
224
- return false , err
225
- }
226
- healthy := httpHealthy && connectionHealthy
227
- return healthy , nil
228
- }
229
- }
0 commit comments