Skip to content

Commit 100f2bf

Browse files
committed
Make log pattern check configurable in health checker
1 parent fc4f167 commit 100f2bf

File tree

4 files changed

+206
-46
lines changed

4 files changed

+206
-46
lines changed

cmd/healthchecker/options/options.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ type HealthCheckerOptions struct {
4040
CriSocketPath string
4141
CoolDownTime time.Duration
4242
HealthCheckTimeout time.Duration
43+
LogPatterns types.LogPatternFlag
4344
}
4445

4546
// AddFlags adds health checker command line options to pflag.
@@ -57,6 +58,8 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
5758
"The duration to wait for the service to be up before attempting repair.")
5859
fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
5960
"The time to wait before marking the component as unhealthy.")
61+
fs.Var(&hco.LogPatterns, "log-pattern",
62+
"The log pattern to look for in service journald logs. The format for flag value <failureThresholdCount>:<logPattern>")
6063
}
6164

6265
// IsValid validates health checker command line options.

pkg/healthchecker/health_checker.go

Lines changed: 42 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333

3434
type healthChecker struct {
3535
component string
36+
systemdService string
3637
enableRepair bool
3738
healthCheckFunc func() (bool, error)
3839
// The repair is "best-effort" and ignores the error from the underlying actions.
@@ -42,6 +43,7 @@ type healthChecker struct {
4243
crictlPath string
4344
healthCheckTimeout time.Duration
4445
coolDownTime time.Duration
46+
logPatternsToCheck map[string]int
4547
}
4648

4749
// NewHealthChecker returns a new health checker configured with the given options.
@@ -52,6 +54,8 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
5254
crictlPath: hco.CriCtlPath,
5355
healthCheckTimeout: hco.HealthCheckTimeout,
5456
coolDownTime: hco.CoolDownTime,
57+
systemdService: hco.SystemdService,
58+
logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
5559
}
5660
hc.healthCheckFunc = getHealthCheckFunc(hco)
5761
hc.repairFunc = getRepairFunc(hco)
@@ -106,7 +110,14 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
106110
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
107111
switch hco.Component {
108112
case types.KubeletComponent:
109-
return getKubeletHealthCheckFunc(hco.HealthCheckTimeout)
113+
return func() (bool, error) {
	// Kubelet is considered healthy only when its healthz endpoint answers
	// with 200 OK within the configured timeout. Request failures are
	// reported as "unhealthy" rather than as an error.
	httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
	response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
	if err != nil {
		return false, nil
	}
	// The body must be closed on every non-error response, otherwise the
	// underlying connection is leaked on each health check.
	defer response.Body.Close()
	if response.StatusCode != http.StatusOK {
		return false, nil
	}
	return true, nil
}
110121
case types.DockerComponent:
111122
return func() (bool, error) {
112123
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
@@ -132,7 +143,11 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
132143
if err != nil {
133144
return healthy, err
134145
}
135-
if healthy {
146+
logPatternHealthy, err := logPatternHealthCheck(hc.systemdService, hc.logPatternsToCheck)
147+
if err != nil {
148+
return logPatternHealthy, err
149+
}
150+
if healthy && logPatternHealthy {
136151
return true, nil
137152
}
138153
// The service is unhealthy.
@@ -165,36 +180,38 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
165180
return strings.TrimSuffix(string(out), "\n"), nil
166181
}
167182

168-
// kubeletHttpHealthCheck checks the health api response on kubelet.
169-
// Returns true for healthy, false otherwise.
170-
func kubeletHttpHealthCheck(healthCheckTimeout time.Duration) bool {
171-
httpClient := http.Client{Timeout: healthCheckTimeout}
172-
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
173-
if err != nil || response.StatusCode != http.StatusOK {
174-
glog.Info("kubelet failed http health check")
175-
return false
183+
// logPatternHealthCheck checks the logs of the given service for occurrences of each configured log pattern.
184+
// Returns true if no patterns are configured or if no pattern has reached its threshold count since the service started, false otherwise. If the service uptime cannot be determined it returns (true, err).
185+
func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
186+
if len(logPatternsToCheck) == 0 {
187+
return true, nil
176188
}
177-
return true
178-
}
179-
180-
// kubeletConnectionHealthCheck checks for the kubelet-apiserver connection issue
181-
// by checking repeated occurrences of log "use of closed network connection" in kubelet logs.
182-
// Returns true if the pattern does not exist 10 times since start of service or the last 10 min, false otherwise.
183-
func kubeletConnectionHealthCheck() (bool, error) {
184-
kubeletUptimeFunc := getUptimeFunc(types.KubeletComponent)
185-
uptime, err := kubeletUptimeFunc()
189+
uptimeFunc := getUptimeFunc(service)
190+
uptime, err := uptimeFunc()
186191
if err != nil {
187192
return true, err
188193
}
189194
// Only logs written since the service last started (now minus uptime) are inspected.
logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
190195
// NOTE(review): err has not been reassigned since the check above (Format returns only a string),
// so this branch appears to be unreachable.
if err != nil {
191196
return true, err
192197
}
198+
// Stop at the first pattern that is unhealthy or whose check fails.
for pattern, count := range logPatternsToCheck {
199+
healthy, err := checkForPattern(service, logStartTime, pattern, count)
200+
if err != nil || !healthy {
201+
return healthy, err
202+
}
203+
}
204+
return true, nil
205+
}
206+
207+
// checkForPattern returns (false, nil) if logPattern occurs at least logCountThreshold times since the last
208+
// service restart, and (true, nil) otherwise.
209+
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
193210
out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c",
194-
// Query kubelet logs since the logStartTime
195-
`journalctl --unit kubelet --since "`+logStartTime+
196-
// Grep the pattern for lost connection
197-
`" | grep -i "`+types.KubeletClosedConnectionLogPattern+
211+
// Query service logs since the logStartTime
212+
`journalctl --unit "`+service+`" --since "`+logStartTime+
213+
// Grep the pattern
214+
`" | grep -i "`+logPattern+
198215
// Get the count of occurrences
199216
`" | wc -l`)
200217
if err != nil {
@@ -204,26 +221,9 @@ func kubeletConnectionHealthCheck() (bool, error) {
204221
if err != nil {
205222
return true, err
206223
}
207-
if occurrences >= types.KubeletClosedConnectionLogPatternThresholdCount {
208-
glog.Infof("kubelet failed apiserver connection check, log pattern occurrences: %v", occurrences)
224+
if occurrences >= logCountThreshold {
225+
glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
209226
return false, nil
210227
}
211228
return true, nil
212229
}
213-
214-
// getKubeletHealthCheckFunc returns a function that checks for kubelet health and
215-
// return false if identified as unhealthy, true otherwise.
216-
func getKubeletHealthCheckFunc(healthCheckTimeout time.Duration) func() (bool, error) {
217-
return func() (bool, error) {
218-
httpHealthy := kubeletHttpHealthCheck(healthCheckTimeout)
219-
connectionHealthy, err := kubeletConnectionHealthCheck()
220-
// The plugin will return Unknown status code in case there is any error in
221-
// checking kubelet health.
222-
if err != nil {
223-
glog.Infof("Error in determining apiserver connection health: %v", err)
224-
return false, err
225-
}
226-
healthy := httpHealthy && connectionHealthy
227-
return healthy, nil
228-
}
229-
}

pkg/healthchecker/types/types.go

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,12 @@ limitations under the License.
1616

1717
package types
1818

19-
import "time"
19+
import (
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"
)
2025

2126
const (
2227
DefaultCoolDownTime = 2 * time.Minute
@@ -33,11 +38,63 @@ const (
3338
DockerComponent = "docker"
3439
ContainerdService = "containerd"
3540

36-
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
37-
KubeletClosedConnectionLogPattern = "use of closed network connection"
38-
KubeletClosedConnectionLogPatternThresholdCount = 10
41+
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
42+
43+
LogPatternFlagSeparator = ":"
3944
)
4045

4146
type HealthChecker interface {
4247
CheckHealth() (bool, error)
4348
}
49+
50+
// LogPatternFlag defines the flag for log pattern health check.
51+
// It contains a map of <log pattern> to <failure threshold for the pattern>
52+
type LogPatternFlag struct {
53+
logPatternCountMap map[string]int
54+
}
55+
56+
// String implements the String function for flag.Value interface
57+
func (lpf *LogPatternFlag) String() string {
58+
result := ""
59+
for k, v := range lpf.logPatternCountMap {
60+
if result != "" {
61+
result += " "
62+
}
63+
result += fmt.Sprintf("%v:%v", k, v)
64+
}
65+
return result
66+
}
67+
68+
// Set implements the Set function for flag.Value interface
69+
func (lpf *LogPatternFlag) Set(value string) error {
70+
if lpf.logPatternCountMap == nil {
71+
lpf.logPatternCountMap = make(map[string]int)
72+
}
73+
items := strings.Split(value, ",")
74+
for _, item := range items {
75+
val := strings.SplitN(item, LogPatternFlagSeparator, 2)
76+
if len(val) != 2 {
77+
return fmt.Errorf("invalid format of the flag value: %v", val)
78+
}
79+
countThreshold, err := strconv.Atoi(val[0])
80+
if err != nil || countThreshold == 0 {
81+
return fmt.Errorf("invalid format for the flag value: %v: %v", val, err)
82+
}
83+
pattern := val[1]
84+
if pattern == "" {
85+
return fmt.Errorf("invalid format for the flag value: %v: %v", val, err)
86+
}
87+
lpf.logPatternCountMap[pattern] = countThreshold
88+
}
89+
return nil
90+
}
91+
92+
// Type implements the Type function for flag.Value interface
93+
func (lpf *LogPatternFlag) Type() string {
94+
return "logPatternFlag"
95+
}
96+
97+
// GetLogPatternCountMap returns the stored log count map
98+
func (lpf *LogPatternFlag) GetLogPatternCountMap() map[string]int {
99+
return lpf.logPatternCountMap
100+
}

pkg/healthchecker/types/types_test.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*
2+
Copyright 2021 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package types
18+
19+
import (
20+
"reflect"
21+
"testing"
22+
23+
"github.com/stretchr/testify/assert"
24+
)
25+
26+
// TestLogPatternFlag is a table-driven test of LogPatternFlag. For each flag value it calls Set and
// checks either that Set fails, or that String and GetLogPatternCountMap return the expected results.
func TestLogPatternFlag(t *testing.T) {
27+
testCases := []struct {
28+
name string
29+
value string
30+
expectedStringVal string
31+
expectedLogPatternCountMap map[string]int
32+
expectSetError bool
33+
}{
34+
{
35+
name: "valid single flag value",
36+
value: "10:pattern1",
37+
expectedStringVal: "pattern1:10",
38+
expectedLogPatternCountMap: map[string]int{"pattern1": 10},
39+
expectSetError: false,
40+
},
41+
{
42+
name: "valid multiple flag values",
43+
value: "10:pattern1,20:pattern2",
44+
// NOTE(review): this expects a fixed entry order from String(); if String() ranges over
// the map without sorting, the order is not guaranteed and this case can be flaky.
expectedStringVal: "pattern1:10 pattern2:20",
45+
expectedLogPatternCountMap: map[string]int{"pattern1": 10, "pattern2": 20},
46+
expectSetError: false,
47+
},
48+
{
49+
name: "empty log pattern",
50+
value: "10:",
51+
expectSetError: true,
52+
},
53+
{
54+
name: "0 failure threshold count",
55+
value: "0:pattern1",
56+
expectSetError: true,
57+
},
58+
{
59+
name: "empty failure threshold count",
60+
value: ":pattern1",
61+
expectSetError: true,
62+
},
63+
{
64+
name: "empty failure threshold count and pattern",
65+
value: ":",
66+
expectSetError: true,
67+
},
68+
{
69+
name: "non integer value in failure threshold",
70+
value: "notAnInteger:pattern1",
71+
expectSetError: true,
72+
},
73+
{
74+
// Only the first ':' separates count from pattern, so the pattern keeps its own ':'.
name: "valid log pattern with ':'",
75+
value: "10:pattern1a:pattern1b,20:pattern2",
76+
// NOTE(review): same ordering assumption on String() as the multiple-values case above.
expectedStringVal: "pattern1a:pattern1b:10 pattern2:20",
77+
expectedLogPatternCountMap: map[string]int{"pattern1a:pattern1b": 10, "pattern2": 20},
78+
expectSetError: false,
79+
},
80+
}
81+
82+
for _, test := range testCases {
83+
t.Run(test.name, func(t *testing.T) {
84+
// A fresh flag per case so entries from earlier cases do not carry over.
flag := LogPatternFlag{}
85+
err := flag.Set(test.value)
86+
if test.expectSetError {
87+
assert.Error(t, err)
88+
} else {
89+
assert.NoError(t, err)
90+
actualStringVal := flag.String()
91+
actualLogPatternCountMap := flag.GetLogPatternCountMap()
92+
assert.Equal(t, test.expectedStringVal, actualStringVal)
93+
// NOTE(review): this DeepEqual check and the assert.Equal after it compare the same two maps.
if !reflect.DeepEqual(test.expectedLogPatternCountMap, actualLogPatternCountMap) {
94+
t.Fatalf("logPatternCountMap mismatch, expected: %v, actual: %v", test.expectedLogPatternCountMap, actualLogPatternCountMap)
95+
}
96+
assert.Equal(t, test.expectedLogPatternCountMap, actualLogPatternCountMap)
97+
}
98+
})
99+
}
100+
}

0 commit comments

Comments
 (0)