Skip to content

Separate linux/windows health checker files. Build health checker plugin for Windows #544

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/healthchecker/health_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func main() {
os.Exit(int(types.Unknown))
}
if !healthy {
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair)
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.Service, hco.EnableRepair)
os.Exit(int(types.NonOK))
}
os.Exit(int(types.OK))
Expand Down
25 changes: 16 additions & 9 deletions cmd/healthchecker/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package options
import (
"flag"
"fmt"
"runtime"
"time"

"github.com/spf13/pflag"
Expand All @@ -34,7 +35,7 @@ func NewHealthCheckerOptions() *HealthCheckerOptions {
// HealthCheckerOptions are the options used to configure the health checker.
type HealthCheckerOptions struct {
Component string
SystemdService string
Service string
EnableRepair bool
CriCtlPath string
CriSocketPath string
Expand All @@ -47,8 +48,14 @@ type HealthCheckerOptions struct {
func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&hco.Component, "component", types.KubeletComponent,
"The component to check health for. Supports kubelet, docker and cri")
fs.StringVar(&hco.SystemdService, "systemd-service", "",
"The underlying systemd service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
// Deprecated: For backward compatibility on linux environment. Going forward "service" will be used instead of systemd-service
if runtime.GOOS == "linux" {
fs.MarkDeprecated("systemd-service", "please use --service flag instead")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deprecated does not mean "no longer works".

If specified, systemd-service should still set its value on hco.Service.

Copy link
Contributor Author

@mcshooter mcshooter Apr 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, so we need both, like the following?

// Deprecated: For backward compatibility on linux environment. Going forward "service" will be used instead of systemd-service
if runtime.GOOS == "linux" {
fs.MarkDeprecated("systemd-service", "please use --service flag instead")
fs.StringVar(&hco.Service, "systemd-service", "",
	"The underlying service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
}

fs.StringVar(&hco.Service, "systemd-service", "",
"The underlying service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
}
fs.StringVar(&hco.Service, "service", "",
"The underlying service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should explain what the service means on linux and windows. And I don't think cri is a valid option here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean cri is not a valid option? I think the message says to use containerd for cri?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, never mind. I misread the flag description.

fs.BoolVar(&hco.EnableRepair, "enable-repair", true, "Flag to enable/disable repair attempt for the component.")
fs.StringVar(&hco.CriCtlPath, "crictl-path", types.DefaultCriCtl,
"The path to the crictl binary. This is used to check health of cri component.")
Expand All @@ -69,9 +76,9 @@ func (hco *HealthCheckerOptions) IsValid() error {
if hco.Component != types.KubeletComponent && hco.Component != types.DockerComponent && hco.Component != types.CRIComponent {
return fmt.Errorf("the component specified is not supported. Supported components are : <kubelet/docker/cri>")
}
// Make sure the systemd service is specified if repair is enabled.
if hco.EnableRepair && hco.SystemdService == "" {
return fmt.Errorf("systemd-service cannot be empty when repair is enabled")
// Make sure the service is specified if repair is enabled.
if hco.EnableRepair && hco.Service == "" {
return fmt.Errorf("service cannot be empty when repair is enabled")
}
// Skip checking further if the component is not cri.
if hco.Component != types.CRIComponent {
Expand All @@ -90,14 +97,14 @@ func (hco *HealthCheckerOptions) IsValid() error {

// SetDefaults sets the defaults values for the dependent flags.
func (hco *HealthCheckerOptions) SetDefaults() {
if hco.SystemdService != "" {
if hco.Service != "" {
return
}
if hco.Component != types.CRIComponent {
hco.SystemdService = hco.Component
hco.Service = hco.Component
return
}
hco.SystemdService = types.ContainerdService
hco.Service = types.ContainerdService
}

func init() {
Expand Down
6 changes: 3 additions & 3 deletions cmd/healthchecker/options/options_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ func TestIsValid(t *testing.T) {
{
name: "empty systemd-service and repair enabled",
hco: HealthCheckerOptions{
Component: types.KubeletComponent,
EnableRepair: true,
SystemdService: "",
Component: types.KubeletComponent,
EnableRepair: true,
Service: "",
},
expectError: true,
},
Expand Down
2 changes: 2 additions & 0 deletions cmd/nodeproblemdetector/node_problem_detector_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
// +build !disable_system_log_monitor

/*
Copyright 2021 The Kubernetes Authors All rights reserved.
Expand Down
2 changes: 2 additions & 0 deletions cmd/nodeproblemdetector/node_problem_detector_windows_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
// +build !disable_system_log_monitor

/*
Copyright 2021 The Kubernetes Authors All rights reserved.
Expand Down
34 changes: 34 additions & 0 deletions config/windows-health-checker-containerd.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "10s",
"timeout": "3m",
"max_output_length": 80,
"concurrency": 1
},
"source": "health-checker",
"metricsReporting": true,
"conditions": [
{
"type": "ContainerRuntimeUnhealthy",
"reason": "ContainerRuntimeIsHealthy",
"message": "Container runtime on the node is functioning properly"
}
],
"rules": [
{
"type": "permanent",
"condition": "ContainerRuntimeUnhealthy",
"reason": "ContainerdUnhealthy",
"path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe",
"args": [
"--component=cri",
"--enable-repair=true",
"--cooldown-time=2m",
"--health-check-timeout=60s"
],
"timeout": "3m"
}
]
}

34 changes: 34 additions & 0 deletions config/windows-health-checker-docker.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "10s",
"timeout": "3m",
"max_output_length": 80,
"concurrency": 1
},
"source": "health-checker",
"metricsReporting": true,
"conditions": [
{
"type": "ContainerRuntimeUnhealthy",
"reason": "ContainerRuntimeIsHealthy",
"message": "Container runtime on the node is functioning properly"
}
],
"rules": [
{
"type": "permanent",
"condition": "ContainerRuntimeUnhealthy",
"reason": "DockerUnhealthy",
"path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe",
"args": [
"--component=docker",
"--enable-repair=true",
"--cooldown-time=2m",
"--health-check-timeout=60s"
],
"timeout": "3m"
}
]
}

34 changes: 34 additions & 0 deletions config/windows-health-checker-kubelet.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "10s",
"timeout": "3m",
"max_output_length": 80,
"concurrency": 1
},
"source": "health-checker",
"metricsReporting": true,
"conditions": [
{
"type": "KubeletUnhealthy",
"reason": "KubeletIsHealthy",
"message": "kubelet on the node is functioning properly"
}
],
"rules": [
{
"type": "permanent",
"condition": "KubeletUnhealthy",
"reason": "KubeletUnhealthy",
"path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe",
"args": [
"--component=kubelet",
"--enable-repair=true",
"--cooldown-time=1m",
"--health-check-timeout=10s"
],
"timeout": "3m"
}
]
}

125 changes: 4 additions & 121 deletions pkg/healthchecker/health_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,16 @@ limitations under the License.
package healthchecker

import (
"context"
"errors"
"net/http"
"os/exec"
"strconv"
"strings"
"time"

"github.com/golang/glog"

"k8s.io/node-problem-detector/cmd/healthchecker/options"
"k8s.io/node-problem-detector/pkg/healthchecker/types"
)

type healthChecker struct {
component string
systemdService string
service string
enableRepair bool
healthCheckFunc func() (bool, error)
// The repair is "best-effort" and ignores the error from the underlying actions.
Expand All @@ -54,96 +47,23 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
crictlPath: hco.CriCtlPath,
healthCheckTimeout: hco.HealthCheckTimeout,
coolDownTime: hco.CoolDownTime,
systemdService: hco.SystemdService,
service: hco.Service,
logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
}
hc.healthCheckFunc = getHealthCheckFunc(hco)
hc.repairFunc = getRepairFunc(hco)
hc.uptimeFunc = getUptimeFunc(hco.SystemdService)
hc.uptimeFunc = getUptimeFunc(hco.Service)
return hc, nil
}

// getUptimeFunc builds a closure that reports how long the given service has
// been running, measured from the timestamp systemd reports for it.
//
// The closure returns a zero duration together with an error when the
// systemctl command fails, when its output has no "=" separated value, or when
// the value cannot be parsed with types.UptimeTimeLayout.
func getUptimeFunc(service string) func() (time.Duration, error) {
	uptime := func() (time.Duration, error) {
		// InactiveExitTimestamp is queried because it records the moment systemd
		// tried to start the service (the inactive -> activating transition).
		// Source : https://www.freedesktop.org/wiki/Software/systemd/dbus/
		// ActiveEnterTimestamp was not usable: when RestartSec of systemd and the
		// plugin's invoke interval got in sync, the plugin kept killing the
		// service while it was still activating, so that timestamp was never
		// updated.
		raw, err := execCommand(types.CmdTimeout, "systemctl", "show", service, "--property=InactiveExitTimestamp")
		if err != nil {
			return 0, err
		}
		// Output is expected in "<property>=<timestamp>" form.
		fields := strings.Split(raw, "=")
		if len(fields) < 2 {
			return 0, errors.New("could not parse the service uptime time correctly")
		}
		startedAt, err := time.Parse(types.UptimeTimeLayout, fields[1])
		if err != nil {
			return 0, err
		}
		return time.Since(startedAt), nil
	}
	return uptime
}

// getRepairFunc picks the repair action for the configured component.
//
// Every component is repaired by asking systemd to kill the main process of
// hco.SystemdService. For the docker component, SIGUSR1 is sent to dockerd
// first (NOTE(review): presumably to make dockerd emit diagnostics before it is
// killed — confirm). Results of the underlying commands are intentionally not
// checked.
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
	// Shared by every component: kill the main process of the service.
	killService := func() {
		execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService)
	}
	if hco.Component != types.DockerComponent {
		return killService
	}
	return func() {
		execCommand(types.CmdTimeout, "pkill", "-SIGUSR1", "dockerd")
		killService()
	}
}

// getHealthCheckFunc returns the health check function based on the component.
//
// Each returned closure reports (true, nil) when its probe succeeds and
// (false, nil) when it fails; probe failures are never surfaced as errors.
// nil is returned when hco.Component matches none of the cases below.
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
	switch hco.Component {
	case types.KubeletComponent:
		// Healthy only if the kubelet health endpoint answers 200 within the timeout.
		return func() (bool, error) {
			httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
			response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
			if err != nil {
				return false, nil
			}
			// The body must be closed on every successful Get, otherwise the
			// underlying connection is leaked.
			defer response.Body.Close()
			return response.StatusCode == http.StatusOK, nil
		}
	case types.DockerComponent:
		// Use "docker ps" for docker health check. Not using crictl for docker to
		// remove dependency on the kubelet.
		return func() (bool, error) {
			if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
				return false, nil
			}
			return true, nil
		}
	case types.CRIComponent:
		// Healthy if crictl can list pods through the configured CRI socket.
		return func() (bool, error) {
			if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
				return false, nil
			}
			return true, nil
		}
	}
	return nil
}

// CheckHealth checks for the health of the component and tries to repair if enabled.
// Returns true if healthy, false otherwise.
func (hc *healthChecker) CheckHealth() (bool, error) {
healthy, err := hc.healthCheckFunc()
if err != nil {
return healthy, err
}
logPatternHealthy, err := logPatternHealthCheck(hc.systemdService, hc.logPatternsToCheck)
logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.logPatternsToCheck)
if err != nil {
return logPatternHealthy, err
}
Expand All @@ -167,19 +87,6 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
return false, nil
}

// execCommand runs command with args directly (no shell is involved), bounded
// by timeout, and returns the command's standard output with one trailing
// newline removed.
//
// If the command fails or the timeout expires, the failure is logged and
// ("", err) is returned.
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	cmd := exec.CommandContext(ctx, command, args...)
	out, err := cmd.Output()
	if err != nil {
		// out is a []byte: format it with %s so the log shows the text the
		// command printed rather than a list of decimal byte values.
		glog.Infof("command %v failed: %v, %s\n", cmd, err, out)
		return "", err
	}
	return strings.TrimSuffix(string(out), "\n"), nil
}

// logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
// Returns true if the pattern is empty or does not exist logThresholdCount times since start of service, false otherwise.
func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
Expand All @@ -203,27 +110,3 @@ func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (b
}
return true, nil
}

// checkForPattern counts the lines in the service's journal, since
// logStartTime, that match logPattern case-insensitively.
//
// It returns (false, nil) when the count reaches logCountThreshold, meaning the
// log pattern check failed, and (true, nil) when the count stays below it. If
// the command fails or its output is not an integer, it returns (true, err).
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
	// Pipeline: query the unit's logs since logStartTime, grep the pattern,
	// then count the matching lines.
	script := `journalctl --unit "` + service + `" --since "` + logStartTime +
		`" | grep -i "` + logPattern +
		`" | wc -l`
	countText, err := execCommand(types.CmdTimeout, "/bin/sh", "-c", script)
	if err != nil {
		return true, err
	}
	occurrences, err := strconv.Atoi(countText)
	if err != nil {
		return true, err
	}
	if occurrences < logCountThreshold {
		return true, nil
	}
	glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
	return false, nil
}
Loading