From c4e5400ed6d7ca30d3a803248ae5b55c53557e59 Mon Sep 17 00:00:00 2001 From: michelletandya Date: Wed, 14 Apr 2021 20:27:37 +0000 Subject: [PATCH] separate linux/windows health checker files. --- cmd/healthchecker/health_checker.go | 2 +- cmd/healthchecker/options/options.go | 25 +-- cmd/healthchecker/options/options_test.go | 6 +- .../node_problem_detector_test.go | 2 + .../node_problem_detector_windows_test.go | 2 + config/windows-health-checker-containerd.json | 34 +++++ config/windows-health-checker-docker.json | 34 +++++ config/windows-health-checker-kubelet.json | 34 +++++ pkg/healthchecker/health_checker.go | 125 +-------------- pkg/healthchecker/health_checker_linux.go | 143 ++++++++++++++++++ pkg/healthchecker/health_checker_windows.go | 138 +++++++++++++++++ pkg/healthchecker/types/types.go | 4 - pkg/healthchecker/types/types_linux.go | 23 +++ pkg/healthchecker/types/types_windows.go | 24 +++ pkg/util/exec_windows.go | 11 +- 15 files changed, 467 insertions(+), 140 deletions(-) create mode 100644 config/windows-health-checker-containerd.json create mode 100644 config/windows-health-checker-docker.json create mode 100644 config/windows-health-checker-kubelet.json create mode 100644 pkg/healthchecker/health_checker_linux.go create mode 100644 pkg/healthchecker/health_checker_windows.go create mode 100644 pkg/healthchecker/types/types_linux.go create mode 100644 pkg/healthchecker/types/types_windows.go diff --git a/cmd/healthchecker/health_checker.go b/cmd/healthchecker/health_checker.go index 3cce076ca..2e815036d 100644 --- a/cmd/healthchecker/health_checker.go +++ b/cmd/healthchecker/health_checker.go @@ -55,7 +55,7 @@ func main() { os.Exit(int(types.Unknown)) } if !healthy { - fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair) + fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.Service, hco.EnableRepair) os.Exit(int(types.NonOK)) } os.Exit(int(types.OK)) diff --git 
a/cmd/healthchecker/options/options.go b/cmd/healthchecker/options/options.go index abff53a2d..f555ca6f3 100644 --- a/cmd/healthchecker/options/options.go +++ b/cmd/healthchecker/options/options.go @@ -19,6 +19,7 @@ package options import ( "flag" "fmt" + "runtime" "time" "github.com/spf13/pflag" @@ -34,7 +35,7 @@ func NewHealthCheckerOptions() *HealthCheckerOptions { // HealthCheckerOptions are the options used to configure the health checker. type HealthCheckerOptions struct { Component string - SystemdService string + Service string EnableRepair bool CriCtlPath string CriSocketPath string @@ -47,8 +48,14 @@ type HealthCheckerOptions struct { func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) { fs.StringVar(&hco.Component, "component", types.KubeletComponent, "The component to check health for. Supports kubelet, docker and cri") - fs.StringVar(&hco.SystemdService, "systemd-service", "", - "The underlying systemd service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.") + // Deprecated: "systemd-service" is kept on linux for backward compatibility; use "service" instead. It is registered before MarkDeprecated, which fails for unknown flags. + if runtime.GOOS == "linux" { + fs.StringVar(&hco.Service, "systemd-service", "", + "The underlying service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.") + fs.MarkDeprecated("systemd-service", "please use --service flag instead") + } + fs.StringVar(&hco.Service, "service", "", + "The underlying service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.") fs.BoolVar(&hco.EnableRepair, "enable-repair", true, "Flag to enable/disable repair attempt for the component.") fs.StringVar(&hco.CriCtlPath, "crictl-path", types.DefaultCriCtl, "The path to the crictl binary. 
This is used to check health of cri component.") @@ -69,9 +76,9 @@ func (hco *HealthCheckerOptions) IsValid() error { if hco.Component != types.KubeletComponent && hco.Component != types.DockerComponent && hco.Component != types.CRIComponent { return fmt.Errorf("the component specified is not supported. Supported components are : ") } - // Make sure the systemd service is specified if repair is enabled. - if hco.EnableRepair && hco.SystemdService == "" { - return fmt.Errorf("systemd-service cannot be empty when repair is enabled") + // Make sure the service is specified if repair is enabled. + if hco.EnableRepair && hco.Service == "" { + return fmt.Errorf("service cannot be empty when repair is enabled") } // Skip checking further if the component is not cri. if hco.Component != types.CRIComponent { @@ -90,14 +97,14 @@ func (hco *HealthCheckerOptions) IsValid() error { // SetDefaults sets the defaults values for the dependent flags. func (hco *HealthCheckerOptions) SetDefaults() { - if hco.SystemdService != "" { + if hco.Service != "" { return } if hco.Component != types.CRIComponent { - hco.SystemdService = hco.Component + hco.Service = hco.Component return } - hco.SystemdService = types.ContainerdService + hco.Service = types.ContainerdService } func init() { diff --git a/cmd/healthchecker/options/options_test.go b/cmd/healthchecker/options/options_test.go index 737f2ae96..94bf75083 100644 --- a/cmd/healthchecker/options/options_test.go +++ b/cmd/healthchecker/options/options_test.go @@ -56,9 +56,9 @@ func TestIsValid(t *testing.T) { { name: "empty systemd-service and repair enabled", hco: HealthCheckerOptions{ - Component: types.KubeletComponent, - EnableRepair: true, - SystemdService: "", + Component: types.KubeletComponent, + EnableRepair: true, + Service: "", }, expectError: true, }, diff --git a/cmd/nodeproblemdetector/node_problem_detector_test.go b/cmd/nodeproblemdetector/node_problem_detector_test.go index 8d6876497..5ad551a46 100644 --- 
a/cmd/nodeproblemdetector/node_problem_detector_test.go +++ b/cmd/nodeproblemdetector/node_problem_detector_test.go @@ -1,3 +1,5 @@ +// +build !disable_system_log_monitor + /* Copyright 2021 The Kubernetes Authors All rights reserved. diff --git a/cmd/nodeproblemdetector/node_problem_detector_windows_test.go b/cmd/nodeproblemdetector/node_problem_detector_windows_test.go index 79d0cb77a..dd814da29 100644 --- a/cmd/nodeproblemdetector/node_problem_detector_windows_test.go +++ b/cmd/nodeproblemdetector/node_problem_detector_windows_test.go @@ -1,3 +1,5 @@ +// +build !disable_system_log_monitor + /* Copyright 2021 The Kubernetes Authors All rights reserved. diff --git a/config/windows-health-checker-containerd.json b/config/windows-health-checker-containerd.json new file mode 100644 index 000000000..329e1a66e --- /dev/null +++ b/config/windows-health-checker-containerd.json @@ -0,0 +1,34 @@ +{ + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "10s", + "timeout": "3m", + "max_output_length": 80, + "concurrency": 1 + }, + "source": "health-checker", + "metricsReporting": true, + "conditions": [ + { + "type": "ContainerRuntimeUnhealthy", + "reason": "ContainerRuntimeIsHealthy", + "message": "Container runtime on the node is functioning properly" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "ContainerRuntimeUnhealthy", + "reason": "ContainerdUnhealthy", + "path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe", + "args": [ + "--component=cri", + "--enable-repair=true", + "--cooldown-time=2m", + "--health-check-timeout=60s" + ], + "timeout": "3m" + } + ] + } + \ No newline at end of file diff --git a/config/windows-health-checker-docker.json b/config/windows-health-checker-docker.json new file mode 100644 index 000000000..a41054a81 --- /dev/null +++ b/config/windows-health-checker-docker.json @@ -0,0 +1,34 @@ +{ + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "10s", + "timeout": "3m", + "max_output_length": 80, + 
"concurrency": 1 + }, + "source": "health-checker", + "metricsReporting": true, + "conditions": [ + { + "type": "ContainerRuntimeUnhealthy", + "reason": "ContainerRuntimeIsHealthy", + "message": "Container runtime on the node is functioning properly" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "ContainerRuntimeUnhealthy", + "reason": "DockerUnhealthy", + "path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe", + "args": [ + "--component=docker", + "--enable-repair=true", + "--cooldown-time=2m", + "--health-check-timeout=60s" + ], + "timeout": "3m" + } + ] + } + \ No newline at end of file diff --git a/config/windows-health-checker-kubelet.json b/config/windows-health-checker-kubelet.json new file mode 100644 index 000000000..37ee5ba70 --- /dev/null +++ b/config/windows-health-checker-kubelet.json @@ -0,0 +1,34 @@ +{ + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "10s", + "timeout": "3m", + "max_output_length": 80, + "concurrency": 1 + }, + "source": "health-checker", + "metricsReporting": true, + "conditions": [ + { + "type": "KubeletUnhealthy", + "reason": "KubeletIsHealthy", + "message": "kubelet on the node is functioning properly" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "KubeletUnhealthy", + "reason": "KubeletUnhealthy", + "path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe", + "args": [ + "--component=kubelet", + "--enable-repair=true", + "--cooldown-time=1m", + "--health-check-timeout=10s" + ], + "timeout": "3m" + } + ] + } + \ No newline at end of file diff --git a/pkg/healthchecker/health_checker.go b/pkg/healthchecker/health_checker.go index 29909f4e7..77a96d29a 100644 --- a/pkg/healthchecker/health_checker.go +++ b/pkg/healthchecker/health_checker.go @@ -17,23 +17,16 @@ limitations under the License. 
package healthchecker import ( - "context" - "errors" - "net/http" - "os/exec" - "strconv" - "strings" "time" "github.com/golang/glog" - "k8s.io/node-problem-detector/cmd/healthchecker/options" "k8s.io/node-problem-detector/pkg/healthchecker/types" ) type healthChecker struct { component string - systemdService string + service string enableRepair bool healthCheckFunc func() (bool, error) // The repair is "best-effort" and ignores the error from the underlying actions. @@ -54,88 +47,15 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e crictlPath: hco.CriCtlPath, healthCheckTimeout: hco.HealthCheckTimeout, coolDownTime: hco.CoolDownTime, - systemdService: hco.SystemdService, + service: hco.Service, logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(), } hc.healthCheckFunc = getHealthCheckFunc(hco) hc.repairFunc = getRepairFunc(hco) - hc.uptimeFunc = getUptimeFunc(hco.SystemdService) + hc.uptimeFunc = getUptimeFunc(hco.Service) return hc, nil } -// getUptimeFunc returns the time for which the given service has been running. -func getUptimeFunc(service string) func() (time.Duration, error) { - return func() (time.Duration, error) { - // Using InactiveExitTimestamp to capture the exact time when systemd tried starting the service. The service will - // transition from inactive -> activating and the timestamp is captured. - // Source : https://www.freedesktop.org/wiki/Software/systemd/dbus/ - // Using ActiveEnterTimestamp resulted in race condition where the service was repeatedly killed by plugin when - // RestartSec of systemd and invoke interval of plugin got in sync. The service was repeatedly killed in - // activating state and hence ActiveEnterTimestamp was never updated. 
- out, err := execCommand(types.CmdTimeout, "systemctl", "show", service, "--property=InactiveExitTimestamp") - if err != nil { - return time.Duration(0), err - } - val := strings.Split(out, "=") - if len(val) < 2 { - return time.Duration(0), errors.New("could not parse the service uptime time correctly") - } - t, err := time.Parse(types.UptimeTimeLayout, val[1]) - if err != nil { - return time.Duration(0), err - } - return time.Since(t), nil - } -} - -// getRepairFunc returns the repair function based on the component. -func getRepairFunc(hco *options.HealthCheckerOptions) func() { - switch hco.Component { - case types.DockerComponent: - // Use "docker ps" for docker health check. Not using crictl for docker to remove - // dependency on the kubelet. - return func() { - execCommand(types.CmdTimeout, "pkill", "-SIGUSR1", "dockerd") - execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService) - } - default: - // Just kill the service for all other components - return func() { - execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService) - } - } -} - -// getHealthCheckFunc returns the health check function based on the component. 
-func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) { - switch hco.Component { - case types.KubeletComponent: - return func() (bool, error) { - httpClient := http.Client{Timeout: hco.HealthCheckTimeout} - response, err := httpClient.Get(types.KubeletHealthCheckEndpoint) - if err != nil || response.StatusCode != http.StatusOK { - return false, nil - } - return true, nil - } - case types.DockerComponent: - return func() (bool, error) { - if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil { - return false, nil - } - return true, nil - } - case types.CRIComponent: - return func() (bool, error) { - if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil { - return false, nil - } - return true, nil - } - } - return nil -} - // CheckHealth checks for the health of the component and tries to repair if enabled. // Returns true if healthy, false otherwise. func (hc *healthChecker) CheckHealth() (bool, error) { @@ -143,7 +63,7 @@ func (hc *healthChecker) CheckHealth() (bool, error) { if err != nil { return healthy, err } - logPatternHealthy, err := logPatternHealthCheck(hc.systemdService, hc.logPatternsToCheck) + logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.logPatternsToCheck) if err != nil { return logPatternHealthy, err } @@ -167,19 +87,6 @@ func (hc *healthChecker) CheckHealth() (bool, error) { return false, nil } -// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs. -func execCommand(timeout time.Duration, command string, args ...string) (string, error) { - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - cmd := exec.CommandContext(ctx, command, args...) 
- out, err := cmd.Output() - if err != nil { - glog.Infof("command %v failed: %v, %v\n", cmd, err, out) - return "", err - } - return strings.TrimSuffix(string(out), "\n"), nil -} - // logPatternHealthCheck checks for the provided logPattern occurrences in the service logs. // Returns true if the pattern is empty or does not exist logThresholdCount times since start of service, false otherwise. func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) { @@ -203,27 +110,3 @@ func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (b } return true, nil } - -// checkForPattern returns (true, nil) if logPattern occurs atleast logCountThreshold number of times since last -// service restart. (false, nil) otherwise. -func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) { - out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c", - // Query service logs since the logStartTime - `journalctl --unit "`+service+`" --since "`+logStartTime+ - // Grep the pattern - `" | grep -i "`+logPattern+ - // Get the count of occurrences - `" | wc -l`) - if err != nil { - return true, err - } - occurrences, err := strconv.Atoi(out) - if err != nil { - return true, err - } - if occurrences >= logCountThreshold { - glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences) - return false, nil - } - return true, nil -} diff --git a/pkg/healthchecker/health_checker_linux.go b/pkg/healthchecker/health_checker_linux.go new file mode 100644 index 000000000..df0f7261e --- /dev/null +++ b/pkg/healthchecker/health_checker_linux.go @@ -0,0 +1,143 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package healthchecker + +import ( + "context" + "errors" + "net/http" + "os/exec" + "strconv" + "strings" + "time" + + "github.com/golang/glog" + + "k8s.io/node-problem-detector/cmd/healthchecker/options" + "k8s.io/node-problem-detector/pkg/healthchecker/types" +) + +// getUptimeFunc returns the time for which the given service has been running. +func getUptimeFunc(service string) func() (time.Duration, error) { + return func() (time.Duration, error) { + // Using InactiveExitTimestamp to capture the exact time when systemd tried starting the service. The service will + // transition from inactive -> activating and the timestamp is captured. + // Source : https://www.freedesktop.org/wiki/Software/systemd/dbus/ + // Using ActiveEnterTimestamp resulted in race condition where the service was repeatedly killed by plugin when + // RestartSec of systemd and invoke interval of plugin got in sync. The service was repeatedly killed in + // activating state and hence ActiveEnterTimestamp was never updated. + out, err := execCommand(types.CmdTimeout, "systemctl", "show", service, "--property=InactiveExitTimestamp") + + if err != nil { + return time.Duration(0), err + } + val := strings.Split(out, "=") + if len(val) < 2 { + return time.Duration(0), errors.New("could not parse the service uptime time correctly") + } + t, err := time.Parse(types.UptimeTimeLayout, val[1]) + if err != nil { + return time.Duration(0), err + } + return time.Since(t), nil + } +} + +// getRepairFunc returns the repair function based on the component. 
+func getRepairFunc(hco *options.HealthCheckerOptions) func() { + switch hco.Component { + case types.DockerComponent: + // Repair docker: send SIGUSR1 to dockerd via pkill, then kill the main process of the + // service with "systemctl kill". Errors from both commands are ignored (best-effort). + return func() { + execCommand(types.CmdTimeout, "pkill", "-SIGUSR1", "dockerd") + execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.Service) + } + default: + // Just kill the service for all other components + return func() { + execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.Service) + } + } +} + +// getHealthCheckFunc returns the health check function based on the component. +func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) { + switch hco.Component { + case types.KubeletComponent: + return func() (bool, error) { + httpClient := http.Client{Timeout: hco.HealthCheckTimeout} + response, err := httpClient.Get(types.KubeletHealthCheckEndpoint) + if err != nil || response.StatusCode != http.StatusOK { + return false, nil + } + return true, nil + } + case types.DockerComponent: + return func() (bool, error) { + if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil { + return false, nil + } + return true, nil + } + case types.CRIComponent: + return func() (bool, error) { + if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil { + return false, nil + } + return true, nil + } + } + return nil +} + +// execCommand runs the given command directly (no shell) under the given timeout and returns the (output, error) from command, error if timeout occurs. +func execCommand(timeout time.Duration, command string, args ...string) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + cmd := exec.CommandContext(ctx, command, args...) 
+ out, err := cmd.Output() + if err != nil { + glog.Infof("command %v failed: %v, %v\n", cmd, err, out) + return "", err + } + return strings.TrimSuffix(string(out), "\n"), nil +} + +// checkForPattern returns (true, nil) if logPattern occurs less than logCountThreshold number of times since last +// service restart. (false, nil) otherwise. +func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) { + out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c", + // Query service logs since the logStartTime + `journalctl --unit "`+service+`" --since "`+logStartTime+ + // Grep the pattern + `" | grep -i "`+logPattern+ + // Get the count of occurrences + `" | wc -l`) + if err != nil { + return true, err + } + occurrences, err := strconv.Atoi(out) + if err != nil { + return true, err + } + if occurrences >= logCountThreshold { + glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences) + return false, nil + } + return true, nil +} diff --git a/pkg/healthchecker/health_checker_windows.go b/pkg/healthchecker/health_checker_windows.go new file mode 100644 index 000000000..61916ecd6 --- /dev/null +++ b/pkg/healthchecker/health_checker_windows.go @@ -0,0 +1,138 @@ +/* +Copyright 2021 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package healthchecker + +import ( + "fmt" + "net/http" + "os/exec" + "strconv" + "strings" + "time" + + "github.com/golang/glog" + + "k8s.io/node-problem-detector/cmd/healthchecker/options" + "k8s.io/node-problem-detector/pkg/healthchecker/types" + "k8s.io/node-problem-detector/pkg/util" +) + +// getUptimeFunc returns the time for which the given service has been running. +func getUptimeFunc(service string) func() (time.Duration, error) { + return func() (time.Duration, error) { + // Using the WinEvent Log Objects to find the Service logs' time when the Service last entered running state. + // The powershell command formats the TimeCreated of the event log in RFC1123Pattern. + // However, because the time library parser does not recognize the ',' in this RFC1123Pattern format, + // it is manually removed before parsing it using the UptimeTimeLayout. + getTimeCreatedCmd := "(Get-WinEvent -Logname System | Where-Object {$_.Message -Match '.*(" + service + + ").*(running).*'} | Select-Object -Property TimeCreated -First 1 | foreach {$_.TimeCreated.ToString('R')} | Out-String).Trim()" + out, err := powershell(getTimeCreatedCmd) + if err != nil { + return time.Duration(0), err + } + if out == "" { + return time.Duration(0), fmt.Errorf("service time creation not found for %s", service) + } + out = strings.ReplaceAll(out, ",", "") + t, err := time.Parse(types.UptimeTimeLayout, out) + if err != nil { + return time.Duration(0), err + } + return time.Since(t), nil + } +} + +// getRepairFunc returns the repair function based on the component. +func getRepairFunc(hco *options.HealthCheckerOptions) func() { + // Restart-Service will stop and attempt to start the service + return func() { + powershell("Restart-Service", hco.Service) + } +} + +// getHealthCheckFunc returns the health check function based on the component. 
+func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) { + switch hco.Component { + case types.KubeletComponent: + return func() (bool, error) { + httpClient := http.Client{Timeout: hco.HealthCheckTimeout} + response, err := httpClient.Get(types.KubeletHealthCheckEndpoint) + if err != nil || response.StatusCode != http.StatusOK { + return false, nil + } + return true, nil + } + case types.DockerComponent: + return func() (bool, error) { + if _, err := execCommand("docker.exe", "ps"); err != nil { + return false, nil + } + return true, nil + } + case types.CRIComponent: + return func() (bool, error) { + if _, err := execCommand(hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil { + return false, nil + } + return true, nil + } + } + return nil +} + +// execCommand creates a new process, executes the command, and returns the (output, error) from command. +func execCommand(command string, args ...string) (string, error) { + cmd := util.Exec(command, args...) + return extractCommandOutput(cmd) +} + +// powershell executes the arguments in powershell process and returns (output, error) from command. +func powershell(args ...string) (string, error) { + cmd := util.Powershell(args...) + return extractCommandOutput(cmd) +} + +// Given an executable command, run and return the standard output, or error if command failed. +func extractCommandOutput(cmd *exec.Cmd) (string, error) { + out, err := cmd.Output() + if err != nil { + glog.Infof("command %v failed: %v, %v\n", cmd, err, out) + return "", err + } + return strings.TrimSuffix(string(out), "\r\n"), nil +} + +// checkForPattern returns (true, nil) if logPattern occurs less than logCountThreshold number of times since last +// service restart. (false, nil) otherwise. 
+func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) { + countPatternLogCmd := "@(Get-WinEvent -Logname System | Where-Object {($_.TimeCreated -ge ([datetime]::ParseExact('" + logStartTime + + "','" + types.LogParsingTimeFormat + "', $null))) -and ($_.Message -Match '" + logPattern + "')}).count" + + out, err := powershell(countPatternLogCmd) + if err != nil { + return true, err + } + occurrences, err := strconv.Atoi(out) + if err != nil { + return true, err + } + if occurrences >= logCountThreshold { + glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences) + return false, nil + } + return true, nil +} diff --git a/pkg/healthchecker/types/types.go b/pkg/healthchecker/types/types.go index fac9913c8..65dcaebb5 100644 --- a/pkg/healthchecker/types/types.go +++ b/pkg/healthchecker/types/types.go @@ -28,12 +28,8 @@ const ( DefaultCoolDownTime = 2 * time.Minute DefaultHealthCheckTimeout = 10 * time.Second CmdTimeout = 10 * time.Second - UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC" LogParsingTimeLayout = "2006-01-02 15:04:05" - DefaultCriCtl = "/usr/bin/crictl" - DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock" - KubeletComponent = "kubelet" CRIComponent = "cri" DockerComponent = "docker" diff --git a/pkg/healthchecker/types/types_linux.go b/pkg/healthchecker/types/types_linux.go new file mode 100644 index 000000000..4a47edd37 --- /dev/null +++ b/pkg/healthchecker/types/types_linux.go @@ -0,0 +1,23 @@ +/* +Copyright 2021 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +const ( + DefaultCriCtl = "/usr/bin/crictl" + DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock" + UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC" +) diff --git a/pkg/healthchecker/types/types_windows.go b/pkg/healthchecker/types/types_windows.go new file mode 100644 index 000000000..33276f4f7 --- /dev/null +++ b/pkg/healthchecker/types/types_windows.go @@ -0,0 +1,24 @@ +/* +Copyright 2021 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package types + +const ( + DefaultCriCtl = "C:/node/crictl.exe" + DefaultCriSocketPath = "npipe:////./pipe/containerd-containerd" + UptimeTimeLayout = "Mon 02 Jan 2006 15:04:05 MST" + LogParsingTimeFormat = "yyyy-MM-dd HH:mm:ss" +) diff --git a/pkg/util/exec_windows.go b/pkg/util/exec_windows.go index 2079610c8..3e70d6f23 100644 --- a/pkg/util/exec_windows.go +++ b/pkg/util/exec_windows.go @@ -42,8 +42,8 @@ func Exec(name string, arg ...string) *exec.Cmd { name = "cmd.exe" // Powershell Scripts case ".ps1": - cmdArgs = append([]string{"-NoLogo", "-NoProfile", "-NonInteractive", "-ExecutionPolicy", "RemoteSigned", name}, cmdArgs...) - name = "powershell.exe" + cmdArgs = append([]string{name}, cmdArgs...) + return Powershell(cmdArgs...) default: // Run directly. } @@ -51,6 +51,13 @@ func Exec(name string, arg ...string) *exec.Cmd { return exec.Command(name, cmdArgs...) } +// Powershell creates a new powershell process with the specified arguments +func Powershell(args ...string) *exec.Cmd { + defaultFlags := []string{"-NoLogo", "-NoProfile", "-NonInteractive", "-ExecutionPolicy", "RemoteSigned"} + args = append(defaultFlags, args...) + return exec.Command("powershell.exe", args...) +} + // ExitStatus returns the exit code of the application. func ExitStatus(cmd *exec.Cmd) int { return cmd.ProcessState.Sys().(syscall.WaitStatus).ExitStatus()