Skip to content

Commit a136766

Browse files
committed
removed DockerHung, and some other clean up.
1 parent 0af3a94 commit a136766

File tree

8 files changed

+31
-37
lines changed

8 files changed

+31
-37
lines changed

Diff for: README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -260,9 +260,9 @@ For example, to test [KernelMonitor](https://github.com/kubernetes/node-problem-
260260
3. ```./bin/node-problem-detector --logtostderr --apiserver-override=http://127.0.0.1:8080?inClusterConfig=false --config.system-log-monitor=config/kernel-monitor.json --config.system-stats-monitor=config/system-stats-monitor.json --port=20256 --prometheus-port=20257``` (or point to any API server address:port and Prometheus port)
261261
4. ```sudo sh -c "echo 'kernel: BUG: unable to handle kernel NULL pointer dereference at TESTING' >> /dev/kmsg"```
262262
5. You can see a ```KernelOops``` event in the node-problem-detector log.
263-
6. ```sudo sh -c "echo 'kernel: INFO: task docker:20744 blocked for more than 120 seconds.' >> /dev/kmsg"```
264-
7. You can see ```DockerHung``` event and condition in the node-problem-detector log.
265-
8. You can see ```DockerHung``` condition at [http://127.0.0.1:20256/conditions](http://127.0.0.1:20256/conditions).
263+
6. ```sudo sh -c "echo 'kernel: INFO: task foo:20744 blocked for more than 120 seconds.' >> /dev/kmsg"```
264+
7. You can see a ```TaskHung``` event and condition in the node-problem-detector log.
265+
8. You can see the ```TaskHung``` condition at [http://127.0.0.1:20256/conditions](http://127.0.0.1:20256/conditions).
266266
9. You can see disk-related system metrics in Prometheus format at [http://127.0.0.1:20257/metrics](http://127.0.0.1:20257/metrics).
267267

268268
**Note**:

Diff for: pkg/systemlogmonitor/README.md

+2-3
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,8 @@ example:
108108
```
109109
# HELP problem_counter Number of times a specific type of problem have occurred.
110110
# TYPE problem_counter counter
111-
problem_counter{reason="DockerHung"} 1
111+
problem_counter{reason="TaskHung"} 1
112112
# HELP problem_gauge Whether a specific type of problem is affecting the node or not.
113113
# TYPE problem_gauge gauge
114-
problem_gauge{condition="KernelDeadlock",reason="DockerHung"} 1
114+
problem_gauge{condition="KernelDeadlock",reason="TaskHung"} 1
115115
```
116-

Diff for: pkg/systemlogmonitor/types/types.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import (
2323
)
2424

2525
// Log is the log item returned by translator. It's very easy to extend this
26-
// to support other log monitoring, such as docker log monitoring.
26+
// to support other log monitoring.
2727
type Log struct {
2828
Timestamp time.Time
2929
Message string

Diff for: pkg/util/metrics/helpers_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ func TestPrometheusMetricsParsingAndMatching(t *testing.T) {
8181
},
8282
{
8383
Name: "problem_counter",
84-
Labels: map[string]string{"reason": "DockerHung"},
84+
Labels: map[string]string{"reason": "TaskHung"},
8585
},
8686
{
8787
Name: "problem_counter",

Diff for: pkg/util/metrics/testdata/sample_metrics.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ host_uptime{kernel_version="4.14.127+",os_version="cos 73-11647.217.0"} 81
2121
problem_counter{reason="ContainerdStart"} 1
2222
problem_counter{reason="CorruptDockerImage"} 0
2323
problem_counter{reason="CorruptDockerOverlay2"} 0
24-
problem_counter{reason="DockerHung"} 0
24+
problem_counter{reason="TaskHung"} 0
2525
problem_counter{reason="DockerStart"} 1
2626
problem_counter{reason="FilesystemIsReadOnly"} 0
2727
problem_counter{reason="FrequentContainerdRestart"} 0
@@ -35,7 +35,7 @@ problem_counter{reason="UnregisterNetDevice"} 0
3535
# HELP problem_gauge Whether a specific type of problem is affecting the node or not.
3636
# TYPE problem_gauge gauge
3737
problem_gauge{reason="CorruptDockerOverlay2",type="CorruptDockerOverlay2"} 0
38-
problem_gauge{reason="DockerHung",type="KernelDeadlock"} 0
38+
problem_gauge{reason="TaskHung",type="KernelDeadlock"} 0
3939
problem_gauge{reason="FilesystemIsReadOnly",type="ReadonlyFilesystem"} 0
4040
problem_gauge{reason="FrequentContainerdRestart",type="FrequentContainerdRestart"} 0
4141
problem_gauge{reason="FrequentDockerRestart",type="FrequentDockerRestart"} 0

Diff for: test/build.sh

+12-16
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ set -o errexit
2121
set -o nounset
2222
set -o pipefail
2323

24-
2524
NPD_STAGING_PATH=${NPD_STAGING_PATH:-"gs://k8s-staging-npd"}
2625
NPD_STAGING_REGISTRY=${NPD_STAGING_REGISTRY:-"gcr.io/node-problem-detector-staging"}
2726
PR_ENV_FILENAME=${PR_ENV_FILENAME:-"pr.env"}
@@ -30,7 +29,6 @@ CI_CUSTOM_FLAGS_ENV_FILENAME=${CI_CUSTOM_FLAGS_ENV_FILENAME:-"ci-custom-flags.en
3029
ROOT_PATH=$(git rev-parse --show-toplevel)
3130
GCS_URL_PREFIX="https://storage.googleapis.com/"
3231

33-
3432
function print-help() {
3533
echo "Usage: build.sh [flags] [command]"
3634
echo
@@ -57,7 +55,7 @@ function print-help() {
5755

5856
function get-version() {
5957
if [ -d .git ]; then
60-
echo `git describe --tags --dirty`
58+
echo $(git describe --tags --dirty)
6159
else
6260
echo "UNKNOWN"
6361
fi
@@ -75,7 +73,7 @@ function write-env-file() {
7573
exit 1
7674
fi
7775

78-
cat > ${ROOT_PATH}/${env_file} <<EOF
76+
cat >${ROOT_PATH}/${env_file} <<EOF
7977
export KUBE_ENABLE_NODE_PROBLEM_DETECTOR=standalone
8078
export NODE_PROBLEM_DETECTOR_RELEASE_PATH=${UPLOAD_PATH/gs:\/\//${GCS_URL_PREFIX}}
8179
export NODE_PROBLEM_DETECTOR_VERSION=${VERSION}
@@ -84,7 +82,7 @@ export EXTRA_ENVS=NODE_PROBLEM_DETECTOR_IMAGE=${REGISTRY}/node-problem-detector:
8482
EOF
8583

8684
if [[ -n "${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}" ]]; then
87-
cat >> ${ROOT_PATH}/${env_file} <<EOF
85+
cat >>${ROOT_PATH}/${env_file} <<EOF
8886
export NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS}"
8987
EOF
9088
fi
@@ -98,7 +96,6 @@ function build-npd-custom-flags() {
9896

9997
local -r km_config="${kube_home}/node-problem-detector/config/kernel-monitor.json"
10098
local -r rm_config="${kube_home}/node-problem-detector/config/readonly-monitor.json"
101-
local -r dm_config="${kube_home}/node-problem-detector/config/docker-monitor.json"
10299
local -r sm_config="${kube_home}/node-problem-detector/config/systemd-monitor.json"
103100

104101
local -r custom_km_config="${kube_home}/node-problem-detector/config/kernel-monitor-counter.json"
@@ -179,25 +176,24 @@ main() {
179176
fi
180177

181178
case ${1:-} in
182-
help) print-help;;
183-
pr) build-pr;;
184-
ci) build-ci;;
185-
get-ci-env) get-ci-env;;
186-
install-lib) install-lib;;
187-
*) print-help;;
179+
help) print-help ;;
180+
pr) build-pr ;;
181+
ci) build-ci ;;
182+
get-ci-env) get-ci-env ;;
183+
install-lib) install-lib ;;
184+
*) print-help ;;
188185
esac
189186
}
190187

191-
192188
USE_CUSTOM_FLAGS="false"
193189
PR_NUMBER=""
194190

195191
while getopts "fp:" opt; do
196192
case ${opt} in
197-
f) USE_CUSTOM_FLAGS="true";;
198-
p) PR_NUMBER="${OPTARG}";;
193+
f) USE_CUSTOM_FLAGS="true" ;;
194+
p) PR_NUMBER="${OPTARG}" ;;
199195
esac
200196
done
201-
shift "$((OPTIND-1))"
197+
shift "$((OPTIND - 1))"
202198

203199
main "$@"

Diff for: test/e2e/metriconly/metrics_test.go

+7-10
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
6767
})
6868

6969
ginkgo.Context("On a clean node", func() {
70-
7170
ginkgo.It("NPD should export cpu/disk/host/memory metric", func() {
7271
err := npd.WaitForNPD(instance, []string{"host_uptime"}, 120)
7372
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
@@ -103,10 +102,10 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
103102
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
104103

105104
assertMetricValueInBound(instance,
106-
"problem_gauge", map[string]string{"reason": "DockerHung", "type": "KernelDeadlock"},
105+
"problem_gauge", map[string]string{"reason": "TaskHung", "type": "KernelDeadlock"},
107106
0.0, 0.0)
108107
assertMetricValueInBound(instance,
109-
"problem_counter", map[string]string{"reason": "DockerHung"},
108+
"problem_counter", map[string]string{"reason": "TaskHung"},
110109
0.0, 0.0)
111110
assertMetricValueInBound(instance,
112111
"problem_counter", map[string]string{"reason": "FilesystemIsReadOnly"},
@@ -121,7 +120,6 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
121120
})
122121

123122
ginkgo.Context("When ext4 filesystem error happens", func() {
124-
125123
ginkgo.BeforeEach(func() {
126124
err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120)
127125
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
@@ -148,25 +146,24 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
148146
})
149147
})
150148

151-
ginkgo.Context("When OOM kills and docker hung happen", func() {
152-
149+
ginkgo.Context("When OOM kills and task hung happen", func() {
153150
ginkgo.BeforeEach(func() {
154151
err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120)
155152
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
156153
instance.RunCommandOrFail("sudo /home/kubernetes/bin/problem-maker --problem OOMKill")
157-
instance.RunCommandOrFail("sudo /home/kubernetes/bin/problem-maker --problem DockerHung")
154+
instance.RunCommandOrFail("sudo /home/kubernetes/bin/problem-maker --problem TaskHung")
158155
})
159156

160157
ginkgo.It("NPD should update problem_counter and problem_gauge", func() {
161158
time.Sleep(5 * time.Second)
162159
assertMetricValueInBound(instance,
163-
"problem_counter", map[string]string{"reason": "DockerHung"},
160+
"problem_counter", map[string]string{"reason": "TaskHung"},
164161
1.0, 1.0)
165162
assertMetricValueInBound(instance,
166163
"problem_counter", map[string]string{"reason": "TaskHung"},
167164
1.0, 1.0)
168165
assertMetricValueInBound(instance,
169-
"problem_gauge", map[string]string{"reason": "DockerHung", "type": "KernelDeadlock"},
166+
"problem_gauge", map[string]string{"reason": "TaskHung", "type": "KernelDeadlock"},
170167
1.0, 1.0)
171168
assertMetricValueInBound(instance,
172169
"problem_counter", map[string]string{"reason": "OOMKilling"},
@@ -186,7 +183,7 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
186183
testSubdirName := strings.Replace(testText, " ", "_", -1)
187184

188185
artifactSubDir = path.Join(*artifactsDir, testSubdirName)
189-
err := os.MkdirAll(artifactSubDir, os.ModeDir|0755)
186+
err := os.MkdirAll(artifactSubDir, os.ModeDir|0o755)
190187
if err != nil {
191188
fmt.Printf("Failed to create sub-directory to hold test artiface for test %s at %s\n",
192189
testText, artifactSubDir)

Diff for: test/e2e/problemmaker/README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Problem Maker
22

33
Problem maker is a program to generate/simulate various kinds of node problems. It is used in NPD e2e tests to verify NPD's behavior when node problems happen:
4+
45
1. NPD should report the problems correctly.
56
2. NPD should survive the problems as much as possible.
67

@@ -11,9 +12,10 @@ You shouldn't need to run it anyways. If you want to test NPD, it's best to run
1112
## Developing/Testing Problem Maker
1213

1314
If you want to extend the set of problems that problem maker can generate, you will want to run it to test its behavior. The recommended way to run it is in a VM:
15+
1416
```
1517
sudo problem-maker --help
16-
sudo problem-maker --problem DockerHung
18+
sudo problem-maker --problem TaskHung
1719
sudo problem-maker --problem Ext4FilesystemError
1820
```
1921

0 commit comments

Comments
 (0)