1
1
package static
2
2
3
3
import (
4
+ "context"
4
5
"errors"
6
+ "fmt"
7
+ "net"
5
8
"net/http"
6
9
"sync"
10
+ "time"
11
+
12
+ "sigs.k8s.io/controller-runtime/pkg/manager"
13
+
14
+ "github.com/nginx/nginx-gateway-fabric/internal/mode/static/config"
7
15
)
8
16
9
17
// newGraphBuiltHealthChecker creates a new graphBuiltHealthChecker.
@@ -13,37 +21,94 @@ func newGraphBuiltHealthChecker() *graphBuiltHealthChecker {
13
21
}
14
22
}
15
23
16
// graphBuiltHealthChecker is used to check if the NGF Pod is ready. The NGF Pod is ready if the initial graph has
// been built and if it is leader.
type graphBuiltHealthChecker struct {
	// readyCh is a channel that is initialized in newGraphBuiltHealthChecker and represents if the NGF Pod is ready.
	// It is closed by setAsLeader and exposed read-only through getReadyCh.
	readyCh chan struct{}
	// lock guards graphBuilt and leader: readyCheck takes it for reading, setGraphBuilt and setAsLeader for writing.
	lock sync.RWMutex
	// graphBuilt is set to true by setGraphBuilt once the initial graph has been built.
	graphBuilt bool
	// leader is set to true by setAsLeader.
	leader bool
}
33
+
34
+ // createHealthProbe creates a Server runnable to serve as our health and readiness checker.
35
+ func createHealthProbe (cfg config.Config , healthChecker * graphBuiltHealthChecker ) (manager.Server , error ) {
36
+ // we chose to create our own health probe server instead of using the controller-runtime one because
37
+ // of repetitive log which would flood our logs on non-ready non-leader NGF Pods. This health probe is
38
+ // similar to the controller-runtime's health probe.
39
+
40
+ mux := http .NewServeMux ()
41
+
42
+ // copy of controller-runtime sane defaults for new http.Server
43
+ s := & http.Server {
44
+ Handler : mux ,
45
+ MaxHeaderBytes : 1 << 20 ,
46
+ IdleTimeout : 90 * time .Second , // matches http.DefaultTransport keep-alive timeout
47
+ ReadHeaderTimeout : 32 * time .Second ,
48
+ }
49
+
50
+ mux .HandleFunc (readinessEndpointName , healthChecker .readyHandler )
51
+
52
+ ln , err := net .Listen ("tcp" , fmt .Sprintf (":%d" , cfg .HealthConfig .Port ))
53
+ if err != nil {
54
+ return manager.Server {},
55
+ fmt .Errorf ("error listening on %s: %w" , fmt .Sprintf (":%d" , cfg .HealthConfig .Port ), err )
56
+ }
57
+
58
+ return manager.Server {
59
+ Name : "health probe" ,
60
+ Server : s ,
61
+ Listener : ln ,
62
+ }, nil
63
+ }
64
+
65
+ func (h * graphBuiltHealthChecker ) readyHandler (resp http.ResponseWriter , req * http.Request ) {
66
+ if err := h .readyCheck (req ); err != nil {
67
+ resp .WriteHeader (http .StatusServiceUnavailable )
68
+ } else {
69
+ resp .WriteHeader (http .StatusOK )
70
+ }
22
71
}
23
72
24
73
// readyCheck returns the ready-state of the Pod. It satisfies the controller-runtime Checker type.
25
- // We are considered ready after the first graph is built.
74
+ // We are considered ready after the first graph is built and if the NGF Pod is leader .
26
75
func (h * graphBuiltHealthChecker ) readyCheck (_ * http.Request ) error {
27
76
h .lock .RLock ()
28
77
defer h .lock .RUnlock ()
29
78
30
- if ! h .ready {
31
- return errors .New ("control plane is not yet ready" )
79
+ if ! h .leader {
80
+ return errors .New ("this Pod is not currently leader" )
81
+ }
82
+
83
+ if ! h .graphBuilt {
84
+ return errors .New ("control plane initial graph has not been built" )
32
85
}
33
86
34
87
return nil
35
88
}
36
89
37
- // setAsReady marks the health check as ready .
38
- func (h * graphBuiltHealthChecker ) setAsReady () {
90
+ // setGraphBuilt marks the health check as having the initial graph built .
91
+ func (h * graphBuiltHealthChecker ) setGraphBuilt () {
39
92
h .lock .Lock ()
40
93
defer h .lock .Unlock ()
41
94
42
- h .ready = true
43
- close (h .readyCh )
95
+ h .graphBuilt = true
44
96
}
45
97
46
98
// getReadyCh returns a read-only channel, which determines if the NGF Pod is ready.
47
99
func (h * graphBuiltHealthChecker ) getReadyCh () <- chan struct {} {
48
100
return h .readyCh
49
101
}
102
+
103
+ // setAsLeader marks the health check as leader.
104
+ func (h * graphBuiltHealthChecker ) setAsLeader (_ context.Context ) {
105
+ h .lock .Lock ()
106
+ defer h .lock .Unlock ()
107
+
108
+ h .leader = true
109
+
110
+ // setGraphBuilt should already have been called when processing the resources on startup because the leader
111
+ // election process takes longer than the initial call to HandleEventBatch. Thus, the NGF Pod should be marked as
112
+ // ready and have this channel be closed.
113
+ close (h .readyCh )
114
+ }
0 commit comments