1
1
package main
2
2
3
3
import (
4
- "context"
5
4
"flag"
6
5
"fmt"
7
6
"net"
8
- "os"
9
- "os/signal"
10
- "syscall"
11
7
"time"
12
8
13
9
extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
14
10
"google.golang.org/grpc"
15
- "google.golang.org/grpc/codes"
16
11
healthPb "google.golang.org/grpc/health/grpc_health_v1"
17
- "google.golang.org/grpc/status"
18
12
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1"
19
13
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend"
20
14
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm"
@@ -28,10 +22,14 @@ import (
28
22
)
29
23
30
24
var (
31
- port = flag .Int (
32
- "port " ,
25
+ grpcPort = flag .Int (
26
+ "grpcPort " ,
33
27
9002 ,
34
- "gRPC port" )
28
+ "The gRPC port used for communicating with Envoy proxy" )
29
+ grpcHealthPort = flag .Int (
30
+ "grpcHealthPort" ,
31
+ 9003 ,
32
+ "The port used for gRPC liveness and readiness probes" )
35
33
targetPodHeader = flag .String (
36
34
"targetPodHeader" ,
37
35
"target-pod" ,
@@ -64,55 +62,39 @@ var (
64
62
scheme = runtime .NewScheme ()
65
63
)
66
64
67
- type healthServer struct {}
68
-
69
- func (s * healthServer ) Check (
70
- ctx context.Context ,
71
- in * healthPb.HealthCheckRequest ,
72
- ) (* healthPb.HealthCheckResponse , error ) {
73
- klog .Infof ("Handling grpc Check request + %s" , in .String ())
74
- return & healthPb.HealthCheckResponse {Status : healthPb .HealthCheckResponse_SERVING }, nil
75
- }
76
-
77
- func (s * healthServer ) Watch (in * healthPb.HealthCheckRequest , srv healthPb.Health_WatchServer ) error {
78
- return status .Error (codes .Unimplemented , "Watch is not implemented" )
79
- }
80
-
81
65
func init () {
82
66
utilruntime .Must (clientgoscheme .AddToScheme (scheme ))
83
67
utilruntime .Must (v1alpha1 .AddToScheme (scheme ))
84
68
}
85
69
86
70
func main () {
87
-
88
71
klog .InitFlags (nil )
89
72
flag .Parse ()
90
73
91
74
ctrl .SetLogger (klog .TODO ())
92
75
76
+ // Validate flags
77
+ if err := validateFlags (); err != nil {
78
+ klog .Fatalf ("flag validation failed: %v" , err )
79
+ }
80
+
93
81
// Print all flag values
94
82
flags := "Flags: "
95
83
flag .VisitAll (func (f * flag.Flag ) {
96
84
flags += fmt .Sprintf ("%s=%v; " , f .Name , f .Value )
97
85
})
98
86
klog .Info (flags )
99
87
100
- klog . Infof ( "Listening on %q" , fmt . Sprintf ( ":%d" , * port ))
101
- lis , err := net . Listen ( "tcp" , fmt . Sprintf ( ":%d" , * port ) )
88
+ // Create a new manager to manage controllers
89
+ mgr , err := ctrl . NewManager ( ctrl . GetConfigOrDie (), ctrl. Options { Scheme : scheme } )
102
90
if err != nil {
103
- klog .Fatalf ("failed to listen : %v" , err )
91
+ klog .Fatalf ("failed to start manager : %v" , err )
104
92
}
105
93
94
+ // Create the data store used to cache watched resources
106
95
datastore := backend .NewK8sDataStore ()
107
96
108
- mgr , err := ctrl .NewManager (ctrl .GetConfigOrDie (), ctrl.Options {
109
- Scheme : scheme ,
110
- })
111
- if err != nil {
112
- klog .Error (err , "unable to start manager" )
113
- os .Exit (1 )
114
- }
115
-
97
+ // Create the controllers and register them with the manager
116
98
if err := (& backend.InferencePoolReconciler {
117
99
Datastore : datastore ,
118
100
Scheme : mgr .GetScheme (),
@@ -121,7 +103,7 @@ func main() {
121
103
PoolNamespace : * poolNamespace ,
122
104
Record : mgr .GetEventRecorderFor ("InferencePool" ),
123
105
}).SetupWithManager (mgr ); err != nil {
124
- klog .Error ( err , "Error setting up InferencePoolReconciler" )
106
+ klog .Fatalf ( "Error setting up InferencePoolReconciler: %v" , err )
125
107
}
126
108
127
109
if err := (& backend.InferenceModelReconciler {
@@ -132,7 +114,7 @@ func main() {
132
114
PoolNamespace : * poolNamespace ,
133
115
Record : mgr .GetEventRecorderFor ("InferenceModel" ),
134
116
}).SetupWithManager (mgr ); err != nil {
135
- klog .Error ( err , "Error setting up InferenceModelReconciler" )
117
+ klog .Fatalf ( "Error setting up InferenceModelReconciler: %v" , err )
136
118
}
137
119
138
120
if err := (& backend.EndpointSliceReconciler {
@@ -143,53 +125,122 @@ func main() {
143
125
ServiceName : * serviceName ,
144
126
Zone : * zone ,
145
127
}).SetupWithManager (mgr ); err != nil {
146
- klog .Error (err , "Error setting up EndpointSliceReconciler" )
128
+ klog .Fatalf ("Error setting up EndpointSliceReconciler: %v" , err )
129
+ }
130
+
131
+ // Channel to handle error signals for goroutines
132
+ errChan := make (chan error , 1 )
133
+
134
+ // Start each component in its own goroutine
135
+ startControllerManager (mgr , errChan )
136
+ healthSvr := startHealthServer (mgr , errChan , * grpcHealthPort )
137
+ extProcSvr := startExternalProcessorServer (
138
+ errChan ,
139
+ datastore ,
140
+ * grpcPort ,
141
+ * refreshPodsInterval ,
142
+ * refreshMetricsInterval ,
143
+ * targetPodHeader ,
144
+ )
145
+
146
+ // Wait for first error from any goroutine
147
+ err = <- errChan
148
+ if err != nil {
149
+ klog .Errorf ("goroutine failed: %v" , err )
150
+ } else {
151
+ klog .Infof ("Manager exited gracefully" )
147
152
}
148
153
149
- errChan := make (chan error )
154
+ // Gracefully shutdown components
155
+ if healthSvr != nil {
156
+ klog .Info ("Health server shutting down..." )
157
+ healthSvr .GracefulStop ()
158
+ }
159
+ if extProcSvr != nil {
160
+ klog .Info ("Ext-proc server shutting down..." )
161
+ extProcSvr .GracefulStop ()
162
+ }
163
+
164
+ klog .Info ("All components stopped gracefully" )
165
+ }
166
+
167
+ // startControllerManager runs the controller manager in a goroutine.
168
+ func startControllerManager (mgr ctrl.Manager , errChan chan <- error ) {
150
169
go func () {
170
+ // Blocking and will return when shutdown is complete.
151
171
if err := mgr .Start (ctrl .SetupSignalHandler ()); err != nil {
152
- klog .Error (err , "Error running manager" )
153
- errChan <- err
172
+ errChan <- fmt .Errorf ("controller manager failed to start: %w" , err )
154
173
}
174
+ // Manager exited gracefully
175
+ klog .Info ("Controller manager shutting down..." )
176
+ errChan <- nil
155
177
}()
178
+ }
156
179
157
- s := grpc .NewServer ()
180
+ // startHealthServer starts the gRPC health probe server in a goroutine.
181
+ func startHealthServer (mgr ctrl.Manager , errChan chan <- error , port int ) * grpc.Server {
182
+ healthSvr := grpc .NewServer ()
183
+ healthPb .RegisterHealthServer (healthSvr , & healthServer {Client : mgr .GetClient ()})
158
184
159
- pp := backend .NewProvider (& vllm.PodMetricsClientImpl {}, datastore )
160
- if err := pp .Init (* refreshPodsInterval , * refreshMetricsInterval ); err != nil {
161
- klog .Fatalf ("failed to initialize: %v" , err )
162
- }
163
- extProcPb .RegisterExternalProcessorServer (
164
- s ,
165
- handlers .NewServer (
166
- pp ,
167
- scheduling .NewScheduler (pp ),
168
- * targetPodHeader ,
169
- datastore ))
170
- healthPb .RegisterHealthServer (s , & healthServer {})
171
-
172
- klog .Infof ("Starting gRPC server on port :%v" , * port )
173
-
174
- // shutdown
175
- var gracefulStop = make (chan os.Signal , 1 )
176
- signal .Notify (gracefulStop , syscall .SIGTERM )
177
- signal .Notify (gracefulStop , syscall .SIGINT )
178
185
go func () {
179
- select {
180
- case sig := <- gracefulStop :
181
- klog .Infof ("caught sig: %+v" , sig )
182
- os .Exit (0 )
183
- case err := <- errChan :
184
- klog .Infof ("caught error in controller: %+v" , err )
185
- os .Exit (0 )
186
+ healthLis , err := net .Listen ("tcp" , fmt .Sprintf (":%d" , port ))
187
+ if err != nil {
188
+ errChan <- fmt .Errorf ("health server failed to listen: %w" , err )
186
189
}
190
+ klog .Infof ("Health server listening on port: %d" , port )
187
191
192
+ // Blocking and will return when shutdown is complete.
193
+ if serveErr := healthSvr .Serve (healthLis ); serveErr != nil && serveErr != grpc .ErrServerStopped {
194
+ errChan <- fmt .Errorf ("health server failed: %w" , serveErr )
195
+ }
188
196
}()
197
+ return healthSvr
198
+ }
189
199
190
- err = s .Serve (lis )
191
- if err != nil {
192
- klog .Fatalf ("Ext-proc failed with the err: %v" , err )
200
+ // startExternalProcessorServer starts the Envoy external processor server in a goroutine.
201
+ func startExternalProcessorServer (
202
+ errChan chan <- error ,
203
+ datastore * backend.K8sDatastore ,
204
+ port int ,
205
+ refreshPodsInterval , refreshMetricsInterval time.Duration ,
206
+ targetPodHeader string ,
207
+ ) * grpc.Server {
208
+ extSvr := grpc .NewServer ()
209
+ go func () {
210
+ lis , err := net .Listen ("tcp" , fmt .Sprintf (":%d" , port ))
211
+ if err != nil {
212
+ errChan <- fmt .Errorf ("ext-proc server failed to listen: %w" , err )
213
+ }
214
+ klog .Infof ("Ext-proc server listening on port: %d" , port )
215
+
216
+ // Initialize backend provider
217
+ pp := backend .NewProvider (& vllm.PodMetricsClientImpl {}, datastore )
218
+ if err := pp .Init (refreshPodsInterval , refreshMetricsInterval ); err != nil {
219
+ errChan <- fmt .Errorf ("failed to initialize backend provider: %w" , err )
220
+ }
221
+
222
+ // Register ext_proc handlers
223
+ extProcPb .RegisterExternalProcessorServer (
224
+ extSvr ,
225
+ handlers .NewServer (pp , scheduling .NewScheduler (pp ), targetPodHeader , datastore ),
226
+ )
227
+
228
+ // Blocking and will return when shutdown is complete.
229
+ if serveErr := extSvr .Serve (lis ); serveErr != nil && serveErr != grpc .ErrServerStopped {
230
+ errChan <- fmt .Errorf ("ext-proc server failed: %w" , serveErr )
231
+ }
232
+ }()
233
+ return extSvr
234
+ }
235
+
236
+ func validateFlags () error {
237
+ if * poolName == "" {
238
+ return fmt .Errorf ("required %q flag not set" , "poolName" )
239
+ }
240
+
241
+ if * serviceName == "" {
242
+ return fmt .Errorf ("required %q flag not set" , "serviceName" )
193
243
}
194
244
245
+ return nil
195
246
}
0 commit comments