1
1
package main
2
2
3
3
import (
4
- "context"
5
4
"flag"
6
5
"fmt"
7
6
"net"
8
- "os"
9
- "os/signal"
10
- "syscall"
11
7
"time"
12
8
13
9
extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
14
10
"google.golang.org/grpc"
15
- "google.golang.org/grpc/codes"
16
11
healthPb "google.golang.org/grpc/health/grpc_health_v1"
17
- "google.golang.org/grpc/status"
18
12
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1"
19
13
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend"
20
14
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm"
@@ -29,10 +23,14 @@ import (
29
23
)
30
24
31
25
var (
32
- port = flag .Int (
33
- "port " ,
26
+ grpcPort = flag .Int (
27
+ "grpcPort " ,
34
28
9002 ,
35
- "gRPC port" )
29
+ "The gRPC port used for communicating with Envoy proxy" )
30
+ grpcHealthPort = flag .Int (
31
+ "grpcHealthPort" ,
32
+ 9003 ,
33
+ "The port used for gRPC liveness and readiness probes" )
36
34
targetPodHeader = flag .String (
37
35
"targetPodHeader" ,
38
36
"target-pod" ,
@@ -65,55 +63,39 @@ var (
65
63
scheme = runtime .NewScheme ()
66
64
)
67
65
68
- type healthServer struct {}
69
-
70
- func (s * healthServer ) Check (
71
- ctx context.Context ,
72
- in * healthPb.HealthCheckRequest ,
73
- ) (* healthPb.HealthCheckResponse , error ) {
74
- klog .Infof ("Handling grpc Check request + %s" , in .String ())
75
- return & healthPb.HealthCheckResponse {Status : healthPb .HealthCheckResponse_SERVING }, nil
76
- }
77
-
78
- func (s * healthServer ) Watch (in * healthPb.HealthCheckRequest , srv healthPb.Health_WatchServer ) error {
79
- return status .Error (codes .Unimplemented , "Watch is not implemented" )
80
- }
81
-
82
66
func init () {
83
67
utilruntime .Must (clientgoscheme .AddToScheme (scheme ))
84
68
utilruntime .Must (v1alpha1 .AddToScheme (scheme ))
85
69
}
86
70
87
71
func main () {
88
-
89
72
klog .InitFlags (nil )
90
73
flag .Parse ()
91
74
92
75
ctrl .SetLogger (klog .TODO ())
93
76
77
+ // Validate flags
78
+ if err := validateFlags (); err != nil {
79
+ klog .Fatalf ("flag validation failed: %v" , err )
80
+ }
81
+
94
82
// Print all flag values
95
83
flags := "Flags: "
96
84
flag .VisitAll (func (f * flag.Flag ) {
97
85
flags += fmt .Sprintf ("%s=%v; " , f .Name , f .Value )
98
86
})
99
87
klog .Info (flags )
100
88
101
- klog . Infof ( "Listening on %q" , fmt . Sprintf ( ":%d" , * port ))
102
- lis , err := net . Listen ( "tcp" , fmt . Sprintf ( ":%d" , * port ) )
89
+ // Create a new manager to manage controllers
90
+ mgr , err := ctrl . NewManager ( ctrl . GetConfigOrDie (), ctrl. Options { Scheme : scheme } )
103
91
if err != nil {
104
- klog .Fatalf ("failed to listen : %v" , err )
92
+ klog .Fatalf ("failed to start manager : %v" , err )
105
93
}
106
94
95
+ // Create the data store used to cache watched resources
107
96
datastore := backend .NewK8sDataStore ()
108
97
109
- mgr , err := ctrl .NewManager (ctrl .GetConfigOrDie (), ctrl.Options {
110
- Scheme : scheme ,
111
- })
112
- if err != nil {
113
- klog .Error (err , "unable to start manager" )
114
- os .Exit (1 )
115
- }
116
-
98
+ // Create the controllers and register them with the manager
117
99
if err := (& backend.InferencePoolReconciler {
118
100
Datastore : datastore ,
119
101
Scheme : mgr .GetScheme (),
@@ -124,7 +106,7 @@ func main() {
124
106
},
125
107
Record : mgr .GetEventRecorderFor ("InferencePool" ),
126
108
}).SetupWithManager (mgr ); err != nil {
127
- klog .Error ( err , "Error setting up InferencePoolReconciler" )
109
+ klog .Fatalf ( "Error setting up InferencePoolReconciler: %v" , err )
128
110
}
129
111
130
112
if err := (& backend.InferenceModelReconciler {
@@ -137,7 +119,7 @@ func main() {
137
119
},
138
120
Record : mgr .GetEventRecorderFor ("InferenceModel" ),
139
121
}).SetupWithManager (mgr ); err != nil {
140
- klog .Error ( err , "Error setting up InferenceModelReconciler" )
122
+ klog .Fatalf ( "Error setting up InferenceModelReconciler: %v" , err )
141
123
}
142
124
143
125
if err := (& backend.EndpointSliceReconciler {
@@ -148,53 +130,122 @@ func main() {
148
130
ServiceName : * serviceName ,
149
131
Zone : * zone ,
150
132
}).SetupWithManager (mgr ); err != nil {
151
- klog .Error (err , "Error setting up EndpointSliceReconciler" )
133
+ klog .Fatalf ("Error setting up EndpointSliceReconciler: %v" , err )
134
+ }
135
+
136
+ // Channel to handle error signals for goroutines
137
+ errChan := make (chan error , 1 )
138
+
139
+ // Start each component in its own goroutine
140
+ startControllerManager (mgr , errChan )
141
+ healthSvr := startHealthServer (mgr , errChan , * grpcHealthPort )
142
+ extProcSvr := startExternalProcessorServer (
143
+ errChan ,
144
+ datastore ,
145
+ * grpcPort ,
146
+ * refreshPodsInterval ,
147
+ * refreshMetricsInterval ,
148
+ * targetPodHeader ,
149
+ )
150
+
151
+ // Wait for first error from any goroutine
152
+ err = <- errChan
153
+ if err != nil {
154
+ klog .Errorf ("goroutine failed: %v" , err )
155
+ } else {
156
+ klog .Infof ("Manager exited gracefully" )
152
157
}
153
158
154
- errChan := make (chan error )
159
+ // Gracefully shutdown components
160
+ if healthSvr != nil {
161
+ klog .Info ("Health server shutting down..." )
162
+ healthSvr .GracefulStop ()
163
+ }
164
+ if extProcSvr != nil {
165
+ klog .Info ("Ext-proc server shutting down..." )
166
+ extProcSvr .GracefulStop ()
167
+ }
168
+
169
+ klog .Info ("All components stopped gracefully" )
170
+ }
171
+
172
+ // startControllerManager runs the controller manager in a goroutine.
173
+ func startControllerManager (mgr ctrl.Manager , errChan chan <- error ) {
155
174
go func () {
175
+ // Blocking and will return when shutdown is complete.
156
176
if err := mgr .Start (ctrl .SetupSignalHandler ()); err != nil {
157
- klog .Error (err , "Error running manager" )
158
- errChan <- err
177
+ errChan <- fmt .Errorf ("controller manager failed to start: %w" , err )
159
178
}
179
+ // Manager exited gracefully
180
+ klog .Info ("Controller manager shutting down..." )
181
+ errChan <- nil
160
182
}()
183
+ }
161
184
162
- s := grpc .NewServer ()
185
+ // startHealthServer starts the gRPC health probe server in a goroutine.
186
+ func startHealthServer (mgr ctrl.Manager , errChan chan <- error , port int ) * grpc.Server {
187
+ healthSvr := grpc .NewServer ()
188
+ healthPb .RegisterHealthServer (healthSvr , & healthServer {Client : mgr .GetClient ()})
163
189
164
- pp := backend .NewProvider (& vllm.PodMetricsClientImpl {}, datastore )
165
- if err := pp .Init (* refreshPodsInterval , * refreshMetricsInterval ); err != nil {
166
- klog .Fatalf ("failed to initialize: %v" , err )
167
- }
168
- extProcPb .RegisterExternalProcessorServer (
169
- s ,
170
- handlers .NewServer (
171
- pp ,
172
- scheduling .NewScheduler (pp ),
173
- * targetPodHeader ,
174
- datastore ))
175
- healthPb .RegisterHealthServer (s , & healthServer {})
176
-
177
- klog .Infof ("Starting gRPC server on port :%v" , * port )
178
-
179
- // shutdown
180
- var gracefulStop = make (chan os.Signal , 1 )
181
- signal .Notify (gracefulStop , syscall .SIGTERM )
182
- signal .Notify (gracefulStop , syscall .SIGINT )
183
190
go func () {
184
- select {
185
- case sig := <- gracefulStop :
186
- klog .Infof ("caught sig: %+v" , sig )
187
- os .Exit (0 )
188
- case err := <- errChan :
189
- klog .Infof ("caught error in controller: %+v" , err )
190
- os .Exit (0 )
191
+ healthLis , err := net .Listen ("tcp" , fmt .Sprintf (":%d" , port ))
192
+ if err != nil {
193
+ errChan <- fmt .Errorf ("health server failed to listen: %w" , err )
191
194
}
195
+ klog .Infof ("Health server listening on port: %d" , port )
192
196
197
+ // Blocking and will return when shutdown is complete.
198
+ if serveErr := healthSvr .Serve (healthLis ); serveErr != nil && serveErr != grpc .ErrServerStopped {
199
+ errChan <- fmt .Errorf ("health server failed: %w" , serveErr )
200
+ }
193
201
}()
202
+ return healthSvr
203
+ }
194
204
195
- err = s .Serve (lis )
196
- if err != nil {
197
- klog .Fatalf ("Ext-proc failed with the err: %v" , err )
205
+ // startExternalProcessorServer starts the Envoy external processor server in a goroutine.
206
+ func startExternalProcessorServer (
207
+ errChan chan <- error ,
208
+ datastore * backend.K8sDatastore ,
209
+ port int ,
210
+ refreshPodsInterval , refreshMetricsInterval time.Duration ,
211
+ targetPodHeader string ,
212
+ ) * grpc.Server {
213
+ extSvr := grpc .NewServer ()
214
+ go func () {
215
+ lis , err := net .Listen ("tcp" , fmt .Sprintf (":%d" , port ))
216
+ if err != nil {
217
+ errChan <- fmt .Errorf ("ext-proc server failed to listen: %w" , err )
218
+ }
219
+ klog .Infof ("Ext-proc server listening on port: %d" , port )
220
+
221
+ // Initialize backend provider
222
+ pp := backend .NewProvider (& vllm.PodMetricsClientImpl {}, datastore )
223
+ if err := pp .Init (refreshPodsInterval , refreshMetricsInterval ); err != nil {
224
+ errChan <- fmt .Errorf ("failed to initialize backend provider: %w" , err )
225
+ }
226
+
227
+ // Register ext_proc handlers
228
+ extProcPb .RegisterExternalProcessorServer (
229
+ extSvr ,
230
+ handlers .NewServer (pp , scheduling .NewScheduler (pp ), targetPodHeader , datastore ),
231
+ )
232
+
233
+ // Blocking and will return when shutdown is complete.
234
+ if serveErr := extSvr .Serve (lis ); serveErr != nil && serveErr != grpc .ErrServerStopped {
235
+ errChan <- fmt .Errorf ("ext-proc server failed: %w" , serveErr )
236
+ }
237
+ }()
238
+ return extSvr
239
+ }
240
+
241
+ func validateFlags () error {
242
+ if * poolName == "" {
243
+ return fmt .Errorf ("required %q flag not set" , "poolName" )
244
+ }
245
+
246
+ if * serviceName == "" {
247
+ return fmt .Errorf ("required %q flag not set" , "serviceName" )
198
248
}
199
249
250
+ return nil
200
251
}
0 commit comments