@@ -24,15 +24,20 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"net"
+	"net/http"
 	"os"
 	"path/filepath"
+	"strconv"
+	"strings"
 	"testing"
 	"time"

 	configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
 	envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3"
 	"github.com/google/go-cmp/cmp"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"github.com/stretchr/testify/assert"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/credentials/insecure"
@@ -43,12 +48,16 @@ import (
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	k8syaml "k8s.io/apimachinery/pkg/util/yaml"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/component-base/metrics/legacyregistry"
+	metricsutils "k8s.io/component-base/metrics/testutil"
 	ctrl "sigs.k8s.io/controller-runtime"
 	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/envtest"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
 	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
 	extprocutils "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/test"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
@@ -57,7 +66,8 @@ import (
 )

 const (
-	port = runserver.DefaultGrpcPort
+	port        = runserver.DefaultGrpcPort
+	metricsPort = 8888
 )

 var (
@@ -76,6 +86,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 		wantHeaders       []*configPb.HeaderValueOption
 		wantMetadata      *structpb.Struct
 		wantBody          []byte
+		wantMetrics       string
 		wantErr           bool
 		immediateResponse *extProcPb.ImmediateResponse
 	}{
@@ -113,7 +124,12 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 			},
 			wantMetadata: makeMetadata("address-1:8000"),
 			wantBody:     []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"),
-			wantErr:      false,
+			wantMetrics: `
+				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
+				# TYPE inference_model_request_total counter
+				inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
+				`,
+			wantErr: false,
 		},
 		{
 			name: "select active lora, low queue",
@@ -161,7 +177,13 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 			},
 			wantMetadata: makeMetadata("address-1:8000"),
 			wantBody:     []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"),
-			wantErr:      false,
+			wantMetrics: `
+				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
+				# TYPE inference_model_request_total counter
+				inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
+				inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1
+				`,
+			wantErr: false,
 		},
 		{
 			name: "select no lora despite active model, avoid excessive queue size",
@@ -210,7 +232,13 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 			},
 			wantMetadata: makeMetadata("address-2:8000"),
 			wantBody:     []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"),
-			wantErr:      false,
+			wantMetrics: `
+				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
+				# TYPE inference_model_request_total counter
+				inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
+				inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 2
+				`,
+			wantErr: false,
 		},
 		{
 			name: "noncritical and all models past threshold, shed request",
@@ -253,6 +281,12 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 					Code: envoyTypePb.StatusCode_TooManyRequests,
 				},
 			},
+			wantMetrics: `
+				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
+				# TYPE inference_model_request_total counter
+				inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
+				inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 2
+				`,
 		},
 		{
 			name: "noncritical, but one server has capacity, do not shed",
@@ -301,7 +335,14 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 			},
 			wantMetadata: makeMetadata("address-0:8000"),
 			wantBody:     []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"),
-			wantErr:      false,
+			wantMetrics: `
+				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
+				# TYPE inference_model_request_total counter
+				inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
+				inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 2
+				inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1
+				`,
+			wantErr: false,
 		},
 	}

@@ -345,6 +386,12 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 			if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" {
 				t.Errorf("Unexpected response, (-want +got): %v", diff)
 			}
+
+			if test.wantMetrics != "" {
+				if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.wantMetrics), "inference_model_request_total"); err != nil {
+					t.Error(err)
+				}
+			}
 		})
 	}
 }
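
For context on the new assertion: GatherAndCompare parses the expected exposition-format text and diffs it against what the registry actually gathered, restricted to the named metric families, so unrelated metrics in the shared registry cannot fail the test. The `[ALPHA]` tag in the HELP lines above is added by the k8s component-base stability wrapper, not by client_golang itself. A minimal standalone sketch of the same technique using the plain client_golang testutil (the registry and counter below are illustrative, not part of this PR):

package example

import (
	"strings"
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestGatherAndCompareSketch(t *testing.T) {
	// Hypothetical registry and counter mirroring the shape of
	// inference_model_request_total; for illustration only.
	reg := prometheus.NewRegistry()
	requests := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "inference_model_request_total",
		Help: "Counter of inference model requests broken out for each model and target model.",
	}, []string{"model_name", "target_model_name"})
	reg.MustRegister(requests)

	requests.WithLabelValues("my-model", "my-model-12345").Inc()

	// Note: no [ALPHA] prefix here; that comes from the component-base
	// wrapper used in the test above, not from plain client_golang.
	want := `
		# HELP inference_model_request_total Counter of inference model requests broken out for each model and target model.
		# TYPE inference_model_request_total counter
		inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
		`
	if err := testutil.GatherAndCompare(reg, strings.NewReader(want), "inference_model_request_total"); err != nil {
		t.Error(err)
	}
}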
@@ -423,6 +470,8 @@ func BeforeSuit(t *testing.T) func() {
 		logutil.Fatal(logger, err, "Failed to create controller manager")
 	}

+	registerMetricsHandler(mgr, metricsPort)
+
 	serverRunner = runserver.NewDefaultExtProcServerRunner()
 	// Adjust from defaults
 	serverRunner.PoolName = "vllm-llama2-7b-pool"
@@ -543,3 +592,31 @@ func makeMetadata(endpoint string) *structpb.Struct {
 		},
 	}
 }
+
+// registerMetricsHandler is a simplified version of the metrics endpoint
+// handler, without authentication, for integration tests.
+func registerMetricsHandler(mgr manager.Manager, port int) error {
+	metrics.Register()
+
+	// Init HTTP server.
+	h := promhttp.HandlerFor(
+		legacyregistry.DefaultGatherer,
+		promhttp.HandlerOpts{},
+	)
+
+	mux := http.NewServeMux()
+	mux.Handle("/metrics", h)
+
+	srv := &http.Server{
+		Addr:    net.JoinHostPort("", strconv.Itoa(port)),
+		Handler: mux,
+	}
+
+	if err := mgr.Add(&manager.Server{
+		Name:   "metrics",
+		Server: srv,
+	}); err != nil {
+		return err
+	}
+	return nil
+}
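
The tests above assert against the registry in-process, but the handler registered here could also be exercised end to end by scraping the endpoint once the manager has started. A minimal sketch, assuming the manager (and thus the metrics server) is listening on metricsPort; scrapeMetrics is a hypothetical helper, not part of this PR, and the fmt, io, and net/http imports it needs are already in the file:

// scrapeMetrics fetches the raw Prometheus exposition text from the
// test metrics endpoint. Hypothetical helper, for illustration only.
func scrapeMetrics(port int) (string, error) {
	resp, err := http.Get(fmt.Sprintf("http://localhost:%d/metrics", port))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status: %s", resp.Status)
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}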