2
2
package vllm
3
3
4
4
import (
5
+ "context"
5
6
"ext-proc/backend"
6
7
"fmt"
7
8
"net/http"
@@ -15,8 +16,7 @@ import (
15
16
)
16
17
17
18
const (
18
- ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info"
19
- LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters"
19
+ ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info"
20
20
// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
21
21
RunningQueueSizeMetricName = "vllm:num_requests_running"
22
22
WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
@@ -32,11 +32,15 @@ type PodMetricsClientImpl struct {
32
32
}
33
33
34
34
// FetchMetrics fetches metrics from a given pod.
35
- func (p * PodMetricsClientImpl ) FetchMetrics (pod backend.Pod , existing * backend.PodMetrics ) (* backend.PodMetrics , error ) {
35
+ func (p * PodMetricsClientImpl ) FetchMetrics (ctx context. Context , pod backend.Pod , existing * backend.PodMetrics ) (* backend.PodMetrics , error ) {
36
36
// Currently the metrics endpoint is hard-coded, which works with vLLM.
37
37
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16): Consume this from LLMServerPool config.
38
38
url := fmt .Sprintf ("http://%s/metrics" , pod .Address )
39
- resp , err := http .Get (url )
39
+ req , err := http .NewRequestWithContext (ctx , http .MethodGet , url , nil )
40
+ if err != nil {
41
+ return nil , fmt .Errorf ("failed to create request: %v" , err )
42
+ }
43
+ resp , err := http .DefaultClient .Do (req )
40
44
if err != nil {
41
45
klog .Errorf ("failed to fetch metrics from %s: %v" , pod , err )
42
46
return nil , fmt .Errorf ("failed to fetch metrics from %s: %w" , pod , err )
@@ -63,23 +67,23 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
63
67
var errs error
64
68
updated := existing .Clone ()
65
69
runningQueueSize , _ , err := getLatestMetric (metricFamilies , RunningQueueSizeMetricName )
66
- multierr .Append (errs , err )
70
+ errs = multierr .Append (errs , err )
67
71
if err == nil {
68
72
updated .RunningQueueSize = int (runningQueueSize .GetGauge ().GetValue ())
69
73
}
70
74
waitingQueueSize , _ , err := getLatestMetric (metricFamilies , WaitingQueueSizeMetricName )
71
- multierr .Append (errs , err )
75
+ errs = multierr .Append (errs , err )
72
76
if err == nil {
73
77
updated .WaitingQueueSize = int (waitingQueueSize .GetGauge ().GetValue ())
74
78
}
75
79
cachePercent , _ , err := getLatestMetric (metricFamilies , KVCacheUsagePercentMetricName )
76
- multierr .Append (errs , err )
80
+ errs = multierr .Append (errs , err )
77
81
if err == nil {
78
82
updated .KVCacheUsagePercent = cachePercent .GetGauge ().GetValue ()
79
83
}
80
84
/* TODO: uncomment once this is available in vllm.
81
85
kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
82
- multierr.Append(errs, err)
86
+ errs = multierr.Append(errs, err)
83
87
if err == nil {
84
88
updated.KvCacheMaxTokenCapacity = int(kvCap)
85
89
}
@@ -107,7 +111,7 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
107
111
}
108
112
} else {
109
113
klog .Warningf ("metric family %q not found" , ActiveLoRAAdaptersMetricName )
110
- multierr .Append (errs , fmt .Errorf ("metric family %q not found" , ActiveLoRAAdaptersMetricName ))
114
+ errs = multierr .Append (errs , fmt .Errorf ("metric family %q not found" , ActiveLoRAAdaptersMetricName ))
111
115
}
112
116
113
117
return updated , errs
0 commit comments