|
| 1 | +// Package vllm provides vllm specific pod metrics implementation. |
| 2 | +package vllm |
| 3 | + |
| 4 | +import ( |
| 5 | + "context" |
| 6 | + "fmt" |
| 7 | + "net/http" |
| 8 | + "sort" |
| 9 | + "strconv" |
| 10 | + "strings" |
| 11 | + |
| 12 | + dto "github.com/prometheus/client_model/go" |
| 13 | + "github.com/prometheus/common/expfmt" |
| 14 | + "go.uber.org/multierr" |
| 15 | + klog "k8s.io/klog/v2" |
| 16 | + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" |
| 17 | + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" |
| 18 | +) |
| 19 | + |
| 20 | +const ( |
| 21 | + LoraRequestInfoMetricName = "vllm:lora_requests_info" |
| 22 | + LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" |
| 23 | + LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters" |
| 24 | + LoraRequestInfoMaxAdaptersMetricName = "max_lora" |
| 25 | + // TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork. |
| 26 | + RunningQueueSizeMetricName = "vllm:num_requests_running" |
| 27 | + WaitingQueueSizeMetricName = "vllm:num_requests_waiting" |
| 28 | + /* TODO: Uncomment this once the following are added to the fork. |
| 29 | + RunningQueueSizeMetricName = "vllm:num_tokens_running" |
| 30 | + WaitingQueueSizeMetricName = "vllm:num_tokens_waiting" |
| 31 | + */ |
| 32 | + KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc" |
| 33 | + KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity" |
| 34 | +) |
| 35 | + |
| 36 | +type PodMetricsClientImpl struct{} |
| 37 | + |
| 38 | +// FetchMetrics fetches metrics from a given pod. |
| 39 | +func (p *PodMetricsClientImpl) FetchMetrics( |
| 40 | + ctx context.Context, |
| 41 | + pod backend.Pod, |
| 42 | + existing *backend.PodMetrics, |
| 43 | +) (*backend.PodMetrics, error) { |
| 44 | + // Currently the metrics endpoint is hard-coded, which works with vLLM. |
| 45 | + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. |
| 46 | + url := fmt.Sprintf("http://%s/metrics", pod.Address) |
| 47 | + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) |
| 48 | + if err != nil { |
| 49 | + klog.V(logutil.DEFAULT).ErrorS(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) |
| 50 | + return nil, fmt.Errorf("failed to create request: %v", err) |
| 51 | + } |
| 52 | + resp, err := http.DefaultClient.Do(req) |
| 53 | + if err != nil { |
| 54 | + klog.V(logutil.DEFAULT).ErrorS(err, "Failed to fetch metrics", "pod", pod) |
| 55 | + return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod, err) |
| 56 | + } |
| 57 | + defer func() { |
| 58 | + _ = resp.Body.Close() |
| 59 | + }() |
| 60 | + |
| 61 | + if resp.StatusCode != http.StatusOK { |
| 62 | + klog.V(logutil.DEFAULT).ErrorS(nil, "Unexpected status code returned", "pod", pod, "statusCode", resp.StatusCode) |
| 63 | + return nil, fmt.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode) |
| 64 | + } |
| 65 | + |
| 66 | + parser := expfmt.TextParser{} |
| 67 | + metricFamilies, err := parser.TextToMetricFamilies(resp.Body) |
| 68 | + if err != nil { |
| 69 | + return nil, err |
| 70 | + } |
| 71 | + return promToPodMetrics(metricFamilies, existing) |
| 72 | +} |
| 73 | + |
| 74 | +// promToPodMetrics updates internal pod metrics with scraped prometheus metrics. |
| 75 | +// A combined error is returned if errors occur in one or more metric processing. |
| 76 | +// It returns a new PodMetrics pointer which can be used to atomically update the pod metrics map. |
| 77 | +func promToPodMetrics( |
| 78 | + metricFamilies map[string]*dto.MetricFamily, |
| 79 | + existing *backend.PodMetrics, |
| 80 | +) (*backend.PodMetrics, error) { |
| 81 | + var errs error |
| 82 | + updated := existing.Clone() |
| 83 | + runningQueueSize, err := getLatestMetric(metricFamilies, RunningQueueSizeMetricName) |
| 84 | + errs = multierr.Append(errs, err) |
| 85 | + if err == nil { |
| 86 | + updated.RunningQueueSize = int(runningQueueSize.GetGauge().GetValue()) |
| 87 | + } |
| 88 | + waitingQueueSize, err := getLatestMetric(metricFamilies, WaitingQueueSizeMetricName) |
| 89 | + errs = multierr.Append(errs, err) |
| 90 | + if err == nil { |
| 91 | + updated.WaitingQueueSize = int(waitingQueueSize.GetGauge().GetValue()) |
| 92 | + } |
| 93 | + cachePercent, err := getLatestMetric(metricFamilies, KVCacheUsagePercentMetricName) |
| 94 | + errs = multierr.Append(errs, err) |
| 95 | + if err == nil { |
| 96 | + updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue() |
| 97 | + } |
| 98 | + |
| 99 | + // Get up to 5 of the latest Lora metrics. |
| 100 | + loraMetricsSlice, err := getLatestLoraMetrics(metricFamilies) |
| 101 | + errs = multierr.Append(errs, err) |
| 102 | + if err == nil && len(loraMetricsSlice) > 0 { |
| 103 | + var adapterList []string |
| 104 | + adapterSet := make(map[string]bool) |
| 105 | + // Iterate over metrics in descending order by creation timestamp. |
| 106 | + for _, m := range loraMetricsSlice { |
| 107 | + for _, label := range m.GetLabel() { |
| 108 | + // Optionally update max active models from the metric. |
| 109 | + if label.GetName() == LoraRequestInfoMaxAdaptersMetricName && label.GetValue() != "" { |
| 110 | + updated.MaxActiveModels, err = strconv.Atoi(label.GetValue()) |
| 111 | + if err != nil { |
| 112 | + errs = multierr.Append(errs, err) |
| 113 | + } |
| 114 | + break |
| 115 | + } |
| 116 | + } |
| 117 | + } |
| 118 | + |
| 119 | + // Iterate over metrics in descending order by creation timestamp. |
| 120 | + for _, m := range loraMetricsSlice { |
| 121 | + // If we already have 5 unique adapters, stop processing. |
| 122 | + if len(adapterList) >= updated.MaxActiveModels { |
| 123 | + break |
| 124 | + } |
| 125 | + for _, label := range m.GetLabel() { |
| 126 | + // Process both running and waiting adapter labels. |
| 127 | + if label.GetName() == LoraRequestInfoRunningAdaptersMetricName || |
| 128 | + label.GetName() == LoraRequestInfoWaitingAdaptersMetricName { |
| 129 | + if label.GetValue() != "" { |
| 130 | + adapters := strings.Split(label.GetValue(), ",") |
| 131 | + for _, adapter := range adapters { |
| 132 | + adapter = strings.TrimSpace(adapter) |
| 133 | + if adapter != "" && !adapterSet[adapter] { |
| 134 | + adapterSet[adapter] = true |
| 135 | + adapterList = append(adapterList, adapter) |
| 136 | + if len(adapterList) >= updated.MaxActiveModels { |
| 137 | + break |
| 138 | + } |
| 139 | + } |
| 140 | + } |
| 141 | + } |
| 142 | + } |
| 143 | + // Break early if we've collected 5 adapters. |
| 144 | + if len(adapterList) >= updated.MaxActiveModels { |
| 145 | + break |
| 146 | + } |
| 147 | + } |
| 148 | + } |
| 149 | + |
| 150 | + updated.ActiveModels = make(map[string]int) |
| 151 | + for _, adapter := range adapterList { |
| 152 | + updated.ActiveModels[adapter] = 0 |
| 153 | + } |
| 154 | + } |
| 155 | + |
| 156 | + return updated, errs |
| 157 | +} |
| 158 | + |
| 159 | +// getLatestLoraMetrics gets up to 5 latest lora metric series from the gauge metric family `vllm:lora_requests_info`. |
| 160 | +// Each metric’s gauge value represents its creation timestamp. Only metrics with non‑empty running or waiting adapter labels are considered. |
| 161 | +func getLatestLoraMetrics(metricFamilies map[string]*dto.MetricFamily) ([]*dto.Metric, error) { |
| 162 | + loraRequests, ok := metricFamilies[LoraRequestInfoMetricName] |
| 163 | + if !ok { |
| 164 | + klog.V(logutil.DEFAULT).ErrorS(nil, "Metric family not found", "name", LoraRequestInfoMetricName) |
| 165 | + return nil, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName) |
| 166 | + } |
| 167 | + |
| 168 | + var validMetrics []*dto.Metric |
| 169 | + // Iterate over all metrics in the family. |
| 170 | + for _, m := range loraRequests.GetMetric() { |
| 171 | + var running, waiting string |
| 172 | + // Read the label values for running and waiting adapters. |
| 173 | + for _, lp := range m.GetLabel() { |
| 174 | + switch lp.GetName() { |
| 175 | + case LoraRequestInfoRunningAdaptersMetricName: |
| 176 | + running = lp.GetValue() |
| 177 | + case LoraRequestInfoWaitingAdaptersMetricName: |
| 178 | + waiting = lp.GetValue() |
| 179 | + } |
| 180 | + } |
| 181 | + // Ignore metrics with both labels empty. |
| 182 | + if running == "" && waiting == "" { |
| 183 | + continue |
| 184 | + } |
| 185 | + validMetrics = append(validMetrics, m) |
| 186 | + } |
| 187 | + |
| 188 | + if len(validMetrics) == 0 { |
| 189 | + return nil, fmt.Errorf("no valid metric found") |
| 190 | + } |
| 191 | + |
| 192 | + // Sort validMetrics in descending order by their gauge value (interpreted as creation timestamp). |
| 193 | + sort.Slice(validMetrics, func(i, j int) bool { |
| 194 | + return validMetrics[i].GetGauge().GetValue() > validMetrics[j].GetGauge().GetValue() |
| 195 | + }) |
| 196 | + |
| 197 | + // We return all valid metrics so the caller can pick adapter names in order, |
| 198 | + // limiting to 5 unique adapter names across the metrics. |
| 199 | + return validMetrics, nil |
| 200 | +} |
| 201 | + |
| 202 | +// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric. |
| 203 | +// Since vllm doesn't set the timestamp in metric, this metric essentially gets the first metric. |
| 204 | +func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) { |
| 205 | + mf, ok := metricFamilies[metricName] |
| 206 | + if !ok { |
| 207 | + klog.V(logutil.DEFAULT).ErrorS(nil, "Metric family not found", "name", metricName) |
| 208 | + return nil, fmt.Errorf("metric family %q not found", metricName) |
| 209 | + } |
| 210 | + if len(mf.GetMetric()) == 0 { |
| 211 | + return nil, fmt.Errorf("no metrics available for %q", metricName) |
| 212 | + } |
| 213 | + var latestTs int64 |
| 214 | + var latest *dto.Metric |
| 215 | + for _, m := range mf.GetMetric() { |
| 216 | + if m.GetTimestampMs() >= latestTs { |
| 217 | + latestTs = m.GetTimestampMs() |
| 218 | + latest = m |
| 219 | + } |
| 220 | + } |
| 221 | + klog.V(logutil.TRACE).InfoS("Metric value selected", "value", latest, "metric", metricName) |
| 222 | + return latest, nil |
| 223 | +} |
0 commit comments