Skip to content

Commit c1cbfa1

Browse files
committed
changes for multilora
1 parent f128c07 commit c1cbfa1

File tree

11 files changed

+15615
-24
lines changed

11 files changed

+15615
-24
lines changed

config/manifests/ext_proc.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ spec:
7171
spec:
7272
containers:
7373
- name: inference-gateway-ext-proc
74-
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
74+
image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/llm-ig-ext-proc-h100:latest # NOTE(review): personal dev registry — restore the k8s-staging-images epp image before merging
7575
imagePullPolicy: Always
7676
args:
7777
- -poolName

config/manifests/vllm/deployment.yaml

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ kind: Deployment
33
metadata:
44
name: vllm-llama2-7b-pool
55
spec:
6-
replicas: 3
6+
replicas: 2
77
selector:
88
matchLabels:
99
app: vllm-llama2-7b-pool
@@ -14,7 +14,7 @@ spec:
1414
spec:
1515
containers:
1616
- name: lora
17-
image: "vllm/vllm-openai:latest"
17+
image: "us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/vllm-openai-v1-lora" # NOTE(review): personal dev registry and no tag (defaults to :latest) — pin a tag and move to a shared repo before merging
1818
imagePullPolicy: Always
1919
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
2020
args:
@@ -24,15 +24,36 @@ spec:
2424
- "1"
2525
- "--port"
2626
- "8000"
27+
- "--compilation-config"
28+
- "3"
29+
- "--max-num-seqs"
30+
- "2048"
2731
- "--enable-lora"
2832
- "--max-loras"
2933
- "4"
3034
- "--max-cpu-loras"
31-
- "12"
35+
- "15"
36+
- "--max-lora-rank"
37+
- "16"
3238
- "--lora-modules"
3339
- '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
3440
- '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
41+
- '{"name": "tweet-summary-2", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
42+
- '{"name": "tweet-summary-3", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
43+
- '{"name": "tweet-summary-4", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
44+
- '{"name": "tweet-summary-5", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
45+
- '{"name": "tweet-summary-6", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
46+
- '{"name": "tweet-summary-7", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
47+
- '{"name": "tweet-summary-8", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
48+
- '{"name": "tweet-summary-9", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
49+
- '{"name": "tweet-summary-10", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
50+
- '{"name": "tweet-summary-11", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
51+
- '{"name": "tweet-summary-12", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
52+
- '{"name": "tweet-summary-13", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
53+
- '{"name": "tweet-summary-14", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
3554
env:
55+
- name: VLLM_USE_V1
56+
value: "1"
3657
- name: PORT
3758
value: "8000"
3859
- name: HUGGING_FACE_HUB_TOKEN
@@ -42,6 +63,8 @@ spec:
4263
key: token
4364
- name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
4465
value: "true"
66+
- name: VLLM_ALLOW_RUNTIME_LORA_UPDATING # NOTE(review): duplicate of the identical env entry added just above — remove one copy
67+
value: "true"
4568
ports:
4669
- containerPort: 8000
4770
name: http

pkg/epp/backend/vllm/metric.go

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
// Package vllm provides vllm specific pod metrics implementation.
2+
package vllm
3+
4+
import (
5+
"context"
6+
"fmt"
7+
"net/http"
8+
"sort"
9+
"strconv"
10+
"strings"
11+
12+
dto "github.com/prometheus/client_model/go"
13+
"github.com/prometheus/common/expfmt"
14+
"go.uber.org/multierr"
15+
klog "k8s.io/klog/v2"
16+
"sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend"
17+
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging"
18+
)
19+
20+
// Metric names exposed by vLLM's Prometheus endpoint that this package scrapes.
const (
	// LoRA adapter bookkeeping: one gauge family whose labels carry
	// comma-separated adapter names plus the configured adapter limit.
	LoraRequestInfoMetricName                = "vllm:lora_requests_info"
	LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
	LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
	LoraRequestInfoMaxAdaptersMetricName     = "max_lora"

	// Queue depth, currently measured in requests.
	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
	RunningQueueSizeMetricName = "vllm:num_requests_running"
	WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
	/* TODO: Uncomment this once the following are added to the fork.
	RunningQueueSizeMetricName = "vllm:num_tokens_running"
	WaitingQueueSizeMetricName = "vllm:num_tokens_waiting"
	*/

	// KV-cache utilization gauges.
	KVCacheUsagePercentMetricName     = "vllm:gpu_cache_usage_perc"
	KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity"
)
35+
36+
type PodMetricsClientImpl struct{}
37+
38+
// FetchMetrics fetches metrics from a given pod.
39+
func (p *PodMetricsClientImpl) FetchMetrics(
40+
ctx context.Context,
41+
pod backend.Pod,
42+
existing *backend.PodMetrics,
43+
) (*backend.PodMetrics, error) {
44+
// Currently the metrics endpoint is hard-coded, which works with vLLM.
45+
// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config.
46+
url := fmt.Sprintf("http://%s/metrics", pod.Address)
47+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
48+
if err != nil {
49+
klog.V(logutil.DEFAULT).ErrorS(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
50+
return nil, fmt.Errorf("failed to create request: %v", err)
51+
}
52+
resp, err := http.DefaultClient.Do(req)
53+
if err != nil {
54+
klog.V(logutil.DEFAULT).ErrorS(err, "Failed to fetch metrics", "pod", pod)
55+
return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod, err)
56+
}
57+
defer func() {
58+
_ = resp.Body.Close()
59+
}()
60+
61+
if resp.StatusCode != http.StatusOK {
62+
klog.V(logutil.DEFAULT).ErrorS(nil, "Unexpected status code returned", "pod", pod, "statusCode", resp.StatusCode)
63+
return nil, fmt.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode)
64+
}
65+
66+
parser := expfmt.TextParser{}
67+
metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
68+
if err != nil {
69+
return nil, err
70+
}
71+
return promToPodMetrics(metricFamilies, existing)
72+
}
73+
74+
// promToPodMetrics updates internal pod metrics with scraped prometheus metrics.
75+
// A combined error is returned if errors occur in one or more metric processing.
76+
// It returns a new PodMetrics pointer which can be used to atomically update the pod metrics map.
77+
func promToPodMetrics(
78+
metricFamilies map[string]*dto.MetricFamily,
79+
existing *backend.PodMetrics,
80+
) (*backend.PodMetrics, error) {
81+
var errs error
82+
updated := existing.Clone()
83+
runningQueueSize, err := getLatestMetric(metricFamilies, RunningQueueSizeMetricName)
84+
errs = multierr.Append(errs, err)
85+
if err == nil {
86+
updated.RunningQueueSize = int(runningQueueSize.GetGauge().GetValue())
87+
}
88+
waitingQueueSize, err := getLatestMetric(metricFamilies, WaitingQueueSizeMetricName)
89+
errs = multierr.Append(errs, err)
90+
if err == nil {
91+
updated.WaitingQueueSize = int(waitingQueueSize.GetGauge().GetValue())
92+
}
93+
cachePercent, err := getLatestMetric(metricFamilies, KVCacheUsagePercentMetricName)
94+
errs = multierr.Append(errs, err)
95+
if err == nil {
96+
updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
97+
}
98+
99+
// Get up to 5 of the latest Lora metrics.
100+
loraMetricsSlice, err := getLatestLoraMetrics(metricFamilies)
101+
errs = multierr.Append(errs, err)
102+
if err == nil && len(loraMetricsSlice) > 0 {
103+
var adapterList []string
104+
adapterSet := make(map[string]bool)
105+
// Iterate over metrics in descending order by creation timestamp.
106+
for _, m := range loraMetricsSlice {
107+
for _, label := range m.GetLabel() {
108+
// Optionally update max active models from the metric.
109+
if label.GetName() == LoraRequestInfoMaxAdaptersMetricName && label.GetValue() != "" {
110+
updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
111+
if err != nil {
112+
errs = multierr.Append(errs, err)
113+
}
114+
break
115+
}
116+
}
117+
}
118+
119+
// Iterate over metrics in descending order by creation timestamp.
120+
for _, m := range loraMetricsSlice {
121+
// If we already have 5 unique adapters, stop processing.
122+
if len(adapterList) >= updated.MaxActiveModels {
123+
break
124+
}
125+
for _, label := range m.GetLabel() {
126+
// Process both running and waiting adapter labels.
127+
if label.GetName() == LoraRequestInfoRunningAdaptersMetricName ||
128+
label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
129+
if label.GetValue() != "" {
130+
adapters := strings.Split(label.GetValue(), ",")
131+
for _, adapter := range adapters {
132+
adapter = strings.TrimSpace(adapter)
133+
if adapter != "" && !adapterSet[adapter] {
134+
adapterSet[adapter] = true
135+
adapterList = append(adapterList, adapter)
136+
if len(adapterList) >= updated.MaxActiveModels {
137+
break
138+
}
139+
}
140+
}
141+
}
142+
}
143+
// Break early if we've collected 5 adapters.
144+
if len(adapterList) >= updated.MaxActiveModels {
145+
break
146+
}
147+
}
148+
}
149+
150+
updated.ActiveModels = make(map[string]int)
151+
for _, adapter := range adapterList {
152+
updated.ActiveModels[adapter] = 0
153+
}
154+
}
155+
156+
return updated, errs
157+
}
158+
159+
// getLatestLoraMetrics gets up to 5 latest lora metric series from the gauge metric family `vllm:lora_requests_info`.
160+
// Each metric’s gauge value represents its creation timestamp. Only metrics with non‑empty running or waiting adapter labels are considered.
161+
func getLatestLoraMetrics(metricFamilies map[string]*dto.MetricFamily) ([]*dto.Metric, error) {
162+
loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
163+
if !ok {
164+
klog.V(logutil.DEFAULT).ErrorS(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
165+
return nil, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
166+
}
167+
168+
var validMetrics []*dto.Metric
169+
// Iterate over all metrics in the family.
170+
for _, m := range loraRequests.GetMetric() {
171+
var running, waiting string
172+
// Read the label values for running and waiting adapters.
173+
for _, lp := range m.GetLabel() {
174+
switch lp.GetName() {
175+
case LoraRequestInfoRunningAdaptersMetricName:
176+
running = lp.GetValue()
177+
case LoraRequestInfoWaitingAdaptersMetricName:
178+
waiting = lp.GetValue()
179+
}
180+
}
181+
// Ignore metrics with both labels empty.
182+
if running == "" && waiting == "" {
183+
continue
184+
}
185+
validMetrics = append(validMetrics, m)
186+
}
187+
188+
if len(validMetrics) == 0 {
189+
return nil, fmt.Errorf("no valid metric found")
190+
}
191+
192+
// Sort validMetrics in descending order by their gauge value (interpreted as creation timestamp).
193+
sort.Slice(validMetrics, func(i, j int) bool {
194+
return validMetrics[i].GetGauge().GetValue() > validMetrics[j].GetGauge().GetValue()
195+
})
196+
197+
// We return all valid metrics so the caller can pick adapter names in order,
198+
// limiting to 5 unique adapter names across the metrics.
199+
return validMetrics, nil
200+
}
201+
202+
// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
203+
// Since vllm doesn't set the timestamp in metric, this metric essentially gets the first metric.
204+
func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) {
205+
mf, ok := metricFamilies[metricName]
206+
if !ok {
207+
klog.V(logutil.DEFAULT).ErrorS(nil, "Metric family not found", "name", metricName)
208+
return nil, fmt.Errorf("metric family %q not found", metricName)
209+
}
210+
if len(mf.GetMetric()) == 0 {
211+
return nil, fmt.Errorf("no metrics available for %q", metricName)
212+
}
213+
var latestTs int64
214+
var latest *dto.Metric
215+
for _, m := range mf.GetMetric() {
216+
if m.GetTimestampMs() >= latestTs {
217+
latestTs = m.GetTimestampMs()
218+
latest = m
219+
}
220+
}
221+
klog.V(logutil.TRACE).InfoS("Metric value selected", "value", latest, "metric", metricName)
222+
return latest, nil
223+
}

pkg/epp/backend/vllm/metrics.go

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
const (
3838
LoraRequestInfoMetricName = "vllm:lora_requests_info"
3939
LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
40+
LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
4041
LoraRequestInfoMaxAdaptersMetricName = "max_lora"
4142
// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
4243
RunningQueueSizeMetricName = "vllm:num_requests_running"
@@ -136,6 +137,14 @@ func promToPodMetrics(
136137
}
137138
}
138139
}
140+
if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
141+
if label.GetValue() != "" {
142+
adapterList := strings.Split(label.GetValue(), ",")
143+
for _, adapter := range adapterList {
144+
updated.ActiveModels[adapter] = 0
145+
}
146+
}
147+
}
139148
if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
140149
if label.GetValue() != "" {
141150
updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
@@ -161,14 +170,40 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr
161170
logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
162171
return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
163172
}
164-
var latestTs float64
173+
165174
var latest *dto.Metric
175+
var latestTs float64
176+
177+
// Iterate over all metrics in the family.
166178
for _, m := range loraRequests.GetMetric() {
179+
var running, waiting string
180+
// Read the label values for running and waiting adapters.
181+
for _, lp := range m.GetLabel() {
182+
switch lp.GetName() {
183+
case LoraRequestInfoRunningAdaptersMetricName:
184+
running = lp.GetValue()
185+
case LoraRequestInfoWaitingAdaptersMetricName:
186+
waiting = lp.GetValue()
187+
}
188+
}
189+
190+
// Ignore metrics with both labels empty.
191+
if running == "" && waiting == "" {
192+
// continue
193+
}
194+
195+
// Select the metric with the latest creation timestamp.
167196
if m.GetGauge().GetValue() > latestTs {
168197
latestTs = m.GetGauge().GetValue()
169198
latest = m
170199
}
171200
}
201+
202+
if latest == nil {
203+
return nil, time.Time{}, fmt.Errorf("no valid metric found")
204+
}
205+
206+
// Convert the gauge value (creation timestamp) to time.Time.
172207
return latest, time.Unix(0, int64(latestTs*1000)), nil
173208
}
174209

0 commit comments

Comments
 (0)