Skip to content

Commit c1cbfa1

Browse files
committed
changes for multilora
1 parent f128c07 commit c1cbfa1

File tree

11 files changed

+15615
-24
lines changed

11 files changed

+15615
-24
lines changed

config/manifests/ext_proc.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ spec:
7171
spec:
7272
containers:
7373
- name: inference-gateway-ext-proc
74-
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
74+
image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/llm-ig-ext-proc-h100:latest # NOTE(review): personal dev registry — restore the k8s-staging-images epp image before merging
7575
imagePullPolicy: Always
7676
args:
7777
- -poolName

config/manifests/vllm/deployment.yaml

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ kind: Deployment
33
metadata:
44
name: vllm-llama2-7b-pool
55
spec:
6-
replicas: 3
6+
replicas: 2
77
selector:
88
matchLabels:
99
app: vllm-llama2-7b-pool
@@ -14,7 +14,7 @@ spec:
1414
spec:
1515
containers:
1616
- name: lora
17-
image: "vllm/vllm-openai:latest"
17+
image: "us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/vllm-openai-v1-lora" # NOTE(review): personal dev registry and no tag (defaults to :latest) — pin a tag and move to a shared repo before merging
1818
imagePullPolicy: Always
1919
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
2020
args:
@@ -24,15 +24,36 @@ spec:
2424
- "1"
2525
- "--port"
2626
- "8000"
27+
- "--compilation-config"
28+
- "3"
29+
- "--max-num-seqs"
30+
- "2048"
2731
- "--enable-lora"
2832
- "--max-loras"
2933
- "4"
3034
- "--max-cpu-loras"
31-
- "12"
35+
- "15"
36+
- "--max-lora-rank"
37+
- "16"
3238
- "--lora-modules"
3339
- '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
3440
- '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
41+
- '{"name": "tweet-summary-2", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
42+
- '{"name": "tweet-summary-3", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
43+
- '{"name": "tweet-summary-4", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
44+
- '{"name": "tweet-summary-5", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
45+
- '{"name": "tweet-summary-6", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
46+
- '{"name": "tweet-summary-7", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
47+
- '{"name": "tweet-summary-8", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
48+
- '{"name": "tweet-summary-9", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
49+
- '{"name": "tweet-summary-10", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
50+
- '{"name": "tweet-summary-11", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
51+
- '{"name": "tweet-summary-12", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
52+
- '{"name": "tweet-summary-13", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
53+
- '{"name": "tweet-summary-14", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
3554
env:
55+
- name: VLLM_USE_V1
56+
value: "1"
3657
- name: PORT
3758
value: "8000"
3859
- name: HUGGING_FACE_HUB_TOKEN
@@ -42,6 +63,8 @@ spec:
4263
key: token
4364
- name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
4465
value: "true"
66+
- name: VLLM_ALLOW_RUNTIME_LORA_UPDATING # NOTE(review): duplicate of the identical env entry added just above — remove one copy
67+
value: "true"
4568
ports:
4669
- containerPort: 8000
4770
name: http

pkg/epp/backend/vllm/metric.go

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
// Package vllm provides vllm specific pod metrics implementation.
2+
package vllm
3+
4+
import (
5+
"context"
6+
"fmt"
7+
"net/http"
8+
"sort"
9+
"strconv"
10+
"strings"
11+
12+
dto "github.com/prometheus/client_model/go"
13+
"github.com/prometheus/common/expfmt"
14+
"go.uber.org/multierr"
15+
klog "k8s.io/klog/v2"
16+
"sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend"
17+
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging"
18+
)
19+
20+
// Metric names exposed by vLLM's Prometheus endpoint that this package scrapes.
const (
	// LoRA adapter bookkeeping: one gauge family whose labels carry
	// comma-separated adapter names plus the configured adapter limit.
	LoraRequestInfoMetricName                = "vllm:lora_requests_info"
	LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
	LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
	LoraRequestInfoMaxAdaptersMetricName     = "max_lora"

	// Queue depth, currently measured in requests.
	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
	RunningQueueSizeMetricName = "vllm:num_requests_running"
	WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
	/* TODO: Uncomment this once the following are added to the fork.
	RunningQueueSizeMetricName = "vllm:num_tokens_running"
	WaitingQueueSizeMetricName = "vllm:num_tokens_waiting"
	*/

	// KV-cache utilization gauges.
	KVCacheUsagePercentMetricName     = "vllm:gpu_cache_usage_perc"
	KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity"
)
35+
36+
type PodMetricsClientImpl struct{}
37+
38+
// FetchMetrics fetches metrics from a given pod.
39+
func (p *PodMetricsClientImpl) FetchMetrics(
40+
ctx context.Context,
41+
pod backend.Pod,
42+
existing *backend.PodMetrics,
43+
) (*backend.PodMetrics, error) {
44+
// Currently the metrics endpoint is hard-coded, which works with vLLM.
45+
// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config.
46+
url := fmt.Sprintf("http://%s/metrics", pod.Address)
47+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
48+
if err != nil {
49+
klog.V(logutil.DEFAULT).ErrorS(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
50+
return nil, fmt.Errorf("failed to create request: %v", err)
51+
}
52+
resp, err := http.DefaultClient.Do(req)
53+
if err != nil {
54+
klog.V(logutil.DEFAULT).ErrorS(err, "Failed to fetch metrics", "pod", pod)
55+
return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod, err)
56+
}
57+
defer func() {
58+
_ = resp.Body.Close()
59+
}()
60+
61+
if resp.StatusCode != http.StatusOK {
62+
klog.V(logutil.DEFAULT).ErrorS(nil, "Unexpected status code returned", "pod", pod, "statusCode", resp.StatusCode)
63+
return nil, fmt.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode)
64+
}
65+
66+
parser := expfmt.TextParser{}
67+
metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
68+
if err != nil {
69+
return nil, err
70+
}
71+
return promToPodMetrics(metricFamilies, existing)
72+
}
73+
74+
// promToPodMetrics updates internal pod metrics with scraped prometheus metrics.
75+
// A combined error is returned if errors occur in one or more metric processing.
76+
// It returns a new PodMetrics pointer which can be used to atomically update the pod metrics map.
77+
func promToPodMetrics(
78+
metricFamilies map[string]*dto.MetricFamily,
79+
existing *backend.PodMetrics,
80+
) (*backend.PodMetrics, error) {
81+
var errs error
82+
updated := existing.Clone()
83+
runningQueueSize, err := getLatestMetric(metricFamilies, RunningQueueSizeMetricName)
84+
errs = multierr.Append(errs, err)
85+
if err == nil {
86+
updated.RunningQueueSize = int(runningQueueSize.GetGauge().GetValue())
87+
}
88+
waitingQueueSize, err := getLatestMetric(metricFamilies, WaitingQueueSizeMetricName)
89+
errs = multierr.Append(errs, err)
90+
if err == nil {
91+
updated.WaitingQueueSize = int(waitingQueueSize.GetGauge().GetValue())
92+
}
93+
cachePercent, err := getLatestMetric(metricFamilies, KVCacheUsagePercentMetricName)
94+
errs = multierr.Append(errs, err)
95+
if err == nil {
96+
updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
97+
}
98+
99+
// Get up to 5 of the latest Lora metrics.
100+
loraMetricsSlice, err := getLatestLoraMetrics(metricFamilies)
101+
errs = multierr.Append(errs, err)
102+
if err == nil && len(loraMetricsSlice) > 0 {
103+
var adapterList []string
104+
adapterSet := make(map[string]bool)
105+
// Iterate over metrics in descending order by creation timestamp.
106+
for _, m := range loraMetricsSlice {
107+
for _, label := range m.GetLabel() {
108+
// Optionally update max active models from the metric.
109+
if label.GetName() == LoraRequestInfoMaxAdaptersMetricName && label.GetValue() != "" {
110+
updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
111+
if err != nil {
112+
errs = multierr.Append(errs, err)
113+
}
114+
break
115+
}
116+
}
117+
}
118+
119+
// Iterate over metrics in descending order by creation timestamp.
120+
for _, m := range loraMetricsSlice {
121+
// If we already have 5 unique adapters, stop processing.
122+
if len(adapterList) >= updated.MaxActiveModels {
123+
break
124+
}
125+
for _, label := range m.GetLabel() {
126+
// Process both running and waiting adapter labels.
127+
if label.GetName() == LoraRequestInfoRunningAdaptersMetricName ||
128+
label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
129+
if label.GetValue() != "" {
130+
adapters := strings.Split(label.GetValue(), ",")
131+
for _, adapter := range adapters {
132+
adapter = strings.TrimSpace(adapter)
133+
if adapter != "" && !adapterSet[adapter] {
134+
adapterSet[adapter] = true
135+
adapterList = append(adapterList, adapter)
136+
if len(adapterList) >= updated.MaxActiveModels {
137+
break
138+
}
139+
}
140+
}
141+
}
142+
}
143+
// Break early if we've collected 5 adapters.
144+
if len(adapterList) >= updated.MaxActiveModels {
145+
break
146+
}
147+
}
148+
}
149+
150+
updated.ActiveModels = make(map[string]int)
151+
for _, adapter := range adapterList {
152+
updated.ActiveModels[adapter] = 0
153+
}
154+
}
155+
156+
return updated, errs
157+
}
158+
159+
// getLatestLoraMetrics gets up to 5 latest lora metric series from the gauge metric family `vllm:lora_requests_info`.
160+
// Each metric’s gauge value represents its creation timestamp. Only metrics with non‑empty running or waiting adapter labels are considered.
161+
func getLatestLoraMetrics(metricFamilies map[string]*dto.MetricFamily) ([]*dto.Metric, error) {
162+
loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
163+
if !ok {
164+
klog.V(logutil.DEFAULT).ErrorS(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
165+
return nil, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
166+
}
167+
168+
var validMetrics []*dto.Metric
169+
// Iterate over all metrics in the family.
170+
for _, m := range loraRequests.GetMetric() {
171+
var running, waiting string
172+
// Read the label values for running and waiting adapters.
173+
for _, lp := range m.GetLabel() {
174+
switch lp.GetName() {
175+
case LoraRequestInfoRunningAdaptersMetricName:
176+
running = lp.GetValue()
177+
case LoraRequestInfoWaitingAdaptersMetricName:
178+
waiting = lp.GetValue()
179+
}
180+
}
181+
// Ignore metrics with both labels empty.
182+
if running == "" && waiting == "" {
183+
continue
184+
}
185+
validMetrics = append(validMetrics, m)
186+
}
187+
188+
if len(validMetrics) == 0 {
189+
return nil, fmt.Errorf("no valid metric found")
190+
}
191+
192+
// Sort validMetrics in descending order by their gauge value (interpreted as creation timestamp).
193+
sort.Slice(validMetrics, func(i, j int) bool {
194+
return validMetrics[i].GetGauge().GetValue() > validMetrics[j].GetGauge().GetValue()
195+
})
196+
197+
// We return all valid metrics so the caller can pick adapter names in order,
198+
// limiting to 5 unique adapter names across the metrics.
199+
return validMetrics, nil
200+
}
201+
202+
// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
203+
// Since vllm doesn't set the timestamp in metric, this metric essentially gets the first metric.
204+
func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) {
205+
mf, ok := metricFamilies[metricName]
206+
if !ok {
207+
klog.V(logutil.DEFAULT).ErrorS(nil, "Metric family not found", "name", metricName)
208+
return nil, fmt.Errorf("metric family %q not found", metricName)
209+
}
210+
if len(mf.GetMetric()) == 0 {
211+
return nil, fmt.Errorf("no metrics available for %q", metricName)
212+
}
213+
var latestTs int64
214+
var latest *dto.Metric
215+
for _, m := range mf.GetMetric() {
216+
if m.GetTimestampMs() >= latestTs {
217+
latestTs = m.GetTimestampMs()
218+
latest = m
219+
}
220+
}
221+
klog.V(logutil.TRACE).InfoS("Metric value selected", "value", latest, "metric", metricName)
222+
return latest, nil
223+
}

pkg/epp/backend/vllm/metrics.go

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
const (
3838
LoraRequestInfoMetricName = "vllm:lora_requests_info"
3939
LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
40+
LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
4041
LoraRequestInfoMaxAdaptersMetricName = "max_lora"
4142
// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
4243
RunningQueueSizeMetricName = "vllm:num_requests_running"
@@ -136,6 +137,14 @@ func promToPodMetrics(
136137
}
137138
}
138139
}
140+
if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
141+
if label.GetValue() != "" {
142+
adapterList := strings.Split(label.GetValue(), ",")
143+
for _, adapter := range adapterList {
144+
updated.ActiveModels[adapter] = 0
145+
}
146+
}
147+
}
139148
if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
140149
if label.GetValue() != "" {
141150
updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
@@ -161,14 +170,40 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr
161170
logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
162171
return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
163172
}
164-
var latestTs float64
173+
165174
var latest *dto.Metric
175+
var latestTs float64
176+
177+
// Iterate over all metrics in the family.
166178
for _, m := range loraRequests.GetMetric() {
179+
var running, waiting string
180+
// Read the label values for running and waiting adapters.
181+
for _, lp := range m.GetLabel() {
182+
switch lp.GetName() {
183+
case LoraRequestInfoRunningAdaptersMetricName:
184+
running = lp.GetValue()
185+
case LoraRequestInfoWaitingAdaptersMetricName:
186+
waiting = lp.GetValue()
187+
}
188+
}
189+
190+
// Ignore metrics with both labels empty.
191+
if running == "" && waiting == "" {
192+
// continue
193+
}
194+
195+
// Select the metric with the latest creation timestamp.
167196
if m.GetGauge().GetValue() > latestTs {
168197
latestTs = m.GetGauge().GetValue()
169198
latest = m
170199
}
171200
}
201+
202+
if latest == nil {
203+
return nil, time.Time{}, fmt.Errorf("no valid metric found")
204+
}
205+
206+
// Convert the gauge value (creation timestamp) to time.Time.
172207
return latest, time.Unix(0, int64(latestTs*1000)), nil
173208
}
174209

0 commit comments

Comments
 (0)