From 214905d21726fd518ae69e42407662a8d7a9f3e2 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Tue, 4 Mar 2025 23:34:34 +0000
Subject: [PATCH 01/19] start adding metrics changes for triton support

---
 cmd/epp/main.go                             |   5 +
 config/manifests/triton/deployment.yaml     | 100 ++++++++
 config/manifests/triton/ext_proc.yaml       | 115 +++++++++
 config/manifests/triton/inferencemodel.yaml |   9 +
 config/manifests/triton/triton-set-up.yaml  | 111 ++++++++
 pkg/epp/backend/provider.go                 | 183 +++++++++++++
 pkg/epp/backend/triton/metrics.go           | 270 ++++++++++++++++++++
 pkg/epp/backend/triton/metrics_test.go      | 241 +++++++++++++++++
 8 files changed, 1034 insertions(+)
 create mode 100644 config/manifests/triton/deployment.yaml
 create mode 100644 config/manifests/triton/ext_proc.yaml
 create mode 100644 config/manifests/triton/inferencemodel.yaml
 create mode 100644 config/manifests/triton/triton-set-up.yaml
 create mode 100644 pkg/epp/backend/provider.go
 create mode 100644 pkg/epp/backend/triton/metrics.go
 create mode 100644 pkg/epp/backend/triton/metrics_test.go

diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index e1cd50154..4eaa90c8f 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -37,7 +37,9 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/manager"
 	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
 	"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/triton"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
@@ -146,6 +148,9 @@ func run() error {
 	pmf := backendmetrics.NewPodMetricsFactory(&vllm.PodMetricsClientImpl{}, *refreshMetricsInterval)
 	// Setup runner.
 	datastore := datastore.NewDatastore(ctx, pmf)
+	// switch case across different model server metrics (triton, vllm)
+	provider := backend.NewProvider(&triton.PodMetricsClientImpl{}, datastore)
+	//
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
 		DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace,
diff --git a/config/manifests/triton/deployment.yaml b/config/manifests/triton/deployment.yaml
new file mode 100644
index 000000000..61626293b
--- /dev/null
+++ b/config/manifests/triton/deployment.yaml
@@ -0,0 +1,100 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-triton-deployment
+spec:
+  replicas: 1  # Start with 1 replica.  Adjust as needed.
+  selector:
+    matchLabels:
+      app: llama-triton  # This MUST match the labels in the template
+  template:
+    metadata:
+      labels:
+        app: llama-triton
+    spec:
+      containers:
+      - name: triton-server
+        image: nvcr.io/nvidia/tritonserver:25.01-trtllm-python-py3  # Use base Triton image
+        imagePullPolicy: IfNotPresent
+        command: ["/bin/bash", "-c"]
+        args:
+          - |
+            set -e
+            apt-get update && apt-get install -y python3.12-venv
+
+            # Create and activate a virtual environment
+            python3 -m venv /opt/venv
+            source /opt/venv/bin/activate
+            pip install SentencePiece
+            pip install packaging
+            pip install numpy
+            pip install torch
+            pip install requests
+            pip install transformers
+            pip install pillow
+            
+            # Use launch_triton_server.py
+            # python3 /models/tensorrtllm_backend/scripts/launch_triton_server.py --world_size 1 --model_repo /models/tensorrtllm_backend/llama_ifb
+            # tail -f /dev/null
+
+            # Launch OpenAI completions endpoint
+            # Install python bindings for tritonserver and tritonfrontend
+            pip install /opt/tritonserver/python/triton*.whl
+            # Install application requirements
+            git clone https://github.com/triton-inference-server/server.git
+            cd server/python/openai/
+            pip install -r requirements.txt
+            pip install uvicorn
+            pip install -U huggingface_hub
+            huggingface-cli login --token $(cat /secrets/huggingface/token) --add-to-git-credential
+          
+            python3 openai_frontend/main.py --model-repository /models/tensorrtllm_backend/llama_ifb --tokenizer meta-llama/Llama-2-7b-chat-hf
+        ports:
+        - containerPort: 9000
+          name: http
+        - containerPort: 9001
+          name: grpc
+        - containerPort: 9002
+          name: metrics
+        volumeMounts:
+        - mountPath: /models
+          name: model-volume
+        - mountPath: /secrets/huggingface
+          name: huggingface-secret
+          readOnly: true
+        resources:
+          limits:
+            ephemeral-storage: 40Gi
+            nvidia.com/gpu: 1
+            memory: 40Gi
+          requests:
+            ephemeral-storage: 40Gi
+            memory: 40Gi
+            nvidia.com/gpu: 1
+      volumes:
+      - name: model-volume
+        persistentVolumeClaim:
+          claimName: llama-model-pvc
+      - name: huggingface-secret
+        secret:
+          secretName: hf-token
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama-triton-service
+spec:
+  type: ClusterIP
+  ports:
+    - port: 9000
+      targetPort: http
+      name: http-inference-server
+    - port: 9001
+      targetPort: grpc
+      name: grpc-inference-server
+    - port: 9002
+      targetPort: metrics
+      name: http-metrics
+  selector:
+    app: llama-triton
diff --git a/config/manifests/triton/ext_proc.yaml b/config/manifests/triton/ext_proc.yaml
new file mode 100644
index 000000000..a794bdb2d
--- /dev/null
+++ b/config/manifests/triton/ext_proc.yaml
@@ -0,0 +1,115 @@
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read
+rules:
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencemodels"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencepools"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "watch", "list"]
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
+--- 
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read-binding
+subjects:
+- kind: ServiceAccount
+  name: default
+  namespace: default
+roleRef:
+  kind: ClusterRole
+  name: pod-read
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferencePool
+metadata:
+  labels:
+  name: triton-llama2-7b-pool
+spec:
+  targetPortNumber: 9000
+  selector:
+    app: llama-triton
+  extensionRef:
+    name: inference-gateway-ext-proc
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: inference-gateway-ext-proc
+  namespace: default
+  labels:
+    app: inference-gateway-ext-proc
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: inference-gateway-ext-proc
+  template:
+    metadata:
+      labels:
+        app: inference-gateway-ext-proc
+    spec:
+      containers:
+      - name: inference-gateway-ext-proc
+        image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/triton-test/epp_triton_metrics:latest
+        imagePullPolicy: Always
+        args:
+        - -poolName
+        - "triton-llama2-7b-pool"
+        - -v
+        - "3"
+        - -grpcPort
+        - "9002"
+        - -grpcHealthPort
+        - "9003"
+        ports:
+        - containerPort: 9002
+        - containerPort: 9003
+        - name: metrics
+          containerPort: 9090
+        livenessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        readinessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: inference-gateway-ext-proc
+  namespace: default
+spec:
+  selector:
+    app: inference-gateway-ext-proc
+  ports:
+    - protocol: TCP
+      port: 9002
+      targetPort: 9002
+  type: ClusterIP
diff --git a/config/manifests/triton/inferencemodel.yaml b/config/manifests/triton/inferencemodel.yaml
new file mode 100644
index 000000000..db643a85c
--- /dev/null
+++ b/config/manifests/triton/inferencemodel.yaml
@@ -0,0 +1,9 @@
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: triton-llama2-7b-model
+spec:
+  modelName: ensemble
+  criticality: Standard
+  poolRef:
+    name: triton-llama2-7b-pool
diff --git a/config/manifests/triton/triton-set-up.yaml b/config/manifests/triton/triton-set-up.yaml
new file mode 100644
index 000000000..08fa0852c
--- /dev/null
+++ b/config/manifests/triton/triton-set-up.yaml
@@ -0,0 +1,111 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llama-model-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 200Gi
+
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: llama-build-job
+spec:
+  backoffLimit: 0
+  template:
+    metadata:
+      labels:
+        app: llama-triton
+    spec:
+      containers:
+      - name: llama-builder
+        image: nvcr.io/nvidia/tritonserver:25.02-trtllm-python-py3 # Use the base Triton image directly
+        command: ["/bin/bash", "-c"]
+        args:
+          - |
+            set -e  # Exit on error
+
+            apt-get update && apt-get install -y python3.12-venv
+
+            # Create and activate a virtual environment
+            python3 -m venv /opt/venv
+            source /opt/venv/bin/activate
+
+            # Install git (it might not be in the base image)
+            apt-get update && apt-get install -y --no-install-recommends git
+
+            # Clone the tensorrt_llm_backend repository and set up submodule
+            git clone -b triton-llm/v0.17.0 https://github.com/triton-inference-server/tensorrtllm_backend.git /models/tensorrtllm_backend
+            cd /models/tensorrtllm_backend
+            git lfs install
+            git submodule update --init --recursive
+
+            # --- Hugging Face Setup ---
+            # 1. Install the Hugging Face CLI
+            pip install -U huggingface_hub
+            pip install transformers
+            pip install --extra-index-url https://pypi.nvidia.com/ tensorrt-llm
+            pip install tensorrt_llm
+
+            # 2. Log in using the token from the secret
+            #    The secret is mounted as a file.
+            huggingface-cli login --token $(cat /secrets/huggingface/token) --add-to-git-credential
+            huggingface-cli download meta-llama/Llama-2-7b-hf --local-dir /models/hf_models/
+
+            # Download and convert the Hugging Face model.  Modify parameters as needed.
+            export HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('meta-llama/Llama-2-7b-hf', filename='config.json', local_dir='/models/hf_models/')).parent)"`
+            echo PATH TO LLAMA MODEL: $HF_LLAMA_MODEL
+            export UNIFIED_CKPT_PATH=/models/tmp/ckpt/llama/7b/
+            export ENGINE_PATH=/models/tmp/engines/llama/7b/
+            export TRTLLM_MODEL_REPO=/models/tensorrtllm_backend/llama_ifb
+            python3 /models/tensorrtllm_backend/tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \
+                     --output_dir ${UNIFIED_CKPT_PATH} \
+                     --dtype float16
+
+            # Build the TensorRT-LLM engine.  Adjust parameters (e.g., world_size) as needed.
+            trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
+                          --output_dir ${ENGINE_PATH} \
+                          --gemm_plugin float16 \
+                          --kv_cache_type paged \
+                          --context_fmha enable \
+                          --gpt_attention_plugin float16 \
+                          --remove_input_padding enable \
+                          --max_batch_size 64
+
+            cp /models/tensorrtllm_backend/all_models/inflight_batcher_llm/ ${TRTLLM_MODEL_REPO} -r
+
+            python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
+            python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
+            python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
+            python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
+            python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
+
+
+            echo "Build complete!"
+        volumeMounts:
+        - mountPath: /models
+          name: model-volume
+        - mountPath: /secrets/huggingface
+          name: huggingface-secret
+          readOnly: true
+        resources:
+          limits:
+            ephemeral-storage: 80Gi
+            nvidia.com/gpu: 1
+            memory: 40Gi
+          requests:
+            ephemeral-storage: 80Gi
+            nvidia.com/gpu: 1
+            memory: 40Gi
+      restartPolicy: Never
+      volumes:
+      - name: model-volume
+        persistentVolumeClaim:
+          claimName: llama-model-pvc
+      - name: huggingface-secret
+        secret:
+          secretName: hf-token
diff --git a/pkg/epp/backend/provider.go b/pkg/epp/backend/provider.go
new file mode 100644
index 000000000..959f3e0c9
--- /dev/null
+++ b/pkg/epp/backend/provider.go
@@ -0,0 +1,183 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package backend
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/go-logr/logr"
+	"go.uber.org/multierr"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+const (
+	fetchMetricsTimeout = 5 * time.Second // deadline shared by all pod fetches of one refreshMetricsOnce pass
+)
+
+// NewProvider returns a Provider that fetches pod metrics through pmc and
+// uses datastore to look up the pool and pods and to store refreshed metrics.
+func NewProvider(pmc PodMetricsClient, datastore datastore.Datastore) *Provider {
+	return &Provider{pmc: pmc, datastore: datastore}
+}
+
+// Provider provides backend pods and information such as metrics.
+type Provider struct {
+	pmc       PodMetricsClient    // fetches the metrics of a single pod
+	datastore datastore.Datastore // holds the pool, the pods and their metrics
+}
+
+// PodMetricsClient fetches the metrics of one pod. FetchMetrics is given the
+// pod's existing metrics and a port, and returns the refreshed metrics or an error.
+type PodMetricsClient interface {
+	FetchMetrics(ctx context.Context, existing *datastore.PodMetrics, port int32) (*datastore.PodMetrics, error)
+}
+
+// Init starts the background loops that keep metrics fresh and returns nil immediately:
+//
+//   - every refreshMetricsInterval: refresh the metrics of all pods;
+//   - every refreshPrometheusMetricsInterval: flush the pool-level Prometheus metrics;
+//   - every 5 seconds, only when DEBUG logging is enabled: log all pods and metrics.
+//
+// Every loop logs a shutdown message and exits once ctx is cancelled, without first
+// waiting for its current interval to elapse.
+func (p *Provider) Init(ctx context.Context, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error {
+	logger := log.FromContext(ctx)
+
+	// runEvery calls tick after each interval until ctx is cancelled. Waiting on a
+	// timer inside the select (rather than time.Sleep in a default branch) lets
+	// cancellation interrupt the wait instead of being noticed one interval late.
+	runEvery := func(l logr.Logger, interval time.Duration, stopMsg string, tick func()) {
+		timer := time.NewTimer(interval)
+		defer timer.Stop()
+		for {
+			select {
+			case <-ctx.Done():
+				l.V(logutil.DEFAULT).Info(stopMsg)
+				return
+			case <-timer.C:
+				tick()
+				timer.Reset(interval) // safe: the timer fired and its channel was drained
+			}
+		}
+	}
+
+	// Periodically refresh metrics.
+	go runEvery(logger, refreshMetricsInterval, "Shutting down metrics prober", func() {
+		if err := p.refreshMetricsOnce(logger); err != nil {
+			logger.V(logutil.DEFAULT).Error(err, "Failed to refresh metrics")
+		}
+	})
+
+	// Periodically flush prometheus metrics for inference pool.
+	go runEvery(logger, refreshPrometheusMetricsInterval, "Shutting down prometheus metrics thread", func() {
+		p.flushPrometheusMetricsOnce(logger)
+	})
+
+	// Periodically print out the pods and metrics for DEBUGGING.
+	if debugLogger := logger.V(logutil.DEBUG); debugLogger.Enabled() {
+		go runEvery(debugLogger, 5*time.Second, "Shutting down metrics logger thread", func() {
+			debugLogger.Info("Current Pods and metrics gathered", "metrics", p.datastore.PodGetAll())
+		})
+	}
+
+	return nil
+}
+
+// refreshMetricsOnce fetches the metrics of every pod in the datastore concurrently and stores
+// each result; it returns the combined fetch errors, or nil (also when no pool is set).
+func (p *Provider) refreshMetricsOnce(logger logr.Logger) error {
+	loggerTrace := logger.V(logutil.TRACE)
+	pool, _ := p.datastore.PoolGet()
+	if pool == nil {
+		loggerTrace.Info("No inference pool or not initialized")
+		return nil
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout)
+	defer cancel()
+	start := time.Now()
+	defer func() {
+		d := time.Since(start)
+		// TODO: add a metric instead of logging
+		loggerTrace.Info("Metrics refreshed", "duration", d)
+	}()
+
+	var wg sync.WaitGroup
+	errCh := make(chan error)
+	processOnePod := func(key, value any) bool {
+		loggerTrace.Info("Pod and metric being processed", "pod", key, "metric", value)
+		existing := value.(*datastore.PodMetrics)
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			updated, err := p.pmc.FetchMetrics(ctx, existing, pool.Spec.TargetPortNumber)
+			if err != nil {
+				errCh <- fmt.Errorf("failed to parse metrics from %s: %v", existing.NamespacedName, err)
+				return
+			}
+			p.datastore.PodUpdateMetricsIfExist(updated.NamespacedName, &updated.Metrics)
+			loggerTrace.Info("Updated metrics for pod", "pod", updated.NamespacedName, "metrics", updated.Metrics)
+		}()
+		return true
+	}
+	p.datastore.PodRange(processOnePod)
+
+	// Close errCh from a separate goroutine once every fetch has finished, so that the
+	// collection loop below can start draining right away. A sized, buffered channel is
+	// not an option because the number of pods in the podMetrics sync.Map is unknown beforehand.
+	go func() {
+		wg.Wait()
+		close(errCh)
+	}()
+
+	var errs error
+	for err := range errCh {
+		errs = multierr.Append(errs, err)
+	}
+	return errs
+}
+
+// flushPrometheusMetricsOnce records the pool's average KV-cache usage and average
+// waiting-queue size over all pods. It does nothing when there is no pool or no pod.
+func (p *Provider) flushPrometheusMetricsOnce(logger logr.Logger) {
+	pool, _ := p.datastore.PoolGet()
+	if pool == nil {
+		return // No inference pool, or not initialized yet.
+	}
+
+	podMetrics := p.datastore.PodGetAll()
+	logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics))
+	if len(podMetrics) == 0 {
+		return
+	}
+
+	var kvCacheTotal float64
+	var queueTotal int
+	for _, pod := range podMetrics {
+		kvCacheTotal += pod.KVCacheUsagePercent
+		queueTotal += pod.WaitingQueueSize
+	}
+	podTotalCount := float64(len(podMetrics))
+	metrics.RecordInferencePoolAvgKVCache(pool.Name, kvCacheTotal/podTotalCount)
+	// Divide as floats: integer division would truncate the average queue size.
+	metrics.RecordInferencePoolAvgQueueSize(pool.Name, float64(queueTotal)/podTotalCount)
+}
diff --git a/pkg/epp/backend/triton/metrics.go b/pkg/epp/backend/triton/metrics.go
new file mode 100644
index 000000000..2f8d24bd9
--- /dev/null
+++ b/pkg/epp/backend/triton/metrics.go
@@ -0,0 +1,270 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package triton
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"strconv"
+	"strings"
+
+	"github.com/go-logr/logr"
+	dto "github.com/prometheus/client_model/go"
+	"github.com/prometheus/common/expfmt"
+	"go.uber.org/multierr"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+const (
+	// Triton metrics, see https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/metrics.html
+
+	TRTLLMRequestMetricsName  = "nv_trt_llm_request_metrics"
+	TRTLLMKvCacheMetricsName  = "nv_trt_llm_kv_cache_block_metrics"
+	TRTLLMKvCacheMetricsLabel = "kv_cache_block_type"
+	TRTLLMRequestMetricsLabel = "request_type"
+
+	// The constants below are currently unused; they are examples of further Triton metrics.
+	inferenceCountMetricName           = "nv_inference_count"
+	inferenceSuccessMetricName         = "nv_inference_request_success"
+	inferenceExecCountMetricName       = "nv_inference_exec_count"
+	inferenceRequestDurationMetricName = "nv_inference_request_duration_us"
+	waitingQueueSizeMetricName         = "nv_inference_pending_request_count"
+	queueDurationMetricName            = "nv_inference_queue_duration_us"
+	computeInputDurationMetricName     = "nv_inference_compute_input_duration_us"
+	computeInferDurationMetricName     = "nv_inference_compute_infer_duration_us"
+	computeOutputDurationMetricName    = "nv_inference_compute_output_duration_us"
+	gpuUtilizationMetricName           = "nv_gpu_utilization"
+	gpuMemoryTotalMetricName           = "nv_gpu_memory_total_bytes"
+	gpuMemoryUsedMetricName            = "nv_gpu_memory_used_bytes"
+	gpuPowerUsageMetricName            = "nv_gpu_power_usage"
+	gpuPowerLimitMetricName            = "nv_gpu_power_limit"
+	gpuMemoryTotalBytesMetricName      = "nv_gpu_memory_total_bytes"
+	gpuMemoryUsedBytesMetricName       = "nv_gpu_memory_used_bytes"
+)
+
+// PodMetricsClientImpl fetches pod metrics from a Triton server's metrics endpoint.
+type PodMetricsClientImpl struct{}
+
+// FetchMetrics fetches metrics from a given pod by scraping its metrics endpoint over HTTP.
+// Request, transport, status and parse failures return a nil result and a descriptive error.
+func (p *PodMetricsClientImpl) FetchMetrics(
+	ctx context.Context,
+	existing *datastore.PodMetrics,
+) (*datastore.PodMetrics, error) {
+	logger := log.FromContext(ctx)
+	loggerDefault := logger.V(logutil.DEFAULT)
+
+	// NOTE: Triton serves metrics on a different port than the inference target port.
+	url := existing.BuildScrapeEndpoint()
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName)
+		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err)
+	}
+	defer func() {
+		_ = resp.Body.Close()
+	}()
+
+	if resp.StatusCode != http.StatusOK {
+		loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode)
+		return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode)
+	}
+
+	parser := expfmt.TextParser{}
+	metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse metrics from %s: %w", existing.NamespacedName, err)
+	}
+	return promToPodMetrics(logger, metricFamilies, existing)
+}
+
+// promToPodMetrics updates internal pod metrics with scraped Prometheus metrics.
+//
+// It works on a clone of existing and fills it from two gauge families:
+//   - TRTLLMRequestMetricsName: "active" sets RunningQueueSize and "scheduled"
+//     sets WaitingQueueSize;
+//   - TRTLLMKvCacheMetricsName: "used"/"max" sets KVCacheUsagePercent (0 when max
+//     is not positive) and "tokens_per"*"max" sets KvCacheMaxTokenCapacity.
+//
+// Failed lookups are appended to the returned error while the other fields are
+// still updated, so the clone may be partially refreshed when the error is non-nil.
+func promToPodMetrics(
+	logger logr.Logger,
+	metricFamilies map[string]*dto.MetricFamily,
+	existing *datastore.PodMetrics,
+) (*datastore.PodMetrics, error) {
+	var errs error
+	updated := existing.Clone()
+
+	// Get the "nv_trt_llm_request_metrics" metric family
+	requestMetrics, err := getLatestMetric(logger, metricFamilies, TRTLLMRequestMetricsName)
+	errs = multierr.Append(errs, err)
+	if err == nil {
+		if active, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "active"); err == nil {
+			updated.Metrics.RunningQueueSize = int(active)
+		} else {
+			errs = multierr.Append(errs, err)
+		}
+		if scheduled, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "scheduled"); err == nil {
+			updated.Metrics.WaitingQueueSize = int(scheduled)
+		} else {
+			errs = multierr.Append(errs, err)
+		}
+	}
+
+	// Get the "nv_trt_llm_kv_cache_block_metrics" metric family
+	kvCacheBlocks, err := getLatestMetric(logger, metricFamilies, TRTLLMKvCacheMetricsName)
+	errs = multierr.Append(errs, err)
+	if err == nil {
+		// Both derived values need "max", so nothing is updated without it.
+		if maxBlocks, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "max"); err == nil {
+			if used, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "used"); err == nil {
+				usage := 0.0
+				if maxBlocks > 0 {
+					usage = used / maxBlocks
+				}
+				updated.Metrics.KVCacheUsagePercent = usage
+			} else {
+				errs = multierr.Append(errs, err)
+			}
+			if tokensPer, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "tokens_per"); err == nil {
+				updated.Metrics.KvCacheMaxTokenCapacity = int(tokensPer * maxBlocks)
+			} else {
+				errs = multierr.Append(errs, err)
+			}
+		} else {
+			errs = multierr.Append(errs, err)
+		}
+	}
+
+	logger.V(logutil.TRACE).Info("Pod metrics updated from scrape", "updated", updated, "errors", errs)
+	return updated, errs
+}
+
+// getLatestMetric returns the named family keeping, per distinct label set, only the sample with
+// the latest timestamp (last wins on ties), so that every labelled series of the family survives.
+func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.MetricFamily, error) {
+	mf, ok := metricFamilies[metricName]
+	if !ok {
+		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", metricName)
+		return nil, fmt.Errorf("metric family %q not found", metricName)
+	}
+	if len(mf.GetMetric()) == 0 {
+		return nil, fmt.Errorf("no metrics available for %q", metricName)
+	}
+	latestMf := &dto.MetricFamily{Name: mf.Name, Help: mf.Help, Type: mf.Type}
+	index := make(map[string]int, len(mf.GetMetric())) // label set -> position in latestMf.Metric
+	for _, m := range mf.GetMetric() {
+		var key strings.Builder
+		for _, label := range m.GetLabel() {
+			fmt.Fprintf(&key, "%q=%q,", label.GetName(), label.GetValue())
+		}
+		if i, seen := index[key.String()]; !seen {
+			index[key.String()] = len(latestMf.Metric)
+			latestMf.Metric = append(latestMf.Metric, m)
+		} else if m.GetTimestampMs() >= latestMf.Metric[i].GetTimestampMs() {
+			latestMf.Metric[i] = m
+		}
+	}
+	logger.V(logutil.TRACE).Info("Metric value selected", "metric Family", latestMf, "metric", metricName)
+	return latestMf, nil
+}
+
+// getGaugeMetricForPod returns the gauge value of the first metric in mf whose "pod" or "gpu_uuid" label value contains podIdentifier, else -1 and an error.
+func getGaugeMetricForPod(logger logr.Logger, mf *dto.MetricFamily, podIdentifier string) (float64, error) {
+	for _, sample := range mf.GetMetric() {
+		for _, lbl := range sample.GetLabel() {
+			if n := lbl.GetName(); (n != "pod" && n != "gpu_uuid") || !strings.Contains(lbl.GetValue(), podIdentifier) {
+				continue
+			}
+			logger.V(logutil.TRACE).Info("Pod metric found", "value", sample.GetGauge().GetValue(), "labelName", lbl.GetName(), "labelValue", lbl.GetValue())
+			return sample.GetGauge().GetValue(), nil
+		}
+	}
+	logger.V(logutil.TRACE).Info("Metric Value not found for pod", "pod", podIdentifier, "metric family", mf.GetName())
+	return -1, fmt.Errorf("metric value not found for pod %s in metric family %s", podIdentifier, mf.GetName())
+}
+
+// getCounterMetricForPod returns, as an int, the counter value of the first metric in mf
+// labelled pod=podName; it returns -1 with a nil error when no metric has that label.
+func getCounterMetricForPod(logger logr.Logger, mf *dto.MetricFamily, podName string) (int, error) {
+	for _, sample := range mf.GetMetric() {
+		for _, lbl := range sample.GetLabel() {
+			if lbl.GetName() != "pod" || lbl.GetValue() != podName {
+				continue
+			}
+			count, err := strconv.Atoi(fmt.Sprintf("%v", sample.GetCounter().GetValue()))
+			if err != nil {
+				return -1, fmt.Errorf("failed to convert counter metric to int: %w", err)
+			}
+			logger.V(logutil.TRACE).Info("Pod metric found", "value", count)
+			return count, nil
+		}
+	}
+	return -1, nil
+}
+
+// TRTLLM metrics
+
+// getTrtLlmMetric gets a TRT LLM metric with the specified type, key, and value.
+//
+// It returns the value of the first metric in mf that has the label key=value, read
+// as a gauge or a counter according to metricType. It returns -1 and an error when
+// mf is not of type metricType, when metricType is neither gauge nor counter, or
+// when no metric has the label.
+func getTrtLlmMetric(logger logr.Logger, mf *dto.MetricFamily, metricType dto.MetricType, key, value string) (float64, error) {
+	// The type belongs to the family, not to a label, so check it once up front.
+	if mf.GetType() != metricType {
+		return -1, fmt.Errorf("TRT LLM metric not found: %s{ %s=\"%s\" }", mf.GetName(), key, value)
+	}
+	for _, m := range mf.GetMetric() {
+		for _, label := range m.GetLabel() {
+			if label.GetName() != key || label.GetValue() != value {
+				continue
+			}
+			// Log the label under its own keys so it cannot clash with the sample's "value".
+			switch metricType {
+			case dto.MetricType_GAUGE:
+				logger.V(logutil.TRACE).Info("TRT LLM gauge metric found", "value", m.GetGauge().GetValue(), "labelName", key, "labelValue", value)
+				return m.GetGauge().GetValue(), nil
+			case dto.MetricType_COUNTER:
+				// Counter values are already float64; return them without an integer round trip.
+				logger.V(logutil.TRACE).Info("TRT LLM counter metric found", "value", m.GetCounter().GetValue(), "labelName", key, "labelValue", value)
+				return m.GetCounter().GetValue(), nil
+			}
+		}
+	}
+	return -1, fmt.Errorf("TRT LLM metric not found: %s{ %s=\"%s\" }", mf.GetName(), key, value)
+}
+
+// getTrtLlmGaugeMetric calls getTrtLlmMetric with metricType fixed to dto.MetricType_GAUGE.
+func getTrtLlmGaugeMetric(logger logr.Logger, mf *dto.MetricFamily, key, value string) (float64, error) {
+	return getTrtLlmMetric(logger, mf, dto.MetricType_GAUGE, key, value)
+}
+
+// getTrtLlmCounterMetric calls getTrtLlmMetric with metricType fixed to dto.MetricType_COUNTER.
+func getTrtLlmCounterMetric(logger logr.Logger, mf *dto.MetricFamily, key, value string) (float64, error) {
+	return getTrtLlmMetric(logger, mf, dto.MetricType_COUNTER, key, value)
+}
diff --git a/pkg/epp/backend/triton/metrics_test.go b/pkg/epp/backend/triton/metrics_test.go
new file mode 100644
index 000000000..f9b960a52
--- /dev/null
+++ b/pkg/epp/backend/triton/metrics_test.go
@@ -0,0 +1,241 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package triton
+
+import (
+	"testing"
+
+	dto "github.com/prometheus/client_model/go"
+	"github.com/stretchr/testify/assert"
+	"google.golang.org/protobuf/proto"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+func TestPromToPodMetrics(t *testing.T) {
+	logger := logutil.NewTestLogger()
+
+	podName := "test-pod"
+	podAddress := "10.0.0.1"
+	// Every case starts from zero-valued Metrics and checks the result of one promToPodMetrics call.
+	testCases := []struct {
+		name              string
+		metricFamilies    map[string]*dto.MetricFamily
+		expectedMetrics   *datastore.PodMetrics
+		expectedErr       bool
+		initialPodMetrics *datastore.PodMetrics
+	}{
+		{
+			name:           "all metrics available",
+			metricFamilies: allMetricsAvailable(podName),
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					NamespacedName: types.NamespacedName{Name: podName},
+					Address:        podAddress,
+					ScrapePort:     9000,
+					ScrapePath:     "/metrics",
+				},
+				Metrics: datastore.Metrics{
+					RunningQueueSize:        1,
+					WaitingQueueSize:        2,
+					KVCacheUsagePercent:     0.5,  // used / max = 50 / 100
+					KvCacheMaxTokenCapacity: 5000, // max_blocks * tokens_per_block = 100 * 50
+				},
+			},
+			initialPodMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					NamespacedName: types.NamespacedName{Name: podName},
+					Address:        podAddress,
+					ScrapePort:     9000,
+					ScrapePath:     "/metrics",
+				},
+				Metrics: datastore.Metrics{},
+			},
+			expectedErr: false,
+		},
+		{
+			name:           "missing metrics",
+			metricFamilies: map[string]*dto.MetricFamily{}, // No metrics provided
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					NamespacedName: types.NamespacedName{Name: podName},
+					Address:        podAddress,
+					ScrapePort:     9000,
+					ScrapePath:     "/metrics",
+				},
+				Metrics: datastore.Metrics{
+					RunningQueueSize:        0, // Default int value
+					WaitingQueueSize:        0, // Default int value
+					KVCacheUsagePercent:     0, // Default float64 value
+					KvCacheMaxTokenCapacity: 0, // Default int value
+				},
+			},
+			initialPodMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					NamespacedName: types.NamespacedName{Name: podName},
+					Address:        podAddress,
+					ScrapePort:     9000,
+					ScrapePath:     "/metrics",
+				},
+				Metrics: datastore.Metrics{},
+			},
+			expectedErr: false,
+		},
+		{
+			name:           "multiple timestamps",
+			metricFamilies: multipleMetricsWithDifferentTimestamps(podName),
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					NamespacedName: types.NamespacedName{Name: podName},
+					Address:        podAddress,
+					ScrapePort:     9000,
+					ScrapePath:     "/metrics",
+				},
+				Metrics: datastore.Metrics{
+					RunningQueueSize:        1,    // from latest
+					WaitingQueueSize:        2,    // from latest
+					KVCacheUsagePercent:     0.5,  // used / max = 50 / 100  (from latest)
+					KvCacheMaxTokenCapacity: 5000, // max_blocks * tokens_per_block = 100 * 50 (from latest)
+				},
+			},
+			initialPodMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					NamespacedName: types.NamespacedName{Name: podName},
+					Address:        podAddress,
+					ScrapePort:     9000,
+					ScrapePath:     "/metrics",
+				},
+				Metrics: datastore.Metrics{},
+			},
+			expectedErr: false,
+		},
+		{
+			name: "empty metric family",
+			metricFamilies: map[string]*dto.MetricFamily{
+				TRTLLMRequestMetricsName: {
+					Name:   proto.String(TRTLLMRequestMetricsName),
+					Type:   dto.MetricType_GAUGE.Enum(),
+					Metric: []*dto.Metric{}, // Empty
+				},
+			},
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					NamespacedName: types.NamespacedName{Name: podName},
+					Address:        podAddress,
+					ScrapePort:     9000,
+					ScrapePath:     "/metrics",
+				},
+				Metrics: datastore.Metrics{},
+			},
+			initialPodMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					NamespacedName: types.NamespacedName{Name: podName},
+					Address:        podAddress,
+					ScrapePort:     9000,
+					ScrapePath:     "/metrics",
+				},
+				Metrics: datastore.Metrics{},
+			},
+			expectedErr: false,
+		},
+	}
+	// expectedMetrics is only compared when the case does not expect an error.
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			updated, err := promToPodMetrics(logger, tc.metricFamilies, tc.initialPodMetrics)
+			if tc.expectedErr {
+				assert.Error(t, err)
+			} else {
+				assert.NoError(t, err)
+				assert.Equal(t, tc.expectedMetrics, updated)
+			}
+		})
+	}
+}
+
+// --- Helper Functions ---
+
+func allMetricsAvailable(podName string) map[string]*dto.MetricFamily {
+	families := make(map[string]*dto.MetricFamily, 2)
+	families[TRTLLMRequestMetricsName] = &dto.MetricFamily{
+		Name: proto.String(TRTLLMRequestMetricsName),
+		Type: dto.MetricType_GAUGE.Enum(),
+		Metric: []*dto.Metric{
+			trtLlmRequestMetric("active", 1, 200),
+			trtLlmRequestMetric("scheduled", 2, 200),
+		},
+	}
+	families[TRTLLMKvCacheMetricsName] = &dto.MetricFamily{
+		Name: proto.String(TRTLLMKvCacheMetricsName),
+		Type: dto.MetricType_GAUGE.Enum(),
+		Metric: []*dto.Metric{
+			trtLlmKvCacheMetric("max", 100, 200),
+			trtLlmKvCacheMetric("used", 50, 200),
+			trtLlmKvCacheMetric("tokens_per", 50, 200),
+		},
+	}
+	return families
+}
+
+func multipleMetricsWithDifferentTimestamps(podName string) map[string]*dto.MetricFamily {
+	// Each family lists an older sample set (timestamp 100) ahead of a newer one (timestamp 200).
+	families := make(map[string]*dto.MetricFamily, 2)
+	families[TRTLLMRequestMetricsName] = &dto.MetricFamily{
+		Name: proto.String(TRTLLMRequestMetricsName),
+		Type: dto.MetricType_GAUGE.Enum(),
+		Metric: []*dto.Metric{
+			trtLlmRequestMetric("active", 0, 100),    // Older
+			trtLlmRequestMetric("scheduled", 3, 100), // Older
+			trtLlmRequestMetric("active", 1, 200),    // Newer
+			trtLlmRequestMetric("scheduled", 2, 200), // Newer
+		},
+	}
+	families[TRTLLMKvCacheMetricsName] = &dto.MetricFamily{
+		Name: proto.String(TRTLLMKvCacheMetricsName),
+		Type: dto.MetricType_GAUGE.Enum(),
+		Metric: []*dto.Metric{
+			trtLlmKvCacheMetric("max", 110, 100),       // Older
+			trtLlmKvCacheMetric("used", 60, 100),       // Older
+			trtLlmKvCacheMetric("tokens_per", 40, 100), // Older
+			trtLlmKvCacheMetric("max", 100, 200),       // Newer
+			trtLlmKvCacheMetric("used", 50, 200),       // Newer
+			trtLlmKvCacheMetric("tokens_per", 50, 200), // Newer
+		},
+	}
+	return families
+}
+
+func trtLlmRequestMetric(requestType string, value float64, timestampMs int64) *dto.Metric {
+	label := &dto.LabelPair{Name: proto.String(TRTLLMRequestMetricsLabel), Value: proto.String(requestType)}
+	// A gauge sample with a single request-type label and an explicit timestamp.
+	return &dto.Metric{
+		Label:       []*dto.LabelPair{label},
+		Gauge:       &dto.Gauge{Value: proto.Float64(value)},
+		TimestampMs: proto.Int64(timestampMs),
+	}
+}
+
+func trtLlmKvCacheMetric(blockType string, value float64, timestampMs int64) *dto.Metric {
+	label := &dto.LabelPair{Name: proto.String(TRTLLMKvCacheMetricsLabel), Value: proto.String(blockType)}
+	// A gauge sample with a single block-type label and an explicit timestamp.
+	return &dto.Metric{
+		Label:       []*dto.LabelPair{label},
+		Gauge:       &dto.Gauge{Value: proto.Float64(value)},
+		TimestampMs: proto.Int64(timestampMs),
+	}
+}

From 612505425565589d7cd30d3e5860e0164e319925 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Thu, 6 Mar 2025 03:33:07 +0000
Subject: [PATCH 02/19] Refactor metrics to work with any prometheus metric
 naming convention based on EPP runtime flags.

---
 cmd/epp/main.go                        |  25 +-
 config/manifests/ext_proc.yaml         |   8 +
 config/manifests/triton/ext_proc.yaml  |   8 +
 pkg/epp/backend/metrics.go             | 321 +++++++++++
 pkg/epp/backend/metrics_spec.go        | 164 ++++++
 pkg/epp/backend/metrics_spec_test.go   | 281 ++++++++++
 pkg/epp/backend/metrics_test.go        | 741 +++++++++++++++++++++++++
 pkg/epp/backend/triton/metrics.go      |  83 ++-
 pkg/epp/backend/triton/metrics_test.go |  37 +-
 pkg/epp/datastore/types.go             |  71 +++
 10 files changed, 1665 insertions(+), 74 deletions(-)
 create mode 100644 pkg/epp/backend/metrics.go
 create mode 100644 pkg/epp/backend/metrics_spec.go
 create mode 100644 pkg/epp/backend/metrics_spec_test.go
 create mode 100644 pkg/epp/backend/metrics_test.go
 create mode 100644 pkg/epp/datastore/types.go

diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index 4eaa90c8f..c5264b823 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -39,7 +39,6 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/triton"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
@@ -94,6 +93,15 @@ var (
 		"certPath", "", "The path to the certificate for secure serving. The certificate and private key files "+
 			"are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+
 			"then a self-signed certificate is used.")
+	// metric flags
+	allRequestsMetric       = flag.String("allRequestsMetric", "", "Prometheus metric for the total number of processing requests, both queued and running.")
+	waitingRequestsMetric   = flag.String("waitingRequestsMetric", "", "Prometheus metric for the number of queued requests.")
+	runningRequestsMetric   = flag.String("runningRequestsMetric", "", "Prometheus metric for the number of running requests.")
+	usedKVCacheBlocksMetric = flag.String("usedKVCacheBlocksMetric", "", "Prometheus metric for the number of utilized KV-cache blocks.")
+	maxKVCacheBlocksMetric  = flag.String("maxKVCacheBlocksMetric", "", "Prometheus metric for the total number of available KV-cache blocks.")
+	kVCacheUsageMetric      = flag.String("kVCacheUsageMetric", "", "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).")
+	// LoRA metrics
+	loraRequestInfoMetric = flag.String("loraRequestInfoMetric", "", "Prometheus metric for the LoRA info metrics (must be in vLLM label format).")
 
 	setupLog = ctrl.Log.WithName("setup")
 )
@@ -149,7 +157,20 @@ func run() error {
 	// Setup runner.
 	datastore := datastore.NewDatastore(ctx, pmf)
 	// switch case across different model server metrics (triton, vllm)
-	provider := backend.NewProvider(&triton.PodMetricsClientImpl{}, datastore)
+	mapping, err := backend.NewMetricMapping(
+		*allRequestsMetric,
+		*waitingRequestsMetric,
+		*runningRequestsMetric,
+		*usedKVCacheBlocksMetric,
+		*maxKVCacheBlocksMetric,
+		*kVCacheUsageMetric,
+		*loraRequestInfoMetric,
+	)
+	if err != nil {
+		setupLog.Error(err, "Failed to create metric mapping from flags")
+		return err
+	}
+	provider := backend.NewProvider(&backend.PodMetricsClientImpl{MetricMapping: mapping}, datastore)
 	//
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
diff --git a/config/manifests/ext_proc.yaml b/config/manifests/ext_proc.yaml
index d70467ee0..33c47d400 100644
--- a/config/manifests/ext_proc.yaml
+++ b/config/manifests/ext_proc.yaml
@@ -82,6 +82,14 @@ spec:
         - "9002"
         - -grpcHealthPort
         - "9003"
+        - -waitingRequestsMetric
+        - "vllm:num_requests_waiting"
+        - -runningRequestsMetric
+        - "vllm:num_requests_running"
+        - -kVCacheUsageMetric
+        - "vllm:gpu_cache_usage_perc"
+        - -loraRequestInfoMetric
+        - "vllm:lora_requests_info"
         env:
         - name: USE_STREAMING
           value: "false"
diff --git a/config/manifests/triton/ext_proc.yaml b/config/manifests/triton/ext_proc.yaml
index a794bdb2d..16c802838 100644
--- a/config/manifests/triton/ext_proc.yaml
+++ b/config/manifests/triton/ext_proc.yaml
@@ -82,6 +82,14 @@ spec:
         - "9002"
         - -grpcHealthPort
         - "9003"
+        - -allRequestsMetric
+        - "nv_trt_llm_request_metrics{request_type=active}"
+        - -runningRequestsMetric
+        - "nv_trt_llm_request_metrics{request_type=scheduled}"
+        - -usedKVCacheBlocksMetric
+        - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=used}"
+        - -maxKVCacheBlocksMetric
+        - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=max}"
         ports:
         - containerPort: 9002
         - containerPort: 9003
diff --git a/pkg/epp/backend/metrics.go b/pkg/epp/backend/metrics.go
new file mode 100644
index 000000000..2f2082652
--- /dev/null
+++ b/pkg/epp/backend/metrics.go
@@ -0,0 +1,321 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package backend
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/go-logr/logr"
+	dto "github.com/prometheus/client_model/go"
+	"github.com/prometheus/common/expfmt"
+	"go.uber.org/multierr"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+const (
+	// Label names read from the LoRA info metric's samples (hardcoded, vLLM-specific).
+	LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
+	LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
+	LoraRequestInfoMaxAdaptersMetricName     = "max_lora"
+)
+
+type PodMetricsClientImpl struct {
+	MetricMapping *MetricMapping // selects which scraped series feed each pod metric; dereferenced without a nil check
+}
+
+// FetchMetrics scrapes the pod's Prometheus endpoint and returns an updated copy of existing.
+func (p *PodMetricsClientImpl) FetchMetrics(
+	ctx context.Context,
+	existing *datastore.PodMetrics,
+	port int32,
+) (*datastore.PodMetrics, error) {
+	logger := log.FromContext(ctx)
+	loggerDefault := logger.V(logutil.DEFAULT)
+	// The scrape path is fixed to /metrics; the address comes from existing, the port from the caller.
+	url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName)
+		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err)
+	}
+	defer func() {
+		_ = resp.Body.Close()
+	}()
+	// Anything other than 200 OK is treated as a failed scrape.
+	if resp.StatusCode != http.StatusOK {
+		loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode)
+		return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode)
+	}
+	// Parse the Prometheus text exposition format into metric families keyed by name.
+	parser := expfmt.TextParser{}
+	metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse metrics from %s: %w", existing.NamespacedName, err)
+	}
+	return p.promToPodMetrics(logger, metricFamilies, existing)
+}
+
+// promToPodMetrics copies existing and overwrites each metric whose configured series was found; lookup errors are accumulated and returned with the copy.
+func (p *PodMetricsClientImpl) promToPodMetrics(
+	logger logr.Logger,
+	metricFamilies map[string]*dto.MetricFamily,
+	existing *datastore.PodMetrics,
+) (*datastore.PodMetrics, error) {
+	var errs error
+	updated := existing.Clone()
+
+	if p.MetricMapping.RunningRequests != nil {
+		running, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.RunningRequests)
+		if err == nil {
+			updated.RunningQueueSize = int(running.GetGauge().GetValue())
+		} else {
+			errs = multierr.Append(errs, err)
+		}
+	}
+	// AllRequests counts queued and running requests, so the queue size is the remainder.
+	if p.MetricMapping.AllRequests != nil {
+		all, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.AllRequests)
+		if err == nil {
+			updated.WaitingQueueSize = int(all.GetGauge().GetValue()) - updated.RunningQueueSize
+		} else {
+			errs = multierr.Append(errs, err)
+		}
+	}
+	// An explicit waiting metric, when configured and found, overrides the derived value above.
+	if p.MetricMapping.WaitingRequests != nil {
+		waiting, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.WaitingRequests)
+		if err == nil {
+			updated.WaitingQueueSize = int(waiting.GetGauge().GetValue())
+		} else {
+			errs = multierr.Append(errs, err)
+		}
+	}
+	// Prefer a directly reported usage ratio; otherwise derive it from used/max block counts.
+	if p.MetricMapping.KVCacheUsage != nil {
+		usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUsage)
+		if err == nil {
+			updated.KVCacheUsagePercent = usage.GetGauge().GetValue()
+		} else {
+			errs = multierr.Append(errs, err)
+		}
+	} else if p.MetricMapping.UsedKVCacheBlocks != nil && p.MetricMapping.MaxKVCacheBlocks != nil {
+		used, usedErr := p.getMetric(logger, metricFamilies, *p.MetricMapping.UsedKVCacheBlocks)
+		if usedErr != nil {
+			errs = multierr.Append(errs, usedErr)
+		}
+		maxBlocks, maxErr := p.getMetric(logger, metricFamilies, *p.MetricMapping.MaxKVCacheBlocks)
+		if maxErr != nil {
+			errs = multierr.Append(errs, maxErr)
+		}
+		if usedErr == nil && maxErr == nil {
+			usage := 0.0
+			if maxBlocks.GetGauge().GetValue() > 0 {
+				usage = used.GetGauge().GetValue() / maxBlocks.GetGauge().GetValue()
+			}
+			updated.KVCacheUsagePercent = usage
+		}
+	}
+
+	// Handle LoRA metrics (only when a LoRA info metric is configured)
+	if p.MetricMapping.LoraRequestInfo != nil {
+		loraMetrics, _, err := p.getLatestLoraMetric(logger, metricFamilies)
+		errs = multierr.Append(errs, err)
+
+		if loraMetrics != nil {
+			updated.ActiveModels = make(map[string]int)
+			for _, label := range loraMetrics.GetLabel() {
+				if label.GetName() == LoraRequestInfoRunningAdaptersMetricName {
+					if label.GetValue() != "" {
+						adapterList := strings.Split(label.GetValue(), ",")
+						for _, adapter := range adapterList {
+							updated.ActiveModels[adapter] = 0
+						}
+					}
+				}
+				if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
+					if label.GetValue() != "" {
+						adapterList := strings.Split(label.GetValue(), ",")
+						for _, adapter := range adapterList {
+							updated.ActiveModels[adapter] = 0
+						}
+					}
+				}
+				if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
+					if label.GetValue() != "" {
+						updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
+						if err != nil {
+							errs = multierr.Append(errs, err)
+						}
+					}
+				}
+			}
+		}
+	}
+
+	return updated, errs
+}
+
+// getLatestLoraMetric returns the most recent series of the configured LoRA info gauge
+// (e.g. `vllm:lora_requests_info`). Each permutation of label values generates a new series
+// whose value is its creation timestamp, so the series with the largest value is the
+// current one. Series whose running and waiting labels are both empty are skipped.
+func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
+	if p.MetricMapping.LoraRequestInfo == nil {
+		return nil, time.Time{}, nil // No LoRA metrics configured
+	}
+
+	loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName]
+	if !ok {
+		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName)
+		return nil, time.Time{}, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName)
+	}
+
+	var latest *dto.Metric
+	var latestTs float64 // float64 because Gauge.Value is; starts at 0, so only positive values can be selected
+
+	// Iterate over all metrics in the family.
+	for _, m := range loraRequests.GetMetric() {
+		running := ""
+		waiting := ""
+		// Track whether the series carries at least one of the running/waiting adapter labels.
+		hasRequiredLabels := false
+		for _, lp := range m.GetLabel() {
+			switch lp.GetName() {
+			case LoraRequestInfoRunningAdaptersMetricName:
+				running = lp.GetValue()
+				hasRequiredLabels = true
+			case LoraRequestInfoWaitingAdaptersMetricName:
+				waiting = lp.GetValue()
+				hasRequiredLabels = true
+			}
+		}
+		// Skip series that have neither LoRA adapter label.
+		if !hasRequiredLabels {
+			continue
+		}
+		// Ignore metrics with both labels empty.
+		if running == "" && waiting == "" {
+			continue
+		}
+
+		// Select the metric with the *largest Gauge Value* (which represents the timestamp).
+		if m.GetGauge().GetValue() > latestTs {
+			latestTs = m.GetGauge().GetValue()
+			latest = m
+		}
+	}
+	if latest == nil {
+		logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", p.MetricMapping.LoraRequestInfo.MetricName)
+		return nil, time.Time{}, nil
+	}
+
+	// Convert the gauge value (creation timestamp) to time.Time.
+	return latest, time.Unix(0, int64(latestTs*1e9)), nil // The value is scaled by 1e9 and passed as nanoseconds, i.e. it is treated as seconds
+}
+
+// getMetric returns the sample described by spec: the family is looked up by name and must
+// contain at least one sample; the choice among its samples is delegated to a helper.
+func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
+	family, found := metricFamilies[spec.MetricName]
+	switch {
+	case !found:
+		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", spec.MetricName)
+		return nil, fmt.Errorf("metric family %q not found", spec.MetricName)
+	case len(family.GetMetric()) == 0:
+		return nil, fmt.Errorf("no metrics available for %q", spec.MetricName)
+	case spec.Labels != nil:
+		// A non-nil label map (even an empty one) selects by label.
+		return getLabeledMetric(logger, family, spec)
+	default:
+		return getLatestMetric(logger, family)
+	}
+}
+
+// getLatestMetric returns the sample of mf with the greatest timestamp; among equal
+// timestamps the last one listed wins. Labels are not considered.
+func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, error) {
+	var newest *dto.Metric
+	newestMs := int64(0)
+	for _, candidate := range mf.GetMetric() {
+		// ">=" rather than ">" so that a later sample replaces an earlier one on a tie.
+		if ts := candidate.GetTimestampMs(); ts >= newestMs {
+			newest, newestMs = candidate, ts
+		}
+	}
+
+	if newest == nil {
+		return nil, fmt.Errorf("no metrics found for %q", mf.GetName())
+	}
+	logger.V(logutil.TRACE).Info("Latest metric value selected", "value", newest, "metric", mf.GetName())
+	return newest, nil
+}
+
+// getLabeledMetric returns the sample of mf whose labels satisfy spec.Labels and whose
+// timestamp is greatest; among equal timestamps the first one listed wins.
+func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
+	var best *dto.Metric
+	bestMs := int64(-1) // sentinel below every non-negative timestamp
+
+	for _, candidate := range mf.GetMetric() {
+		if !labelsMatch(candidate.GetLabel(), spec.Labels) {
+			continue
+		}
+		if ts := candidate.GetTimestampMs(); ts > bestMs {
+			best, bestMs = candidate, ts
+		}
+	}
+
+	if best == nil {
+		return nil, fmt.Errorf("no matching labeled metric found for %q with labels %v", spec.MetricName, spec.Labels)
+	}
+	logger.V(logutil.TRACE).Info("Labeled metric found", "value", best, "metric", spec.MetricName)
+	return best, nil
+}
+
+// labelsMatch reports whether metricLabels contains every name/value pair in specLabels.
+// Labels on the metric that the spec does not mention are ignored, and an empty or nil
+// spec matches any metric.
+func labelsMatch(metricLabels []*dto.LabelPair, specLabels map[string]string) bool {
+required:
+	for wantName, wantValue := range specLabels {
+		for _, have := range metricLabels {
+			if have.GetName() == wantName && have.GetValue() == wantValue {
+				// This requirement is met; move on to the next one.
+				continue required
+			}
+		}
+
+		// No label on the metric carried this required name/value pair.
+		return false
+	}
+
+	// Every requirement was met (trivially so when specLabels is empty).
+	return true
+}
diff --git a/pkg/epp/backend/metrics_spec.go b/pkg/epp/backend/metrics_spec.go
new file mode 100644
index 000000000..aabcf9835
--- /dev/null
+++ b/pkg/epp/backend/metrics_spec.go
@@ -0,0 +1,164 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package backend
+
+import (
+	"fmt"
+	"strings"
+)
+
+// MetricSpec identifies one Prometheus series: a metric family name plus the labels a sample must carry.
+type MetricSpec struct {
+	MetricName string
+	Labels     map[string]string // Label name -> required label value; an empty map matches any sample
+}
+
+// MetricMapping holds the MetricSpec for each pod metric; a nil field means that metric is not configured.
+type MetricMapping struct {
+	AllRequests       *MetricSpec // Queued + running; one of AllRequests/WaitingRequests must be set
+	WaitingRequests   *MetricSpec // Queued only; one of AllRequests/WaitingRequests must be set
+	RunningRequests   *MetricSpec // Required
+	UsedKVCacheBlocks *MetricSpec // Optional (part of a group)
+	MaxKVCacheBlocks  *MetricSpec // Optional (part of a group)
+	KVCacheUsage      *MetricSpec // Optional (alternative to the group above)
+	// LoRA Metrics (vLLM Specific, optional)
+	LoraRequestInfo *MetricSpec
+}
+
+// stringToMetricSpec parses a flag value into a MetricSpec.
+// Accepted forms (whitespace around the name, braces, commas and '=' is ignored):
+//
+//	"metric_name"
+//	"metric_name{label1=value1}"
+//	"metric_name{label1=value1,label2=value2}"
+//
+// The empty string is not an error: it yields (nil, nil), i.e. no spec configured.
+func stringToMetricSpec(specStr string) (*MetricSpec, error) {
+	if specStr == "" {
+		return nil, nil
+	}
+	specStr = strings.TrimSpace(specStr)
+
+	// Without a label block the whole (trimmed) string is the metric name.
+	name := specStr
+	labelSet := map[string]string{}
+
+	// Only the first occurrence of each brace is considered.
+	open := strings.IndexByte(specStr, '{')
+	closing := strings.IndexByte(specStr, '}')
+	hasBlock := open >= 0 || closing >= 0
+
+	if hasBlock {
+		// Both braces must be present, in order, with at least one character between them.
+		if open < 0 || closing < 0 || closing <= open+1 {
+			return nil, fmt.Errorf("invalid metric spec string: %q, missing or malformed label block", specStr)
+		}
+		name = strings.TrimSpace(specStr[:open])
+
+		for _, pair := range strings.Split(specStr[open+1 : closing], ",") {
+			pair = strings.TrimSpace(pair)
+			// A pair needs exactly one '=' between the label name and its value.
+			if strings.Count(pair, "=") != 1 {
+				return nil, fmt.Errorf("invalid label pair: %q in metric spec: %q", pair, specStr)
+			}
+			eq := strings.IndexByte(pair, '=')
+			labelName := strings.TrimSpace(pair[:eq])
+			labelValue := strings.TrimSpace(pair[eq+1:])
+			if labelName == "" || labelValue == "" {
+				return nil, fmt.Errorf("empty label name or value in pair: %q in metric spec: %q", pair, specStr)
+			}
+			labelSet[labelName] = labelValue
+		}
+
+		// Nothing may follow the closing brace.
+		if closing != len(specStr)-1 {
+			return nil, fmt.Errorf("invalid characters after label section in: %q", specStr)
+		}
+	}
+
+	// A label block with nothing in front of it leaves the name empty.
+	if name == "" {
+		return nil, fmt.Errorf("empty metric name in spec: %q", specStr)
+	}
+
+	return &MetricSpec{MetricName: name, Labels: labelSet}, nil
+}
+
+// NewMetricMapping parses each string into a MetricSpec (an empty string gives a nil spec) and validates the resulting mapping.
+func NewMetricMapping(allStr, waitingStr, runningStr, usedBlocksStr, maxBlocksStr, usageStr, loraReqInfoStr string) (*MetricMapping, error) {
+	allSpec, err := stringToMetricSpec(allStr)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing AllRequests: %w", err)
+	}
+	waitingSpec, err := stringToMetricSpec(waitingStr)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing WaitingRequests: %w", err)
+	}
+	runningSpec, err := stringToMetricSpec(runningStr)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing RunningRequests: %w", err)
+	}
+	usedBlocksSpec, err := stringToMetricSpec(usedBlocksStr)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing UsedKVCacheBlocks: %w", err)
+	}
+	maxBlocksSpec, err := stringToMetricSpec(maxBlocksStr)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing MaxKVCacheBlocks: %w", err)
+	}
+	usageSpec, err := stringToMetricSpec(usageStr)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing KVCacheUsage: %w", err)
+	}
+	loraReqInfoSpec, err := stringToMetricSpec(loraReqInfoStr)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing loraReqInfoStr: %w", err)
+	}
+	mapping := &MetricMapping{
+		AllRequests:       allSpec,
+		WaitingRequests:   waitingSpec,
+		RunningRequests:   runningSpec,
+		UsedKVCacheBlocks: usedBlocksSpec,
+		MaxKVCacheBlocks:  maxBlocksSpec,
+		KVCacheUsage:      usageSpec,
+		LoraRequestInfo:   loraReqInfoSpec,
+	}
+
+	if err := mapping.Validate(); err != nil {
+		return nil, err // The validation error is passed through unwrapped; no mapping is returned
+	}
+
+	return mapping, nil
+}
+
+// Validate reports an error unless the mapping names a queue metric (waiting or all),
+// a running-requests metric, and a way to obtain KV-cache usage (a ratio, or used and max blocks).
+func (m *MetricMapping) Validate() error {
+	haveQueue := m.WaitingRequests != nil || m.AllRequests != nil
+	haveKVUsage := m.KVCacheUsage != nil || (m.UsedKVCacheBlocks != nil && m.MaxKVCacheBlocks != nil)
+	// Checks run in a fixed order, so the first unmet requirement is the one reported.
+	switch {
+	case !haveQueue:
+		return fmt.Errorf("either WaitingRequests or AllRequests must be specified")
+	case m.RunningRequests == nil:
+		return fmt.Errorf("RunningRequests is required")
+	case !haveKVUsage:
+		return fmt.Errorf("either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified")
+	default:
+		return nil
+	}
+}
diff --git a/pkg/epp/backend/metrics_spec_test.go b/pkg/epp/backend/metrics_spec_test.go
new file mode 100644
index 000000000..084ae5b5a
--- /dev/null
+++ b/pkg/epp/backend/metrics_spec_test.go
@@ -0,0 +1,281 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package backend
+
+import (
+	"reflect"
+	"strings"
+	"testing"
+)
+
+func TestStringToMetricSpec(t *testing.T) {
+	tests := []struct {
+		name    string
+		input   string
+		want    *MetricSpec
+		wantErr bool
+	}{
+		{
+			name:    "empty string",
+			input:   "",
+			want:    nil,
+			wantErr: false,
+		},
+		{
+			name:  "no labels",
+			input: "my_metric",
+			want: &MetricSpec{
+				MetricName: "my_metric",
+				Labels:     map[string]string{},
+			},
+			wantErr: false,
+		},
+		{
+			name:  "one label",
+			input: "my_metric{label1=value1}",
+			want: &MetricSpec{
+				MetricName: "my_metric",
+				Labels: map[string]string{
+					"label1": "value1",
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name:  "multiple labels",
+			input: "my_metric{label1=value1,label2=value2}",
+			want: &MetricSpec{
+				MetricName: "my_metric",
+				Labels: map[string]string{
+					"label1": "value1",
+					"label2": "value2",
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name:  "extra whitespace",
+			input: "  my_metric  {  label1  =  value1  ,  label2  =  value2  }  ",
+			want: &MetricSpec{
+				MetricName: "my_metric",
+				Labels: map[string]string{
+					"label1": "value1",
+					"label2": "value2",
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name:    "missing closing brace",
+			input:   "my_metric{label1=value1",
+			want:    nil,
+			wantErr: true,
+		},
+		{
+			name:    "missing opening brace",
+			input:   "my_metriclabel1=value1}",
+			want:    nil, // '}' without a matching '{' must be rejected
+			wantErr: true,
+		},
+		{
+			name:    "invalid label pair",
+			input:   "my_metric{label1}",
+			want:    nil,
+			wantErr: true,
+		},
+		{
+			name:    "empty label name",
+			input:   "my_metric{=value1}",
+			want:    nil,
+			wantErr: true,
+		},
+		{
+			name:    "empty label value",
+			input:   "my_metric{label1=}",
+			want:    nil,
+			wantErr: true,
+		},
+		{
+			name:    "empty label name and value with spaces",
+			input:   "my_metric{  =  }",
+			want:    nil,
+			wantErr: true,
+		},
+		{
+			name:    "characters after closing brace",
+			input:   "my_metric{label=val}extra",
+			want:    nil,
+			wantErr: true,
+		},
+		{
+			name:    "empty metric name",
+			input:   "{label=val}",
+			want:    nil,
+			wantErr: true,
+		},
+		{
+			name:  "no labels and just metric name with space",
+			input: "my_metric ",
+			want: &MetricSpec{
+				MetricName: "my_metric",
+				Labels:     map[string]string{},
+			},
+			wantErr: false,
+		},
+		{
+			name:  "no labels and just metric name with space before and after",
+			input: "  my_metric  ",
+			want: &MetricSpec{
+				MetricName: "my_metric",
+				Labels:     map[string]string{},
+			},
+			wantErr: false,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := stringToMetricSpec(tt.input)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("stringToMetricSpec() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if tt.want != nil && got != nil { // both present: compare name and labels field by field
+				if tt.want.Labels == nil { // treat an unset Labels in the expectation as an empty map
+					tt.want.Labels = make(map[string]string)
+				}
+				if !reflect.DeepEqual(got.MetricName, tt.want.MetricName) {
+					t.Errorf("stringToMetricSpec() got MetricName = %v, want %v", got.MetricName, tt.want.MetricName)
+				}
+				if !reflect.DeepEqual(got.Labels, tt.want.Labels) {
+					t.Errorf("stringToMetricSpec() got Labels = %v, want %v", got.Labels, tt.want.Labels)
+				}
+			} else if tt.want != got { // pointer comparison: fails when exactly one side is nil, passes when both are
+				t.Errorf("stringToMetricSpec() = %v, want %v", got, tt.want)
+
+			}
+
+		})
+	}
+}
+
+func TestNewMetricMappingAndValidate(t *testing.T) {
+	tests := []struct {
+		name           string
+		allStr         string
+		waitingStr     string
+		runningStr     string
+		usedStr        string
+		maxStr         string
+		usageStr       string
+		loraReqInfoStr string
+		wantErr        bool
+		expectedErr    string // substring the error must contain; only checked when wantErr is true
+	}{
+		{
+			name:           "valid vllm mapping",
+			runningStr:     "running_metric",
+			waitingStr:     "waiting_metric",
+			usageStr:       "usage_metric",
+			loraReqInfoStr: "lora_requests_info",
+			wantErr:        false,
+			expectedErr:    "",
+		},
+		{
+			name:       "valid triton mapping",
+			runningStr: "running_metric{label1=value1}",
+			allStr:     "all_metric{label2=value2}",
+			usedStr:    "used_blocks{label3=value3}",
+			maxStr:     "max_blocks{label4=value4}",
+			wantErr:    false,
+		},
+		{
+			name:       "multiple labels mapping",
+			runningStr: "running_metric{label1=value1,label5=value5}",
+			allStr:     "all_metric{label2=value2,label6=value6}",
+			usedStr:    "used_blocks{label3=value3}",
+			maxStr:     "max_blocks{label4=value4}",
+			wantErr:    false,
+		},
+		{
+			name:        "missing running",
+			waitingStr:  "waiting_metric",
+			usageStr:    "usage_metric",
+			wantErr:     true,
+			expectedErr: "RunningRequests is required",
+		},
+		{
+			name:        "missing both waiting and all",
+			runningStr:  "running_metric",
+			usageStr:    "usage_metric",
+			wantErr:     true,
+			expectedErr: "either WaitingRequests or AllRequests must be specified",
+		},
+		{
+			name:        "missing usage and both block metrics",
+			runningStr:  "running_metric",
+			waitingStr:  "waiting_metric",
+			wantErr:     true,
+			expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified",
+		},
+		{
+			name:        "missing max block metric",
+			runningStr:  "running_metric",
+			waitingStr:  "waiting_metric",
+			usedStr:     "used_blocks",
+			wantErr:     true,
+			expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified",
+		},
+		{
+			name:        "missing used block metric",
+			runningStr:  "running_metric",
+			waitingStr:  "waiting_metric",
+			maxStr:      "max_blocks",
+			wantErr:     true,
+			expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified",
+		},
+		{
+			name:        "invalid running metric format",
+			runningStr:  "running_metric{invalid",
+			waitingStr:  "waiting_metric",
+			usageStr:    "usage_metric",
+			wantErr:     true,
+			expectedErr: "error parsing RunningRequests", // prefix added by NewMetricMapping around the parse error
+		},
+		{
+			name:           "lora metrics present",
+			runningStr:     "running_metric",
+			waitingStr:     "waiting_metric",
+			usageStr:       "usage_metric",
+			loraReqInfoStr: "lora_requests_info",
+
+			wantErr:     false,
+			expectedErr: "", // unused: no error is expected for this case
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := NewMetricMapping(tt.allStr, tt.waitingStr, tt.runningStr, tt.usedStr, tt.maxStr, tt.usageStr, tt.loraReqInfoStr)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("NewMetricMapping() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if tt.wantErr && !strings.Contains(err.Error(), tt.expectedErr) {
+				t.Errorf("NewMetricMapping() error = %v, expected to contain = %v", err, tt.expectedErr)
+			}
+		})
+	}
+}
diff --git a/pkg/epp/backend/metrics_test.go b/pkg/epp/backend/metrics_test.go
new file mode 100644
index 000000000..0bfafcee5
--- /dev/null
+++ b/pkg/epp/backend/metrics_test.go
@@ -0,0 +1,741 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package backend
+
+import (
+	"context"
+	"fmt"
+	"reflect"
+	"strconv"
+	"strings"
+	"testing"
+
+	dto "github.com/prometheus/client_model/go"
+	"go.uber.org/multierr"
+	"google.golang.org/protobuf/proto"
+	"k8s.io/apimachinery/pkg/types"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+// --- Test Helpers ---
+
+// makeMetric returns a gauge sample with the given labels, value and timestamp.
+// The metricName argument is not used by the body.
+func makeMetric(metricName string, labels map[string]string, value float64, timestampMs int64) *dto.Metric {
+	pairs := make([]*dto.LabelPair, 0, len(labels))
+	for name, val := range labels {
+		pairs = append(pairs, &dto.LabelPair{Name: proto.String(name), Value: proto.String(val)})
+	}
+	sample := &dto.Metric{Label: pairs, TimestampMs: proto.Int64(timestampMs)}
+	sample.Gauge = &dto.Gauge{Value: proto.Float64(value)}
+	return sample
+}
+
+// makeMetricFamily wraps the given samples in a GAUGE-typed family called name.
+func makeMetricFamily(name string, metrics ...*dto.Metric) *dto.MetricFamily {
+	family := &dto.MetricFamily{Metric: metrics}
+	family.Name = proto.String(name)
+	family.Type = dto.MetricType_GAUGE.Enum()
+	return family
+}
+
+// --- Tests ---
+
+func TestGetMetric(t *testing.T) {
+	logger := logutil.NewTestLogger()
+
+	metricFamilies := map[string]*dto.MetricFamily{
+		"metric1": makeMetricFamily("metric1",
+			makeMetric("metric1", map[string]string{"label1": "value1"}, 1.0, 1000),
+			makeMetric("metric1", map[string]string{"label1": "value2"}, 2.0, 2000),
+		),
+		"metric2": makeMetricFamily("metric2",
+			makeMetric("metric2", map[string]string{"labelA": "A1", "labelB": "B1"}, 3.0, 1500),
+			makeMetric("metric2", map[string]string{"labelA": "A2", "labelB": "B2"}, 4.0, 2500),
+		),
+		"metric3": makeMetricFamily("metric3",
+			makeMetric("metric3", map[string]string{}, 5.0, 3000),
+			makeMetric("metric3", map[string]string{}, 6.0, 1000),
+		),
+	}
+
+	tests := []struct {
+		name        string
+		spec        MetricSpec
+		wantValue   float64
+		wantError   bool
+		shouldPanic bool // when true the call must panic; no case below sets it
+	}{
+		{
+			name: "get labeled metric, exists",
+			spec: MetricSpec{
+				MetricName: "metric1",
+				Labels:     map[string]string{"label1": "value1"},
+			},
+			wantValue: 1.0,
+			wantError: false,
+		},
+		{
+			name: "get labeled metric, wrong value",
+			spec: MetricSpec{
+				MetricName: "metric1",
+				Labels:     map[string]string{"label1": "value3"},
+			},
+			wantValue: -1, // placeholder: wantValue is not checked when wantError is true
+			wantError: true,
+		},
+		{
+			name: "get labeled metric, missing label",
+			spec: MetricSpec{
+				MetricName: "metric1",
+				Labels:     map[string]string{"label2": "value2"},
+			},
+			wantValue: -1,
+			wantError: true,
+		},
+		{
+			name: "get labeled metric, extra label present",
+			spec: MetricSpec{
+				MetricName: "metric2",
+				Labels:     map[string]string{"labelA": "A1"},
+			},
+			wantValue: 3.0,
+			wantError: false,
+		},
+		{
+			name: "get unlabeled metric, exists",
+			spec: MetricSpec{
+				MetricName: "metric3",
+				Labels:     nil, // Explicitly nil
+			},
+			wantValue: 5.0, // latest metric, which occurs first in our test data
+			wantError: false,
+		},
+		{
+			name: "get unlabeled metric, metric family not found",
+			spec: MetricSpec{
+				MetricName: "metric4",
+				Labels:     nil,
+			},
+			wantValue: -1,
+			wantError: true,
+		},
+		{
+			name: "get labeled metric, metric family not found",
+			spec: MetricSpec{
+				MetricName: "metric4",
+				Labels:     map[string]string{"label1": "value1"},
+			},
+			wantValue: -1,
+			wantError: true,
+		},
+		{
+			name: "get metric, no metrics available", // "empty_metric" is not a key of metricFamilies above
+			spec: MetricSpec{
+				MetricName: "empty_metric",
+			},
+			wantValue: -1,
+			wantError: true,
+		},
+		{
+			name: "get latest metric",
+			spec: MetricSpec{
+				MetricName: "metric3",
+				Labels:     map[string]string{}, // Empty map, not nil
+			},
+			wantValue: 5.0,
+			wantError: false,
+		},
+	}
+
+	p := &PodMetricsClientImpl{} // No need for MetricMapping here
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if tt.shouldPanic {
+				defer func() {
+					if r := recover(); r == nil {
+						t.Errorf("The code did not panic")
+					}
+				}()
+			}
+
+			gotMetric, err := p.getMetric(logger, metricFamilies, tt.spec)
+
+			if tt.wantError {
+				if err == nil {
+					t.Errorf("getMetric() expected error, got nil")
+				}
+			} else {
+				if err != nil {
+					t.Errorf("getMetric() unexpected error: %v", err)
+				}
+				if gotMetric.GetGauge().GetValue() != tt.wantValue {
+					t.Errorf("getMetric() got value %v, want %v", gotMetric.GetGauge().GetValue(), tt.wantValue)
+				}
+			}
+		})
+	}
+}
+
+func TestLabelsMatch(t *testing.T) {
+	tests := []struct {
+		name         string
+		metricLabels []*dto.LabelPair
+		specLabels   map[string]string
+		want         bool
+	}{
+		{
+			name:         "empty spec labels, should match",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
+			specLabels:   map[string]string{},
+			want:         true,
+		},
+		{
+			name:         "nil spec labels, should match",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
+			specLabels:   nil,
+			want:         true, // a nil spec map is expected to behave like an empty one
+		},
+		{
+			name:         "exact match",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
+			specLabels:   map[string]string{"a": "b"},
+			want:         true,
+		},
+		{
+			name:         "extra labels in metric",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}, {Name: proto.String("c"), Value: proto.String("d")}},
+			specLabels:   map[string]string{"a": "b"},
+			want:         true, // labels on the metric that the spec does not mention are ignored
+		},
+		{
+			name:         "missing label in metric",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
+			specLabels:   map[string]string{"a": "b", "c": "d"},
+			want:         false, // every spec label has to be present on the metric
+		},
+		{
+			name:         "value mismatch",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
+			specLabels:   map[string]string{"a": "c"},
+			want:         false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := labelsMatch(tt.metricLabels, tt.specLabels); got != tt.want {
+				t.Errorf("labelsMatch() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestGetLatestLoraMetric(t *testing.T) {
+	logger := logutil.NewTestLogger()
+
+	testCases := []struct {
+		name             string
+		metricFamilies   map[string]*dto.MetricFamily
+		expectedAdapters map[string]int
+		expectedMax      int
+		expectedErr      error
+		mapping          *MetricMapping
+	}{
+		{
+			name: "no lora metrics",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"some_other_metric": makeMetricFamily("some_other_metric",
+					makeMetric("some_other_metric", nil, 1.0, 1000),
+				),
+			},
+			expectedAdapters: nil,
+			expectedMax:      0,
+			expectedErr:      fmt.Errorf("metric family \"vllm:lora_requests_info\" not found"), // Expect an error because the family is missing
+			mapping: &MetricMapping{
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+		},
+		{
+			name: "basic lora metrics",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 3000.0, 1000),       // Newer - presumably by gauge value, as both timestamps are 1000; TODO confirm
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 1000.0, 1000), // Older - smaller gauge value, same timestamp
+
+				),
+			},
+			expectedAdapters: map[string]int{"lora1": 0},
+			expectedMax:      2,
+			expectedErr:      nil,
+			mapping: &MetricMapping{
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+		},
+		{
+			name: "no matching lora metrics",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"other_label": "value"}, 5.0, 3000),
+				),
+			},
+			expectedAdapters: nil,
+			expectedMax:      0,
+			expectedErr:      nil, // Expect *no* error; just no adapters found
+			mapping: &MetricMapping{
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+		},
+		{
+			name: "no lora metrics if not in MetricMapping",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 5.0, 3000),
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 6.0, 1000),
+				),
+			},
+			expectedAdapters: nil,
+			expectedMax:      0,
+			expectedErr:      nil,
+			mapping:          &MetricMapping{ // No LoRA metrics defined
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			p := &PodMetricsClientImpl{MetricMapping: tc.mapping}
+			loraMetric, _, err := p.getLatestLoraMetric(logger, tc.metricFamilies)
+
+			if tc.expectedErr != nil {
+				if err == nil || err.Error() != tc.expectedErr.Error() {
+					t.Errorf("getLatestLoraMetric() error = %v, wantErr %v", err, tc.expectedErr)
+				}
+				return // Stop here if an error was expected
+			} else if err != nil {
+				t.Fatalf("getLatestLoraMetric() unexpected error: %v", err)
+			}
+
+			if tc.mapping.LoraRequestInfo == nil {
+				if loraMetric != nil {
+					t.Errorf("getLatestLoraMetric() expected nil metric, got %v", loraMetric)
+				}
+				return // Stop if no Lora metrics are expected.
+			}
+
+			if tc.expectedAdapters == nil && loraMetric == nil {
+				return // Both nil, as expected
+			}
+
+			if tc.expectedAdapters != nil && loraMetric != nil { // proceed with checks
+
+				adaptersFound := make(map[string]int)
+				maxLora := 0
+				for _, label := range loraMetric.GetLabel() {
+					if label.GetName() == "running_lora_adapters" && label.GetValue() != "" {
+						for _, adapter := range strings.Split(label.GetValue(), ",") {
+							adaptersFound[adapter] = 0
+						}
+					}
+					if label.GetName() == "waiting_lora_adapters" && label.GetValue() != "" {
+						for _, adapter := range strings.Split(label.GetValue(), ",") {
+							adaptersFound[adapter] = 0 // Overwrite if already present
+						}
+					}
+					if label.GetName() == "max_lora" {
+						var converr error // declared separately so maxLora is assigned, not shadowed
+						maxLora, converr = strconv.Atoi(label.GetValue())
+						if converr != nil && tc.expectedErr == nil { // expectedErr is always nil here: cases expecting an error returned above
+							t.Errorf("getLatestLoraMetric() could not parse max_lora: %v", converr)
+						}
+					}
+				}
+
+				if !reflect.DeepEqual(adaptersFound, tc.expectedAdapters) {
+					t.Errorf("getLatestLoraMetric() adapters = %v, want %v", adaptersFound, tc.expectedAdapters)
+				}
+				if maxLora != tc.expectedMax {
+					t.Errorf("getLatestLoraMetric() maxLora = %v, want %v", maxLora, tc.expectedMax)
+				}
+			} else { // one is nil and the other is not
+				t.Errorf("getLatestLoraMetric(): one of expectedAdapters/loraMetric is nil and the other is not, expected %v, got %v", tc.expectedAdapters, loraMetric)
+			}
+		})
+	}
+}
+
+func TestPromToPodMetrics(t *testing.T) {
+	logger := logutil.NewTestLogger()
+
+	tests := []struct {
+		name             string
+		metricFamilies   map[string]*dto.MetricFamily
+		mapping          *MetricMapping
+		existingMetrics  *datastore.PodMetrics
+		expectedMetrics  *datastore.PodMetrics
+		expectedErrCount int // number of errors expected in the combined error, counted via multierr.Errors
+	}{
+		{
+			name: "vllm metrics",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm_running": makeMetricFamily("vllm_running",
+					makeMetric("vllm_running", nil, 10.0, 2000),
+					makeMetric("vllm_running", nil, 12.0, 1000), // Older
+				),
+				"vllm_waiting": makeMetricFamily("vllm_waiting",
+					makeMetric("vllm_waiting", nil, 5.0, 1000),
+					makeMetric("vllm_waiting", nil, 7.0, 2000), // Newer
+				),
+				"vllm_usage": makeMetricFamily("vllm_usage",
+					makeMetric("vllm_usage", nil, 0.8, 2000),
+					makeMetric("vllm_usage", nil, 0.7, 500),
+				),
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000),
+				),
+			},
+			mapping: &MetricMapping{
+				RunningRequests: &MetricSpec{MetricName: "vllm_running"},
+				WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"},
+				KVCacheUsage:    &MetricSpec{MetricName: "vllm_usage"},
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+			existingMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{}, // Initialize with empty Metrics
+			},
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{
+					RunningQueueSize:    10,
+					WaitingQueueSize:    7,
+					KVCacheUsagePercent: 0.8,
+					ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
+					MaxActiveModels:     3,
+				},
+			},
+			expectedErrCount: 0,
+		},
+		{
+			name: "triton metrics",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"triton_running": makeMetricFamily("triton_running",
+					makeMetric("triton_running", map[string]string{"queue": "fast"}, 10.0, 2000),
+					makeMetric("triton_running", map[string]string{"queue": "slow"}, 12.0, 1000), // Older, but different label
+				),
+				"triton_all": makeMetricFamily("triton_all",
+					makeMetric("triton_all", map[string]string{"queue": "fast"}, 15.0, 1000),
+					makeMetric("triton_all", map[string]string{"queue": "fast"}, 17.0, 2000), // Newer
+				),
+				"triton_used": makeMetricFamily("triton_used",
+					makeMetric("triton_used", map[string]string{"type": "gpu"}, 80.0, 1000),
+				),
+				"triton_max": makeMetricFamily("triton_max",
+					makeMetric("triton_max", map[string]string{"type": "gpu"}, 100.0, 1000),
+				),
+			},
+			mapping: &MetricMapping{
+				RunningRequests:   &MetricSpec{MetricName: "triton_running", Labels: map[string]string{"queue": "fast"}},
+				AllRequests:       &MetricSpec{MetricName: "triton_all", Labels: map[string]string{"queue": "fast"}},
+				UsedKVCacheBlocks: &MetricSpec{MetricName: "triton_used", Labels: map[string]string{"type": "gpu"}},
+				MaxKVCacheBlocks:  &MetricSpec{MetricName: "triton_max", Labels: map[string]string{"type": "gpu"}},
+			},
+			existingMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{
+					ActiveModels: map[string]int{},
+				}, // Starts with an empty, non-nil ActiveModels map
+			},
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{
+					ActiveModels:        map[string]int{},
+					RunningQueueSize:    10,
+					WaitingQueueSize:    7,   // 17 (all) - 10 (running)
+					KVCacheUsagePercent: 0.8, // 80 / 100
+				},
+			},
+			expectedErrCount: 0,
+		},
+		{
+			name: "triton metrics, missing label",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"triton_running": makeMetricFamily("triton_running",
+					makeMetric("triton_running", map[string]string{"queue": "fast"}, 10.0, 2000),
+				),
+				"triton_all": makeMetricFamily("triton_all",
+					makeMetric("triton_all", map[string]string{"queue": "fast"}, 17.0, 2000),
+				),
+				// The triton_used and triton_max families are absent from this map entirely.
+			},
+			mapping: &MetricMapping{
+				RunningRequests:   &MetricSpec{MetricName: "triton_running", Labels: map[string]string{"queue": "fast"}},
+				AllRequests:       &MetricSpec{MetricName: "triton_all", Labels: map[string]string{"queue": "fast"}},
+				UsedKVCacheBlocks: &MetricSpec{MetricName: "triton_used", Labels: map[string]string{"type": "gpu"}},
+				MaxKVCacheBlocks:  &MetricSpec{MetricName: "triton_max", Labels: map[string]string{"type": "gpu"}},
+			},
+			existingMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{
+					ActiveModels: map[string]int{},
+				}, // Starts with an empty, non-nil ActiveModels map
+			},
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{
+					ActiveModels:        map[string]int{},
+					RunningQueueSize:    10,
+					WaitingQueueSize:    7,
+					KVCacheUsagePercent: 0.0, // expect this to still be present, but with default 0 value
+				},
+			},
+
+			expectedErrCount: 2, // Two errors:  Used and Max
+		},
+		{
+			name:           "missing metrics",
+			metricFamilies: map[string]*dto.MetricFamily{}, // No metrics
+			mapping: &MetricMapping{
+				RunningRequests: &MetricSpec{MetricName: "vllm_running"},
+				WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"},
+				KVCacheUsage:    &MetricSpec{MetricName: "vllm_usage"},
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+			existingMetrics:  &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
+			expectedMetrics:  &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
+			expectedErrCount: 4, // One error for each of the 4 mapped metrics
+		},
+		{
+			name: "partial metrics available + LoRA",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm_usage": makeMetricFamily("vllm_usage",
+					makeMetric("vllm_usage", nil, 0.8, 2000), // Only usage is present
+				),
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000),
+				),
+			},
+			mapping: &MetricMapping{
+				RunningRequests: &MetricSpec{MetricName: "vllm_running"}, // Not present
+				WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"}, // Not Present
+				KVCacheUsage:    &MetricSpec{MetricName: "vllm_usage"},
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+			existingMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{}, // Initialize with empty Metrics
+			},
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{
+					RunningQueueSize:    0,
+					WaitingQueueSize:    0,
+					KVCacheUsagePercent: 0.8,
+					ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
+					MaxActiveModels:     3,
+				},
+			},
+			expectedErrCount: 2, // Errors for the two missing metrics
+		},
+		{
+			name: "use all requests for waiting queue",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm_running": makeMetricFamily("vllm_running",
+					makeMetric("vllm_running", nil, 10.0, 2000),
+				),
+				"vllm_all": makeMetricFamily("vllm_all",
+					makeMetric("vllm_all", nil, 15.0, 1000),
+				),
+			},
+			mapping: &MetricMapping{
+				RunningRequests: &MetricSpec{MetricName: "vllm_running"},
+				AllRequests:     &MetricSpec{MetricName: "vllm_all"},
+				// No WaitingRequests
+			},
+			existingMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{
+					ActiveModels: map[string]int{},
+				}, // Starts with an empty, non-nil ActiveModels map
+			},
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{
+					ActiveModels:     map[string]int{},
+					RunningQueueSize: 10,
+					WaitingQueueSize: 5, // 15 - 10
+				},
+			},
+			expectedErrCount: 0,
+		},
+		{
+			name: "invalid max lora",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "invalid"}, 3000.0, 1000),
+				),
+			},
+			mapping: &MetricMapping{
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+			existingMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{},
+			},
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{
+					ActiveModels:    map[string]int{"lora1": 0},
+					MaxActiveModels: 0, // Should still default to 0.
+
+				},
+			},
+			expectedErrCount: 1, // Expect *one* error
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			p := &PodMetricsClientImpl{MetricMapping: tc.mapping}
+			updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics)
+
+			if tc.expectedErrCount == 0 {
+				if err != nil {
+					t.Errorf("promToPodMetrics() unexpected error: %v", err)
+				}
+			} else {
+				if err == nil {
+					t.Errorf("promToPodMetrics() expected errors, got nil")
+				} else {
+					// Check the *number* of errors.  multierr.Errors() gives us a slice
+					if len(multierr.Errors(err)) != tc.expectedErrCount {
+						t.Errorf("promToPodMetrics() wrong number of errors: got %d, want %d.  Errors: %v", len(multierr.Errors(err)), tc.expectedErrCount, err)
+					}
+
+				}
+			}
+			// Exact comparison via reflect.DeepEqual: float fields are compared without any tolerance.
+			if !reflect.DeepEqual(updated, tc.expectedMetrics) {
+				t.Errorf("promToPodMetrics() got %+v, want %+v", updated, tc.expectedMetrics)
+			}
+		})
+	}
+}
+
+// TestFetchMetrics is a basic integration test: it scrapes a local port that
+// is expected to have no listener and checks that the call fails with a
+// connection error. Nothing is mocked, so a real dial is attempted.
+func TestFetchMetrics(t *testing.T) {
+	// The test assumes no server is bound to this port. A more complete suite
+	// would stand up a mock HTTP server instead of relying on that.
+	const unusedPort = 9999
+	// Matching on the error text is fragile, but acceptable for this basic check.
+	const expectedSubstr = "connection refused"
+
+	// Pod whose metrics endpoint the client will try to scrape.
+	target := datastore.Pod{
+		Address:        "127.0.0.1",
+		NamespacedName: types.NamespacedName{Namespace: "test", Name: "pod"},
+	}
+	existing := &datastore.PodMetrics{Pod: target}
+	client := &PodMetricsClientImpl{} // No MetricMapping needed for this basic test
+	ctx := logutil.NewTestLoggerIntoContext(context.Background())
+
+	_, err := client.FetchMetrics(ctx, existing, unusedPort)
+	switch {
+	case err == nil:
+		// A nil error means the scrape unexpectedly succeeded.
+		t.Errorf("FetchMetrics() expected error, got nil")
+	case !strings.Contains(err.Error(), expectedSubstr):
+		t.Errorf("FetchMetrics() error = %v, want error containing %q", err, expectedSubstr)
+	}
+}
diff --git a/pkg/epp/backend/triton/metrics.go b/pkg/epp/backend/triton/metrics.go
index 2f8d24bd9..f28b1c88b 100644
--- a/pkg/epp/backend/triton/metrics.go
+++ b/pkg/epp/backend/triton/metrics.go
@@ -39,24 +39,6 @@ const (
 	TRTLLMKvCacheMetricsName  = "nv_trt_llm_kv_cache_block_metrics"
 	TRTLLMKvCacheMetricsLabel = "kv_cache_block_type"
 	TRTLLMRequestMetricsLabel = "request_type"
-
-	// THESE ARE UNUSED, EXAMPLES FOR MORE METRICS
-	inferenceCountMetricName           = "nv_inference_count"
-	inferenceSuccessMetricName         = "nv_inference_request_success"
-	inferenceExecCountMetricName       = "nv_inference_exec_count"
-	inferenceRequestDurationMetricName = "nv_inference_request_duration_us"
-	waitingQueueSizeMetricName         = "nv_inference_pending_request_count"
-	queueDurationMetricName            = "nv_inference_queue_duration_us"
-	computeInputDurationMetricName     = "nv_inference_compute_input_duration_us"
-	computeInferDurationMetricName     = "nv_inference_compute_infer_duration_us"
-	computeOutputDurationMetricName    = "nv_inference_compute_output_duration_us"
-	gpuUtilizationMetricName           = "nv_gpu_utilization"
-	gpuMemoryTotalMetricName           = "nv_gpu_memory_total_bytes"
-	gpuMemoryUsedMetricName            = "nv_gpu_memory_used_bytes"
-	gpuPowerUsageMetricName            = "nv_gpu_power_usage"
-	gpuPowerLimitMetricName            = "nv_gpu_power_limit"
-	gpuMemoryTotalBytesMetricName      = "nv_gpu_memory_total_bytes"
-	gpuMemoryUsedBytesMetricName       = "nv_gpu_memory_used_bytes"
 )
 
 type PodMetricsClientImpl struct{}
@@ -65,12 +47,13 @@ type PodMetricsClientImpl struct{}
 func (p *PodMetricsClientImpl) FetchMetrics(
 	ctx context.Context,
 	existing *datastore.PodMetrics,
+	port int32,
 ) (*datastore.PodMetrics, error) {
 	logger := log.FromContext(ctx)
 	loggerDefault := logger.V(logutil.DEFAULT)
 
 	// existing.ScrapePort = 8002 // triton has a different port for metrics than the target port for inference
-	url := existing.BuildScrapeEndpoint()
+	url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
 	// TODO print response and err
 
@@ -109,35 +92,46 @@ func promToPodMetrics(
 	var errs error
 	updated := existing.Clone()
 
+	//fmt.Print("\n\nDEBUG START\n###### DEBUG getting REQUEST metrics... ######")
 	// Get the "nv_trt_llm_request_metrics" metric family
-	requestMetrics, err := getLatestMetric(logger, metricFamilies, TRTLLMRequestMetricsName)
-	errs = multierr.Append(errs, err)
-	if err == nil {
-		if active, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "active"); err == nil {
-			fmt.Printf("###### DEBUG max: %+v", active)
-			updated.Metrics.RunningQueueSize = int(active)
-		} else {
-			errs = multierr.Append(errs, err)
-		}
+	//requestMetrics, err := getLatestMetric(logger, metricFamilies, TRTLLMRequestMetricsName)
+	requestMetrics, ok := metricFamilies[TRTLLMRequestMetricsName]
+	//errs = multierr.Append(errs, err)
+	if ok {
 		if scheduled, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "scheduled"); err == nil {
-			fmt.Printf("###### DEBUG max: %+v", scheduled)
-			updated.Metrics.WaitingQueueSize = int(scheduled)
+			//fmt.Printf("\n###### DEBUG generation_requests: %+v", generation_requests)
+			updated.Metrics.RunningQueueSize = int(scheduled)
+			if active, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "active"); err == nil {
+				//fmt.Printf("\n###### DEBUG scheduled: %+v", scheduled)
+				updated.Metrics.WaitingQueueSize = int(active - scheduled)
+				// pendingMetrics, ok := metricFamilies["nv_inference_pending_request_count"]
+				// if ok {
+				// 	if queued, err := getTrtLlmGaugeMetric(logger, pendingMetrics, "model", "ensemble"); err == nil {
+				// 		fmt.Printf("\n###### DEBUG queued requests: %+v", int(queued))
+				// 	}
+				// }
+				//fmt.Printf("\n###### DEBUG active (total) requests: %+v", int(active))
+				//fmt.Printf("\n###### DEBUG waiting requests: %+v", int(active-scheduled))
+				//fmt.Printf("\n###### DEBUG running requests: %+v", int(scheduled))
+			} else {
+				errs = multierr.Append(errs, err)
+			}
 		} else {
 			errs = multierr.Append(errs, err)
 		}
 	}
 
-	fmt.Print("###### DEBUG getting kvblock metrics... ######")
+	//fmt.Print("\n\n###### DEBUG getting KVBLOCK metrics... ######")
 	// Get the "nv_trt_llm_kv_cache_block_metrics" metric family
-	kvCacheBlocks, err := getLatestMetric(logger, metricFamilies, TRTLLMKvCacheMetricsName)
-	errs = multierr.Append(errs, err)
+	kvCacheBlocks, ok := metricFamilies[TRTLLMKvCacheMetricsName]
+	// errs = multierr.Append(errs, err)
 	// fmt.Printf("###### DEBUG (should be nil) getLatestMetric errs: %+v", errs)
-	if err == nil {
+	if ok {
 		// Calculate the kv-cache usage from the max and used metrics
 		if max, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "max"); err == nil {
-			fmt.Printf("###### DEBUG max: %+v", max)
+			//fmt.Printf("\n###### DEBUG max: %+v", max)
 			if used, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "used"); err == nil {
-				fmt.Printf("###### DEBUG tokens_per: %+v", used)
+				//fmt.Printf("\n###### DEBUG used: %+v", used)
 				usage := 0.0
 				if max > 0 {
 					usage = used / max
@@ -146,19 +140,13 @@ func promToPodMetrics(
 			} else {
 				errs = multierr.Append(errs, err)
 			}
-			if tokens_per, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "tokens_per"); err == nil {
-				fmt.Printf("###### DEBUG tokens_per: %+v", tokens_per)
-				updated.Metrics.KvCacheMaxTokenCapacity = int(tokens_per * max)
-			} else {
-				errs = multierr.Append(errs, err)
-			}
 		} else {
 			errs = multierr.Append(errs, err)
 		}
 	}
 
-	fmt.Printf("###### DEBUG UPDATED: %+v", updated)
-	fmt.Printf("###### DEBUG ERRORS: %+v", errs)
+	//fmt.Printf("\n### DEBUG: %+v", updated)
+	//fmt.Printf("\n###### DEBUG ERRORS: %+v", errs)
 
 	return updated, errs
 }
@@ -230,10 +218,16 @@ func getCounterMetricForPod(logger logr.Logger, mf *dto.MetricFamily, podName st
 
 // getTrtLlmMetric gets a TRT LLM metric with the specified type, key, and value.
 func getTrtLlmMetric(logger logr.Logger, mf *dto.MetricFamily, metricType dto.MetricType, key, value string) (float64, error) {
+	//fmt.Printf("###### DEBUG START GETTRTMERTIC: %+v", mf.GetMetric())
+	//fmt.Printf("###### DEBUG METRICS: %+v", len(mf.GetMetric()))
 	for _, m := range mf.GetMetric() {
+		//fmt.Printf("###### DEBUG ANALYZING METRIC: %+v", m)
+		//fmt.Printf("###### DEBUG TIMESTAMP: %+v", m.GetTimestampMs())
 		foundKey := false
 		foundValue := false
+		//fmt.Printf("###### DEBUG LABELS: %+v", m.GetLabel())
 		for _, label := range m.GetLabel() {
+			//fmt.Printf("###### DEBUG COMPARING label NAME %+v == %+v and label VALUE %+v == %+v", label.GetName(), key, label.GetValue(), value)
 			if label.GetName() == key && label.GetValue() == value {
 				foundKey = true
 			}
@@ -242,6 +236,7 @@ func getTrtLlmMetric(logger logr.Logger, mf *dto.MetricFamily, metricType dto.Me
 			}
 		}
 		if foundKey && foundValue {
+			//fmt.Printf("###### DEBUG METRIC FOUND: %+v", m)
 			if metricType == dto.MetricType_GAUGE {
 				logger.V(logutil.TRACE).Info("TRT LLM gauge metric found", "value", m.GetGauge().GetValue(), "key", key, "value", value)
 				return m.GetGauge().GetValue(), nil
diff --git a/pkg/epp/backend/triton/metrics_test.go b/pkg/epp/backend/triton/metrics_test.go
index f9b960a52..4a1e2b578 100644
--- a/pkg/epp/backend/triton/metrics_test.go
+++ b/pkg/epp/backend/triton/metrics_test.go
@@ -47,22 +47,17 @@ func TestPromToPodMetrics(t *testing.T) {
 				Pod: datastore.Pod{
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
-					ScrapePort:     9000,
-					ScrapePath:     "/metrics",
 				},
 				Metrics: datastore.Metrics{
-					RunningQueueSize:        1,
-					WaitingQueueSize:        2,
-					KVCacheUsagePercent:     0.5,  // used / max = 50 / 100
-					KvCacheMaxTokenCapacity: 5000, // max_blocks * tokens_per_block = 100 * 50
+					RunningQueueSize:    1,
+					WaitingQueueSize:    2,
+					KVCacheUsagePercent: 0.5, // used / max = 50 / 100
 				},
 			},
 			initialPodMetrics: &datastore.PodMetrics{
 				Pod: datastore.Pod{
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
-					ScrapePort:     9000,
-					ScrapePath:     "/metrics",
 				},
 				Metrics: datastore.Metrics{},
 			},
@@ -75,22 +70,17 @@ func TestPromToPodMetrics(t *testing.T) {
 				Pod: datastore.Pod{
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
-					ScrapePort:     9000,
-					ScrapePath:     "/metrics",
 				},
 				Metrics: datastore.Metrics{
-					RunningQueueSize:        0, // Default int value
-					WaitingQueueSize:        0, // Default int value
-					KVCacheUsagePercent:     0, // Default float64 value
-					KvCacheMaxTokenCapacity: 0, // Default int value
+					RunningQueueSize:    0, // Default int value
+					WaitingQueueSize:    0, // Default int value
+					KVCacheUsagePercent: 0, // Default float64 value
 				},
 			},
 			initialPodMetrics: &datastore.PodMetrics{
 				Pod: datastore.Pod{
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
-					ScrapePort:     9000,
-					ScrapePath:     "/metrics",
 				},
 				Metrics: datastore.Metrics{},
 			},
@@ -103,22 +93,17 @@ func TestPromToPodMetrics(t *testing.T) {
 				Pod: datastore.Pod{
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
-					ScrapePort:     9000,
-					ScrapePath:     "/metrics",
 				},
 				Metrics: datastore.Metrics{
-					RunningQueueSize:        1,    // from latest
-					WaitingQueueSize:        2,    // from latest
-					KVCacheUsagePercent:     0.5,  // used / max = 50 / 100  (from latest)
-					KvCacheMaxTokenCapacity: 5000, // max_blocks * tokens_per_block = 100 * 50 (from latest)
+					RunningQueueSize:    1,   // from latest
+					WaitingQueueSize:    2,   // from latest
+					KVCacheUsagePercent: 0.5, // used / max = 50 / 100  (from latest)
 				},
 			},
 			initialPodMetrics: &datastore.PodMetrics{
 				Pod: datastore.Pod{
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
-					ScrapePort:     9000,
-					ScrapePath:     "/metrics",
 				},
 				Metrics: datastore.Metrics{},
 			},
@@ -137,8 +122,6 @@ func TestPromToPodMetrics(t *testing.T) {
 				Pod: datastore.Pod{
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
-					ScrapePort:     9000,
-					ScrapePath:     "/metrics",
 				},
 				Metrics: datastore.Metrics{},
 			},
@@ -146,8 +129,6 @@ func TestPromToPodMetrics(t *testing.T) {
 				Pod: datastore.Pod{
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
-					ScrapePort:     9000,
-					ScrapePath:     "/metrics",
 				},
 				Metrics: datastore.Metrics{},
 			},
diff --git a/pkg/epp/datastore/types.go b/pkg/epp/datastore/types.go
new file mode 100644
index 000000000..8cfcf1d1f
--- /dev/null
+++ b/pkg/epp/datastore/types.go
@@ -0,0 +1,71 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package datastore is a library to interact with backend model servers such as probing metrics.
+package datastore
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/types"
+)
+
+type Pod struct {
+	NamespacedName types.NamespacedName
+	Address        string
+}
+
+type Metrics struct {
+	// ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU.
+	ActiveModels map[string]int
+	// MaxActiveModels is the maximum number of models that can be loaded to GPU.
+	MaxActiveModels         int
+	RunningQueueSize        int
+	WaitingQueueSize        int
+	KVCacheUsagePercent     float64
+	KvCacheMaxTokenCapacity int
+}
+
+type PodMetrics struct {
+	Pod
+	Metrics
+}
+
+func (pm *PodMetrics) String() string {
+	return fmt.Sprintf("Pod: %+v; Address: %+v; Metrics: %+v", pm.NamespacedName, pm.Address, pm.Metrics)
+}
+
+func (pm *PodMetrics) Clone() *PodMetrics {
+	cm := make(map[string]int, len(pm.ActiveModels))
+	for k, v := range pm.ActiveModels {
+		cm[k] = v
+	}
+	clone := &PodMetrics{
+		Pod: Pod{
+			NamespacedName: pm.NamespacedName,
+			Address:        pm.Address,
+		},
+		Metrics: Metrics{
+			ActiveModels:            cm,
+			MaxActiveModels:         pm.MaxActiveModels,
+			RunningQueueSize:        pm.RunningQueueSize,
+			WaitingQueueSize:        pm.WaitingQueueSize,
+			KVCacheUsagePercent:     pm.KVCacheUsagePercent,
+			KvCacheMaxTokenCapacity: pm.KvCacheMaxTokenCapacity,
+		},
+	}
+	return clone
+}

From 71e00adb67b5bbd7b288688f46e8a7cf76208561 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Thu, 6 Mar 2025 20:34:41 +0000
Subject: [PATCH 03/19] Finalize metric refactor and testing.

---
 config/manifests/{ => vllm}/ext_proc.yaml     |  2 +-
 .../manifests/{ => vllm}/inferencemodel.yaml  |  0
 pkg/epp/backend/triton/metrics_test.go        | 48 ++++++++++---------
 3 files changed, 27 insertions(+), 23 deletions(-)
 rename config/manifests/{ => vllm}/ext_proc.yaml (96%)
 rename config/manifests/{ => vllm}/inferencemodel.yaml (100%)

diff --git a/config/manifests/ext_proc.yaml b/config/manifests/vllm/ext_proc.yaml
similarity index 96%
rename from config/manifests/ext_proc.yaml
rename to config/manifests/vllm/ext_proc.yaml
index 33c47d400..bbd11b5c8 100644
--- a/config/manifests/ext_proc.yaml
+++ b/config/manifests/vllm/ext_proc.yaml
@@ -71,7 +71,7 @@ spec:
     spec:
       containers:
       - name: inference-gateway-ext-proc
-        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
+        image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/triton-test/epp_triton_metrics:latest
         imagePullPolicy: Always
         args:
         - -poolName
diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/vllm/inferencemodel.yaml
similarity index 100%
rename from config/manifests/inferencemodel.yaml
rename to config/manifests/vllm/inferencemodel.yaml
diff --git a/pkg/epp/backend/triton/metrics_test.go b/pkg/epp/backend/triton/metrics_test.go
index 4a1e2b578..931a6346a 100644
--- a/pkg/epp/backend/triton/metrics_test.go
+++ b/pkg/epp/backend/triton/metrics_test.go
@@ -49,6 +49,7 @@ func TestPromToPodMetrics(t *testing.T) {
 					Address:        podAddress,
 				},
 				Metrics: datastore.Metrics{
+					ActiveModels:        map[string]int{},
 					RunningQueueSize:    1,
 					WaitingQueueSize:    2,
 					KVCacheUsagePercent: 0.5, // used / max = 50 / 100
@@ -59,7 +60,9 @@ func TestPromToPodMetrics(t *testing.T) {
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
 				},
-				Metrics: datastore.Metrics{},
+				Metrics: datastore.Metrics{
+					ActiveModels: map[string]int{},
+				},
 			},
 			expectedErr: false,
 		},
@@ -72,6 +75,7 @@ func TestPromToPodMetrics(t *testing.T) {
 					Address:        podAddress,
 				},
 				Metrics: datastore.Metrics{
+					ActiveModels:        map[string]int{},
 					RunningQueueSize:    0, // Default int value
 					WaitingQueueSize:    0, // Default int value
 					KVCacheUsagePercent: 0, // Default float64 value
@@ -82,7 +86,9 @@ func TestPromToPodMetrics(t *testing.T) {
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
 				},
-				Metrics: datastore.Metrics{},
+				Metrics: datastore.Metrics{
+					ActiveModels: map[string]int{},
+				},
 			},
 			expectedErr: false,
 		},
@@ -95,6 +101,7 @@ func TestPromToPodMetrics(t *testing.T) {
 					Address:        podAddress,
 				},
 				Metrics: datastore.Metrics{
+					ActiveModels:        map[string]int{},
 					RunningQueueSize:    1,   // from latest
 					WaitingQueueSize:    2,   // from latest
 					KVCacheUsagePercent: 0.5, // used / max = 50 / 100  (from latest)
@@ -105,7 +112,9 @@ func TestPromToPodMetrics(t *testing.T) {
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
 				},
-				Metrics: datastore.Metrics{},
+				Metrics: datastore.Metrics{
+					ActiveModels: map[string]int{},
+				},
 			},
 			expectedErr: false,
 		},
@@ -118,21 +127,17 @@ func TestPromToPodMetrics(t *testing.T) {
 					Metric: []*dto.Metric{}, // Empty
 				},
 			},
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					NamespacedName: types.NamespacedName{Name: podName},
-					Address:        podAddress,
-				},
-				Metrics: datastore.Metrics{},
-			},
+			expectedMetrics: nil,
 			initialPodMetrics: &datastore.PodMetrics{
 				Pod: datastore.Pod{
 					NamespacedName: types.NamespacedName{Name: podName},
 					Address:        podAddress,
 				},
-				Metrics: datastore.Metrics{},
+				Metrics: datastore.Metrics{
+					ActiveModels: map[string]int{},
+				},
 			},
-			expectedErr: false,
+			expectedErr: true,
 		},
 	}
 
@@ -157,8 +162,8 @@ func allMetricsAvailable(podName string) map[string]*dto.MetricFamily {
 			Name: proto.String(TRTLLMRequestMetricsName),
 			Type: dto.MetricType_GAUGE.Enum(),
 			Metric: []*dto.Metric{
-				trtLlmRequestMetric("active", 1, 200),
-				trtLlmRequestMetric("scheduled", 2, 200),
+				trtLlmRequestMetric("active", 3, 200),
+				trtLlmRequestMetric("scheduled", 1, 200),
 			},
 		},
 		TRTLLMKvCacheMetricsName: {
@@ -179,23 +184,22 @@ func multipleMetricsWithDifferentTimestamps(podName string) map[string]*dto.Metr
 			Name: proto.String(TRTLLMRequestMetricsName),
 			Type: dto.MetricType_GAUGE.Enum(),
 			Metric: []*dto.Metric{
-				trtLlmRequestMetric("active", 0, 100),    // Older
-				trtLlmRequestMetric("scheduled", 3, 100), // Older
-				trtLlmRequestMetric("active", 1, 200),    // Newer
-				trtLlmRequestMetric("scheduled", 2, 200), // Newer
-
+				trtLlmRequestMetric("active", 3, 200),    // Newer
+				trtLlmRequestMetric("scheduled", 1, 200), // Newer
+				trtLlmRequestMetric("active", 3, 100),    // Older
+				trtLlmRequestMetric("scheduled", 0, 100), // Older
 			},
 		},
 		TRTLLMKvCacheMetricsName: {
 			Name: proto.String(TRTLLMKvCacheMetricsName),
 			Type: dto.MetricType_GAUGE.Enum(),
 			Metric: []*dto.Metric{
-				trtLlmKvCacheMetric("max", 110, 100),       //Older
-				trtLlmKvCacheMetric("used", 60, 100),       //Older
-				trtLlmKvCacheMetric("tokens_per", 40, 100), //Older
 				trtLlmKvCacheMetric("max", 100, 200),       // Newer
 				trtLlmKvCacheMetric("used", 50, 200),       // Newer
 				trtLlmKvCacheMetric("tokens_per", 50, 200), // Newer
+				trtLlmKvCacheMetric("max", 110, 100),       //Older
+				trtLlmKvCacheMetric("used", 60, 100),       //Older
+				trtLlmKvCacheMetric("tokens_per", 40, 100), //Older
 			},
 		},
 	}

From dd2825f2cf7c27bd643070ccbaefc942a087e6b7 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Thu, 6 Mar 2025 20:54:03 +0000
Subject: [PATCH 04/19] Set streaming env var to false in triton ext_proc.yaml

---
 config/manifests/triton/ext_proc.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/config/manifests/triton/ext_proc.yaml b/config/manifests/triton/ext_proc.yaml
index 16c802838..6797b7c78 100644
--- a/config/manifests/triton/ext_proc.yaml
+++ b/config/manifests/triton/ext_proc.yaml
@@ -90,6 +90,9 @@ spec:
         - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=used}"
         - -maxKVCacheBlocksMetric
         - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=max}"
+        env:
+        - name: USE_STREAMING
+          value: "false"
         ports:
         - containerPort: 9002
         - containerPort: 9003

From aa2ee06fe18c591aa284724689c4e6adcb9a3555 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Thu, 6 Mar 2025 21:40:24 +0000
Subject: [PATCH 05/19] Update triton server deployment to pull frozen repo
 branch instead of main for consistency.

---
 config/manifests/triton/deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/manifests/triton/deployment.yaml b/config/manifests/triton/deployment.yaml
index 61626293b..189ad90f2 100644
--- a/config/manifests/triton/deployment.yaml
+++ b/config/manifests/triton/deployment.yaml
@@ -41,7 +41,7 @@ spec:
             # Install python bindings for tritonserver and tritonfrontend
             pip install /opt/tritonserver/python/triton*.whl
             # Install application requirements
-            git clone https://github.com/triton-inference-server/server.git
+            git clone --depth 1 --branch v2.55.0 https://github.com/triton-inference-server/server.git
             cd server/python/openai/
             pip install -r requirements.txt
             pip install uvicorn

From d4c083e33398c1483b6ef1c5f3ee88f1186b8c42 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Thu, 6 Mar 2025 22:01:04 +0000
Subject: [PATCH 06/19] Remove model server specific metric files and tests and
 point EPP image to main AR instead of testing registry.

---
 config/manifests/triton/ext_proc.yaml  |   2 +-
 config/manifests/vllm/ext_proc.yaml    |   2 +-
 pkg/epp/backend/triton/metrics.go      | 265 -------------------------
 pkg/epp/backend/triton/metrics_test.go | 226 ---------------------
 4 files changed, 2 insertions(+), 493 deletions(-)
 delete mode 100644 pkg/epp/backend/triton/metrics.go
 delete mode 100644 pkg/epp/backend/triton/metrics_test.go

diff --git a/config/manifests/triton/ext_proc.yaml b/config/manifests/triton/ext_proc.yaml
index 6797b7c78..f61a7ec18 100644
--- a/config/manifests/triton/ext_proc.yaml
+++ b/config/manifests/triton/ext_proc.yaml
@@ -71,7 +71,7 @@ spec:
     spec:
       containers:
       - name: inference-gateway-ext-proc
-        image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/triton-test/epp_triton_metrics:latest
+        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
         imagePullPolicy: Always
         args:
         - -poolName
diff --git a/config/manifests/vllm/ext_proc.yaml b/config/manifests/vllm/ext_proc.yaml
index bbd11b5c8..33c47d400 100644
--- a/config/manifests/vllm/ext_proc.yaml
+++ b/config/manifests/vllm/ext_proc.yaml
@@ -71,7 +71,7 @@ spec:
     spec:
       containers:
       - name: inference-gateway-ext-proc
-        image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/triton-test/epp_triton_metrics:latest
+        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
         imagePullPolicy: Always
         args:
         - -poolName
diff --git a/pkg/epp/backend/triton/metrics.go b/pkg/epp/backend/triton/metrics.go
deleted file mode 100644
index f28b1c88b..000000000
--- a/pkg/epp/backend/triton/metrics.go
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
-Copyright 2025 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package triton
-
-import (
-	"context"
-	"fmt"
-	"net/http"
-	"strconv"
-	"strings"
-
-	"github.com/go-logr/logr"
-	dto "github.com/prometheus/client_model/go"
-	"github.com/prometheus/common/expfmt"
-	"go.uber.org/multierr"
-	"sigs.k8s.io/controller-runtime/pkg/log"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
-	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
-)
-
-const (
-	// Triton metrics, see https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/metrics.html
-
-	TRTLLMRequestMetricsName  = "nv_trt_llm_request_metrics"
-	TRTLLMKvCacheMetricsName  = "nv_trt_llm_kv_cache_block_metrics"
-	TRTLLMKvCacheMetricsLabel = "kv_cache_block_type"
-	TRTLLMRequestMetricsLabel = "request_type"
-)
-
-type PodMetricsClientImpl struct{}
-
-// FetchMetrics fetches metrics from a given pod.
-func (p *PodMetricsClientImpl) FetchMetrics(
-	ctx context.Context,
-	existing *datastore.PodMetrics,
-	port int32,
-) (*datastore.PodMetrics, error) {
-	logger := log.FromContext(ctx)
-	loggerDefault := logger.V(logutil.DEFAULT)
-
-	// existing.ScrapePort = 8002 // triton has a different port for metrics than the target port for inference
-	url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
-	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
-	// TODO print response and err
-
-	if err != nil {
-		loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
-		return nil, fmt.Errorf("failed to create request: %v", err)
-	}
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName)
-		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err)
-	}
-	defer func() {
-		_ = resp.Body.Close()
-	}()
-
-	if resp.StatusCode != http.StatusOK {
-		loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode)
-		return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode)
-	}
-
-	parser := expfmt.TextParser{}
-	metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
-	if err != nil {
-		return nil, err
-	}
-	return promToPodMetrics(logger, metricFamilies, existing)
-}
-
-// promToPodMetrics updates internal pod metrics with scraped Prometheus metrics.
-func promToPodMetrics(
-	logger logr.Logger,
-	metricFamilies map[string]*dto.MetricFamily,
-	existing *datastore.PodMetrics,
-) (*datastore.PodMetrics, error) {
-	var errs error
-	updated := existing.Clone()
-
-	//fmt.Print("\n\nDEBUG START\n###### DEBUG getting REQUEST metrics... ######")
-	// Get the "nv_trt_llm_request_metrics" metric family
-	//requestMetrics, err := getLatestMetric(logger, metricFamilies, TRTLLMRequestMetricsName)
-	requestMetrics, ok := metricFamilies[TRTLLMRequestMetricsName]
-	//errs = multierr.Append(errs, err)
-	if ok {
-		if scheduled, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "scheduled"); err == nil {
-			//fmt.Printf("\n###### DEBUG generation_requests: %+v", generation_requests)
-			updated.Metrics.RunningQueueSize = int(scheduled)
-			if active, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "active"); err == nil {
-				//fmt.Printf("\n###### DEBUG scheduled: %+v", scheduled)
-				updated.Metrics.WaitingQueueSize = int(active - scheduled)
-				// pendingMetrics, ok := metricFamilies["nv_inference_pending_request_count"]
-				// if ok {
-				// 	if queued, err := getTrtLlmGaugeMetric(logger, pendingMetrics, "model", "ensemble"); err == nil {
-				// 		fmt.Printf("\n###### DEBUG queued requests: %+v", int(queued))
-				// 	}
-				// }
-				//fmt.Printf("\n###### DEBUG active (total) requests: %+v", int(active))
-				//fmt.Printf("\n###### DEBUG waiting requests: %+v", int(active-scheduled))
-				//fmt.Printf("\n###### DEBUG running requests: %+v", int(scheduled))
-			} else {
-				errs = multierr.Append(errs, err)
-			}
-		} else {
-			errs = multierr.Append(errs, err)
-		}
-	}
-
-	//fmt.Print("\n\n###### DEBUG getting KVBLOCK metrics... ######")
-	// Get the "nv_trt_llm_kv_cache_block_metrics" metric family
-	kvCacheBlocks, ok := metricFamilies[TRTLLMKvCacheMetricsName]
-	// errs = multierr.Append(errs, err)
-	// fmt.Printf("###### DEBUG (should be nil) getLatestMetric errs: %+v", errs)
-	if ok {
-		// Calculate the kv-cache usage from the max and used metrics
-		if max, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "max"); err == nil {
-			//fmt.Printf("\n###### DEBUG max: %+v", max)
-			if used, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "used"); err == nil {
-				//fmt.Printf("\n###### DEBUG used: %+v", used)
-				usage := 0.0
-				if max > 0 {
-					usage = used / max
-				}
-				updated.Metrics.KVCacheUsagePercent = usage
-			} else {
-				errs = multierr.Append(errs, err)
-			}
-		} else {
-			errs = multierr.Append(errs, err)
-		}
-	}
-
-	//fmt.Printf("\n### DEBUG: %+v", updated)
-	//fmt.Printf("\n###### DEBUG ERRORS: %+v", errs)
-
-	return updated, errs
-}
-
-// getLatestMetric gets the latest metric of a family.
-func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.MetricFamily, error) {
-	mf, ok := metricFamilies[metricName]
-	if !ok {
-		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", metricName)
-		return nil, fmt.Errorf("metric family %q not found", metricName)
-	}
-	if len(mf.GetMetric()) == 0 {
-		return nil, fmt.Errorf("no metrics available for %q", metricName)
-	}
-
-	var latestTs int64
-	var latestMf *dto.MetricFamily
-	for _, m := range mf.GetMetric() {
-		if m.GetTimestampMs() >= latestTs {
-			latestTs = m.GetTimestampMs()
-			latestMf = &dto.MetricFamily{
-				Name:   mf.Name,
-				Help:   mf.Help,
-				Type:   mf.Type,
-				Metric: []*dto.Metric{m},
-			}
-		}
-	}
-
-	logger.V(logutil.TRACE).Info("Metric value selected", "metric Family", latestMf, "metric", metricName)
-	return latestMf, nil
-}
-
-// getGaugeMetricForPod gets gauge metric value for a given pod.
-func getGaugeMetricForPod(logger logr.Logger, mf *dto.MetricFamily, podIdentifier string) (float64, error) {
-	for _, m := range mf.GetMetric() {
-		for _, label := range m.GetLabel() {
-			if (label.GetName() == "pod" || label.GetName() == "gpu_uuid") && strings.Contains(label.GetValue(), podIdentifier) {
-				logger.V(logutil.TRACE).Info("Pod metric found", "value", m.GetGauge().GetValue(), "labelName", label.GetName(), "labelValue", label.GetValue())
-
-				return m.GetGauge().GetValue(), nil // Return the value with nil error
-			}
-		}
-	}
-	logger.V(logutil.TRACE).Info("Metric Value not found for pod", "pod", podIdentifier, "metric family", mf.GetName())
-	return -1, fmt.Errorf("metric value not found for pod %s in metric family %s", podIdentifier, mf.GetName()) // Return an error
-}
-
-// getCounterMetricForPod gets counter metric value for a given pod.
-func getCounterMetricForPod(logger logr.Logger, mf *dto.MetricFamily, podName string) (int, error) {
-	for _, m := range mf.GetMetric() {
-		for _, label := range m.GetLabel() {
-			if label.GetName() == "pod" && label.GetValue() == podName {
-				val := m.GetCounter().GetValue()
-				intVal, err := strconv.Atoi(fmt.Sprintf("%v", val)) // Convert float64 to int
-				if err != nil {
-					return -1, fmt.Errorf("failed to convert counter metric to int: %w", err)
-				}
-				logger.V(logutil.TRACE).Info("Pod metric found", "value", intVal)
-
-				return intVal, nil
-			}
-		}
-	}
-	return -1, nil
-}
-
-// TRTLLM metrics
-
-// getTrtLlmMetric gets a TRT LLM metric with the specified type, key, and value.
-func getTrtLlmMetric(logger logr.Logger, mf *dto.MetricFamily, metricType dto.MetricType, key, value string) (float64, error) {
-	//fmt.Printf("###### DEBUG START GETTRTMERTIC: %+v", mf.GetMetric())
-	//fmt.Printf("###### DEBUG METRICS: %+v", len(mf.GetMetric()))
-	for _, m := range mf.GetMetric() {
-		//fmt.Printf("###### DEBUG ANALYZING METRIC: %+v", m)
-		//fmt.Printf("###### DEBUG TIMESTAMP: %+v", m.GetTimestampMs())
-		foundKey := false
-		foundValue := false
-		//fmt.Printf("###### DEBUG LABELS: %+v", m.GetLabel())
-		for _, label := range m.GetLabel() {
-			//fmt.Printf("###### DEBUG COMPARING label NAME %+v == %+v and label VALUE %+v == %+v", label.GetName(), key, label.GetValue(), value)
-			if label.GetName() == key && label.GetValue() == value {
-				foundKey = true
-			}
-			if mf.GetType() == metricType {
-				foundValue = true
-			}
-		}
-		if foundKey && foundValue {
-			//fmt.Printf("###### DEBUG METRIC FOUND: %+v", m)
-			if metricType == dto.MetricType_GAUGE {
-				logger.V(logutil.TRACE).Info("TRT LLM gauge metric found", "value", m.GetGauge().GetValue(), "key", key, "value", value)
-				return m.GetGauge().GetValue(), nil
-			} else if metricType == dto.MetricType_COUNTER {
-				val := m.GetCounter().GetValue()
-				intVal, err := strconv.Atoi(fmt.Sprintf("%v", val))
-				if err != nil {
-					return -1, fmt.Errorf("failed to convert counter metric to int: %w", err)
-				}
-				logger.V(logutil.TRACE).Info("TRT LLM counter metric found", "value", intVal, "key", key, "value", value)
-				return float64(intVal), nil
-			}
-		}
-	}
-	return -1, fmt.Errorf("TRT LLM metric not found: %s{ %s=\"%s\" }", mf.GetName(), key, value)
-}
-
-// getTrtLlmGaugeMetric gets a gauge TRT LLM metric.
-func getTrtLlmGaugeMetric(logger logr.Logger, mf *dto.MetricFamily, key, value string) (float64, error) {
-	return getTrtLlmMetric(logger, mf, dto.MetricType_GAUGE, key, value)
-}
-
-// getTrtLlmCounterMetric gets a counter TRT LLM metric.
-func getTrtLlmCounterMetric(logger logr.Logger, mf *dto.MetricFamily, key, value string) (float64, error) {
-	return getTrtLlmMetric(logger, mf, dto.MetricType_COUNTER, key, value)
-}
diff --git a/pkg/epp/backend/triton/metrics_test.go b/pkg/epp/backend/triton/metrics_test.go
deleted file mode 100644
index 931a6346a..000000000
--- a/pkg/epp/backend/triton/metrics_test.go
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
-Copyright 2025 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package triton
-
-import (
-	"testing"
-
-	dto "github.com/prometheus/client_model/go"
-	"github.com/stretchr/testify/assert"
-	"google.golang.org/protobuf/proto"
-	"k8s.io/apimachinery/pkg/types"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
-	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
-)
-
-func TestPromToPodMetrics(t *testing.T) {
-	logger := logutil.NewTestLogger()
-
-	podName := "test-pod"
-	podAddress := "10.0.0.1"
-
-	testCases := []struct {
-		name              string
-		metricFamilies    map[string]*dto.MetricFamily
-		expectedMetrics   *datastore.PodMetrics
-		expectedErr       bool
-		initialPodMetrics *datastore.PodMetrics
-	}{
-		{
-			name:           "all metrics available",
-			metricFamilies: allMetricsAvailable(podName),
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					NamespacedName: types.NamespacedName{Name: podName},
-					Address:        podAddress,
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels:        map[string]int{},
-					RunningQueueSize:    1,
-					WaitingQueueSize:    2,
-					KVCacheUsagePercent: 0.5, // used / max = 50 / 100
-				},
-			},
-			initialPodMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					NamespacedName: types.NamespacedName{Name: podName},
-					Address:        podAddress,
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels: map[string]int{},
-				},
-			},
-			expectedErr: false,
-		},
-		{
-			name:           "missing metrics",
-			metricFamilies: map[string]*dto.MetricFamily{}, // No metrics provided
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					NamespacedName: types.NamespacedName{Name: podName},
-					Address:        podAddress,
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels:        map[string]int{},
-					RunningQueueSize:    0, // Default int value
-					WaitingQueueSize:    0, // Default int value
-					KVCacheUsagePercent: 0, // Default float64 value
-				},
-			},
-			initialPodMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					NamespacedName: types.NamespacedName{Name: podName},
-					Address:        podAddress,
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels: map[string]int{},
-				},
-			},
-			expectedErr: false,
-		},
-		{
-			name:           "multiple timestamps",
-			metricFamilies: multipleMetricsWithDifferentTimestamps(podName),
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					NamespacedName: types.NamespacedName{Name: podName},
-					Address:        podAddress,
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels:        map[string]int{},
-					RunningQueueSize:    1,   // from latest
-					WaitingQueueSize:    2,   // from latest
-					KVCacheUsagePercent: 0.5, // used / max = 50 / 100  (from latest)
-				},
-			},
-			initialPodMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					NamespacedName: types.NamespacedName{Name: podName},
-					Address:        podAddress,
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels: map[string]int{},
-				},
-			},
-			expectedErr: false,
-		},
-		{
-			name: "empty metric family",
-			metricFamilies: map[string]*dto.MetricFamily{
-				TRTLLMRequestMetricsName: {
-					Name:   proto.String(TRTLLMRequestMetricsName),
-					Type:   dto.MetricType_GAUGE.Enum(),
-					Metric: []*dto.Metric{}, // Empty
-				},
-			},
-			expectedMetrics: nil,
-			initialPodMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					NamespacedName: types.NamespacedName{Name: podName},
-					Address:        podAddress,
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels: map[string]int{},
-				},
-			},
-			expectedErr: true,
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			updated, err := promToPodMetrics(logger, tc.metricFamilies, tc.initialPodMetrics)
-			if tc.expectedErr {
-				assert.Error(t, err)
-			} else {
-				assert.NoError(t, err)
-				assert.Equal(t, tc.expectedMetrics, updated)
-			}
-		})
-	}
-}
-
-// --- Helper Functions ---
-
-func allMetricsAvailable(podName string) map[string]*dto.MetricFamily {
-	return map[string]*dto.MetricFamily{
-		TRTLLMRequestMetricsName: {
-			Name: proto.String(TRTLLMRequestMetricsName),
-			Type: dto.MetricType_GAUGE.Enum(),
-			Metric: []*dto.Metric{
-				trtLlmRequestMetric("active", 3, 200),
-				trtLlmRequestMetric("scheduled", 1, 200),
-			},
-		},
-		TRTLLMKvCacheMetricsName: {
-			Name: proto.String(TRTLLMKvCacheMetricsName),
-			Type: dto.MetricType_GAUGE.Enum(),
-			Metric: []*dto.Metric{
-				trtLlmKvCacheMetric("max", 100, 200),
-				trtLlmKvCacheMetric("used", 50, 200),
-				trtLlmKvCacheMetric("tokens_per", 50, 200),
-			},
-		},
-	}
-}
-
-func multipleMetricsWithDifferentTimestamps(podName string) map[string]*dto.MetricFamily {
-	return map[string]*dto.MetricFamily{
-		TRTLLMRequestMetricsName: {
-			Name: proto.String(TRTLLMRequestMetricsName),
-			Type: dto.MetricType_GAUGE.Enum(),
-			Metric: []*dto.Metric{
-				trtLlmRequestMetric("active", 3, 200),    // Newer
-				trtLlmRequestMetric("scheduled", 1, 200), // Newer
-				trtLlmRequestMetric("active", 3, 100),    // Older
-				trtLlmRequestMetric("scheduled", 0, 100), // Older
-			},
-		},
-		TRTLLMKvCacheMetricsName: {
-			Name: proto.String(TRTLLMKvCacheMetricsName),
-			Type: dto.MetricType_GAUGE.Enum(),
-			Metric: []*dto.Metric{
-				trtLlmKvCacheMetric("max", 100, 200),       // Newer
-				trtLlmKvCacheMetric("used", 50, 200),       // Newer
-				trtLlmKvCacheMetric("tokens_per", 50, 200), // Newer
-				trtLlmKvCacheMetric("max", 110, 100),       //Older
-				trtLlmKvCacheMetric("used", 60, 100),       //Older
-				trtLlmKvCacheMetric("tokens_per", 40, 100), //Older
-			},
-		},
-	}
-}
-
-func trtLlmRequestMetric(requestType string, value float64, timestampMs int64) *dto.Metric {
-	return &dto.Metric{
-		Label: []*dto.LabelPair{
-			{Name: proto.String(TRTLLMRequestMetricsLabel), Value: proto.String(requestType)},
-		},
-		Gauge:       &dto.Gauge{Value: &value},
-		TimestampMs: &timestampMs,
-	}
-}
-
-func trtLlmKvCacheMetric(blockType string, value float64, timestampMs int64) *dto.Metric {
-	return &dto.Metric{
-		Label: []*dto.LabelPair{
-			{Name: proto.String(TRTLLMKvCacheMetricsLabel), Value: proto.String(blockType)},
-		},
-		Gauge:       &dto.Gauge{Value: &value},
-		TimestampMs: &timestampMs,
-	}
-}

From df3f3e3ac7f0ac0bb0702b10a291e362c14806d7 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 7 Mar 2025 00:23:03 +0000
Subject: [PATCH 07/19] Remove commented prints and old comments.

---
 cmd/epp/main.go                 | 5 +++--
 pkg/epp/backend/metrics_spec.go | 6 +++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index c5264b823..40f80b39a 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -156,7 +156,8 @@ func run() error {
 	pmf := backendmetrics.NewPodMetricsFactory(&vllm.PodMetricsClientImpl{}, *refreshMetricsInterval)
 	// Setup runner.
 	datastore := datastore.NewDatastore(ctx, pmf)
-	// switch case across different model server metrics (triton, vllm)
+
+	// Set up mapper for metric scraping.
 	mapping, err := backend.NewMetricMapping(
 		*allRequestsMetric,
 		*waitingRequestsMetric,
@@ -167,7 +168,7 @@ func run() error {
 		*loraRequestInfoMetric,
 	)
 	if err != nil {
-		setupLog.Error(err, "Failed to create metric mapping from flags")
+		setupLog.Error(err, "Failed to create metric mapping from flags.")
 		return err
 	}
 	provider := backend.NewProvider(&backend.PodMetricsClientImpl{MetricMapping: mapping}, datastore)
diff --git a/pkg/epp/backend/metrics_spec.go b/pkg/epp/backend/metrics_spec.go
index aabcf9835..9cd194db1 100644
--- a/pkg/epp/backend/metrics_spec.go
+++ b/pkg/epp/backend/metrics_spec.go
@@ -32,9 +32,9 @@ type MetricMapping struct {
 	AllRequests       *MetricSpec // Option 1
 	WaitingRequests   *MetricSpec // Option 2
 	RunningRequests   *MetricSpec // Required
-	UsedKVCacheBlocks *MetricSpec // Optional (part of a group)
-	MaxKVCacheBlocks  *MetricSpec // Optional (part of a group)
-	KVCacheUsage      *MetricSpec // Optional (alternative to the group above)
+	UsedKVCacheBlocks *MetricSpec // Option 1 (part of a group)
+	MaxKVCacheBlocks  *MetricSpec // Option 1 (part of a group)
+	KVCacheUsage      *MetricSpec // Option 2 (alternative to the group above)
 	// LoRA Metrics (vLLM Specific, optional)
 	LoraRequestInfo *MetricSpec
 }

From 558132e021ad7d6f6cd1dfaa82232c4b1717efcc Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 7 Mar 2025 19:48:56 +0000
Subject: [PATCH 08/19] Remove triton support for now, make metrics mapping
 1-to-1 with load balancing metrics.

---
 cmd/epp/main.go                             |  22 ++-
 config/manifests/triton/deployment.yaml     | 100 -----------
 config/manifests/triton/ext_proc.yaml       | 126 --------------
 config/manifests/triton/inferencemodel.yaml |   9 -
 config/manifests/triton/triton-set-up.yaml  | 111 -------------
 config/manifests/vllm/ext_proc.yaml         |   8 -
 pkg/epp/backend/metrics.go                  |  44 +----
 pkg/epp/backend/metrics_spec.go             |  65 +-------
 pkg/epp/backend/metrics_spec_test.go        | 109 ------------
 pkg/epp/backend/metrics_test.go             | 174 ++------------------
 pkg/epp/datastore/types.go                  |  18 +-
 11 files changed, 42 insertions(+), 744 deletions(-)
 delete mode 100644 config/manifests/triton/deployment.yaml
 delete mode 100644 config/manifests/triton/ext_proc.yaml
 delete mode 100644 config/manifests/triton/inferencemodel.yaml
 delete mode 100644 config/manifests/triton/triton-set-up.yaml

diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index 40f80b39a..f3e0b6571 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -94,14 +94,16 @@ var (
 			"are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+
 			"then a self-signed certificate is used.")
 	// metric flags
-	allRequestsMetric       = flag.String("allRequestsMetric", "", "Prometheus metric for the total number of processing requests, both queued and running.")
-	waitingRequestsMetric   = flag.String("waitingRequestsMetric", "", "Prometheus metric for the number of queued requests.")
-	runningRequestsMetric   = flag.String("runningRequestsMetric", "", "Prometheus metric for the number of running requests.")
-	usedKVCacheBlocksMetric = flag.String("usedKVCacheBlocksMetric", "", "Prometheus metric for the number of utilized KV-cache blocks.")
-	maxKVCacheBlocksMetric  = flag.String("maxKVCacheBlocksMetric", "", "Prometheus metric for the total number of available KV-cache blocks.")
-	kVCacheUsageMetric      = flag.String("kVCacheUsageMetric", "", "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).")
+	totalQueuedRequestMetric = flag.String("totalQueuedRequestMetric",
+		"vllm:num_requests_waiting",
+		"Prometheus metric for the number of queued requests.")
+	kVCacheUsageMetric = flag.String("kVCacheUsageMetric",
+		"vllm:gpu_cache_usage_perc",
+		"Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).")
 	// LoRA metrics
-	loraRequestInfoMetric = flag.String("loraRequestInfoMetric", "", "Prometheus metric for the LoRA info metrics (must be in vLLM label format).")
+	loraRequestInfoMetric = flag.String("loraRequestInfoMetric",
+		"vllm:lora_requests_info",
+		"Prometheus metric for the LoRA info metrics (must be in vLLM label format).")
 
 	setupLog = ctrl.Log.WithName("setup")
 )
@@ -159,11 +161,7 @@ func run() error {
 
 	// Set up mapper for metric scraping.
 	mapping, err := backend.NewMetricMapping(
-		*allRequestsMetric,
-		*waitingRequestsMetric,
-		*runningRequestsMetric,
-		*usedKVCacheBlocksMetric,
-		*maxKVCacheBlocksMetric,
+		*totalQueuedRequestMetric,
 		*kVCacheUsageMetric,
 		*loraRequestInfoMetric,
 	)
diff --git a/config/manifests/triton/deployment.yaml b/config/manifests/triton/deployment.yaml
deleted file mode 100644
index 189ad90f2..000000000
--- a/config/manifests/triton/deployment.yaml
+++ /dev/null
@@ -1,100 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llama-triton-deployment
-spec:
-  replicas: 1  # Start with 1 replica.  Adjust as needed.
-  selector:
-    matchLabels:
-      app: llama-triton  # This MUST match the labels in the template
-  template:
-    metadata:
-      labels:
-        app: llama-triton
-    spec:
-      containers:
-      - name: triton-server
-        image: nvcr.io/nvidia/tritonserver:25.01-trtllm-python-py3  # Use base Triton image
-        imagePullPolicy: IfNotPresent
-        command: ["/bin/bash", "-c"]
-        args:
-          - |
-            set -e
-            apt-get update && apt-get install -y python3.12-venv
-
-            # Create and activate a virtual environment
-            python3 -m venv /opt/venv
-            source /opt/venv/bin/activate
-            pip install SentencePiece
-            pip install packaging
-            pip install numpy
-            pip install torch
-            pip install requests
-            pip install transformers
-            pip install pillow
-            
-            # Use launch_triton_server.py
-            # python3 /models/tensorrtllm_backend/scripts/launch_triton_server.py --world_size 1 --model_repo /models/tensorrtllm_backend/llama_ifb
-            # tail -f /dev/null
-
-            # Launch OpenAI completetions endpoint
-            # Install python bindings for tritonserver and tritonfrontend
-            pip install /opt/tritonserver/python/triton*.whl
-            # Install application requirements
-            git clone --depth 1 --branch v2.55.0 https://github.com/triton-inference-server/server.git
-            cd server/python/openai/
-            pip install -r requirements.txt
-            pip install uvicorn
-            pip install -U huggingface_hub
-            huggingface-cli login --token $(cat /secrets/huggingface/token) --add-to-git-credential
-          
-            python3 openai_frontend/main.py --model-repository /models/tensorrtllm_backend/llama_ifb --tokenizer meta-llama/Llama-2-7b-chat-hf
-        ports:
-        - containerPort: 9000
-          name: http
-        - containerPort: 9001
-          name: grpc
-        - containerPort: 9002
-          name: metrics
-        volumeMounts:
-        - mountPath: /models
-          name: model-volume
-        - mountPath: /secrets/huggingface
-          name: huggingface-secret
-          readOnly: true
-        resources:
-          limits:
-            ephemeral-storage: 40Gi
-            nvidia.com/gpu: 1
-            memory: 40Gi
-          requests:
-            ephemeral-storage: 40Gi
-            memory: 40Gi
-            nvidia.com/gpu: 1
-      volumes:
-      - name: model-volume
-        persistentVolumeClaim:
-          claimName: llama-model-pvc
-      - name: huggingface-secret
-        secret:
-          secretName: hf-token
-
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llama-triton-service
-spec:
-  type: ClusterIP
-  ports:
-    - port: 9000
-      targetPort: http
-      name: http-inference-server
-    - port: 9001
-      targetPort: grpc
-      name: grpc-inference-server
-    - port: 9002
-      targetPort: metrics
-      name: http-metrics
-  selector:
-    app: llama-triton
diff --git a/config/manifests/triton/ext_proc.yaml b/config/manifests/triton/ext_proc.yaml
deleted file mode 100644
index f61a7ec18..000000000
--- a/config/manifests/triton/ext_proc.yaml
+++ /dev/null
@@ -1,126 +0,0 @@
-kind: ClusterRole
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: pod-read
-rules:
-- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencemodels"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: [""]
-  resources: ["pods"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencepools"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: ["discovery.k8s.io"]
-  resources: ["endpointslices"]
-  verbs: ["get", "watch", "list"]
-- apiGroups:
-  - authentication.k8s.io
-  resources:
-  - tokenreviews
-  verbs:
-  - create
-- apiGroups:
-  - authorization.k8s.io
-  resources:
-  - subjectaccessreviews
-  verbs:
-  - create
---- 
-kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: pod-read-binding
-subjects:
-- kind: ServiceAccount
-  name: default
-  namespace: default
-roleRef:
-  kind: ClusterRole
-  name: pod-read
----
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferencePool
-metadata:
-  labels:
-  name: triton-llama2-7b-pool
-spec:
-  targetPortNumber: 9000
-  selector:
-    app: llama-triton
-  extensionRef:
-    name: inference-gateway-ext-proc
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: inference-gateway-ext-proc
-  namespace: default
-  labels:
-    app: inference-gateway-ext-proc
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: inference-gateway-ext-proc
-  template:
-    metadata:
-      labels:
-        app: inference-gateway-ext-proc
-    spec:
-      containers:
-      - name: inference-gateway-ext-proc
-        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
-        imagePullPolicy: Always
-        args:
-        - -poolName
-        - "triton-llama2-7b-pool"
-        - -v
-        - "3"
-        - -grpcPort
-        - "9002"
-        - -grpcHealthPort
-        - "9003"
-        - -allRequestsMetric
-        - "nv_trt_llm_request_metrics{request_type=active}"
-        - -runningRequestsMetric
-        - "nv_trt_llm_request_metrics{request_type=scheduled}"
-        - -usedKVCacheBlocksMetric
-        - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=used}"
-        - -maxKVCacheBlocksMetric
-        - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=max}"
-        env:
-        - name: USE_STREAMING
-          value: "false"
-        ports:
-        - containerPort: 9002
-        - containerPort: 9003
-        - name: metrics
-          containerPort: 9090
-        livenessProbe:
-          grpc:
-            port: 9003
-            service: inference-extension
-          initialDelaySeconds: 5
-          periodSeconds: 10
-        readinessProbe:
-          grpc:
-            port: 9003
-            service: inference-extension
-          initialDelaySeconds: 5
-          periodSeconds: 10
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: inference-gateway-ext-proc
-  namespace: default
-spec:
-  selector:
-    app: inference-gateway-ext-proc
-  ports:
-    - protocol: TCP
-      port: 9002
-      targetPort: 9002
-  type: ClusterIP
diff --git a/config/manifests/triton/inferencemodel.yaml b/config/manifests/triton/inferencemodel.yaml
deleted file mode 100644
index db643a85c..000000000
--- a/config/manifests/triton/inferencemodel.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferenceModel
-metadata:
-  name: triton-llama2-7b-model
-spec:
-  modelName: ensemble
-  criticality: Standard
-  poolRef:
-    name: triton-llama2-7b-pool
diff --git a/config/manifests/triton/triton-set-up.yaml b/config/manifests/triton/triton-set-up.yaml
deleted file mode 100644
index 08fa0852c..000000000
--- a/config/manifests/triton/triton-set-up.yaml
+++ /dev/null
@@ -1,111 +0,0 @@
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: llama-model-pvc
-spec:
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 200Gi
-
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: llama-build-job
-spec:
-  backoffLimit: 0
-  template:
-    metadata:
-      labels:
-        app: llama-triton
-    spec:
-      containers:
-      - name: llama-builder
-        image: nvcr.io/nvidia/tritonserver:25.02-trtllm-python-py3 # Use the base Triton image directly
-        command: ["/bin/bash", "-c"]
-        args:
-          - |
-            set -e  # Exit on error
-
-            apt-get update && apt-get install -y python3.12-venv
-
-            # Create and activate a virtual environment
-            python3 -m venv /opt/venv
-            source /opt/venv/bin/activate
-
-            # Install git (it might not be in the base image)
-            apt-get update && apt-get install -y --no-install-recommends git
-
-            # Clone the tensorrt_llm_backend repository and set up submodule
-            git clone -b triton-llm/v0.17.0 https://github.com/triton-inference-server/tensorrtllm_backend.git /models/tensorrtllm_backend
-            cd /models/tensorrtllm_backend
-            git lfs install
-            git submodule update --init --recursive
-
-            # --- Hugging Face Setup ---
-            # 1. Install the Hugging Face CLI
-            pip install -U huggingface_hub
-            pip install transformers
-            pip install --extra-index-url https://pypi.nvidia.com/ tensorrt-llm
-            pip install tensorrt_llm
-
-            # 2. Log in using the token from the secret
-            #    The secret is mounted as a file.
-            huggingface-cli login --token $(cat /secrets/huggingface/token) --add-to-git-credential
-            huggingface-cli download meta-llama/Llama-2-7b-hf --local-dir /models/hf_models/
-
-            # Download and convert the Hugging Face model.  Modify parameters as needed.
-            export HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('meta-llama/Llama-2-7b-hf', filename='config.json', local_dir='/models/hf_models/')).parent)"`
-            echo PATH TO LLAMA MODEL: $HF_LLAMA_MODEL
-            export UNIFIED_CKPT_PATH=/models/tmp/ckpt/llama/7b/
-            export ENGINE_PATH=/models/tmp/engines/llama/7b/
-            export TRTLLM_MODEL_REPO=/models/tensorrtllm_backend/llama_ifb
-            python3 /models/tensorrtllm_backend/tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \
-                     --output_dir ${UNIFIED_CKPT_PATH} \
-                     --dtype float16
-
-            # Build the TensorRT-LLM engine.  Adjust parameters (e.g., world_size) as needed.
-            trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
-                          --output_dir ${ENGINE_PATH} \
-                          --gemm_plugin float16 \
-                          --kv_cache_type paged \
-                          --context_fmha enable \
-                          --gpt_attention_plugin float16 \
-                          --remove_input_padding enable \
-                          --max_batch_size 64
-
-            cp /models/tensorrtllm_backend/all_models/inflight_batcher_llm/ ${TRTLLM_MODEL_REPO} -r
-
-            python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
-            python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
-            python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
-            python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
-            python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
-
-
-            echo "Build complete!"
-        volumeMounts:
-        - mountPath: /models
-          name: model-volume
-        - mountPath: /secrets/huggingface
-          name: huggingface-secret
-          readOnly: true
-        resources:
-          limits:
-            ephemeral-storage: 80Gi
-            nvidia.com/gpu: 1
-            memory: 40Gi
-          requests:
-            ephemeral-storage: 80Gi
-            nvidia.com/gpu: 1
-            memory: 40Gi
-      restartPolicy: Never
-      volumes:
-      - name: model-volume
-        persistentVolumeClaim:
-          claimName: llama-model-pvc
-      - name: huggingface-secret
-        secret:
-          secretName: hf-token
diff --git a/config/manifests/vllm/ext_proc.yaml b/config/manifests/vllm/ext_proc.yaml
index 33c47d400..d70467ee0 100644
--- a/config/manifests/vllm/ext_proc.yaml
+++ b/config/manifests/vllm/ext_proc.yaml
@@ -82,14 +82,6 @@ spec:
         - "9002"
         - -grpcHealthPort
         - "9003"
-        - -waitingRequestsMetric
-        - "vllm:num_requests_waiting"
-        - -runningRequestsMetric
-        - "vllm:num_requests_running"
-        - -kVCacheUsageMetric
-        - "vllm:gpu_cache_usage_perc"
-        - -loraRequestInfoMetric
-        - "vllm:lora_requests_info"
         env:
         - name: USE_STREAMING
           value: "false"
diff --git a/pkg/epp/backend/metrics.go b/pkg/epp/backend/metrics.go
index 2f2082652..edc4b6e80 100644
--- a/pkg/epp/backend/metrics.go
+++ b/pkg/epp/backend/metrics.go
@@ -91,56 +91,22 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
 	var errs error
 	updated := existing.Clone()
 
-	if p.MetricMapping.RunningRequests != nil {
-		running, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.RunningRequests)
+	if p.MetricMapping.TotalQueuedRequests != nil {
+		queued, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.TotalQueuedRequests)
 		if err == nil {
-			updated.RunningQueueSize = int(running.GetGauge().GetValue())
+			updated.WaitingQueueSize = int(queued.GetGauge().GetValue())
 		} else {
 			errs = multierr.Append(errs, err)
 		}
 	}
 
-	if p.MetricMapping.AllRequests != nil {
-		all, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.AllRequests)
-		if err == nil {
-			updated.WaitingQueueSize = int(all.GetGauge().GetValue()) - updated.RunningQueueSize
-		} else {
-			errs = multierr.Append(errs, err)
-		}
-	}
-
-	if p.MetricMapping.WaitingRequests != nil {
-		waiting, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.WaitingRequests)
-		if err == nil {
-			updated.WaitingQueueSize = int(waiting.GetGauge().GetValue())
-		} else {
-			errs = multierr.Append(errs, err)
-		}
-	}
-
-	if p.MetricMapping.KVCacheUsage != nil {
-		usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUsage)
+	if p.MetricMapping.KVCacheUtilization != nil {
+		usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUtilization)
 		if err == nil {
 			updated.KVCacheUsagePercent = usage.GetGauge().GetValue()
 		} else {
 			errs = multierr.Append(errs, err)
 		}
-	} else if p.MetricMapping.UsedKVCacheBlocks != nil && p.MetricMapping.MaxKVCacheBlocks != nil {
-		used, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.UsedKVCacheBlocks)
-		if err != nil {
-			errs = multierr.Append(errs, err)
-		}
-		max, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.MaxKVCacheBlocks)
-		if err != nil {
-			errs = multierr.Append(errs, err)
-		}
-		if err == nil {
-			usage := 0.0
-			if max.GetGauge().GetValue() > 0 {
-				usage = used.GetGauge().GetValue() / max.GetGauge().GetValue()
-			}
-			updated.KVCacheUsagePercent = usage
-		}
 	}
 
 	// Handle LoRA metrics (only if all LoRA MetricSpecs are present)
diff --git a/pkg/epp/backend/metrics_spec.go b/pkg/epp/backend/metrics_spec.go
index 9cd194db1..7ce2f5d60 100644
--- a/pkg/epp/backend/metrics_spec.go
+++ b/pkg/epp/backend/metrics_spec.go
@@ -29,14 +29,9 @@ type MetricSpec struct {
 
 // MetricMapping holds named MetricSpecs.
 type MetricMapping struct {
-	AllRequests       *MetricSpec // Option 1
-	WaitingRequests   *MetricSpec // Option 2
-	RunningRequests   *MetricSpec // Required
-	UsedKVCacheBlocks *MetricSpec // Option 1 (part of a group)
-	MaxKVCacheBlocks  *MetricSpec // Option 1 (part of a group)
-	KVCacheUsage      *MetricSpec // Option 2 (alternative to the group above)
-	// LoRA Metrics (vLLM Specific, optional)
-	LoraRequestInfo *MetricSpec
+	TotalQueuedRequests *MetricSpec
+	KVCacheUtilization  *MetricSpec
+	LoraRequestInfo     *MetricSpec
 }
 
 // stringToMetricSpec converts a string to a MetricSpec.
@@ -99,28 +94,12 @@ func stringToMetricSpec(specStr string) (*MetricSpec, error) {
 }
 
 // NewMetricMapping creates a MetricMapping from string values.
-func NewMetricMapping(allStr, waitingStr, runningStr, usedBlocksStr, maxBlocksStr, usageStr, loraReqInfoStr string) (*MetricMapping, error) {
-	allSpec, err := stringToMetricSpec(allStr)
-	if err != nil {
-		return nil, fmt.Errorf("error parsing AllRequests: %w", err)
-	}
-	waitingSpec, err := stringToMetricSpec(waitingStr)
+func NewMetricMapping(queuedStr, kvUsageStr, loraReqInfoStr string) (*MetricMapping, error) {
+	queuedSpec, err := stringToMetricSpec(queuedStr)
 	if err != nil {
 		return nil, fmt.Errorf("error parsing WaitingRequests: %w", err)
 	}
-	runningSpec, err := stringToMetricSpec(runningStr)
-	if err != nil {
-		return nil, fmt.Errorf("error parsing RunningRequests: %w", err)
-	}
-	usedBlocksSpec, err := stringToMetricSpec(usedBlocksStr)
-	if err != nil {
-		return nil, fmt.Errorf("error parsing UsedKVCacheBlocks: %w", err)
-	}
-	maxBlocksSpec, err := stringToMetricSpec(maxBlocksStr)
-	if err != nil {
-		return nil, fmt.Errorf("error parsing MaxKVCacheBlocks: %w", err)
-	}
-	usageSpec, err := stringToMetricSpec(usageStr)
+	kvUsageSpec, err := stringToMetricSpec(kvUsageStr)
 	if err != nil {
 		return nil, fmt.Errorf("error parsing KVCacheUsage: %w", err)
 	}
@@ -129,36 +108,10 @@ func NewMetricMapping(allStr, waitingStr, runningStr, usedBlocksStr, maxBlocksSt
 		return nil, fmt.Errorf("error parsing loraReqInfoStr: %w", err)
 	}
 	mapping := &MetricMapping{
-		AllRequests:       allSpec,
-		WaitingRequests:   waitingSpec,
-		RunningRequests:   runningSpec,
-		UsedKVCacheBlocks: usedBlocksSpec,
-		MaxKVCacheBlocks:  maxBlocksSpec,
-		KVCacheUsage:      usageSpec,
-		LoraRequestInfo:   loraReqInfoSpec,
-	}
-
-	if err := mapping.Validate(); err != nil {
-		return nil, err // Return validation error
+		TotalQueuedRequests: queuedSpec,
+		KVCacheUtilization:  kvUsageSpec,
+		LoraRequestInfo:     loraReqInfoSpec,
 	}
 
 	return mapping, nil
 }
-
-// Validate checks if the MetricMapping is valid.
-func (m *MetricMapping) Validate() error {
-	// 1. WaitingRequests OR AllRequests (but not both can be nil)
-	if m.WaitingRequests == nil && m.AllRequests == nil {
-		return fmt.Errorf("either WaitingRequests or AllRequests must be specified")
-	}
-
-	if m.RunningRequests == nil {
-		return fmt.Errorf("RunningRequests is required")
-	}
-
-	// 2. KVCacheUsage OR (UsedKVCacheBlocks AND MaxKVCacheBlocks)
-	if m.KVCacheUsage == nil && (m.UsedKVCacheBlocks == nil || m.MaxKVCacheBlocks == nil) {
-		return fmt.Errorf("either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified")
-	}
-	return nil
-}
diff --git a/pkg/epp/backend/metrics_spec_test.go b/pkg/epp/backend/metrics_spec_test.go
index 084ae5b5a..141b97386 100644
--- a/pkg/epp/backend/metrics_spec_test.go
+++ b/pkg/epp/backend/metrics_spec_test.go
@@ -18,7 +18,6 @@ package backend
 
 import (
 	"reflect"
-	"strings"
 	"testing"
 )
 
@@ -171,111 +170,3 @@ func TestStringToMetricSpec(t *testing.T) {
 		})
 	}
 }
-
-func TestNewMetricMappingAndValidate(t *testing.T) {
-	tests := []struct {
-		name           string
-		allStr         string
-		waitingStr     string
-		runningStr     string
-		usedStr        string
-		maxStr         string
-		usageStr       string
-		loraReqInfoStr string
-		wantErr        bool
-		expectedErr    string // Added to check for specific error messages
-	}{
-		{
-			name:           "valid vllm mapping",
-			runningStr:     "running_metric",
-			waitingStr:     "waiting_metric",
-			usageStr:       "usage_metric",
-			loraReqInfoStr: "lora_requests_info",
-			wantErr:        false,
-			expectedErr:    "",
-		},
-		{
-			name:       "valid triton mapping",
-			runningStr: "running_metric{label1=value1}",
-			allStr:     "all_metric{label2=value2}",
-			usedStr:    "used_blocks{label3=value3}",
-			maxStr:     "max_blocks{label4=value4}",
-			wantErr:    false,
-		},
-		{
-			name:       "multiple labels mapping",
-			runningStr: "running_metric{label1=value1,label5=value5}",
-			allStr:     "all_metric{label2=value2,label6=value6}",
-			usedStr:    "used_blocks{label3=value3}",
-			maxStr:     "max_blocks{label4=value4}",
-			wantErr:    false,
-		},
-		{
-			name:        "missing running",
-			waitingStr:  "waiting_metric",
-			usageStr:    "usage_metric",
-			wantErr:     true,
-			expectedErr: "RunningRequests is required",
-		},
-		{
-			name:        "missing both waiting and all",
-			runningStr:  "running_metric",
-			usageStr:    "usage_metric",
-			wantErr:     true,
-			expectedErr: "either WaitingRequests or AllRequests must be specified",
-		},
-		{
-			name:        "missing usage and both block metrics",
-			runningStr:  "running_metric",
-			waitingStr:  "waiting_metric",
-			wantErr:     true,
-			expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified",
-		},
-		{
-			name:        "missing max block metric",
-			runningStr:  "running_metric",
-			waitingStr:  "waiting_metric",
-			usedStr:     "used_blocks",
-			wantErr:     true,
-			expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified",
-		},
-		{
-			name:        "missing used block metric",
-			runningStr:  "running_metric",
-			waitingStr:  "waiting_metric",
-			maxStr:      "max_blocks",
-			wantErr:     true,
-			expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified",
-		},
-		{
-			name:        "invalid running metric format",
-			runningStr:  "running_metric{invalid",
-			waitingStr:  "waiting_metric",
-			usageStr:    "usage_metric",
-			wantErr:     true,
-			expectedErr: "error parsing RunningRequests", // Check for part of the expected error
-		},
-		{
-			name:           "lora metrics present",
-			runningStr:     "running_metric",
-			waitingStr:     "waiting_metric",
-			usageStr:       "usage_metric",
-			loraReqInfoStr: "lora_requests_info",
-
-			wantErr:     false,
-			expectedErr: "", // Check for part of the expected error
-		},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			_, err := NewMetricMapping(tt.allStr, tt.waitingStr, tt.runningStr, tt.usedStr, tt.maxStr, tt.usageStr, tt.loraReqInfoStr)
-			if (err != nil) != tt.wantErr {
-				t.Errorf("NewMetricMapping() error = %v, wantErr %v", err, tt.wantErr)
-				return
-			}
-			if tt.wantErr && !strings.Contains(err.Error(), tt.expectedErr) {
-				t.Errorf("NewMetricMapping() error = %v, expected to contain = %v", err, tt.expectedErr)
-			}
-		})
-	}
-}
diff --git a/pkg/epp/backend/metrics_test.go b/pkg/epp/backend/metrics_test.go
index 0bfafcee5..1b0ad05d9 100644
--- a/pkg/epp/backend/metrics_test.go
+++ b/pkg/epp/backend/metrics_test.go
@@ -395,10 +395,6 @@ func TestPromToPodMetrics(t *testing.T) {
 		{
 			name: "vllm metrics",
 			metricFamilies: map[string]*dto.MetricFamily{
-				"vllm_running": makeMetricFamily("vllm_running",
-					makeMetric("vllm_running", nil, 10.0, 2000),
-					makeMetric("vllm_running", nil, 12.0, 1000), //Older
-				),
 				"vllm_waiting": makeMetricFamily("vllm_waiting",
 					makeMetric("vllm_waiting", nil, 5.0, 1000),
 					makeMetric("vllm_waiting", nil, 7.0, 2000), // Newer
@@ -412,10 +408,9 @@ func TestPromToPodMetrics(t *testing.T) {
 				),
 			},
 			mapping: &MetricMapping{
-				RunningRequests: &MetricSpec{MetricName: "vllm_running"},
-				WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"},
-				KVCacheUsage:    &MetricSpec{MetricName: "vllm_usage"},
-				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+				TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"},
+				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
+				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
 			},
 			existingMetrics: &datastore.PodMetrics{
 				Pod: datastore.Pod{
@@ -436,7 +431,6 @@ func TestPromToPodMetrics(t *testing.T) {
 					},
 				},
 				Metrics: datastore.Metrics{
-					RunningQueueSize:    10,
 					WaitingQueueSize:    7,
 					KVCacheUsagePercent: 0.8,
 					ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
@@ -445,118 +439,17 @@ func TestPromToPodMetrics(t *testing.T) {
 			},
 			expectedErrCount: 0,
 		},
-		{
-			name: "triton metrics",
-			metricFamilies: map[string]*dto.MetricFamily{
-				"triton_running": makeMetricFamily("triton_running",
-					makeMetric("triton_running", map[string]string{"queue": "fast"}, 10.0, 2000),
-					makeMetric("triton_running", map[string]string{"queue": "slow"}, 12.0, 1000), //Older, but different label
-				),
-				"triton_all": makeMetricFamily("triton_all",
-					makeMetric("triton_all", map[string]string{"queue": "fast"}, 15.0, 1000),
-					makeMetric("triton_all", map[string]string{"queue": "fast"}, 17.0, 2000), // Newer
-				),
-				"triton_used": makeMetricFamily("triton_used",
-					makeMetric("triton_used", map[string]string{"type": "gpu"}, 80.0, 1000),
-				),
-				"triton_max": makeMetricFamily("triton_max",
-					makeMetric("triton_max", map[string]string{"type": "gpu"}, 100.0, 1000),
-				),
-			},
-			mapping: &MetricMapping{
-				RunningRequests:   &MetricSpec{MetricName: "triton_running", Labels: map[string]string{"queue": "fast"}},
-				AllRequests:       &MetricSpec{MetricName: "triton_all", Labels: map[string]string{"queue": "fast"}},
-				UsedKVCacheBlocks: &MetricSpec{MetricName: "triton_used", Labels: map[string]string{"type": "gpu"}},
-				MaxKVCacheBlocks:  &MetricSpec{MetricName: "triton_max", Labels: map[string]string{"type": "gpu"}},
-			},
-			existingMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels: map[string]int{},
-				}, // Initialize with empty Metrics
-			},
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels:        map[string]int{},
-					RunningQueueSize:    10,
-					WaitingQueueSize:    7,   // 17 (all) - 10 (running)
-					KVCacheUsagePercent: 0.8, // 80 / 100
-				},
-			},
-			expectedErrCount: 0,
-		},
-		{
-			name: "triton metrics, missing label",
-			metricFamilies: map[string]*dto.MetricFamily{
-				"triton_running": makeMetricFamily("triton_running",
-					makeMetric("triton_running", map[string]string{"queue": "fast"}, 10.0, 2000),
-				),
-				"triton_all": makeMetricFamily("triton_all",
-					makeMetric("triton_all", map[string]string{"queue": "fast"}, 17.0, 2000),
-				),
-				// triton_used and _max have no metrics with type=gpu label.
-			},
-			mapping: &MetricMapping{
-				RunningRequests:   &MetricSpec{MetricName: "triton_running", Labels: map[string]string{"queue": "fast"}},
-				AllRequests:       &MetricSpec{MetricName: "triton_all", Labels: map[string]string{"queue": "fast"}},
-				UsedKVCacheBlocks: &MetricSpec{MetricName: "triton_used", Labels: map[string]string{"type": "gpu"}},
-				MaxKVCacheBlocks:  &MetricSpec{MetricName: "triton_max", Labels: map[string]string{"type": "gpu"}},
-			},
-			existingMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels: map[string]int{},
-				}, // Initialize with empty Metrics
-			},
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels:        map[string]int{},
-					RunningQueueSize:    10,
-					WaitingQueueSize:    7,
-					KVCacheUsagePercent: 0.0, // expect this to still be present, but with default 0 value
-				},
-			},
-
-			expectedErrCount: 2, // Two errors:  Used and Max
-		},
 		{
 			name:           "missing metrics",
 			metricFamilies: map[string]*dto.MetricFamily{}, // No metrics
 			mapping: &MetricMapping{
-				RunningRequests: &MetricSpec{MetricName: "vllm_running"},
-				WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"},
-				KVCacheUsage:    &MetricSpec{MetricName: "vllm_usage"},
-				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+				TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"},
+				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
+				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
 			},
 			existingMetrics:  &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
 			expectedMetrics:  &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
-			expectedErrCount: 4, // Errors for all 4 main metrics
+			expectedErrCount: 3, // Errors for all 4 main metrics
 		},
 		{
 			name: "partial metrics available + LoRA",
@@ -569,10 +462,9 @@ func TestPromToPodMetrics(t *testing.T) {
 				),
 			},
 			mapping: &MetricMapping{
-				RunningRequests: &MetricSpec{MetricName: "vllm_running"}, // Not present
-				WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"}, // Not Present
-				KVCacheUsage:    &MetricSpec{MetricName: "vllm_usage"},
-				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+				TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, // Not Present
+				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
+				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
 			},
 			existingMetrics: &datastore.PodMetrics{
 				Pod: datastore.Pod{
@@ -593,57 +485,13 @@ func TestPromToPodMetrics(t *testing.T) {
 					},
 				},
 				Metrics: datastore.Metrics{
-					RunningQueueSize:    0,
 					WaitingQueueSize:    0,
 					KVCacheUsagePercent: 0.8,
 					ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
 					MaxActiveModels:     3,
 				},
 			},
-			expectedErrCount: 2, // Errors for the two missing metrics
-		},
-		{
-			name: "use all requests for waiting queue",
-			metricFamilies: map[string]*dto.MetricFamily{
-				"vllm_running": makeMetricFamily("vllm_running",
-					makeMetric("vllm_running", nil, 10.0, 2000),
-				),
-				"vllm_all": makeMetricFamily("vllm_all",
-					makeMetric("vllm_all", nil, 15.0, 1000),
-				),
-			},
-			mapping: &MetricMapping{
-				RunningRequests: &MetricSpec{MetricName: "vllm_running"},
-				AllRequests:     &MetricSpec{MetricName: "vllm_all"},
-				// No WaitingRequests
-			},
-			existingMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels: map[string]int{},
-				}, // Initialize with empty Metrics
-			},
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels:     map[string]int{},
-					RunningQueueSize: 10,
-					WaitingQueueSize: 5, // 15 - 10
-				},
-			},
-			expectedErrCount: 0,
+			expectedErrCount: 1, // Errors for the two missing metrics
 		},
 		{
 			name: "invalid max lora",
diff --git a/pkg/epp/datastore/types.go b/pkg/epp/datastore/types.go
index 8cfcf1d1f..b87b1c0ae 100644
--- a/pkg/epp/datastore/types.go
+++ b/pkg/epp/datastore/types.go
@@ -32,11 +32,9 @@ type Metrics struct {
 	// ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU.
 	ActiveModels map[string]int
 	// MaxActiveModels is the maximum number of models that can be loaded to GPU.
-	MaxActiveModels         int
-	RunningQueueSize        int
-	WaitingQueueSize        int
-	KVCacheUsagePercent     float64
-	KvCacheMaxTokenCapacity int
+	MaxActiveModels     int
+	WaitingQueueSize    int
+	KVCacheUsagePercent float64
 }
 
 type PodMetrics struct {
@@ -59,12 +57,10 @@ func (pm *PodMetrics) Clone() *PodMetrics {
 			Address:        pm.Address,
 		},
 		Metrics: Metrics{
-			ActiveModels:            cm,
-			MaxActiveModels:         pm.MaxActiveModels,
-			RunningQueueSize:        pm.RunningQueueSize,
-			WaitingQueueSize:        pm.WaitingQueueSize,
-			KVCacheUsagePercent:     pm.KVCacheUsagePercent,
-			KvCacheMaxTokenCapacity: pm.KvCacheMaxTokenCapacity,
+			ActiveModels:        cm,
+			MaxActiveModels:     pm.MaxActiveModels,
+			WaitingQueueSize:    pm.WaitingQueueSize,
+			KVCacheUsagePercent: pm.KVCacheUsagePercent,
 		},
 	}
 	return clone

From 5838459945ce106852626a306997da7d87173736 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 7 Mar 2025 22:19:40 +0000
Subject: [PATCH 09/19] moved files for cleaner diff

---
 cmd/epp/main.go                               |   4 +-
 pkg/epp/backend/metrics.go                    | 287 -------
 pkg/epp/backend/metrics_test.go               | 589 --------------
 pkg/epp/backend/vllm/metrics.go               | 256 ++++---
 pkg/epp/backend/{ => vllm}/metrics_spec.go    |   2 +-
 .../backend/{ => vllm}/metrics_spec_test.go   |   2 +-
 pkg/epp/backend/vllm/metrics_test.go          | 719 +++++++++++++-----
 7 files changed, 686 insertions(+), 1173 deletions(-)
 delete mode 100644 pkg/epp/backend/metrics.go
 delete mode 100644 pkg/epp/backend/metrics_test.go
 rename pkg/epp/backend/{ => vllm}/metrics_spec.go (99%)
 rename pkg/epp/backend/{ => vllm}/metrics_spec_test.go (99%)

diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index f3e0b6571..d3c1ab09b 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -160,7 +160,7 @@ func run() error {
 	datastore := datastore.NewDatastore(ctx, pmf)
 
 	// Set up mapper for metric scraping.
-	mapping, err := backend.NewMetricMapping(
+	mapping, err := vllm.NewMetricMapping(
 		*totalQueuedRequestMetric,
 		*kVCacheUsageMetric,
 		*loraRequestInfoMetric,
@@ -169,7 +169,7 @@ func run() error {
 		setupLog.Error(err, "Failed to create metric mapping from flags.")
 		return err
 	}
-	provider := backend.NewProvider(&backend.PodMetricsClientImpl{MetricMapping: mapping}, datastore)
+	provider := backend.NewProvider(&vllm.PodMetricsClientImpl{MetricMapping: mapping}, datastore)
 	//
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
diff --git a/pkg/epp/backend/metrics.go b/pkg/epp/backend/metrics.go
deleted file mode 100644
index edc4b6e80..000000000
--- a/pkg/epp/backend/metrics.go
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
-Copyright 2025 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package backend
-
-import (
-	"context"
-	"fmt"
-	"net/http"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/go-logr/logr"
-	dto "github.com/prometheus/client_model/go"
-	"github.com/prometheus/common/expfmt"
-	"go.uber.org/multierr"
-	"sigs.k8s.io/controller-runtime/pkg/log"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
-	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
-)
-
-const (
-	// Hardcoded vLLM specific LoRA metrics
-	LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
-	LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
-	LoraRequestInfoMaxAdaptersMetricName     = "max_lora"
-)
-
-type PodMetricsClientImpl struct {
-	MetricMapping *MetricMapping
-}
-
-// FetchMetrics fetches metrics from a given pod.
-func (p *PodMetricsClientImpl) FetchMetrics(
-	ctx context.Context,
-	existing *datastore.PodMetrics,
-	port int32,
-) (*datastore.PodMetrics, error) {
-	logger := log.FromContext(ctx)
-	loggerDefault := logger.V(logutil.DEFAULT)
-
-	url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
-	if err != nil {
-		loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
-		return nil, fmt.Errorf("failed to create request: %v", err)
-	}
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName)
-		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err)
-	}
-	defer func() {
-		_ = resp.Body.Close()
-	}()
-
-	if resp.StatusCode != http.StatusOK {
-		loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode)
-		return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode)
-	}
-
-	parser := expfmt.TextParser{}
-	metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
-	if err != nil {
-		return nil, err
-	}
-	return p.promToPodMetrics(logger, metricFamilies, existing)
-}
-
-// promToPodMetrics updates internal pod metrics with scraped Prometheus metrics.
-func (p *PodMetricsClientImpl) promToPodMetrics(
-	logger logr.Logger,
-	metricFamilies map[string]*dto.MetricFamily,
-	existing *datastore.PodMetrics,
-) (*datastore.PodMetrics, error) {
-	var errs error
-	updated := existing.Clone()
-
-	if p.MetricMapping.TotalQueuedRequests != nil {
-		queued, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.TotalQueuedRequests)
-		if err == nil {
-			updated.WaitingQueueSize = int(queued.GetGauge().GetValue())
-		} else {
-			errs = multierr.Append(errs, err)
-		}
-	}
-
-	if p.MetricMapping.KVCacheUtilization != nil {
-		usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUtilization)
-		if err == nil {
-			updated.KVCacheUsagePercent = usage.GetGauge().GetValue()
-		} else {
-			errs = multierr.Append(errs, err)
-		}
-	}
-
-	// Handle LoRA metrics (only if all LoRA MetricSpecs are present)
-	if p.MetricMapping.LoraRequestInfo != nil {
-		loraMetrics, _, err := p.getLatestLoraMetric(logger, metricFamilies)
-		errs = multierr.Append(errs, err)
-
-		if loraMetrics != nil {
-			updated.ActiveModels = make(map[string]int)
-			for _, label := range loraMetrics.GetLabel() {
-				if label.GetName() == LoraRequestInfoRunningAdaptersMetricName {
-					if label.GetValue() != "" {
-						adapterList := strings.Split(label.GetValue(), ",")
-						for _, adapter := range adapterList {
-							updated.ActiveModels[adapter] = 0
-						}
-					}
-				}
-				if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
-					if label.GetValue() != "" {
-						adapterList := strings.Split(label.GetValue(), ",")
-						for _, adapter := range adapterList {
-							updated.ActiveModels[adapter] = 0
-						}
-					}
-				}
-				if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
-					if label.GetValue() != "" {
-						updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
-						if err != nil {
-							errs = multierr.Append(errs, err)
-						}
-					}
-				}
-			}
-		}
-	}
-
-	return updated, errs
-}
-
-// getLatestLoraMetric gets latest lora metric series in gauge metric family `vllm:lora_requests_info`
-// reason its specially fetched is because each label key value pair permutation generates new series
-// and only most recent is useful. The value of each series is the creation timestamp so we can
-// retrieve the latest by sorting the value.
-func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
-	if p.MetricMapping.LoraRequestInfo == nil {
-		return nil, time.Time{}, nil // No LoRA metrics configured
-	}
-
-	loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName]
-	if !ok {
-		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName)
-		return nil, time.Time{}, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName)
-	}
-
-	var latest *dto.Metric
-	var latestTs float64 // Use float64, as Gauge.Value is float64
-
-	// Iterate over all metrics in the family.
-	for _, m := range loraRequests.GetMetric() {
-		running := ""
-		waiting := ""
-		// Check if the metric has the expected LoRA labels.  This is important!
-		hasRequiredLabels := false
-		for _, lp := range m.GetLabel() {
-			switch lp.GetName() {
-			case LoraRequestInfoRunningAdaptersMetricName:
-				running = lp.GetValue()
-				hasRequiredLabels = true
-			case LoraRequestInfoWaitingAdaptersMetricName:
-				waiting = lp.GetValue()
-				hasRequiredLabels = true
-			}
-		}
-		//Skip if it does not have the lora labels
-		if !hasRequiredLabels {
-			continue
-		}
-		// Ignore metrics with both labels empty.
-		if running == "" && waiting == "" {
-			continue
-		}
-
-		// Select the metric with the *largest Gauge Value* (which represents the timestamp).
-		if m.GetGauge().GetValue() > latestTs {
-			latestTs = m.GetGauge().GetValue()
-			latest = m
-		}
-	}
-	if latest == nil {
-		logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", p.MetricMapping.LoraRequestInfo.MetricName)
-		return nil, time.Time{}, nil
-	}
-
-	// Convert the gauge value (creation timestamp) to time.Time.
-	return latest, time.Unix(0, int64(latestTs*1e9)), nil // Convert nanoseconds to time.Time
-}
-
-// getMetric retrieves a specific metric based on MetricSpec.
-func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
-	mf, ok := metricFamilies[spec.MetricName]
-	if !ok {
-		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", spec.MetricName)
-		return nil, fmt.Errorf("metric family %q not found", spec.MetricName)
-	}
-
-	if len(mf.GetMetric()) == 0 {
-		return nil, fmt.Errorf("no metrics available for %q", spec.MetricName)
-	}
-	// if there is a specified label, return only that metric in the family
-	if spec.Labels != nil {
-		return getLabeledMetric(logger, mf, spec)
-	}
-	return getLatestMetric(logger, mf)
-}
-
-// getLatestMetric gets the latest metric of a family (for metrics without labels).
-func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, error) {
-	var latestTs int64
-	var latest *dto.Metric
-	for _, m := range mf.GetMetric() {
-		if m.GetTimestampMs() >= latestTs {
-			latestTs = m.GetTimestampMs()
-			latest = m
-		}
-	}
-
-	if latest == nil {
-		return nil, fmt.Errorf("no metrics found for %q", mf.GetName())
-	}
-
-	logger.V(logutil.TRACE).Info("Latest metric value selected", "value", latest, "metric", mf.GetName())
-	return latest, nil
-}
-
-// getLabeledMetric gets the latest metric with matching labels.
-func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
-	var latestMetric *dto.Metric
-	var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater
-
-	for _, m := range mf.GetMetric() {
-		if labelsMatch(m.GetLabel(), spec.Labels) {
-			if m.GetTimestampMs() > latestTimestamp {
-				latestTimestamp = m.GetTimestampMs()
-				latestMetric = m
-			}
-		}
-	}
-
-	if latestMetric != nil {
-		logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "metric", spec.MetricName)
-		return latestMetric, nil
-	}
-
-	return nil, fmt.Errorf("no matching labeled metric found for %q with labels %v", spec.MetricName, spec.Labels)
-}
-
-// labelsMatch checks if a metric's labels contain all the labels in the spec.
-func labelsMatch(metricLabels []*dto.LabelPair, specLabels map[string]string) bool {
-	if len(specLabels) == 0 {
-		return true // No specific labels required
-	}
-
-	for specName, specValue := range specLabels {
-		found := false
-		for _, label := range metricLabels {
-			if label.GetName() == specName && label.GetValue() == specValue {
-				found = true
-				break
-			}
-		}
-		if !found {
-			return false // A required label is missing
-		}
-	}
-	return true // All required labels are present
-}
diff --git a/pkg/epp/backend/metrics_test.go b/pkg/epp/backend/metrics_test.go
deleted file mode 100644
index 1b0ad05d9..000000000
--- a/pkg/epp/backend/metrics_test.go
+++ /dev/null
@@ -1,589 +0,0 @@
-/*
-Copyright 2025 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package backend
-
-import (
-	"context"
-	"fmt"
-	"reflect"
-	"strconv"
-	"strings"
-	"testing"
-
-	dto "github.com/prometheus/client_model/go"
-	"go.uber.org/multierr"
-	"google.golang.org/protobuf/proto"
-	"k8s.io/apimachinery/pkg/types"
-
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
-	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
-)
-
-// --- Test Helpers ---
-
-func makeMetric(metricName string, labels map[string]string, value float64, timestampMs int64) *dto.Metric {
-	labelPairs := []*dto.LabelPair{}
-	for k, v := range labels {
-		labelPairs = append(labelPairs, &dto.LabelPair{Name: proto.String(k), Value: proto.String(v)})
-	}
-	return &dto.Metric{
-		Label:       labelPairs,
-		Gauge:       &dto.Gauge{Value: &value},
-		TimestampMs: &timestampMs,
-	}
-}
-
-func makeMetricFamily(name string, metrics ...*dto.Metric) *dto.MetricFamily {
-	return &dto.MetricFamily{
-		Name:   &name,
-		Type:   dto.MetricType_GAUGE.Enum(),
-		Metric: metrics,
-	}
-}
-
-// --- Tests ---
-
-func TestGetMetric(t *testing.T) {
-	logger := logutil.NewTestLogger()
-
-	metricFamilies := map[string]*dto.MetricFamily{
-		"metric1": makeMetricFamily("metric1",
-			makeMetric("metric1", map[string]string{"label1": "value1"}, 1.0, 1000),
-			makeMetric("metric1", map[string]string{"label1": "value2"}, 2.0, 2000),
-		),
-		"metric2": makeMetricFamily("metric2",
-			makeMetric("metric2", map[string]string{"labelA": "A1", "labelB": "B1"}, 3.0, 1500),
-			makeMetric("metric2", map[string]string{"labelA": "A2", "labelB": "B2"}, 4.0, 2500),
-		),
-		"metric3": makeMetricFamily("metric3",
-			makeMetric("metric3", map[string]string{}, 5.0, 3000),
-			makeMetric("metric3", map[string]string{}, 6.0, 1000),
-		),
-	}
-
-	tests := []struct {
-		name        string
-		spec        MetricSpec
-		wantValue   float64
-		wantError   bool
-		shouldPanic bool // Add this
-	}{
-		{
-			name: "get labeled metric, exists",
-			spec: MetricSpec{
-				MetricName: "metric1",
-				Labels:     map[string]string{"label1": "value1"},
-			},
-			wantValue: 1.0,
-			wantError: false,
-		},
-		{
-			name: "get labeled metric, wrong value",
-			spec: MetricSpec{
-				MetricName: "metric1",
-				Labels:     map[string]string{"label1": "value3"},
-			},
-			wantValue: -1, // Expect an error, not a specific value
-			wantError: true,
-		},
-		{
-			name: "get labeled metric, missing label",
-			spec: MetricSpec{
-				MetricName: "metric1",
-				Labels:     map[string]string{"label2": "value2"},
-			},
-			wantValue: -1,
-			wantError: true,
-		},
-		{
-			name: "get labeled metric, extra label present",
-			spec: MetricSpec{
-				MetricName: "metric2",
-				Labels:     map[string]string{"labelA": "A1"},
-			},
-			wantValue: 3.0,
-			wantError: false,
-		},
-		{
-			name: "get unlabeled metric, exists",
-			spec: MetricSpec{
-				MetricName: "metric3",
-				Labels:     nil, // Explicitly nil
-			},
-			wantValue: 5.0, // latest metric, which occurs first in our test data
-			wantError: false,
-		},
-		{
-			name: "get unlabeled metric, metric family not found",
-			spec: MetricSpec{
-				MetricName: "metric4",
-				Labels:     nil,
-			},
-			wantValue: -1,
-			wantError: true,
-		},
-		{
-			name: "get labeled metric, metric family not found",
-			spec: MetricSpec{
-				MetricName: "metric4",
-				Labels:     map[string]string{"label1": "value1"},
-			},
-			wantValue: -1,
-			wantError: true,
-		},
-		{
-			name: "get metric, no metrics available",
-			spec: MetricSpec{
-				MetricName: "empty_metric",
-			},
-			wantValue: -1,
-			wantError: true,
-		},
-		{
-			name: "get latest metric",
-			spec: MetricSpec{
-				MetricName: "metric3",
-				Labels:     map[string]string{}, // Empty map, not nil
-			},
-			wantValue: 5.0,
-			wantError: false,
-		},
-	}
-
-	p := &PodMetricsClientImpl{} // No need for MetricMapping here
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			if tt.shouldPanic {
-				defer func() {
-					if r := recover(); r == nil {
-						t.Errorf("The code did not panic")
-					}
-				}()
-			}
-
-			gotMetric, err := p.getMetric(logger, metricFamilies, tt.spec)
-
-			if tt.wantError {
-				if err == nil {
-					t.Errorf("getMetric() expected error, got nil")
-				}
-			} else {
-				if err != nil {
-					t.Errorf("getMetric() unexpected error: %v", err)
-				}
-				if gotMetric.GetGauge().GetValue() != tt.wantValue {
-					t.Errorf("getMetric() got value %v, want %v", gotMetric.GetGauge().GetValue(), tt.wantValue)
-				}
-			}
-		})
-	}
-}
-
-func TestLabelsMatch(t *testing.T) {
-	tests := []struct {
-		name         string
-		metricLabels []*dto.LabelPair
-		specLabels   map[string]string
-		want         bool
-	}{
-		{
-			name:         "empty spec labels, should match",
-			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
-			specLabels:   map[string]string{},
-			want:         true,
-		},
-		{
-			name:         "nil spec labels, should match",
-			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
-			specLabels:   nil,
-			want:         true,
-		},
-		{
-			name:         "exact match",
-			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
-			specLabels:   map[string]string{"a": "b"},
-			want:         true,
-		},
-		{
-			name:         "extra labels in metric",
-			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}, {Name: proto.String("c"), Value: proto.String("d")}},
-			specLabels:   map[string]string{"a": "b"},
-			want:         true,
-		},
-		{
-			name:         "missing label in metric",
-			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
-			specLabels:   map[string]string{"a": "b", "c": "d"},
-			want:         false,
-		},
-		{
-			name:         "value mismatch",
-			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
-			specLabels:   map[string]string{"a": "c"},
-			want:         false,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			if got := labelsMatch(tt.metricLabels, tt.specLabels); got != tt.want {
-				t.Errorf("labelsMatch() = %v, want %v", got, tt.want)
-			}
-		})
-	}
-}
-
-func TestGetLatestLoraMetric(t *testing.T) {
-	logger := logutil.NewTestLogger()
-
-	testCases := []struct {
-		name             string
-		metricFamilies   map[string]*dto.MetricFamily
-		expectedAdapters map[string]int
-		expectedMax      int
-		expectedErr      error
-		mapping          *MetricMapping
-	}{
-		{
-			name: "no lora metrics",
-			metricFamilies: map[string]*dto.MetricFamily{
-				"some_other_metric": makeMetricFamily("some_other_metric",
-					makeMetric("some_other_metric", nil, 1.0, 1000),
-				),
-			},
-			expectedAdapters: nil,
-			expectedMax:      0,
-			expectedErr:      fmt.Errorf("metric family \"vllm:lora_requests_info\" not found"), // Expect an error because the family is missing
-			mapping: &MetricMapping{
-				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
-			},
-		},
-		{
-			name: "basic lora metrics",
-			metricFamilies: map[string]*dto.MetricFamily{
-				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 3000.0, 1000),       // Newer
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 1000.0, 1000), // Older
-
-				),
-			},
-			expectedAdapters: map[string]int{"lora1": 0},
-			expectedMax:      2,
-			expectedErr:      nil,
-			mapping: &MetricMapping{
-				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
-			},
-		},
-		{
-			name: "no matching lora metrics",
-			metricFamilies: map[string]*dto.MetricFamily{
-				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"other_label": "value"}, 5.0, 3000),
-				),
-			},
-			expectedAdapters: nil,
-			expectedMax:      0,
-			expectedErr:      nil, // Expect *no* error; just no adapters found
-			mapping: &MetricMapping{
-				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
-			},
-		},
-		{
-			name: "no lora metrics if not in MetricMapping",
-			metricFamilies: map[string]*dto.MetricFamily{
-				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 5.0, 3000),
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 6.0, 1000),
-				),
-			},
-			expectedAdapters: nil,
-			expectedMax:      0,
-			expectedErr:      nil,
-			mapping:          &MetricMapping{ // No LoRA metrics defined
-			},
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			p := &PodMetricsClientImpl{MetricMapping: tc.mapping}
-			loraMetric, _, err := p.getLatestLoraMetric(logger, tc.metricFamilies)
-
-			if tc.expectedErr != nil {
-				if err == nil || err.Error() != tc.expectedErr.Error() {
-					t.Errorf("getLatestLoraMetric() error = %v, wantErr %v", err, tc.expectedErr)
-				}
-				return // Stop here if an error was expected
-			} else if err != nil {
-				t.Fatalf("getLatestLoraMetric() unexpected error: %v", err)
-			}
-
-			if tc.mapping.LoraRequestInfo == nil {
-				if loraMetric != nil {
-					t.Errorf("getLatestLoraMetric() expected nil metric, got %v", loraMetric)
-				}
-				return // Stop if no Lora metrics are expected.
-			}
-
-			if tc.expectedAdapters == nil && loraMetric == nil {
-				return // Both nil, as expected
-			}
-
-			if tc.expectedAdapters != nil && loraMetric != nil { // proceed with checks
-
-				adaptersFound := make(map[string]int)
-				maxLora := 0
-				for _, label := range loraMetric.GetLabel() {
-					if label.GetName() == "running_lora_adapters" && label.GetValue() != "" {
-						for _, adapter := range strings.Split(label.GetValue(), ",") {
-							adaptersFound[adapter] = 0
-						}
-					}
-					if label.GetName() == "waiting_lora_adapters" && label.GetValue() != "" {
-						for _, adapter := range strings.Split(label.GetValue(), ",") {
-							adaptersFound[adapter] = 0 // Overwrite if already present
-						}
-					}
-					if label.GetName() == "max_lora" {
-						var converr error // define err in this scope.
-						maxLora, converr = strconv.Atoi(label.GetValue())
-						if converr != nil && tc.expectedErr == nil { // only report if we don't expect any other errors
-							t.Errorf("getLatestLoraMetric() could not parse max_lora: %v", converr)
-						}
-					}
-				}
-
-				if !reflect.DeepEqual(adaptersFound, tc.expectedAdapters) {
-					t.Errorf("getLatestLoraMetric() adapters = %v, want %v", adaptersFound, tc.expectedAdapters)
-				}
-				if maxLora != tc.expectedMax {
-					t.Errorf("getLatestLoraMetric() maxLora = %v, want %v", maxLora, tc.expectedMax)
-				}
-			} else { // one is nil and the other is not
-				t.Errorf("getLatestLoraMetric(): one of expectedAdapters/loraMetric is nil and the other is not, expected %v, got %v", tc.expectedAdapters, loraMetric)
-			}
-		})
-	}
-}
-
-func TestPromToPodMetrics(t *testing.T) {
-	logger := logutil.NewTestLogger()
-
-	tests := []struct {
-		name             string
-		metricFamilies   map[string]*dto.MetricFamily
-		mapping          *MetricMapping
-		existingMetrics  *datastore.PodMetrics
-		expectedMetrics  *datastore.PodMetrics
-		expectedErrCount int // Count of expected errors
-	}{
-		{
-			name: "vllm metrics",
-			metricFamilies: map[string]*dto.MetricFamily{
-				"vllm_waiting": makeMetricFamily("vllm_waiting",
-					makeMetric("vllm_waiting", nil, 5.0, 1000),
-					makeMetric("vllm_waiting", nil, 7.0, 2000), // Newer
-				),
-				"vllm_usage": makeMetricFamily("vllm_usage",
-					makeMetric("vllm_usage", nil, 0.8, 2000),
-					makeMetric("vllm_usage", nil, 0.7, 500),
-				),
-				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000),
-				),
-			},
-			mapping: &MetricMapping{
-				TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"},
-				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
-				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
-			},
-			existingMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{}, // Initialize with empty Metrics
-			},
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					WaitingQueueSize:    7,
-					KVCacheUsagePercent: 0.8,
-					ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
-					MaxActiveModels:     3,
-				},
-			},
-			expectedErrCount: 0,
-		},
-		{
-			name:           "missing metrics",
-			metricFamilies: map[string]*dto.MetricFamily{}, // No metrics
-			mapping: &MetricMapping{
-				TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"},
-				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
-				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
-			},
-			existingMetrics:  &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
-			expectedMetrics:  &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
-			expectedErrCount: 3, // Errors for all 4 main metrics
-		},
-		{
-			name: "partial metrics available + LoRA",
-			metricFamilies: map[string]*dto.MetricFamily{
-				"vllm_usage": makeMetricFamily("vllm_usage",
-					makeMetric("vllm_usage", nil, 0.8, 2000), // Only usage is present
-				),
-				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000),
-				),
-			},
-			mapping: &MetricMapping{
-				TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, // Not Present
-				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
-				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
-			},
-			existingMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{}, // Initialize with empty Metrics
-			},
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					WaitingQueueSize:    0,
-					KVCacheUsagePercent: 0.8,
-					ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
-					MaxActiveModels:     3,
-				},
-			},
-			expectedErrCount: 1, // Errors for the two missing metrics
-		},
-		{
-			name: "invalid max lora",
-			metricFamilies: map[string]*dto.MetricFamily{
-				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "invalid"}, 3000.0, 1000),
-				),
-			},
-			mapping: &MetricMapping{
-				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
-			},
-			existingMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{},
-			},
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels:    map[string]int{"lora1": 0},
-					MaxActiveModels: 0, // Should still default to 0.
-
-				},
-			},
-			expectedErrCount: 1, // Expect *one* error
-		},
-	}
-
-	for _, tc := range tests {
-		t.Run(tc.name, func(t *testing.T) {
-			p := &PodMetricsClientImpl{MetricMapping: tc.mapping}
-			updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics)
-
-			if tc.expectedErrCount == 0 {
-				if err != nil {
-					t.Errorf("promToPodMetrics() unexpected error: %v", err)
-				}
-			} else {
-				if err == nil {
-					t.Errorf("promToPodMetrics() expected errors, got nil")
-				} else {
-					// Check the *number* of errors.  multierr.Errors() gives us a slice
-					if len(multierr.Errors(err)) != tc.expectedErrCount {
-						t.Errorf("promToPodMetrics() wrong number of errors: got %d, want %d.  Errors: %v", len(multierr.Errors(err)), tc.expectedErrCount, err)
-					}
-
-				}
-			}
-			// Use podMetricsEqual for comparison with tolerance.
-			if !reflect.DeepEqual(updated, tc.expectedMetrics) {
-				t.Errorf("promToPodMetrics() got %+v, want %+v", updated, tc.expectedMetrics)
-			}
-		})
-	}
-}
-
-// TestFetchMetrics is a basic integration test.  A more complete test would mock
-// the HTTP client.
-func TestFetchMetrics(t *testing.T) {
-	// This test is very basic as it doesn't mock the HTTP client.  It assumes
-	// there's no server running on the specified port.  A real-world test
-	// suite should use a mock server.
-	ctx := logutil.NewTestLoggerIntoContext(context.Background())
-	existing := &datastore.PodMetrics{
-		Pod: datastore.Pod{
-			Address: "127.0.0.1",
-			NamespacedName: types.NamespacedName{
-				Namespace: "test",
-				Name:      "pod",
-			},
-		},
-	}
-	p := &PodMetricsClientImpl{} // No MetricMapping needed for this basic test
-
-	_, err := p.FetchMetrics(ctx, existing, 9999) // Use a port that's unlikely to be in use.
-	if err == nil {
-		t.Errorf("FetchMetrics() expected error, got nil")
-	}
-	// Check for a specific error message (fragile, but OK for this example)
-	expectedSubstr := "connection refused"
-	if err != nil && !strings.Contains(err.Error(), expectedSubstr) {
-		t.Errorf("FetchMetrics() error = %v, want error containing %q", err, expectedSubstr)
-	}
-}
diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go
index 8d2dd7154..4c1532080 100644
--- a/pkg/epp/backend/vllm/metrics.go
+++ b/pkg/epp/backend/vllm/metrics.go
@@ -14,7 +14,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-// Package vllm provides vllm specific pod metrics implementation.
 package vllm
 
 import (
@@ -30,60 +29,49 @@ import (
 	"github.com/prometheus/common/expfmt"
 	"go.uber.org/multierr"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
-// Metric names used in the vLLM metrics implementation.
-// Refer to the protocol doc for more details:
-// https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol
 const (
-	LoraRequestInfoMetricName                = "vllm:lora_requests_info"
+	// Hardcoded vLLM specific LoRA metrics
 	LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
 	LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
 	LoraRequestInfoMaxAdaptersMetricName     = "max_lora"
-	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
-	RunningQueueSizeMetricName = "vllm:num_requests_running"
-	WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
-	/* TODO: Uncomment this once the following are added to the fork.
-	RunningQueueSizeMetricName        = "vllm:num_tokens_running"
-	WaitingQueueSizeMetricName        = "vllm:num_tokens_waiting"
-	*/
-	KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc"
 )
 
-type PodMetricsClientImpl struct{}
+type PodMetricsClientImpl struct {
+	MetricMapping *MetricMapping
+}
 
 // FetchMetrics fetches metrics from a given pod.
 func (p *PodMetricsClientImpl) FetchMetrics(
 	ctx context.Context,
-	pod *metrics.Pod,
-	existing *metrics.Metrics,
+	existing *datastore.PodMetrics,
 	port int32,
-) (*metrics.Metrics, error) {
-	logger := log.FromContext(ctx).V(logutil.TRACE)
+) (*datastore.PodMetrics, error) {
+	logger := log.FromContext(ctx)
+	loggerDefault := logger.V(logutil.DEFAULT)
 
-	// Currently the metrics endpoint is hard-coded, which works with vLLM.
-	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config.
-	url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
+	url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
 
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
 	if err != nil {
-		logger.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
+		loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
 		return nil, fmt.Errorf("failed to create request: %v", err)
 	}
 	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
-		logger.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName)
-		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err)
+		loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName)
+		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err)
 	}
 	defer func() {
 		_ = resp.Body.Close()
 	}()
 
 	if resp.StatusCode != http.StatusOK {
-		logger.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode)
-		return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode)
+		loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode)
+		return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode)
 	}
 
 	parser := expfmt.TextParser{}
@@ -91,74 +79,70 @@ func (p *PodMetricsClientImpl) FetchMetrics(
 	if err != nil {
 		return nil, err
 	}
-	return promToPodMetrics(logger, metricFamilies, existing)
+	return p.promToPodMetrics(logger, metricFamilies, existing)
 }
 
-// promToPodMetrics updates internal pod metrics with scraped prometheus metrics.
-// A combined error is returned if errors occur in one or more metric processing.
-// it returns a new PodMetrics pointer which can be used to atomically update the pod metrics map.
-func promToPodMetrics(
+// promToPodMetrics updates internal pod metrics with scraped Prometheus metrics.
+func (p *PodMetricsClientImpl) promToPodMetrics(
 	logger logr.Logger,
 	metricFamilies map[string]*dto.MetricFamily,
-	existing *metrics.Metrics,
-) (*metrics.Metrics, error) {
+	existing *datastore.PodMetrics,
+) (*datastore.PodMetrics, error) {
 	var errs error
 	updated := existing.Clone()
-	runningQueueSize, err := getLatestMetric(logger, metricFamilies, RunningQueueSizeMetricName)
-	errs = multierr.Append(errs, err)
-	if err == nil {
-		updated.RunningQueueSize = int(runningQueueSize.GetGauge().GetValue())
-	}
-	waitingQueueSize, err := getLatestMetric(logger, metricFamilies, WaitingQueueSizeMetricName)
-	errs = multierr.Append(errs, err)
-	if err == nil {
-		updated.WaitingQueueSize = int(waitingQueueSize.GetGauge().GetValue())
-	}
-	cachePercent, err := getLatestMetric(logger, metricFamilies, KVCacheUsagePercentMetricName)
-	errs = multierr.Append(errs, err)
-	if err == nil {
-		updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
-	}
-
-	loraMetrics, _, err := getLatestLoraMetric(logger, metricFamilies)
-	errs = multierr.Append(errs, err)
-	/* TODO: uncomment once this is available in vllm.
-	kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
-	errs = multierr.Append(errs, err)
-	if err != nil {
-		updated.KvCacheMaxTokenCapacity = int(kvCap)
-	}
-	*/
-
-	if loraMetrics != nil {
-		updated.ActiveModels = make(map[string]int)
-		for _, label := range loraMetrics.GetLabel() {
-			if label.GetName() == LoraRequestInfoRunningAdaptersMetricName {
-				if label.GetValue() != "" {
-					adapterList := strings.Split(label.GetValue(), ",")
-					for _, adapter := range adapterList {
-						updated.ActiveModels[adapter] = 0
+
+	if p.MetricMapping.TotalQueuedRequests != nil {
+		queued, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.TotalQueuedRequests)
+		if err == nil {
+			updated.WaitingQueueSize = int(queued.GetGauge().GetValue())
+		} else {
+			errs = multierr.Append(errs, err)
+		}
+	}
+
+	if p.MetricMapping.KVCacheUtilization != nil {
+		usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUtilization)
+		if err == nil {
+			updated.KVCacheUsagePercent = usage.GetGauge().GetValue()
+		} else {
+			errs = multierr.Append(errs, err)
+		}
+	}
+
+	// Handle LoRA metrics (only when the LoraRequestInfo MetricSpec is configured).
+	if p.MetricMapping.LoraRequestInfo != nil {
+		loraMetrics, _, err := p.getLatestLoraMetric(logger, metricFamilies)
+		errs = multierr.Append(errs, err)
+
+		if loraMetrics != nil {
+			updated.ActiveModels = make(map[string]int)
+			for _, label := range loraMetrics.GetLabel() {
+				if label.GetName() == LoraRequestInfoRunningAdaptersMetricName {
+					if label.GetValue() != "" {
+						adapterList := strings.Split(label.GetValue(), ",")
+						for _, adapter := range adapterList {
+							updated.ActiveModels[adapter] = 0
+						}
 					}
 				}
-			}
-			if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
-				if label.GetValue() != "" {
-					adapterList := strings.Split(label.GetValue(), ",")
-					for _, adapter := range adapterList {
-						updated.ActiveModels[adapter] = 0
+				if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
+					if label.GetValue() != "" {
+						adapterList := strings.Split(label.GetValue(), ",")
+						for _, adapter := range adapterList {
+							updated.ActiveModels[adapter] = 0
+						}
 					}
 				}
-			}
-			if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
-				if label.GetValue() != "" {
-					updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
-					if err != nil {
-						errs = multierr.Append(errs, err)
+				if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
+					if label.GetValue() != "" {
+						updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
+						if err != nil {
+							errs = multierr.Append(errs, err)
+						}
 					}
 				}
 			}
 		}
-
 	}
 
 	return updated, errs
@@ -168,62 +152,80 @@ func promToPodMetrics(
 // reason its specially fetched is because each label key value pair permutation generates new series
 // and only most recent is useful. The value of each series is the creation timestamp so we can
 // retrieve the latest by sorting the value.
-func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
-	loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
+func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
+	if p.MetricMapping.LoraRequestInfo == nil {
+		return nil, time.Time{}, nil // No LoRA metrics configured
+	}
+
+	loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName]
 	if !ok {
-		logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
-		return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
+		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName)
+		return nil, time.Time{}, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName)
 	}
 
 	var latest *dto.Metric
-	var latestTs float64
+	var latestTs float64 // Use float64, as Gauge.Value is float64
 
 	// Iterate over all metrics in the family.
 	for _, m := range loraRequests.GetMetric() {
-		var running, waiting string
-		// Read the label values for running and waiting adapters.
+		running := ""
+		waiting := ""
+		// Track whether the metric carries a running or waiting LoRA adapter label; series with neither are skipped.
+		hasRequiredLabels := false
 		for _, lp := range m.GetLabel() {
 			switch lp.GetName() {
 			case LoraRequestInfoRunningAdaptersMetricName:
 				running = lp.GetValue()
+				hasRequiredLabels = true
 			case LoraRequestInfoWaitingAdaptersMetricName:
 				waiting = lp.GetValue()
+				hasRequiredLabels = true
 			}
 		}
-
-		// Ignore metrics with both labels empty. This happens when there are no running or waiting requests on
-		// the server, in this case it is best to use the last set of active adapters.
+		// Skip series that have neither the running nor the waiting LoRA adapter label.
+		if !hasRequiredLabels {
+			continue
+		}
+		// Ignore metrics with both labels empty.
 		if running == "" && waiting == "" {
 			continue
 		}
 
-		// Select the metric with the latest creation timestamp.
+		// Select the metric with the *largest Gauge Value* (which represents the timestamp).
 		if m.GetGauge().GetValue() > latestTs {
 			latestTs = m.GetGauge().GetValue()
 			latest = m
 		}
 	}
-
 	if latest == nil {
-		logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", LoraRequestInfoMetricName)
+		logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", p.MetricMapping.LoraRequestInfo.MetricName)
 		return nil, time.Time{}, nil
 	}
 
 	// Convert the gauge value (creation timestamp) to time.Time.
-	return latest, time.Unix(0, int64(latestTs*1000)), nil
+	return latest, time.Unix(0, int64(latestTs*1e9)), nil // Scale the gauge value by 1e9 and pass it to time.Unix as nanoseconds.
 }
 
-// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
-// Since vllm doesn't set the timestamp in metric, this metric essentially gets the first metric.
-func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) {
-	mf, ok := metricFamilies[metricName]
+// getMetric retrieves a specific metric based on MetricSpec.
+func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
+	mf, ok := metricFamilies[spec.MetricName]
 	if !ok {
-		logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", metricName)
-		return nil, fmt.Errorf("metric family %q not found", metricName)
+		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", spec.MetricName)
+		return nil, fmt.Errorf("metric family %q not found", spec.MetricName)
 	}
+
 	if len(mf.GetMetric()) == 0 {
-		return nil, fmt.Errorf("no metrics available for %q", metricName)
+		return nil, fmt.Errorf("no metrics available for %q", spec.MetricName)
+	}
+	// If spec.Labels is non-nil (even if empty), select the latest metric whose labels match the spec.
+	if spec.Labels != nil {
+		return getLabeledMetric(logger, mf, spec)
 	}
+	return getLatestMetric(logger, mf)
+}
+
+// getLatestMetric gets the latest metric of a family (for metrics without labels).
+func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, error) {
 	var latestTs int64
 	var latest *dto.Metric
 	for _, m := range mf.GetMetric() {
@@ -232,6 +234,54 @@ func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFa
 			latest = m
 		}
 	}
-	logger.V(logutil.TRACE).Info("Metric value selected", "value", latest, "metric", metricName)
+
+	if latest == nil {
+		return nil, fmt.Errorf("no metrics found for %q", mf.GetName())
+	}
+
+	logger.V(logutil.TRACE).Info("Latest metric value selected", "value", latest, "metric", mf.GetName())
 	return latest, nil
 }
+
+// getLabeledMetric gets the latest metric with matching labels.
+func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
+	var latestMetric *dto.Metric
+	var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater
+
+	for _, m := range mf.GetMetric() {
+		if labelsMatch(m.GetLabel(), spec.Labels) {
+			if m.GetTimestampMs() > latestTimestamp {
+				latestTimestamp = m.GetTimestampMs()
+				latestMetric = m
+			}
+		}
+	}
+
+	if latestMetric != nil {
+		logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "metric", spec.MetricName)
+		return latestMetric, nil
+	}
+
+	return nil, fmt.Errorf("no matching labeled metric found for %q with labels %v", spec.MetricName, spec.Labels)
+}
+
+// labelsMatch checks if a metric's labels contain all the labels in the spec.
+func labelsMatch(metricLabels []*dto.LabelPair, specLabels map[string]string) bool {
+	if len(specLabels) == 0 {
+		return true // No specific labels required
+	}
+
+	for specName, specValue := range specLabels {
+		found := false
+		for _, label := range metricLabels {
+			if label.GetName() == specName && label.GetValue() == specValue {
+				found = true
+				break
+			}
+		}
+		if !found {
+			return false // A required label is missing
+		}
+	}
+	return true // All required labels are present
+}
diff --git a/pkg/epp/backend/metrics_spec.go b/pkg/epp/backend/vllm/metrics_spec.go
similarity index 99%
rename from pkg/epp/backend/metrics_spec.go
rename to pkg/epp/backend/vllm/metrics_spec.go
index 7ce2f5d60..bdd1e6671 100644
--- a/pkg/epp/backend/metrics_spec.go
+++ b/pkg/epp/backend/vllm/metrics_spec.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package backend
+package vllm
 
 import (
 	"fmt"
diff --git a/pkg/epp/backend/metrics_spec_test.go b/pkg/epp/backend/vllm/metrics_spec_test.go
similarity index 99%
rename from pkg/epp/backend/metrics_spec_test.go
rename to pkg/epp/backend/vllm/metrics_spec_test.go
index 141b97386..d73ce21dd 100644
--- a/pkg/epp/backend/metrics_spec_test.go
+++ b/pkg/epp/backend/vllm/metrics_spec_test.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package backend
+package vllm
 
 import (
 	"reflect"
diff --git a/pkg/epp/backend/vllm/metrics_test.go b/pkg/epp/backend/vllm/metrics_test.go
index 5555bd260..0f05185d1 100644
--- a/pkg/epp/backend/vllm/metrics_test.go
+++ b/pkg/epp/backend/vllm/metrics_test.go
@@ -17,234 +17,573 @@ limitations under the License.
 package vllm
 
 import (
-	"errors"
+	"context"
+	"fmt"
+	"reflect"
+	"strconv"
+	"strings"
 	"testing"
 
 	dto "github.com/prometheus/client_model/go"
-	"github.com/stretchr/testify/assert"
+	"go.uber.org/multierr"
 	"google.golang.org/protobuf/proto"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"k8s.io/apimachinery/pkg/types"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
-func TestPromToPodMetrics(t *testing.T) {
+// --- Test Helpers ---
+
+func makeMetric(metricName string, labels map[string]string, value float64, timestampMs int64) *dto.Metric {
+	labelPairs := []*dto.LabelPair{}
+	for k, v := range labels {
+		labelPairs = append(labelPairs, &dto.LabelPair{Name: proto.String(k), Value: proto.String(v)})
+	}
+	return &dto.Metric{
+		Label:       labelPairs,
+		Gauge:       &dto.Gauge{Value: &value},
+		TimestampMs: &timestampMs,
+	}
+}
+
+func makeMetricFamily(name string, metrics ...*dto.Metric) *dto.MetricFamily {
+	return &dto.MetricFamily{
+		Name:   &name,
+		Type:   dto.MetricType_GAUGE.Enum(),
+		Metric: metrics,
+	}
+}
+
+// --- Tests ---
+
+func TestGetMetric(t *testing.T) {
+	logger := logutil.NewTestLogger()
+
+	metricFamilies := map[string]*dto.MetricFamily{
+		"metric1": makeMetricFamily("metric1",
+			makeMetric("metric1", map[string]string{"label1": "value1"}, 1.0, 1000),
+			makeMetric("metric1", map[string]string{"label1": "value2"}, 2.0, 2000),
+		),
+		"metric2": makeMetricFamily("metric2",
+			makeMetric("metric2", map[string]string{"labelA": "A1", "labelB": "B1"}, 3.0, 1500),
+			makeMetric("metric2", map[string]string{"labelA": "A2", "labelB": "B2"}, 4.0, 2500),
+		),
+		"metric3": makeMetricFamily("metric3",
+			makeMetric("metric3", map[string]string{}, 5.0, 3000),
+			makeMetric("metric3", map[string]string{}, 6.0, 1000),
+		),
+	}
+
+	tests := []struct {
+		name        string
+		spec        MetricSpec
+		wantValue   float64
+		wantError   bool
+		shouldPanic bool // When true, the subtest fails unless the call panics; no case sets it yet
+	}{
+		{
+			name: "get labeled metric, exists",
+			spec: MetricSpec{
+				MetricName: "metric1",
+				Labels:     map[string]string{"label1": "value1"},
+			},
+			wantValue: 1.0,
+			wantError: false,
+		},
+		{
+			name: "get labeled metric, wrong value",
+			spec: MetricSpec{
+				MetricName: "metric1",
+				Labels:     map[string]string{"label1": "value3"},
+			},
+			wantValue: -1, // Expect an error, not a specific value
+			wantError: true,
+		},
+		{
+			name: "get labeled metric, missing label",
+			spec: MetricSpec{
+				MetricName: "metric1",
+				Labels:     map[string]string{"label2": "value2"},
+			},
+			wantValue: -1,
+			wantError: true,
+		},
+		{
+			name: "get labeled metric, extra label present",
+			spec: MetricSpec{
+				MetricName: "metric2",
+				Labels:     map[string]string{"labelA": "A1"},
+			},
+			wantValue: 3.0,
+			wantError: false,
+		},
+		{
+			name: "get unlabeled metric, exists",
+			spec: MetricSpec{
+				MetricName: "metric3",
+				Labels:     nil, // Explicitly nil
+			},
+			wantValue: 5.0, // latest metric, which occurs first in our test data
+			wantError: false,
+		},
+		{
+			name: "get unlabeled metric, metric family not found",
+			spec: MetricSpec{
+				MetricName: "metric4",
+				Labels:     nil,
+			},
+			wantValue: -1,
+			wantError: true,
+		},
+		{
+			name: "get labeled metric, metric family not found",
+			spec: MetricSpec{
+				MetricName: "metric4",
+				Labels:     map[string]string{"label1": "value1"},
+			},
+			wantValue: -1,
+			wantError: true,
+		},
+		{
+			name: "get metric, no metrics available",
+			spec: MetricSpec{
+				MetricName: "empty_metric",
+			},
+			wantValue: -1,
+			wantError: true,
+		},
+		{
+			name: "get latest metric",
+			spec: MetricSpec{
+				MetricName: "metric3",
+				Labels:     map[string]string{}, // Empty map, not nil
+			},
+			wantValue: 5.0,
+			wantError: false,
+		},
+	}
+
+	p := &PodMetricsClientImpl{} // No need for MetricMapping here
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if tt.shouldPanic {
+				defer func() {
+					if r := recover(); r == nil {
+						t.Errorf("The code did not panic")
+					}
+				}()
+			}
+
+			gotMetric, err := p.getMetric(logger, metricFamilies, tt.spec)
+
+			if tt.wantError {
+				if err == nil {
+					t.Errorf("getMetric() expected error, got nil")
+				}
+			} else {
+				if err != nil {
+					t.Errorf("getMetric() unexpected error: %v", err)
+				}
+				if gotMetric.GetGauge().GetValue() != tt.wantValue {
+					t.Errorf("getMetric() got value %v, want %v", gotMetric.GetGauge().GetValue(), tt.wantValue)
+				}
+			}
+		})
+	}
+}
+
+func TestLabelsMatch(t *testing.T) {
+	tests := []struct {
+		name         string
+		metricLabels []*dto.LabelPair
+		specLabels   map[string]string
+		want         bool
+	}{
+		{
+			name:         "empty spec labels, should match",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
+			specLabels:   map[string]string{},
+			want:         true,
+		},
+		{
+			name:         "nil spec labels, should match",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
+			specLabels:   nil,
+			want:         true,
+		},
+		{
+			name:         "exact match",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
+			specLabels:   map[string]string{"a": "b"},
+			want:         true,
+		},
+		{
+			name:         "extra labels in metric",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}, {Name: proto.String("c"), Value: proto.String("d")}},
+			specLabels:   map[string]string{"a": "b"},
+			want:         true,
+		},
+		{
+			name:         "missing label in metric",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
+			specLabels:   map[string]string{"a": "b", "c": "d"},
+			want:         false,
+		},
+		{
+			name:         "value mismatch",
+			metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}},
+			specLabels:   map[string]string{"a": "c"},
+			want:         false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := labelsMatch(tt.metricLabels, tt.specLabels); got != tt.want {
+				t.Errorf("labelsMatch() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestGetLatestLoraMetric(t *testing.T) {
 	logger := logutil.NewTestLogger()
 
 	testCases := []struct {
-		name            string
-		metricFamilies  map[string]*dto.MetricFamily
-		initialMetrics  *metrics.Metrics
-		expectedMetrics *metrics.Metrics
-		expectedErr     error
+		name             string
+		metricFamilies   map[string]*dto.MetricFamily
+		expectedAdapters map[string]int
+		expectedMax      int
+		expectedErr      error
+		mapping          *MetricMapping
 	}{
 		{
-			name: "all metrics available",
+			name: "no lora metrics",
 			metricFamilies: map[string]*dto.MetricFamily{
-				RunningQueueSizeMetricName: {
-					Metric: []*dto.Metric{
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(10),
-							},
-							TimestampMs: proto.Int64(100),
-						},
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(15),
-							},
-							TimestampMs: proto.Int64(200), // This is the latest
-						},
-					},
-				},
-				WaitingQueueSizeMetricName: {
-					Metric: []*dto.Metric{
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(20),
-							},
-							TimestampMs: proto.Int64(100),
-						},
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(25),
-							},
-							TimestampMs: proto.Int64(200), // This is the latest
-						},
+				"some_other_metric": makeMetricFamily("some_other_metric",
+					makeMetric("some_other_metric", nil, 1.0, 1000),
+				),
+			},
+			expectedAdapters: nil,
+			expectedMax:      0,
+			expectedErr:      fmt.Errorf("metric family \"vllm:lora_requests_info\" not found"), // Expect an error because the family is missing
+			mapping: &MetricMapping{
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+		},
+		{
+			name: "basic lora metrics",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 3000.0, 1000),       // Newer
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 1000.0, 1000), // Older
+
+				),
+			},
+			expectedAdapters: map[string]int{"lora1": 0},
+			expectedMax:      2,
+			expectedErr:      nil,
+			mapping: &MetricMapping{
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+		},
+		{
+			name: "no matching lora metrics",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"other_label": "value"}, 5.0, 3000),
+				),
+			},
+			expectedAdapters: nil,
+			expectedMax:      0,
+			expectedErr:      nil, // Expect *no* error; just no adapters found
+			mapping: &MetricMapping{
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+		},
+		{
+			name: "no lora metrics if not in MetricMapping",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 5.0, 3000),
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 6.0, 1000),
+				),
+			},
+			expectedAdapters: nil,
+			expectedMax:      0,
+			expectedErr:      nil,
+			mapping:          &MetricMapping{ // No LoRA metrics defined
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			p := &PodMetricsClientImpl{MetricMapping: tc.mapping}
+			loraMetric, _, err := p.getLatestLoraMetric(logger, tc.metricFamilies)
+
+			if tc.expectedErr != nil {
+				if err == nil || err.Error() != tc.expectedErr.Error() {
+					t.Errorf("getLatestLoraMetric() error = %v, wantErr %v", err, tc.expectedErr)
+				}
+				return // Stop here if an error was expected
+			} else if err != nil {
+				t.Fatalf("getLatestLoraMetric() unexpected error: %v", err)
+			}
+
+			if tc.mapping.LoraRequestInfo == nil {
+				if loraMetric != nil {
+					t.Errorf("getLatestLoraMetric() expected nil metric, got %v", loraMetric)
+				}
+				return // Stop if no Lora metrics are expected.
+			}
+
+			if tc.expectedAdapters == nil && loraMetric == nil {
+				return // Both nil, as expected
+			}
+
+			if tc.expectedAdapters != nil && loraMetric != nil { // proceed with checks
+
+				adaptersFound := make(map[string]int)
+				maxLora := 0
+				for _, label := range loraMetric.GetLabel() {
+					if label.GetName() == "running_lora_adapters" && label.GetValue() != "" {
+						for _, adapter := range strings.Split(label.GetValue(), ",") {
+							adaptersFound[adapter] = 0
+						}
+					}
+					if label.GetName() == "waiting_lora_adapters" && label.GetValue() != "" {
+						for _, adapter := range strings.Split(label.GetValue(), ",") {
+							adaptersFound[adapter] = 0 // Overwrite if already present
+						}
+					}
+					if label.GetName() == "max_lora" {
+						var converr error // define err in this scope.
+						maxLora, converr = strconv.Atoi(label.GetValue())
+						if converr != nil && tc.expectedErr == nil { // only report if we don't expect any other errors
+							t.Errorf("getLatestLoraMetric() could not parse max_lora: %v", converr)
+						}
+					}
+				}
+
+				if !reflect.DeepEqual(adaptersFound, tc.expectedAdapters) {
+					t.Errorf("getLatestLoraMetric() adapters = %v, want %v", adaptersFound, tc.expectedAdapters)
+				}
+				if maxLora != tc.expectedMax {
+					t.Errorf("getLatestLoraMetric() maxLora = %v, want %v", maxLora, tc.expectedMax)
+				}
+			} else { // one is nil and the other is not
+				t.Errorf("getLatestLoraMetric(): one of expectedAdapters/loraMetric is nil and the other is not, expected %v, got %v", tc.expectedAdapters, loraMetric)
+			}
+		})
+	}
+}
+
+func TestPromToPodMetrics(t *testing.T) {
+	logger := logutil.NewTestLogger()
+
+	tests := []struct {
+		name             string
+		metricFamilies   map[string]*dto.MetricFamily
+		mapping          *MetricMapping
+		existingMetrics  *datastore.PodMetrics
+		expectedMetrics  *datastore.PodMetrics
+		expectedErrCount int // Count of expected errors
+	}{
+		{
+			name: "vllm metrics",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm_waiting": makeMetricFamily("vllm_waiting",
+					makeMetric("vllm_waiting", nil, 5.0, 1000),
+					makeMetric("vllm_waiting", nil, 7.0, 2000), // Newer
+				),
+				"vllm_usage": makeMetricFamily("vllm_usage",
+					makeMetric("vllm_usage", nil, 0.8, 2000),
+					makeMetric("vllm_usage", nil, 0.7, 500),
+				),
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000),
+				),
+			},
+			mapping: &MetricMapping{
+				TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"},
+				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
+				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+			existingMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
 					},
 				},
-				KVCacheUsagePercentMetricName: {
-					Metric: []*dto.Metric{
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(0.8),
-							},
-							TimestampMs: proto.Int64(100),
-						},
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(0.9),
-							},
-							TimestampMs: proto.Int64(200), // This is the latest
-						},
+				Metrics: datastore.Metrics{}, // Initialize with empty Metrics
+			},
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
 					},
 				},
-				LoraRequestInfoMetricName: {
-					Metric: []*dto.Metric{
-						{
-							Label: []*dto.LabelPair{
-								{
-									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
-									Value: proto.String("lora3,lora4"),
-								},
-								{
-									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
-									Value: proto.String("2"),
-								},
-							},
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(100),
-							},
-						},
-						{
-							Label: []*dto.LabelPair{
-								{
-									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
-									Value: proto.String("lora2"),
-								},
-								{
-									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
-									Value: proto.String("2"),
-								},
-							},
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(90),
-							},
-						},
-					},
+				Metrics: datastore.Metrics{
+					WaitingQueueSize:    7,
+					KVCacheUsagePercent: 0.8,
+					ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
+					MaxActiveModels:     3,
 				},
 			},
-			expectedMetrics: &metrics.Metrics{
-				RunningQueueSize:    15,
-				WaitingQueueSize:    25,
-				KVCacheUsagePercent: 0.9,
-				ActiveModels: map[string]int{
-					"lora3": 0,
-					"lora4": 0,
-				},
-				MaxActiveModels: 2,
+			expectedErrCount: 0,
+		},
+		{
+			name:           "missing metrics",
+			metricFamilies: map[string]*dto.MetricFamily{}, // No metrics
+			mapping: &MetricMapping{
+				TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"},
+				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
+				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
 			},
-			initialMetrics: &metrics.Metrics{},
-			expectedErr:    nil,
+			existingMetrics:  &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
+			expectedMetrics:  &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
+			expectedErrCount: 3, // One error for each of the three mapped metrics
 		},
 		{
-			name: "invalid max lora",
+			name: "partial metrics available + LoRA",
 			metricFamilies: map[string]*dto.MetricFamily{
-				RunningQueueSizeMetricName: {
-					Metric: []*dto.Metric{
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(10),
-							},
-							TimestampMs: proto.Int64(100),
-						},
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(15),
-							},
-							TimestampMs: proto.Int64(200), // This is the latest
-						},
+				"vllm_usage": makeMetricFamily("vllm_usage",
+					makeMetric("vllm_usage", nil, 0.8, 2000), // Only usage is present
+				),
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000),
+				),
+			},
+			mapping: &MetricMapping{
+				TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, // Not Present
+				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
+				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+			existingMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
 					},
 				},
-				WaitingQueueSizeMetricName: {
-					Metric: []*dto.Metric{
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(20),
-							},
-							TimestampMs: proto.Int64(100),
-						},
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(25),
-							},
-							TimestampMs: proto.Int64(200), // This is the latest
-						},
+				Metrics: datastore.Metrics{}, // Initialize with empty Metrics
+			},
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
 					},
 				},
-				KVCacheUsagePercentMetricName: {
-					Metric: []*dto.Metric{
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(0.8),
-							},
-							TimestampMs: proto.Int64(100),
-						},
-						{
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(0.9),
-							},
-							TimestampMs: proto.Int64(200), // This is the latest
-						},
-					},
+				Metrics: datastore.Metrics{
+					WaitingQueueSize:    0,
+					KVCacheUsagePercent: 0.8,
+					ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
+					MaxActiveModels:     3,
 				},
-				LoraRequestInfoMetricName: {
-					Metric: []*dto.Metric{
-						{
-							Label: []*dto.LabelPair{
-								{
-									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
-									Value: proto.String("lora3,lora4"),
-								},
-								{
-									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
-									Value: proto.String("2a"),
-								},
-							},
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(100),
-							},
-						},
-						{
-							Label: []*dto.LabelPair{
-								{
-									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
-									Value: proto.String("lora2"),
-								},
-								{
-									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
-									Value: proto.String("2"),
-								},
-							},
-							Gauge: &dto.Gauge{
-								Value: proto.Float64(90),
-							},
-						},
+			},
+			expectedErrCount: 1, // Only vllm_waiting is absent, so a single error
+		},
+		{
+			name: "invalid max lora",
+			metricFamilies: map[string]*dto.MetricFamily{
+				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "invalid"}, 3000.0, 1000),
+				),
+			},
+			mapping: &MetricMapping{
+				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
+			},
+			existingMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
 					},
 				},
+				Metrics: datastore.Metrics{},
 			},
-			expectedMetrics: &metrics.Metrics{
-				RunningQueueSize:    15,
-				WaitingQueueSize:    25,
-				KVCacheUsagePercent: 0.9,
-				ActiveModels: map[string]int{
-					"lora3": 0,
-					"lora4": 0,
+			expectedMetrics: &datastore.PodMetrics{
+				Pod: datastore.Pod{
+					Address: "127.0.0.1",
+					NamespacedName: types.NamespacedName{
+						Namespace: "test",
+						Name:      "pod",
+					},
+				},
+				Metrics: datastore.Metrics{
+					ActiveModels:    map[string]int{"lora1": 0},
+					MaxActiveModels: 0, // Should still default to 0.
+
 				},
-				MaxActiveModels: 0,
 			},
-			initialMetrics: &metrics.Metrics{},
-			expectedErr:    errors.New("strconv.Atoi: parsing '2a': invalid syntax"),
+			expectedErrCount: 1, // Expect *one* error
 		},
 	}
-	for _, tc := range testCases {
+
+	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
-			updated, err := promToPodMetrics(logger, tc.metricFamilies, tc.initialMetrics)
-			if tc.expectedErr != nil {
-				assert.Error(t, err)
+			p := &PodMetricsClientImpl{MetricMapping: tc.mapping}
+			updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics)
+
+			if tc.expectedErrCount == 0 {
+				if err != nil {
+					t.Errorf("promToPodMetrics() unexpected error: %v", err)
+				}
 			} else {
-				assert.NoError(t, err)
-				assert.Equal(t, tc.expectedMetrics, updated)
+				if err == nil {
+					t.Errorf("promToPodMetrics() expected errors, got nil")
+				} else {
+					// Check the *number* of errors.  multierr.Errors() gives us a slice
+					if len(multierr.Errors(err)) != tc.expectedErrCount {
+						t.Errorf("promToPodMetrics() wrong number of errors: got %d, want %d.  Errors: %v", len(multierr.Errors(err)), tc.expectedErrCount, err)
+					}
+
+				}
+			}
+			// Compare with reflect.DeepEqual (exact match, no floating-point tolerance).
+			if !reflect.DeepEqual(updated, tc.expectedMetrics) {
+				t.Errorf("promToPodMetrics() got %+v, want %+v", updated, tc.expectedMetrics)
 			}
 		})
 	}
 }
+
+// TestFetchMetrics is a basic integration test.  A more complete test would mock
+// the HTTP client.
+func TestFetchMetrics(t *testing.T) {
+	// This test is very basic as it doesn't mock the HTTP client.  It assumes
+	// there's no server running on the specified port.  A real-world test
+	// suite should use a mock server.
+	ctx := logutil.NewTestLoggerIntoContext(context.Background())
+	existing := &datastore.PodMetrics{
+		Pod: datastore.Pod{
+			Address: "127.0.0.1",
+			NamespacedName: types.NamespacedName{
+				Namespace: "test",
+				Name:      "pod",
+			},
+		},
+	}
+	p := &PodMetricsClientImpl{} // No MetricMapping needed for this basic test
+
+	_, err := p.FetchMetrics(ctx, existing, 9999) // Use a port that's unlikely to be in use.
+	if err == nil {
+		t.Errorf("FetchMetrics() expected error, got nil")
+	}
+	// Check for a specific error message (fragile, but OK for this example)
+	expectedSubstr := "connection refused"
+	if err != nil && !strings.Contains(err.Error(), expectedSubstr) {
+		t.Errorf("FetchMetrics() error = %v, want error containing %q", err, expectedSubstr)
+	}
+}

From 1c367a6ecbf4bbd5f8f83703cbd45f1cd8bd3e3f Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 7 Mar 2025 22:55:41 +0000
Subject: [PATCH 10/19] re-add todos and rename kv flag to reflect percentage
 usage.

---
 cmd/epp/main.go                 | 4 ++--
 pkg/epp/backend/vllm/metrics.go | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index d3c1ab09b..a0441d4a6 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -97,7 +97,7 @@ var (
 	totalQueuedRequestMetric = flag.String("totalQueuedRequestMetric",
 		"vllm:num_requests_waiting",
 		"Prometheus metric for the number of queued requests.")
-	kVCacheUsageMetric = flag.String("kVCacheUsageMetric",
+	kvCacheUsagePercentageMetric = flag.String("kvCacheUsagePercentageMetric",
 		"vllm:gpu_cache_usage_perc",
 		"Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).")
 	// LoRA metrics
@@ -162,7 +162,7 @@ func run() error {
 	// Set up mapper for metric scraping.
 	mapping, err := vllm.NewMetricMapping(
 		*totalQueuedRequestMetric,
-		*kVCacheUsageMetric,
+		*kvCacheUsagePercentageMetric,
 		*loraRequestInfoMetric,
 	)
 	if err != nil {
diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go
index 4c1532080..6d181b612 100644
--- a/pkg/epp/backend/vllm/metrics.go
+++ b/pkg/epp/backend/vllm/metrics.go
@@ -34,7 +34,7 @@ import (
 )
 
 const (
-	// Hardcoded vLLM specific LoRA metrics
+	// LoRA metrics based on protocol
 	LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
 	LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
 	LoraRequestInfoMaxAdaptersMetricName     = "max_lora"
@@ -53,6 +53,8 @@ func (p *PodMetricsClientImpl) FetchMetrics(
 	logger := log.FromContext(ctx)
 	loggerDefault := logger.V(logutil.DEFAULT)
 
+	// Currently the metrics endpoint is hard-coded, which works with vLLM.
+	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config.
 	url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
 
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)

From 3356bd30eff9ab5678d524bcfd564dcd78a7ffd1 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Thu, 13 Mar 2025 20:46:47 +0000
Subject: [PATCH 11/19] Fix nits, move logging channel for backend/metrics.go
 from default to trace, fix comments.

---
 cmd/epp/main.go                           |   5 +-
 pkg/epp/backend/vllm/metrics.go           |  14 +--
 pkg/epp/backend/vllm/metrics_spec.go      |   6 +-
 pkg/epp/backend/vllm/metrics_spec_test.go |  19 ++--
 pkg/epp/backend/vllm/metrics_test.go      | 109 +++++++++-------------
 5 files changed, 65 insertions(+), 88 deletions(-)

diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index a0441d4a6..277cff37f 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -40,6 +40,7 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm"
+	servermetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
@@ -160,7 +161,7 @@ func run() error {
 	datastore := datastore.NewDatastore(ctx, pmf)
 
 	// Set up mapper for metric scraping.
-	mapping, err := vllm.NewMetricMapping(
+	mapping, err := servermetrics.NewMetricMapping(
 		*totalQueuedRequestMetric,
 		*kvCacheUsagePercentageMetric,
 		*loraRequestInfoMetric,
@@ -169,7 +170,7 @@ func run() error {
 		setupLog.Error(err, "Failed to create metric mapping from flags.")
 		return err
 	}
-	provider := backend.NewProvider(&vllm.PodMetricsClientImpl{MetricMapping: mapping}, datastore)
+	provider := backend.NewProvider(&servermetrics.PodMetricsClientImpl{MetricMapping: mapping}, datastore)
 	//
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go
index 6d181b612..1328a7672 100644
--- a/pkg/epp/backend/vllm/metrics.go
+++ b/pkg/epp/backend/vllm/metrics.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package vllm
+package metrics
 
 import (
 	"context"
@@ -161,7 +161,7 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam
 
 	loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName]
 	if !ok {
-		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName)
+		logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName)
 		return nil, time.Time{}, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName)
 	}
 
@@ -212,7 +212,7 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam
 func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
 	mf, ok := metricFamilies[spec.MetricName]
 	if !ok {
-		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", spec.MetricName)
+		logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", spec.MetricName)
 		return nil, fmt.Errorf("metric family %q not found", spec.MetricName)
 	}
 
@@ -221,14 +221,14 @@ func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[
 	}
 	// if there is a specified label, return only that metric in the family
 	if spec.Labels != nil {
-		return getLabeledMetric(logger, mf, spec)
+		return getLabeledMetric(logger, mf, &spec)
 	}
 	return getLatestMetric(logger, mf)
 }
 
 // getLatestMetric gets the latest metric of a family (for metrics without labels).
 func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, error) {
-	var latestTs int64
+	var latestTs int64 = -1
 	var latest *dto.Metric
 	for _, m := range mf.GetMetric() {
 		if m.GetTimestampMs() >= latestTs {
@@ -246,12 +246,12 @@ func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, err
 }
 
 // getLabeledMetric gets the latest metric with matching labels.
-func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
+func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec) (*dto.Metric, error) {
 	var latestMetric *dto.Metric
 	var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater
 
 	for _, m := range mf.GetMetric() {
-		if labelsMatch(m.GetLabel(), spec.Labels) {
+		if spec == nil || labelsMatch(m.GetLabel(), spec.Labels) {
 			if m.GetTimestampMs() > latestTimestamp {
 				latestTimestamp = m.GetTimestampMs()
 				latestMetric = m
diff --git a/pkg/epp/backend/vllm/metrics_spec.go b/pkg/epp/backend/vllm/metrics_spec.go
index bdd1e6671..bd8f39ccf 100644
--- a/pkg/epp/backend/vllm/metrics_spec.go
+++ b/pkg/epp/backend/vllm/metrics_spec.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package vllm
+package metrics
 
 import (
 	"fmt"
@@ -41,10 +41,6 @@ type MetricMapping struct {
 //	"metric_name{label1=value1}"
 //	"metric_name{label1=value1,label2=value2}"
 func stringToMetricSpec(specStr string) (*MetricSpec, error) {
-	if specStr == "" {
-		return nil, nil // Allow empty strings to represent nil MetricSpecs
-	}
-
 	specStr = strings.TrimSpace(specStr)
 	metricName := specStr
 	labels := make(map[string]string)
diff --git a/pkg/epp/backend/vllm/metrics_spec_test.go b/pkg/epp/backend/vllm/metrics_spec_test.go
index d73ce21dd..8de6dac29 100644
--- a/pkg/epp/backend/vllm/metrics_spec_test.go
+++ b/pkg/epp/backend/vllm/metrics_spec_test.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package vllm
+package metrics
 
 import (
 	"reflect"
@@ -32,7 +32,7 @@ func TestStringToMetricSpec(t *testing.T) {
 			name:    "empty string",
 			input:   "",
 			want:    nil,
-			wantErr: false,
+			wantErr: true,
 		},
 		{
 			name:  "no labels",
@@ -152,9 +152,14 @@ func TestStringToMetricSpec(t *testing.T) {
 				t.Errorf("stringToMetricSpec() error = %v, wantErr %v", err, tt.wantErr)
 				return
 			}
-			if tt.want != nil && got != nil { // compare maps directly
-				if tt.want.Labels == nil {
-					tt.want.Labels = make(map[string]string)
+			if tt.wantErr {
+				if got != nil { // handles if we got a nil spec and didn't expect an error
+					t.Errorf("stringToMetricSpec() = %v, want %v", got, tt.want)
+					return
+				}
+			} else {
+				if got == nil {
+					t.Errorf("stringToMetricSpec() = got nil but wanted %v", tt.want)
 				}
 				if !reflect.DeepEqual(got.MetricName, tt.want.MetricName) {
 					t.Errorf("stringToMetricSpec() got MetricName = %v, want %v", got.MetricName, tt.want.MetricName)
@@ -162,11 +167,7 @@ func TestStringToMetricSpec(t *testing.T) {
 				if !reflect.DeepEqual(got.Labels, tt.want.Labels) {
 					t.Errorf("stringToMetricSpec() got Labels = %v, want %v", got.Labels, tt.want.Labels)
 				}
-			} else if tt.want != got { // handles if one is nil and the other isn't
-				t.Errorf("stringToMetricSpec() = %v, want %v", got, tt.want)
-
 			}
-
 		})
 	}
 }
diff --git a/pkg/epp/backend/vllm/metrics_test.go b/pkg/epp/backend/vllm/metrics_test.go
index 0f05185d1..3bc4fc703 100644
--- a/pkg/epp/backend/vllm/metrics_test.go
+++ b/pkg/epp/backend/vllm/metrics_test.go
@@ -14,10 +14,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package vllm
+package metrics
 
 import (
 	"context"
+
+	"errors"
 	"fmt"
 	"reflect"
 	"strconv"
@@ -25,7 +27,7 @@ import (
 	"testing"
 
 	dto "github.com/prometheus/client_model/go"
-	"go.uber.org/multierr"
+	"github.com/stretchr/testify/assert"
 	"google.golang.org/protobuf/proto"
 	"k8s.io/apimachinery/pkg/types"
 
@@ -76,11 +78,10 @@ func TestGetMetric(t *testing.T) {
 	}
 
 	tests := []struct {
-		name        string
-		spec        MetricSpec
-		wantValue   float64
-		wantError   bool
-		shouldPanic bool // Add this
+		name           string
+		spec           MetricSpec
+		wantGaugeValue float64
+		wantError      bool
 	}{
 		{
 			name: "get labeled metric, exists",
@@ -88,8 +89,8 @@ func TestGetMetric(t *testing.T) {
 				MetricName: "metric1",
 				Labels:     map[string]string{"label1": "value1"},
 			},
-			wantValue: 1.0,
-			wantError: false,
+			wantGaugeValue: 1.0,
+			wantError:      false,
 		},
 		{
 			name: "get labeled metric, wrong value",
@@ -97,8 +98,8 @@ func TestGetMetric(t *testing.T) {
 				MetricName: "metric1",
 				Labels:     map[string]string{"label1": "value3"},
 			},
-			wantValue: -1, // Expect an error, not a specific value
-			wantError: true,
+			wantGaugeValue: -1, // Expect an error, not a specific value
+			wantError:      true,
 		},
 		{
 			name: "get labeled metric, missing label",
@@ -106,8 +107,8 @@ func TestGetMetric(t *testing.T) {
 				MetricName: "metric1",
 				Labels:     map[string]string{"label2": "value2"},
 			},
-			wantValue: -1,
-			wantError: true,
+			wantGaugeValue: -1,
+			wantError:      true,
 		},
 		{
 			name: "get labeled metric, extra label present",
@@ -115,8 +116,8 @@ func TestGetMetric(t *testing.T) {
 				MetricName: "metric2",
 				Labels:     map[string]string{"labelA": "A1"},
 			},
-			wantValue: 3.0,
-			wantError: false,
+			wantGaugeValue: 3.0,
+			wantError:      false,
 		},
 		{
 			name: "get unlabeled metric, exists",
@@ -124,8 +125,8 @@ func TestGetMetric(t *testing.T) {
 				MetricName: "metric3",
 				Labels:     nil, // Explicitly nil
 			},
-			wantValue: 5.0, // latest metric, which occurs first in our test data
-			wantError: false,
+			wantGaugeValue: 5.0, // latest metric, which occurs first in our test data
+			wantError:      false,
 		},
 		{
 			name: "get unlabeled metric, metric family not found",
@@ -133,8 +134,8 @@ func TestGetMetric(t *testing.T) {
 				MetricName: "metric4",
 				Labels:     nil,
 			},
-			wantValue: -1,
-			wantError: true,
+			wantGaugeValue: -1,
+			wantError:      true,
 		},
 		{
 			name: "get labeled metric, metric family not found",
@@ -142,16 +143,16 @@ func TestGetMetric(t *testing.T) {
 				MetricName: "metric4",
 				Labels:     map[string]string{"label1": "value1"},
 			},
-			wantValue: -1,
-			wantError: true,
+			wantGaugeValue: -1,
+			wantError:      true,
 		},
 		{
 			name: "get metric, no metrics available",
 			spec: MetricSpec{
 				MetricName: "empty_metric",
 			},
-			wantValue: -1,
-			wantError: true,
+			wantGaugeValue: -1,
+			wantError:      true,
 		},
 		{
 			name: "get latest metric",
@@ -159,8 +160,8 @@ func TestGetMetric(t *testing.T) {
 				MetricName: "metric3",
 				Labels:     map[string]string{}, // Empty map, not nil
 			},
-			wantValue: 5.0,
-			wantError: false,
+			wantGaugeValue: 5.0,
+			wantError:      false,
 		},
 	}
 
@@ -168,13 +169,6 @@ func TestGetMetric(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			if tt.shouldPanic {
-				defer func() {
-					if r := recover(); r == nil {
-						t.Errorf("The code did not panic")
-					}
-				}()
-			}
 
 			gotMetric, err := p.getMetric(logger, metricFamilies, tt.spec)
 
@@ -184,10 +178,10 @@ func TestGetMetric(t *testing.T) {
 				}
 			} else {
 				if err != nil {
-					t.Errorf("getMetric() unexpected error: %v", err)
+					t.Fatalf("getMetric() unexpected error: %v", err)
 				}
-				if gotMetric.GetGauge().GetValue() != tt.wantValue {
-					t.Errorf("getMetric() got value %v, want %v", gotMetric.GetGauge().GetValue(), tt.wantValue)
+				if gotMetric.GetGauge().GetValue() != tt.wantGaugeValue {
+					t.Errorf("getMetric() got value %v, want %v", gotMetric.GetGauge().GetValue(), tt.wantGaugeValue)
 				}
 			}
 		})
@@ -385,12 +379,12 @@ func TestPromToPodMetrics(t *testing.T) {
 	logger := logutil.NewTestLogger()
 
 	tests := []struct {
-		name             string
-		metricFamilies   map[string]*dto.MetricFamily
-		mapping          *MetricMapping
-		existingMetrics  *datastore.PodMetrics
-		expectedMetrics  *datastore.PodMetrics
-		expectedErrCount int // Count of expected errors
+		name            string
+		metricFamilies  map[string]*dto.MetricFamily
+		mapping         *MetricMapping
+		existingMetrics *datastore.PodMetrics
+		expectedMetrics *datastore.PodMetrics
+		expectedErr     error // Count of expected errors
 	}{
 		{
 			name: "vllm metrics",
@@ -437,7 +431,6 @@ func TestPromToPodMetrics(t *testing.T) {
 					MaxActiveModels:     3,
 				},
 			},
-			expectedErrCount: 0,
 		},
 		{
 			name:           "missing metrics",
@@ -447,9 +440,9 @@ func TestPromToPodMetrics(t *testing.T) {
 				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
 				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
 			},
-			existingMetrics:  &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
-			expectedMetrics:  &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
-			expectedErrCount: 3, // Errors for all 4 main metrics
+			existingMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
+			expectedMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
+			expectedErr:     errors.New("strconv.Atoi: parsing '2a': invalid syntax"),
 		},
 		{
 			name: "partial metrics available + LoRA",
@@ -491,7 +484,7 @@ func TestPromToPodMetrics(t *testing.T) {
 					MaxActiveModels:     3,
 				},
 			},
-			expectedErrCount: 1, // Errors for the two missing metrics
+			expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"),
 		},
 		{
 			name: "invalid max lora",
@@ -527,7 +520,7 @@ func TestPromToPodMetrics(t *testing.T) {
 
 				},
 			},
-			expectedErrCount: 1, // Expect *one* error
+			expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"),
 		},
 	}
 
@@ -535,25 +528,11 @@ func TestPromToPodMetrics(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			p := &PodMetricsClientImpl{MetricMapping: tc.mapping}
 			updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics)
-
-			if tc.expectedErrCount == 0 {
-				if err != nil {
-					t.Errorf("promToPodMetrics() unexpected error: %v", err)
-				}
+			if tc.expectedErr != nil {
+				assert.Error(t, err)
 			} else {
-				if err == nil {
-					t.Errorf("promToPodMetrics() expected errors, got nil")
-				} else {
-					// Check the *number* of errors.  multierr.Errors() gives us a slice
-					if len(multierr.Errors(err)) != tc.expectedErrCount {
-						t.Errorf("promToPodMetrics() wrong number of errors: got %d, want %d.  Errors: %v", len(multierr.Errors(err)), tc.expectedErrCount, err)
-					}
-
-				}
-			}
-			// Use podMetricsEqual for comparison with tolerance.
-			if !reflect.DeepEqual(updated, tc.expectedMetrics) {
-				t.Errorf("promToPodMetrics() got %+v, want %+v", updated, tc.expectedMetrics)
+				assert.NoError(t, err)
+				assert.Equal(t, tc.expectedMetrics, updated)
 			}
 		})
 	}

From 371fd582393dee21a162d693130fbf92e0a5c8ac Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Thu, 13 Mar 2025 23:37:27 +0000
Subject: [PATCH 12/19] Rebase onto metric-agnostic redesign.

---
 cmd/epp/main.go                               |  16 +-
 pkg/epp/backend/{vllm => metrics}/metrics.go  |  20 +-
 .../backend/{vllm => metrics}/metrics_spec.go |   0
 .../{vllm => metrics}/metrics_spec_test.go    |   0
 .../backend/{vllm => metrics}/metrics_test.go | 133 ++++---------
 pkg/epp/backend/provider.go                   | 183 ------------------
 6 files changed, 53 insertions(+), 299 deletions(-)
 rename pkg/epp/backend/{vllm => metrics}/metrics.go (94%)
 rename pkg/epp/backend/{vllm => metrics}/metrics_spec.go (100%)
 rename pkg/epp/backend/{vllm => metrics}/metrics_spec_test.go (100%)
 rename pkg/epp/backend/{vllm => metrics}/metrics_test.go (82%)
 delete mode 100644 pkg/epp/backend/provider.go

diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index 277cff37f..634cda4a2 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -37,10 +37,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/manager"
 	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
 	"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm"
-	servermetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
@@ -156,12 +153,8 @@ func run() error {
 
 	ctx := ctrl.SetupSignalHandler()
 
-	pmf := backendmetrics.NewPodMetricsFactory(&vllm.PodMetricsClientImpl{}, *refreshMetricsInterval)
-	// Setup runner.
-	datastore := datastore.NewDatastore(ctx, pmf)
-
 	// Set up mapper for metric scraping.
-	mapping, err := servermetrics.NewMetricMapping(
+	mapping, err := backendmetrics.NewMetricMapping(
 		*totalQueuedRequestMetric,
 		*kvCacheUsagePercentageMetric,
 		*loraRequestInfoMetric,
@@ -170,8 +163,11 @@ func run() error {
 		setupLog.Error(err, "Failed to create metric mapping from flags.")
 		return err
 	}
-	provider := backend.NewProvider(&servermetrics.PodMetricsClientImpl{MetricMapping: mapping}, datastore)
-	//
+
+	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval)
+	// Setup runner.
+	datastore := datastore.NewDatastore(ctx, pmf)
+
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
 		DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace,
diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/metrics/metrics.go
similarity index 94%
rename from pkg/epp/backend/vllm/metrics.go
rename to pkg/epp/backend/metrics/metrics.go
index 1328a7672..cc988758f 100644
--- a/pkg/epp/backend/vllm/metrics.go
+++ b/pkg/epp/backend/metrics/metrics.go
@@ -29,7 +29,6 @@ import (
 	"github.com/prometheus/common/expfmt"
 	"go.uber.org/multierr"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
@@ -47,15 +46,16 @@ type PodMetricsClientImpl struct {
 // FetchMetrics fetches metrics from a given pod.
 func (p *PodMetricsClientImpl) FetchMetrics(
 	ctx context.Context,
-	existing *datastore.PodMetrics,
+	pod *Pod,
+	existing *Metrics,
 	port int32,
-) (*datastore.PodMetrics, error) {
+) (*Metrics, error) {
 	logger := log.FromContext(ctx)
 	loggerDefault := logger.V(logutil.DEFAULT)
 
 	// Currently the metrics endpoint is hard-coded, which works with vLLM.
 	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config.
-	url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
+	url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
 
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
 	if err != nil {
@@ -64,16 +64,16 @@ func (p *PodMetricsClientImpl) FetchMetrics(
 	}
 	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
-		loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName)
-		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err)
+		loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName)
+		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err)
 	}
 	defer func() {
 		_ = resp.Body.Close()
 	}()
 
 	if resp.StatusCode != http.StatusOK {
-		loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode)
-		return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode)
+		loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode)
+		return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode)
 	}
 
 	parser := expfmt.TextParser{}
@@ -88,8 +88,8 @@ func (p *PodMetricsClientImpl) FetchMetrics(
 func (p *PodMetricsClientImpl) promToPodMetrics(
 	logger logr.Logger,
 	metricFamilies map[string]*dto.MetricFamily,
-	existing *datastore.PodMetrics,
-) (*datastore.PodMetrics, error) {
+	existing *Metrics,
+) (*Metrics, error) {
 	var errs error
 	updated := existing.Clone()
 
diff --git a/pkg/epp/backend/vllm/metrics_spec.go b/pkg/epp/backend/metrics/metrics_spec.go
similarity index 100%
rename from pkg/epp/backend/vllm/metrics_spec.go
rename to pkg/epp/backend/metrics/metrics_spec.go
diff --git a/pkg/epp/backend/vllm/metrics_spec_test.go b/pkg/epp/backend/metrics/metrics_spec_test.go
similarity index 100%
rename from pkg/epp/backend/vllm/metrics_spec_test.go
rename to pkg/epp/backend/metrics/metrics_spec_test.go
diff --git a/pkg/epp/backend/vllm/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go
similarity index 82%
rename from pkg/epp/backend/vllm/metrics_test.go
rename to pkg/epp/backend/metrics/metrics_test.go
index 3bc4fc703..41a3eb9ae 100644
--- a/pkg/epp/backend/vllm/metrics_test.go
+++ b/pkg/epp/backend/metrics/metrics_test.go
@@ -18,7 +18,6 @@ package metrics
 
 import (
 	"context"
-
 	"errors"
 	"fmt"
 	"reflect"
@@ -28,10 +27,10 @@ import (
 
 	dto "github.com/prometheus/client_model/go"
 	"github.com/stretchr/testify/assert"
+	"go.uber.org/multierr"
 	"google.golang.org/protobuf/proto"
 	"k8s.io/apimachinery/pkg/types"
 
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
@@ -377,13 +376,12 @@ func TestGetLatestLoraMetric(t *testing.T) {
 
 func TestPromToPodMetrics(t *testing.T) {
 	logger := logutil.NewTestLogger()
-
 	tests := []struct {
 		name            string
 		metricFamilies  map[string]*dto.MetricFamily
 		mapping         *MetricMapping
-		existingMetrics *datastore.PodMetrics
-		expectedMetrics *datastore.PodMetrics
+		existingMetrics *Metrics
+		expectedMetrics *Metrics
 		expectedErr     error // Count of expected errors
 	}{
 		{
@@ -398,7 +396,7 @@ func TestPromToPodMetrics(t *testing.T) {
 					makeMetric("vllm_usage", nil, 0.7, 500),
 				),
 				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000),
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000),
 				),
 			},
 			mapping: &MetricMapping{
@@ -406,30 +404,12 @@ func TestPromToPodMetrics(t *testing.T) {
 				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
 				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
 			},
-			existingMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{}, // Initialize with empty Metrics
-			},
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					WaitingQueueSize:    7,
-					KVCacheUsagePercent: 0.8,
-					ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
-					MaxActiveModels:     3,
-				},
+			existingMetrics: &Metrics{},
+			expectedMetrics: &Metrics{
+				WaitingQueueSize:    7,
+				KVCacheUsagePercent: 0.8,
+				ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
+				MaxActiveModels:     3,
 			},
 		},
 		{
@@ -440,9 +420,9 @@ func TestPromToPodMetrics(t *testing.T) {
 				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
 				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
 			},
-			existingMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
-			expectedMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}},
-			expectedErr:     errors.New("strconv.Atoi: parsing '2a': invalid syntax"),
+			existingMetrics: &Metrics{ActiveModels: map[string]int{}},
+			expectedMetrics: &Metrics{ActiveModels: map[string]int{}},
+			expectedErr:     multierr.Combine(fmt.Errorf("metric family \"vllm_waiting\" not found"), fmt.Errorf("metric family \"vllm_usage\" not found"), fmt.Errorf("metric family \"vllm:lora_requests_info\" not found")),
 		},
 		{
 			name: "partial metrics available + LoRA",
@@ -451,7 +431,7 @@ func TestPromToPodMetrics(t *testing.T) {
 					makeMetric("vllm_usage", nil, 0.8, 2000), // Only usage is present
 				),
 				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000),
+					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000),
 				),
 			},
 			mapping: &MetricMapping{
@@ -459,32 +439,14 @@ func TestPromToPodMetrics(t *testing.T) {
 				KVCacheUtilization:  &MetricSpec{MetricName: "vllm_usage"},
 				LoraRequestInfo:     &MetricSpec{MetricName: "vllm:lora_requests_info"},
 			},
-			existingMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{}, // Initialize with empty Metrics
-			},
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					WaitingQueueSize:    0,
-					KVCacheUsagePercent: 0.8,
-					ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
-					MaxActiveModels:     3,
-				},
+			existingMetrics: &Metrics{},
+			expectedMetrics: &Metrics{
+				WaitingQueueSize:    0,
+				KVCacheUsagePercent: 0.8,
+				ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
+				MaxActiveModels:     3,
 			},
-			expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"),
+			expectedErr: fmt.Errorf("metric family \"vllm_waiting\" not found"),
 		},
 		{
 			name: "invalid max lora",
@@ -496,31 +458,13 @@ func TestPromToPodMetrics(t *testing.T) {
 			mapping: &MetricMapping{
 				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
 			},
-			existingMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{},
-			},
-			expectedMetrics: &datastore.PodMetrics{
-				Pod: datastore.Pod{
-					Address: "127.0.0.1",
-					NamespacedName: types.NamespacedName{
-						Namespace: "test",
-						Name:      "pod",
-					},
-				},
-				Metrics: datastore.Metrics{
-					ActiveModels:    map[string]int{"lora1": 0},
-					MaxActiveModels: 0, // Should still default to 0.
-
-				},
+			existingMetrics: &Metrics{},
+			expectedMetrics: &Metrics{
+				ActiveModels:    map[string]int{"lora1": 0},
+				MaxActiveModels: 0, // Should still default to 0.
+
 			},
-			expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"),
+			expectedErr: errors.New("strconv.Atoi: parsing \"invalid\": invalid syntax"),
 		},
 	}
 
@@ -530,6 +474,7 @@ func TestPromToPodMetrics(t *testing.T) {
 			updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics)
 			if tc.expectedErr != nil {
 				assert.Error(t, err)
+				assert.EqualError(t, err, tc.expectedErr.Error())
 			} else {
 				assert.NoError(t, err)
 				assert.Equal(t, tc.expectedMetrics, updated)
@@ -538,25 +483,21 @@ func TestPromToPodMetrics(t *testing.T) {
 	}
 }
 
-// TestFetchMetrics is a basic integration test.  A more complete test would mock
-// the HTTP client.
+// TestFetchMetrics is a basic integration test. It assumes
+// there's no server running on the specified port.
 func TestFetchMetrics(t *testing.T) {
-	// This test is very basic as it doesn't mock the HTTP client.  It assumes
-	// there's no server running on the specified port.  A real-world test
-	// suite should use a mock server.
 	ctx := logutil.NewTestLoggerIntoContext(context.Background())
-	existing := &datastore.PodMetrics{
-		Pod: datastore.Pod{
-			Address: "127.0.0.1",
-			NamespacedName: types.NamespacedName{
-				Namespace: "test",
-				Name:      "pod",
-			},
+	pod := &Pod{
+		Address: "127.0.0.1",
+		NamespacedName: types.NamespacedName{
+			Namespace: "test",
+			Name:      "pod",
 		},
 	}
+	existing := &Metrics{}
 	p := &PodMetricsClientImpl{} // No MetricMapping needed for this basic test
 
-	_, err := p.FetchMetrics(ctx, existing, 9999) // Use a port that's unlikely to be in use.
+	_, err := p.FetchMetrics(ctx, pod, existing, 9999) // Use a port that's unlikely to be in use.
 	if err == nil {
 		t.Errorf("FetchMetrics() expected error, got nil")
 	}
diff --git a/pkg/epp/backend/provider.go b/pkg/epp/backend/provider.go
deleted file mode 100644
index 959f3e0c9..000000000
--- a/pkg/epp/backend/provider.go
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
-Copyright 2025 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package backend
-
-import (
-	"context"
-	"fmt"
-	"sync"
-	"time"
-
-	"github.com/go-logr/logr"
-	"go.uber.org/multierr"
-	"sigs.k8s.io/controller-runtime/pkg/log"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
-	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
-)
-
-const (
-	fetchMetricsTimeout = 5 * time.Second
-)
-
-func NewProvider(pmc PodMetricsClient, datastore datastore.Datastore) *Provider {
-	p := &Provider{
-		pmc:       pmc,
-		datastore: datastore,
-	}
-	return p
-}
-
-// Provider provides backend pods and information such as metrics.
-type Provider struct {
-	pmc       PodMetricsClient
-	datastore datastore.Datastore
-}
-
-type PodMetricsClient interface {
-	FetchMetrics(ctx context.Context, existing *datastore.PodMetrics, port int32) (*datastore.PodMetrics, error)
-}
-
-func (p *Provider) Init(ctx context.Context, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error {
-	// periodically refresh metrics
-	logger := log.FromContext(ctx)
-	go func() {
-		for {
-			select {
-			case <-ctx.Done():
-				logger.V(logutil.DEFAULT).Info("Shutting down metrics prober")
-				return
-			default:
-				time.Sleep(refreshMetricsInterval)
-				if err := p.refreshMetricsOnce(logger); err != nil {
-					logger.V(logutil.DEFAULT).Error(err, "Failed to refresh metrics")
-				}
-			}
-		}
-	}()
-
-	// Periodically flush prometheus metrics for inference pool
-	go func() {
-		for {
-			select {
-			case <-ctx.Done():
-				logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread")
-				return
-			default:
-				time.Sleep(refreshPrometheusMetricsInterval)
-				p.flushPrometheusMetricsOnce(logger)
-			}
-		}
-	}()
-
-	// Periodically print out the pods and metrics for DEBUGGING.
-	if logger := logger.V(logutil.DEBUG); logger.Enabled() {
-		go func() {
-			for {
-				select {
-				case <-ctx.Done():
-					logger.V(logutil.DEFAULT).Info("Shutting down metrics logger thread")
-					return
-				default:
-					time.Sleep(5 * time.Second)
-					logger.Info("Current Pods and metrics gathered", "metrics", p.datastore.PodGetAll())
-				}
-			}
-		}()
-	}
-
-	return nil
-}
-
-func (p *Provider) refreshMetricsOnce(logger logr.Logger) error {
-	loggerTrace := logger.V(logutil.TRACE)
-	pool, _ := p.datastore.PoolGet()
-	if pool == nil {
-		loggerTrace.Info("No inference pool or not initialized")
-		return nil
-	}
-	ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout)
-	defer cancel()
-	start := time.Now()
-	defer func() {
-		d := time.Since(start)
-		// TODO: add a metric instead of logging
-		loggerTrace.Info("Metrics refreshed", "duration", d)
-	}()
-
-	var wg sync.WaitGroup
-	errCh := make(chan error)
-	processOnePod := func(key, value any) bool {
-		loggerTrace.Info("Pod and metric being processed", "pod", key, "metric", value)
-		existing := value.(*datastore.PodMetrics)
-		wg.Add(1)
-		go func() {
-			defer wg.Done()
-			updated, err := p.pmc.FetchMetrics(ctx, existing, pool.Spec.TargetPortNumber)
-			if err != nil {
-				errCh <- fmt.Errorf("failed to parse metrics from %s: %v", existing.NamespacedName, err)
-				return
-			}
-			p.datastore.PodUpdateMetricsIfExist(updated.NamespacedName, &updated.Metrics)
-			loggerTrace.Info("Updated metrics for pod", "pod", updated.NamespacedName, "metrics", updated.Metrics)
-		}()
-		return true
-	}
-	p.datastore.PodRange(processOnePod)
-
-	// Wait for metric collection for all pods to complete and close the error channel in a
-	// goroutine so this is unblocking, allowing the code to proceed to the error collection code
-	// below.
-	// Note we couldn't use a buffered error channel with a size because the size of the podMetrics
-	// sync.Map is unknown beforehand.
-	go func() {
-		wg.Wait()
-		close(errCh)
-	}()
-
-	var errs error
-	for err := range errCh {
-		errs = multierr.Append(errs, err)
-	}
-	return errs
-}
-
-func (p *Provider) flushPrometheusMetricsOnce(logger logr.Logger) {
-	pool, _ := p.datastore.PoolGet()
-	if pool == nil {
-		// No inference pool or not initialize.
-		return
-	}
-
-	var kvCacheTotal float64
-	var queueTotal int
-
-	podMetrics := p.datastore.PodGetAll()
-	logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics))
-	if len(podMetrics) == 0 {
-		return
-	}
-
-	for _, pod := range podMetrics {
-		kvCacheTotal += pod.KVCacheUsagePercent
-		queueTotal += pod.WaitingQueueSize
-	}
-
-	podTotalCount := len(podMetrics)
-	metrics.RecordInferencePoolAvgKVCache(pool.Name, kvCacheTotal/float64(podTotalCount))
-	metrics.RecordInferencePoolAvgQueueSize(pool.Name, float64(queueTotal/podTotalCount))
-}

From 97fd0defbd2d2f2a92eaa1ca9b406e47a311ac23 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 14 Mar 2025 02:27:01 +0000
Subject: [PATCH 13/19] Merge getLatestMetric and getLabeledMetric.

---
 pkg/epp/backend/metrics/metrics.go | 37 ++++++++----------------------
 1 file changed, 10 insertions(+), 27 deletions(-)

diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go
index cc988758f..67baf853e 100644
--- a/pkg/epp/backend/metrics/metrics.go
+++ b/pkg/epp/backend/metrics/metrics.go
@@ -219,39 +219,22 @@ func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[
 	if len(mf.GetMetric()) == 0 {
 		return nil, fmt.Errorf("no metrics available for %q", spec.MetricName)
 	}
-	// if there is a specified label, return only that metric in the family
-	if spec.Labels != nil {
-		return getLabeledMetric(logger, mf, &spec)
-	}
-	return getLatestMetric(logger, mf)
-}
 
-// getLatestMetric gets the latest metric of a family (for metrics without labels).
-func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, error) {
-	var latestTs int64 = -1
-	var latest *dto.Metric
-	for _, m := range mf.GetMetric() {
-		if m.GetTimestampMs() >= latestTs {
-			latestTs = m.GetTimestampMs()
-			latest = m
-		}
-	}
-
-	if latest == nil {
-		return nil, fmt.Errorf("no metrics found for %q", mf.GetName())
-	}
-
-	logger.V(logutil.TRACE).Info("Latest metric value selected", "value", latest, "metric", mf.GetName())
-	return latest, nil
+	return getLatestMetric(logger, mf, &spec)
 }
 
 // getLabeledMetric gets the latest metric with matching labels.
-func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec) (*dto.Metric, error) {
+func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec) (*dto.Metric, error) {
 	var latestMetric *dto.Metric
 	var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater
 
+	var labels map[string]string = nil
+	if spec.Labels != nil {
+		labels = spec.Labels
+	}
+
 	for _, m := range mf.GetMetric() {
-		if spec == nil || labelsMatch(m.GetLabel(), spec.Labels) {
+		if labels == nil || labelsMatch(m.GetLabel(), spec.Labels) {
 			if m.GetTimestampMs() > latestTimestamp {
 				latestTimestamp = m.GetTimestampMs()
 				latestMetric = m
@@ -260,11 +243,11 @@ func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec
 	}
 
 	if latestMetric != nil {
-		logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "metric", spec.MetricName)
+		logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "name", spec.MetricName)
 		return latestMetric, nil
 	}
 
-	return nil, fmt.Errorf("no matching labeled metric found for %q with labels %v", spec.MetricName, spec.Labels)
+	return nil, fmt.Errorf("no matching metric found for %q with labels %+v", spec.MetricName, labels)
 }
 
 // labelsMatch checks if a metric's labels contain all the labels in the spec.

From 27b34e9410ec8a7e535376a24ce786fc0d5f7d54 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 14 Mar 2025 02:34:48 +0000
Subject: [PATCH 14/19] Remove unused datastore types.

---
 pkg/epp/datastore/types.go | 67 --------------------------------------
 1 file changed, 67 deletions(-)
 delete mode 100644 pkg/epp/datastore/types.go

diff --git a/pkg/epp/datastore/types.go b/pkg/epp/datastore/types.go
deleted file mode 100644
index b87b1c0ae..000000000
--- a/pkg/epp/datastore/types.go
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
-Copyright 2025 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-// Package datastore is a library to interact with backend model servers such as probing metrics.
-package datastore
-
-import (
-	"fmt"
-
-	"k8s.io/apimachinery/pkg/types"
-)
-
-type Pod struct {
-	NamespacedName types.NamespacedName
-	Address        string
-}
-
-type Metrics struct {
-	// ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU.
-	ActiveModels map[string]int
-	// MaxActiveModels is the maximum number of models that can be loaded to GPU.
-	MaxActiveModels     int
-	WaitingQueueSize    int
-	KVCacheUsagePercent float64
-}
-
-type PodMetrics struct {
-	Pod
-	Metrics
-}
-
-func (pm *PodMetrics) String() string {
-	return fmt.Sprintf("Pod: %+v; Address: %+v; Metrics: %+v", pm.NamespacedName, pm.Address, pm.Metrics)
-}
-
-func (pm *PodMetrics) Clone() *PodMetrics {
-	cm := make(map[string]int, len(pm.ActiveModels))
-	for k, v := range pm.ActiveModels {
-		cm[k] = v
-	}
-	clone := &PodMetrics{
-		Pod: Pod{
-			NamespacedName: pm.NamespacedName,
-			Address:        pm.Address,
-		},
-		Metrics: Metrics{
-			ActiveModels:        cm,
-			MaxActiveModels:     pm.MaxActiveModels,
-			WaitingQueueSize:    pm.WaitingQueueSize,
-			KVCacheUsagePercent: pm.KVCacheUsagePercent,
-		},
-	}
-	return clone
-}

From 4b84744e75dc6a18a80ffe5d7a46e333854ed903 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 14 Mar 2025 02:50:41 +0000
Subject: [PATCH 15/19] Fix lint.

---
 pkg/epp/backend/metrics/metrics.go      |  2 +-
 pkg/epp/backend/metrics/metrics_spec.go |  2 +-
 pkg/epp/backend/metrics/metrics_test.go | 49 ++++++++++++-------------
 3 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go
index 67baf853e..b3cfcea77 100644
--- a/pkg/epp/backend/metrics/metrics.go
+++ b/pkg/epp/backend/metrics/metrics.go
@@ -184,7 +184,7 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam
 				hasRequiredLabels = true
 			}
 		}
-		//Skip if it does not have the lora labels
+		// Skip if it does not have the lora labels
 		if !hasRequiredLabels {
 			continue
 		}
diff --git a/pkg/epp/backend/metrics/metrics_spec.go b/pkg/epp/backend/metrics/metrics_spec.go
index bd8f39ccf..ce0c075dd 100644
--- a/pkg/epp/backend/metrics/metrics_spec.go
+++ b/pkg/epp/backend/metrics/metrics_spec.go
@@ -79,7 +79,7 @@ func stringToMetricSpec(specStr string) (*MetricSpec, error) {
 
 	}
 
-	if metricName == "" { //Metric name cannot be empty
+	if metricName == "" { // Metric name cannot be empty
 		return nil, fmt.Errorf("empty metric name in spec: %q", specStr)
 	}
 
diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go
index 41a3eb9ae..455758d99 100644
--- a/pkg/epp/backend/metrics/metrics_test.go
+++ b/pkg/epp/backend/metrics/metrics_test.go
@@ -19,7 +19,6 @@ package metrics
 import (
 	"context"
 	"errors"
-	"fmt"
 	"reflect"
 	"strconv"
 	"strings"
@@ -36,7 +35,7 @@ import (
 
 // --- Test Helpers ---
 
-func makeMetric(metricName string, labels map[string]string, value float64, timestampMs int64) *dto.Metric {
+func makeMetric(labels map[string]string, value float64, timestampMs int64) *dto.Metric {
 	labelPairs := []*dto.LabelPair{}
 	for k, v := range labels {
 		labelPairs = append(labelPairs, &dto.LabelPair{Name: proto.String(k), Value: proto.String(v)})
@@ -63,16 +62,16 @@ func TestGetMetric(t *testing.T) {
 
 	metricFamilies := map[string]*dto.MetricFamily{
 		"metric1": makeMetricFamily("metric1",
-			makeMetric("metric1", map[string]string{"label1": "value1"}, 1.0, 1000),
-			makeMetric("metric1", map[string]string{"label1": "value2"}, 2.0, 2000),
+			makeMetric(map[string]string{"label1": "value1"}, 1.0, 1000),
+			makeMetric(map[string]string{"label1": "value2"}, 2.0, 2000),
 		),
 		"metric2": makeMetricFamily("metric2",
-			makeMetric("metric2", map[string]string{"labelA": "A1", "labelB": "B1"}, 3.0, 1500),
-			makeMetric("metric2", map[string]string{"labelA": "A2", "labelB": "B2"}, 4.0, 2500),
+			makeMetric(map[string]string{"labelA": "A1", "labelB": "B1"}, 3.0, 1500),
+			makeMetric(map[string]string{"labelA": "A2", "labelB": "B2"}, 4.0, 2500),
 		),
 		"metric3": makeMetricFamily("metric3",
-			makeMetric("metric3", map[string]string{}, 5.0, 3000),
-			makeMetric("metric3", map[string]string{}, 6.0, 1000),
+			makeMetric(map[string]string{}, 5.0, 3000),
+			makeMetric(map[string]string{}, 6.0, 1000),
 		),
 	}
 
@@ -256,12 +255,12 @@ func TestGetLatestLoraMetric(t *testing.T) {
 			name: "no lora metrics",
 			metricFamilies: map[string]*dto.MetricFamily{
 				"some_other_metric": makeMetricFamily("some_other_metric",
-					makeMetric("some_other_metric", nil, 1.0, 1000),
+					makeMetric(nil, 1.0, 1000),
 				),
 			},
 			expectedAdapters: nil,
 			expectedMax:      0,
-			expectedErr:      fmt.Errorf("metric family \"vllm:lora_requests_info\" not found"), // Expect an error because the family is missing
+			expectedErr:      errors.New("metric family \"vllm:lora_requests_info\" not found"), // Expect an error because the family is missing
 			mapping: &MetricMapping{
 				LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"},
 			},
@@ -270,8 +269,8 @@ func TestGetLatestLoraMetric(t *testing.T) {
 			name: "basic lora metrics",
 			metricFamilies: map[string]*dto.MetricFamily{
 				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 3000.0, 1000),       // Newer
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 1000.0, 1000), // Older
+					makeMetric(map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 3000.0, 1000),       // Newer
+					makeMetric(map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 1000.0, 1000), // Older
 
 				),
 			},
@@ -286,7 +285,7 @@ func TestGetLatestLoraMetric(t *testing.T) {
 			name: "no matching lora metrics",
 			metricFamilies: map[string]*dto.MetricFamily{
 				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"other_label": "value"}, 5.0, 3000),
+					makeMetric(map[string]string{"other_label": "value"}, 5.0, 3000),
 				),
 			},
 			expectedAdapters: nil,
@@ -300,8 +299,8 @@ func TestGetLatestLoraMetric(t *testing.T) {
 			name: "no lora metrics if not in MetricMapping",
 			metricFamilies: map[string]*dto.MetricFamily{
 				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 5.0, 3000),
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 6.0, 1000),
+					makeMetric(map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 5.0, 3000),
+					makeMetric(map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 6.0, 1000),
 				),
 			},
 			expectedAdapters: nil,
@@ -388,15 +387,15 @@ func TestPromToPodMetrics(t *testing.T) {
 			name: "vllm metrics",
 			metricFamilies: map[string]*dto.MetricFamily{
 				"vllm_waiting": makeMetricFamily("vllm_waiting",
-					makeMetric("vllm_waiting", nil, 5.0, 1000),
-					makeMetric("vllm_waiting", nil, 7.0, 2000), // Newer
+					makeMetric(nil, 5.0, 1000),
+					makeMetric(nil, 7.0, 2000), // Newer
 				),
 				"vllm_usage": makeMetricFamily("vllm_usage",
-					makeMetric("vllm_usage", nil, 0.8, 2000),
-					makeMetric("vllm_usage", nil, 0.7, 500),
+					makeMetric(nil, 0.8, 2000),
+					makeMetric(nil, 0.7, 500),
 				),
 				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000),
+					makeMetric(map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000),
 				),
 			},
 			mapping: &MetricMapping{
@@ -422,16 +421,16 @@ func TestPromToPodMetrics(t *testing.T) {
 			},
 			existingMetrics: &Metrics{ActiveModels: map[string]int{}},
 			expectedMetrics: &Metrics{ActiveModels: map[string]int{}},
-			expectedErr:     multierr.Combine(fmt.Errorf("metric family \"vllm_waiting\" not found"), fmt.Errorf("metric family \"vllm_usage\" not found"), fmt.Errorf("metric family \"vllm:lora_requests_info\" not found")),
+			expectedErr:     multierr.Combine(errors.New("metric family \"vllm_waiting\" not found"), errors.New("metric family \"vllm_usage\" not found"), errors.New("metric family \"vllm:lora_requests_info\" not found")),
 		},
 		{
 			name: "partial metrics available + LoRA",
 			metricFamilies: map[string]*dto.MetricFamily{
 				"vllm_usage": makeMetricFamily("vllm_usage",
-					makeMetric("vllm_usage", nil, 0.8, 2000), // Only usage is present
+					makeMetric(nil, 0.8, 2000), // Only usage is present
 				),
 				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000),
+					makeMetric(map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000),
 				),
 			},
 			mapping: &MetricMapping{
@@ -446,13 +445,13 @@ func TestPromToPodMetrics(t *testing.T) {
 				ActiveModels:        map[string]int{"lora1": 0, "lora2": 0, "lora3": 0},
 				MaxActiveModels:     3,
 			},
-			expectedErr: fmt.Errorf("metric family \"vllm_waiting\" not found"),
+			expectedErr: errors.New("metric family \"vllm_waiting\" not found"),
 		},
 		{
 			name: "invalid max lora",
 			metricFamilies: map[string]*dto.MetricFamily{
 				"vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info",
-					makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "invalid"}, 3000.0, 1000),
+					makeMetric(map[string]string{"running_lora_adapters": "lora1", "max_lora": "invalid"}, 3000.0, 1000),
 				),
 			},
 			mapping: &MetricMapping{

From 66e0376cb3046d487f190fc702cd28646b697bc1 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 14 Mar 2025 17:06:43 +0000
Subject: [PATCH 16/19] Remove log and fix nits.

---
 pkg/epp/backend/metrics/metrics.go      | 26 ++++++++-----------------
 pkg/epp/backend/metrics/metrics_test.go |  2 +-
 2 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go
index b3cfcea77..714a44f11 100644
--- a/pkg/epp/backend/metrics/metrics.go
+++ b/pkg/epp/backend/metrics/metrics.go
@@ -22,7 +22,6 @@ import (
 	"net/http"
 	"strconv"
 	"strings"
-	"time"
 
 	"github.com/go-logr/logr"
 	dto "github.com/prometheus/client_model/go"
@@ -113,7 +112,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
 
 	// Handle LoRA metrics (only if all LoRA MetricSpecs are present)
 	if p.MetricMapping.LoraRequestInfo != nil {
-		loraMetrics, _, err := p.getLatestLoraMetric(logger, metricFamilies)
+		loraMetrics, err := p.getLatestLoraMetric(logger, metricFamilies)
 		errs = multierr.Append(errs, err)
 
 		if loraMetrics != nil {
@@ -154,15 +153,15 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
 // reason its specially fetched is because each label key value pair permutation generates new series
 // and only most recent is useful. The value of each series is the creation timestamp so we can
 // retrieve the latest by sorting the value.
-func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
+func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, error) {
 	if p.MetricMapping.LoraRequestInfo == nil {
-		return nil, time.Time{}, nil // No LoRA metrics configured
+		return nil, nil // No LoRA metrics configured
 	}
 
 	loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName]
 	if !ok {
 		logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName)
-		return nil, time.Time{}, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName)
+		return nil, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName)
 	}
 
 	var latest *dto.Metric
@@ -200,19 +199,16 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam
 		}
 	}
 	if latest == nil {
-		logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", p.MetricMapping.LoraRequestInfo.MetricName)
-		return nil, time.Time{}, nil
+		return nil, nil
 	}
 
-	// Convert the gauge value (creation timestamp) to time.Time.
-	return latest, time.Unix(0, int64(latestTs*1e9)), nil // Convert nanoseconds to time.Time
+	return latest, nil
 }
 
 // getMetric retrieves a specific metric based on MetricSpec.
 func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
 	mf, ok := metricFamilies[spec.MetricName]
 	if !ok {
-		logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", spec.MetricName)
 		return nil, fmt.Errorf("metric family %q not found", spec.MetricName)
 	}
 
@@ -228,13 +224,8 @@ func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec)
 	var latestMetric *dto.Metric
 	var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater
 
-	var labels map[string]string = nil
-	if spec.Labels != nil {
-		labels = spec.Labels
-	}
-
 	for _, m := range mf.GetMetric() {
-		if labels == nil || labelsMatch(m.GetLabel(), spec.Labels) {
+		if spec.Labels == nil || labelsMatch(m.GetLabel(), spec.Labels) {
 			if m.GetTimestampMs() > latestTimestamp {
 				latestTimestamp = m.GetTimestampMs()
 				latestMetric = m
@@ -243,11 +234,10 @@ func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec)
 	}
 
 	if latestMetric != nil {
-		logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "name", spec.MetricName)
 		return latestMetric, nil
 	}
 
-	return nil, fmt.Errorf("no matching metric found for %q with labels %+v", spec.MetricName, labels)
+	return nil, fmt.Errorf("no matching metric found for %q with labels %+v", spec.MetricName, spec.Labels)
 }
 
 // labelsMatch checks if a metric's labels contain all the labels in the spec.
diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go
index 455758d99..d2e637fc7 100644
--- a/pkg/epp/backend/metrics/metrics_test.go
+++ b/pkg/epp/backend/metrics/metrics_test.go
@@ -314,7 +314,7 @@ func TestGetLatestLoraMetric(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			p := &PodMetricsClientImpl{MetricMapping: tc.mapping}
-			loraMetric, _, err := p.getLatestLoraMetric(logger, tc.metricFamilies)
+			loraMetric, err := p.getLatestLoraMetric(logger, tc.metricFamilies)
 
 			if tc.expectedErr != nil {
 				if err == nil || err.Error() != tc.expectedErr.Error() {

From 9f4859b2f4cd49d720b8b6f1a60d71ab5b5ee42e Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 14 Mar 2025 17:45:35 +0000
Subject: [PATCH 17/19] Move ext_proc and inferencemodel yaml files back, fix
 nits and remove all logging from metrics.go.

---
 cmd/epp/main.go                               |  8 ++--
 config/manifests/{vllm => }/ext_proc.yaml     |  0
 .../manifests/{vllm => }/inferencemodel.yaml  |  0
 pkg/epp/backend/metrics/metrics.go            | 45 +++++++------------
 pkg/epp/backend/metrics/metrics_test.go       |  6 +--
 5 files changed, 22 insertions(+), 37 deletions(-)
 rename config/manifests/{vllm => }/ext_proc.yaml (100%)
 rename config/manifests/{vllm => }/inferencemodel.yaml (100%)

diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index 634cda4a2..fa63f0bce 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -92,14 +92,14 @@ var (
 			"are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+
 			"then a self-signed certificate is used.")
 	// metric flags
-	totalQueuedRequestMetric = flag.String("totalQueuedRequestMetric",
+	totalQueuedRequestsMetric = flag.String("totalQueuedRequestsMetric",
 		"vllm:num_requests_waiting",
 		"Prometheus metric for the number of queued requests.")
 	kvCacheUsagePercentageMetric = flag.String("kvCacheUsagePercentageMetric",
 		"vllm:gpu_cache_usage_perc",
 		"Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).")
 	// LoRA metrics
-	loraRequestInfoMetric = flag.String("loraRequestInfoMetric",
+	loraInfoMetric = flag.String("loraInfoMetric",
 		"vllm:lora_requests_info",
 		"Prometheus metric for the LoRA info metrics (must be in vLLM label format).")
 
@@ -155,9 +155,9 @@ func run() error {
 
 	// Set up mapper for metric scraping.
 	mapping, err := backendmetrics.NewMetricMapping(
-		*totalQueuedRequestMetric,
+		*totalQueuedRequestsMetric,
 		*kvCacheUsagePercentageMetric,
-		*loraRequestInfoMetric,
+		*loraInfoMetric,
 	)
 	if err != nil {
 		setupLog.Error(err, "Failed to create metric mapping from flags.")
diff --git a/config/manifests/vllm/ext_proc.yaml b/config/manifests/ext_proc.yaml
similarity index 100%
rename from config/manifests/vllm/ext_proc.yaml
rename to config/manifests/ext_proc.yaml
diff --git a/config/manifests/vllm/inferencemodel.yaml b/config/manifests/inferencemodel.yaml
similarity index 100%
rename from config/manifests/vllm/inferencemodel.yaml
rename to config/manifests/inferencemodel.yaml
diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go
index 714a44f11..7de3d9031 100644
--- a/pkg/epp/backend/metrics/metrics.go
+++ b/pkg/epp/backend/metrics/metrics.go
@@ -28,14 +28,13 @@ import (
 	"github.com/prometheus/common/expfmt"
 	"go.uber.org/multierr"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
 const (
 	// LoRA metrics based on protocol
-	LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
-	LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
-	LoraRequestInfoMaxAdaptersMetricName     = "max_lora"
+	LoraInfoRunningAdaptersMetricName = "running_lora_adapters"
+	LoraInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
+	LoraInfoMaxAdaptersMetricName     = "max_lora"
 )
 
 type PodMetricsClientImpl struct {
@@ -50,7 +49,6 @@ func (p *PodMetricsClientImpl) FetchMetrics(
 	port int32,
 ) (*Metrics, error) {
 	logger := log.FromContext(ctx)
-	loggerDefault := logger.V(logutil.DEFAULT)
 
 	// Currently the metrics endpoint is hard-coded, which works with vLLM.
 	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config.
@@ -58,12 +56,10 @@ func (p *PodMetricsClientImpl) FetchMetrics(
 
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
 	if err != nil {
-		loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
 		return nil, fmt.Errorf("failed to create request: %v", err)
 	}
 	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
-		loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName)
 		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err)
 	}
 	defer func() {
@@ -71,7 +67,6 @@ func (p *PodMetricsClientImpl) FetchMetrics(
 	}()
 
 	if resp.StatusCode != http.StatusOK {
-		loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode)
 		return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode)
 	}
 
@@ -93,7 +88,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
 	updated := existing.Clone()
 
 	if p.MetricMapping.TotalQueuedRequests != nil {
-		queued, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.TotalQueuedRequests)
+		queued, err := p.getMetric(metricFamilies, *p.MetricMapping.TotalQueuedRequests)
 		if err == nil {
 			updated.WaitingQueueSize = int(queued.GetGauge().GetValue())
 		} else {
@@ -102,7 +97,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
 	}
 
 	if p.MetricMapping.KVCacheUtilization != nil {
-		usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUtilization)
+		usage, err := p.getMetric(metricFamilies, *p.MetricMapping.KVCacheUtilization)
 		if err == nil {
 			updated.KVCacheUsagePercent = usage.GetGauge().GetValue()
 		} else {
@@ -112,13 +107,13 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
 
 	// Handle LoRA metrics (only if all LoRA MetricSpecs are present)
 	if p.MetricMapping.LoraRequestInfo != nil {
-		loraMetrics, err := p.getLatestLoraMetric(logger, metricFamilies)
+		loraMetrics, err := p.getLatestLoraMetric(metricFamilies)
 		errs = multierr.Append(errs, err)
 
 		if loraMetrics != nil {
 			updated.ActiveModels = make(map[string]int)
 			for _, label := range loraMetrics.GetLabel() {
-				if label.GetName() == LoraRequestInfoRunningAdaptersMetricName {
+				if label.GetName() == LoraInfoRunningAdaptersMetricName {
 					if label.GetValue() != "" {
 						adapterList := strings.Split(label.GetValue(), ",")
 						for _, adapter := range adapterList {
@@ -126,7 +121,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
 						}
 					}
 				}
-				if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
+				if label.GetName() == LoraInfoWaitingAdaptersMetricName {
 					if label.GetValue() != "" {
 						adapterList := strings.Split(label.GetValue(), ",")
 						for _, adapter := range adapterList {
@@ -134,7 +129,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
 						}
 					}
 				}
-				if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
+				if label.GetName() == LoraInfoMaxAdaptersMetricName {
 					if label.GetValue() != "" {
 						updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
 						if err != nil {
@@ -153,14 +148,13 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
 // reason its specially fetched is because each label key value pair permutation generates new series
 // and only most recent is useful. The value of each series is the creation timestamp so we can
 // retrieve the latest by sorting the value.
-func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, error) {
+func (p *PodMetricsClientImpl) getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, error) {
 	if p.MetricMapping.LoraRequestInfo == nil {
 		return nil, nil // No LoRA metrics configured
 	}
 
 	loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName]
 	if !ok {
-		logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName)
 		return nil, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName)
 	}
 
@@ -171,22 +165,15 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam
 	for _, m := range loraRequests.GetMetric() {
 		running := ""
 		waiting := ""
-		// Check if the metric has the expected LoRA labels.  This is important!
-		hasRequiredLabels := false
+		// Check if the metric has the expected LoRA labels.
 		for _, lp := range m.GetLabel() {
 			switch lp.GetName() {
-			case LoraRequestInfoRunningAdaptersMetricName:
+			case LoraInfoRunningAdaptersMetricName:
 				running = lp.GetValue()
-				hasRequiredLabels = true
-			case LoraRequestInfoWaitingAdaptersMetricName:
+			case LoraInfoWaitingAdaptersMetricName:
 				waiting = lp.GetValue()
-				hasRequiredLabels = true
 			}
 		}
-		// Skip if it does not have the lora labels
-		if !hasRequiredLabels {
-			continue
-		}
 		// Ignore metrics with both labels empty.
 		if running == "" && waiting == "" {
 			continue
@@ -206,7 +193,7 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam
 }
 
 // getMetric retrieves a specific metric based on MetricSpec.
-func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
+func (p *PodMetricsClientImpl) getMetric(metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) {
 	mf, ok := metricFamilies[spec.MetricName]
 	if !ok {
 		return nil, fmt.Errorf("metric family %q not found", spec.MetricName)
@@ -216,11 +203,11 @@ func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[
 		return nil, fmt.Errorf("no metrics available for %q", spec.MetricName)
 	}
 
-	return getLatestMetric(logger, mf, &spec)
+	return getLatestMetric(mf, &spec)
 }
 
 // getLabeledMetric gets the latest metric with matching labels.
-func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec) (*dto.Metric, error) {
+func getLatestMetric(mf *dto.MetricFamily, spec *MetricSpec) (*dto.Metric, error) {
 	var latestMetric *dto.Metric
 	var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater
 
diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go
index d2e637fc7..0a1e2cd79 100644
--- a/pkg/epp/backend/metrics/metrics_test.go
+++ b/pkg/epp/backend/metrics/metrics_test.go
@@ -58,7 +58,6 @@ func makeMetricFamily(name string, metrics ...*dto.Metric) *dto.MetricFamily {
 // --- Tests ---
 
 func TestGetMetric(t *testing.T) {
-	logger := logutil.NewTestLogger()
 
 	metricFamilies := map[string]*dto.MetricFamily{
 		"metric1": makeMetricFamily("metric1",
@@ -168,7 +167,7 @@ func TestGetMetric(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 
-			gotMetric, err := p.getMetric(logger, metricFamilies, tt.spec)
+			gotMetric, err := p.getMetric(metricFamilies, tt.spec)
 
 			if tt.wantError {
 				if err == nil {
@@ -241,7 +240,6 @@ func TestLabelsMatch(t *testing.T) {
 }
 
 func TestGetLatestLoraMetric(t *testing.T) {
-	logger := logutil.NewTestLogger()
 
 	testCases := []struct {
 		name             string
@@ -314,7 +312,7 @@ func TestGetLatestLoraMetric(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			p := &PodMetricsClientImpl{MetricMapping: tc.mapping}
-			loraMetric, err := p.getLatestLoraMetric(logger, tc.metricFamilies)
+			loraMetric, err := p.getLatestLoraMetric(tc.metricFamilies)
 
 			if tc.expectedErr != nil {
 				if err == nil || err.Error() != tc.expectedErr.Error() {

From c082e869436b647d0e35af15ed97a83767624f7a Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 14 Mar 2025 18:00:51 +0000
Subject: [PATCH 18/19] Remove the rest of logging from metrics.go and tests.

---
 pkg/epp/backend/metrics/metrics.go      | 6 +-----
 pkg/epp/backend/metrics/metrics_test.go | 3 +--
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go
index 7de3d9031..be732e78e 100644
--- a/pkg/epp/backend/metrics/metrics.go
+++ b/pkg/epp/backend/metrics/metrics.go
@@ -23,11 +23,9 @@ import (
 	"strconv"
 	"strings"
 
-	"github.com/go-logr/logr"
 	dto "github.com/prometheus/client_model/go"
 	"github.com/prometheus/common/expfmt"
 	"go.uber.org/multierr"
-	"sigs.k8s.io/controller-runtime/pkg/log"
 )
 
 const (
@@ -48,7 +46,6 @@ func (p *PodMetricsClientImpl) FetchMetrics(
 	existing *Metrics,
 	port int32,
 ) (*Metrics, error) {
-	logger := log.FromContext(ctx)
 
 	// Currently the metrics endpoint is hard-coded, which works with vLLM.
 	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config.
@@ -75,12 +72,11 @@ func (p *PodMetricsClientImpl) FetchMetrics(
 	if err != nil {
 		return nil, err
 	}
-	return p.promToPodMetrics(logger, metricFamilies, existing)
+	return p.promToPodMetrics(metricFamilies, existing)
 }
 
 // promToPodMetrics updates internal pod metrics with scraped Prometheus metrics.
 func (p *PodMetricsClientImpl) promToPodMetrics(
-	logger logr.Logger,
 	metricFamilies map[string]*dto.MetricFamily,
 	existing *Metrics,
 ) (*Metrics, error) {
diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go
index 0a1e2cd79..d0396bf74 100644
--- a/pkg/epp/backend/metrics/metrics_test.go
+++ b/pkg/epp/backend/metrics/metrics_test.go
@@ -372,7 +372,6 @@ func TestGetLatestLoraMetric(t *testing.T) {
 }
 
 func TestPromToPodMetrics(t *testing.T) {
-	logger := logutil.NewTestLogger()
 	tests := []struct {
 		name            string
 		metricFamilies  map[string]*dto.MetricFamily
@@ -468,7 +467,7 @@ func TestPromToPodMetrics(t *testing.T) {
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
 			p := &PodMetricsClientImpl{MetricMapping: tc.mapping}
-			updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics)
+			updated, err := p.promToPodMetrics(tc.metricFamilies, tc.existingMetrics)
 			if tc.expectedErr != nil {
 				assert.Error(t, err)
 				assert.EqualError(t, err, tc.expectedErr.Error())

From 81ee1e6b66ff371daa7f080a3d9d2aef5785784a Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev <benjaminbraun@google.com>
Date: Fri, 14 Mar 2025 18:27:55 +0000
Subject: [PATCH 19/19] Add trace log to podmetrics and small warning fix to
 metrics_spec_test.

---
 pkg/epp/backend/metrics/metrics_spec_test.go | 2 +-
 pkg/epp/backend/metrics/pod_metrics.go       | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkg/epp/backend/metrics/metrics_spec_test.go b/pkg/epp/backend/metrics/metrics_spec_test.go
index 8de6dac29..828042065 100644
--- a/pkg/epp/backend/metrics/metrics_spec_test.go
+++ b/pkg/epp/backend/metrics/metrics_spec_test.go
@@ -159,7 +159,7 @@ func TestStringToMetricSpec(t *testing.T) {
 				}
 			} else {
 				if got == nil {
-					t.Errorf("stringToMetricSpec() = got nil but wanted %v", tt.want)
+					t.Fatalf("stringToMetricSpec() = got nil but wanted %v", tt.want)
 				}
 				if !reflect.DeepEqual(got.MetricName, tt.want.MetricName) {
 					t.Errorf("stringToMetricSpec() got MetricName = %v, want %v", got.MetricName, tt.want.MetricName)
diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go
index b954a98ce..01db14bec 100644
--- a/pkg/epp/backend/metrics/pod_metrics.go
+++ b/pkg/epp/backend/metrics/pod_metrics.go
@@ -115,6 +115,7 @@ func (pm *podMetrics) refreshMetrics() error {
 	defer cancel()
 	updated, err := pm.pmc.FetchMetrics(ctx, pm.GetPod(), pm.GetMetrics(), pool.Spec.TargetPortNumber)
 	if err != nil {
+		pm.logger.V(logutil.TRACE).Info("Failed to refreshed metrics:", "err", err)
 		// As refresher is running in the background, it's possible that the pod is deleted but
 		// the refresh goroutine doesn't read the done channel yet. In this case, we just return nil.
 		// The refresher will be stopped after this interval.