From 214905d21726fd518ae69e42407662a8d7a9f3e2 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Tue, 4 Mar 2025 23:34:34 +0000 Subject: [PATCH 01/19] start adding metrics changes for trion support --- cmd/epp/main.go | 5 + config/manifests/triton/deployment.yaml | 100 ++++++++ config/manifests/triton/ext_proc.yaml | 115 +++++++++ config/manifests/triton/inferencemodel.yaml | 9 + config/manifests/triton/triton-set-up.yaml | 111 ++++++++ pkg/epp/backend/provider.go | 183 +++++++++++++ pkg/epp/backend/triton/metrics.go | 270 ++++++++++++++++++++ pkg/epp/backend/triton/metrics_test.go | 241 +++++++++++++++++ 8 files changed, 1034 insertions(+) create mode 100644 config/manifests/triton/deployment.yaml create mode 100644 config/manifests/triton/ext_proc.yaml create mode 100644 config/manifests/triton/inferencemodel.yaml create mode 100644 config/manifests/triton/triton-set-up.yaml create mode 100644 pkg/epp/backend/provider.go create mode 100644 pkg/epp/backend/triton/metrics.go create mode 100644 pkg/epp/backend/triton/metrics_test.go diff --git a/cmd/epp/main.go b/cmd/epp/main.go index e1cd50154..4eaa90c8f 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -37,7 +37,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/triton" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" @@ -146,6 +148,9 @@ func run() error { pmf := backendmetrics.NewPodMetricsFactory(&vllm.PodMetricsClientImpl{}, *refreshMetricsInterval) // Setup runner. 
datastore := datastore.NewDatastore(ctx, pmf) + // switch case across different model server metrics (triton, vllm) + provider := backend.NewProvider(&triton.PodMetricsClientImpl{}, datastore) + // serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace, diff --git a/config/manifests/triton/deployment.yaml b/config/manifests/triton/deployment.yaml new file mode 100644 index 000000000..61626293b --- /dev/null +++ b/config/manifests/triton/deployment.yaml @@ -0,0 +1,100 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llama-triton-deployment +spec: + replicas: 1 # Start with 1 replica. Adjust as needed. + selector: + matchLabels: + app: llama-triton # This MUST match the labels in the template + template: + metadata: + labels: + app: llama-triton + spec: + containers: + - name: triton-server + image: nvcr.io/nvidia/tritonserver:25.01-trtllm-python-py3 # Use base Triton image + imagePullPolicy: IfNotPresent + command: ["/bin/bash", "-c"] + args: + - | + set -e + apt-get update && apt-get install -y python3.12-venv + + # Create and activate a virtual environment + python3 -m venv /opt/venv + source /opt/venv/bin/activate + pip install SentencePiece + pip install packaging + pip install numpy + pip install torch + pip install requests + pip install transformers + pip install pillow + + # Use launch_triton_server.py + # python3 /models/tensorrtllm_backend/scripts/launch_triton_server.py --world_size 1 --model_repo /models/tensorrtllm_backend/llama_ifb + # tail -f /dev/null + + # Launch OpenAI completetions endpoint + # Install python bindings for tritonserver and tritonfrontend + pip install /opt/tritonserver/python/triton*.whl + # Install application requirements + git clone https://github.com/triton-inference-server/server.git + cd server/python/openai/ + pip install -r requirements.txt + pip install uvicorn + pip install -U huggingface_hub + huggingface-cli 
login --token $(cat /secrets/huggingface/token) --add-to-git-credential + + python3 openai_frontend/main.py --model-repository /models/tensorrtllm_backend/llama_ifb --tokenizer meta-llama/Llama-2-7b-chat-hf + ports: + - containerPort: 9000 + name: http + - containerPort: 9001 + name: grpc + - containerPort: 9002 + name: metrics + volumeMounts: + - mountPath: /models + name: model-volume + - mountPath: /secrets/huggingface + name: huggingface-secret + readOnly: true + resources: + limits: + ephemeral-storage: 40Gi + nvidia.com/gpu: 1 + memory: 40Gi + requests: + ephemeral-storage: 40Gi + memory: 40Gi + nvidia.com/gpu: 1 + volumes: + - name: model-volume + persistentVolumeClaim: + claimName: llama-model-pvc + - name: huggingface-secret + secret: + secretName: hf-token + +--- +apiVersion: v1 +kind: Service +metadata: + name: llama-triton-service +spec: + type: ClusterIP + ports: + - port: 9000 + targetPort: http + name: http-inference-server + - port: 9001 + targetPort: grpc + name: grpc-inference-server + - port: 9002 + targetPort: metrics + name: http-metrics + selector: + app: llama-triton diff --git a/config/manifests/triton/ext_proc.yaml b/config/manifests/triton/ext_proc.yaml new file mode 100644 index 000000000..a794bdb2d --- /dev/null +++ b/config/manifests/triton/ext_proc.yaml @@ -0,0 +1,115 @@ +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read +rules: +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencemodels"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools"] + verbs: ["get", "watch", "list"] +- apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["get", "watch", "list"] +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: 
+ - create +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read-binding +subjects: +- kind: ServiceAccount + name: default + namespace: default +roleRef: + kind: ClusterRole + name: pod-read +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + labels: + name: triton-llama2-7b-pool +spec: + targetPortNumber: 9000 + selector: + app: llama-triton + extensionRef: + name: inference-gateway-ext-proc +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference-gateway-ext-proc + namespace: default + labels: + app: inference-gateway-ext-proc +spec: + replicas: 1 + selector: + matchLabels: + app: inference-gateway-ext-proc + template: + metadata: + labels: + app: inference-gateway-ext-proc + spec: + containers: + - name: inference-gateway-ext-proc + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/triton-test/epp_triton_metrics:latest + imagePullPolicy: Always + args: + - -poolName + - "triton-llama2-7b-pool" + - -v + - "3" + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + ports: + - containerPort: 9002 + - containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 +--- +apiVersion: v1 +kind: Service +metadata: + name: inference-gateway-ext-proc + namespace: default +spec: + selector: + app: inference-gateway-ext-proc + ports: + - protocol: TCP + port: 9002 + targetPort: 9002 + type: ClusterIP diff --git a/config/manifests/triton/inferencemodel.yaml b/config/manifests/triton/inferencemodel.yaml new file mode 100644 index 000000000..db643a85c --- /dev/null +++ b/config/manifests/triton/inferencemodel.yaml @@ -0,0 +1,9 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: triton-llama2-7b-model 
+spec: + modelName: ensemble + criticality: Standard + poolRef: + name: triton-llama2-7b-pool diff --git a/config/manifests/triton/triton-set-up.yaml b/config/manifests/triton/triton-set-up.yaml new file mode 100644 index 000000000..08fa0852c --- /dev/null +++ b/config/manifests/triton/triton-set-up.yaml @@ -0,0 +1,111 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: llama-model-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 200Gi + +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: llama-build-job +spec: + backoffLimit: 0 + template: + metadata: + labels: + app: llama-triton + spec: + containers: + - name: llama-builder + image: nvcr.io/nvidia/tritonserver:25.02-trtllm-python-py3 # Use the base Triton image directly + command: ["/bin/bash", "-c"] + args: + - | + set -e # Exit on error + + apt-get update && apt-get install -y python3.12-venv + + # Create and activate a virtual environment + python3 -m venv /opt/venv + source /opt/venv/bin/activate + + # Install git (it might not be in the base image) + apt-get update && apt-get install -y --no-install-recommends git + + # Clone the tensorrt_llm_backend repository and set up submodule + git clone -b triton-llm/v0.17.0 https://github.com/triton-inference-server/tensorrtllm_backend.git /models/tensorrtllm_backend + cd /models/tensorrtllm_backend + git lfs install + git submodule update --init --recursive + + # --- Hugging Face Setup --- + # 1. Install the Hugging Face CLI + pip install -U huggingface_hub + pip install transformers + pip install --extra-index-url https://pypi.nvidia.com/ tensorrt-llm + pip install tensorrt_llm + + # 2. Log in using the token from the secret + # The secret is mounted as a file. + huggingface-cli login --token $(cat /secrets/huggingface/token) --add-to-git-credential + huggingface-cli download meta-llama/Llama-2-7b-hf --local-dir /models/hf_models/ + + # Download and convert the Hugging Face model. Modify parameters as needed. 
+ export HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('meta-llama/Llama-2-7b-hf', filename='config.json', local_dir='/models/hf_models/')).parent)"` + echo PATH TO LLAMA MODEL: $HF_LLAMA_MODEL + export UNIFIED_CKPT_PATH=/models/tmp/ckpt/llama/7b/ + export ENGINE_PATH=/models/tmp/engines/llama/7b/ + export TRTLLM_MODEL_REPO=/models/tensorrtllm_backend/llama_ifb + python3 /models/tensorrtllm_backend/tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + # Build the TensorRT-LLM engine. Adjust parameters (e.g., world_size) as needed. + trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --kv_cache_type paged \ + --context_fmha enable \ + --gpt_attention_plugin float16 \ + --remove_input_padding enable \ + --max_batch_size 64 + + cp /models/tensorrtllm_backend/all_models/inflight_batcher_llm/ ${TRTLLM_MODEL_REPO} -r + + python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 + python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 + python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 + python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 + python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/tensorrt_llm/config.pbtxt 
triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + + + echo "Build complete!" + volumeMounts: + - mountPath: /models + name: model-volume + - mountPath: /secrets/huggingface + name: huggingface-secret + readOnly: true + resources: + limits: + ephemeral-storage: 80Gi + nvidia.com/gpu: 1 + memory: 40Gi + requests: + ephemeral-storage: 80Gi + nvidia.com/gpu: 1 + memory: 40Gi + restartPolicy: Never + volumes: + - name: model-volume + persistentVolumeClaim: + claimName: llama-model-pvc + - name: huggingface-secret + secret: + secretName: hf-token diff --git a/pkg/epp/backend/provider.go b/pkg/epp/backend/provider.go new file mode 100644 index 000000000..959f3e0c9 --- /dev/null +++ b/pkg/epp/backend/provider.go @@ -0,0 +1,183 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package backend
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/go-logr/logr"
+	"go.uber.org/multierr"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+const (
+	// fetchMetricsTimeout bounds a single refreshMetricsOnce pass (all pods).
+	fetchMetricsTimeout = 5 * time.Second
+)
+
+// NewProvider returns a Provider that scrapes pods from datastore using pmc.
+func NewProvider(pmc PodMetricsClient, datastore datastore.Datastore) *Provider {
+	return &Provider{
+		pmc:       pmc,
+		datastore: datastore,
+	}
+}
+
+// Provider provides backend pods and information such as metrics.
+type Provider struct {
+	pmc       PodMetricsClient
+	datastore datastore.Datastore
+}
+
+// PodMetricsClient fetches fresh metrics for a single pod.
+type PodMetricsClient interface {
+	FetchMetrics(ctx context.Context, existing *datastore.PodMetrics, port int32) (*datastore.PodMetrics, error)
+}
+
+// Init starts the background goroutines that keep pod metrics fresh and
+// returns immediately. It always returns nil.
+//
+// Up to three goroutines are started, all of which stop when ctx is cancelled:
+//   - a metrics prober that calls refreshMetricsOnce every refreshMetricsInterval,
+//   - a flusher that calls flushPrometheusMetricsOnce every
+//     refreshPrometheusMetricsInterval,
+//   - when DEBUG logging is enabled, a logger that dumps all pod metrics every
+//     five seconds.
+func (p *Provider) Init(ctx context.Context, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error {
+	logger := log.FromContext(ctx)
+
+	// every waits interval, runs fn, and repeats until ctx is cancelled, at
+	// which point it logs stopMsg on l and returns. Waiting on a timer inside
+	// the select (rather than time.Sleep in a default branch) lets the loop
+	// observe cancellation immediately instead of after a full interval.
+	every := func(interval time.Duration, l logr.Logger, stopMsg string, fn func()) {
+		for {
+			timer := time.NewTimer(interval)
+			select {
+			case <-ctx.Done():
+				timer.Stop()
+				l.V(logutil.DEFAULT).Info(stopMsg)
+				return
+			case <-timer.C:
+				fn()
+			}
+		}
+	}
+
+	// Periodically refresh metrics.
+	go every(refreshMetricsInterval, logger, "Shutting down metrics prober", func() {
+		if err := p.refreshMetricsOnce(logger); err != nil {
+			logger.V(logutil.DEFAULT).Error(err, "Failed to refresh metrics")
+		}
+	})
+
+	// Periodically flush prometheus metrics for inference pool.
+	go every(refreshPrometheusMetricsInterval, logger, "Shutting down prometheus metrics thread", func() {
+		p.flushPrometheusMetricsOnce(logger)
+	})
+
+	// Periodically print out the pods and metrics for DEBUGGING.
+	if debugLogger := logger.V(logutil.DEBUG); debugLogger.Enabled() {
+		go every(5*time.Second, debugLogger, "Shutting down metrics logger thread", func() {
+			debugLogger.Info("Current Pods and metrics gathered", "metrics", p.datastore.PodGetAll())
+		})
+	}
+
+	return nil
+}
+
+// refreshMetricsOnce fetches metrics for every pod in the datastore
+// concurrently and stores the results. It returns nil when no pool is set,
+// otherwise the combined errors of all failed fetches (nil if none failed).
+// The whole pass shares one fetchMetricsTimeout deadline.
+func (p *Provider) refreshMetricsOnce(logger logr.Logger) error {
+	loggerTrace := logger.V(logutil.TRACE)
+	pool, _ := p.datastore.PoolGet()
+	if pool == nil {
+		loggerTrace.Info("No inference pool or not initialized")
+		return nil
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout)
+	defer cancel()
+	start := time.Now()
+	defer func() {
+		d := time.Since(start)
+		// TODO: add a metric instead of logging
+		loggerTrace.Info("Metrics refreshed", "duration", d)
+	}()
+
+	var wg sync.WaitGroup
+	errCh := make(chan error)
+	processOnePod := func(key, value any) bool {
+		loggerTrace.Info("Pod and metric being processed", "pod", key, "metric", value)
+		existing := value.(*datastore.PodMetrics)
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			updated, err := p.pmc.FetchMetrics(ctx, existing, pool.Spec.TargetPortNumber)
+			if err != nil {
+				// %w keeps the cause inspectable with errors.Is / errors.As.
+				errCh <- fmt.Errorf("failed to parse metrics from %s: %w", existing.NamespacedName, err)
+				return
+			}
+			p.datastore.PodUpdateMetricsIfExist(updated.NamespacedName, &updated.Metrics)
+			loggerTrace.Info("Updated metrics for pod", "pod", updated.NamespacedName, "metrics", updated.Metrics)
+		}()
+		return true
+	}
+	p.datastore.PodRange(processOnePod)
+
+	// Wait for metric collection for all pods to complete and close the error channel in a
+	// goroutine so this is unblocking, allowing the code to proceed to the error collection code
+	// below.
+	// Note we couldn't use a buffered error channel with a size because the size of the podMetrics
+	// sync.Map is unknown beforehand.
+	go func() {
+		wg.Wait()
+		close(errCh)
+	}()
+
+	var errs error
+	for err := range errCh {
+		errs = multierr.Append(errs, err)
+	}
+	return errs
+}
+
+// flushPrometheusMetricsOnce records the pool-wide average KV-cache usage and
+// average waiting-queue size across all pods currently in the datastore. It
+// does nothing when no pool is set or there are no pods.
+func (p *Provider) flushPrometheusMetricsOnce(logger logr.Logger) {
+	pool, _ := p.datastore.PoolGet()
+	if pool == nil {
+		// No inference pool or not initialized.
+		return
+	}
+
+	podMetrics := p.datastore.PodGetAll()
+	logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics))
+	if len(podMetrics) == 0 {
+		return
+	}
+
+	var kvCacheTotal float64
+	var queueTotal int
+	for _, pod := range podMetrics {
+		kvCacheTotal += pod.KVCacheUsagePercent
+		queueTotal += pod.WaitingQueueSize
+	}
+
+	podTotalCount := float64(len(podMetrics))
+	metrics.RecordInferencePoolAvgKVCache(pool.Name, kvCacheTotal/podTotalCount)
+	// Convert before dividing: integer division would truncate the average
+	// (e.g. 3 queued requests over 2 pods would be reported as 1, not 1.5).
+	metrics.RecordInferencePoolAvgQueueSize(pool.Name, float64(queueTotal)/podTotalCount)
+}
diff --git a/pkg/epp/backend/triton/metrics.go b/pkg/epp/backend/triton/metrics.go new file mode 100644 index 000000000..2f8d24bd9 --- /dev/null +++ b/pkg/epp/backend/triton/metrics.go @@ -0,0 +1,270 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package triton
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"strconv"
+	"strings"
+
+	"github.com/go-logr/logr"
+	dto "github.com/prometheus/client_model/go"
+	"github.com/prometheus/common/expfmt"
+	"go.uber.org/multierr"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+const (
+	// Triton metrics, see https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/metrics.html
+
+	// Names of the two TRT-LLM metric families read by promToPodMetrics, and
+	// the label key that distinguishes the individual series inside each one.
+	TRTLLMRequestMetricsName  = "nv_trt_llm_request_metrics"
+	TRTLLMKvCacheMetricsName  = "nv_trt_llm_kv_cache_block_metrics"
+	TRTLLMKvCacheMetricsLabel = "kv_cache_block_type"
+	TRTLLMRequestMetricsLabel = "request_type"
+
+	// THESE ARE UNUSED, EXAMPLES FOR MORE METRICS
+	// NOTE(review): gpuMemoryTotalMetricName / gpuMemoryTotalBytesMetricName and
+	// gpuMemoryUsedMetricName / gpuMemoryUsedBytesMetricName hold identical
+	// values; one of each pair is redundant.
+	inferenceCountMetricName           = "nv_inference_count"
+	inferenceSuccessMetricName         = "nv_inference_request_success"
+	inferenceExecCountMetricName       = "nv_inference_exec_count"
+	inferenceRequestDurationMetricName = "nv_inference_request_duration_us"
+	waitingQueueSizeMetricName         = "nv_inference_pending_request_count"
+	queueDurationMetricName            = "nv_inference_queue_duration_us"
+	computeInputDurationMetricName     = "nv_inference_compute_input_duration_us"
+	computeInferDurationMetricName     = "nv_inference_compute_infer_duration_us"
+	computeOutputDurationMetricName    = "nv_inference_compute_output_duration_us"
+	gpuUtilizationMetricName           = "nv_gpu_utilization"
+	gpuMemoryTotalMetricName           = "nv_gpu_memory_total_bytes"
+	gpuMemoryUsedMetricName            = "nv_gpu_memory_used_bytes"
+	gpuPowerUsageMetricName            = "nv_gpu_power_usage"
+	gpuPowerLimitMetricName            = "nv_gpu_power_limit"
+	gpuMemoryTotalBytesMetricName      = "nv_gpu_memory_total_bytes"
+	gpuMemoryUsedBytesMetricName       = "nv_gpu_memory_used_bytes"
+)
+
+// PodMetricsClientImpl scrapes a pod's Prometheus endpoint and converts the
+// TRT-LLM metric families into datastore.PodMetrics. It holds no state.
+type PodMetricsClientImpl struct{}
+
+// FetchMetrics fetches metrics from a given pod.
+//
+// It issues an HTTP GET to existing.BuildScrapeEndpoint(), parses the body as
+// Prometheus text exposition format, and hands the parsed families to
+// promToPodMetrics. A request failure, a non-200 status, or an unparsable body
+// yields a nil result and a non-nil error.
+//
+// NOTE(review): backend.PodMetricsClient (pkg/epp/backend/provider.go in this
+// same patch) declares FetchMetrics with a trailing `port int32` parameter.
+// This method omits it, so *PodMetricsClientImpl does not satisfy that
+// interface as written — reconcile the two signatures.
+func (p *PodMetricsClientImpl) FetchMetrics(
+	ctx context.Context,
+	existing *datastore.PodMetrics,
+) (*datastore.PodMetrics, error) {
+	logger := log.FromContext(ctx)
+	loggerDefault := logger.V(logutil.DEFAULT)
+
+	// existing.ScrapePort = 8002 // triton has a different port for metrics than the target port for inference
+	url := existing.BuildScrapeEndpoint()
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
+		// %w (not %v) so callers can still inspect the cause.
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName)
+		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err)
+	}
+	defer func() {
+		// Best effort: nothing useful can be done with a close error here.
+		_ = resp.Body.Close()
+	}()
+
+	if resp.StatusCode != http.StatusOK {
+		loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode)
+		return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode)
+	}
+
+	parser := expfmt.TextParser{}
+	metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
+	if err != nil {
+		// Add the pod so a bare parser error can be traced back to its source.
+		return nil, fmt.Errorf("failed to parse metrics from %s: %w", existing.NamespacedName, err)
+	}
+	return promToPodMetrics(logger, metricFamilies, existing)
+}
+
+// promToPodMetrics updates internal pod metrics with scraped Prometheus metrics.
+//
+// It works on a clone of existing and returns that clone together with every
+// error met along the way (combined with multierr). Fields whose source series
+// is missing keep the value they had in existing, so the result is usable even
+// when the returned error is non-nil.
+//
+// Field mapping:
+//   - RunningQueueSize        <- request_type="active"
+//   - WaitingQueueSize        <- request_type="scheduled"
+//   - KVCacheUsagePercent     <- used / max KV-cache blocks (0 when max <= 0)
+//   - KvCacheMaxTokenCapacity <- tokens_per * max KV-cache blocks
+//
+// NOTE(review): "scheduled" is used as the waiting-queue signal — confirm this
+// is the intended request_type for queued requests.
+func promToPodMetrics(
+	logger logr.Logger,
+	metricFamilies map[string]*dto.MetricFamily,
+	existing *datastore.PodMetrics,
+) (*datastore.PodMetrics, error) {
+	var errs error
+	updated := existing.Clone()
+
+	// Request gauges from the "nv_trt_llm_request_metrics" family.
+	if requestMetrics, err := getLatestMetric(logger, metricFamilies, TRTLLMRequestMetricsName); err != nil {
+		errs = multierr.Append(errs, err)
+	} else {
+		if active, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "active"); err != nil {
+			errs = multierr.Append(errs, err)
+		} else {
+			updated.Metrics.RunningQueueSize = int(active)
+		}
+		if scheduled, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "scheduled"); err != nil {
+			errs = multierr.Append(errs, err)
+		} else {
+			updated.Metrics.WaitingQueueSize = int(scheduled)
+		}
+	}
+
+	// KV-cache gauges from the "nv_trt_llm_kv_cache_block_metrics" family.
+	kvCacheBlocks, err := getLatestMetric(logger, metricFamilies, TRTLLMKvCacheMetricsName)
+	if err != nil {
+		return updated, multierr.Append(errs, err)
+	}
+
+	// Both derived values need the block count, so without it there is nothing
+	// more to compute.
+	maxBlocks, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "max")
+	if err != nil {
+		return updated, multierr.Append(errs, err)
+	}
+
+	if used, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "used"); err != nil {
+		errs = multierr.Append(errs, err)
+	} else {
+		usage := 0.0
+		if maxBlocks > 0 { // guard against division by zero
+			usage = used / maxBlocks
+		}
+		updated.Metrics.KVCacheUsagePercent = usage
+	}
+
+	if tokensPerBlock, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "tokens_per"); err != nil {
+		errs = multierr.Append(errs, err)
+	} else {
+		updated.Metrics.KvCacheMaxTokenCapacity = int(tokensPerBlock * maxBlocks)
+	}
+
+	return updated, errs
+}
+
+// getLatestMetric gets the latest metric of a family.
+//
+// It looks up metricName in metricFamilies and returns a copy of that family
+// containing every sample that carries the greatest timestamp. A family holds
+// one sample per label combination (e.g. request_type="active" and
+// request_type="scheduled"), and samples without a timestamp all report 0, so
+// all samples tied for the latest timestamp must be kept — keeping only one
+// would drop the other label variants. An error is returned when the family is
+// absent or has no samples.
+func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.MetricFamily, error) {
+	mf, ok := metricFamilies[metricName]
+	if !ok {
+		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", metricName)
+		return nil, fmt.Errorf("metric family %q not found", metricName)
+	}
+	samples := mf.GetMetric()
+	if len(samples) == 0 {
+		return nil, fmt.Errorf("no metrics available for %q", metricName)
+	}
+
+	// First pass: find the newest timestamp present in the family.
+	latestTs := samples[0].GetTimestampMs()
+	for _, m := range samples[1:] {
+		if ts := m.GetTimestampMs(); ts > latestTs {
+			latestTs = ts
+		}
+	}
+
+	// Second pass: keep every sample recorded at that timestamp.
+	latest := make([]*dto.Metric, 0, len(samples))
+	for _, m := range samples {
+		if m.GetTimestampMs() == latestTs {
+			latest = append(latest, m)
+		}
+	}
+
+	latestMf := &dto.MetricFamily{
+		Name:   mf.Name,
+		Help:   mf.Help,
+		Type:   mf.Type,
+		Metric: latest,
+	}
+
+	logger.V(logutil.TRACE).Info("Metric value selected", "metric Family", latestMf, "metric", metricName)
+	return latestMf, nil
+}
+
+// getGaugeMetricForPod gets gauge metric value for a given pod.
+//
+// It returns the gauge value of the first sample that has a "pod" or
+// "gpu_uuid" label whose value contains podIdentifier (substring match). When
+// no sample matches it returns -1 and an error.
+func getGaugeMetricForPod(logger logr.Logger, mf *dto.MetricFamily, podIdentifier string) (float64, error) {
+	for _, m := range mf.GetMetric() {
+		for _, label := range m.GetLabel() {
+			if (label.GetName() == "pod" || label.GetName() == "gpu_uuid") && strings.Contains(label.GetValue(), podIdentifier) {
+				logger.V(logutil.TRACE).Info("Pod metric found", "value", m.GetGauge().GetValue(), "labelName", label.GetName(), "labelValue", label.GetValue())
+
+				return m.GetGauge().GetValue(), nil
+			}
+		}
+	}
+	logger.V(logutil.TRACE).Info("Metric Value not found for pod", "pod", podIdentifier, "metric family", mf.GetName())
+	return -1, fmt.Errorf("metric value not found for pod %s in metric family %s", podIdentifier, mf.GetName())
+}
+
+// getCounterMetricForPod gets counter metric value for a given pod.
+//
+// It returns the counter value of the first sample whose "pod" label equals
+// podName exactly. A counter value that is not a whole number representable as
+// int yields -1 and an error.
+//
+// NOTE(review): when no sample matches, this returns (-1, nil) rather than an
+// error, unlike getGaugeMetricForPod — callers must check for -1.
+func getCounterMetricForPod(logger logr.Logger, mf *dto.MetricFamily, podName string) (int, error) {
+	for _, m := range mf.GetMetric() {
+		for _, label := range m.GetLabel() {
+			if label.GetName() != "pod" || label.GetValue() != podName {
+				continue
+			}
+			val := m.GetCounter().GetValue()
+			// Format with 'f' so large values are written out in full: the
+			// default %v formatting prints 1000000 as "1e+06", which Atoi
+			// rejects.
+			intVal, err := strconv.Atoi(strconv.FormatFloat(val, 'f', -1, 64))
+			if err != nil {
+				return -1, fmt.Errorf("failed to convert counter metric to int: %w", err)
+			}
+			logger.V(logutil.TRACE).Info("Pod metric found", "value", intVal)
+
+			return intVal, nil
+		}
+	}
+	return -1, nil
+}
+
+// TRTLLM metrics
+
+// getTrtLlmMetric gets a TRT LLM metric with the specified type, key, and value.
+//
+// It returns the value of the first sample in mf that carries the label
+// key=value, provided mf's type equals metricType. Only gauge and counter
+// types are read; a counter must hold a whole number representable as int.
+// It returns -1 and an error when mf's type differs from metricType, when no
+// sample carries the label, or when a counter value cannot be converted.
+func getTrtLlmMetric(logger logr.Logger, mf *dto.MetricFamily, metricType dto.MetricType, key, value string) (float64, error) {
+	// The family type is the same for every sample, so check it once up front
+	// instead of once per label.
+	if mf.GetType() == metricType {
+		for _, m := range mf.GetMetric() {
+			matched := false
+			for _, label := range m.GetLabel() {
+				if label.GetName() == key && label.GetValue() == value {
+					matched = true
+					break
+				}
+			}
+			if !matched {
+				continue
+			}
+
+			switch metricType {
+			case dto.MetricType_GAUGE:
+				logger.V(logutil.TRACE).Info("TRT LLM gauge metric found", "value", m.GetGauge().GetValue(), "key", key, "value", value)
+				return m.GetGauge().GetValue(), nil
+			case dto.MetricType_COUNTER:
+				val := m.GetCounter().GetValue()
+				// 'f' formatting avoids the exponent form ("1e+06") that
+				// %v produces for values >= 1e6 and that Atoi cannot parse.
+				intVal, err := strconv.Atoi(strconv.FormatFloat(val, 'f', -1, 64))
+				if err != nil {
+					return -1, fmt.Errorf("failed to convert counter metric to int: %w", err)
+				}
+				logger.V(logutil.TRACE).Info("TRT LLM counter metric found", "value", intVal, "key", key, "value", value)
+				return float64(intVal), nil
+			}
+		}
+	}
+	return -1, fmt.Errorf("TRT LLM metric not found: %s{ %s=\"%s\" }", mf.GetName(), key, value)
+}
+
+// getTrtLlmGaugeMetric gets a gauge TRT LLM metric.
+func getTrtLlmGaugeMetric(logger logr.Logger, mf *dto.MetricFamily, key, value string) (float64, error) {
+	// Thin wrapper: fixes the metric type to GAUGE and delegates the lookup of
+	// the sample labelled key=value to getTrtLlmMetric.
+	return getTrtLlmMetric(logger, mf, dto.MetricType_GAUGE, key, value)
+}
+
+// getTrtLlmCounterMetric gets a counter TRT LLM metric.
+// It delegates to getTrtLlmMetric with the metric type fixed to COUNTER and
+// returns that function's result unchanged.
+func getTrtLlmCounterMetric(logger logr.Logger, mf *dto.MetricFamily, key, value string) (float64, error) {
+	return getTrtLlmMetric(logger, mf, dto.MetricType_COUNTER, key, value)
+}
diff --git a/pkg/epp/backend/triton/metrics_test.go b/pkg/epp/backend/triton/metrics_test.go new file mode 100644 index 000000000..f9b960a52 --- /dev/null +++ b/pkg/epp/backend/triton/metrics_test.go @@ -0,0 +1,241 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package triton + +import ( + "testing" + + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "google.golang.org/protobuf/proto" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +func TestPromToPodMetrics(t *testing.T) { + logger := logutil.NewTestLogger() + + podName := "test-pod" + podAddress := "10.0.0.1" + + testCases := []struct { + name string + metricFamilies map[string]*dto.MetricFamily + expectedMetrics *datastore.PodMetrics + expectedErr bool + initialPodMetrics *datastore.PodMetrics + }{ + { + name: "all metrics available", + metricFamilies: allMetricsAvailable(podName), + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + NamespacedName: types.NamespacedName{Name: podName}, + Address: podAddress, + ScrapePort: 9000, + ScrapePath: "/metrics", + }, + Metrics: datastore.Metrics{ + RunningQueueSize: 1, + WaitingQueueSize: 2, + KVCacheUsagePercent: 0.5, // used / max = 50 / 100 + KvCacheMaxTokenCapacity: 5000, // max_blocks * tokens_per_block = 100 * 50 + }, + }, + initialPodMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + NamespacedName: types.NamespacedName{Name: podName}, + Address: podAddress, + ScrapePort: 9000, + ScrapePath: "/metrics", + }, + Metrics: datastore.Metrics{}, + }, + expectedErr: false, + }, + { + name: "missing metrics", + metricFamilies: map[string]*dto.MetricFamily{}, // No metrics provided + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + NamespacedName: types.NamespacedName{Name: podName}, + Address: podAddress, + ScrapePort: 9000, + ScrapePath: "/metrics", + }, + Metrics: datastore.Metrics{ + RunningQueueSize: 0, // Default int value + WaitingQueueSize: 0, // Default int value + KVCacheUsagePercent: 0, // Default float64 value + KvCacheMaxTokenCapacity: 0, // Default int value + }, + }, + initialPodMetrics: 
&datastore.PodMetrics{ + Pod: datastore.Pod{ + NamespacedName: types.NamespacedName{Name: podName}, + Address: podAddress, + ScrapePort: 9000, + ScrapePath: "/metrics", + }, + Metrics: datastore.Metrics{}, + }, + expectedErr: false, + }, + { + name: "multiple timestamps", + metricFamilies: multipleMetricsWithDifferentTimestamps(podName), + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + NamespacedName: types.NamespacedName{Name: podName}, + Address: podAddress, + ScrapePort: 9000, + ScrapePath: "/metrics", + }, + Metrics: datastore.Metrics{ + RunningQueueSize: 1, // from latest + WaitingQueueSize: 2, // from latest + KVCacheUsagePercent: 0.5, // used / max = 50 / 100 (from latest) + KvCacheMaxTokenCapacity: 5000, // max_blocks * tokens_per_block = 100 * 50 (from latest) + }, + }, + initialPodMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + NamespacedName: types.NamespacedName{Name: podName}, + Address: podAddress, + ScrapePort: 9000, + ScrapePath: "/metrics", + }, + Metrics: datastore.Metrics{}, + }, + expectedErr: false, + }, + { + name: "empty metric family", + metricFamilies: map[string]*dto.MetricFamily{ + TRTLLMRequestMetricsName: { + Name: proto.String(TRTLLMRequestMetricsName), + Type: dto.MetricType_GAUGE.Enum(), + Metric: []*dto.Metric{}, // Empty + }, + }, + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + NamespacedName: types.NamespacedName{Name: podName}, + Address: podAddress, + ScrapePort: 9000, + ScrapePath: "/metrics", + }, + Metrics: datastore.Metrics{}, + }, + initialPodMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + NamespacedName: types.NamespacedName{Name: podName}, + Address: podAddress, + ScrapePort: 9000, + ScrapePath: "/metrics", + }, + Metrics: datastore.Metrics{}, + }, + expectedErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + updated, err := promToPodMetrics(logger, tc.metricFamilies, tc.initialPodMetrics) + if tc.expectedErr { + assert.Error(t, 
err) + } else { + assert.NoError(t, err) + assert.Equal(t, tc.expectedMetrics, updated) + } + }) + } +} + +// --- Helper Functions --- + +func allMetricsAvailable(podName string) map[string]*dto.MetricFamily { + return map[string]*dto.MetricFamily{ + TRTLLMRequestMetricsName: { + Name: proto.String(TRTLLMRequestMetricsName), + Type: dto.MetricType_GAUGE.Enum(), + Metric: []*dto.Metric{ + trtLlmRequestMetric("active", 1, 200), + trtLlmRequestMetric("scheduled", 2, 200), + }, + }, + TRTLLMKvCacheMetricsName: { + Name: proto.String(TRTLLMKvCacheMetricsName), + Type: dto.MetricType_GAUGE.Enum(), + Metric: []*dto.Metric{ + trtLlmKvCacheMetric("max", 100, 200), + trtLlmKvCacheMetric("used", 50, 200), + trtLlmKvCacheMetric("tokens_per", 50, 200), + }, + }, + } +} + +func multipleMetricsWithDifferentTimestamps(podName string) map[string]*dto.MetricFamily { + return map[string]*dto.MetricFamily{ + TRTLLMRequestMetricsName: { + Name: proto.String(TRTLLMRequestMetricsName), + Type: dto.MetricType_GAUGE.Enum(), + Metric: []*dto.Metric{ + trtLlmRequestMetric("active", 0, 100), // Older + trtLlmRequestMetric("scheduled", 3, 100), // Older + trtLlmRequestMetric("active", 1, 200), // Newer + trtLlmRequestMetric("scheduled", 2, 200), // Newer + + }, + }, + TRTLLMKvCacheMetricsName: { + Name: proto.String(TRTLLMKvCacheMetricsName), + Type: dto.MetricType_GAUGE.Enum(), + Metric: []*dto.Metric{ + trtLlmKvCacheMetric("max", 110, 100), //Older + trtLlmKvCacheMetric("used", 60, 100), //Older + trtLlmKvCacheMetric("tokens_per", 40, 100), //Older + trtLlmKvCacheMetric("max", 100, 200), // Newer + trtLlmKvCacheMetric("used", 50, 200), // Newer + trtLlmKvCacheMetric("tokens_per", 50, 200), // Newer + }, + }, + } +} + +func trtLlmRequestMetric(requestType string, value float64, timestampMs int64) *dto.Metric { + return &dto.Metric{ + Label: []*dto.LabelPair{ + {Name: proto.String(TRTLLMRequestMetricsLabel), Value: proto.String(requestType)}, + }, + Gauge: &dto.Gauge{Value: &value}, + 
TimestampMs: &timestampMs, + } +} + +func trtLlmKvCacheMetric(blockType string, value float64, timestampMs int64) *dto.Metric { + return &dto.Metric{ + Label: []*dto.LabelPair{ + {Name: proto.String(TRTLLMKvCacheMetricsLabel), Value: proto.String(blockType)}, + }, + Gauge: &dto.Gauge{Value: &value}, + TimestampMs: &timestampMs, + } +} From 612505425565589d7cd30d3e5860e0164e319925 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 6 Mar 2025 03:33:07 +0000 Subject: [PATCH 02/19] Refactor metrics to work with any prometheus metric naming convention based on EPP runtime flags. --- cmd/epp/main.go | 25 +- config/manifests/ext_proc.yaml | 8 + config/manifests/triton/ext_proc.yaml | 8 + pkg/epp/backend/metrics.go | 321 +++++++++++ pkg/epp/backend/metrics_spec.go | 164 ++++++ pkg/epp/backend/metrics_spec_test.go | 281 ++++++++++ pkg/epp/backend/metrics_test.go | 741 +++++++++++++++++++++++++ pkg/epp/backend/triton/metrics.go | 83 ++- pkg/epp/backend/triton/metrics_test.go | 37 +- pkg/epp/datastore/types.go | 71 +++ 10 files changed, 1665 insertions(+), 74 deletions(-) create mode 100644 pkg/epp/backend/metrics.go create mode 100644 pkg/epp/backend/metrics_spec.go create mode 100644 pkg/epp/backend/metrics_spec_test.go create mode 100644 pkg/epp/backend/metrics_test.go create mode 100644 pkg/epp/datastore/types.go diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 4eaa90c8f..c5264b823 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -39,7 +39,6 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/triton" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" @@ -94,6 +93,15 @@ var ( "certPath", "", "The
path to the certificate for secure serving. The certificate and private key files "+ "are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+ "then a self-signed certificate is used.") + // metric flags + allRequestsMetric = flag.String("allRequestsMetric", "", "Prometheus metric for the total number of processing requests, both queued and running.") + waitingRequestsMetric = flag.String("waitingRequestsMetric", "", "Prometheus metric for the number of queued requests.") + runningRequestsMetric = flag.String("runningRequestsMetric", "", "Prometheus metric for the number of running requests.") + usedKVCacheBlocksMetric = flag.String("usedKVCacheBlocksMetric", "", "Prometheus metric for the number of utilized KV-cache blocks.") + maxKVCacheBlocksMetric = flag.String("maxKVCacheBlocksMetric", "", "Prometheus metric for the total number of available KV-cache blocks.") + kVCacheUsageMetric = flag.String("kVCacheUsageMetric", "", "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") + // LoRA metrics + loraRequestInfoMetric = flag.String("loraRequestInfoMetric", "", "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") setupLog = ctrl.Log.WithName("setup") ) @@ -149,7 +157,20 @@ func run() error { // Setup runner. 
datastore := datastore.NewDatastore(ctx, pmf) // switch case across different model server metrics (triton, vllm) - provider := backend.NewProvider(&triton.PodMetricsClientImpl{}, datastore) + mapping, err := backend.NewMetricMapping( + *allRequestsMetric, + *waitingRequestsMetric, + *runningRequestsMetric, + *usedKVCacheBlocksMetric, + *maxKVCacheBlocksMetric, + *kVCacheUsageMetric, + *loraRequestInfoMetric, + ) + if err != nil { + setupLog.Error(err, "Failed to create metric mapping from flags") + return err + } + provider := backend.NewProvider(&backend.PodMetricsClientImpl{MetricMapping: mapping}, datastore) // serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, diff --git a/config/manifests/ext_proc.yaml b/config/manifests/ext_proc.yaml index d70467ee0..33c47d400 100644 --- a/config/manifests/ext_proc.yaml +++ b/config/manifests/ext_proc.yaml @@ -82,6 +82,14 @@ spec: - "9002" - -grpcHealthPort - "9003" + - -waitingRequestsMetric + - "vllm:num_requests_waiting" + - -runningRequestsMetric + - "vllm:num_requests_running" + - -kVCacheUsageMetric + - "vllm:gpu_cache_usage_perc" + - -loraRequestInfoMetric + - "vllm:lora_requests_info" env: - name: USE_STREAMING value: "false" diff --git a/config/manifests/triton/ext_proc.yaml b/config/manifests/triton/ext_proc.yaml index a794bdb2d..16c802838 100644 --- a/config/manifests/triton/ext_proc.yaml +++ b/config/manifests/triton/ext_proc.yaml @@ -82,6 +82,14 @@ spec: - "9002" - -grpcHealthPort - "9003" + - -allRequestsMetric + - "nv_trt_llm_request_metrics{request_type=active}" + - -runningRequestsMetric + - "nv_trt_llm_request_metrics{request_type=scheduled}" + - -usedKVCacheBlocksMetric + - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=used}" + - -maxKVCacheBlocksMetric + - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=max}" ports: - containerPort: 9002 - containerPort: 9003 diff --git a/pkg/epp/backend/metrics.go b/pkg/epp/backend/metrics.go new file mode 100644 index 
000000000..2f2082652 --- /dev/null +++ b/pkg/epp/backend/metrics.go @@ -0,0 +1,321 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package backend + +import ( + "context" + "fmt" + "net/http" + "strconv" + "strings" + "time" + + "github.com/go-logr/logr" + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + "go.uber.org/multierr" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + // Hardcoded vLLM specific LoRA metrics + LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" + LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters" + LoraRequestInfoMaxAdaptersMetricName = "max_lora" +) + +type PodMetricsClientImpl struct { + MetricMapping *MetricMapping +} + +// FetchMetrics fetches metrics from a given pod. 
+func (p *PodMetricsClientImpl) FetchMetrics( + ctx context.Context, + existing *datastore.PodMetrics, + port int32, +) (*datastore.PodMetrics, error) { + logger := log.FromContext(ctx) + loggerDefault := logger.V(logutil.DEFAULT) + + url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics" + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) + return nil, fmt.Errorf("failed to create request: %v", err) + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName) + return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err) + } + defer func() { + _ = resp.Body.Close() + }() + + if resp.StatusCode != http.StatusOK { + loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode) + return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode) + } + + parser := expfmt.TextParser{} + metricFamilies, err := parser.TextToMetricFamilies(resp.Body) + if err != nil { + return nil, err + } + return p.promToPodMetrics(logger, metricFamilies, existing) +} + +// promToPodMetrics updates internal pod metrics with scraped Prometheus metrics. 
+func (p *PodMetricsClientImpl) promToPodMetrics( + logger logr.Logger, + metricFamilies map[string]*dto.MetricFamily, + existing *datastore.PodMetrics, +) (*datastore.PodMetrics, error) { + var errs error + updated := existing.Clone() + + if p.MetricMapping.RunningRequests != nil { + running, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.RunningRequests) + if err == nil { + updated.RunningQueueSize = int(running.GetGauge().GetValue()) + } else { + errs = multierr.Append(errs, err) + } + } + + if p.MetricMapping.AllRequests != nil { + all, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.AllRequests) + if err == nil { + updated.WaitingQueueSize = int(all.GetGauge().GetValue()) - updated.RunningQueueSize + } else { + errs = multierr.Append(errs, err) + } + } + + if p.MetricMapping.WaitingRequests != nil { + waiting, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.WaitingRequests) + if err == nil { + updated.WaitingQueueSize = int(waiting.GetGauge().GetValue()) + } else { + errs = multierr.Append(errs, err) + } + } + + if p.MetricMapping.KVCacheUsage != nil { + usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUsage) + if err == nil { + updated.KVCacheUsagePercent = usage.GetGauge().GetValue() + } else { + errs = multierr.Append(errs, err) + } + } else if p.MetricMapping.UsedKVCacheBlocks != nil && p.MetricMapping.MaxKVCacheBlocks != nil { + used, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.UsedKVCacheBlocks) + if err != nil { + errs = multierr.Append(errs, err) + } + max, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.MaxKVCacheBlocks) + if err != nil { + errs = multierr.Append(errs, err) + } + if err == nil { + usage := 0.0 + if max.GetGauge().GetValue() > 0 { + usage = used.GetGauge().GetValue() / max.GetGauge().GetValue() + } + updated.KVCacheUsagePercent = usage + } + } + + // Handle LoRA metrics (only if all LoRA MetricSpecs are present) + if 
p.MetricMapping.LoraRequestInfo != nil { + loraMetrics, _, err := p.getLatestLoraMetric(logger, metricFamilies) + errs = multierr.Append(errs, err) + + if loraMetrics != nil { + updated.ActiveModels = make(map[string]int) + for _, label := range loraMetrics.GetLabel() { + if label.GetName() == LoraRequestInfoRunningAdaptersMetricName { + if label.GetValue() != "" { + adapterList := strings.Split(label.GetValue(), ",") + for _, adapter := range adapterList { + updated.ActiveModels[adapter] = 0 + } + } + } + if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName { + if label.GetValue() != "" { + adapterList := strings.Split(label.GetValue(), ",") + for _, adapter := range adapterList { + updated.ActiveModels[adapter] = 0 + } + } + } + if label.GetName() == LoraRequestInfoMaxAdaptersMetricName { + if label.GetValue() != "" { + updated.MaxActiveModels, err = strconv.Atoi(label.GetValue()) + if err != nil { + errs = multierr.Append(errs, err) + } + } + } + } + } + } + + return updated, errs +} + +// getLatestLoraMetric gets the latest LoRA metric series in the gauge metric family `vllm:lora_requests_info`. +// It is fetched specially because each permutation of label key/value pairs generates a new series +// and only the most recent one is useful. The value of each series is its creation timestamp, so we can +// retrieve the latest series by picking the one with the largest value. 
+func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { + if p.MetricMapping.LoraRequestInfo == nil { + return nil, time.Time{}, nil // No LoRA metrics configured + } + + loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName] + if !ok { + logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName) + return nil, time.Time{}, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName) + } + + var latest *dto.Metric + var latestTs float64 // Use float64, as Gauge.Value is float64 + + // Iterate over all metrics in the family. + for _, m := range loraRequests.GetMetric() { + running := "" + waiting := "" + // Check if the metric has the expected LoRA labels. This is important! + hasRequiredLabels := false + for _, lp := range m.GetLabel() { + switch lp.GetName() { + case LoraRequestInfoRunningAdaptersMetricName: + running = lp.GetValue() + hasRequiredLabels = true + case LoraRequestInfoWaitingAdaptersMetricName: + waiting = lp.GetValue() + hasRequiredLabels = true + } + } + //Skip if it does not have the lora labels + if !hasRequiredLabels { + continue + } + // Ignore metrics with both labels empty. + if running == "" && waiting == "" { + continue + } + + // Select the metric with the *largest Gauge Value* (which represents the timestamp). + if m.GetGauge().GetValue() > latestTs { + latestTs = m.GetGauge().GetValue() + latest = m + } + } + if latest == nil { + logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", p.MetricMapping.LoraRequestInfo.MetricName) + return nil, time.Time{}, nil + } + + // Convert the gauge value (creation timestamp) to time.Time. + return latest, time.Unix(0, int64(latestTs*1e9)), nil // Treat the gauge value as seconds: scale to nanoseconds for time.Unix +} + +// getMetric retrieves a specific metric based on MetricSpec. 
+func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { + mf, ok := metricFamilies[spec.MetricName] + if !ok { + logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", spec.MetricName) + return nil, fmt.Errorf("metric family %q not found", spec.MetricName) + } + + if len(mf.GetMetric()) == 0 { + return nil, fmt.Errorf("no metrics available for %q", spec.MetricName) + } + // if there is a specified label, return only that metric in the family + if spec.Labels != nil { + return getLabeledMetric(logger, mf, spec) + } + return getLatestMetric(logger, mf) +} + +// getLatestMetric gets the latest metric of a family (for metrics without labels). +func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, error) { + var latestTs int64 + var latest *dto.Metric + for _, m := range mf.GetMetric() { + if m.GetTimestampMs() >= latestTs { + latestTs = m.GetTimestampMs() + latest = m + } + } + + if latest == nil { + return nil, fmt.Errorf("no metrics found for %q", mf.GetName()) + } + + logger.V(logutil.TRACE).Info("Latest metric value selected", "value", latest, "metric", mf.GetName()) + return latest, nil +} + +// getLabeledMetric gets the latest metric with matching labels. 
+func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { + var latestMetric *dto.Metric + var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater + + for _, m := range mf.GetMetric() { + if labelsMatch(m.GetLabel(), spec.Labels) { + if m.GetTimestampMs() > latestTimestamp { + latestTimestamp = m.GetTimestampMs() + latestMetric = m + } + } + } + + if latestMetric != nil { + logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "metric", spec.MetricName) + return latestMetric, nil + } + + return nil, fmt.Errorf("no matching labeled metric found for %q with labels %v", spec.MetricName, spec.Labels) +} + +// labelsMatch checks if a metric's labels contain all the labels in the spec. +func labelsMatch(metricLabels []*dto.LabelPair, specLabels map[string]string) bool { + if len(specLabels) == 0 { + return true // No specific labels required + } + + for specName, specValue := range specLabels { + found := false + for _, label := range metricLabels { + if label.GetName() == specName && label.GetValue() == specValue { + found = true + break + } + } + if !found { + return false // A required label is missing + } + } + return true // All required labels are present +} diff --git a/pkg/epp/backend/metrics_spec.go b/pkg/epp/backend/metrics_spec.go new file mode 100644 index 000000000..aabcf9835 --- /dev/null +++ b/pkg/epp/backend/metrics_spec.go @@ -0,0 +1,164 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package backend + +import ( + "fmt" + "strings" +) + +// MetricSpec represents a single metric's specification. +type MetricSpec struct { + MetricName string + Labels map[string]string // Label name -> Label value +} + +// MetricMapping holds named MetricSpecs. +type MetricMapping struct { + AllRequests *MetricSpec // Option 1 + WaitingRequests *MetricSpec // Option 2 + RunningRequests *MetricSpec // Required + UsedKVCacheBlocks *MetricSpec // Optional (part of a group) + MaxKVCacheBlocks *MetricSpec // Optional (part of a group) + KVCacheUsage *MetricSpec // Optional (alternative to the group above) + // LoRA Metrics (vLLM Specific, optional) + LoraRequestInfo *MetricSpec +} + +// stringToMetricSpec converts a string to a MetricSpec. +// Example inputs: +// +// "metric_name" +// "metric_name{label1=value1}" +// "metric_name{label1=value1,label2=value2}" +func stringToMetricSpec(specStr string) (*MetricSpec, error) { + if specStr == "" { + return nil, nil // Allow empty strings to represent nil MetricSpecs + } + + specStr = strings.TrimSpace(specStr) + metricName := specStr + labels := make(map[string]string) + + // Check for labels enclosed in curly braces + start := strings.Index(specStr, "{") + end := strings.Index(specStr, "}") + + if start != -1 || end != -1 { // If *either* brace is present... + if start == -1 || end == -1 || end <= start+1 { // ...check that *both* are present and correctly placed. 
+ return nil, fmt.Errorf("invalid metric spec string: %q, missing or malformed label block", specStr) + } + + metricName = strings.TrimSpace(specStr[:start]) + labelStr := specStr[start+1 : end] + + // Split into individual label pairs + labelPairs := strings.Split(labelStr, ",") + for _, pair := range labelPairs { + pair = strings.TrimSpace(pair) + parts := strings.Split(pair, "=") + if len(parts) != 2 { + return nil, fmt.Errorf("invalid label pair: %q in metric spec: %q", pair, specStr) + } + labelName := strings.TrimSpace(parts[0]) + labelValue := strings.TrimSpace(parts[1]) + if labelName == "" || labelValue == "" { + return nil, fmt.Errorf("empty label name or value in pair: %q in metric spec: %q", pair, specStr) + } + labels[labelName] = labelValue + } + // Check for extra characters after labels + if end != len(specStr)-1 { + return nil, fmt.Errorf("invalid characters after label section in: %q", specStr) + } + + } + + if metricName == "" { //Metric name cannot be empty + return nil, fmt.Errorf("empty metric name in spec: %q", specStr) + } + + return &MetricSpec{ + MetricName: metricName, + Labels: labels, + }, nil +} + +// NewMetricMapping creates a MetricMapping from string values. 
+func NewMetricMapping(allStr, waitingStr, runningStr, usedBlocksStr, maxBlocksStr, usageStr, loraReqInfoStr string) (*MetricMapping, error) { + allSpec, err := stringToMetricSpec(allStr) + if err != nil { + return nil, fmt.Errorf("error parsing AllRequests: %w", err) + } + waitingSpec, err := stringToMetricSpec(waitingStr) + if err != nil { + return nil, fmt.Errorf("error parsing WaitingRequests: %w", err) + } + runningSpec, err := stringToMetricSpec(runningStr) + if err != nil { + return nil, fmt.Errorf("error parsing RunningRequests: %w", err) + } + usedBlocksSpec, err := stringToMetricSpec(usedBlocksStr) + if err != nil { + return nil, fmt.Errorf("error parsing UsedKVCacheBlocks: %w", err) + } + maxBlocksSpec, err := stringToMetricSpec(maxBlocksStr) + if err != nil { + return nil, fmt.Errorf("error parsing MaxKVCacheBlocks: %w", err) + } + usageSpec, err := stringToMetricSpec(usageStr) + if err != nil { + return nil, fmt.Errorf("error parsing KVCacheUsage: %w", err) + } + loraReqInfoSpec, err := stringToMetricSpec(loraReqInfoStr) + if err != nil { + return nil, fmt.Errorf("error parsing loraReqInfoStr: %w", err) + } + mapping := &MetricMapping{ + AllRequests: allSpec, + WaitingRequests: waitingSpec, + RunningRequests: runningSpec, + UsedKVCacheBlocks: usedBlocksSpec, + MaxKVCacheBlocks: maxBlocksSpec, + KVCacheUsage: usageSpec, + LoraRequestInfo: loraReqInfoSpec, + } + + if err := mapping.Validate(); err != nil { + return nil, err // Return validation error + } + + return mapping, nil +} + +// Validate checks if the MetricMapping is valid. +func (m *MetricMapping) Validate() error { + // 1. WaitingRequests OR AllRequests (but not both can be nil) + if m.WaitingRequests == nil && m.AllRequests == nil { + return fmt.Errorf("either WaitingRequests or AllRequests must be specified") + } + + if m.RunningRequests == nil { + return fmt.Errorf("RunningRequests is required") + } + + // 2. 
KVCacheUsage OR (UsedKVCacheBlocks AND MaxKVCacheBlocks) + if m.KVCacheUsage == nil && (m.UsedKVCacheBlocks == nil || m.MaxKVCacheBlocks == nil) { + return fmt.Errorf("either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified") + } + return nil +} diff --git a/pkg/epp/backend/metrics_spec_test.go b/pkg/epp/backend/metrics_spec_test.go new file mode 100644 index 000000000..084ae5b5a --- /dev/null +++ b/pkg/epp/backend/metrics_spec_test.go @@ -0,0 +1,281 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package backend + +import ( + "reflect" + "strings" + "testing" +) + +func TestStringToMetricSpec(t *testing.T) { + tests := []struct { + name string + input string + want *MetricSpec + wantErr bool + }{ + { + name: "empty string", + input: "", + want: nil, + wantErr: false, + }, + { + name: "no labels", + input: "my_metric", + want: &MetricSpec{ + MetricName: "my_metric", + Labels: map[string]string{}, + }, + wantErr: false, + }, + { + name: "one label", + input: "my_metric{label1=value1}", + want: &MetricSpec{ + MetricName: "my_metric", + Labels: map[string]string{ + "label1": "value1", + }, + }, + wantErr: false, + }, + { + name: "multiple labels", + input: "my_metric{label1=value1,label2=value2}", + want: &MetricSpec{ + MetricName: "my_metric", + Labels: map[string]string{ + "label1": "value1", + "label2": "value2", + }, + }, + wantErr: false, + }, + { + name: "extra whitespace", + input: " my_metric { label1 = value1 , label2 = value2 } ", + want: &MetricSpec{ + MetricName: "my_metric", + Labels: map[string]string{ + "label1": "value1", + "label2": "value2", + }, + }, + wantErr: false, + }, + { + name: "missing closing brace", + input: "my_metric{label1=value1", + want: nil, + wantErr: true, + }, + { + name: "missing opening brace", + input: "my_metriclabel1=value1}", + want: nil, // Corrected expected value + wantErr: true, + }, + { + name: "invalid label pair", + input: "my_metric{label1}", + want: nil, + wantErr: true, + }, + { + name: "empty label name", + input: "my_metric{=value1}", + want: nil, + wantErr: true, + }, + { + name: "empty label value", + input: "my_metric{label1=}", + want: nil, + wantErr: true, + }, + { + name: "empty label name and value with spaces", + input: "my_metric{ = }", + want: nil, + wantErr: true, + }, + { + name: "characters after closing brace", + input: "my_metric{label=val}extra", + want: nil, + wantErr: true, + }, + { + name: "empty metric name", + input: "{label=val}", + want: nil, + wantErr: true, + }, + { + name: 
"no labels and just metric name with space", + input: "my_metric ", + want: &MetricSpec{ + MetricName: "my_metric", + Labels: map[string]string{}, + }, + wantErr: false, + }, + { + name: "no labels and just metric name with space before and after", + input: " my_metric ", + want: &MetricSpec{ + MetricName: "my_metric", + Labels: map[string]string{}, + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := stringToMetricSpec(tt.input) + if (err != nil) != tt.wantErr { + t.Errorf("stringToMetricSpec() error = %v, wantErr %v", err, tt.wantErr) + return + } + if tt.want != nil && got != nil { // compare maps directly + if tt.want.Labels == nil { + tt.want.Labels = make(map[string]string) + } + if !reflect.DeepEqual(got.MetricName, tt.want.MetricName) { + t.Errorf("stringToMetricSpec() got MetricName = %v, want %v", got.MetricName, tt.want.MetricName) + } + if !reflect.DeepEqual(got.Labels, tt.want.Labels) { + t.Errorf("stringToMetricSpec() got Labels = %v, want %v", got.Labels, tt.want.Labels) + } + } else if tt.want != got { // handles if one is nil and the other isn't + t.Errorf("stringToMetricSpec() = %v, want %v", got, tt.want) + + } + + }) + } +} + +func TestNewMetricMappingAndValidate(t *testing.T) { + tests := []struct { + name string + allStr string + waitingStr string + runningStr string + usedStr string + maxStr string + usageStr string + loraReqInfoStr string + wantErr bool + expectedErr string // Added to check for specific error messages + }{ + { + name: "valid vllm mapping", + runningStr: "running_metric", + waitingStr: "waiting_metric", + usageStr: "usage_metric", + loraReqInfoStr: "lora_requests_info", + wantErr: false, + expectedErr: "", + }, + { + name: "valid triton mapping", + runningStr: "running_metric{label1=value1}", + allStr: "all_metric{label2=value2}", + usedStr: "used_blocks{label3=value3}", + maxStr: "max_blocks{label4=value4}", + wantErr: false, + }, + { + name: "multiple labels 
mapping", + runningStr: "running_metric{label1=value1,label5=value5}", + allStr: "all_metric{label2=value2,label6=value6}", + usedStr: "used_blocks{label3=value3}", + maxStr: "max_blocks{label4=value4}", + wantErr: false, + }, + { + name: "missing running", + waitingStr: "waiting_metric", + usageStr: "usage_metric", + wantErr: true, + expectedErr: "RunningRequests is required", + }, + { + name: "missing both waiting and all", + runningStr: "running_metric", + usageStr: "usage_metric", + wantErr: true, + expectedErr: "either WaitingRequests or AllRequests must be specified", + }, + { + name: "missing usage and both block metrics", + runningStr: "running_metric", + waitingStr: "waiting_metric", + wantErr: true, + expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified", + }, + { + name: "missing max block metric", + runningStr: "running_metric", + waitingStr: "waiting_metric", + usedStr: "used_blocks", + wantErr: true, + expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified", + }, + { + name: "missing used block metric", + runningStr: "running_metric", + waitingStr: "waiting_metric", + maxStr: "max_blocks", + wantErr: true, + expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified", + }, + { + name: "invalid running metric format", + runningStr: "running_metric{invalid", + waitingStr: "waiting_metric", + usageStr: "usage_metric", + wantErr: true, + expectedErr: "error parsing RunningRequests", // Check for part of the expected error + }, + { + name: "lora metrics present", + runningStr: "running_metric", + waitingStr: "waiting_metric", + usageStr: "usage_metric", + loraReqInfoStr: "lora_requests_info", + + wantErr: false, + expectedErr: "", // Check for part of the expected error + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := NewMetricMapping(tt.allStr, tt.waitingStr, tt.runningStr, tt.usedStr, 
tt.maxStr, tt.usageStr, tt.loraReqInfoStr) + if (err != nil) != tt.wantErr { + t.Errorf("NewMetricMapping() error = %v, wantErr %v", err, tt.wantErr) + return + } + if tt.wantErr && !strings.Contains(err.Error(), tt.expectedErr) { + t.Errorf("NewMetricMapping() error = %v, expected to contain = %v", err, tt.expectedErr) + } + }) + } +} diff --git a/pkg/epp/backend/metrics_test.go b/pkg/epp/backend/metrics_test.go new file mode 100644 index 000000000..0bfafcee5 --- /dev/null +++ b/pkg/epp/backend/metrics_test.go @@ -0,0 +1,741 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package backend + +import ( + "context" + "fmt" + "reflect" + "strconv" + "strings" + "testing" + + dto "github.com/prometheus/client_model/go" + "go.uber.org/multierr" + "google.golang.org/protobuf/proto" + "k8s.io/apimachinery/pkg/types" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// --- Test Helpers --- + +func makeMetric(metricName string, labels map[string]string, value float64, timestampMs int64) *dto.Metric { + labelPairs := []*dto.LabelPair{} + for k, v := range labels { + labelPairs = append(labelPairs, &dto.LabelPair{Name: proto.String(k), Value: proto.String(v)}) + } + return &dto.Metric{ + Label: labelPairs, + Gauge: &dto.Gauge{Value: &value}, + TimestampMs: &timestampMs, + } +} + +func makeMetricFamily(name string, metrics ...*dto.Metric) *dto.MetricFamily { + return &dto.MetricFamily{ + Name: &name, + Type: dto.MetricType_GAUGE.Enum(), + Metric: metrics, + } +} + +// --- Tests --- + +func TestGetMetric(t *testing.T) { + logger := logutil.NewTestLogger() + + metricFamilies := map[string]*dto.MetricFamily{ + "metric1": makeMetricFamily("metric1", + makeMetric("metric1", map[string]string{"label1": "value1"}, 1.0, 1000), + makeMetric("metric1", map[string]string{"label1": "value2"}, 2.0, 2000), + ), + "metric2": makeMetricFamily("metric2", + makeMetric("metric2", map[string]string{"labelA": "A1", "labelB": "B1"}, 3.0, 1500), + makeMetric("metric2", map[string]string{"labelA": "A2", "labelB": "B2"}, 4.0, 2500), + ), + "metric3": makeMetricFamily("metric3", + makeMetric("metric3", map[string]string{}, 5.0, 3000), + makeMetric("metric3", map[string]string{}, 6.0, 1000), + ), + } + + tests := []struct { + name string + spec MetricSpec + wantValue float64 + wantError bool + shouldPanic bool // Add this + }{ + { + name: "get labeled metric, exists", + spec: MetricSpec{ + MetricName: "metric1", + Labels: map[string]string{"label1": "value1"}, + }, + 
wantValue: 1.0, + wantError: false, + }, + { + name: "get labeled metric, wrong value", + spec: MetricSpec{ + MetricName: "metric1", + Labels: map[string]string{"label1": "value3"}, + }, + wantValue: -1, // Expect an error, not a specific value + wantError: true, + }, + { + name: "get labeled metric, missing label", + spec: MetricSpec{ + MetricName: "metric1", + Labels: map[string]string{"label2": "value2"}, + }, + wantValue: -1, + wantError: true, + }, + { + name: "get labeled metric, extra label present", + spec: MetricSpec{ + MetricName: "metric2", + Labels: map[string]string{"labelA": "A1"}, + }, + wantValue: 3.0, + wantError: false, + }, + { + name: "get unlabeled metric, exists", + spec: MetricSpec{ + MetricName: "metric3", + Labels: nil, // Explicitly nil + }, + wantValue: 5.0, // latest metric, which occurs first in our test data + wantError: false, + }, + { + name: "get unlabeled metric, metric family not found", + spec: MetricSpec{ + MetricName: "metric4", + Labels: nil, + }, + wantValue: -1, + wantError: true, + }, + { + name: "get labeled metric, metric family not found", + spec: MetricSpec{ + MetricName: "metric4", + Labels: map[string]string{"label1": "value1"}, + }, + wantValue: -1, + wantError: true, + }, + { + name: "get metric, no metrics available", + spec: MetricSpec{ + MetricName: "empty_metric", + }, + wantValue: -1, + wantError: true, + }, + { + name: "get latest metric", + spec: MetricSpec{ + MetricName: "metric3", + Labels: map[string]string{}, // Empty map, not nil + }, + wantValue: 5.0, + wantError: false, + }, + } + + p := &PodMetricsClientImpl{} // No need for MetricMapping here + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.shouldPanic { + defer func() { + if r := recover(); r == nil { + t.Errorf("The code did not panic") + } + }() + } + + gotMetric, err := p.getMetric(logger, metricFamilies, tt.spec) + + if tt.wantError { + if err == nil { + t.Errorf("getMetric() expected error, got nil") + } + } else { + 
if err != nil { + t.Errorf("getMetric() unexpected error: %v", err) + } + if gotMetric.GetGauge().GetValue() != tt.wantValue { + t.Errorf("getMetric() got value %v, want %v", gotMetric.GetGauge().GetValue(), tt.wantValue) + } + } + }) + } +} + +func TestLabelsMatch(t *testing.T) { + tests := []struct { + name string + metricLabels []*dto.LabelPair + specLabels map[string]string + want bool + }{ + { + name: "empty spec labels, should match", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, + specLabels: map[string]string{}, + want: true, + }, + { + name: "nil spec labels, should match", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, + specLabels: nil, + want: true, + }, + { + name: "exact match", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, + specLabels: map[string]string{"a": "b"}, + want: true, + }, + { + name: "extra labels in metric", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}, {Name: proto.String("c"), Value: proto.String("d")}}, + specLabels: map[string]string{"a": "b"}, + want: true, + }, + { + name: "missing label in metric", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, + specLabels: map[string]string{"a": "b", "c": "d"}, + want: false, + }, + { + name: "value mismatch", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, + specLabels: map[string]string{"a": "c"}, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := labelsMatch(tt.metricLabels, tt.specLabels); got != tt.want { + t.Errorf("labelsMatch() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestGetLatestLoraMetric(t *testing.T) { + logger := logutil.NewTestLogger() + + testCases := []struct { + name string + metricFamilies map[string]*dto.MetricFamily + expectedAdapters map[string]int + expectedMax int + 
expectedErr error + mapping *MetricMapping + }{ + { + name: "no lora metrics", + metricFamilies: map[string]*dto.MetricFamily{ + "some_other_metric": makeMetricFamily("some_other_metric", + makeMetric("some_other_metric", nil, 1.0, 1000), + ), + }, + expectedAdapters: nil, + expectedMax: 0, + expectedErr: fmt.Errorf("metric family \"vllm:lora_requests_info\" not found"), // Expect an error because the family is missing + mapping: &MetricMapping{ + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + }, + { + name: "basic lora metrics", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 3000.0, 1000), // Newer + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 1000.0, 1000), // Older + + ), + }, + expectedAdapters: map[string]int{"lora1": 0}, + expectedMax: 2, + expectedErr: nil, + mapping: &MetricMapping{ + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + }, + { + name: "no matching lora metrics", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"other_label": "value"}, 5.0, 3000), + ), + }, + expectedAdapters: nil, + expectedMax: 0, + expectedErr: nil, // Expect *no* error; just no adapters found + mapping: &MetricMapping{ + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + }, + { + name: "no lora metrics if not in MetricMapping", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 5.0, 3000), + makeMetric("vllm:lora_requests_info", 
map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 6.0, 1000), + ), + }, + expectedAdapters: nil, + expectedMax: 0, + expectedErr: nil, + mapping: &MetricMapping{ // No LoRA metrics defined + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + p := &PodMetricsClientImpl{MetricMapping: tc.mapping} + loraMetric, _, err := p.getLatestLoraMetric(logger, tc.metricFamilies) + + if tc.expectedErr != nil { + if err == nil || err.Error() != tc.expectedErr.Error() { + t.Errorf("getLatestLoraMetric() error = %v, wantErr %v", err, tc.expectedErr) + } + return // Stop here if an error was expected + } else if err != nil { + t.Fatalf("getLatestLoraMetric() unexpected error: %v", err) + } + + if tc.mapping.LoraRequestInfo == nil { + if loraMetric != nil { + t.Errorf("getLatestLoraMetric() expected nil metric, got %v", loraMetric) + } + return // Stop if no Lora metrics are expected. + } + + if tc.expectedAdapters == nil && loraMetric == nil { + return // Both nil, as expected + } + + if tc.expectedAdapters != nil && loraMetric != nil { // proceed with checks + + adaptersFound := make(map[string]int) + maxLora := 0 + for _, label := range loraMetric.GetLabel() { + if label.GetName() == "running_lora_adapters" && label.GetValue() != "" { + for _, adapter := range strings.Split(label.GetValue(), ",") { + adaptersFound[adapter] = 0 + } + } + if label.GetName() == "waiting_lora_adapters" && label.GetValue() != "" { + for _, adapter := range strings.Split(label.GetValue(), ",") { + adaptersFound[adapter] = 0 // Overwrite if already present + } + } + if label.GetName() == "max_lora" { + var converr error // define err in this scope. 
+ maxLora, converr = strconv.Atoi(label.GetValue()) + if converr != nil && tc.expectedErr == nil { // only report if we don't expect any other errors + t.Errorf("getLatestLoraMetric() could not parse max_lora: %v", converr) + } + } + } + + if !reflect.DeepEqual(adaptersFound, tc.expectedAdapters) { + t.Errorf("getLatestLoraMetric() adapters = %v, want %v", adaptersFound, tc.expectedAdapters) + } + if maxLora != tc.expectedMax { + t.Errorf("getLatestLoraMetric() maxLora = %v, want %v", maxLora, tc.expectedMax) + } + } else { // one is nil and the other is not + t.Errorf("getLatestLoraMetric(): one of expectedAdapters/loraMetric is nil and the other is not, expected %v, got %v", tc.expectedAdapters, loraMetric) + } + }) + } +} + +func TestPromToPodMetrics(t *testing.T) { + logger := logutil.NewTestLogger() + + tests := []struct { + name string + metricFamilies map[string]*dto.MetricFamily + mapping *MetricMapping + existingMetrics *datastore.PodMetrics + expectedMetrics *datastore.PodMetrics + expectedErrCount int // Count of expected errors + }{ + { + name: "vllm metrics", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm_running": makeMetricFamily("vllm_running", + makeMetric("vllm_running", nil, 10.0, 2000), + makeMetric("vllm_running", nil, 12.0, 1000), //Older + ), + "vllm_waiting": makeMetricFamily("vllm_waiting", + makeMetric("vllm_waiting", nil, 5.0, 1000), + makeMetric("vllm_waiting", nil, 7.0, 2000), // Newer + ), + "vllm_usage": makeMetricFamily("vllm_usage", + makeMetric("vllm_usage", nil, 0.8, 2000), + makeMetric("vllm_usage", nil, 0.7, 500), + ), + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000), + ), + }, + mapping: &MetricMapping{ + RunningRequests: &MetricSpec{MetricName: "vllm_running"}, + WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"}, + KVCacheUsage: 
&MetricSpec{MetricName: "vllm_usage"}, + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + existingMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{}, // Initialize with empty Metrics + }, + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{ + RunningQueueSize: 10, + WaitingQueueSize: 7, + KVCacheUsagePercent: 0.8, + ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, + MaxActiveModels: 3, + }, + }, + expectedErrCount: 0, + }, + { + name: "triton metrics", + metricFamilies: map[string]*dto.MetricFamily{ + "triton_running": makeMetricFamily("triton_running", + makeMetric("triton_running", map[string]string{"queue": "fast"}, 10.0, 2000), + makeMetric("triton_running", map[string]string{"queue": "slow"}, 12.0, 1000), //Older, but different label + ), + "triton_all": makeMetricFamily("triton_all", + makeMetric("triton_all", map[string]string{"queue": "fast"}, 15.0, 1000), + makeMetric("triton_all", map[string]string{"queue": "fast"}, 17.0, 2000), // Newer + ), + "triton_used": makeMetricFamily("triton_used", + makeMetric("triton_used", map[string]string{"type": "gpu"}, 80.0, 1000), + ), + "triton_max": makeMetricFamily("triton_max", + makeMetric("triton_max", map[string]string{"type": "gpu"}, 100.0, 1000), + ), + }, + mapping: &MetricMapping{ + RunningRequests: &MetricSpec{MetricName: "triton_running", Labels: map[string]string{"queue": "fast"}}, + AllRequests: &MetricSpec{MetricName: "triton_all", Labels: map[string]string{"queue": "fast"}}, + UsedKVCacheBlocks: &MetricSpec{MetricName: "triton_used", Labels: map[string]string{"type": "gpu"}}, + MaxKVCacheBlocks: &MetricSpec{MetricName: "triton_max", Labels: map[string]string{"type": 
"gpu"}}, + }, + existingMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, + }, // Initialize with empty Metrics + }, + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, + RunningQueueSize: 10, + WaitingQueueSize: 7, // 17 (all) - 10 (running) + KVCacheUsagePercent: 0.8, // 80 / 100 + }, + }, + expectedErrCount: 0, + }, + { + name: "triton metrics, missing label", + metricFamilies: map[string]*dto.MetricFamily{ + "triton_running": makeMetricFamily("triton_running", + makeMetric("triton_running", map[string]string{"queue": "fast"}, 10.0, 2000), + ), + "triton_all": makeMetricFamily("triton_all", + makeMetric("triton_all", map[string]string{"queue": "fast"}, 17.0, 2000), + ), + // triton_used and _max have no metrics with type=gpu label. 
+ }, + mapping: &MetricMapping{ + RunningRequests: &MetricSpec{MetricName: "triton_running", Labels: map[string]string{"queue": "fast"}}, + AllRequests: &MetricSpec{MetricName: "triton_all", Labels: map[string]string{"queue": "fast"}}, + UsedKVCacheBlocks: &MetricSpec{MetricName: "triton_used", Labels: map[string]string{"type": "gpu"}}, + MaxKVCacheBlocks: &MetricSpec{MetricName: "triton_max", Labels: map[string]string{"type": "gpu"}}, + }, + existingMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, + }, // Initialize with empty Metrics + }, + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, + RunningQueueSize: 10, + WaitingQueueSize: 7, + KVCacheUsagePercent: 0.0, // expect this to still be present, but with default 0 value + }, + }, + + expectedErrCount: 2, // Two errors: Used and Max + }, + { + name: "missing metrics", + metricFamilies: map[string]*dto.MetricFamily{}, // No metrics + mapping: &MetricMapping{ + RunningRequests: &MetricSpec{MetricName: "vllm_running"}, + WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"}, + KVCacheUsage: &MetricSpec{MetricName: "vllm_usage"}, + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + existingMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, + expectedMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, + expectedErrCount: 4, // Errors for all 4 main metrics + }, + { + name: "partial metrics available + LoRA", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm_usage": makeMetricFamily("vllm_usage", + makeMetric("vllm_usage", nil, 0.8, 2000), // 
Only usage is present + ), + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000), + ), + }, + mapping: &MetricMapping{ + RunningRequests: &MetricSpec{MetricName: "vllm_running"}, // Not present + WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"}, // Not Present + KVCacheUsage: &MetricSpec{MetricName: "vllm_usage"}, + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + existingMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{}, // Initialize with empty Metrics + }, + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{ + RunningQueueSize: 0, + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.8, + ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, + MaxActiveModels: 3, + }, + }, + expectedErrCount: 2, // Errors for the two missing metrics + }, + { + name: "use all requests for waiting queue", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm_running": makeMetricFamily("vllm_running", + makeMetric("vllm_running", nil, 10.0, 2000), + ), + "vllm_all": makeMetricFamily("vllm_all", + makeMetric("vllm_all", nil, 15.0, 1000), + ), + }, + mapping: &MetricMapping{ + RunningRequests: &MetricSpec{MetricName: "vllm_running"}, + AllRequests: &MetricSpec{MetricName: "vllm_all"}, + // No WaitingRequests + }, + existingMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, + }, // Initialize with 
empty Metrics + }, + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, + RunningQueueSize: 10, + WaitingQueueSize: 5, // 15 - 10 + }, + }, + expectedErrCount: 0, + }, + { + name: "invalid max lora", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "invalid"}, 3000.0, 1000), + ), + }, + mapping: &MetricMapping{ + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + existingMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{}, + }, + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{"lora1": 0}, + MaxActiveModels: 0, // Should still default to 0. + + }, + }, + expectedErrCount: 1, // Expect *one* error + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + p := &PodMetricsClientImpl{MetricMapping: tc.mapping} + updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics) + + if tc.expectedErrCount == 0 { + if err != nil { + t.Errorf("promToPodMetrics() unexpected error: %v", err) + } + } else { + if err == nil { + t.Errorf("promToPodMetrics() expected errors, got nil") + } else { + // Check the *number* of errors. multierr.Errors() gives us a slice + if len(multierr.Errors(err)) != tc.expectedErrCount { + t.Errorf("promToPodMetrics() wrong number of errors: got %d, want %d. 
Errors: %v", len(multierr.Errors(err)), tc.expectedErrCount, err) + } + + } + } + // Compare with reflect.DeepEqual, which requires an exact match (no tolerance). + if !reflect.DeepEqual(updated, tc.expectedMetrics) { + t.Errorf("promToPodMetrics() got %+v, want %+v", updated, tc.expectedMetrics) + } + }) + } +} + +// TestFetchMetrics is a basic integration test. A more complete test would mock +// the HTTP client. +func TestFetchMetrics(t *testing.T) { + // This test is very basic as it doesn't mock the HTTP client. It assumes + // there's no server running on the specified port. A real-world test + // suite should use a mock server. + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + existing := &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + } + p := &PodMetricsClientImpl{} // No MetricMapping needed for this basic test + + _, err := p.FetchMetrics(ctx, existing, 9999) // Use a port that's unlikely to be in use. 
+ if err == nil { + t.Errorf("FetchMetrics() expected error, got nil") + } + // Check for a specific error message (fragile, but OK for this example) + expectedSubstr := "connection refused" + if err != nil && !strings.Contains(err.Error(), expectedSubstr) { + t.Errorf("FetchMetrics() error = %v, want error containing %q", err, expectedSubstr) + } +} diff --git a/pkg/epp/backend/triton/metrics.go b/pkg/epp/backend/triton/metrics.go index 2f8d24bd9..f28b1c88b 100644 --- a/pkg/epp/backend/triton/metrics.go +++ b/pkg/epp/backend/triton/metrics.go @@ -39,24 +39,6 @@ const ( TRTLLMKvCacheMetricsName = "nv_trt_llm_kv_cache_block_metrics" TRTLLMKvCacheMetricsLabel = "kv_cache_block_type" TRTLLMRequestMetricsLabel = "request_type" - - // THESE ARE UNUSED, EXAMPLES FOR MORE METRICS - inferenceCountMetricName = "nv_inference_count" - inferenceSuccessMetricName = "nv_inference_request_success" - inferenceExecCountMetricName = "nv_inference_exec_count" - inferenceRequestDurationMetricName = "nv_inference_request_duration_us" - waitingQueueSizeMetricName = "nv_inference_pending_request_count" - queueDurationMetricName = "nv_inference_queue_duration_us" - computeInputDurationMetricName = "nv_inference_compute_input_duration_us" - computeInferDurationMetricName = "nv_inference_compute_infer_duration_us" - computeOutputDurationMetricName = "nv_inference_compute_output_duration_us" - gpuUtilizationMetricName = "nv_gpu_utilization" - gpuMemoryTotalMetricName = "nv_gpu_memory_total_bytes" - gpuMemoryUsedMetricName = "nv_gpu_memory_used_bytes" - gpuPowerUsageMetricName = "nv_gpu_power_usage" - gpuPowerLimitMetricName = "nv_gpu_power_limit" - gpuMemoryTotalBytesMetricName = "nv_gpu_memory_total_bytes" - gpuMemoryUsedBytesMetricName = "nv_gpu_memory_used_bytes" ) type PodMetricsClientImpl struct{} @@ -65,12 +47,13 @@ type PodMetricsClientImpl struct{} func (p *PodMetricsClientImpl) FetchMetrics( ctx context.Context, existing *datastore.PodMetrics, + port int32, ) (*datastore.PodMetrics, 
error) { logger := log.FromContext(ctx) loggerDefault := logger.V(logutil.DEFAULT) // existing.ScrapePort = 8002 // triton has a different port for metrics than the target port for inference - url := existing.BuildScrapeEndpoint() + url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics" req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) // TODO print response and err @@ -109,35 +92,46 @@ func promToPodMetrics( var errs error updated := existing.Clone() + //fmt.Print("\n\nDEBUG START\n###### DEBUG getting REQUEST metrics... ######") // Get the "nv_trt_llm_request_metrics" metric family - requestMetrics, err := getLatestMetric(logger, metricFamilies, TRTLLMRequestMetricsName) - errs = multierr.Append(errs, err) - if err == nil { - if active, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "active"); err == nil { - fmt.Printf("###### DEBUG max: %+v", active) - updated.Metrics.RunningQueueSize = int(active) - } else { - errs = multierr.Append(errs, err) - } + //requestMetrics, err := getLatestMetric(logger, metricFamilies, TRTLLMRequestMetricsName) + requestMetrics, ok := metricFamilies[TRTLLMRequestMetricsName] + //errs = multierr.Append(errs, err) + if ok { if scheduled, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "scheduled"); err == nil { - fmt.Printf("###### DEBUG max: %+v", scheduled) - updated.Metrics.WaitingQueueSize = int(scheduled) + //fmt.Printf("\n###### DEBUG generation_requests: %+v", generation_requests) + updated.Metrics.RunningQueueSize = int(scheduled) + if active, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "active"); err == nil { + //fmt.Printf("\n###### DEBUG scheduled: %+v", scheduled) + updated.Metrics.WaitingQueueSize = int(active - scheduled) + // pendingMetrics, ok := metricFamilies["nv_inference_pending_request_count"] + // if ok { + // if queued, err := getTrtLlmGaugeMetric(logger, pendingMetrics, 
"model", "ensemble"); err == nil { + // fmt.Printf("\n###### DEBUG queued requests: %+v", int(queued)) + // } + // } + //fmt.Printf("\n###### DEBUG active (total) requests: %+v", int(active)) + //fmt.Printf("\n###### DEBUG waiting requests: %+v", int(active-scheduled)) + //fmt.Printf("\n###### DEBUG running requests: %+v", int(scheduled)) + } else { + errs = multierr.Append(errs, err) + } } else { errs = multierr.Append(errs, err) } } - fmt.Print("###### DEBUG getting kvblock metrics... ######") + //fmt.Print("\n\n###### DEBUG getting KVBLOCK metrics... ######") // Get the "nv_trt_llm_kv_cache_block_metrics" metric family - kvCacheBlocks, err := getLatestMetric(logger, metricFamilies, TRTLLMKvCacheMetricsName) - errs = multierr.Append(errs, err) + kvCacheBlocks, ok := metricFamilies[TRTLLMKvCacheMetricsName] + // errs = multierr.Append(errs, err) // fmt.Printf("###### DEBUG (should be nil) getLatestMetric errs: %+v", errs) - if err == nil { + if ok { // Calculate the kv-cache usage from the max and used metrics if max, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "max"); err == nil { - fmt.Printf("###### DEBUG max: %+v", max) + //fmt.Printf("\n###### DEBUG max: %+v", max) if used, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "used"); err == nil { - fmt.Printf("###### DEBUG tokens_per: %+v", used) + //fmt.Printf("\n###### DEBUG used: %+v", used) usage := 0.0 if max > 0 { usage = used / max @@ -146,19 +140,13 @@ func promToPodMetrics( } else { errs = multierr.Append(errs, err) } - if tokens_per, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "tokens_per"); err == nil { - fmt.Printf("###### DEBUG tokens_per: %+v", tokens_per) - updated.Metrics.KvCacheMaxTokenCapacity = int(tokens_per * max) - } else { - errs = multierr.Append(errs, err) - } } else { errs = multierr.Append(errs, err) } } - fmt.Printf("###### DEBUG UPDATED: %+v", updated) - fmt.Printf("###### DEBUG ERRORS: 
%+v", errs) + //fmt.Printf("\n### DEBUG: %+v", updated) + //fmt.Printf("\n###### DEBUG ERRORS: %+v", errs) return updated, errs } @@ -230,10 +218,16 @@ func getCounterMetricForPod(logger logr.Logger, mf *dto.MetricFamily, podName st // getTrtLlmMetric gets a TRT LLM metric with the specified type, key, and value. func getTrtLlmMetric(logger logr.Logger, mf *dto.MetricFamily, metricType dto.MetricType, key, value string) (float64, error) { + //fmt.Printf("###### DEBUG START GETTRTMERTIC: %+v", mf.GetMetric()) + //fmt.Printf("###### DEBUG METRICS: %+v", len(mf.GetMetric())) for _, m := range mf.GetMetric() { + //fmt.Printf("###### DEBUG ANALYZING METRIC: %+v", m) + //fmt.Printf("###### DEBUG TIMESTAMP: %+v", m.GetTimestampMs()) foundKey := false foundValue := false + //fmt.Printf("###### DEBUG LABELS: %+v", m.GetLabel()) for _, label := range m.GetLabel() { + //fmt.Printf("###### DEBUG COMPARING label NAME %+v == %+v and label VALUE %+v == %+v", label.GetName(), key, label.GetValue(), value) if label.GetName() == key && label.GetValue() == value { foundKey = true } @@ -242,6 +236,7 @@ func getTrtLlmMetric(logger logr.Logger, mf *dto.MetricFamily, metricType dto.Me } } if foundKey && foundValue { + //fmt.Printf("###### DEBUG METRIC FOUND: %+v", m) if metricType == dto.MetricType_GAUGE { logger.V(logutil.TRACE).Info("TRT LLM gauge metric found", "value", m.GetGauge().GetValue(), "key", key, "value", value) return m.GetGauge().GetValue(), nil diff --git a/pkg/epp/backend/triton/metrics_test.go b/pkg/epp/backend/triton/metrics_test.go index f9b960a52..4a1e2b578 100644 --- a/pkg/epp/backend/triton/metrics_test.go +++ b/pkg/epp/backend/triton/metrics_test.go @@ -47,22 +47,17 @@ func TestPromToPodMetrics(t *testing.T) { Pod: datastore.Pod{ NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, - ScrapePort: 9000, - ScrapePath: "/metrics", }, Metrics: datastore.Metrics{ - RunningQueueSize: 1, - WaitingQueueSize: 2, - KVCacheUsagePercent: 0.5, // used / max 
= 50 / 100 - KvCacheMaxTokenCapacity: 5000, // max_blocks * tokens_per_block = 100 * 50 + RunningQueueSize: 1, + WaitingQueueSize: 2, + KVCacheUsagePercent: 0.5, // used / max = 50 / 100 }, }, initialPodMetrics: &datastore.PodMetrics{ Pod: datastore.Pod{ NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, - ScrapePort: 9000, - ScrapePath: "/metrics", }, Metrics: datastore.Metrics{}, }, @@ -75,22 +70,17 @@ func TestPromToPodMetrics(t *testing.T) { Pod: datastore.Pod{ NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, - ScrapePort: 9000, - ScrapePath: "/metrics", }, Metrics: datastore.Metrics{ - RunningQueueSize: 0, // Default int value - WaitingQueueSize: 0, // Default int value - KVCacheUsagePercent: 0, // Default float64 value - KvCacheMaxTokenCapacity: 0, // Default int value + RunningQueueSize: 0, // Default int value + WaitingQueueSize: 0, // Default int value + KVCacheUsagePercent: 0, // Default float64 value }, }, initialPodMetrics: &datastore.PodMetrics{ Pod: datastore.Pod{ NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, - ScrapePort: 9000, - ScrapePath: "/metrics", }, Metrics: datastore.Metrics{}, }, @@ -103,22 +93,17 @@ func TestPromToPodMetrics(t *testing.T) { Pod: datastore.Pod{ NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, - ScrapePort: 9000, - ScrapePath: "/metrics", }, Metrics: datastore.Metrics{ - RunningQueueSize: 1, // from latest - WaitingQueueSize: 2, // from latest - KVCacheUsagePercent: 0.5, // used / max = 50 / 100 (from latest) - KvCacheMaxTokenCapacity: 5000, // max_blocks * tokens_per_block = 100 * 50 (from latest) + RunningQueueSize: 1, // from latest + WaitingQueueSize: 2, // from latest + KVCacheUsagePercent: 0.5, // used / max = 50 / 100 (from latest) }, }, initialPodMetrics: &datastore.PodMetrics{ Pod: datastore.Pod{ NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, - ScrapePort: 9000, - ScrapePath: "/metrics", }, 
Metrics: datastore.Metrics{}, }, @@ -137,8 +122,6 @@ func TestPromToPodMetrics(t *testing.T) { Pod: datastore.Pod{ NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, - ScrapePort: 9000, - ScrapePath: "/metrics", }, Metrics: datastore.Metrics{}, }, @@ -146,8 +129,6 @@ func TestPromToPodMetrics(t *testing.T) { Pod: datastore.Pod{ NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, - ScrapePort: 9000, - ScrapePath: "/metrics", }, Metrics: datastore.Metrics{}, }, diff --git a/pkg/epp/datastore/types.go b/pkg/epp/datastore/types.go new file mode 100644 index 000000000..8cfcf1d1f --- /dev/null +++ b/pkg/epp/datastore/types.go @@ -0,0 +1,71 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package datastore is a library to interact with backend model servers such as probing metrics. +package datastore + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/types" +) + +type Pod struct { + NamespacedName types.NamespacedName + Address string +} + +type Metrics struct { + // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. + ActiveModels map[string]int + // MaxActiveModels is the maximum number of models that can be loaded to GPU. 
+ MaxActiveModels int + RunningQueueSize int + WaitingQueueSize int + KVCacheUsagePercent float64 + KvCacheMaxTokenCapacity int +} + +type PodMetrics struct { + Pod + Metrics +} + +func (pm *PodMetrics) String() string { + return fmt.Sprintf("Pod: %+v; Address: %+v; Metrics: %+v", pm.NamespacedName, pm.Address, pm.Metrics) +} + +func (pm *PodMetrics) Clone() *PodMetrics { + cm := make(map[string]int, len(pm.ActiveModels)) + for k, v := range pm.ActiveModels { + cm[k] = v + } + clone := &PodMetrics{ + Pod: Pod{ + NamespacedName: pm.NamespacedName, + Address: pm.Address, + }, + Metrics: Metrics{ + ActiveModels: cm, + MaxActiveModels: pm.MaxActiveModels, + RunningQueueSize: pm.RunningQueueSize, + WaitingQueueSize: pm.WaitingQueueSize, + KVCacheUsagePercent: pm.KVCacheUsagePercent, + KvCacheMaxTokenCapacity: pm.KvCacheMaxTokenCapacity, + }, + } + return clone +} From 71e00adb67b5bbd7b288688f46e8a7cf76208561 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 6 Mar 2025 20:34:41 +0000 Subject: [PATCH 03/19] Finalize metric refactor and testing. 
--- config/manifests/{ => vllm}/ext_proc.yaml | 2 +- .../manifests/{ => vllm}/inferencemodel.yaml | 0 pkg/epp/backend/triton/metrics_test.go | 48 ++++++++++--------- 3 files changed, 27 insertions(+), 23 deletions(-) rename config/manifests/{ => vllm}/ext_proc.yaml (96%) rename config/manifests/{ => vllm}/inferencemodel.yaml (100%) diff --git a/config/manifests/ext_proc.yaml b/config/manifests/vllm/ext_proc.yaml similarity index 96% rename from config/manifests/ext_proc.yaml rename to config/manifests/vllm/ext_proc.yaml index 33c47d400..bbd11b5c8 100644 --- a/config/manifests/ext_proc.yaml +++ b/config/manifests/vllm/ext_proc.yaml @@ -71,7 +71,7 @@ spec: spec: containers: - name: inference-gateway-ext-proc - image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/triton-test/epp_triton_metrics:latest imagePullPolicy: Always args: - -poolName diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/vllm/inferencemodel.yaml similarity index 100% rename from config/manifests/inferencemodel.yaml rename to config/manifests/vllm/inferencemodel.yaml diff --git a/pkg/epp/backend/triton/metrics_test.go b/pkg/epp/backend/triton/metrics_test.go index 4a1e2b578..931a6346a 100644 --- a/pkg/epp/backend/triton/metrics_test.go +++ b/pkg/epp/backend/triton/metrics_test.go @@ -49,6 +49,7 @@ func TestPromToPodMetrics(t *testing.T) { Address: podAddress, }, Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, RunningQueueSize: 1, WaitingQueueSize: 2, KVCacheUsagePercent: 0.5, // used / max = 50 / 100 @@ -59,7 +60,9 @@ func TestPromToPodMetrics(t *testing.T) { NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, }, - Metrics: datastore.Metrics{}, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, + }, }, expectedErr: false, }, @@ -72,6 +75,7 @@ func TestPromToPodMetrics(t *testing.T) { Address: podAddress, }, Metrics: datastore.Metrics{ 
+ ActiveModels: map[string]int{}, RunningQueueSize: 0, // Default int value WaitingQueueSize: 0, // Default int value KVCacheUsagePercent: 0, // Default float64 value @@ -82,7 +86,9 @@ func TestPromToPodMetrics(t *testing.T) { NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, }, - Metrics: datastore.Metrics{}, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, + }, }, expectedErr: false, }, @@ -95,6 +101,7 @@ func TestPromToPodMetrics(t *testing.T) { Address: podAddress, }, Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, RunningQueueSize: 1, // from latest WaitingQueueSize: 2, // from latest KVCacheUsagePercent: 0.5, // used / max = 50 / 100 (from latest) @@ -105,7 +112,9 @@ func TestPromToPodMetrics(t *testing.T) { NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, }, - Metrics: datastore.Metrics{}, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, + }, }, expectedErr: false, }, @@ -118,21 +127,17 @@ func TestPromToPodMetrics(t *testing.T) { Metric: []*dto.Metric{}, // Empty }, }, - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{Name: podName}, - Address: podAddress, - }, - Metrics: datastore.Metrics{}, - }, + expectedMetrics: nil, initialPodMetrics: &datastore.PodMetrics{ Pod: datastore.Pod{ NamespacedName: types.NamespacedName{Name: podName}, Address: podAddress, }, - Metrics: datastore.Metrics{}, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{}, + }, }, - expectedErr: false, + expectedErr: true, }, } @@ -157,8 +162,8 @@ func allMetricsAvailable(podName string) map[string]*dto.MetricFamily { Name: proto.String(TRTLLMRequestMetricsName), Type: dto.MetricType_GAUGE.Enum(), Metric: []*dto.Metric{ - trtLlmRequestMetric("active", 1, 200), - trtLlmRequestMetric("scheduled", 2, 200), + trtLlmRequestMetric("active", 3, 200), + trtLlmRequestMetric("scheduled", 1, 200), }, }, TRTLLMKvCacheMetricsName: { @@ -179,23 
+184,22 @@ func multipleMetricsWithDifferentTimestamps(podName string) map[string]*dto.Metr Name: proto.String(TRTLLMRequestMetricsName), Type: dto.MetricType_GAUGE.Enum(), Metric: []*dto.Metric{ - trtLlmRequestMetric("active", 0, 100), // Older - trtLlmRequestMetric("scheduled", 3, 100), // Older - trtLlmRequestMetric("active", 1, 200), // Newer - trtLlmRequestMetric("scheduled", 2, 200), // Newer - + trtLlmRequestMetric("active", 3, 200), // Newer + trtLlmRequestMetric("scheduled", 1, 200), // Newer + trtLlmRequestMetric("active", 3, 100), // Older + trtLlmRequestMetric("scheduled", 0, 100), // Older }, }, TRTLLMKvCacheMetricsName: { Name: proto.String(TRTLLMKvCacheMetricsName), Type: dto.MetricType_GAUGE.Enum(), Metric: []*dto.Metric{ - trtLlmKvCacheMetric("max", 110, 100), //Older - trtLlmKvCacheMetric("used", 60, 100), //Older - trtLlmKvCacheMetric("tokens_per", 40, 100), //Older trtLlmKvCacheMetric("max", 100, 200), // Newer trtLlmKvCacheMetric("used", 50, 200), // Newer trtLlmKvCacheMetric("tokens_per", 50, 200), // Newer + trtLlmKvCacheMetric("max", 110, 100), //Older + trtLlmKvCacheMetric("used", 60, 100), //Older + trtLlmKvCacheMetric("tokens_per", 40, 100), //Older }, }, } From dd2825f2cf7c27bd643070ccbaefc942a087e6b7 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 6 Mar 2025 20:54:03 +0000 Subject: [PATCH 04/19] Set streaming env var to false in triton ext_proc.yaml --- config/manifests/triton/ext_proc.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config/manifests/triton/ext_proc.yaml b/config/manifests/triton/ext_proc.yaml index 16c802838..6797b7c78 100644 --- a/config/manifests/triton/ext_proc.yaml +++ b/config/manifests/triton/ext_proc.yaml @@ -90,6 +90,9 @@ spec: - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=used}" - -maxKVCacheBlocksMetric - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=max}" + env: + - name: USE_STREAMING + value: "false" ports: - containerPort: 9002 - containerPort: 9003 From 
aa2ee06fe18c591aa284724689c4e6adcb9a3555 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 6 Mar 2025 21:40:24 +0000 Subject: [PATCH 05/19] Update triton server deployment to pull frozen repo branch instead of main for consistency. --- config/manifests/triton/deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/manifests/triton/deployment.yaml b/config/manifests/triton/deployment.yaml index 61626293b..189ad90f2 100644 --- a/config/manifests/triton/deployment.yaml +++ b/config/manifests/triton/deployment.yaml @@ -41,7 +41,7 @@ spec: # Install python bindings for tritonserver and tritonfrontend pip install /opt/tritonserver/python/triton*.whl # Install application requirements - git clone https://github.com/triton-inference-server/server.git + git clone --depth 1 --branch v2.55.0 https://github.com/triton-inference-server/server.git cd server/python/openai/ pip install -r requirements.txt pip install uvicorn From d4c083e33398c1483b6ef1c5f3ee88f1186b8c42 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 6 Mar 2025 22:01:04 +0000 Subject: [PATCH 06/19] Remove model server specific metric files and tests and point EPP image to main AR instead of testing registry.
--- config/manifests/triton/ext_proc.yaml | 2 +- config/manifests/vllm/ext_proc.yaml | 2 +- pkg/epp/backend/triton/metrics.go | 265 ------------------------- pkg/epp/backend/triton/metrics_test.go | 226 --------------------- 4 files changed, 2 insertions(+), 493 deletions(-) delete mode 100644 pkg/epp/backend/triton/metrics.go delete mode 100644 pkg/epp/backend/triton/metrics_test.go diff --git a/config/manifests/triton/ext_proc.yaml b/config/manifests/triton/ext_proc.yaml index 6797b7c78..f61a7ec18 100644 --- a/config/manifests/triton/ext_proc.yaml +++ b/config/manifests/triton/ext_proc.yaml @@ -71,7 +71,7 @@ spec: spec: containers: - name: inference-gateway-ext-proc - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/triton-test/epp_triton_metrics:latest + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main imagePullPolicy: Always args: - -poolName diff --git a/config/manifests/vllm/ext_proc.yaml b/config/manifests/vllm/ext_proc.yaml index bbd11b5c8..33c47d400 100644 --- a/config/manifests/vllm/ext_proc.yaml +++ b/config/manifests/vllm/ext_proc.yaml @@ -71,7 +71,7 @@ spec: spec: containers: - name: inference-gateway-ext-proc - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/triton-test/epp_triton_metrics:latest + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main imagePullPolicy: Always args: - -poolName diff --git a/pkg/epp/backend/triton/metrics.go b/pkg/epp/backend/triton/metrics.go deleted file mode 100644 index f28b1c88b..000000000 --- a/pkg/epp/backend/triton/metrics.go +++ /dev/null @@ -1,265 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package triton - -import ( - "context" - "fmt" - "net/http" - "strconv" - "strings" - - "github.com/go-logr/logr" - dto "github.com/prometheus/client_model/go" - "github.com/prometheus/common/expfmt" - "go.uber.org/multierr" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -const ( - // Triton metrics, see https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/metrics.html - - TRTLLMRequestMetricsName = "nv_trt_llm_request_metrics" - TRTLLMKvCacheMetricsName = "nv_trt_llm_kv_cache_block_metrics" - TRTLLMKvCacheMetricsLabel = "kv_cache_block_type" - TRTLLMRequestMetricsLabel = "request_type" -) - -type PodMetricsClientImpl struct{} - -// FetchMetrics fetches metrics from a given pod. 
-func (p *PodMetricsClientImpl) FetchMetrics( - ctx context.Context, - existing *datastore.PodMetrics, - port int32, -) (*datastore.PodMetrics, error) { - logger := log.FromContext(ctx) - loggerDefault := logger.V(logutil.DEFAULT) - - // existing.ScrapePort = 8002 // triton has a different port for metrics than the target port for inference - url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics" - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) - // TODO print response and err - - if err != nil { - loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) - return nil, fmt.Errorf("failed to create request: %v", err) - } - resp, err := http.DefaultClient.Do(req) - if err != nil { - loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName) - return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err) - } - defer func() { - _ = resp.Body.Close() - }() - - if resp.StatusCode != http.StatusOK { - loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode) - return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode) - } - - parser := expfmt.TextParser{} - metricFamilies, err := parser.TextToMetricFamilies(resp.Body) - if err != nil { - return nil, err - } - return promToPodMetrics(logger, metricFamilies, existing) -} - -// promToPodMetrics updates internal pod metrics with scraped Prometheus metrics. -func promToPodMetrics( - logger logr.Logger, - metricFamilies map[string]*dto.MetricFamily, - existing *datastore.PodMetrics, -) (*datastore.PodMetrics, error) { - var errs error - updated := existing.Clone() - - //fmt.Print("\n\nDEBUG START\n###### DEBUG getting REQUEST metrics... 
######") - // Get the "nv_trt_llm_request_metrics" metric family - //requestMetrics, err := getLatestMetric(logger, metricFamilies, TRTLLMRequestMetricsName) - requestMetrics, ok := metricFamilies[TRTLLMRequestMetricsName] - //errs = multierr.Append(errs, err) - if ok { - if scheduled, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "scheduled"); err == nil { - //fmt.Printf("\n###### DEBUG generation_requests: %+v", generation_requests) - updated.Metrics.RunningQueueSize = int(scheduled) - if active, err := getTrtLlmGaugeMetric(logger, requestMetrics, TRTLLMRequestMetricsLabel, "active"); err == nil { - //fmt.Printf("\n###### DEBUG scheduled: %+v", scheduled) - updated.Metrics.WaitingQueueSize = int(active - scheduled) - // pendingMetrics, ok := metricFamilies["nv_inference_pending_request_count"] - // if ok { - // if queued, err := getTrtLlmGaugeMetric(logger, pendingMetrics, "model", "ensemble"); err == nil { - // fmt.Printf("\n###### DEBUG queued requests: %+v", int(queued)) - // } - // } - //fmt.Printf("\n###### DEBUG active (total) requests: %+v", int(active)) - //fmt.Printf("\n###### DEBUG waiting requests: %+v", int(active-scheduled)) - //fmt.Printf("\n###### DEBUG running requests: %+v", int(scheduled)) - } else { - errs = multierr.Append(errs, err) - } - } else { - errs = multierr.Append(errs, err) - } - } - - //fmt.Print("\n\n###### DEBUG getting KVBLOCK metrics... 
######") - // Get the "nv_trt_llm_kv_cache_block_metrics" metric family - kvCacheBlocks, ok := metricFamilies[TRTLLMKvCacheMetricsName] - // errs = multierr.Append(errs, err) - // fmt.Printf("###### DEBUG (should be nil) getLatestMetric errs: %+v", errs) - if ok { - // Calculate the kv-cache usage from the max and used metrics - if max, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "max"); err == nil { - //fmt.Printf("\n###### DEBUG max: %+v", max) - if used, err := getTrtLlmGaugeMetric(logger, kvCacheBlocks, TRTLLMKvCacheMetricsLabel, "used"); err == nil { - //fmt.Printf("\n###### DEBUG used: %+v", used) - usage := 0.0 - if max > 0 { - usage = used / max - } - updated.Metrics.KVCacheUsagePercent = usage - } else { - errs = multierr.Append(errs, err) - } - } else { - errs = multierr.Append(errs, err) - } - } - - //fmt.Printf("\n### DEBUG: %+v", updated) - //fmt.Printf("\n###### DEBUG ERRORS: %+v", errs) - - return updated, errs -} - -// getLatestMetric gets the latest metric of a family. -func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.MetricFamily, error) { - mf, ok := metricFamilies[metricName] - if !ok { - logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", metricName) - return nil, fmt.Errorf("metric family %q not found", metricName) - } - if len(mf.GetMetric()) == 0 { - return nil, fmt.Errorf("no metrics available for %q", metricName) - } - - var latestTs int64 - var latestMf *dto.MetricFamily - for _, m := range mf.GetMetric() { - if m.GetTimestampMs() >= latestTs { - latestTs = m.GetTimestampMs() - latestMf = &dto.MetricFamily{ - Name: mf.Name, - Help: mf.Help, - Type: mf.Type, - Metric: []*dto.Metric{m}, - } - } - } - - logger.V(logutil.TRACE).Info("Metric value selected", "metric Family", latestMf, "metric", metricName) - return latestMf, nil -} - -// getGaugeMetricForPod gets gauge metric value for a given pod. 
-func getGaugeMetricForPod(logger logr.Logger, mf *dto.MetricFamily, podIdentifier string) (float64, error) { - for _, m := range mf.GetMetric() { - for _, label := range m.GetLabel() { - if (label.GetName() == "pod" || label.GetName() == "gpu_uuid") && strings.Contains(label.GetValue(), podIdentifier) { - logger.V(logutil.TRACE).Info("Pod metric found", "value", m.GetGauge().GetValue(), "labelName", label.GetName(), "labelValue", label.GetValue()) - - return m.GetGauge().GetValue(), nil // Return the value with nil error - } - } - } - logger.V(logutil.TRACE).Info("Metric Value not found for pod", "pod", podIdentifier, "metric family", mf.GetName()) - return -1, fmt.Errorf("metric value not found for pod %s in metric family %s", podIdentifier, mf.GetName()) // Return an error -} - -// getCounterMetricForPod gets counter metric value for a given pod. -func getCounterMetricForPod(logger logr.Logger, mf *dto.MetricFamily, podName string) (int, error) { - for _, m := range mf.GetMetric() { - for _, label := range m.GetLabel() { - if label.GetName() == "pod" && label.GetValue() == podName { - val := m.GetCounter().GetValue() - intVal, err := strconv.Atoi(fmt.Sprintf("%v", val)) // Convert float64 to int - if err != nil { - return -1, fmt.Errorf("failed to convert counter metric to int: %w", err) - } - logger.V(logutil.TRACE).Info("Pod metric found", "value", intVal) - - return intVal, nil - } - } - } - return -1, nil -} - -// TRTLLM metrics - -// getTrtLlmMetric gets a TRT LLM metric with the specified type, key, and value. 
-func getTrtLlmMetric(logger logr.Logger, mf *dto.MetricFamily, metricType dto.MetricType, key, value string) (float64, error) { - //fmt.Printf("###### DEBUG START GETTRTMERTIC: %+v", mf.GetMetric()) - //fmt.Printf("###### DEBUG METRICS: %+v", len(mf.GetMetric())) - for _, m := range mf.GetMetric() { - //fmt.Printf("###### DEBUG ANALYZING METRIC: %+v", m) - //fmt.Printf("###### DEBUG TIMESTAMP: %+v", m.GetTimestampMs()) - foundKey := false - foundValue := false - //fmt.Printf("###### DEBUG LABELS: %+v", m.GetLabel()) - for _, label := range m.GetLabel() { - //fmt.Printf("###### DEBUG COMPARING label NAME %+v == %+v and label VALUE %+v == %+v", label.GetName(), key, label.GetValue(), value) - if label.GetName() == key && label.GetValue() == value { - foundKey = true - } - if mf.GetType() == metricType { - foundValue = true - } - } - if foundKey && foundValue { - //fmt.Printf("###### DEBUG METRIC FOUND: %+v", m) - if metricType == dto.MetricType_GAUGE { - logger.V(logutil.TRACE).Info("TRT LLM gauge metric found", "value", m.GetGauge().GetValue(), "key", key, "value", value) - return m.GetGauge().GetValue(), nil - } else if metricType == dto.MetricType_COUNTER { - val := m.GetCounter().GetValue() - intVal, err := strconv.Atoi(fmt.Sprintf("%v", val)) - if err != nil { - return -1, fmt.Errorf("failed to convert counter metric to int: %w", err) - } - logger.V(logutil.TRACE).Info("TRT LLM counter metric found", "value", intVal, "key", key, "value", value) - return float64(intVal), nil - } - } - } - return -1, fmt.Errorf("TRT LLM metric not found: %s{ %s=\"%s\" }", mf.GetName(), key, value) -} - -// getTrtLlmGaugeMetric gets a gauge TRT LLM metric. -func getTrtLlmGaugeMetric(logger logr.Logger, mf *dto.MetricFamily, key, value string) (float64, error) { - return getTrtLlmMetric(logger, mf, dto.MetricType_GAUGE, key, value) -} - -// getTrtLlmCounterMetric gets a counter TRT LLM metric. 
-func getTrtLlmCounterMetric(logger logr.Logger, mf *dto.MetricFamily, key, value string) (float64, error) { - return getTrtLlmMetric(logger, mf, dto.MetricType_COUNTER, key, value) -} diff --git a/pkg/epp/backend/triton/metrics_test.go b/pkg/epp/backend/triton/metrics_test.go deleted file mode 100644 index 931a6346a..000000000 --- a/pkg/epp/backend/triton/metrics_test.go +++ /dev/null @@ -1,226 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package triton - -import ( - "testing" - - dto "github.com/prometheus/client_model/go" - "github.com/stretchr/testify/assert" - "google.golang.org/protobuf/proto" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -func TestPromToPodMetrics(t *testing.T) { - logger := logutil.NewTestLogger() - - podName := "test-pod" - podAddress := "10.0.0.1" - - testCases := []struct { - name string - metricFamilies map[string]*dto.MetricFamily - expectedMetrics *datastore.PodMetrics - expectedErr bool - initialPodMetrics *datastore.PodMetrics - }{ - { - name: "all metrics available", - metricFamilies: allMetricsAvailable(podName), - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{Name: podName}, - Address: podAddress, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - RunningQueueSize: 1, - 
WaitingQueueSize: 2, - KVCacheUsagePercent: 0.5, // used / max = 50 / 100 - }, - }, - initialPodMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{Name: podName}, - Address: podAddress, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - }, - }, - expectedErr: false, - }, - { - name: "missing metrics", - metricFamilies: map[string]*dto.MetricFamily{}, // No metrics provided - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{Name: podName}, - Address: podAddress, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - RunningQueueSize: 0, // Default int value - WaitingQueueSize: 0, // Default int value - KVCacheUsagePercent: 0, // Default float64 value - }, - }, - initialPodMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{Name: podName}, - Address: podAddress, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - }, - }, - expectedErr: false, - }, - { - name: "multiple timestamps", - metricFamilies: multipleMetricsWithDifferentTimestamps(podName), - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{Name: podName}, - Address: podAddress, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - RunningQueueSize: 1, // from latest - WaitingQueueSize: 2, // from latest - KVCacheUsagePercent: 0.5, // used / max = 50 / 100 (from latest) - }, - }, - initialPodMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{Name: podName}, - Address: podAddress, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - }, - }, - expectedErr: false, - }, - { - name: "empty metric family", - metricFamilies: map[string]*dto.MetricFamily{ - TRTLLMRequestMetricsName: { - Name: proto.String(TRTLLMRequestMetricsName), - Type: dto.MetricType_GAUGE.Enum(), - Metric: []*dto.Metric{}, // Empty - 
}, - }, - expectedMetrics: nil, - initialPodMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{Name: podName}, - Address: podAddress, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - }, - }, - expectedErr: true, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - updated, err := promToPodMetrics(logger, tc.metricFamilies, tc.initialPodMetrics) - if tc.expectedErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - assert.Equal(t, tc.expectedMetrics, updated) - } - }) - } -} - -// --- Helper Functions --- - -func allMetricsAvailable(podName string) map[string]*dto.MetricFamily { - return map[string]*dto.MetricFamily{ - TRTLLMRequestMetricsName: { - Name: proto.String(TRTLLMRequestMetricsName), - Type: dto.MetricType_GAUGE.Enum(), - Metric: []*dto.Metric{ - trtLlmRequestMetric("active", 3, 200), - trtLlmRequestMetric("scheduled", 1, 200), - }, - }, - TRTLLMKvCacheMetricsName: { - Name: proto.String(TRTLLMKvCacheMetricsName), - Type: dto.MetricType_GAUGE.Enum(), - Metric: []*dto.Metric{ - trtLlmKvCacheMetric("max", 100, 200), - trtLlmKvCacheMetric("used", 50, 200), - trtLlmKvCacheMetric("tokens_per", 50, 200), - }, - }, - } -} - -func multipleMetricsWithDifferentTimestamps(podName string) map[string]*dto.MetricFamily { - return map[string]*dto.MetricFamily{ - TRTLLMRequestMetricsName: { - Name: proto.String(TRTLLMRequestMetricsName), - Type: dto.MetricType_GAUGE.Enum(), - Metric: []*dto.Metric{ - trtLlmRequestMetric("active", 3, 200), // Newer - trtLlmRequestMetric("scheduled", 1, 200), // Newer - trtLlmRequestMetric("active", 3, 100), // Older - trtLlmRequestMetric("scheduled", 0, 100), // Older - }, - }, - TRTLLMKvCacheMetricsName: { - Name: proto.String(TRTLLMKvCacheMetricsName), - Type: dto.MetricType_GAUGE.Enum(), - Metric: []*dto.Metric{ - trtLlmKvCacheMetric("max", 100, 200), // Newer - trtLlmKvCacheMetric("used", 50, 200), // Newer - 
trtLlmKvCacheMetric("tokens_per", 50, 200), // Newer - trtLlmKvCacheMetric("max", 110, 100), //Older - trtLlmKvCacheMetric("used", 60, 100), //Older - trtLlmKvCacheMetric("tokens_per", 40, 100), //Older - }, - }, - } -} - -func trtLlmRequestMetric(requestType string, value float64, timestampMs int64) *dto.Metric { - return &dto.Metric{ - Label: []*dto.LabelPair{ - {Name: proto.String(TRTLLMRequestMetricsLabel), Value: proto.String(requestType)}, - }, - Gauge: &dto.Gauge{Value: &value}, - TimestampMs: &timestampMs, - } -} - -func trtLlmKvCacheMetric(blockType string, value float64, timestampMs int64) *dto.Metric { - return &dto.Metric{ - Label: []*dto.LabelPair{ - {Name: proto.String(TRTLLMKvCacheMetricsLabel), Value: proto.String(blockType)}, - }, - Gauge: &dto.Gauge{Value: &value}, - TimestampMs: &timestampMs, - } -} From df3f3e3ac7f0ac0bb0702b10a291e362c14806d7 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 7 Mar 2025 00:23:03 +0000 Subject: [PATCH 07/19] Remove commented prints and old comments. --- cmd/epp/main.go | 5 +++-- pkg/epp/backend/metrics_spec.go | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index c5264b823..40f80b39a 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -156,7 +156,8 @@ func run() error { pmf := backendmetrics.NewPodMetricsFactory(&vllm.PodMetricsClientImpl{}, *refreshMetricsInterval) // Setup runner. datastore := datastore.NewDatastore(ctx, pmf) - // switch case across different model server metrics (triton, vllm) + + // Set up mapper for metric scraping. 
mapping, err := backend.NewMetricMapping( *allRequestsMetric, *waitingRequestsMetric, @@ -167,7 +168,7 @@ func run() error { *loraRequestInfoMetric, ) if err != nil { - setupLog.Error(err, "Failed to create metric mapping from flags") + setupLog.Error(err, "Failed to create metric mapping from flags.") return err } provider := backend.NewProvider(&backend.PodMetricsClientImpl{MetricMapping: mapping}, datastore) diff --git a/pkg/epp/backend/metrics_spec.go b/pkg/epp/backend/metrics_spec.go index aabcf9835..9cd194db1 100644 --- a/pkg/epp/backend/metrics_spec.go +++ b/pkg/epp/backend/metrics_spec.go @@ -32,9 +32,9 @@ type MetricMapping struct { AllRequests *MetricSpec // Option 1 WaitingRequests *MetricSpec // Option 2 RunningRequests *MetricSpec // Required - UsedKVCacheBlocks *MetricSpec // Optional (part of a group) - MaxKVCacheBlocks *MetricSpec // Optional (part of a group) - KVCacheUsage *MetricSpec // Optional (alternative to the group above) + UsedKVCacheBlocks *MetricSpec // Option 1 (part of a group) + MaxKVCacheBlocks *MetricSpec // Option 1 (part of a group) + KVCacheUsage *MetricSpec // Option 2 (alternative to the group above) // LoRA Metrics (vLLM Specific, optional) LoraRequestInfo *MetricSpec } From 558132e021ad7d6f6cd1dfaa82232c4b1717efcc Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 7 Mar 2025 19:48:56 +0000 Subject: [PATCH 08/19] Remove triton support for now, make metrics mapping 1-to-1 with load balancing metrics. 
--- cmd/epp/main.go | 22 ++- config/manifests/triton/deployment.yaml | 100 ----------- config/manifests/triton/ext_proc.yaml | 126 -------------- config/manifests/triton/inferencemodel.yaml | 9 - config/manifests/triton/triton-set-up.yaml | 111 ------------- config/manifests/vllm/ext_proc.yaml | 8 - pkg/epp/backend/metrics.go | 44 +---- pkg/epp/backend/metrics_spec.go | 65 +------- pkg/epp/backend/metrics_spec_test.go | 109 ------------ pkg/epp/backend/metrics_test.go | 174 ++------------------ pkg/epp/datastore/types.go | 18 +- 11 files changed, 42 insertions(+), 744 deletions(-) delete mode 100644 config/manifests/triton/deployment.yaml delete mode 100644 config/manifests/triton/ext_proc.yaml delete mode 100644 config/manifests/triton/inferencemodel.yaml delete mode 100644 config/manifests/triton/triton-set-up.yaml diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 40f80b39a..f3e0b6571 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -94,14 +94,16 @@ var ( "are assumed to be named tls.crt and tls.key, respectively. 
If not set, and secureServing is enabled, "+ "then a self-signed certificate is used.") // metric flags - allRequestsMetric = flag.String("allRequestsMetric", "", "Prometheus metric for the total number of processing requests, both queued and running.") - waitingRequestsMetric = flag.String("waitingRequestsMetric", "", "Prometheus metric for the number of queued requests.") - runningRequestsMetric = flag.String("runningRequestsMetric", "", "Prometheus metric for the number of running requests.") - usedKVCacheBlocksMetric = flag.String("usedKVCacheBlocksMetric", "", "Prometheus metric for the number of utilized KV-cache blocks.") - maxKVCacheBlocksMetric = flag.String("maxKVCacheBlocksMetric", "", "Prometheus metric for the total number of available KV-cache blocks.") - kVCacheUsageMetric = flag.String("kVCacheUsageMetric", "", "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") + totalQueuedRequestMetric = flag.String("totalQueuedRequestMetric", + "vllm:num_requests_waiting", + "Prometheus metric for the number of queued requests.") + kVCacheUsageMetric = flag.String("kVCacheUsageMetric", + "vllm:gpu_cache_usage_perc", + "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") // LoRA metrics - loraRequestInfoMetric = flag.String("loraRequestInfoMetric", "", "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") + loraRequestInfoMetric = flag.String("loraRequestInfoMetric", + "vllm:lora_requests_info", + "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") setupLog = ctrl.Log.WithName("setup") ) @@ -159,11 +161,7 @@ func run() error { // Set up mapper for metric scraping. 
mapping, err := backend.NewMetricMapping( - *allRequestsMetric, - *waitingRequestsMetric, - *runningRequestsMetric, - *usedKVCacheBlocksMetric, - *maxKVCacheBlocksMetric, + *totalQueuedRequestMetric, *kVCacheUsageMetric, *loraRequestInfoMetric, ) diff --git a/config/manifests/triton/deployment.yaml b/config/manifests/triton/deployment.yaml deleted file mode 100644 index 189ad90f2..000000000 --- a/config/manifests/triton/deployment.yaml +++ /dev/null @@ -1,100 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: llama-triton-deployment -spec: - replicas: 1 # Start with 1 replica. Adjust as needed. - selector: - matchLabels: - app: llama-triton # This MUST match the labels in the template - template: - metadata: - labels: - app: llama-triton - spec: - containers: - - name: triton-server - image: nvcr.io/nvidia/tritonserver:25.01-trtllm-python-py3 # Use base Triton image - imagePullPolicy: IfNotPresent - command: ["/bin/bash", "-c"] - args: - - | - set -e - apt-get update && apt-get install -y python3.12-venv - - # Create and activate a virtual environment - python3 -m venv /opt/venv - source /opt/venv/bin/activate - pip install SentencePiece - pip install packaging - pip install numpy - pip install torch - pip install requests - pip install transformers - pip install pillow - - # Use launch_triton_server.py - # python3 /models/tensorrtllm_backend/scripts/launch_triton_server.py --world_size 1 --model_repo /models/tensorrtllm_backend/llama_ifb - # tail -f /dev/null - - # Launch OpenAI completetions endpoint - # Install python bindings for tritonserver and tritonfrontend - pip install /opt/tritonserver/python/triton*.whl - # Install application requirements - git clone --depth 1 --branch v2.55.0 https://github.com/triton-inference-server/server.git - cd server/python/openai/ - pip install -r requirements.txt - pip install uvicorn - pip install -U huggingface_hub - huggingface-cli login --token $(cat /secrets/huggingface/token) --add-to-git-credential - - 
python3 openai_frontend/main.py --model-repository /models/tensorrtllm_backend/llama_ifb --tokenizer meta-llama/Llama-2-7b-chat-hf - ports: - - containerPort: 9000 - name: http - - containerPort: 9001 - name: grpc - - containerPort: 9002 - name: metrics - volumeMounts: - - mountPath: /models - name: model-volume - - mountPath: /secrets/huggingface - name: huggingface-secret - readOnly: true - resources: - limits: - ephemeral-storage: 40Gi - nvidia.com/gpu: 1 - memory: 40Gi - requests: - ephemeral-storage: 40Gi - memory: 40Gi - nvidia.com/gpu: 1 - volumes: - - name: model-volume - persistentVolumeClaim: - claimName: llama-model-pvc - - name: huggingface-secret - secret: - secretName: hf-token - ---- -apiVersion: v1 -kind: Service -metadata: - name: llama-triton-service -spec: - type: ClusterIP - ports: - - port: 9000 - targetPort: http - name: http-inference-server - - port: 9001 - targetPort: grpc - name: grpc-inference-server - - port: 9002 - targetPort: metrics - name: http-metrics - selector: - app: llama-triton diff --git a/config/manifests/triton/ext_proc.yaml b/config/manifests/triton/ext_proc.yaml deleted file mode 100644 index f61a7ec18..000000000 --- a/config/manifests/triton/ext_proc.yaml +++ /dev/null @@ -1,126 +0,0 @@ -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-read -rules: -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencemodels"] - verbs: ["get", "watch", "list"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["get", "watch", "list"] -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencepools"] - verbs: ["get", "watch", "list"] -- apiGroups: ["discovery.k8s.io"] - resources: ["endpointslices"] - verbs: ["get", "watch", "list"] -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -kind: ClusterRoleBinding -apiVersion: 
rbac.authorization.k8s.io/v1 -metadata: - name: pod-read-binding -subjects: -- kind: ServiceAccount - name: default - namespace: default -roleRef: - kind: ClusterRole - name: pod-read ---- -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferencePool -metadata: - labels: - name: triton-llama2-7b-pool -spec: - targetPortNumber: 9000 - selector: - app: llama-triton - extensionRef: - name: inference-gateway-ext-proc ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: inference-gateway-ext-proc - namespace: default - labels: - app: inference-gateway-ext-proc -spec: - replicas: 1 - selector: - matchLabels: - app: inference-gateway-ext-proc - template: - metadata: - labels: - app: inference-gateway-ext-proc - spec: - containers: - - name: inference-gateway-ext-proc - image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main - imagePullPolicy: Always - args: - - -poolName - - "triton-llama2-7b-pool" - - -v - - "3" - - -grpcPort - - "9002" - - -grpcHealthPort - - "9003" - - -allRequestsMetric - - "nv_trt_llm_request_metrics{request_type=active}" - - -runningRequestsMetric - - "nv_trt_llm_request_metrics{request_type=scheduled}" - - -usedKVCacheBlocksMetric - - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=used}" - - -maxKVCacheBlocksMetric - - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=max}" - env: - - name: USE_STREAMING - value: "false" - ports: - - containerPort: 9002 - - containerPort: 9003 - - name: metrics - containerPort: 9090 - livenessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 - readinessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 ---- -apiVersion: v1 -kind: Service -metadata: - name: inference-gateway-ext-proc - namespace: default -spec: - selector: - app: inference-gateway-ext-proc - ports: - - protocol: TCP - port: 9002 - targetPort: 9002 - type: ClusterIP diff 
--git a/config/manifests/triton/inferencemodel.yaml b/config/manifests/triton/inferencemodel.yaml deleted file mode 100644 index db643a85c..000000000 --- a/config/manifests/triton/inferencemodel.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - name: triton-llama2-7b-model -spec: - modelName: ensemble - criticality: Standard - poolRef: - name: triton-llama2-7b-pool diff --git a/config/manifests/triton/triton-set-up.yaml b/config/manifests/triton/triton-set-up.yaml deleted file mode 100644 index 08fa0852c..000000000 --- a/config/manifests/triton/triton-set-up.yaml +++ /dev/null @@ -1,111 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: llama-model-pvc -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 200Gi - ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: llama-build-job -spec: - backoffLimit: 0 - template: - metadata: - labels: - app: llama-triton - spec: - containers: - - name: llama-builder - image: nvcr.io/nvidia/tritonserver:25.02-trtllm-python-py3 # Use the base Triton image directly - command: ["/bin/bash", "-c"] - args: - - | - set -e # Exit on error - - apt-get update && apt-get install -y python3.12-venv - - # Create and activate a virtual environment - python3 -m venv /opt/venv - source /opt/venv/bin/activate - - # Install git (it might not be in the base image) - apt-get update && apt-get install -y --no-install-recommends git - - # Clone the tensorrt_llm_backend repository and set up submodule - git clone -b triton-llm/v0.17.0 https://github.com/triton-inference-server/tensorrtllm_backend.git /models/tensorrtllm_backend - cd /models/tensorrtllm_backend - git lfs install - git submodule update --init --recursive - - # --- Hugging Face Setup --- - # 1. 
Install the Hugging Face CLI - pip install -U huggingface_hub - pip install transformers - pip install --extra-index-url https://pypi.nvidia.com/ tensorrt-llm - pip install tensorrt_llm - - # 2. Log in using the token from the secret - # The secret is mounted as a file. - huggingface-cli login --token $(cat /secrets/huggingface/token) --add-to-git-credential - huggingface-cli download meta-llama/Llama-2-7b-hf --local-dir /models/hf_models/ - - # Download and convert the Hugging Face model. Modify parameters as needed. - export HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('meta-llama/Llama-2-7b-hf', filename='config.json', local_dir='/models/hf_models/')).parent)"` - echo PATH TO LLAMA MODEL: $HF_LLAMA_MODEL - export UNIFIED_CKPT_PATH=/models/tmp/ckpt/llama/7b/ - export ENGINE_PATH=/models/tmp/engines/llama/7b/ - export TRTLLM_MODEL_REPO=/models/tensorrtllm_backend/llama_ifb - python3 /models/tensorrtllm_backend/tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ - --output_dir ${UNIFIED_CKPT_PATH} \ - --dtype float16 - - # Build the TensorRT-LLM engine. Adjust parameters (e.g., world_size) as needed. 
- trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --output_dir ${ENGINE_PATH} \ - --gemm_plugin float16 \ - --kv_cache_type paged \ - --context_fmha enable \ - --gpt_attention_plugin float16 \ - --remove_input_padding enable \ - --max_batch_size 64 - - cp /models/tensorrtllm_backend/all_models/inflight_batcher_llm/ ${TRTLLM_MODEL_REPO} -r - - python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 - python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 - python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 - python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 - python3 /models/tensorrtllm_backend/tools/fill_template.py -i ${TRTLLM_MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 - - - echo "Build complete!" 
- volumeMounts: - - mountPath: /models - name: model-volume - - mountPath: /secrets/huggingface - name: huggingface-secret - readOnly: true - resources: - limits: - ephemeral-storage: 80Gi - nvidia.com/gpu: 1 - memory: 40Gi - requests: - ephemeral-storage: 80Gi - nvidia.com/gpu: 1 - memory: 40Gi - restartPolicy: Never - volumes: - - name: model-volume - persistentVolumeClaim: - claimName: llama-model-pvc - - name: huggingface-secret - secret: - secretName: hf-token diff --git a/config/manifests/vllm/ext_proc.yaml b/config/manifests/vllm/ext_proc.yaml index 33c47d400..d70467ee0 100644 --- a/config/manifests/vllm/ext_proc.yaml +++ b/config/manifests/vllm/ext_proc.yaml @@ -82,14 +82,6 @@ spec: - "9002" - -grpcHealthPort - "9003" - - -waitingRequestsMetric - - "vllm:num_requests_waiting" - - -runningRequestsMetric - - "vllm:num_requests_running" - - -kVCacheUsageMetric - - "vllm:gpu_cache_usage_perc" - - -loraRequestInfoMetric - - "vllm:lora_requests_info" env: - name: USE_STREAMING value: "false" diff --git a/pkg/epp/backend/metrics.go b/pkg/epp/backend/metrics.go index 2f2082652..edc4b6e80 100644 --- a/pkg/epp/backend/metrics.go +++ b/pkg/epp/backend/metrics.go @@ -91,56 +91,22 @@ func (p *PodMetricsClientImpl) promToPodMetrics( var errs error updated := existing.Clone() - if p.MetricMapping.RunningRequests != nil { - running, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.RunningRequests) + if p.MetricMapping.TotalQueuedRequests != nil { + queued, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.TotalQueuedRequests) if err == nil { - updated.RunningQueueSize = int(running.GetGauge().GetValue()) + updated.WaitingQueueSize = int(queued.GetGauge().GetValue()) } else { errs = multierr.Append(errs, err) } } - if p.MetricMapping.AllRequests != nil { - all, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.AllRequests) - if err == nil { - updated.WaitingQueueSize = int(all.GetGauge().GetValue()) - updated.RunningQueueSize - } else { - errs 
= multierr.Append(errs, err) - } - } - - if p.MetricMapping.WaitingRequests != nil { - waiting, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.WaitingRequests) - if err == nil { - updated.WaitingQueueSize = int(waiting.GetGauge().GetValue()) - } else { - errs = multierr.Append(errs, err) - } - } - - if p.MetricMapping.KVCacheUsage != nil { - usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUsage) + if p.MetricMapping.KVCacheUtilization != nil { + usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUtilization) if err == nil { updated.KVCacheUsagePercent = usage.GetGauge().GetValue() } else { errs = multierr.Append(errs, err) } - } else if p.MetricMapping.UsedKVCacheBlocks != nil && p.MetricMapping.MaxKVCacheBlocks != nil { - used, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.UsedKVCacheBlocks) - if err != nil { - errs = multierr.Append(errs, err) - } - max, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.MaxKVCacheBlocks) - if err != nil { - errs = multierr.Append(errs, err) - } - if err == nil { - usage := 0.0 - if max.GetGauge().GetValue() > 0 { - usage = used.GetGauge().GetValue() / max.GetGauge().GetValue() - } - updated.KVCacheUsagePercent = usage - } } // Handle LoRA metrics (only if all LoRA MetricSpecs are present) diff --git a/pkg/epp/backend/metrics_spec.go b/pkg/epp/backend/metrics_spec.go index 9cd194db1..7ce2f5d60 100644 --- a/pkg/epp/backend/metrics_spec.go +++ b/pkg/epp/backend/metrics_spec.go @@ -29,14 +29,9 @@ type MetricSpec struct { // MetricMapping holds named MetricSpecs. 
type MetricMapping struct { - AllRequests *MetricSpec // Option 1 - WaitingRequests *MetricSpec // Option 2 - RunningRequests *MetricSpec // Required - UsedKVCacheBlocks *MetricSpec // Option 1 (part of a group) - MaxKVCacheBlocks *MetricSpec // Option 1 (part of a group) - KVCacheUsage *MetricSpec // Option 2 (alternative to the group above) - // LoRA Metrics (vLLM Specific, optional) - LoraRequestInfo *MetricSpec + TotalQueuedRequests *MetricSpec + KVCacheUtilization *MetricSpec + LoraRequestInfo *MetricSpec } // stringToMetricSpec converts a string to a MetricSpec. @@ -99,28 +94,12 @@ func stringToMetricSpec(specStr string) (*MetricSpec, error) { } // NewMetricMapping creates a MetricMapping from string values. -func NewMetricMapping(allStr, waitingStr, runningStr, usedBlocksStr, maxBlocksStr, usageStr, loraReqInfoStr string) (*MetricMapping, error) { - allSpec, err := stringToMetricSpec(allStr) - if err != nil { - return nil, fmt.Errorf("error parsing AllRequests: %w", err) - } - waitingSpec, err := stringToMetricSpec(waitingStr) +func NewMetricMapping(queuedStr, kvUsageStr, loraReqInfoStr string) (*MetricMapping, error) { + queuedSpec, err := stringToMetricSpec(queuedStr) if err != nil { return nil, fmt.Errorf("error parsing WaitingRequests: %w", err) } - runningSpec, err := stringToMetricSpec(runningStr) - if err != nil { - return nil, fmt.Errorf("error parsing RunningRequests: %w", err) - } - usedBlocksSpec, err := stringToMetricSpec(usedBlocksStr) - if err != nil { - return nil, fmt.Errorf("error parsing UsedKVCacheBlocks: %w", err) - } - maxBlocksSpec, err := stringToMetricSpec(maxBlocksStr) - if err != nil { - return nil, fmt.Errorf("error parsing MaxKVCacheBlocks: %w", err) - } - usageSpec, err := stringToMetricSpec(usageStr) + kvUsageSpec, err := stringToMetricSpec(kvUsageStr) if err != nil { return nil, fmt.Errorf("error parsing KVCacheUsage: %w", err) } @@ -129,36 +108,10 @@ func NewMetricMapping(allStr, waitingStr, runningStr, usedBlocksStr, 
maxBlocksSt return nil, fmt.Errorf("error parsing loraReqInfoStr: %w", err) } mapping := &MetricMapping{ - AllRequests: allSpec, - WaitingRequests: waitingSpec, - RunningRequests: runningSpec, - UsedKVCacheBlocks: usedBlocksSpec, - MaxKVCacheBlocks: maxBlocksSpec, - KVCacheUsage: usageSpec, - LoraRequestInfo: loraReqInfoSpec, - } - - if err := mapping.Validate(); err != nil { - return nil, err // Return validation error + TotalQueuedRequests: queuedSpec, + KVCacheUtilization: kvUsageSpec, + LoraRequestInfo: loraReqInfoSpec, } return mapping, nil } - -// Validate checks if the MetricMapping is valid. -func (m *MetricMapping) Validate() error { - // 1. WaitingRequests OR AllRequests (but not both can be nil) - if m.WaitingRequests == nil && m.AllRequests == nil { - return fmt.Errorf("either WaitingRequests or AllRequests must be specified") - } - - if m.RunningRequests == nil { - return fmt.Errorf("RunningRequests is required") - } - - // 2. KVCacheUsage OR (UsedKVCacheBlocks AND MaxKVCacheBlocks) - if m.KVCacheUsage == nil && (m.UsedKVCacheBlocks == nil || m.MaxKVCacheBlocks == nil) { - return fmt.Errorf("either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified") - } - return nil -} diff --git a/pkg/epp/backend/metrics_spec_test.go b/pkg/epp/backend/metrics_spec_test.go index 084ae5b5a..141b97386 100644 --- a/pkg/epp/backend/metrics_spec_test.go +++ b/pkg/epp/backend/metrics_spec_test.go @@ -18,7 +18,6 @@ package backend import ( "reflect" - "strings" "testing" ) @@ -171,111 +170,3 @@ func TestStringToMetricSpec(t *testing.T) { }) } } - -func TestNewMetricMappingAndValidate(t *testing.T) { - tests := []struct { - name string - allStr string - waitingStr string - runningStr string - usedStr string - maxStr string - usageStr string - loraReqInfoStr string - wantErr bool - expectedErr string // Added to check for specific error messages - }{ - { - name: "valid vllm mapping", - runningStr: "running_metric", - waitingStr: "waiting_metric", - 
usageStr: "usage_metric", - loraReqInfoStr: "lora_requests_info", - wantErr: false, - expectedErr: "", - }, - { - name: "valid triton mapping", - runningStr: "running_metric{label1=value1}", - allStr: "all_metric{label2=value2}", - usedStr: "used_blocks{label3=value3}", - maxStr: "max_blocks{label4=value4}", - wantErr: false, - }, - { - name: "multiple labels mapping", - runningStr: "running_metric{label1=value1,label5=value5}", - allStr: "all_metric{label2=value2,label6=value6}", - usedStr: "used_blocks{label3=value3}", - maxStr: "max_blocks{label4=value4}", - wantErr: false, - }, - { - name: "missing running", - waitingStr: "waiting_metric", - usageStr: "usage_metric", - wantErr: true, - expectedErr: "RunningRequests is required", - }, - { - name: "missing both waiting and all", - runningStr: "running_metric", - usageStr: "usage_metric", - wantErr: true, - expectedErr: "either WaitingRequests or AllRequests must be specified", - }, - { - name: "missing usage and both block metrics", - runningStr: "running_metric", - waitingStr: "waiting_metric", - wantErr: true, - expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified", - }, - { - name: "missing max block metric", - runningStr: "running_metric", - waitingStr: "waiting_metric", - usedStr: "used_blocks", - wantErr: true, - expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified", - }, - { - name: "missing used block metric", - runningStr: "running_metric", - waitingStr: "waiting_metric", - maxStr: "max_blocks", - wantErr: true, - expectedErr: "either KVCacheUsage or both UsedKVCacheBlocks and MaxKVCacheBlocks must be specified", - }, - { - name: "invalid running metric format", - runningStr: "running_metric{invalid", - waitingStr: "waiting_metric", - usageStr: "usage_metric", - wantErr: true, - expectedErr: "error parsing RunningRequests", // Check for part of the expected error - }, - { - name: "lora metrics present", - 
runningStr: "running_metric", - waitingStr: "waiting_metric", - usageStr: "usage_metric", - loraReqInfoStr: "lora_requests_info", - - wantErr: false, - expectedErr: "", // Check for part of the expected error - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - _, err := NewMetricMapping(tt.allStr, tt.waitingStr, tt.runningStr, tt.usedStr, tt.maxStr, tt.usageStr, tt.loraReqInfoStr) - if (err != nil) != tt.wantErr { - t.Errorf("NewMetricMapping() error = %v, wantErr %v", err, tt.wantErr) - return - } - if tt.wantErr && !strings.Contains(err.Error(), tt.expectedErr) { - t.Errorf("NewMetricMapping() error = %v, expected to contain = %v", err, tt.expectedErr) - } - }) - } -} diff --git a/pkg/epp/backend/metrics_test.go b/pkg/epp/backend/metrics_test.go index 0bfafcee5..1b0ad05d9 100644 --- a/pkg/epp/backend/metrics_test.go +++ b/pkg/epp/backend/metrics_test.go @@ -395,10 +395,6 @@ func TestPromToPodMetrics(t *testing.T) { { name: "vllm metrics", metricFamilies: map[string]*dto.MetricFamily{ - "vllm_running": makeMetricFamily("vllm_running", - makeMetric("vllm_running", nil, 10.0, 2000), - makeMetric("vllm_running", nil, 12.0, 1000), //Older - ), "vllm_waiting": makeMetricFamily("vllm_waiting", makeMetric("vllm_waiting", nil, 5.0, 1000), makeMetric("vllm_waiting", nil, 7.0, 2000), // Newer @@ -412,10 +408,9 @@ func TestPromToPodMetrics(t *testing.T) { ), }, mapping: &MetricMapping{ - RunningRequests: &MetricSpec{MetricName: "vllm_running"}, - WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"}, - KVCacheUsage: &MetricSpec{MetricName: "vllm_usage"}, - LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, + KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, existingMetrics: &datastore.PodMetrics{ Pod: datastore.Pod{ @@ -436,7 +431,6 @@ func TestPromToPodMetrics(t *testing.T) { }, }, 
Metrics: datastore.Metrics{ - RunningQueueSize: 10, WaitingQueueSize: 7, KVCacheUsagePercent: 0.8, ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, @@ -445,118 +439,17 @@ func TestPromToPodMetrics(t *testing.T) { }, expectedErrCount: 0, }, - { - name: "triton metrics", - metricFamilies: map[string]*dto.MetricFamily{ - "triton_running": makeMetricFamily("triton_running", - makeMetric("triton_running", map[string]string{"queue": "fast"}, 10.0, 2000), - makeMetric("triton_running", map[string]string{"queue": "slow"}, 12.0, 1000), //Older, but different label - ), - "triton_all": makeMetricFamily("triton_all", - makeMetric("triton_all", map[string]string{"queue": "fast"}, 15.0, 1000), - makeMetric("triton_all", map[string]string{"queue": "fast"}, 17.0, 2000), // Newer - ), - "triton_used": makeMetricFamily("triton_used", - makeMetric("triton_used", map[string]string{"type": "gpu"}, 80.0, 1000), - ), - "triton_max": makeMetricFamily("triton_max", - makeMetric("triton_max", map[string]string{"type": "gpu"}, 100.0, 1000), - ), - }, - mapping: &MetricMapping{ - RunningRequests: &MetricSpec{MetricName: "triton_running", Labels: map[string]string{"queue": "fast"}}, - AllRequests: &MetricSpec{MetricName: "triton_all", Labels: map[string]string{"queue": "fast"}}, - UsedKVCacheBlocks: &MetricSpec{MetricName: "triton_used", Labels: map[string]string{"type": "gpu"}}, - MaxKVCacheBlocks: &MetricSpec{MetricName: "triton_max", Labels: map[string]string{"type": "gpu"}}, - }, - existingMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - }, // Initialize with empty Metrics - }, - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - ActiveModels: 
map[string]int{}, - RunningQueueSize: 10, - WaitingQueueSize: 7, // 17 (all) - 10 (running) - KVCacheUsagePercent: 0.8, // 80 / 100 - }, - }, - expectedErrCount: 0, - }, - { - name: "triton metrics, missing label", - metricFamilies: map[string]*dto.MetricFamily{ - "triton_running": makeMetricFamily("triton_running", - makeMetric("triton_running", map[string]string{"queue": "fast"}, 10.0, 2000), - ), - "triton_all": makeMetricFamily("triton_all", - makeMetric("triton_all", map[string]string{"queue": "fast"}, 17.0, 2000), - ), - // triton_used and _max have no metrics with type=gpu label. - }, - mapping: &MetricMapping{ - RunningRequests: &MetricSpec{MetricName: "triton_running", Labels: map[string]string{"queue": "fast"}}, - AllRequests: &MetricSpec{MetricName: "triton_all", Labels: map[string]string{"queue": "fast"}}, - UsedKVCacheBlocks: &MetricSpec{MetricName: "triton_used", Labels: map[string]string{"type": "gpu"}}, - MaxKVCacheBlocks: &MetricSpec{MetricName: "triton_max", Labels: map[string]string{"type": "gpu"}}, - }, - existingMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - }, // Initialize with empty Metrics - }, - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - RunningQueueSize: 10, - WaitingQueueSize: 7, - KVCacheUsagePercent: 0.0, // expect this to still be present, but with default 0 value - }, - }, - - expectedErrCount: 2, // Two errors: Used and Max - }, { name: "missing metrics", metricFamilies: map[string]*dto.MetricFamily{}, // No metrics mapping: &MetricMapping{ - RunningRequests: &MetricSpec{MetricName: "vllm_running"}, - WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"}, - 
KVCacheUsage: &MetricSpec{MetricName: "vllm_usage"}, - LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, + KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, existingMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, expectedMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, - expectedErrCount: 4, // Errors for all 4 main metrics + expectedErrCount: 3, // Errors for all 3 main metrics }, { name: "partial metrics available + LoRA", @@ -569,10 +462,9 @@ func TestPromToPodMetrics(t *testing.T) { ), }, mapping: &MetricMapping{ - RunningRequests: &MetricSpec{MetricName: "vllm_running"}, // Not present - WaitingRequests: &MetricSpec{MetricName: "vllm_waiting"}, // Not Present - KVCacheUsage: &MetricSpec{MetricName: "vllm_usage"}, - LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, // Not Present + KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, existingMetrics: &datastore.PodMetrics{ Pod: datastore.Pod{ @@ -593,57 +485,13 @@ func TestPromToPodMetrics(t *testing.T) { }, }, Metrics: datastore.Metrics{ - RunningQueueSize: 0, WaitingQueueSize: 0, KVCacheUsagePercent: 0.8, ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, MaxActiveModels: 3, }, }, - expectedErrCount: 2, // Errors for the two missing metrics - }, - { - name: "use all requests for waiting queue", - metricFamilies: map[string]*dto.MetricFamily{ - "vllm_running": makeMetricFamily("vllm_running", - makeMetric("vllm_running", nil, 10.0, 2000), - ), - "vllm_all": makeMetricFamily("vllm_all", - makeMetric("vllm_all", nil, 15.0, 1000), - ), - }, - mapping: &MetricMapping{ - RunningRequests:
&MetricSpec{MetricName: "vllm_running"}, - AllRequests: &MetricSpec{MetricName: "vllm_all"}, - // No WaitingRequests - }, - existingMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - }, // Initialize with empty Metrics - }, - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{}, - RunningQueueSize: 10, - WaitingQueueSize: 5, // 15 - 10 - }, - }, - expectedErrCount: 0, + expectedErrCount: 1, // Error for the one missing metric }, { name: "invalid max lora", diff --git a/pkg/epp/datastore/types.go b/pkg/epp/datastore/types.go index 8cfcf1d1f..b87b1c0ae 100644 --- a/pkg/epp/datastore/types.go +++ b/pkg/epp/datastore/types.go @@ -32,11 +32,9 @@ type Metrics struct { // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. ActiveModels map[string]int // MaxActiveModels is the maximum number of models that can be loaded to GPU.
- MaxActiveModels int - RunningQueueSize int - WaitingQueueSize int - KVCacheUsagePercent float64 - KvCacheMaxTokenCapacity int + MaxActiveModels int + WaitingQueueSize int + KVCacheUsagePercent float64 } type PodMetrics struct { @@ -59,12 +57,10 @@ func (pm *PodMetrics) Clone() *PodMetrics { Address: pm.Address, }, Metrics: Metrics{ - ActiveModels: cm, - MaxActiveModels: pm.MaxActiveModels, - RunningQueueSize: pm.RunningQueueSize, - WaitingQueueSize: pm.WaitingQueueSize, - KVCacheUsagePercent: pm.KVCacheUsagePercent, - KvCacheMaxTokenCapacity: pm.KvCacheMaxTokenCapacity, + ActiveModels: cm, + MaxActiveModels: pm.MaxActiveModels, + WaitingQueueSize: pm.WaitingQueueSize, + KVCacheUsagePercent: pm.KVCacheUsagePercent, }, } return clone From 5838459945ce106852626a306997da7d87173736 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 7 Mar 2025 22:19:40 +0000 Subject: [PATCH 09/19] moved files for cleaner diff --- cmd/epp/main.go | 4 +- pkg/epp/backend/metrics.go | 287 ------- pkg/epp/backend/metrics_test.go | 589 -------------- pkg/epp/backend/vllm/metrics.go | 256 ++++--- pkg/epp/backend/{ => vllm}/metrics_spec.go | 2 +- .../backend/{ => vllm}/metrics_spec_test.go | 2 +- pkg/epp/backend/vllm/metrics_test.go | 719 +++++++++++++----- 7 files changed, 686 insertions(+), 1173 deletions(-) delete mode 100644 pkg/epp/backend/metrics.go delete mode 100644 pkg/epp/backend/metrics_test.go rename pkg/epp/backend/{ => vllm}/metrics_spec.go (99%) rename pkg/epp/backend/{ => vllm}/metrics_spec_test.go (99%) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index f3e0b6571..d3c1ab09b 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -160,7 +160,7 @@ func run() error { datastore := datastore.NewDatastore(ctx, pmf) // Set up mapper for metric scraping. 
- mapping, err := backend.NewMetricMapping( + mapping, err := vllm.NewMetricMapping( *totalQueuedRequestMetric, *kVCacheUsageMetric, *loraRequestInfoMetric, @@ -169,7 +169,7 @@ func run() error { setupLog.Error(err, "Failed to create metric mapping from flags.") return err } - provider := backend.NewProvider(&backend.PodMetricsClientImpl{MetricMapping: mapping}, datastore) + provider := backend.NewProvider(&vllm.PodMetricsClientImpl{MetricMapping: mapping}, datastore) // serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, diff --git a/pkg/epp/backend/metrics.go b/pkg/epp/backend/metrics.go deleted file mode 100644 index edc4b6e80..000000000 --- a/pkg/epp/backend/metrics.go +++ /dev/null @@ -1,287 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package backend - -import ( - "context" - "fmt" - "net/http" - "strconv" - "strings" - "time" - - "github.com/go-logr/logr" - dto "github.com/prometheus/client_model/go" - "github.com/prometheus/common/expfmt" - "go.uber.org/multierr" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -const ( - // Hardcoded vLLM specific LoRA metrics - LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" - LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters" - LoraRequestInfoMaxAdaptersMetricName = "max_lora" -) - -type PodMetricsClientImpl struct { - MetricMapping *MetricMapping -} - -// FetchMetrics fetches metrics from a given pod. -func (p *PodMetricsClientImpl) FetchMetrics( - ctx context.Context, - existing *datastore.PodMetrics, - port int32, -) (*datastore.PodMetrics, error) { - logger := log.FromContext(ctx) - loggerDefault := logger.V(logutil.DEFAULT) - - url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics" - - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) - if err != nil { - loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) - return nil, fmt.Errorf("failed to create request: %v", err) - } - resp, err := http.DefaultClient.Do(req) - if err != nil { - loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName) - return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err) - } - defer func() { - _ = resp.Body.Close() - }() - - if resp.StatusCode != http.StatusOK { - loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode) - return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode) - } - - parser := expfmt.TextParser{} - metricFamilies, err := 
parser.TextToMetricFamilies(resp.Body) - if err != nil { - return nil, err - } - return p.promToPodMetrics(logger, metricFamilies, existing) -} - -// promToPodMetrics updates internal pod metrics with scraped Prometheus metrics. -func (p *PodMetricsClientImpl) promToPodMetrics( - logger logr.Logger, - metricFamilies map[string]*dto.MetricFamily, - existing *datastore.PodMetrics, -) (*datastore.PodMetrics, error) { - var errs error - updated := existing.Clone() - - if p.MetricMapping.TotalQueuedRequests != nil { - queued, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.TotalQueuedRequests) - if err == nil { - updated.WaitingQueueSize = int(queued.GetGauge().GetValue()) - } else { - errs = multierr.Append(errs, err) - } - } - - if p.MetricMapping.KVCacheUtilization != nil { - usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUtilization) - if err == nil { - updated.KVCacheUsagePercent = usage.GetGauge().GetValue() - } else { - errs = multierr.Append(errs, err) - } - } - - // Handle LoRA metrics (only if all LoRA MetricSpecs are present) - if p.MetricMapping.LoraRequestInfo != nil { - loraMetrics, _, err := p.getLatestLoraMetric(logger, metricFamilies) - errs = multierr.Append(errs, err) - - if loraMetrics != nil { - updated.ActiveModels = make(map[string]int) - for _, label := range loraMetrics.GetLabel() { - if label.GetName() == LoraRequestInfoRunningAdaptersMetricName { - if label.GetValue() != "" { - adapterList := strings.Split(label.GetValue(), ",") - for _, adapter := range adapterList { - updated.ActiveModels[adapter] = 0 - } - } - } - if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName { - if label.GetValue() != "" { - adapterList := strings.Split(label.GetValue(), ",") - for _, adapter := range adapterList { - updated.ActiveModels[adapter] = 0 - } - } - } - if label.GetName() == LoraRequestInfoMaxAdaptersMetricName { - if label.GetValue() != "" { - updated.MaxActiveModels, err = strconv.Atoi(label.GetValue()) - 
if err != nil { - errs = multierr.Append(errs, err) - } - } - } - } - } - } - - return updated, errs -} - -// getLatestLoraMetric gets latest lora metric series in gauge metric family `vllm:lora_requests_info` -// reason its specially fetched is because each label key value pair permutation generates new series -// and only most recent is useful. The value of each series is the creation timestamp so we can -// retrieve the latest by sorting the value. -func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { - if p.MetricMapping.LoraRequestInfo == nil { - return nil, time.Time{}, nil // No LoRA metrics configured - } - - loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName] - if !ok { - logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName) - return nil, time.Time{}, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName) - } - - var latest *dto.Metric - var latestTs float64 // Use float64, as Gauge.Value is float64 - - // Iterate over all metrics in the family. - for _, m := range loraRequests.GetMetric() { - running := "" - waiting := "" - // Check if the metric has the expected LoRA labels. This is important! - hasRequiredLabels := false - for _, lp := range m.GetLabel() { - switch lp.GetName() { - case LoraRequestInfoRunningAdaptersMetricName: - running = lp.GetValue() - hasRequiredLabels = true - case LoraRequestInfoWaitingAdaptersMetricName: - waiting = lp.GetValue() - hasRequiredLabels = true - } - } - //Skip if it does not have the lora labels - if !hasRequiredLabels { - continue - } - // Ignore metrics with both labels empty. - if running == "" && waiting == "" { - continue - } - - // Select the metric with the *largest Gauge Value* (which represents the timestamp). 
- if m.GetGauge().GetValue() > latestTs { - latestTs = m.GetGauge().GetValue() - latest = m - } - } - if latest == nil { - logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", p.MetricMapping.LoraRequestInfo.MetricName) - return nil, time.Time{}, nil - } - - // Convert the gauge value (creation timestamp) to time.Time. - return latest, time.Unix(0, int64(latestTs*1e9)), nil // Convert nanoseconds to time.Time -} - -// getMetric retrieves a specific metric based on MetricSpec. -func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { - mf, ok := metricFamilies[spec.MetricName] - if !ok { - logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", spec.MetricName) - return nil, fmt.Errorf("metric family %q not found", spec.MetricName) - } - - if len(mf.GetMetric()) == 0 { - return nil, fmt.Errorf("no metrics available for %q", spec.MetricName) - } - // if there is a specified label, return only that metric in the family - if spec.Labels != nil { - return getLabeledMetric(logger, mf, spec) - } - return getLatestMetric(logger, mf) -} - -// getLatestMetric gets the latest metric of a family (for metrics without labels). -func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, error) { - var latestTs int64 - var latest *dto.Metric - for _, m := range mf.GetMetric() { - if m.GetTimestampMs() >= latestTs { - latestTs = m.GetTimestampMs() - latest = m - } - } - - if latest == nil { - return nil, fmt.Errorf("no metrics found for %q", mf.GetName()) - } - - logger.V(logutil.TRACE).Info("Latest metric value selected", "value", latest, "metric", mf.GetName()) - return latest, nil -} - -// getLabeledMetric gets the latest metric with matching labels. 
-func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { - var latestMetric *dto.Metric - var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater - - for _, m := range mf.GetMetric() { - if labelsMatch(m.GetLabel(), spec.Labels) { - if m.GetTimestampMs() > latestTimestamp { - latestTimestamp = m.GetTimestampMs() - latestMetric = m - } - } - } - - if latestMetric != nil { - logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "metric", spec.MetricName) - return latestMetric, nil - } - - return nil, fmt.Errorf("no matching labeled metric found for %q with labels %v", spec.MetricName, spec.Labels) -} - -// labelsMatch checks if a metric's labels contain all the labels in the spec. -func labelsMatch(metricLabels []*dto.LabelPair, specLabels map[string]string) bool { - if len(specLabels) == 0 { - return true // No specific labels required - } - - for specName, specValue := range specLabels { - found := false - for _, label := range metricLabels { - if label.GetName() == specName && label.GetValue() == specValue { - found = true - break - } - } - if !found { - return false // A required label is missing - } - } - return true // All required labels are present -} diff --git a/pkg/epp/backend/metrics_test.go b/pkg/epp/backend/metrics_test.go deleted file mode 100644 index 1b0ad05d9..000000000 --- a/pkg/epp/backend/metrics_test.go +++ /dev/null @@ -1,589 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package backend - -import ( - "context" - "fmt" - "reflect" - "strconv" - "strings" - "testing" - - dto "github.com/prometheus/client_model/go" - "go.uber.org/multierr" - "google.golang.org/protobuf/proto" - "k8s.io/apimachinery/pkg/types" - - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -// --- Test Helpers --- - -func makeMetric(metricName string, labels map[string]string, value float64, timestampMs int64) *dto.Metric { - labelPairs := []*dto.LabelPair{} - for k, v := range labels { - labelPairs = append(labelPairs, &dto.LabelPair{Name: proto.String(k), Value: proto.String(v)}) - } - return &dto.Metric{ - Label: labelPairs, - Gauge: &dto.Gauge{Value: &value}, - TimestampMs: ×tampMs, - } -} - -func makeMetricFamily(name string, metrics ...*dto.Metric) *dto.MetricFamily { - return &dto.MetricFamily{ - Name: &name, - Type: dto.MetricType_GAUGE.Enum(), - Metric: metrics, - } -} - -// --- Tests --- - -func TestGetMetric(t *testing.T) { - logger := logutil.NewTestLogger() - - metricFamilies := map[string]*dto.MetricFamily{ - "metric1": makeMetricFamily("metric1", - makeMetric("metric1", map[string]string{"label1": "value1"}, 1.0, 1000), - makeMetric("metric1", map[string]string{"label1": "value2"}, 2.0, 2000), - ), - "metric2": makeMetricFamily("metric2", - makeMetric("metric2", map[string]string{"labelA": "A1", "labelB": "B1"}, 3.0, 1500), - makeMetric("metric2", map[string]string{"labelA": "A2", "labelB": "B2"}, 4.0, 2500), - ), - "metric3": makeMetricFamily("metric3", - makeMetric("metric3", map[string]string{}, 5.0, 3000), - makeMetric("metric3", map[string]string{}, 6.0, 1000), - ), - } - - tests := []struct { - name string - spec MetricSpec - wantValue float64 - wantError bool - shouldPanic bool // Add this - }{ - { - name: "get labeled metric, exists", - 
spec: MetricSpec{ - MetricName: "metric1", - Labels: map[string]string{"label1": "value1"}, - }, - wantValue: 1.0, - wantError: false, - }, - { - name: "get labeled metric, wrong value", - spec: MetricSpec{ - MetricName: "metric1", - Labels: map[string]string{"label1": "value3"}, - }, - wantValue: -1, // Expect an error, not a specific value - wantError: true, - }, - { - name: "get labeled metric, missing label", - spec: MetricSpec{ - MetricName: "metric1", - Labels: map[string]string{"label2": "value2"}, - }, - wantValue: -1, - wantError: true, - }, - { - name: "get labeled metric, extra label present", - spec: MetricSpec{ - MetricName: "metric2", - Labels: map[string]string{"labelA": "A1"}, - }, - wantValue: 3.0, - wantError: false, - }, - { - name: "get unlabeled metric, exists", - spec: MetricSpec{ - MetricName: "metric3", - Labels: nil, // Explicitly nil - }, - wantValue: 5.0, // latest metric, which occurs first in our test data - wantError: false, - }, - { - name: "get unlabeled metric, metric family not found", - spec: MetricSpec{ - MetricName: "metric4", - Labels: nil, - }, - wantValue: -1, - wantError: true, - }, - { - name: "get labeled metric, metric family not found", - spec: MetricSpec{ - MetricName: "metric4", - Labels: map[string]string{"label1": "value1"}, - }, - wantValue: -1, - wantError: true, - }, - { - name: "get metric, no metrics available", - spec: MetricSpec{ - MetricName: "empty_metric", - }, - wantValue: -1, - wantError: true, - }, - { - name: "get latest metric", - spec: MetricSpec{ - MetricName: "metric3", - Labels: map[string]string{}, // Empty map, not nil - }, - wantValue: 5.0, - wantError: false, - }, - } - - p := &PodMetricsClientImpl{} // No need for MetricMapping here - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if tt.shouldPanic { - defer func() { - if r := recover(); r == nil { - t.Errorf("The code did not panic") - } - }() - } - - gotMetric, err := p.getMetric(logger, metricFamilies, tt.spec) - - if 
tt.wantError { - if err == nil { - t.Errorf("getMetric() expected error, got nil") - } - } else { - if err != nil { - t.Errorf("getMetric() unexpected error: %v", err) - } - if gotMetric.GetGauge().GetValue() != tt.wantValue { - t.Errorf("getMetric() got value %v, want %v", gotMetric.GetGauge().GetValue(), tt.wantValue) - } - } - }) - } -} - -func TestLabelsMatch(t *testing.T) { - tests := []struct { - name string - metricLabels []*dto.LabelPair - specLabels map[string]string - want bool - }{ - { - name: "empty spec labels, should match", - metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, - specLabels: map[string]string{}, - want: true, - }, - { - name: "nil spec labels, should match", - metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, - specLabels: nil, - want: true, - }, - { - name: "exact match", - metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, - specLabels: map[string]string{"a": "b"}, - want: true, - }, - { - name: "extra labels in metric", - metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}, {Name: proto.String("c"), Value: proto.String("d")}}, - specLabels: map[string]string{"a": "b"}, - want: true, - }, - { - name: "missing label in metric", - metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, - specLabels: map[string]string{"a": "b", "c": "d"}, - want: false, - }, - { - name: "value mismatch", - metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, - specLabels: map[string]string{"a": "c"}, - want: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := labelsMatch(tt.metricLabels, tt.specLabels); got != tt.want { - t.Errorf("labelsMatch() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestGetLatestLoraMetric(t *testing.T) { - logger := logutil.NewTestLogger() - - testCases := []struct { - name string - 
metricFamilies map[string]*dto.MetricFamily - expectedAdapters map[string]int - expectedMax int - expectedErr error - mapping *MetricMapping - }{ - { - name: "no lora metrics", - metricFamilies: map[string]*dto.MetricFamily{ - "some_other_metric": makeMetricFamily("some_other_metric", - makeMetric("some_other_metric", nil, 1.0, 1000), - ), - }, - expectedAdapters: nil, - expectedMax: 0, - expectedErr: fmt.Errorf("metric family \"vllm:lora_requests_info\" not found"), // Expect an error because the family is missing - mapping: &MetricMapping{ - LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, - }, - }, - { - name: "basic lora metrics", - metricFamilies: map[string]*dto.MetricFamily{ - "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 3000.0, 1000), // Newer - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 1000.0, 1000), // Older - - ), - }, - expectedAdapters: map[string]int{"lora1": 0}, - expectedMax: 2, - expectedErr: nil, - mapping: &MetricMapping{ - LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, - }, - }, - { - name: "no matching lora metrics", - metricFamilies: map[string]*dto.MetricFamily{ - "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"other_label": "value"}, 5.0, 3000), - ), - }, - expectedAdapters: nil, - expectedMax: 0, - expectedErr: nil, // Expect *no* error; just no adapters found - mapping: &MetricMapping{ - LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, - }, - }, - { - name: "no lora metrics if not in MetricMapping", - metricFamilies: map[string]*dto.MetricFamily{ - "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", 
map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 5.0, 3000), - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 6.0, 1000), - ), - }, - expectedAdapters: nil, - expectedMax: 0, - expectedErr: nil, - mapping: &MetricMapping{ // No LoRA metrics defined - }, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - p := &PodMetricsClientImpl{MetricMapping: tc.mapping} - loraMetric, _, err := p.getLatestLoraMetric(logger, tc.metricFamilies) - - if tc.expectedErr != nil { - if err == nil || err.Error() != tc.expectedErr.Error() { - t.Errorf("getLatestLoraMetric() error = %v, wantErr %v", err, tc.expectedErr) - } - return // Stop here if an error was expected - } else if err != nil { - t.Fatalf("getLatestLoraMetric() unexpected error: %v", err) - } - - if tc.mapping.LoraRequestInfo == nil { - if loraMetric != nil { - t.Errorf("getLatestLoraMetric() expected nil metric, got %v", loraMetric) - } - return // Stop if no Lora metrics are expected. - } - - if tc.expectedAdapters == nil && loraMetric == nil { - return // Both nil, as expected - } - - if tc.expectedAdapters != nil && loraMetric != nil { // proceed with checks - - adaptersFound := make(map[string]int) - maxLora := 0 - for _, label := range loraMetric.GetLabel() { - if label.GetName() == "running_lora_adapters" && label.GetValue() != "" { - for _, adapter := range strings.Split(label.GetValue(), ",") { - adaptersFound[adapter] = 0 - } - } - if label.GetName() == "waiting_lora_adapters" && label.GetValue() != "" { - for _, adapter := range strings.Split(label.GetValue(), ",") { - adaptersFound[adapter] = 0 // Overwrite if already present - } - } - if label.GetName() == "max_lora" { - var converr error // define err in this scope. 
- maxLora, converr = strconv.Atoi(label.GetValue()) - if converr != nil && tc.expectedErr == nil { // only report if we don't expect any other errors - t.Errorf("getLatestLoraMetric() could not parse max_lora: %v", converr) - } - } - } - - if !reflect.DeepEqual(adaptersFound, tc.expectedAdapters) { - t.Errorf("getLatestLoraMetric() adapters = %v, want %v", adaptersFound, tc.expectedAdapters) - } - if maxLora != tc.expectedMax { - t.Errorf("getLatestLoraMetric() maxLora = %v, want %v", maxLora, tc.expectedMax) - } - } else { // one is nil and the other is not - t.Errorf("getLatestLoraMetric(): one of expectedAdapters/loraMetric is nil and the other is not, expected %v, got %v", tc.expectedAdapters, loraMetric) - } - }) - } -} - -func TestPromToPodMetrics(t *testing.T) { - logger := logutil.NewTestLogger() - - tests := []struct { - name string - metricFamilies map[string]*dto.MetricFamily - mapping *MetricMapping - existingMetrics *datastore.PodMetrics - expectedMetrics *datastore.PodMetrics - expectedErrCount int // Count of expected errors - }{ - { - name: "vllm metrics", - metricFamilies: map[string]*dto.MetricFamily{ - "vllm_waiting": makeMetricFamily("vllm_waiting", - makeMetric("vllm_waiting", nil, 5.0, 1000), - makeMetric("vllm_waiting", nil, 7.0, 2000), // Newer - ), - "vllm_usage": makeMetricFamily("vllm_usage", - makeMetric("vllm_usage", nil, 0.8, 2000), - makeMetric("vllm_usage", nil, 0.7, 500), - ), - "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000), - ), - }, - mapping: &MetricMapping{ - TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, - KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, - LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, - }, - existingMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - 
NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{}, // Initialize with empty Metrics - }, - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - WaitingQueueSize: 7, - KVCacheUsagePercent: 0.8, - ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, - MaxActiveModels: 3, - }, - }, - expectedErrCount: 0, - }, - { - name: "missing metrics", - metricFamilies: map[string]*dto.MetricFamily{}, // No metrics - mapping: &MetricMapping{ - TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, - KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, - LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, - }, - existingMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, - expectedMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, - expectedErrCount: 3, // Errors for all 4 main metrics - }, - { - name: "partial metrics available + LoRA", - metricFamilies: map[string]*dto.MetricFamily{ - "vllm_usage": makeMetricFamily("vllm_usage", - makeMetric("vllm_usage", nil, 0.8, 2000), // Only usage is present - ), - "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000), - ), - }, - mapping: &MetricMapping{ - TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, // Not Present - KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, - LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, - }, - existingMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, 
- }, - Metrics: datastore.Metrics{}, // Initialize with empty Metrics - }, - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.8, - ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, - MaxActiveModels: 3, - }, - }, - expectedErrCount: 1, // Errors for the two missing metrics - }, - { - name: "invalid max lora", - metricFamilies: map[string]*dto.MetricFamily{ - "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "invalid"}, 3000.0, 1000), - ), - }, - mapping: &MetricMapping{ - LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, - }, - existingMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{}, - }, - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - ActiveModels: map[string]int{"lora1": 0}, - MaxActiveModels: 0, // Should still default to 0. - - }, - }, - expectedErrCount: 1, // Expect *one* error - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - p := &PodMetricsClientImpl{MetricMapping: tc.mapping} - updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics) - - if tc.expectedErrCount == 0 { - if err != nil { - t.Errorf("promToPodMetrics() unexpected error: %v", err) - } - } else { - if err == nil { - t.Errorf("promToPodMetrics() expected errors, got nil") - } else { - // Check the *number* of errors. 
multierr.Errors() gives us a slice - if len(multierr.Errors(err)) != tc.expectedErrCount { - t.Errorf("promToPodMetrics() wrong number of errors: got %d, want %d. Errors: %v", len(multierr.Errors(err)), tc.expectedErrCount, err) - } - - } - } - // Use podMetricsEqual for comparison with tolerance. - if !reflect.DeepEqual(updated, tc.expectedMetrics) { - t.Errorf("promToPodMetrics() got %+v, want %+v", updated, tc.expectedMetrics) - } - }) - } -} - -// TestFetchMetrics is a basic integration test. A more complete test would mock -// the HTTP client. -func TestFetchMetrics(t *testing.T) { - // This test is very basic as it doesn't mock the HTTP client. It assumes - // there's no server running on the specified port. A real-world test - // suite should use a mock server. - ctx := logutil.NewTestLoggerIntoContext(context.Background()) - existing := &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - } - p := &PodMetricsClientImpl{} // No MetricMapping needed for this basic test - - _, err := p.FetchMetrics(ctx, existing, 9999) // Use a port that's unlikely to be in use. - if err == nil { - t.Errorf("FetchMetrics() expected error, got nil") - } - // Check for a specific error message (fragile, but OK for this example) - expectedSubstr := "connection refused" - if err != nil && !strings.Contains(err.Error(), expectedSubstr) { - t.Errorf("FetchMetrics() error = %v, want error containing %q", err, expectedSubstr) - } -} diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index 8d2dd7154..4c1532080 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -14,7 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -// Package vllm provides vllm specific pod metrics implementation. 
package vllm import ( @@ -30,60 +29,49 @@ import ( "github.com/prometheus/common/expfmt" "go.uber.org/multierr" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -// Metric names used in the vLLM metrics implementation. -// Refer to the protocol doc for more details: -// https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol const ( - LoraRequestInfoMetricName = "vllm:lora_requests_info" + // Hardcoded vLLM specific LoRA metrics LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters" LoraRequestInfoMaxAdaptersMetricName = "max_lora" - // TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork. - RunningQueueSizeMetricName = "vllm:num_requests_running" - WaitingQueueSizeMetricName = "vllm:num_requests_waiting" - /* TODO: Uncomment this once the following are added to the fork. - RunningQueueSizeMetricName = "vllm:num_tokens_running" - WaitingQueueSizeMetricName = "vllm:num_tokens_waiting" - */ - KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc" ) -type PodMetricsClientImpl struct{} +type PodMetricsClientImpl struct { + MetricMapping *MetricMapping +} // FetchMetrics fetches metrics from a given pod. func (p *PodMetricsClientImpl) FetchMetrics( ctx context.Context, - pod *metrics.Pod, - existing *metrics.Metrics, + existing *datastore.PodMetrics, port int32, -) (*metrics.Metrics, error) { - logger := log.FromContext(ctx).V(logutil.TRACE) +) (*datastore.PodMetrics, error) { + logger := log.FromContext(ctx) + loggerDefault := logger.V(logutil.DEFAULT) - // Currently the metrics endpoint is hard-coded, which works with vLLM. 
- // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. - url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics" + url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics" req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { - logger.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) + loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) return nil, fmt.Errorf("failed to create request: %v", err) } resp, err := http.DefaultClient.Do(req) if err != nil { - logger.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName) - return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err) + loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName) + return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err) } defer func() { _ = resp.Body.Close() }() if resp.StatusCode != http.StatusOK { - logger.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode) - return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode) + loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode) + return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode) } parser := expfmt.TextParser{} @@ -91,74 +79,70 @@ func (p *PodMetricsClientImpl) FetchMetrics( if err != nil { return nil, err } - return promToPodMetrics(logger, metricFamilies, existing) + return p.promToPodMetrics(logger, metricFamilies, existing) } -// promToPodMetrics updates internal pod metrics with scraped prometheus metrics. -// A combined error is returned if errors occur in one or more metric processing. 
-// it returns a new PodMetrics pointer which can be used to atomically update the pod metrics map. -func promToPodMetrics( +// promToPodMetrics updates internal pod metrics with scraped Prometheus metrics. +func (p *PodMetricsClientImpl) promToPodMetrics( logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, - existing *metrics.Metrics, -) (*metrics.Metrics, error) { + existing *datastore.PodMetrics, +) (*datastore.PodMetrics, error) { var errs error updated := existing.Clone() - runningQueueSize, err := getLatestMetric(logger, metricFamilies, RunningQueueSizeMetricName) - errs = multierr.Append(errs, err) - if err == nil { - updated.RunningQueueSize = int(runningQueueSize.GetGauge().GetValue()) - } - waitingQueueSize, err := getLatestMetric(logger, metricFamilies, WaitingQueueSizeMetricName) - errs = multierr.Append(errs, err) - if err == nil { - updated.WaitingQueueSize = int(waitingQueueSize.GetGauge().GetValue()) - } - cachePercent, err := getLatestMetric(logger, metricFamilies, KVCacheUsagePercentMetricName) - errs = multierr.Append(errs, err) - if err == nil { - updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue() - } - - loraMetrics, _, err := getLatestLoraMetric(logger, metricFamilies) - errs = multierr.Append(errs, err) - /* TODO: uncomment once this is available in vllm. 
- kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName) - errs = multierr.Append(errs, err) - if err != nil { - updated.KvCacheMaxTokenCapacity = int(kvCap) - } - */ - - if loraMetrics != nil { - updated.ActiveModels = make(map[string]int) - for _, label := range loraMetrics.GetLabel() { - if label.GetName() == LoraRequestInfoRunningAdaptersMetricName { - if label.GetValue() != "" { - adapterList := strings.Split(label.GetValue(), ",") - for _, adapter := range adapterList { - updated.ActiveModels[adapter] = 0 + + if p.MetricMapping.TotalQueuedRequests != nil { + queued, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.TotalQueuedRequests) + if err == nil { + updated.WaitingQueueSize = int(queued.GetGauge().GetValue()) + } else { + errs = multierr.Append(errs, err) + } + } + + if p.MetricMapping.KVCacheUtilization != nil { + usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUtilization) + if err == nil { + updated.KVCacheUsagePercent = usage.GetGauge().GetValue() + } else { + errs = multierr.Append(errs, err) + } + } + + // Handle LoRA metrics (only if all LoRA MetricSpecs are present) + if p.MetricMapping.LoraRequestInfo != nil { + loraMetrics, _, err := p.getLatestLoraMetric(logger, metricFamilies) + errs = multierr.Append(errs, err) + + if loraMetrics != nil { + updated.ActiveModels = make(map[string]int) + for _, label := range loraMetrics.GetLabel() { + if label.GetName() == LoraRequestInfoRunningAdaptersMetricName { + if label.GetValue() != "" { + adapterList := strings.Split(label.GetValue(), ",") + for _, adapter := range adapterList { + updated.ActiveModels[adapter] = 0 + } } } - } - if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName { - if label.GetValue() != "" { - adapterList := strings.Split(label.GetValue(), ",") - for _, adapter := range adapterList { - updated.ActiveModels[adapter] = 0 + if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName { + if 
label.GetValue() != "" { + adapterList := strings.Split(label.GetValue(), ",") + for _, adapter := range adapterList { + updated.ActiveModels[adapter] = 0 + } } } - } - if label.GetName() == LoraRequestInfoMaxAdaptersMetricName { - if label.GetValue() != "" { - updated.MaxActiveModels, err = strconv.Atoi(label.GetValue()) - if err != nil { - errs = multierr.Append(errs, err) + if label.GetName() == LoraRequestInfoMaxAdaptersMetricName { + if label.GetValue() != "" { + updated.MaxActiveModels, err = strconv.Atoi(label.GetValue()) + if err != nil { + errs = multierr.Append(errs, err) + } } } } } - } return updated, errs @@ -168,62 +152,80 @@ func promToPodMetrics( // reason its specially fetched is because each label key value pair permutation generates new series // and only most recent is useful. The value of each series is the creation timestamp so we can // retrieve the latest by sorting the value. -func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { - loraRequests, ok := metricFamilies[LoraRequestInfoMetricName] +func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { + if p.MetricMapping.LoraRequestInfo == nil { + return nil, time.Time{}, nil // No LoRA metrics configured + } + + loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName] if !ok { - logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName) - return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName) + logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName) + return nil, time.Time{}, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName) } var latest *dto.Metric - var latestTs float64 + var latestTs float64 // Use float64, as Gauge.Value is float64 // 
Iterate over all metrics in the family. for _, m := range loraRequests.GetMetric() { - var running, waiting string - // Read the label values for running and waiting adapters. + running := "" + waiting := "" + // Check if the metric has the expected LoRA labels. This is important! + hasRequiredLabels := false for _, lp := range m.GetLabel() { switch lp.GetName() { case LoraRequestInfoRunningAdaptersMetricName: running = lp.GetValue() + hasRequiredLabels = true case LoraRequestInfoWaitingAdaptersMetricName: waiting = lp.GetValue() + hasRequiredLabels = true } } - - // Ignore metrics with both labels empty. This happens when there are no running or waiting requests on - // the server, in this case it is best to use the last set of active adapters. + //Skip if it does not have the lora labels + if !hasRequiredLabels { + continue + } + // Ignore metrics with both labels empty. if running == "" && waiting == "" { continue } - // Select the metric with the latest creation timestamp. + // Select the metric with the *largest Gauge Value* (which represents the timestamp). if m.GetGauge().GetValue() > latestTs { latestTs = m.GetGauge().GetValue() latest = m } } - if latest == nil { - logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", LoraRequestInfoMetricName) + logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", p.MetricMapping.LoraRequestInfo.MetricName) return nil, time.Time{}, nil } // Convert the gauge value (creation timestamp) to time.Time. - return latest, time.Unix(0, int64(latestTs*1000)), nil + return latest, time.Unix(0, int64(latestTs*1e9)), nil // Convert nanoseconds to time.Time } -// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric. -// Since vllm doesn't set the timestamp in metric, this metric essentially gets the first metric. 
-func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) { - mf, ok := metricFamilies[metricName] +// getMetric retrieves a specific metric based on MetricSpec. +func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { + mf, ok := metricFamilies[spec.MetricName] if !ok { - logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", metricName) - return nil, fmt.Errorf("metric family %q not found", metricName) + logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", spec.MetricName) + return nil, fmt.Errorf("metric family %q not found", spec.MetricName) } + if len(mf.GetMetric()) == 0 { - return nil, fmt.Errorf("no metrics available for %q", metricName) + return nil, fmt.Errorf("no metrics available for %q", spec.MetricName) + } + // if there is a specified label, return only that metric in the family + if spec.Labels != nil { + return getLabeledMetric(logger, mf, spec) } + return getLatestMetric(logger, mf) +} + +// getLatestMetric gets the latest metric of a family (for metrics without labels). +func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, error) { var latestTs int64 var latest *dto.Metric for _, m := range mf.GetMetric() { @@ -232,6 +234,54 @@ func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFa latest = m } } - logger.V(logutil.TRACE).Info("Metric value selected", "value", latest, "metric", metricName) + + if latest == nil { + return nil, fmt.Errorf("no metrics found for %q", mf.GetName()) + } + + logger.V(logutil.TRACE).Info("Latest metric value selected", "value", latest, "metric", mf.GetName()) return latest, nil } + +// getLabeledMetric gets the latest metric with matching labels. 
+func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { + var latestMetric *dto.Metric + var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater + + for _, m := range mf.GetMetric() { + if labelsMatch(m.GetLabel(), spec.Labels) { + if m.GetTimestampMs() > latestTimestamp { + latestTimestamp = m.GetTimestampMs() + latestMetric = m + } + } + } + + if latestMetric != nil { + logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "metric", spec.MetricName) + return latestMetric, nil + } + + return nil, fmt.Errorf("no matching labeled metric found for %q with labels %v", spec.MetricName, spec.Labels) +} + +// labelsMatch checks if a metric's labels contain all the labels in the spec. +func labelsMatch(metricLabels []*dto.LabelPair, specLabels map[string]string) bool { + if len(specLabels) == 0 { + return true // No specific labels required + } + + for specName, specValue := range specLabels { + found := false + for _, label := range metricLabels { + if label.GetName() == specName && label.GetValue() == specValue { + found = true + break + } + } + if !found { + return false // A required label is missing + } + } + return true // All required labels are present +} diff --git a/pkg/epp/backend/metrics_spec.go b/pkg/epp/backend/vllm/metrics_spec.go similarity index 99% rename from pkg/epp/backend/metrics_spec.go rename to pkg/epp/backend/vllm/metrics_spec.go index 7ce2f5d60..bdd1e6671 100644 --- a/pkg/epp/backend/metrics_spec.go +++ b/pkg/epp/backend/vllm/metrics_spec.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package backend +package vllm import ( "fmt" diff --git a/pkg/epp/backend/metrics_spec_test.go b/pkg/epp/backend/vllm/metrics_spec_test.go similarity index 99% rename from pkg/epp/backend/metrics_spec_test.go rename to pkg/epp/backend/vllm/metrics_spec_test.go index 141b97386..d73ce21dd 100644 --- a/pkg/epp/backend/metrics_spec_test.go +++ b/pkg/epp/backend/vllm/metrics_spec_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package backend +package vllm import ( "reflect" diff --git a/pkg/epp/backend/vllm/metrics_test.go b/pkg/epp/backend/vllm/metrics_test.go index 5555bd260..0f05185d1 100644 --- a/pkg/epp/backend/vllm/metrics_test.go +++ b/pkg/epp/backend/vllm/metrics_test.go @@ -17,234 +17,573 @@ limitations under the License. package vllm import ( - "errors" + "context" + "fmt" + "reflect" + "strconv" + "strings" "testing" dto "github.com/prometheus/client_model/go" - "github.com/stretchr/testify/assert" + "go.uber.org/multierr" "google.golang.org/protobuf/proto" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "k8s.io/apimachinery/pkg/types" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -func TestPromToPodMetrics(t *testing.T) { +// --- Test Helpers --- + +func makeMetric(metricName string, labels map[string]string, value float64, timestampMs int64) *dto.Metric { + labelPairs := []*dto.LabelPair{} + for k, v := range labels { + labelPairs = append(labelPairs, &dto.LabelPair{Name: proto.String(k), Value: proto.String(v)}) + } + return &dto.Metric{ + Label: labelPairs, + Gauge: &dto.Gauge{Value: &value}, + TimestampMs: &timestampMs, + } +} + +func makeMetricFamily(name string, metrics ...*dto.Metric) *dto.MetricFamily { + return &dto.MetricFamily{ + Name: &name, + Type: dto.MetricType_GAUGE.Enum(), + Metric: metrics, + } +} + +// --- Tests --- + +func
TestGetMetric(t *testing.T) { + logger := logutil.NewTestLogger() + + metricFamilies := map[string]*dto.MetricFamily{ + "metric1": makeMetricFamily("metric1", + makeMetric("metric1", map[string]string{"label1": "value1"}, 1.0, 1000), + makeMetric("metric1", map[string]string{"label1": "value2"}, 2.0, 2000), + ), + "metric2": makeMetricFamily("metric2", + makeMetric("metric2", map[string]string{"labelA": "A1", "labelB": "B1"}, 3.0, 1500), + makeMetric("metric2", map[string]string{"labelA": "A2", "labelB": "B2"}, 4.0, 2500), + ), + "metric3": makeMetricFamily("metric3", + makeMetric("metric3", map[string]string{}, 5.0, 3000), + makeMetric("metric3", map[string]string{}, 6.0, 1000), + ), + } + + tests := []struct { + name string + spec MetricSpec + wantValue float64 + wantError bool + shouldPanic bool // Add this + }{ + { + name: "get labeled metric, exists", + spec: MetricSpec{ + MetricName: "metric1", + Labels: map[string]string{"label1": "value1"}, + }, + wantValue: 1.0, + wantError: false, + }, + { + name: "get labeled metric, wrong value", + spec: MetricSpec{ + MetricName: "metric1", + Labels: map[string]string{"label1": "value3"}, + }, + wantValue: -1, // Expect an error, not a specific value + wantError: true, + }, + { + name: "get labeled metric, missing label", + spec: MetricSpec{ + MetricName: "metric1", + Labels: map[string]string{"label2": "value2"}, + }, + wantValue: -1, + wantError: true, + }, + { + name: "get labeled metric, extra label present", + spec: MetricSpec{ + MetricName: "metric2", + Labels: map[string]string{"labelA": "A1"}, + }, + wantValue: 3.0, + wantError: false, + }, + { + name: "get unlabeled metric, exists", + spec: MetricSpec{ + MetricName: "metric3", + Labels: nil, // Explicitly nil + }, + wantValue: 5.0, // latest metric, which occurs first in our test data + wantError: false, + }, + { + name: "get unlabeled metric, metric family not found", + spec: MetricSpec{ + MetricName: "metric4", + Labels: nil, + }, + wantValue: -1, + 
wantError: true, + }, + { + name: "get labeled metric, metric family not found", + spec: MetricSpec{ + MetricName: "metric4", + Labels: map[string]string{"label1": "value1"}, + }, + wantValue: -1, + wantError: true, + }, + { + name: "get metric, no metrics available", + spec: MetricSpec{ + MetricName: "empty_metric", + }, + wantValue: -1, + wantError: true, + }, + { + name: "get latest metric", + spec: MetricSpec{ + MetricName: "metric3", + Labels: map[string]string{}, // Empty map, not nil + }, + wantValue: 5.0, + wantError: false, + }, + } + + p := &PodMetricsClientImpl{} // No need for MetricMapping here + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.shouldPanic { + defer func() { + if r := recover(); r == nil { + t.Errorf("The code did not panic") + } + }() + } + + gotMetric, err := p.getMetric(logger, metricFamilies, tt.spec) + + if tt.wantError { + if err == nil { + t.Errorf("getMetric() expected error, got nil") + } + } else { + if err != nil { + t.Errorf("getMetric() unexpected error: %v", err) + } + if gotMetric.GetGauge().GetValue() != tt.wantValue { + t.Errorf("getMetric() got value %v, want %v", gotMetric.GetGauge().GetValue(), tt.wantValue) + } + } + }) + } +} + +func TestLabelsMatch(t *testing.T) { + tests := []struct { + name string + metricLabels []*dto.LabelPair + specLabels map[string]string + want bool + }{ + { + name: "empty spec labels, should match", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, + specLabels: map[string]string{}, + want: true, + }, + { + name: "nil spec labels, should match", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, + specLabels: nil, + want: true, + }, + { + name: "exact match", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, + specLabels: map[string]string{"a": "b"}, + want: true, + }, + { + name: "extra labels in metric", + metricLabels: []*dto.LabelPair{{Name: 
proto.String("a"), Value: proto.String("b")}, {Name: proto.String("c"), Value: proto.String("d")}}, + specLabels: map[string]string{"a": "b"}, + want: true, + }, + { + name: "missing label in metric", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, + specLabels: map[string]string{"a": "b", "c": "d"}, + want: false, + }, + { + name: "value mismatch", + metricLabels: []*dto.LabelPair{{Name: proto.String("a"), Value: proto.String("b")}}, + specLabels: map[string]string{"a": "c"}, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := labelsMatch(tt.metricLabels, tt.specLabels); got != tt.want { + t.Errorf("labelsMatch() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestGetLatestLoraMetric(t *testing.T) { logger := logutil.NewTestLogger() testCases := []struct { - name string - metricFamilies map[string]*dto.MetricFamily - initialMetrics *metrics.Metrics - expectedMetrics *metrics.Metrics - expectedErr error + name string + metricFamilies map[string]*dto.MetricFamily + expectedAdapters map[string]int + expectedMax int + expectedErr error + mapping *MetricMapping }{ { - name: "all metrics available", + name: "no lora metrics", metricFamilies: map[string]*dto.MetricFamily{ - RunningQueueSizeMetricName: { - Metric: []*dto.Metric{ - { - Gauge: &dto.Gauge{ - Value: proto.Float64(10), - }, - TimestampMs: proto.Int64(100), - }, - { - Gauge: &dto.Gauge{ - Value: proto.Float64(15), - }, - TimestampMs: proto.Int64(200), // This is the latest - }, - }, - }, - WaitingQueueSizeMetricName: { - Metric: []*dto.Metric{ - { - Gauge: &dto.Gauge{ - Value: proto.Float64(20), - }, - TimestampMs: proto.Int64(100), - }, - { - Gauge: &dto.Gauge{ - Value: proto.Float64(25), - }, - TimestampMs: proto.Int64(200), // This is the latest - }, + "some_other_metric": makeMetricFamily("some_other_metric", + makeMetric("some_other_metric", nil, 1.0, 1000), + ), + }, + expectedAdapters: nil, + expectedMax: 0, 
+ expectedErr: fmt.Errorf("metric family \"vllm:lora_requests_info\" not found"), // Expect an error because the family is missing + mapping: &MetricMapping{ + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + }, + { + name: "basic lora metrics", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 3000.0, 1000), // Newer + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 1000.0, 1000), // Older + + ), + }, + expectedAdapters: map[string]int{"lora1": 0}, + expectedMax: 2, + expectedErr: nil, + mapping: &MetricMapping{ + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + }, + { + name: "no matching lora metrics", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"other_label": "value"}, 5.0, 3000), + ), + }, + expectedAdapters: nil, + expectedMax: 0, + expectedErr: nil, // Expect *no* error; just no adapters found + mapping: &MetricMapping{ + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + }, + { + name: "no lora metrics if not in MetricMapping", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 5.0, 3000), + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 6.0, 1000), + ), + }, + expectedAdapters: nil, + expectedMax: 0, + expectedErr: nil, + mapping: &MetricMapping{ // No LoRA metrics defined + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + p 
:= &PodMetricsClientImpl{MetricMapping: tc.mapping} + loraMetric, _, err := p.getLatestLoraMetric(logger, tc.metricFamilies) + + if tc.expectedErr != nil { + if err == nil || err.Error() != tc.expectedErr.Error() { + t.Errorf("getLatestLoraMetric() error = %v, wantErr %v", err, tc.expectedErr) + } + return // Stop here if an error was expected + } else if err != nil { + t.Fatalf("getLatestLoraMetric() unexpected error: %v", err) + } + + if tc.mapping.LoraRequestInfo == nil { + if loraMetric != nil { + t.Errorf("getLatestLoraMetric() expected nil metric, got %v", loraMetric) + } + return // Stop if no Lora metrics are expected. + } + + if tc.expectedAdapters == nil && loraMetric == nil { + return // Both nil, as expected + } + + if tc.expectedAdapters != nil && loraMetric != nil { // proceed with checks + + adaptersFound := make(map[string]int) + maxLora := 0 + for _, label := range loraMetric.GetLabel() { + if label.GetName() == "running_lora_adapters" && label.GetValue() != "" { + for _, adapter := range strings.Split(label.GetValue(), ",") { + adaptersFound[adapter] = 0 + } + } + if label.GetName() == "waiting_lora_adapters" && label.GetValue() != "" { + for _, adapter := range strings.Split(label.GetValue(), ",") { + adaptersFound[adapter] = 0 // Overwrite if already present + } + } + if label.GetName() == "max_lora" { + var converr error // define err in this scope. 
+ maxLora, converr = strconv.Atoi(label.GetValue()) + if converr != nil && tc.expectedErr == nil { // only report if we don't expect any other errors + t.Errorf("getLatestLoraMetric() could not parse max_lora: %v", converr) + } + } + } + + if !reflect.DeepEqual(adaptersFound, tc.expectedAdapters) { + t.Errorf("getLatestLoraMetric() adapters = %v, want %v", adaptersFound, tc.expectedAdapters) + } + if maxLora != tc.expectedMax { + t.Errorf("getLatestLoraMetric() maxLora = %v, want %v", maxLora, tc.expectedMax) + } + } else { // one is nil and the other is not + t.Errorf("getLatestLoraMetric(): one of expectedAdapters/loraMetric is nil and the other is not, expected %v, got %v", tc.expectedAdapters, loraMetric) + } + }) + } +} + +func TestPromToPodMetrics(t *testing.T) { + logger := logutil.NewTestLogger() + + tests := []struct { + name string + metricFamilies map[string]*dto.MetricFamily + mapping *MetricMapping + existingMetrics *datastore.PodMetrics + expectedMetrics *datastore.PodMetrics + expectedErrCount int // Count of expected errors + }{ + { + name: "vllm metrics", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm_waiting": makeMetricFamily("vllm_waiting", + makeMetric("vllm_waiting", nil, 5.0, 1000), + makeMetric("vllm_waiting", nil, 7.0, 2000), // Newer + ), + "vllm_usage": makeMetricFamily("vllm_usage", + makeMetric("vllm_usage", nil, 0.8, 2000), + makeMetric("vllm_usage", nil, 0.7, 500), + ), + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000), + ), + }, + mapping: &MetricMapping{ + TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, + KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + existingMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + 
NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", }, }, - KVCacheUsagePercentMetricName: { - Metric: []*dto.Metric{ - { - Gauge: &dto.Gauge{ - Value: proto.Float64(0.8), - }, - TimestampMs: proto.Int64(100), - }, - { - Gauge: &dto.Gauge{ - Value: proto.Float64(0.9), - }, - TimestampMs: proto.Int64(200), // This is the latest - }, + Metrics: datastore.Metrics{}, // Initialize with empty Metrics + }, + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", }, }, - LoraRequestInfoMetricName: { - Metric: []*dto.Metric{ - { - Label: []*dto.LabelPair{ - { - Name: proto.String(LoraRequestInfoRunningAdaptersMetricName), - Value: proto.String("lora3,lora4"), - }, - { - Name: proto.String(LoraRequestInfoMaxAdaptersMetricName), - Value: proto.String("2"), - }, - }, - Gauge: &dto.Gauge{ - Value: proto.Float64(100), - }, - }, - { - Label: []*dto.LabelPair{ - { - Name: proto.String(LoraRequestInfoRunningAdaptersMetricName), - Value: proto.String("lora2"), - }, - { - Name: proto.String(LoraRequestInfoMaxAdaptersMetricName), - Value: proto.String("2"), - }, - }, - Gauge: &dto.Gauge{ - Value: proto.Float64(90), - }, - }, - }, + Metrics: datastore.Metrics{ + WaitingQueueSize: 7, + KVCacheUsagePercent: 0.8, + ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, + MaxActiveModels: 3, }, }, - expectedMetrics: &metrics.Metrics{ - RunningQueueSize: 15, - WaitingQueueSize: 25, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "lora3": 0, - "lora4": 0, - }, - MaxActiveModels: 2, + expectedErrCount: 0, + }, + { + name: "missing metrics", + metricFamilies: map[string]*dto.MetricFamily{}, // No metrics + mapping: &MetricMapping{ + TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, + KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, - 
initialMetrics: &metrics.Metrics{}, - expectedErr: nil, + existingMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, + expectedMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, + expectedErrCount: 3, // Errors for all 4 main metrics }, { - name: "invalid max lora", + name: "partial metrics available + LoRA", metricFamilies: map[string]*dto.MetricFamily{ - RunningQueueSizeMetricName: { - Metric: []*dto.Metric{ - { - Gauge: &dto.Gauge{ - Value: proto.Float64(10), - }, - TimestampMs: proto.Int64(100), - }, - { - Gauge: &dto.Gauge{ - Value: proto.Float64(15), - }, - TimestampMs: proto.Int64(200), // This is the latest - }, + "vllm_usage": makeMetricFamily("vllm_usage", + makeMetric("vllm_usage", nil, 0.8, 2000), // Only usage is present + ), + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000), + ), + }, + mapping: &MetricMapping{ + TotalQueuedRequests: &MetricSpec{MetricName: "vllm_waiting"}, // Not Present + KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + existingMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", }, }, - WaitingQueueSizeMetricName: { - Metric: []*dto.Metric{ - { - Gauge: &dto.Gauge{ - Value: proto.Float64(20), - }, - TimestampMs: proto.Int64(100), - }, - { - Gauge: &dto.Gauge{ - Value: proto.Float64(25), - }, - TimestampMs: proto.Int64(200), // This is the latest - }, + Metrics: datastore.Metrics{}, // Initialize with empty Metrics + }, + expectedMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", }, }, 
- KVCacheUsagePercentMetricName: { - Metric: []*dto.Metric{ - { - Gauge: &dto.Gauge{ - Value: proto.Float64(0.8), - }, - TimestampMs: proto.Int64(100), - }, - { - Gauge: &dto.Gauge{ - Value: proto.Float64(0.9), - }, - TimestampMs: proto.Int64(200), // This is the latest - }, - }, + Metrics: datastore.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.8, + ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, + MaxActiveModels: 3, }, - LoraRequestInfoMetricName: { - Metric: []*dto.Metric{ - { - Label: []*dto.LabelPair{ - { - Name: proto.String(LoraRequestInfoRunningAdaptersMetricName), - Value: proto.String("lora3,lora4"), - }, - { - Name: proto.String(LoraRequestInfoMaxAdaptersMetricName), - Value: proto.String("2a"), - }, - }, - Gauge: &dto.Gauge{ - Value: proto.Float64(100), - }, - }, - { - Label: []*dto.LabelPair{ - { - Name: proto.String(LoraRequestInfoRunningAdaptersMetricName), - Value: proto.String("lora2"), - }, - { - Name: proto.String(LoraRequestInfoMaxAdaptersMetricName), - Value: proto.String("2"), - }, - }, - Gauge: &dto.Gauge{ - Value: proto.Float64(90), - }, - }, + }, + expectedErrCount: 1, // Errors for the two missing metrics + }, + { + name: "invalid max lora", + metricFamilies: map[string]*dto.MetricFamily{ + "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "invalid"}, 3000.0, 1000), + ), + }, + mapping: &MetricMapping{ + LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, + }, + existingMetrics: &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", }, }, + Metrics: datastore.Metrics{}, }, - expectedMetrics: &metrics.Metrics{ - RunningQueueSize: 15, - WaitingQueueSize: 25, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "lora3": 0, - "lora4": 0, + expectedMetrics: 
&datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + Metrics: datastore.Metrics{ + ActiveModels: map[string]int{"lora1": 0}, + MaxActiveModels: 0, // Should still default to 0. + }, - MaxActiveModels: 0, }, - initialMetrics: &metrics.Metrics{}, - expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), + expectedErrCount: 1, // Expect *one* error }, } - for _, tc := range testCases { + + for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - updated, err := promToPodMetrics(logger, tc.metricFamilies, tc.initialMetrics) - if tc.expectedErr != nil { - assert.Error(t, err) + p := &PodMetricsClientImpl{MetricMapping: tc.mapping} + updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics) + + if tc.expectedErrCount == 0 { + if err != nil { + t.Errorf("promToPodMetrics() unexpected error: %v", err) + } } else { - assert.NoError(t, err) - assert.Equal(t, tc.expectedMetrics, updated) + if err == nil { + t.Errorf("promToPodMetrics() expected errors, got nil") + } else { + // Check the *number* of errors. multierr.Errors() gives us a slice + if len(multierr.Errors(err)) != tc.expectedErrCount { + t.Errorf("promToPodMetrics() wrong number of errors: got %d, want %d. Errors: %v", len(multierr.Errors(err)), tc.expectedErrCount, err) + } + + } + } + // Use podMetricsEqual for comparison with tolerance. + if !reflect.DeepEqual(updated, tc.expectedMetrics) { + t.Errorf("promToPodMetrics() got %+v, want %+v", updated, tc.expectedMetrics) } }) } } + +// TestFetchMetrics is a basic integration test. A more complete test would mock +// the HTTP client. +func TestFetchMetrics(t *testing.T) { + // This test is very basic as it doesn't mock the HTTP client. It assumes + // there's no server running on the specified port. A real-world test + // suite should use a mock server. 
+ ctx := logutil.NewTestLoggerIntoContext(context.Background()) + existing := &datastore.PodMetrics{ + Pod: datastore.Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", + }, + }, + } + p := &PodMetricsClientImpl{} // No MetricMapping needed for this basic test + + _, err := p.FetchMetrics(ctx, existing, 9999) // Use a port that's unlikely to be in use. + if err == nil { + t.Errorf("FetchMetrics() expected error, got nil") + } + // Check for a specific error message (fragile, but OK for this example) + expectedSubstr := "connection refused" + if err != nil && !strings.Contains(err.Error(), expectedSubstr) { + t.Errorf("FetchMetrics() error = %v, want error containing %q", err, expectedSubstr) + } +} From 1c367a6ecbf4bbd5f8f83703cbd45f1cd8bd3e3f Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 7 Mar 2025 22:55:41 +0000 Subject: [PATCH 10/19] re-add todos and rename kv flag to reflect percentage usage. --- cmd/epp/main.go | 4 ++-- pkg/epp/backend/vllm/metrics.go | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index d3c1ab09b..a0441d4a6 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -97,7 +97,7 @@ var ( totalQueuedRequestMetric = flag.String("totalQueuedRequestMetric", "vllm:num_requests_waiting", "Prometheus metric for the number of queued requests.") - kVCacheUsageMetric = flag.String("kVCacheUsageMetric", + kvCacheUsagePercentageMetric = flag.String("kvCacheUsagePercentageMetric", "vllm:gpu_cache_usage_perc", "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") // LoRA metrics @@ -162,7 +162,7 @@ func run() error { // Set up mapper for metric scraping. 
mapping, err := vllm.NewMetricMapping( *totalQueuedRequestMetric, - *kVCacheUsageMetric, + *kvCacheUsagePercentageMetric, *loraRequestInfoMetric, ) if err != nil { diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index 4c1532080..6d181b612 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -34,7 +34,7 @@ import ( ) const ( - // Hardcoded vLLM specific LoRA metrics + // LoRA metrics based on protocol LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters" LoraRequestInfoMaxAdaptersMetricName = "max_lora" @@ -53,6 +53,8 @@ func (p *PodMetricsClientImpl) FetchMetrics( logger := log.FromContext(ctx) loggerDefault := logger.V(logutil.DEFAULT) + // Currently the metrics endpoint is hard-coded, which works with vLLM. + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics" req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) From 3356bd30eff9ab5678d524bcfd564dcd78a7ffd1 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 13 Mar 2025 20:46:47 +0000 Subject: [PATCH 11/19] Fix nits, move logging channel for backend/metrics.go from default to trace, fix comments. 
--- cmd/epp/main.go | 5 +- pkg/epp/backend/vllm/metrics.go | 14 +-- pkg/epp/backend/vllm/metrics_spec.go | 6 +- pkg/epp/backend/vllm/metrics_spec_test.go | 19 ++-- pkg/epp/backend/vllm/metrics_test.go | 109 +++++++++------------- 5 files changed, 65 insertions(+), 88 deletions(-) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index a0441d4a6..277cff37f 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -40,6 +40,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" + servermetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" @@ -160,7 +161,7 @@ func run() error { datastore := datastore.NewDatastore(ctx, pmf) // Set up mapper for metric scraping. - mapping, err := vllm.NewMetricMapping( + mapping, err := servermetrics.NewMetricMapping( *totalQueuedRequestMetric, *kvCacheUsagePercentageMetric, *loraRequestInfoMetric, @@ -169,7 +170,7 @@ func run() error { setupLog.Error(err, "Failed to create metric mapping from flags.") return err } - provider := backend.NewProvider(&vllm.PodMetricsClientImpl{MetricMapping: mapping}, datastore) + provider := backend.NewProvider(&servermetrics.PodMetricsClientImpl{MetricMapping: mapping}, datastore) // serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index 6d181b612..1328a7672 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package vllm +package metrics import ( "context" @@ -161,7 +161,7 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName] if !ok { - logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName) + logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName) return nil, time.Time{}, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName) } @@ -212,7 +212,7 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { mf, ok := metricFamilies[spec.MetricName] if !ok { - logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", spec.MetricName) + logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", spec.MetricName) return nil, fmt.Errorf("metric family %q not found", spec.MetricName) } @@ -221,14 +221,14 @@ func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[ } // if there is a specified label, return only that metric in the family if spec.Labels != nil { - return getLabeledMetric(logger, mf, spec) + return getLabeledMetric(logger, mf, &spec) } return getLatestMetric(logger, mf) } // getLatestMetric gets the latest metric of a family (for metrics without labels). func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, error) { - var latestTs int64 + var latestTs int64 = -1 var latest *dto.Metric for _, m := range mf.GetMetric() { if m.GetTimestampMs() >= latestTs { @@ -246,12 +246,12 @@ func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, err } // getLabeledMetric gets the latest metric with matching labels. 
-func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { +func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec) (*dto.Metric, error) { var latestMetric *dto.Metric var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater for _, m := range mf.GetMetric() { - if labelsMatch(m.GetLabel(), spec.Labels) { + if spec == nil || labelsMatch(m.GetLabel(), spec.Labels) { if m.GetTimestampMs() > latestTimestamp { latestTimestamp = m.GetTimestampMs() latestMetric = m diff --git a/pkg/epp/backend/vllm/metrics_spec.go b/pkg/epp/backend/vllm/metrics_spec.go index bdd1e6671..bd8f39ccf 100644 --- a/pkg/epp/backend/vllm/metrics_spec.go +++ b/pkg/epp/backend/vllm/metrics_spec.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package vllm +package metrics import ( "fmt" @@ -41,10 +41,6 @@ type MetricMapping struct { // "metric_name{label1=value1}" // "metric_name{label1=value1,label2=value2}" func stringToMetricSpec(specStr string) (*MetricSpec, error) { - if specStr == "" { - return nil, nil // Allow empty strings to represent nil MetricSpecs - } - specStr = strings.TrimSpace(specStr) metricName := specStr labels := make(map[string]string) diff --git a/pkg/epp/backend/vllm/metrics_spec_test.go b/pkg/epp/backend/vllm/metrics_spec_test.go index d73ce21dd..8de6dac29 100644 --- a/pkg/epp/backend/vllm/metrics_spec_test.go +++ b/pkg/epp/backend/vllm/metrics_spec_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package vllm +package metrics import ( "reflect" @@ -32,7 +32,7 @@ func TestStringToMetricSpec(t *testing.T) { name: "empty string", input: "", want: nil, - wantErr: false, + wantErr: true, }, { name: "no labels", @@ -152,9 +152,14 @@ func TestStringToMetricSpec(t *testing.T) { t.Errorf("stringToMetricSpec() error = %v, wantErr %v", err, tt.wantErr) return } - if tt.want != nil && got != nil { // compare maps directly - if tt.want.Labels == nil { - tt.want.Labels = make(map[string]string) + if tt.wantErr { + if got != nil { // handles if we got a nil spec and didn't expect an error + t.Errorf("stringToMetricSpec() = %v, want %v", got, tt.want) + return + } + } else { + if got == nil { + t.Errorf("stringToMetricSpec() = got nil but wanted %v", tt.want) } if !reflect.DeepEqual(got.MetricName, tt.want.MetricName) { t.Errorf("stringToMetricSpec() got MetricName = %v, want %v", got.MetricName, tt.want.MetricName) @@ -162,11 +167,7 @@ func TestStringToMetricSpec(t *testing.T) { if !reflect.DeepEqual(got.Labels, tt.want.Labels) { t.Errorf("stringToMetricSpec() got Labels = %v, want %v", got.Labels, tt.want.Labels) } - } else if tt.want != got { // handles if one is nil and the other isn't - t.Errorf("stringToMetricSpec() = %v, want %v", got, tt.want) - } - }) } } diff --git a/pkg/epp/backend/vllm/metrics_test.go b/pkg/epp/backend/vllm/metrics_test.go index 0f05185d1..3bc4fc703 100644 --- a/pkg/epp/backend/vllm/metrics_test.go +++ b/pkg/epp/backend/vllm/metrics_test.go @@ -14,10 +14,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package vllm +package metrics import ( "context" + + "errors" "fmt" "reflect" "strconv" @@ -25,7 +27,7 @@ import ( "testing" dto "github.com/prometheus/client_model/go" - "go.uber.org/multierr" + "github.com/stretchr/testify/assert" "google.golang.org/protobuf/proto" "k8s.io/apimachinery/pkg/types" @@ -76,11 +78,10 @@ func TestGetMetric(t *testing.T) { } tests := []struct { - name string - spec MetricSpec - wantValue float64 - wantError bool - shouldPanic bool // Add this + name string + spec MetricSpec + wantGaugeValue float64 + wantError bool }{ { name: "get labeled metric, exists", @@ -88,8 +89,8 @@ func TestGetMetric(t *testing.T) { MetricName: "metric1", Labels: map[string]string{"label1": "value1"}, }, - wantValue: 1.0, - wantError: false, + wantGaugeValue: 1.0, + wantError: false, }, { name: "get labeled metric, wrong value", @@ -97,8 +98,8 @@ func TestGetMetric(t *testing.T) { MetricName: "metric1", Labels: map[string]string{"label1": "value3"}, }, - wantValue: -1, // Expect an error, not a specific value - wantError: true, + wantGaugeValue: -1, // Expect an error, not a specific value + wantError: true, }, { name: "get labeled metric, missing label", @@ -106,8 +107,8 @@ func TestGetMetric(t *testing.T) { MetricName: "metric1", Labels: map[string]string{"label2": "value2"}, }, - wantValue: -1, - wantError: true, + wantGaugeValue: -1, + wantError: true, }, { name: "get labeled metric, extra label present", @@ -115,8 +116,8 @@ func TestGetMetric(t *testing.T) { MetricName: "metric2", Labels: map[string]string{"labelA": "A1"}, }, - wantValue: 3.0, - wantError: false, + wantGaugeValue: 3.0, + wantError: false, }, { name: "get unlabeled metric, exists", @@ -124,8 +125,8 @@ func TestGetMetric(t *testing.T) { MetricName: "metric3", Labels: nil, // Explicitly nil }, - wantValue: 5.0, // latest metric, which occurs first in our test data - wantError: false, + wantGaugeValue: 5.0, // latest metric, which occurs first in our test data + wantError: false, }, { 
name: "get unlabeled metric, metric family not found", @@ -133,8 +134,8 @@ func TestGetMetric(t *testing.T) { MetricName: "metric4", Labels: nil, }, - wantValue: -1, - wantError: true, + wantGaugeValue: -1, + wantError: true, }, { name: "get labeled metric, metric family not found", @@ -142,16 +143,16 @@ func TestGetMetric(t *testing.T) { MetricName: "metric4", Labels: map[string]string{"label1": "value1"}, }, - wantValue: -1, - wantError: true, + wantGaugeValue: -1, + wantError: true, }, { name: "get metric, no metrics available", spec: MetricSpec{ MetricName: "empty_metric", }, - wantValue: -1, - wantError: true, + wantGaugeValue: -1, + wantError: true, }, { name: "get latest metric", @@ -159,8 +160,8 @@ func TestGetMetric(t *testing.T) { MetricName: "metric3", Labels: map[string]string{}, // Empty map, not nil }, - wantValue: 5.0, - wantError: false, + wantGaugeValue: 5.0, + wantError: false, }, } @@ -168,13 +169,6 @@ func TestGetMetric(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if tt.shouldPanic { - defer func() { - if r := recover(); r == nil { - t.Errorf("The code did not panic") - } - }() - } gotMetric, err := p.getMetric(logger, metricFamilies, tt.spec) @@ -184,10 +178,10 @@ func TestGetMetric(t *testing.T) { } } else { if err != nil { - t.Errorf("getMetric() unexpected error: %v", err) + t.Fatalf("getMetric() unexpected error: %v", err) } - if gotMetric.GetGauge().GetValue() != tt.wantValue { - t.Errorf("getMetric() got value %v, want %v", gotMetric.GetGauge().GetValue(), tt.wantValue) + if gotMetric.GetGauge().GetValue() != tt.wantGaugeValue { + t.Errorf("getMetric() got value %v, want %v", gotMetric.GetGauge().GetValue(), tt.wantGaugeValue) } } }) @@ -385,12 +379,12 @@ func TestPromToPodMetrics(t *testing.T) { logger := logutil.NewTestLogger() tests := []struct { - name string - metricFamilies map[string]*dto.MetricFamily - mapping *MetricMapping - existingMetrics *datastore.PodMetrics - expectedMetrics 
*datastore.PodMetrics - expectedErrCount int // Count of expected errors + name string + metricFamilies map[string]*dto.MetricFamily + mapping *MetricMapping + existingMetrics *datastore.PodMetrics + expectedMetrics *datastore.PodMetrics + expectedErr error // Count of expected errors }{ { name: "vllm metrics", @@ -437,7 +431,6 @@ func TestPromToPodMetrics(t *testing.T) { MaxActiveModels: 3, }, }, - expectedErrCount: 0, }, { name: "missing metrics", @@ -447,9 +440,9 @@ func TestPromToPodMetrics(t *testing.T) { KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, - existingMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, - expectedMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, - expectedErrCount: 3, // Errors for all 4 main metrics + existingMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, + expectedMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, + expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), }, { name: "partial metrics available + LoRA", @@ -491,7 +484,7 @@ func TestPromToPodMetrics(t *testing.T) { MaxActiveModels: 3, }, }, - expectedErrCount: 1, // Errors for the two missing metrics + expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), }, { name: "invalid max lora", @@ -527,7 +520,7 @@ func TestPromToPodMetrics(t *testing.T) { }, }, - expectedErrCount: 1, // Expect *one* error + expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), }, } @@ -535,25 +528,11 @@ func TestPromToPodMetrics(t *testing.T) { t.Run(tc.name, func(t *testing.T) { p := &PodMetricsClientImpl{MetricMapping: tc.mapping} updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics) - - if tc.expectedErrCount == 0 { - if err != nil { - t.Errorf("promToPodMetrics() 
unexpected error: %v", err) - } + if tc.expectedErr != nil { + assert.Error(t, err) } else { - if err == nil { - t.Errorf("promToPodMetrics() expected errors, got nil") - } else { - // Check the *number* of errors. multierr.Errors() gives us a slice - if len(multierr.Errors(err)) != tc.expectedErrCount { - t.Errorf("promToPodMetrics() wrong number of errors: got %d, want %d. Errors: %v", len(multierr.Errors(err)), tc.expectedErrCount, err) - } - - } - } - // Use podMetricsEqual for comparison with tolerance. - if !reflect.DeepEqual(updated, tc.expectedMetrics) { - t.Errorf("promToPodMetrics() got %+v, want %+v", updated, tc.expectedMetrics) + assert.NoError(t, err) + assert.Equal(t, tc.expectedMetrics, updated) } }) } From 371fd582393dee21a162d693130fbf92e0a5c8ac Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 13 Mar 2025 23:37:27 +0000 Subject: [PATCH 12/19] Rebase into metric agnostic redesign. --- cmd/epp/main.go | 16 +- pkg/epp/backend/{vllm => metrics}/metrics.go | 20 +- .../backend/{vllm => metrics}/metrics_spec.go | 0 .../{vllm => metrics}/metrics_spec_test.go | 0 .../backend/{vllm => metrics}/metrics_test.go | 133 ++++--------- pkg/epp/backend/provider.go | 183 ------------------ 6 files changed, 53 insertions(+), 299 deletions(-) rename pkg/epp/backend/{vllm => metrics}/metrics.go (94%) rename pkg/epp/backend/{vllm => metrics}/metrics_spec.go (100%) rename pkg/epp/backend/{vllm => metrics}/metrics_spec_test.go (100%) rename pkg/epp/backend/{vllm => metrics}/metrics_test.go (82%) delete mode 100644 pkg/epp/backend/provider.go diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 277cff37f..634cda4a2 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -37,10 +37,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" - servermetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" @@ -156,12 +153,8 @@ func run() error { ctx := ctrl.SetupSignalHandler() - pmf := backendmetrics.NewPodMetricsFactory(&vllm.PodMetricsClientImpl{}, *refreshMetricsInterval) - // Setup runner. - datastore := datastore.NewDatastore(ctx, pmf) - // Set up mapper for metric scraping. - mapping, err := servermetrics.NewMetricMapping( + mapping, err := backendmetrics.NewMetricMapping( *totalQueuedRequestMetric, *kvCacheUsagePercentageMetric, *loraRequestInfoMetric, @@ -170,8 +163,11 @@ func run() error { setupLog.Error(err, "Failed to create metric mapping from flags.") return err } - provider := backend.NewProvider(&servermetrics.PodMetricsClientImpl{MetricMapping: mapping}, datastore) - // + + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval) + // Setup runner. 
+ datastore := datastore.NewDatastore(ctx, pmf) + serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace, diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/metrics/metrics.go similarity index 94% rename from pkg/epp/backend/vllm/metrics.go rename to pkg/epp/backend/metrics/metrics.go index 1328a7672..cc988758f 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -29,7 +29,6 @@ import ( "github.com/prometheus/common/expfmt" "go.uber.org/multierr" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -47,15 +46,16 @@ type PodMetricsClientImpl struct { // FetchMetrics fetches metrics from a given pod. func (p *PodMetricsClientImpl) FetchMetrics( ctx context.Context, - existing *datastore.PodMetrics, + pod *Pod, + existing *Metrics, port int32, -) (*datastore.PodMetrics, error) { +) (*Metrics, error) { logger := log.FromContext(ctx) loggerDefault := logger.V(logutil.DEFAULT) // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. 
- url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics" + url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics" req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { @@ -64,16 +64,16 @@ func (p *PodMetricsClientImpl) FetchMetrics( } resp, err := http.DefaultClient.Do(req) if err != nil { - loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName) - return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err) + loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName) + return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err) } defer func() { _ = resp.Body.Close() }() if resp.StatusCode != http.StatusOK { - loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode) - return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode) + loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode) + return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode) } parser := expfmt.TextParser{} @@ -88,8 +88,8 @@ func (p *PodMetricsClientImpl) FetchMetrics( func (p *PodMetricsClientImpl) promToPodMetrics( logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, - existing *datastore.PodMetrics, -) (*datastore.PodMetrics, error) { + existing *Metrics, +) (*Metrics, error) { var errs error updated := existing.Clone() diff --git a/pkg/epp/backend/vllm/metrics_spec.go b/pkg/epp/backend/metrics/metrics_spec.go similarity index 100% rename from pkg/epp/backend/vllm/metrics_spec.go rename to pkg/epp/backend/metrics/metrics_spec.go diff --git a/pkg/epp/backend/vllm/metrics_spec_test.go b/pkg/epp/backend/metrics/metrics_spec_test.go similarity index 100% rename from 
pkg/epp/backend/vllm/metrics_spec_test.go rename to pkg/epp/backend/metrics/metrics_spec_test.go diff --git a/pkg/epp/backend/vllm/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go similarity index 82% rename from pkg/epp/backend/vllm/metrics_test.go rename to pkg/epp/backend/metrics/metrics_test.go index 3bc4fc703..41a3eb9ae 100644 --- a/pkg/epp/backend/vllm/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -18,7 +18,6 @@ package metrics import ( "context" - "errors" "fmt" "reflect" @@ -28,10 +27,10 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" + "go.uber.org/multierr" "google.golang.org/protobuf/proto" "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -377,13 +376,12 @@ func TestGetLatestLoraMetric(t *testing.T) { func TestPromToPodMetrics(t *testing.T) { logger := logutil.NewTestLogger() - tests := []struct { name string metricFamilies map[string]*dto.MetricFamily mapping *MetricMapping - existingMetrics *datastore.PodMetrics - expectedMetrics *datastore.PodMetrics + existingMetrics *Metrics + expectedMetrics *Metrics expectedErr error // Count of expected errors }{ { @@ -398,7 +396,7 @@ func TestPromToPodMetrics(t *testing.T) { makeMetric("vllm_usage", nil, 0.7, 500), ), "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000), + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000), ), }, mapping: &MetricMapping{ @@ -406,30 +404,12 @@ func TestPromToPodMetrics(t *testing.T) { KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, LoraRequestInfo: &MetricSpec{MetricName: 
"vllm:lora_requests_info"}, }, - existingMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{}, // Initialize with empty Metrics - }, - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - WaitingQueueSize: 7, - KVCacheUsagePercent: 0.8, - ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, - MaxActiveModels: 3, - }, + existingMetrics: &Metrics{}, + expectedMetrics: &Metrics{ + WaitingQueueSize: 7, + KVCacheUsagePercent: 0.8, + ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, + MaxActiveModels: 3, }, }, { @@ -440,9 +420,9 @@ func TestPromToPodMetrics(t *testing.T) { KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, - existingMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, - expectedMetrics: &datastore.PodMetrics{Metrics: datastore.Metrics{ActiveModels: map[string]int{}}}, - expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), + existingMetrics: &Metrics{ActiveModels: map[string]int{}}, + expectedMetrics: &Metrics{ActiveModels: map[string]int{}}, + expectedErr: multierr.Combine(fmt.Errorf("metric family \"vllm_waiting\" not found"), fmt.Errorf("metric family \"vllm_usage\" not found"), fmt.Errorf("metric family \"vllm:lora_requests_info\" not found")), }, { name: "partial metrics available + LoRA", @@ -451,7 +431,7 @@ func TestPromToPodMetrics(t *testing.T) { makeMetric("vllm_usage", nil, 0.8, 2000), // Only usage is present ), "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", 
"waiting_lora_adapters": "lora3", "max_lora": "3"}, 5.0, 3000), + makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000), ), }, mapping: &MetricMapping{ @@ -459,32 +439,14 @@ func TestPromToPodMetrics(t *testing.T) { KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, - existingMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{}, // Initialize with empty Metrics - }, - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.8, - ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, - MaxActiveModels: 3, - }, + existingMetrics: &Metrics{}, + expectedMetrics: &Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.8, + ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, + MaxActiveModels: 3, }, - expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), + expectedErr: fmt.Errorf("metric family \"vllm_waiting\" not found"), }, { name: "invalid max lora", @@ -496,31 +458,13 @@ func TestPromToPodMetrics(t *testing.T) { mapping: &MetricMapping{ LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, - existingMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: datastore.Metrics{}, - }, - expectedMetrics: &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, - }, - Metrics: 
datastore.Metrics{ - ActiveModels: map[string]int{"lora1": 0}, - MaxActiveModels: 0, // Should still default to 0. - - }, + existingMetrics: &Metrics{}, + expectedMetrics: &Metrics{ + ActiveModels: map[string]int{"lora1": 0}, + MaxActiveModels: 0, // Should still default to 0. + }, - expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), + expectedErr: errors.New("strconv.Atoi: parsing \"invalid\": invalid syntax"), }, } @@ -530,6 +474,7 @@ func TestPromToPodMetrics(t *testing.T) { updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics) if tc.expectedErr != nil { assert.Error(t, err) + assert.EqualError(t, err, tc.expectedErr.Error()) } else { assert.NoError(t, err) assert.Equal(t, tc.expectedMetrics, updated) @@ -538,25 +483,21 @@ func TestPromToPodMetrics(t *testing.T) { } } -// TestFetchMetrics is a basic integration test. A more complete test would mock -// the HTTP client. +// TestFetchMetrics is a basic integration test. It assumes +// there's no server running on the specified port. func TestFetchMetrics(t *testing.T) { - // This test is very basic as it doesn't mock the HTTP client. It assumes - // there's no server running on the specified port. A real-world test - // suite should use a mock server. ctx := logutil.NewTestLoggerIntoContext(context.Background()) - existing := &datastore.PodMetrics{ - Pod: datastore.Pod{ - Address: "127.0.0.1", - NamespacedName: types.NamespacedName{ - Namespace: "test", - Name: "pod", - }, + pod := &Pod{ + Address: "127.0.0.1", + NamespacedName: types.NamespacedName{ + Namespace: "test", + Name: "pod", }, } + existing := &Metrics{} p := &PodMetricsClientImpl{} // No MetricMapping needed for this basic test - _, err := p.FetchMetrics(ctx, existing, 9999) // Use a port that's unlikely to be in use. + _, err := p.FetchMetrics(ctx, pod, existing, 9999) // Use a port that's unlikely to be in use. 
if err == nil { t.Errorf("FetchMetrics() expected error, got nil") } diff --git a/pkg/epp/backend/provider.go b/pkg/epp/backend/provider.go deleted file mode 100644 index 959f3e0c9..000000000 --- a/pkg/epp/backend/provider.go +++ /dev/null @@ -1,183 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package backend - -import ( - "context" - "fmt" - "sync" - "time" - - "github.com/go-logr/logr" - "go.uber.org/multierr" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -const ( - fetchMetricsTimeout = 5 * time.Second -) - -func NewProvider(pmc PodMetricsClient, datastore datastore.Datastore) *Provider { - p := &Provider{ - pmc: pmc, - datastore: datastore, - } - return p -} - -// Provider provides backend pods and information such as metrics. 
-type Provider struct { - pmc PodMetricsClient - datastore datastore.Datastore -} - -type PodMetricsClient interface { - FetchMetrics(ctx context.Context, existing *datastore.PodMetrics, port int32) (*datastore.PodMetrics, error) -} - -func (p *Provider) Init(ctx context.Context, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error { - // periodically refresh metrics - logger := log.FromContext(ctx) - go func() { - for { - select { - case <-ctx.Done(): - logger.V(logutil.DEFAULT).Info("Shutting down metrics prober") - return - default: - time.Sleep(refreshMetricsInterval) - if err := p.refreshMetricsOnce(logger); err != nil { - logger.V(logutil.DEFAULT).Error(err, "Failed to refresh metrics") - } - } - } - }() - - // Periodically flush prometheus metrics for inference pool - go func() { - for { - select { - case <-ctx.Done(): - logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread") - return - default: - time.Sleep(refreshPrometheusMetricsInterval) - p.flushPrometheusMetricsOnce(logger) - } - } - }() - - // Periodically print out the pods and metrics for DEBUGGING. 
- if logger := logger.V(logutil.DEBUG); logger.Enabled() { - go func() { - for { - select { - case <-ctx.Done(): - logger.V(logutil.DEFAULT).Info("Shutting down metrics logger thread") - return - default: - time.Sleep(5 * time.Second) - logger.Info("Current Pods and metrics gathered", "metrics", p.datastore.PodGetAll()) - } - } - }() - } - - return nil -} - -func (p *Provider) refreshMetricsOnce(logger logr.Logger) error { - loggerTrace := logger.V(logutil.TRACE) - pool, _ := p.datastore.PoolGet() - if pool == nil { - loggerTrace.Info("No inference pool or not initialized") - return nil - } - ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout) - defer cancel() - start := time.Now() - defer func() { - d := time.Since(start) - // TODO: add a metric instead of logging - loggerTrace.Info("Metrics refreshed", "duration", d) - }() - - var wg sync.WaitGroup - errCh := make(chan error) - processOnePod := func(key, value any) bool { - loggerTrace.Info("Pod and metric being processed", "pod", key, "metric", value) - existing := value.(*datastore.PodMetrics) - wg.Add(1) - go func() { - defer wg.Done() - updated, err := p.pmc.FetchMetrics(ctx, existing, pool.Spec.TargetPortNumber) - if err != nil { - errCh <- fmt.Errorf("failed to parse metrics from %s: %v", existing.NamespacedName, err) - return - } - p.datastore.PodUpdateMetricsIfExist(updated.NamespacedName, &updated.Metrics) - loggerTrace.Info("Updated metrics for pod", "pod", updated.NamespacedName, "metrics", updated.Metrics) - }() - return true - } - p.datastore.PodRange(processOnePod) - - // Wait for metric collection for all pods to complete and close the error channel in a - // goroutine so this is unblocking, allowing the code to proceed to the error collection code - // below. - // Note we couldn't use a buffered error channel with a size because the size of the podMetrics - // sync.Map is unknown beforehand. 
- go func() { - wg.Wait() - close(errCh) - }() - - var errs error - for err := range errCh { - errs = multierr.Append(errs, err) - } - return errs -} - -func (p *Provider) flushPrometheusMetricsOnce(logger logr.Logger) { - pool, _ := p.datastore.PoolGet() - if pool == nil { - // No inference pool or not initialize. - return - } - - var kvCacheTotal float64 - var queueTotal int - - podMetrics := p.datastore.PodGetAll() - logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics)) - if len(podMetrics) == 0 { - return - } - - for _, pod := range podMetrics { - kvCacheTotal += pod.KVCacheUsagePercent - queueTotal += pod.WaitingQueueSize - } - - podTotalCount := len(podMetrics) - metrics.RecordInferencePoolAvgKVCache(pool.Name, kvCacheTotal/float64(podTotalCount)) - metrics.RecordInferencePoolAvgQueueSize(pool.Name, float64(queueTotal/podTotalCount)) -} From 97fd0defbd2d2f2a92eaa1ca9b406e47a311ac23 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 14 Mar 2025 02:27:01 +0000 Subject: [PATCH 13/19] Merge getLatestMetric and getLabeledMetric. --- pkg/epp/backend/metrics/metrics.go | 37 ++++++++---------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index cc988758f..67baf853e 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -219,39 +219,22 @@ func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[ if len(mf.GetMetric()) == 0 { return nil, fmt.Errorf("no metrics available for %q", spec.MetricName) } - // if there is a specified label, return only that metric in the family - if spec.Labels != nil { - return getLabeledMetric(logger, mf, &spec) - } - return getLatestMetric(logger, mf) -} -// getLatestMetric gets the latest metric of a family (for metrics without labels). 
-func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily) (*dto.Metric, error) { - var latestTs int64 = -1 - var latest *dto.Metric - for _, m := range mf.GetMetric() { - if m.GetTimestampMs() >= latestTs { - latestTs = m.GetTimestampMs() - latest = m - } - } - - if latest == nil { - return nil, fmt.Errorf("no metrics found for %q", mf.GetName()) - } - - logger.V(logutil.TRACE).Info("Latest metric value selected", "value", latest, "metric", mf.GetName()) - return latest, nil + return getLatestMetric(logger, mf, &spec) } // getLabeledMetric gets the latest metric with matching labels. -func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec) (*dto.Metric, error) { +func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec) (*dto.Metric, error) { var latestMetric *dto.Metric var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater + var labels map[string]string = nil + if spec.Labels != nil { + labels = spec.Labels + } + for _, m := range mf.GetMetric() { - if spec == nil || labelsMatch(m.GetLabel(), spec.Labels) { + if labels == nil || labelsMatch(m.GetLabel(), spec.Labels) { if m.GetTimestampMs() > latestTimestamp { latestTimestamp = m.GetTimestampMs() latestMetric = m @@ -260,11 +243,11 @@ func getLabeledMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec } if latestMetric != nil { - logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "metric", spec.MetricName) + logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "name", spec.MetricName) return latestMetric, nil } - return nil, fmt.Errorf("no matching labeled metric found for %q with labels %v", spec.MetricName, spec.Labels) + return nil, fmt.Errorf("no matching metric found for %q with labels %+v", spec.MetricName, labels) } // labelsMatch checks if a metric's labels contain all the labels in the spec. 
From 27b34e9410ec8a7e535376a24ce786fc0d5f7d54 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 14 Mar 2025 02:34:48 +0000 Subject: [PATCH 14/19] Remove unused datastore types. --- pkg/epp/datastore/types.go | 67 -------------------------------------- 1 file changed, 67 deletions(-) delete mode 100644 pkg/epp/datastore/types.go diff --git a/pkg/epp/datastore/types.go b/pkg/epp/datastore/types.go deleted file mode 100644 index b87b1c0ae..000000000 --- a/pkg/epp/datastore/types.go +++ /dev/null @@ -1,67 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package datastore is a library to interact with backend model servers such as probing metrics. -package datastore - -import ( - "fmt" - - "k8s.io/apimachinery/pkg/types" -) - -type Pod struct { - NamespacedName types.NamespacedName - Address string -} - -type Metrics struct { - // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. - ActiveModels map[string]int - // MaxActiveModels is the maximum number of models that can be loaded to GPU. 
- MaxActiveModels int - WaitingQueueSize int - KVCacheUsagePercent float64 -} - -type PodMetrics struct { - Pod - Metrics -} - -func (pm *PodMetrics) String() string { - return fmt.Sprintf("Pod: %+v; Address: %+v; Metrics: %+v", pm.NamespacedName, pm.Address, pm.Metrics) -} - -func (pm *PodMetrics) Clone() *PodMetrics { - cm := make(map[string]int, len(pm.ActiveModels)) - for k, v := range pm.ActiveModels { - cm[k] = v - } - clone := &PodMetrics{ - Pod: Pod{ - NamespacedName: pm.NamespacedName, - Address: pm.Address, - }, - Metrics: Metrics{ - ActiveModels: cm, - MaxActiveModels: pm.MaxActiveModels, - WaitingQueueSize: pm.WaitingQueueSize, - KVCacheUsagePercent: pm.KVCacheUsagePercent, - }, - } - return clone -} From 4b84744e75dc6a18a80ffe5d7a46e333854ed903 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 14 Mar 2025 02:50:41 +0000 Subject: [PATCH 15/19] Fix lint. --- pkg/epp/backend/metrics/metrics.go | 2 +- pkg/epp/backend/metrics/metrics_spec.go | 2 +- pkg/epp/backend/metrics/metrics_test.go | 49 ++++++++++++------------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index 67baf853e..b3cfcea77 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -184,7 +184,7 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam hasRequiredLabels = true } } - //Skip if it does not have the lora labels + // Skip if it does not have the lora labels if !hasRequiredLabels { continue } diff --git a/pkg/epp/backend/metrics/metrics_spec.go b/pkg/epp/backend/metrics/metrics_spec.go index bd8f39ccf..ce0c075dd 100644 --- a/pkg/epp/backend/metrics/metrics_spec.go +++ b/pkg/epp/backend/metrics/metrics_spec.go @@ -79,7 +79,7 @@ func stringToMetricSpec(specStr string) (*MetricSpec, error) { } - if metricName == "" { //Metric name cannot be empty + if metricName == "" { // Metric name cannot be empty return nil, 
fmt.Errorf("empty metric name in spec: %q", specStr) } diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go index 41a3eb9ae..455758d99 100644 --- a/pkg/epp/backend/metrics/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -19,7 +19,6 @@ package metrics import ( "context" "errors" - "fmt" "reflect" "strconv" "strings" @@ -36,7 +35,7 @@ import ( // --- Test Helpers --- -func makeMetric(metricName string, labels map[string]string, value float64, timestampMs int64) *dto.Metric { +func makeMetric(labels map[string]string, value float64, timestampMs int64) *dto.Metric { labelPairs := []*dto.LabelPair{} for k, v := range labels { labelPairs = append(labelPairs, &dto.LabelPair{Name: proto.String(k), Value: proto.String(v)}) @@ -63,16 +62,16 @@ func TestGetMetric(t *testing.T) { metricFamilies := map[string]*dto.MetricFamily{ "metric1": makeMetricFamily("metric1", - makeMetric("metric1", map[string]string{"label1": "value1"}, 1.0, 1000), - makeMetric("metric1", map[string]string{"label1": "value2"}, 2.0, 2000), + makeMetric(map[string]string{"label1": "value1"}, 1.0, 1000), + makeMetric(map[string]string{"label1": "value2"}, 2.0, 2000), ), "metric2": makeMetricFamily("metric2", - makeMetric("metric2", map[string]string{"labelA": "A1", "labelB": "B1"}, 3.0, 1500), - makeMetric("metric2", map[string]string{"labelA": "A2", "labelB": "B2"}, 4.0, 2500), + makeMetric(map[string]string{"labelA": "A1", "labelB": "B1"}, 3.0, 1500), + makeMetric(map[string]string{"labelA": "A2", "labelB": "B2"}, 4.0, 2500), ), "metric3": makeMetricFamily("metric3", - makeMetric("metric3", map[string]string{}, 5.0, 3000), - makeMetric("metric3", map[string]string{}, 6.0, 1000), + makeMetric(map[string]string{}, 5.0, 3000), + makeMetric(map[string]string{}, 6.0, 1000), ), } @@ -256,12 +255,12 @@ func TestGetLatestLoraMetric(t *testing.T) { name: "no lora metrics", metricFamilies: map[string]*dto.MetricFamily{ "some_other_metric": 
makeMetricFamily("some_other_metric", - makeMetric("some_other_metric", nil, 1.0, 1000), + makeMetric(nil, 1.0, 1000), ), }, expectedAdapters: nil, expectedMax: 0, - expectedErr: fmt.Errorf("metric family \"vllm:lora_requests_info\" not found"), // Expect an error because the family is missing + expectedErr: errors.New("metric family \"vllm:lora_requests_info\" not found"), // Expect an error because the family is missing mapping: &MetricMapping{ LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, @@ -270,8 +269,8 @@ func TestGetLatestLoraMetric(t *testing.T) { name: "basic lora metrics", metricFamilies: map[string]*dto.MetricFamily{ "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 3000.0, 1000), // Newer - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 1000.0, 1000), // Older + makeMetric(map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 3000.0, 1000), // Newer + makeMetric(map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 1000.0, 1000), // Older ), }, @@ -286,7 +285,7 @@ func TestGetLatestLoraMetric(t *testing.T) { name: "no matching lora metrics", metricFamilies: map[string]*dto.MetricFamily{ "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"other_label": "value"}, 5.0, 3000), + makeMetric(map[string]string{"other_label": "value"}, 5.0, 3000), ), }, expectedAdapters: nil, @@ -300,8 +299,8 @@ func TestGetLatestLoraMetric(t *testing.T) { name: "no lora metrics if not in MetricMapping", metricFamilies: map[string]*dto.MetricFamily{ "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 5.0, 3000), - 
makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 6.0, 1000), + makeMetric(map[string]string{"running_lora_adapters": "lora1", "max_lora": "2"}, 5.0, 3000), + makeMetric(map[string]string{"running_lora_adapters": "lora2,lora3", "max_lora": "4"}, 6.0, 1000), ), }, expectedAdapters: nil, @@ -388,15 +387,15 @@ func TestPromToPodMetrics(t *testing.T) { name: "vllm metrics", metricFamilies: map[string]*dto.MetricFamily{ "vllm_waiting": makeMetricFamily("vllm_waiting", - makeMetric("vllm_waiting", nil, 5.0, 1000), - makeMetric("vllm_waiting", nil, 7.0, 2000), // Newer + makeMetric(nil, 5.0, 1000), + makeMetric(nil, 7.0, 2000), // Newer ), "vllm_usage": makeMetricFamily("vllm_usage", - makeMetric("vllm_usage", nil, 0.8, 2000), - makeMetric("vllm_usage", nil, 0.7, 500), + makeMetric(nil, 0.8, 2000), + makeMetric(nil, 0.7, 500), ), "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000), + makeMetric(map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000), ), }, mapping: &MetricMapping{ @@ -422,16 +421,16 @@ func TestPromToPodMetrics(t *testing.T) { }, existingMetrics: &Metrics{ActiveModels: map[string]int{}}, expectedMetrics: &Metrics{ActiveModels: map[string]int{}}, - expectedErr: multierr.Combine(fmt.Errorf("metric family \"vllm_waiting\" not found"), fmt.Errorf("metric family \"vllm_usage\" not found"), fmt.Errorf("metric family \"vllm:lora_requests_info\" not found")), + expectedErr: multierr.Combine(errors.New("metric family \"vllm_waiting\" not found"), errors.New("metric family \"vllm_usage\" not found"), errors.New("metric family \"vllm:lora_requests_info\" not found")), }, { name: "partial metrics available + LoRA", metricFamilies: 
map[string]*dto.MetricFamily{ "vllm_usage": makeMetricFamily("vllm_usage", - makeMetric("vllm_usage", nil, 0.8, 2000), // Only usage is present + makeMetric(nil, 0.8, 2000), // Only usage is present ), "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000), + makeMetric(map[string]string{"running_lora_adapters": "lora1,lora2", "waiting_lora_adapters": "lora3", "max_lora": "3"}, 3000.0, 1000), ), }, mapping: &MetricMapping{ @@ -446,13 +445,13 @@ func TestPromToPodMetrics(t *testing.T) { ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, MaxActiveModels: 3, }, - expectedErr: fmt.Errorf("metric family \"vllm_waiting\" not found"), + expectedErr: errors.New("metric family \"vllm_waiting\" not found"), }, { name: "invalid max lora", metricFamilies: map[string]*dto.MetricFamily{ "vllm:lora_requests_info": makeMetricFamily("vllm:lora_requests_info", - makeMetric("vllm:lora_requests_info", map[string]string{"running_lora_adapters": "lora1", "max_lora": "invalid"}, 3000.0, 1000), + makeMetric(map[string]string{"running_lora_adapters": "lora1", "max_lora": "invalid"}, 3000.0, 1000), ), }, mapping: &MetricMapping{ From 66e0376cb3046d487f190fc702cd28646b697bc1 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 14 Mar 2025 17:06:43 +0000 Subject: [PATCH 16/19] Remove log and fix nits. 
--- pkg/epp/backend/metrics/metrics.go | 26 ++++++++----------------- pkg/epp/backend/metrics/metrics_test.go | 2 +- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index b3cfcea77..714a44f11 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -22,7 +22,6 @@ import ( "net/http" "strconv" "strings" - "time" "github.com/go-logr/logr" dto "github.com/prometheus/client_model/go" @@ -113,7 +112,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics( // Handle LoRA metrics (only if all LoRA MetricSpecs are present) if p.MetricMapping.LoraRequestInfo != nil { - loraMetrics, _, err := p.getLatestLoraMetric(logger, metricFamilies) + loraMetrics, err := p.getLatestLoraMetric(logger, metricFamilies) errs = multierr.Append(errs, err) if loraMetrics != nil { @@ -154,15 +153,15 @@ func (p *PodMetricsClientImpl) promToPodMetrics( // reason its specially fetched is because each label key value pair permutation generates new series // and only most recent is useful. The value of each series is the creation timestamp so we can // retrieve the latest by sorting the value. 
-func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { +func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, error) { if p.MetricMapping.LoraRequestInfo == nil { - return nil, time.Time{}, nil // No LoRA metrics configured + return nil, nil // No LoRA metrics configured } loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName] if !ok { logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName) - return nil, time.Time{}, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName) + return nil, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName) } var latest *dto.Metric @@ -200,19 +199,16 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam } } if latest == nil { - logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", p.MetricMapping.LoraRequestInfo.MetricName) - return nil, time.Time{}, nil + return nil, nil } - // Convert the gauge value (creation timestamp) to time.Time. - return latest, time.Unix(0, int64(latestTs*1e9)), nil // Convert nanoseconds to time.Time + return latest, nil // Convert nanoseconds to time.Time } // getMetric retrieves a specific metric based on MetricSpec. 
func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { mf, ok := metricFamilies[spec.MetricName] if !ok { - logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", spec.MetricName) return nil, fmt.Errorf("metric family %q not found", spec.MetricName) } @@ -228,13 +224,8 @@ func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec) var latestMetric *dto.Metric var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater - var labels map[string]string = nil - if spec.Labels != nil { - labels = spec.Labels - } - for _, m := range mf.GetMetric() { - if labels == nil || labelsMatch(m.GetLabel(), spec.Labels) { + if spec.Labels == nil || labelsMatch(m.GetLabel(), spec.Labels) { if m.GetTimestampMs() > latestTimestamp { latestTimestamp = m.GetTimestampMs() latestMetric = m @@ -243,11 +234,10 @@ func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec) } if latestMetric != nil { - logger.V(logutil.TRACE).Info("Labeled metric found", "value", latestMetric, "name", spec.MetricName) return latestMetric, nil } - return nil, fmt.Errorf("no matching metric found for %q with labels %+v", spec.MetricName, labels) + return nil, fmt.Errorf("no matching metric found for %q with labels %+v", spec.MetricName, spec.Labels) } // labelsMatch checks if a metric's labels contain all the labels in the spec. 
diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go index 455758d99..d2e637fc7 100644 --- a/pkg/epp/backend/metrics/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -314,7 +314,7 @@ func TestGetLatestLoraMetric(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { p := &PodMetricsClientImpl{MetricMapping: tc.mapping} - loraMetric, _, err := p.getLatestLoraMetric(logger, tc.metricFamilies) + loraMetric, err := p.getLatestLoraMetric(logger, tc.metricFamilies) if tc.expectedErr != nil { if err == nil || err.Error() != tc.expectedErr.Error() { From 9f4859b2f4cd49d720b8b6f1a60d71ab5b5ee42e Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 14 Mar 2025 17:45:35 +0000 Subject: [PATCH 17/19] Move ext_proc and inferencemodel yaml files back, fix nits and remove all logging from metrics.go. --- cmd/epp/main.go | 8 ++-- config/manifests/{vllm => }/ext_proc.yaml | 0 .../manifests/{vllm => }/inferencemodel.yaml | 0 pkg/epp/backend/metrics/metrics.go | 45 +++++++------------ pkg/epp/backend/metrics/metrics_test.go | 6 +-- 5 files changed, 22 insertions(+), 37 deletions(-) rename config/manifests/{vllm => }/ext_proc.yaml (100%) rename config/manifests/{vllm => }/inferencemodel.yaml (100%) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 634cda4a2..fa63f0bce 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -92,14 +92,14 @@ var ( "are assumed to be named tls.crt and tls.key, respectively. 
If not set, and secureServing is enabled, "+ "then a self-signed certificate is used.") // metric flags - totalQueuedRequestMetric = flag.String("totalQueuedRequestMetric", + totalQueuedRequestsMetric = flag.String("totalQueuedRequestsMetric", "vllm:num_requests_waiting", "Prometheus metric for the number of queued requests.") kvCacheUsagePercentageMetric = flag.String("kvCacheUsagePercentageMetric", "vllm:gpu_cache_usage_perc", "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") // LoRA metrics - loraRequestInfoMetric = flag.String("loraRequestInfoMetric", + loraInfoMetric = flag.String("loraInfoMetric", "vllm:lora_requests_info", "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") @@ -155,9 +155,9 @@ func run() error { // Set up mapper for metric scraping. mapping, err := backendmetrics.NewMetricMapping( - *totalQueuedRequestMetric, + *totalQueuedRequestsMetric, *kvCacheUsagePercentageMetric, - *loraRequestInfoMetric, + *loraInfoMetric, ) if err != nil { setupLog.Error(err, "Failed to create metric mapping from flags.") diff --git a/config/manifests/vllm/ext_proc.yaml b/config/manifests/ext_proc.yaml similarity index 100% rename from config/manifests/vllm/ext_proc.yaml rename to config/manifests/ext_proc.yaml diff --git a/config/manifests/vllm/inferencemodel.yaml b/config/manifests/inferencemodel.yaml similarity index 100% rename from config/manifests/vllm/inferencemodel.yaml rename to config/manifests/inferencemodel.yaml diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index 714a44f11..7de3d9031 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -28,14 +28,13 @@ import ( "github.com/prometheus/common/expfmt" "go.uber.org/multierr" "sigs.k8s.io/controller-runtime/pkg/log" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( // LoRA metrics based on protocol - 
LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" - LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters" - LoraRequestInfoMaxAdaptersMetricName = "max_lora" + LoraInfoRunningAdaptersMetricName = "running_lora_adapters" + LoraInfoWaitingAdaptersMetricName = "waiting_lora_adapters" + LoraInfoMaxAdaptersMetricName = "max_lora" ) type PodMetricsClientImpl struct { @@ -50,7 +49,6 @@ func (p *PodMetricsClientImpl) FetchMetrics( port int32, ) (*Metrics, error) { logger := log.FromContext(ctx) - loggerDefault := logger.V(logutil.DEFAULT) // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. @@ -58,12 +56,10 @@ func (p *PodMetricsClientImpl) FetchMetrics( req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { - loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) return nil, fmt.Errorf("failed to create request: %v", err) } resp, err := http.DefaultClient.Do(req) if err != nil { - loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName) return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err) } defer func() { @@ -71,7 +67,6 @@ func (p *PodMetricsClientImpl) FetchMetrics( }() if resp.StatusCode != http.StatusOK { - loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode) return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode) } @@ -93,7 +88,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics( updated := existing.Clone() if p.MetricMapping.TotalQueuedRequests != nil { - queued, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.TotalQueuedRequests) + queued, err := p.getMetric(metricFamilies, *p.MetricMapping.TotalQueuedRequests) if err == nil { updated.WaitingQueueSize = 
int(queued.GetGauge().GetValue()) } else { @@ -102,7 +97,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics( } if p.MetricMapping.KVCacheUtilization != nil { - usage, err := p.getMetric(logger, metricFamilies, *p.MetricMapping.KVCacheUtilization) + usage, err := p.getMetric(metricFamilies, *p.MetricMapping.KVCacheUtilization) if err == nil { updated.KVCacheUsagePercent = usage.GetGauge().GetValue() } else { @@ -112,13 +107,13 @@ func (p *PodMetricsClientImpl) promToPodMetrics( // Handle LoRA metrics (only if all LoRA MetricSpecs are present) if p.MetricMapping.LoraRequestInfo != nil { - loraMetrics, err := p.getLatestLoraMetric(logger, metricFamilies) + loraMetrics, err := p.getLatestLoraMetric(metricFamilies) errs = multierr.Append(errs, err) if loraMetrics != nil { updated.ActiveModels = make(map[string]int) for _, label := range loraMetrics.GetLabel() { - if label.GetName() == LoraRequestInfoRunningAdaptersMetricName { + if label.GetName() == LoraInfoRunningAdaptersMetricName { if label.GetValue() != "" { adapterList := strings.Split(label.GetValue(), ",") for _, adapter := range adapterList { @@ -126,7 +121,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics( } } } - if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName { + if label.GetName() == LoraInfoWaitingAdaptersMetricName { if label.GetValue() != "" { adapterList := strings.Split(label.GetValue(), ",") for _, adapter := range adapterList { @@ -134,7 +129,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics( } } } - if label.GetName() == LoraRequestInfoMaxAdaptersMetricName { + if label.GetName() == LoraInfoMaxAdaptersMetricName { if label.GetValue() != "" { updated.MaxActiveModels, err = strconv.Atoi(label.GetValue()) if err != nil { @@ -153,14 +148,13 @@ func (p *PodMetricsClientImpl) promToPodMetrics( // reason its specially fetched is because each label key value pair permutation generates new series // and only most recent is useful. 
The value of each series is the creation timestamp so we can // retrieve the latest by sorting the value. -func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, error) { +func (p *PodMetricsClientImpl) getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, error) { if p.MetricMapping.LoraRequestInfo == nil { return nil, nil // No LoRA metrics configured } loraRequests, ok := metricFamilies[p.MetricMapping.LoraRequestInfo.MetricName] if !ok { - logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", p.MetricMapping.LoraRequestInfo.MetricName) return nil, fmt.Errorf("metric family %q not found", p.MetricMapping.LoraRequestInfo.MetricName) } @@ -171,22 +165,15 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam for _, m := range loraRequests.GetMetric() { running := "" waiting := "" - // Check if the metric has the expected LoRA labels. This is important! - hasRequiredLabels := false + // Check if the metric has the expected LoRA labels. for _, lp := range m.GetLabel() { switch lp.GetName() { - case LoraRequestInfoRunningAdaptersMetricName: + case LoraInfoRunningAdaptersMetricName: running = lp.GetValue() - hasRequiredLabels = true - case LoraRequestInfoWaitingAdaptersMetricName: + case LoraInfoWaitingAdaptersMetricName: waiting = lp.GetValue() - hasRequiredLabels = true } } - // Skip if it does not have the lora labels - if !hasRequiredLabels { - continue - } // Ignore metrics with both labels empty. if running == "" && waiting == "" { continue @@ -206,7 +193,7 @@ func (p *PodMetricsClientImpl) getLatestLoraMetric(logger logr.Logger, metricFam } // getMetric retrieves a specific metric based on MetricSpec. 
-func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { +func (p *PodMetricsClientImpl) getMetric(metricFamilies map[string]*dto.MetricFamily, spec MetricSpec) (*dto.Metric, error) { mf, ok := metricFamilies[spec.MetricName] if !ok { return nil, fmt.Errorf("metric family %q not found", spec.MetricName) @@ -216,11 +203,11 @@ func (p *PodMetricsClientImpl) getMetric(logger logr.Logger, metricFamilies map[ return nil, fmt.Errorf("no metrics available for %q", spec.MetricName) } - return getLatestMetric(logger, mf, &spec) + return getLatestMetric(mf, &spec) } // getLabeledMetric gets the latest metric with matching labels. -func getLatestMetric(logger logr.Logger, mf *dto.MetricFamily, spec *MetricSpec) (*dto.Metric, error) { +func getLatestMetric(mf *dto.MetricFamily, spec *MetricSpec) (*dto.Metric, error) { var latestMetric *dto.Metric var latestTimestamp int64 = -1 // Initialize to -1 so any timestamp is greater diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go index d2e637fc7..0a1e2cd79 100644 --- a/pkg/epp/backend/metrics/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -58,7 +58,6 @@ func makeMetricFamily(name string, metrics ...*dto.Metric) *dto.MetricFamily { // --- Tests --- func TestGetMetric(t *testing.T) { - logger := logutil.NewTestLogger() metricFamilies := map[string]*dto.MetricFamily{ "metric1": makeMetricFamily("metric1", @@ -168,7 +167,7 @@ func TestGetMetric(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - gotMetric, err := p.getMetric(logger, metricFamilies, tt.spec) + gotMetric, err := p.getMetric(metricFamilies, tt.spec) if tt.wantError { if err == nil { @@ -241,7 +240,6 @@ func TestLabelsMatch(t *testing.T) { } func TestGetLatestLoraMetric(t *testing.T) { - logger := logutil.NewTestLogger() testCases := []struct { name string @@ -314,7 +312,7 @@ func 
TestGetLatestLoraMetric(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { p := &PodMetricsClientImpl{MetricMapping: tc.mapping} - loraMetric, err := p.getLatestLoraMetric(logger, tc.metricFamilies) + loraMetric, err := p.getLatestLoraMetric(tc.metricFamilies) if tc.expectedErr != nil { if err == nil || err.Error() != tc.expectedErr.Error() { From c082e869436b647d0e35af15ed97a83767624f7a Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 14 Mar 2025 18:00:51 +0000 Subject: [PATCH 18/19] Remove the rest of logging from metrics.go and tests. --- pkg/epp/backend/metrics/metrics.go | 6 +----- pkg/epp/backend/metrics/metrics_test.go | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index 7de3d9031..be732e78e 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -23,11 +23,9 @@ import ( "strconv" "strings" - "github.com/go-logr/logr" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "go.uber.org/multierr" - "sigs.k8s.io/controller-runtime/pkg/log" ) const ( @@ -48,7 +46,6 @@ func (p *PodMetricsClientImpl) FetchMetrics( existing *Metrics, port int32, ) (*Metrics, error) { - logger := log.FromContext(ctx) // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. @@ -75,12 +72,11 @@ func (p *PodMetricsClientImpl) FetchMetrics( if err != nil { return nil, err } - return p.promToPodMetrics(logger, metricFamilies, existing) + return p.promToPodMetrics(metricFamilies, existing) } // promToPodMetrics updates internal pod metrics with scraped Prometheus metrics. 
func (p *PodMetricsClientImpl) promToPodMetrics( - logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, existing *Metrics, ) (*Metrics, error) { diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go index 0a1e2cd79..d0396bf74 100644 --- a/pkg/epp/backend/metrics/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -372,7 +372,6 @@ func TestGetLatestLoraMetric(t *testing.T) { } func TestPromToPodMetrics(t *testing.T) { - logger := logutil.NewTestLogger() tests := []struct { name string metricFamilies map[string]*dto.MetricFamily @@ -468,7 +467,7 @@ func TestPromToPodMetrics(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { p := &PodMetricsClientImpl{MetricMapping: tc.mapping} - updated, err := p.promToPodMetrics(logger, tc.metricFamilies, tc.existingMetrics) + updated, err := p.promToPodMetrics(tc.metricFamilies, tc.existingMetrics) if tc.expectedErr != nil { assert.Error(t, err) assert.EqualError(t, err, tc.expectedErr.Error()) From 81ee1e6b66ff371daa7f080a3d9d2aef5785784a Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 14 Mar 2025 18:27:55 +0000 Subject: [PATCH 19/19] Add trace log to podmetrics and small warning fix to metrics_spec_test. 
--- pkg/epp/backend/metrics/metrics_spec_test.go | 2 +- pkg/epp/backend/metrics/pod_metrics.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/epp/backend/metrics/metrics_spec_test.go b/pkg/epp/backend/metrics/metrics_spec_test.go index 8de6dac29..828042065 100644 --- a/pkg/epp/backend/metrics/metrics_spec_test.go +++ b/pkg/epp/backend/metrics/metrics_spec_test.go @@ -159,7 +159,7 @@ func TestStringToMetricSpec(t *testing.T) { } } else { if got == nil { - t.Errorf("stringToMetricSpec() = got nil but wanted %v", tt.want) + t.Fatalf("stringToMetricSpec() = got nil but wanted %v", tt.want) } if !reflect.DeepEqual(got.MetricName, tt.want.MetricName) { t.Errorf("stringToMetricSpec() got MetricName = %v, want %v", got.MetricName, tt.want.MetricName) diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go index b954a98ce..01db14bec 100644 --- a/pkg/epp/backend/metrics/pod_metrics.go +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -115,6 +115,7 @@ func (pm *podMetrics) refreshMetrics() error { defer cancel() updated, err := pm.pmc.FetchMetrics(ctx, pm.GetPod(), pm.GetMetrics(), pool.Spec.TargetPortNumber) if err != nil { + pm.logger.V(logutil.TRACE).Info("Failed to refreshed metrics:", "err", err) // As refresher is running in the background, it's possible that the pod is deleted but // the refresh goroutine doesn't read the done channel yet. In this case, we just return nil. // The refresher will be stopped after this interval.