
Commit 5c66101

Merge pull request kubernetes-sigs#47 from neuralmagic/upstream-main
Merging upstream main commits
2 parents 79cca51 + d935a7c


46 files changed (+1671/-438)

README.md (+4)

@@ -1,3 +1,7 @@
+[![Go Report Card](https://goreportcard.com/badge/sigs.k8s.io/gateway-api-inference-extension)](https://goreportcard.com/report/sigs.k8s.io/gateway-api-inference-extension)
+[![Go Reference](https://pkg.go.dev/badge/sigs.k8s.io/gateway-api-inference-extension.svg)](https://pkg.go.dev/sigs.k8s.io/gateway-api-inference-extension)
+[![License](https://img.shields.io/github/license/kubernetes-sigs/gateway-api-inference-extension)](/LICENSE)
+
 # Gateway API Inference Extension
 
 This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee.

api/v1alpha2/inferencemodel_types.go (+1/-1)

@@ -126,7 +126,7 @@ type PoolObjectReference struct {
 }
 
 // Criticality defines how important it is to serve the model compared to other models.
-// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional(use a pointer), and set no default.
+// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default.
 // This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
 // +kubebuilder:validation:Enum=Critical;Standard;Sheddable
 type Criticality string

cmd/epp/main.go (+9/-5)

@@ -30,6 +30,7 @@ import (
 	"go.uber.org/zap/zapcore"
 	"google.golang.org/grpc"
 	healthPb "google.golang.org/grpc/health/grpc_health_v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/rest"
 	"k8s.io/component-base/metrics/legacyregistry"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -140,14 +141,16 @@ func run() error {
 		return err
 	}
 
-	mgr, err := runserver.NewDefaultManager(*poolNamespace, *poolName, cfg)
+	poolNamespacedName := types.NamespacedName{
+		Name:      *poolName,
+		Namespace: *poolNamespace,
+	}
+	mgr, err := runserver.NewDefaultManager(poolNamespacedName, cfg)
 	if err != nil {
 		setupLog.Error(err, "Failed to create controller manager")
 		return err
 	}
 
-	ctx := ctrl.SetupSignalHandler()
-
 	// Set up mapper for metric scraping.
 	mapping, err := backendmetrics.NewMetricMapping(
 		*totalQueuedRequestsMetric,
@@ -162,14 +165,15 @@ func run() error {
 
 	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval)
 	// Setup runner.
+	ctx := ctrl.SetupSignalHandler()
+
 	datastore := datastore.NewDatastore(ctx, pmf)
 
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
 		DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace,
 		DestinationEndpointHintKey:               *destinationEndpointHintKey,
-		PoolName:                                 *poolName,
-		PoolNamespace:                            *poolNamespace,
+		PoolNamespacedName:                       poolNamespacedName,
 		Datastore:                                datastore,
 		SecureServing:                            *secureServing,
 		CertPath:                                 *certPath,
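The refactor above collapses the separate name/namespace flags into a single `types.NamespacedName`. One reason this shape is convenient: controller-runtime's `client.ObjectKey` is an alias for `types.NamespacedName`, so the same value can key both the manager wiring and direct API lookups. A minimal sketch, assuming a hypothetical `poolutil` helper package that is not part of this commit:

```go
// Package poolutil is a hypothetical illustration, not code from this commit.
package poolutil

import (
	"context"

	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
)

// FetchPool looks up the InferencePool addressed by the -poolName and
// -poolNamespace flags. client.ObjectKey is an alias for
// types.NamespacedName, so the bundled value works directly as a Get key.
func FetchPool(ctx context.Context, c client.Client, name, namespace string) (*v1alpha2.InferencePool, error) {
	key := types.NamespacedName{Name: name, Namespace: namespace}
	pool := &v1alpha2.InferencePool{}
	if err := c.Get(ctx, key, pool); err != nil {
		return nil, err
	}
	return pool, nil
}
```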

config/charts/inferencepool/README.md (+13/-1)

@@ -2,7 +2,6 @@
 
 A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) deployment.
 
-
 ## Install
 
 To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:
@@ -23,6 +22,18 @@ $ helm install vllm-llama3-8b-instruct \
 
 Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed.
 
+### Install for Triton TensorRT-LLM
+
+Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Triton TensorRT-LLM, e.g.,
+
+```txt
+$ helm install triton-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=triton-llama3-8b-instruct \
+  --set inferencePool.modelServerType=triton-tensorrt-llm \
+  --set provider.name=[none|gke] \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
+
 ## Uninstall
 
 Run the following command to uninstall the chart:
@@ -38,6 +49,7 @@ The following table list the configurable parameters of the chart.
 | **Parameter Name**                       | **Description**                                                                                                        |
 |------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
 | `inferencePool.targetPortNumber`         | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. |
+| `inferencePool.modelServerType`          | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm.                 |
 | `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool.                                                   |
 | `inferenceExtension.replicas`            | Number of replicas for the endpoint picker extension service. Defaults to `1`.                                         |
 | `inferenceExtension.image.name`          | Name of the container image used for the endpoint picker.                                                              |

config/charts/inferencepool/templates/epp-deployment.yaml (+8/-1)

@@ -35,6 +35,14 @@ spec:
         - "9003"
         - -metricsPort
         - "9090"
+        {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
+        - -totalQueuedRequestsMetric
+        - "nv_trt_llm_request_metrics{request_type=waiting}"
+        - -kvCacheUsagePercentageMetric
+        - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
+        - -loraInfoMetric
+        - "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet.
+        {{- end }}
         ports:
         - name: grpc
           containerPort: 9002
@@ -54,4 +62,3 @@ spec:
       service: inference-extension
     initialDelaySeconds: 5
     periodSeconds: 10
-
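The metric flags introduced above take Prometheus metric specs of the form `name{label=value}`, and an empty string disables scraping (as with `-loraInfoMetric`). The actual parsing lives in `backendmetrics.NewMetricMapping`; the sketch below is a hypothetical stand-in, not the extension's code, showing how such a spec can be decomposed:

```go
// Hypothetical sketch of parsing the metric-flag format used above, e.g.
// "nv_trt_llm_request_metrics{request_type=waiting}".
package main

import (
	"fmt"
	"strings"
)

type metricSpec struct {
	Name   string            // Prometheus metric family name
	Labels map[string]string // optional label filter from the {...} part
}

func parseMetricSpec(spec string) (*metricSpec, error) {
	if spec == "" {
		return nil, nil // empty spec disables scraping for this metric
	}
	name, rest, found := strings.Cut(spec, "{")
	if !found {
		return &metricSpec{Name: spec}, nil
	}
	body, ok := strings.CutSuffix(rest, "}")
	if !ok {
		return nil, fmt.Errorf("unclosed label selector in %q", spec)
	}
	labels := map[string]string{}
	for _, pair := range strings.Split(body, ",") {
		k, v, ok := strings.Cut(pair, "=")
		if !ok {
			return nil, fmt.Errorf("malformed label pair %q", pair)
		}
		labels[strings.TrimSpace(k)] = strings.TrimSpace(v)
	}
	return &metricSpec{Name: name, Labels: labels}, nil
}

func main() {
	m, err := parseMetricSpec("nv_trt_llm_request_metrics{request_type=waiting}")
	fmt.Printf("%+v %v\n", m, err)
}
```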

config/charts/inferencepool/values.yaml (+1)

@@ -9,6 +9,7 @@ inferenceExtension:
 
 inferencePool:
   targetPortNumber: 8000
+  modelServerType: vllm # vllm, triton-tensorrt-llm
 #  modelServers: # REQUIRED
 #    matchLabels:
 #      app: vllm-llama3-8b-instruct

config/manifests/inferencepool-resources.yaml (+2/-1)

@@ -4,7 +4,6 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
-  labels:
   name: vllm-llama3-8b-instruct
 spec:
   targetPortNumber: 8000
@@ -54,6 +53,8 @@ spec:
       args:
       - -poolName
       - "vllm-llama3-8b-instruct"
+      - "-poolNamespace"
+      - "default"
      - -v
      - "4"
      - --zap-encoder
docs/proposals/00x-epp-compliance-proposal/README.md (+99)

@@ -0,0 +1,99 @@
+# Gateway API Inference Extension
+
+Author(s): @kfswain
+## Proposal Status
+***Draft***
+
+## Table of Contents
+
+<!-- toc -->
+
+- [Summary](#summary)
+- [Goals](#goals)
+- [Non-Goals](#non-goals)
+- [Proposal](#proposal)
+- [Personas](#personas)
+- [Inference Platform Admin](#inference-platform-admin)
+- [Inference Workload Owner](#workload-owner)
+- [Axioms](#axioms)
+- [InferencePool](#inferencepool)
+- [InferenceModel](#inferencemodel)
+- [Spec](#spec)
+- [Diagrams](#diagrams)
+- [Alternatives](#alternatives)
+- [Open Questions](#open-questions)
+
+<!-- /toc -->
+
+## Summary
+
+This proposal seeks to standardize the implementation of an EPP (End-point Picker) for the Inference Gateway extension (also known as Gateway API Inference Extension). Additionally, this proposes to restructure the current implementation of the EPP to be more modular and approachable.
+
+## Goals
+
+- Set a standard on how the EPP & APIs interact
+- Settle on common nomenclature for clearer communication
+- Allow for modularization of the EPP, to be extended to a user's specific needs
+
+## Non-Goals
+
+- Reshaping the current API
+- A change in scope of the current project
+
+## Proposal
+
+This proposal does not introduce any net-new features; instead, we are refactoring our current implementation to better handle more devs, more features, etc. At the time of writing, GIE is at v0.3, and that stronger experimental context (along with external feedback) made clear the need for this restructure. The image below gives a high-level view of how our components work together.
+
+<img src="./images/epp_arch.svg" alt="Scheduling Algorithm" width="1000" />
+
+## Overview
+At a quick glance, the EPP is being broken into specific layers. The `Data Layer` is of note, as it is a vertical that will be accessed by all the others. The data layer manages the k8s data, metric & usage data, as well as the processing of that data to determine resource scarcity regimes.
+
+The other layers are handled in a sequential process, starting with the **Ext-Proc** call. The request is buffered and then sent to the **Routing Layer**, which processes any user-defined per-InferenceModel routing rules & request enrichment first (at the time of writing, that is just translating the InferenceModel name to a weight-split actual model). Then _all_ requests pass through the to-be-implemented [**Flow Controller**](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/674) to ensure that any request entering the pool adheres to the guidelines set by the Priority, Fairness, & Queueing configuration. Finally, the **Scheduling Layer** is the load balancing algorithm that intelligently routes requests based on the current state of the InferencePool.
+
+## Components
+
+To further expand upon these component layers, we will first break them into `extensible` and `non-extensible` layers. `Non-extensible` layers are intended to be static, handled on behalf of the user, and typically implement low-opinion infrastructure.
+
+The `Extensible` layers are:
+- Data Layer
+- Routing Layer
+- Flow Controller
+- Scheduling Layer
+
+The `Non-Extensible` layer(s) are:
+- The Ext-Proc Server
+
+### `Extensible`
+
+#### Data Layer
+
+The data layer will consume and store the InferencePool/InferenceModel config and the pre-defined [Model Server Protocol](../003-model-server-protocol/README.md). Additionally, the data fed from the model servers will be processed and digested to provide resource scarcity regime hints and autoscaling recommendations.
+
+Many extensions to scheduling will require changes to ingested metrics; as such, the data layer will be built to be extended, but extenders accept that the Model Server Protocol will no longer guarantee out-of-the-box portability of a model server.
+
+#### Routing Layer
+
+The routing layer is likely to be the most opinion-heavy section, as the scope of what constitutes a 'Route Rule' is somewhat broad. The current examples we expect would be:
+
+- System Prompt injection
+- RAG callout
+- Per-InferenceModel request validation (such as safety/on-topic, etc.)
+
+Because this could become a bit of a dumping ground, the API will keep a _very_ tight scope on which of these route rules are included in the spec. A standard method of extension will be provided if the need to define a custom rule arises.
+
+#### Flow Controller (WIP - implementation tracked in [#674](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/674))
+
+The flow controller will consume resource regime data, and enforce proper resource sharing between workloads. This will primarily be done through a queuing mechanism [as described here](https://docs.google.com/document/d/1VZL7opFWuwgWquvgiOzLlXAJ633qZ9U-A0ZixGjBgaI/edit?usp=sharing).
+
+#### Scheduling Layer
+
+As the Scheduling Layer is the final interface to the entirety of the pool, all configuration will be at the _pool_ level. The default scheduling layer will be an experimentally-backed LB algorithm, with exposed config values.
+
+The Scheduler will define a strong interface API, so that new scheduling algos may be plugged in & dark-launched to test on production traffic without impacting said traffic. Extensions are expected to adhere to the [Scheduler Subsystem definition](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/603).
+
+### `Non-extensible`
+
+#### Ext-Proc Server
+
+The Ext-Proc Server protocol is very well defined & specific; deviation could cause the EPP to become unusable or unstable. Extension is ill-advised.
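Two illustrative sketches follow; neither is code from this commit. First, for the proposal's Data Layer section: a minimal sketch of what an extensible metrics-ingestion contract could look like, assuming hypothetical names (`MetricsSource`, `ScarcityHint`), since the proposal does not define this interface yet.

```go
// Package datalayer is a hypothetical sketch of the proposal's Data Layer
// extension point; the names here are assumptions, not the settled API.
package datalayer

import (
	"context"
	"time"
)

// ScarcityHint is a digest of model-server metrics that the scheduling and
// flow-control layers could consume to detect resource scarcity regimes.
type ScarcityHint struct {
	KVCacheUtilization float64
	QueueDepth         int
	ObservedAt         time.Time
}

// MetricsSource is a pluggable ingestion point. Extenders can swap in a
// custom source, accepting that the Model Server Protocol then no longer
// guarantees out-of-the-box portability of a model server.
type MetricsSource interface {
	// Refresh scrapes the backing model server and returns a digest.
	Refresh(ctx context.Context) (ScarcityHint, error)
}
```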
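Second, for the Scheduling Layer section: a minimal sketch of a pluggable scheduler boundary in the spirit of the Scheduler Subsystem discussion linked above. All names are assumptions; the point is that a narrow interface lets new algorithms run alongside the default for dark-launch comparison without acting on their decisions.

```go
// Package scheduling is a hypothetical sketch of a pluggable scheduler
// boundary; it is not the settled Scheduler Subsystem API.
package scheduling

import "context"

// Endpoint is a candidate model-server pod (hypothetical shape).
type Endpoint struct {
	Address string
	Metrics map[string]float64
}

// RequestContext carries per-request routing inputs (hypothetical).
type RequestContext struct {
	ModelName   string
	Criticality string
}

// Scheduler picks a destination for one request from the current pool
// snapshot. A new algorithm implements this interface and can be
// dark-launched next to the default scheduler.
type Scheduler interface {
	Schedule(ctx context.Context, req RequestContext, candidates []Endpoint) (Endpoint, error)
}
```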

docs/proposals/00x-epp-compliance-proposal/images/epp_arch.svg (+1, binary/image content not shown)

go.mod (+3/-4)

@@ -9,8 +9,8 @@ require (
 	github.com/google/go-cmp v0.7.0
 	github.com/onsi/ginkgo/v2 v2.23.4
 	github.com/onsi/gomega v1.37.0
-	github.com/prometheus/client_golang v1.21.1
-	github.com/prometheus/client_model v0.6.1
+	github.com/prometheus/client_golang v1.22.0
+	github.com/prometheus/client_model v0.6.2
 	github.com/prometheus/common v0.63.0
 	github.com/stretchr/testify v1.10.0
 	go.uber.org/multierr v1.11.0
@@ -25,7 +25,7 @@ require (
 	k8s.io/component-base v0.32.3
 	k8s.io/utils v0.0.0-20241210054802-24370beab758
 	sigs.k8s.io/controller-runtime v0.20.4
-	sigs.k8s.io/structured-merge-diff/v4 v4.6.0
+	sigs.k8s.io/structured-merge-diff/v4 v4.7.0
 	sigs.k8s.io/yaml v1.4.0
 )
 
@@ -74,7 +74,6 @@ require (
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
-	github.com/klauspost/compress v1.17.11 // indirect
 	github.com/kylelemons/godebug v1.1.0 // indirect
 	github.com/leodido/go-urn v1.2.1 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect

go.sum (+8/-8)

@@ -112,8 +112,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
 github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
-github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc=
-github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
+github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
+github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
 github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
 github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
@@ -164,10 +164,10 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
 github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U=
-github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk=
-github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg=
-github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
-github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
+github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q=
+github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0=
+github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
+github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
 github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k=
 github.com/prometheus/common v0.63.0/go.mod h1:VVFF/fBIoToEnWRVkYoXEkq3R3paCoxG9PXP74SnV18=
 github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
@@ -332,7 +332,7 @@ sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1
 sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
 sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016 h1:kXv6kKdoEtedwuqMmkqhbkgvYKeycVbC8+iPCP9j5kQ=
 sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
-sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc=
-sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps=
+sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI=
+sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps=
 sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
 sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=

mkdocs.yml (+4/-1)

@@ -54,13 +54,16 @@ nav:
     - API Overview: concepts/api-overview.md
     - Conformance: concepts/conformance.md
     - Roles and Personas: concepts/roles-and-personas.md
-  - Implementations: implementations.md
+  - Implementations:
+    - Gateways: implementations/gateways.md
+    - Model Servers: implementations/model-servers.md
   - FAQ: faq.md
   - Guides:
     - User Guides:
       - Getting started: guides/index.md
       - Adapter Rollout: guides/adapter-rollout.md
       - Metrics: guides/metrics.md
+      - Replacing an Inference Pool: guides/replacing-inference-pool.md
     - Implementer's Guide: guides/implementers.md
   - Performance:
     - Benchmark: performance/benchmark/index.md

pkg/epp/backend/metrics/fake.go (-9)

@@ -24,7 +24,6 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
@@ -84,11 +83,3 @@ func (f *FakePodMetricsClient) SetErr(new map[types.NamespacedName]error) {
 	defer f.errMu.Unlock()
 	f.Err = new
 }
-
-type FakeDataStore struct {
-	Res map[string]*v1alpha2.InferenceModel
-}
-
-func (fds *FakeDataStore) FetchModelData(modelName string) (returnModel *v1alpha2.InferenceModel) {
-	return fds.Res[modelName]
-}

pkg/epp/backend/metrics/logger.go (+5/-5)

@@ -55,8 +55,8 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
 		case <-ctx.Done():
 			logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread")
 			return
-		case <-ticker.C: // Periodically flush prometheus metrics for inference pool
-			flushPrometheusMetricsOnce(logger, datastore)
+		case <-ticker.C: // Periodically refresh prometheus metrics for inference pool
+			refreshPrometheusMetrics(logger, datastore)
 		}
 	}
 }()
@@ -86,19 +86,19 @@
 	}
 }
 
-func flushPrometheusMetricsOnce(logger logr.Logger, datastore Datastore) {
+func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore) {
 	pool, err := datastore.PoolGet()
 	if err != nil {
 		// No inference pool or not initialize.
-		logger.V(logutil.DEFAULT).Info("pool is not initialized, skipping flushing metrics")
+		logger.V(logutil.DEFAULT).Info("Pool is not initialized, skipping refreshing metrics")
 		return
 	}
 
 	var kvCacheTotal float64
 	var queueTotal int
 
 	podMetrics := datastore.PodGetAll()
-	logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics))
+	logger.V(logutil.TRACE).Info("Refreshing Prometheus Metrics", "ReadyPods", len(podMetrics))
 	if len(podMetrics) == 0 {
 		return
 	}
