
Commit 5c66101

Merge pull request kubernetes-sigs#47 from neuralmagic/upstream-main
Merging upstream main commits
2 parents 79cca51 + d935a7c


46 files changed (+1671/-438)

README.md (+4)

@@ -1,3 +1,7 @@
+[![Go Report Card](https://goreportcard.com/badge/sigs.k8s.io/gateway-api-inference-extension)](https://goreportcard.com/report/sigs.k8s.io/gateway-api-inference-extension)
+[![Go Reference](https://pkg.go.dev/badge/sigs.k8s.io/gateway-api-inference-extension.svg)](https://pkg.go.dev/sigs.k8s.io/gateway-api-inference-extension)
+[![License](https://img.shields.io/github/license/kubernetes-sigs/gateway-api-inference-extension)](/LICENSE)
+
 # Gateway API Inference Extension
 
 This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee.

api/v1alpha2/inferencemodel_types.go (+1/-1)

@@ -126,7 +126,7 @@ type PoolObjectReference struct {
 }
 
 // Criticality defines how important it is to serve the model compared to other models.
-// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional(use a pointer), and set no default.
+// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default.
 // This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
 // +kubebuilder:validation:Enum=Critical;Standard;Sheddable
 type Criticality string

cmd/epp/main.go (+9/-5)

@@ -30,6 +30,7 @@ import (
 	"go.uber.org/zap/zapcore"
 	"google.golang.org/grpc"
 	healthPb "google.golang.org/grpc/health/grpc_health_v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/rest"
 	"k8s.io/component-base/metrics/legacyregistry"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -140,14 +141,16 @@ func run() error {
 		return err
 	}
 
-	mgr, err := runserver.NewDefaultManager(*poolNamespace, *poolName, cfg)
+	poolNamespacedName := types.NamespacedName{
+		Name:      *poolName,
+		Namespace: *poolNamespace,
+	}
+	mgr, err := runserver.NewDefaultManager(poolNamespacedName, cfg)
 	if err != nil {
 		setupLog.Error(err, "Failed to create controller manager")
 		return err
 	}
 
-	ctx := ctrl.SetupSignalHandler()
-
 	// Set up mapper for metric scraping.
 	mapping, err := backendmetrics.NewMetricMapping(
 		*totalQueuedRequestsMetric,
@@ -162,14 +165,15 @@ func run() error {
 
 	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval)
 	// Setup runner.
+	ctx := ctrl.SetupSignalHandler()
+
 	datastore := datastore.NewDatastore(ctx, pmf)
 
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
 		DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace,
 		DestinationEndpointHintKey:               *destinationEndpointHintKey,
-		PoolName:                                 *poolName,
-		PoolNamespace:                            *poolNamespace,
+		PoolNamespacedName:                       poolNamespacedName,
 		Datastore:                                datastore,
 		SecureServing:                            *secureServing,
 		CertPath:                                 *certPath,
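The refactor above collapses the separate name/namespace flags into a single `types.NamespacedName`. One reason this shape is convenient: controller-runtime's `client.ObjectKey` is an alias for `types.NamespacedName`, so the same value can key both the manager wiring and direct API lookups. A minimal sketch, assuming a hypothetical `poolutil` helper package that is not part of this commit:

```go
// Package poolutil is a hypothetical illustration, not code from this commit.
package poolutil

import (
	"context"

	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
)

// FetchPool looks up the InferencePool addressed by the -poolName and
// -poolNamespace flags. client.ObjectKey is an alias for
// types.NamespacedName, so the bundled value works directly as a Get key.
func FetchPool(ctx context.Context, c client.Client, name, namespace string) (*v1alpha2.InferencePool, error) {
	key := types.NamespacedName{Name: name, Namespace: namespace}
	pool := &v1alpha2.InferencePool{}
	if err := c.Get(ctx, key, pool); err != nil {
		return nil, err
	}
	return pool, nil
}
```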

config/charts/inferencepool/README.md (+13/-1)

@@ -2,7 +2,6 @@
 
 A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) deployment.
 
-
 ## Install
 
 To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:
@@ -23,6 +22,18 @@ $ helm install vllm-llama3-8b-instruct \
 
 Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed.
 
+### Install for Triton TensorRT-LLM
+
+Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Triton TensorRT-LLM, e.g.,
+
+```txt
+$ helm install triton-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=triton-llama3-8b-instruct \
+  --set inferencePool.modelServerType=triton-tensorrt-llm \
+  --set provider.name=[none|gke] \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
+
 ## Uninstall
 
 Run the following command to uninstall the chart:
@@ -38,6 +49,7 @@ The following table list the configurable parameters of the chart.
 | **Parameter Name**                       | **Description**                                                                                                        |
 |------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
 | `inferencePool.targetPortNumber`         | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. |
+| `inferencePool.modelServerType`          | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm.                 |
 | `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool.                                                   |
 | `inferenceExtension.replicas`            | Number of replicas for the endpoint picker extension service. Defaults to `1`.                                         |
 | `inferenceExtension.image.name`          | Name of the container image used for the endpoint picker.                                                              |

config/charts/inferencepool/templates/epp-deployment.yaml (+8/-1)

@@ -35,6 +35,14 @@ spec:
         - "9003"
         - -metricsPort
         - "9090"
+        {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
+        - -totalQueuedRequestsMetric
+        - "nv_trt_llm_request_metrics{request_type=waiting}"
+        - -kvCacheUsagePercentageMetric
+        - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
+        - -loraInfoMetric
+        - "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet.
+        {{- end }}
         ports:
         - name: grpc
           containerPort: 9002
@@ -54,4 +62,3 @@ spec:
       service: inference-extension
     initialDelaySeconds: 5
     periodSeconds: 10
-
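The metric flags introduced above take Prometheus metric specs of the form `name{label=value}`, and an empty string disables scraping (as with `-loraInfoMetric`). The actual parsing lives in `backendmetrics.NewMetricMapping`; the sketch below is a hypothetical stand-in, not the extension's code, showing how such a spec can be decomposed:

```go
// Hypothetical sketch of parsing the metric-flag format used above, e.g.
// "nv_trt_llm_request_metrics{request_type=waiting}".
package main

import (
	"fmt"
	"strings"
)

type metricSpec struct {
	Name   string            // Prometheus metric family name
	Labels map[string]string // optional label filter from the {...} part
}

func parseMetricSpec(spec string) (*metricSpec, error) {
	if spec == "" {
		return nil, nil // empty spec disables scraping for this metric
	}
	name, rest, found := strings.Cut(spec, "{")
	if !found {
		return &metricSpec{Name: spec}, nil
	}
	body, ok := strings.CutSuffix(rest, "}")
	if !ok {
		return nil, fmt.Errorf("unclosed label selector in %q", spec)
	}
	labels := map[string]string{}
	for _, pair := range strings.Split(body, ",") {
		k, v, ok := strings.Cut(pair, "=")
		if !ok {
			return nil, fmt.Errorf("malformed label pair %q", pair)
		}
		labels[strings.TrimSpace(k)] = strings.TrimSpace(v)
	}
	return &metricSpec{Name: name, Labels: labels}, nil
}

func main() {
	m, err := parseMetricSpec("nv_trt_llm_request_metrics{request_type=waiting}")
	fmt.Printf("%+v %v\n", m, err)
}
```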

config/charts/inferencepool/values.yaml (+1)

@@ -9,6 +9,7 @@ inferenceExtension:
 
 inferencePool:
   targetPortNumber: 8000
+  modelServerType: vllm # vllm, triton-tensorrt-llm
 #  modelServers: # REQUIRED
 #    matchLabels:
 #      app: vllm-llama3-8b-instruct

config/manifests/inferencepool-resources.yaml (+2/-1)

@@ -4,7 +4,6 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
-  labels:
   name: vllm-llama3-8b-instruct
 spec:
   targetPortNumber: 8000
@@ -54,6 +53,8 @@ spec:
       args:
       - -poolName
       - "vllm-llama3-8b-instruct"
+      - "-poolNamespace"
+      - "default"
      - -v
      - "4"
      - --zap-encoder
docs/proposals/00x-epp-compliance-proposal/README.md (+99)

@@ -0,0 +1,99 @@
+# Gateway API Inference Extension
+
+Author(s): @kfswain
+## Proposal Status
+***Draft***
+
+## Table of Contents
+
+<!-- toc -->
+
+- [Summary](#summary)
+- [Goals](#goals)
+- [Non-Goals](#non-goals)
+- [Proposal](#proposal)
+- [Personas](#personas)
+- [Inference Platform Admin](#inference-platform-admin)
+- [Inference Workload Owner](#workload-owner)
+- [Axioms](#axioms)
+- [InferencePool](#inferencepool)
+- [InferenceModel](#inferencemodel)
+- [Spec](#spec)
+- [Diagrams](#diagrams)
+- [Alternatives](#alternatives)
+- [Open Questions](#open-questions)
+
+<!-- /toc -->
+
+## Summary
+
+This proposal seeks to standardize the implementation of an EPP (End-point Picker) for the Inference Gateway extension (also known as Gateway API Inference Extension). Additionally, this proposes to restructure the current implementation of the EPP to be more modular and approachable.
+
+## Goals
+
+- Set a standard on how the EPP & APIs interact
+- Settle on common nomenclature for clearer communication
+- Allow for modularization of the EPP, to be extended to a user's specific needs
+
+## Non-Goals
+
+- Reshaping the current API
+- A change in scope of the current project
+
+## Proposal
+
+This proposal does not introduce any net-new features; instead, we are refactoring our current implementation to better handle more devs, more features, etc. At the time of writing, GIE is at v0.3, and that stronger experimental context (along with external feedback) made clear the need for this restructure. The image below gives a high-level view of how our components work together.
+
+<img src="./images/epp_arch.svg" alt="Scheduling Algorithm" width="1000" />
+
+## Overview
+At a quick glance, the EPP is being broken into specific layers. The `Data Layer` is of note, as it is a vertical that will be accessed by all the others. The data layer manages the k8s data, metric & usage data, as well as the processing of that data to determine resource scarcity regimes.
+
+The other layers are handled in a sequential process, starting with the **Ext-Proc** call. The request is buffered and then sent to the **Routing Layer**, which processes any user-defined per-InferenceModel routing rules & request enrichment first (at the time of writing, that is just translating the InferenceModel name to a weight-split actual model). Then _all_ requests pass through the to-be-implemented [**Flow Controller**](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/674) to ensure that any request entering the pool adheres to the guidelines set by the Priority, Fairness, & Queueing configuration. Finally, the **Scheduling Layer** is the load balancing algorithm that intelligently routes requests based on the current state of the InferencePool.
+
+## Components
+
+To further expand upon these component layers, we will first break them into `extensible` and `non-extensible` layers. `Non-extensible` layers are intended to be static, handled on behalf of the user, and typically implement low-opinion infrastructure.
+
+The `Extensible` layers are:
+- Data Layer
+- Routing Layer
+- Flow Controller
+- Scheduling Layer
+
+The `Non-Extensible` layer(s) are:
+- The Ext-Proc Server
+
+### `Extensible`
+
+#### Data Layer
+
+The data layer will consume and store the InferencePool/InferenceModel config and the pre-defined [Model Server Protocol](../003-model-server-protocol/README.md). Additionally, the data fed from the model servers will be processed and digested to provide resource scarcity regime hints and autoscaling recommendations.
+
+Many extensions to scheduling will require changes to ingested metrics; as such, the data layer will be built to be extended, but extenders accept that the Model Server Protocol will no longer guarantee out-of-the-box portability of a model server.
+
+#### Routing Layer
+
+The routing layer is likely to be the most opinion-heavy section, as the scope of what constitutes a 'Route Rule' is somewhat broad. The current examples we expect would be:
+
+- System Prompt injection
+- RAG callout
+- Per-InferenceModel request validation (such as safety/on-topic, etc.)
+
+Because this could become a bit of a dumping ground, the API will keep a _very_ tight scope on which of these route rules are included in the spec. A standard method of extension will be provided if the need to define a custom rule arises.
+
+#### Flow Controller (WIP - implementation tracked in [#674](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/674))
+
+The flow controller will consume resource regime data, and enforce proper resource sharing between workloads. This will primarily be done through a queuing mechanism [as described here](https://docs.google.com/document/d/1VZL7opFWuwgWquvgiOzLlXAJ633qZ9U-A0ZixGjBgaI/edit?usp=sharing).
+
+#### Scheduling Layer
+
+As the Scheduling Layer is the final interface to the entirety of the pool, all configuration will be at the _pool_ level. The default scheduling layer will be an experimentally-backed LB algorithm, with exposed config values.
+
+The Scheduler will define a strong interface API, so that new scheduling algos may be plugged in & dark-launched to test on production traffic without impacting said traffic. Extensions are expected to adhere to the [Scheduler Subsystem definition](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/603).
+
+### `Non-extensible`
+
+#### Ext-Proc Server
+
+The Ext-Proc Server protocol is very well defined & specific; deviation could cause the EPP to become unusable or unstable. Extension is ill-advised.
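Two illustrative sketches follow; neither is code from this commit. First, for the proposal's Data Layer section: a minimal sketch of what an extensible metrics-ingestion contract could look like, assuming hypothetical names (`MetricsSource`, `ScarcityHint`), since the proposal does not define this interface yet.

```go
// Package datalayer is a hypothetical sketch of the proposal's Data Layer
// extension point; the names here are assumptions, not the settled API.
package datalayer

import (
	"context"
	"time"
)

// ScarcityHint is a digest of model-server metrics that the scheduling and
// flow-control layers could consume to detect resource scarcity regimes.
type ScarcityHint struct {
	KVCacheUtilization float64
	QueueDepth         int
	ObservedAt         time.Time
}

// MetricsSource is a pluggable ingestion point. Extenders can swap in a
// custom source, accepting that the Model Server Protocol then no longer
// guarantees out-of-the-box portability of a model server.
type MetricsSource interface {
	// Refresh scrapes the backing model server and returns a digest.
	Refresh(ctx context.Context) (ScarcityHint, error)
}
```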
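Second, for the Scheduling Layer section: a minimal sketch of a pluggable scheduler boundary in the spirit of the Scheduler Subsystem discussion linked above. All names are assumptions; the point is that a narrow interface lets new algorithms run alongside the default for dark-launch comparison without acting on their decisions.

```go
// Package scheduling is a hypothetical sketch of a pluggable scheduler
// boundary; it is not the settled Scheduler Subsystem API.
package scheduling

import "context"

// Endpoint is a candidate model-server pod (hypothetical shape).
type Endpoint struct {
	Address string
	Metrics map[string]float64
}

// RequestContext carries per-request routing inputs (hypothetical).
type RequestContext struct {
	ModelName   string
	Criticality string
}

// Scheduler picks a destination for one request from the current pool
// snapshot. A new algorithm implements this interface and can be
// dark-launched next to the default scheduler.
type Scheduler interface {
	Schedule(ctx context.Context, req RequestContext, candidates []Endpoint) (Endpoint, error)
}
```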

docs/proposals/00x-epp-compliance-proposal/images/epp_arch.svg (+1, binary/image content not shown)

go.mod (+3/-4)

@@ -9,8 +9,8 @@ require (
 	github.com/google/go-cmp v0.7.0
 	github.com/onsi/ginkgo/v2 v2.23.4
 	github.com/onsi/gomega v1.37.0
-	github.com/prometheus/client_golang v1.21.1
-	github.com/prometheus/client_model v0.6.1
+	github.com/prometheus/client_golang v1.22.0
+	github.com/prometheus/client_model v0.6.2
 	github.com/prometheus/common v0.63.0
 	github.com/stretchr/testify v1.10.0
 	go.uber.org/multierr v1.11.0
@@ -25,7 +25,7 @@ require (
 	k8s.io/component-base v0.32.3
 	k8s.io/utils v0.0.0-20241210054802-24370beab758
 	sigs.k8s.io/controller-runtime v0.20.4
-	sigs.k8s.io/structured-merge-diff/v4 v4.6.0
+	sigs.k8s.io/structured-merge-diff/v4 v4.7.0
 	sigs.k8s.io/yaml v1.4.0
 )
 
@@ -74,7 +74,6 @@ require (
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
-	github.com/klauspost/compress v1.17.11 // indirect
 	github.com/kylelemons/godebug v1.1.0 // indirect
 	github.com/leodido/go-urn v1.2.1 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect

go.sum (+8/-8)

@@ -112,8 +112,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
 github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
-github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc=
-github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
+github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
+github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
 github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
 github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
@@ -164,10 +164,10 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
 github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U=
-github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk=
-github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg=
-github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
-github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
+github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q=
+github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0=
+github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
+github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
 github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k=
 github.com/prometheus/common v0.63.0/go.mod h1:VVFF/fBIoToEnWRVkYoXEkq3R3paCoxG9PXP74SnV18=
 github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
@@ -332,7 +332,7 @@ sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1
 sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
 sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016 h1:kXv6kKdoEtedwuqMmkqhbkgvYKeycVbC8+iPCP9j5kQ=
 sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
-sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc=
-sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps=
+sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI=
+sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps=
 sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
 sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=

mkdocs.yml (+4/-1)

@@ -54,13 +54,16 @@ nav:
     - API Overview: concepts/api-overview.md
     - Conformance: concepts/conformance.md
     - Roles and Personas: concepts/roles-and-personas.md
-  - Implementations: implementations.md
+  - Implementations:
+    - Gateways: implementations/gateways.md
+    - Model Servers: implementations/model-servers.md
   - FAQ: faq.md
   - Guides:
     - User Guides:
       - Getting started: guides/index.md
       - Adapter Rollout: guides/adapter-rollout.md
       - Metrics: guides/metrics.md
+      - Replacing an Inference Pool: guides/replacing-inference-pool.md
     - Implementer's Guide: guides/implementers.md
   - Performance:
     - Benchmark: performance/benchmark/index.md

pkg/epp/backend/metrics/fake.go (-9)

@@ -24,7 +24,6 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
@@ -84,11 +83,3 @@ func (f *FakePodMetricsClient) SetErr(new map[types.NamespacedName]error) {
 	defer f.errMu.Unlock()
 	f.Err = new
 }
-
-type FakeDataStore struct {
-	Res map[string]*v1alpha2.InferenceModel
-}
-
-func (fds *FakeDataStore) FetchModelData(modelName string) (returnModel *v1alpha2.InferenceModel) {
-	return fds.Res[modelName]
-}

pkg/epp/backend/metrics/logger.go (+5/-5)

@@ -55,8 +55,8 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
 		case <-ctx.Done():
 			logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread")
 			return
-		case <-ticker.C: // Periodically flush prometheus metrics for inference pool
-			flushPrometheusMetricsOnce(logger, datastore)
+		case <-ticker.C: // Periodically refresh prometheus metrics for inference pool
+			refreshPrometheusMetrics(logger, datastore)
 		}
 	}
 }()
@@ -86,19 +86,19 @@
 	}
 }
 
-func flushPrometheusMetricsOnce(logger logr.Logger, datastore Datastore) {
+func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore) {
 	pool, err := datastore.PoolGet()
 	if err != nil {
 		// No inference pool or not initialize.
-		logger.V(logutil.DEFAULT).Info("pool is not initialized, skipping flushing metrics")
+		logger.V(logutil.DEFAULT).Info("Pool is not initialized, skipping refreshing metrics")
 		return
 	}
 
 	var kvCacheTotal float64
 	var queueTotal int
 
 	podMetrics := datastore.PodGetAll()
-	logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics))
+	logger.V(logutil.TRACE).Info("Refreshing Prometheus Metrics", "ReadyPods", len(podMetrics))
 	if len(podMetrics) == 0 {
 		return
 	}
