Skip to content

Commit 4127b08

Browse files
authored
Merge branch 'kubernetes-sigs:main' into inferencepool-ref
2 parents ca7e02e + f7faddc commit 4127b08

File tree

18 files changed

+569
-1352
lines changed

18 files changed

+569
-1352
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
[![Go Report Card](https://goreportcard.com/badge/sigs.k8s.io/gateway-api-inference-extension)](https://goreportcard.com/report/sigs.k8s.io/gateway-api-inference-extension)
2+
[![Go Reference](https://pkg.go.dev/badge/sigs.k8s.io/gateway-api-inference-extension.svg)](https://pkg.go.dev/sigs.k8s.io/gateway-api-inference-extension)
3+
[![License](https://img.shields.io/github/license/kubernetes-sigs/gateway-api-inference-extension)](/LICENSE)
4+
15
# Gateway API Inference Extension
26

37
This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee.

cmd/epp/main.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,11 +120,6 @@ func run() error {
120120
flag.Parse()
121121
initLogging(&opts)
122122

123-
useStreamingServer, err := strconv.ParseBool(os.Getenv("USE_STREAMING"))
124-
if err != nil {
125-
setupLog.Error(err, "Failed to parse env var USE_STREAMING, defaulting to false")
126-
}
127-
128123
// Validate flags
129124
if err := validateFlags(); err != nil {
130125
setupLog.Error(err, "Failed to validate flags")
@@ -178,7 +173,6 @@ func run() error {
178173
Datastore: datastore,
179174
SecureServing: *secureServing,
180175
CertPath: *certPath,
181-
UseStreaming: useStreamingServer,
182176
RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval,
183177
}
184178
if err := serverRunner.SetupWithManager(ctx, mgr); err != nil {

config/charts/inferencepool/README.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) deployment.
44

5-
65
## Install
76

87
To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:
@@ -23,6 +22,18 @@ $ helm install vllm-llama3-8b-instruct \
2322

2423
Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed.
2524

25+
### Install for Triton TensorRT-LLM
26+
27+
Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Triton TensorRT-LLM, e.g.,
28+
29+
```txt
30+
$ helm install triton-llama3-8b-instruct \
31+
--set inferencePool.modelServers.matchLabels.app=triton-llama3-8b-instruct \
32+
--set inferencePool.modelServerType=triton-tensorrt-llm \
33+
--set provider.name=[none|gke] \
34+
oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
35+
```
36+
2637
## Uninstall
2738

2839
Run the following command to uninstall the chart:
@@ -38,6 +49,7 @@ The following table lists the configurable parameters of the chart.
3849
| **Parameter Name** | **Description** |
3950
|---------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
4051
| `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. |
52+
| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm. |
4153
| `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. |
4254
| `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. Defaults to `1`. |
4355
| `inferenceExtension.image.name` | Name of the container image used for the endpoint picker. |

config/charts/inferencepool/templates/epp-deployment.yaml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,14 @@ spec:
3535
- "9003"
3636
- -metricsPort
3737
- "9090"
38-
env:
39-
- name: USE_STREAMING
40-
value: "true"
38+
{{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
39+
- -totalQueuedRequestsMetric
40+
- "nv_trt_llm_request_metrics{request_type=waiting}"
41+
- -kvCacheUsagePercentageMetric
42+
- "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
43+
- -loraInfoMetric
44+
- "" # Set an empty metric to disable LoRA metric scraping, as LoRA metrics are not supported by Triton yet.
45+
{{- end }}
4146
ports:
4247
- name: grpc
4348
containerPort: 9002
@@ -57,4 +62,3 @@ spec:
5762
service: inference-extension
5863
initialDelaySeconds: 5
5964
periodSeconds: 10
60-

config/charts/inferencepool/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ inferenceExtension:
99

1010
inferencePool:
1111
targetPortNumber: 8000
12+
modelServerType: vllm # vllm, triton-tensorrt-llm
1213
# modelServers: # REQUIRED
1314
# matchLabels:
1415
# app: vllm-llama3-8b-instruct

config/manifests/inferencepool-resources.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,6 @@ spec:
6262
- "9002"
6363
- -grpcHealthPort
6464
- "9003"
65-
env:
66-
- name: USE_STREAMING
67-
value: "true"
6865
ports:
6966
- containerPort: 9002
7067
- containerPort: 9003

go.mod

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ require (
99
github.com/google/go-cmp v0.7.0
1010
github.com/onsi/ginkgo/v2 v2.23.4
1111
github.com/onsi/gomega v1.37.0
12-
github.com/prometheus/client_golang v1.21.1
13-
github.com/prometheus/client_model v0.6.1
12+
github.com/prometheus/client_golang v1.22.0
13+
github.com/prometheus/client_model v0.6.2
1414
github.com/prometheus/common v0.63.0
1515
github.com/stretchr/testify v1.10.0
1616
go.uber.org/multierr v1.11.0
@@ -25,7 +25,7 @@ require (
2525
k8s.io/component-base v0.32.3
2626
k8s.io/utils v0.0.0-20241210054802-24370beab758
2727
sigs.k8s.io/controller-runtime v0.20.4
28-
sigs.k8s.io/structured-merge-diff/v4 v4.6.0
28+
sigs.k8s.io/structured-merge-diff/v4 v4.7.0
2929
sigs.k8s.io/yaml v1.4.0
3030
)
3131

@@ -74,7 +74,6 @@ require (
7474
github.com/inconshreveable/mousetrap v1.1.0 // indirect
7575
github.com/josharian/intern v1.0.0 // indirect
7676
github.com/json-iterator/go v1.1.12 // indirect
77-
github.com/klauspost/compress v1.17.11 // indirect
7877
github.com/kylelemons/godebug v1.1.0 // indirect
7978
github.com/leodido/go-urn v1.2.1 // indirect
8079
github.com/mailru/easyjson v0.7.7 // indirect

go.sum

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
112112
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
113113
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
114114
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
115-
github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc=
116-
github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
115+
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
116+
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
117117
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
118118
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
119119
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
@@ -164,10 +164,10 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
164164
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
165165
github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
166166
github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U=
167-
github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk=
168-
github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg=
169-
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
170-
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
167+
github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q=
168+
github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0=
169+
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
170+
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
171171
github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k=
172172
github.com/prometheus/common v0.63.0/go.mod h1:VVFF/fBIoToEnWRVkYoXEkq3R3paCoxG9PXP74SnV18=
173173
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
@@ -332,7 +332,7 @@ sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1
332332
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
333333
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016 h1:kXv6kKdoEtedwuqMmkqhbkgvYKeycVbC8+iPCP9j5kQ=
334334
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
335-
sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc=
336-
sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps=
335+
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI=
336+
sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps=
337337
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
338338
sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=

mkdocs.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@ nav:
5454
API Overview: concepts/api-overview.md
5555
Conformance: concepts/conformance.md
5656
Roles and Personas: concepts/roles-and-personas.md
57-
- Implementations: implementations.md
57+
- Implementations:
58+
- Gateways: implementations/gateways.md
59+
- Model Servers: implementations/model-servers.md
5860
- FAQ: faq.md
5961
- Guides:
6062
- User Guides:

0 commit comments

Comments
 (0)