Skip to content

Commit b68029a

Browse files
committed
Merge branch 'main' into 124
2 parents 1798358 + 6c22d92 commit b68029a

19 files changed

+431
-191
lines changed

.golangci.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ linters:
1414
- dupword
1515
- durationcheck
1616
- fatcontext
17-
- gci
1817
- ginkgolinter
1918
- gocritic
2019
- govet

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ require (
1818
github.com/stretchr/testify v1.10.0
1919
go.uber.org/multierr v1.11.0
2020
google.golang.org/grpc v1.70.0
21-
google.golang.org/protobuf v1.36.4
21+
google.golang.org/protobuf v1.36.5
2222
k8s.io/api v0.32.1
2323
k8s.io/apiextensions-apiserver v0.32.1
2424
k8s.io/apimachinery v0.32.1

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,8 +329,8 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a h1:
329329
google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a/go.mod h1:5uTbfoYQed2U9p3KIj2/Zzm02PYhndfdmML0qC3q3FU=
330330
google.golang.org/grpc v1.70.0 h1:pWFv03aZoHzlRKHWicjsZytKAiYCtNS0dHbXnIdq7jQ=
331331
google.golang.org/grpc v1.70.0/go.mod h1:ofIJqVKDXx/JiXrwr2IG4/zwdH9txy3IlF40RmcJSQw=
332-
google.golang.org/protobuf v1.36.4 h1:6A3ZDJHn/eNqc1i+IdefRzy/9PokBTPvcqMySR7NNIM=
333-
google.golang.org/protobuf v1.36.4/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
332+
google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM=
333+
google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
334334
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
335335
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
336336
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=

pkg/README.md

Lines changed: 1 addition & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,96 +1,3 @@
11
## Quickstart
22

3-
This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running!
4-
5-
### Requirements
6-
- Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
7-
- A cluster with:
8-
- Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
9-
you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
10-
- 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.
11-
12-
### Steps
13-
14-
1. **Deploy Sample Model Server**
15-
16-
Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model.
17-
Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
18-
```bash
19-
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
20-
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
21-
```
22-
23-
1. **Install the Inference Extension CRDs:**
24-
25-
```sh
26-
kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd
27-
```
28-
29-
1. **Deploy InferenceModel**
30-
31-
Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1`
32-
[LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server.
33-
```bash
34-
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/inferencemodel.yaml
35-
```
36-
37-
1. **Update Envoy Gateway Config to enable Patch Policy**
38-
39-
Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run:
40-
```bash
41-
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/enable_patch_policy.yaml
42-
kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system
43-
```
44-
Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again.
45-
46-
1. **Deploy Gateway**
47-
48-
```bash
49-
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/gateway.yaml
50-
```
51-
> **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./manifests/gateway/ext-proc.yaml` file, and an additional `./manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy is very useful.***
52-
53-
Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status:
54-
```bash
55-
$ kubectl get gateway inference-gateway
56-
NAME CLASS ADDRESS PROGRAMMED AGE
57-
inference-gateway inference-gateway <MY_ADDRESS> True 22s
58-
```
59-
60-
1. **Deploy the Inference Extension and InferencePool**
61-
62-
```bash
63-
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/ext_proc.yaml
64-
```
65-
66-
1. **Deploy Envoy Gateway Custom Policies**
67-
68-
```bash
69-
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/extension_policy.yaml
70-
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/patch_policy.yaml
71-
```
72-
> **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further.
73-
74-
1. **OPTIONALLY**: Apply Traffic Policy
75-
76-
For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors.
77-
78-
```bash
79-
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/traffic_policy.yaml
80-
```
81-
82-
1. **Try it out**
83-
84-
Wait until the gateway is ready.
85-
86-
```bash
87-
IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
88-
PORT=8081
89-
90-
curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
91-
"model": "tweet-summary",
92-
"prompt": "Write as if you were a critic: San Francisco",
93-
"max_tokens": 100,
94-
"temperature": 0
95-
}'
96-
```
3+
Please refer to our Getting started guide here: https://gateway-api-inference-extension.sigs.k8s.io/guides/

pkg/ext-proc/backend/inferencemodel_reconciler.go

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55

66
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1"
77
logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging"
8+
"k8s.io/apimachinery/pkg/api/errors"
89
"k8s.io/apimachinery/pkg/runtime"
910
"k8s.io/apimachinery/pkg/types"
1011
"k8s.io/client-go/tools/record"
@@ -25,32 +26,37 @@ func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Reque
2526
if req.Namespace != c.PoolNamespacedName.Namespace {
2627
return ctrl.Result{}, nil
2728
}
28-
klog.V(1).Infof("reconciling InferenceModel %v", req.NamespacedName)
29-
30-
service := &v1alpha1.InferenceModel{}
31-
if err := c.Get(ctx, req.NamespacedName, service); err != nil {
32-
klog.Error(err, "unable to get InferencePool")
29+
klog.V(1).Infof("Reconciling InferenceModel %v", req.NamespacedName)
30+
31+
infModel := &v1alpha1.InferenceModel{}
32+
if err := c.Get(ctx, req.NamespacedName, infModel); err != nil {
33+
if errors.IsNotFound(err) {
34+
klog.V(1).Infof("InferenceModel %v not found. Removing from datastore since object must be deleted", req.NamespacedName)
35+
c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName)
36+
return ctrl.Result{}, nil
37+
}
38+
klog.Error(err, "Unable to get InferenceModel")
3339
return ctrl.Result{}, err
3440
}
3541

36-
c.updateDatastore(service)
42+
c.updateDatastore(infModel)
3743
return ctrl.Result{}, nil
3844
}
3945

40-
func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error {
41-
return ctrl.NewControllerManagedBy(mgr).
42-
For(&v1alpha1.InferenceModel{}).
43-
Complete(c)
44-
}
45-
4646
func (c *InferenceModelReconciler) updateDatastore(infModel *v1alpha1.InferenceModel) {
4747
if infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name {
4848
klog.V(1).Infof("Incoming pool ref %v, server pool name: %v", infModel.Spec.PoolRef, c.PoolNamespacedName.Name)
49-
klog.V(1).Infof("Adding/Updating inference model: %v", infModel.Spec.ModelName)
49+
klog.V(1).Infof("Adding/Updating InferenceModel: %v", infModel.Spec.ModelName)
5050
c.Datastore.InferenceModels.Store(infModel.Spec.ModelName, infModel)
5151
return
5252
}
53-
klog.V(logutil.DEFAULT).Infof("Removing/Not adding inference model: %v", infModel.Spec.ModelName)
53+
klog.V(logutil.DEFAULT).Infof("Removing/Not adding InferenceModel: %v", infModel.Spec.ModelName)
5454
// If we get here, the model is not relevant to this pool; remove it.
5555
c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName)
5656
}
57+
58+
func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error {
59+
return ctrl.NewControllerManagedBy(mgr).
60+
For(&v1alpha1.InferenceModel{}).
61+
Complete(c)
62+
}

0 commit comments

Comments
 (0)