diff --git a/config/manifests/gateway/extension_policy.yaml b/config/manifests/gateway/extension_policy.yaml deleted file mode 100644 index 14b7b123c..000000000 --- a/config/manifests/gateway/extension_policy.yaml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyExtensionPolicy -metadata: - name: ext-proc-policy - namespace: default -spec: - extProc: - - backendRefs: - - group: "" - kind: Service - name: inference-gateway-ext-proc - port: 9002 - processingMode: - allowModeOverride: true - request: - body: Buffered - response: - # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly. - # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly. - messageTimeout: 1000s - backendSettings: - circuitBreaker: - maxConnections: 40000 - maxPendingRequests: 40000 - maxParallelRequests: 40000 - timeout: - tcp: - connectTimeout: 24h - targetRef: - group: gateway.networking.k8s.io - kind: HTTPRoute - name: llm-route diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml index 8374c5b3e..4c7824ca3 100644 --- a/config/manifests/inferencemodel.yaml +++ b/config/manifests/inferencemodel.yaml @@ -6,7 +6,7 @@ spec: modelName: tweet-summary criticality: Critical poolRef: - name: my-pool + name: vllm-llama2-7b targetModels: - name: tweet-summary-1 weight: 100 @@ -20,7 +20,7 @@ spec: modelName: meta-llama/Llama-2-7b-hf criticality: Critical poolRef: - name: my-pool + name: vllm-llama2-7b --- apiVersion: inference.networking.x-k8s.io/v1alpha2 @@ -31,4 +31,4 @@ spec: modelName: Qwen/Qwen2.5-1.5B-Instruct criticality: Critical poolRef: - name: my-pool + name: vllm-llama2-7b diff --git a/config/manifests/inferencepool.yaml b/config/manifests/inferencepool.yaml index 640086393..8225bd7c7 100644 --- a/config/manifests/inferencepool.yaml +++ b/config/manifests/inferencepool.yaml @@ -75,6 +75,39 @@ spec: initialDelaySeconds: 5 periodSeconds: 10 --- +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyExtensionPolicy +metadata: + name: ext-proc-policy + namespace: default +spec: + extProc: + - backendRefs: + - group: "" + kind: Service + name: vllm-llama2-7b-epp + port: 9002 + processingMode: + allowModeOverride: true + request: + body: Buffered + response: + # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly. + # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly. + messageTimeout: 1000s + backendSettings: + circuitBreaker: + maxConnections: 40000 + maxPendingRequests: 40000 + maxParallelRequests: 40000 + timeout: + tcp: + connectTimeout: 24h + targetRef: + group: gateway.networking.k8s.io + kind: HTTPRoute + name: llm-route +--- kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: diff --git a/site-src/guides/index.md b/site-src/guides/index.md index d6ff84594..d721e73ff 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -88,7 +88,6 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Deploy Envoy Gateway Custom Policies ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml ``` > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further. @@ -125,7 +124,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/traffic_policy.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/ext_proc.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found