diff --git a/config/manifests/gateway/enable_patch_policy.yaml b/config/manifests/gateway/enable_patch_policy.yaml deleted file mode 100644 index 1e9818a1e..000000000 --- a/config/manifests/gateway/enable_patch_policy.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: envoy-gateway-config - namespace: envoy-gateway-system -data: -# This manifest's main purpose is to set `enabledEnvoyPatchPolicy` to `true`. -# This only needs to be ran once on your cluster (unless you'd like to change anything. i.e. enabling the admin dash) -# Any field under `admin` is optional, and only for enabling the admin endpoints, for debugging. -# Admin Interface: https://www.envoyproxy.io/docs/envoy/latest/operations/admin -# PatchPolicy docs: https://gateway.envoyproxy.io/docs/tasks/extensibility/envoy-patch-policy/#enable-envoypatchpolicy - envoy-gateway.yaml: | - apiVersion: gateway.envoyproxy.io/v1alpha1 - kind: EnvoyGateway - provider: - type: Kubernetes - gateway: - controllerName: gateway.envoyproxy.io/gatewayclass-controller - extensionApis: - enableEnvoyPatchPolicy: true - enableBackend: true -# admin: -# enablePprof: true -# address: -# host: 127.0.0.1 -# port: 19000 -# enabledDumpConfig: true diff --git a/config/manifests/gateway/gateway.yaml b/config/manifests/gateway/gateway.yaml deleted file mode 100644 index 32f5d484d..000000000 --- a/config/manifests/gateway/gateway.yaml +++ /dev/null @@ -1,50 +0,0 @@ - ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: inference-gateway -spec: - gatewayClassName: inference-gateway - listeners: - - name: http - protocol: HTTP - port: 8080 - - name: llm-gw - protocol: HTTP - port: 8081 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: GatewayClass -metadata: - name: inference-gateway -spec: - controllerName: gateway.envoyproxy.io/gatewayclass-controller ---- -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: Backend -metadata: - name: backend-dummy -spec: - endpoints: - - fqdn: - # Both these values are arbitrary and unused as the PatchPolicy redirects requests. 
- hostname: 'foo.bar.com' - port: 8080 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: llm-route -spec: - parentRefs: - - name: inference-gateway - sectionName: llm-gw - rules: - - backendRefs: - - group: gateway.envoyproxy.io - kind: Backend - name: backend-dummy - timeouts: - request: "24h" - backendRequest: "24h" diff --git a/config/manifests/gateway/gke/gateway.yaml b/config/manifests/gateway/gke/gateway.yaml new file mode 100644 index 000000000..942cde5c9 --- /dev/null +++ b/config/manifests/gateway/gke/gateway.yaml @@ -0,0 +1,10 @@ +kind: Gateway +apiVersion: gateway.networking.k8s.io/v1 +metadata: + name: inference-gateway +spec: + gatewayClassName: gke-l7-regional-external-managed + listeners: + - name: http + port: 80 + protocol: HTTP diff --git a/config/manifests/gateway/gke/gcp-backend-policy.yaml b/config/manifests/gateway/gke/gcp-backend-policy.yaml new file mode 100644 index 000000000..519a5a930 --- /dev/null +++ b/config/manifests/gateway/gke/gcp-backend-policy.yaml @@ -0,0 +1,11 @@ +apiVersion: networking.gke.io/v1 +kind: GCPBackendPolicy +metadata: + name: inferencepool-backend-policy +spec: + targetRef: + group: "inference.networking.x-k8s.io" + kind: InferencePool + name: vllm-llama3-8b-instruct + default: + timeoutSec: 300 diff --git a/config/manifests/gateway/gke/healthcheck.yaml b/config/manifests/gateway/gke/healthcheck.yaml new file mode 100644 index 000000000..95f4f2d2b --- /dev/null +++ b/config/manifests/gateway/gke/healthcheck.yaml @@ -0,0 +1,16 @@ +kind: HealthCheckPolicy +apiVersion: networking.gke.io/v1 +metadata: + name: health-check-policy + namespace: default +spec: + targetRef: + group: "inference.networking.x-k8s.io" + kind: InferencePool + name: vllm-llama2-7b + default: + config: + type: HTTP + httpHealthCheck: + requestPath: /health + port: 8000 diff --git a/config/manifests/gateway/httproute-with-timeout.yaml b/config/manifests/gateway/httproute-with-timeout.yaml new file mode 100644 index 000000000..060f18c50 --- /dev/null +++ b/config/manifests/gateway/httproute-with-timeout.yaml @@ -0,0 +1,20 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: vllm-llama2-7b + matches: + - path: + type: PathPrefix + value: / + timeouts: + request: 300s diff --git a/config/manifests/gateway/httproute.yaml b/config/manifests/gateway/httproute.yaml new file mode 100644 index 000000000..5bd8bfb6c --- /dev/null +++ b/config/manifests/gateway/httproute.yaml @@ -0,0 +1,18 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: vllm-llama2-7b + matches: + - path: + type: PathPrefix + value: / diff --git a/config/manifests/gateway/istio/destination-rule.yaml b/config/manifests/gateway/istio/destination-rule.yaml new file mode 100644 index 000000000..f9cd0c3c5 --- /dev/null +++ b/config/manifests/gateway/istio/destination-rule.yaml @@ -0,0 +1,10 @@ +apiVersion: networking.istio.io/v1 +kind: DestinationRule +metadata: + name: epp-insecure-tls +spec: + host: vllm-llama2-7b-epp + trafficPolicy: + tls: + mode: SIMPLE + insecureSkipVerify: true diff --git 
a/config/manifests/gateway/istio/gateway.yaml b/config/manifests/gateway/istio/gateway.yaml new file mode 100644 index 000000000..dd762678e --- /dev/null +++ b/config/manifests/gateway/istio/gateway.yaml @@ -0,0 +1,10 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: inference-gateway +spec: + gatewayClassName: istio + listeners: + - name: http + port: 80 + protocol: HTTP diff --git a/config/manifests/gateway/kgateway/gateway.yaml b/config/manifests/gateway/kgateway/gateway.yaml new file mode 100644 index 000000000..7bcd08a6a --- /dev/null +++ b/config/manifests/gateway/kgateway/gateway.yaml @@ -0,0 +1,10 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: inference-gateway +spec: + gatewayClassName: kgateway + listeners: + - name: http + port: 80 + protocol: HTTP diff --git a/config/manifests/gateway/patch_policy.yaml b/config/manifests/gateway/patch_policy.yaml deleted file mode 100644 index 923ce22c6..000000000 --- a/config/manifests/gateway/patch_policy.yaml +++ /dev/null @@ -1,123 +0,0 @@ -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyPatchPolicy -metadata: - name: custom-response-patch-policy - namespace: default -spec: - targetRef: - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - type: JSONPatch - jsonPatches: - # Necessary to create a cluster of the type: ORIGINAL_DST to allow for - # direct pod scheduling. Which is heavily utilized in our scheduling. - # Specifically the field `original_dst_lb_config` allows us to enable - # `use_http_header` and `http_header_name`. - # Source: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/cluster.proto - - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" - name: original_destination_cluster - operation: - op: add - path: "" - value: - name: original_destination_cluster - type: ORIGINAL_DST - original_dst_lb_config: - use_http_header: true - http_header_name: "x-gateway-destination-endpoint" - connect_timeout: 1000s - lb_policy: CLUSTER_PROVIDED - dns_lookup_family: V4_ONLY - circuit_breakers: - thresholds: - - max_connections: 40000 - max_pending_requests: 40000 - max_requests: 40000 - - # This ensures that envoy accepts untrusted certificates. We tried to explicitly - # set TrustChainVerification to ACCEPT_UNSTRUSTED, but that actually didn't work - # and what worked is setting the common_tls_context to empty. - - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" - name: "envoyextensionpolicy/default/ext-proc-policy/extproc/0" - operation: - op: add - path: "/transport_socket" - value: - name: "envoy.transport_sockets.tls" - typed_config: - "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext" - common_tls_context: {} - - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" - name: default/inference-gateway/llm-gw - operation: - op: replace - path: "/virtual_hosts/0/routes/0/route/cluster" - value: original_destination_cluster -# Comment the below to disable full duplex streaming -# NOTE: As of https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/552 -# FULL_DUPLEX_STREAMED is the primary supported protocol for ext-proc. The buffered variant is no longer -# being actively developed, may be missing features/fixes, and will soon be removed. 
- - type: "type.googleapis.com/envoy.config.listener.v3.Listener" - name: "default/inference-gateway/llm-gw" - operation: - op: add - path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_body_mode" - value: FULL_DUPLEX_STREAMED - - type: "type.googleapis.com/envoy.config.listener.v3.Listener" - name: "default/inference-gateway/llm-gw" - operation: - op: add - path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_trailer_mode" - value: SEND - - type: "type.googleapis.com/envoy.config.listener.v3.Listener" - name: "default/inference-gateway/llm-gw" - operation: - op: add - path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_body_mode" - value: FULL_DUPLEX_STREAMED - - type: "type.googleapis.com/envoy.config.listener.v3.Listener" - name: "default/inference-gateway/llm-gw" - operation: - op: replace - path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_trailer_mode" - value: SEND - - type: "type.googleapis.com/envoy.config.listener.v3.Listener" - name: "default/inference-gateway/llm-gw" - operation: - op: replace - path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_header_mode" - value: SEND ---- -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyExtensionPolicy -metadata: - name: ext-proc-policy - namespace: default -spec: - extProc: - - backendRefs: - - group: "" - kind: Service - name: vllm-llama3-8b-instruct-epp - port: 9002 - processingMode: - allowModeOverride: true - request: - body: Buffered - response: - # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly. - # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly. 
-      messageTimeout: 1000s
-      backendSettings:
-        circuitBreaker:
-          maxConnections: 40000
-          maxPendingRequests: 40000
-          maxParallelRequests: 40000
-        timeout:
-          tcp:
-            connectTimeout: 24h
-  targetRef:
-    group: gateway.networking.k8s.io
-    kind: HTTPRoute
-    name: llm-route
diff --git a/config/manifests/gateway/traffic_policy.yaml b/config/manifests/gateway/traffic_policy.yaml
deleted file mode 100644
index e110f1733..000000000
--- a/config/manifests/gateway/traffic_policy.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: BackendTrafficPolicy
-metadata:
-  name: high-connection-route-policy
-spec:
-  targetRefs:
-    - group: gateway.networking.k8s.io
-      kind: HTTPRoute
-      name: llm-route
-  circuitBreaker:
-    maxConnections: 40000
-    maxPendingRequests: 40000
-    maxParallelRequests: 40000
-  timeout:
-    tcp:
-      connectTimeout: 24h
\ No newline at end of file
diff --git a/config/manifests/inferencepool.yaml b/config/manifests/inferencepool-resources.yaml
similarity index 99%
rename from config/manifests/inferencepool.yaml
rename to config/manifests/inferencepool-resources.yaml
index 639157c13..d0f36e832 100644
--- a/config/manifests/inferencepool.yaml
+++ b/config/manifests/inferencepool-resources.yaml
@@ -22,6 +22,7 @@ spec:
     - protocol: TCP
       port: 9002
       targetPort: 9002
+      appProtocol: http2
   type: ClusterIP
 ---
 apiVersion: apps/v1
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index 99b78129e..4548d5cd2 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -1,9 +1,12 @@
 # Getting started with Gateway API Inference Extension

-This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running!
+??? example "Experimental"
+
+    This project is still in an alpha state and breaking changes may occur in the future.
+
+This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running!

 ## **Prerequisites**
-
-- Envoy Gateway [v1.3.0](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
 - A cluster with:
   - Support for services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running).
     For example, with Kind, you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
@@ -39,11 +42,10 @@ This quickstart guide is intended for engineers familiar with k8s and model serv

    This setup is using the formal `vllm-cpu` image, which according to the documentation can run vLLM on x86 CPU platform.
    For this setup, we use approximately 9.5GB of memory and 12 CPUs for each replica.
-   While it is possible to deploy the model server with less resources, this is not recommended.
-   For example, in our tests, loading the model using 8GB of memory and 1 CPU was possible but took almost 3.5 minutes and inference requests took unreasonable time.
-   In general, there is a tradeoff between the memory and CPU we allocate to our pods and the performance. The more memory and CPU we allocate the better performance we can get.
-   After running multiple configurations of these values we decided in this sample to use 9.5GB of memory and 12 CPUs for each replica, which gives reasonable response times. You can increase those numbers and potentially may even get better response times.
-   For modifying the allocated resources, adjust the numbers in `./config/manifests/vllm/cpu-deployment.yaml` as needed.
+
+   While it is possible to deploy the model server with fewer resources, this is not recommended. For example, in our tests, loading the model using 8GB of memory and 1 CPU was possible, but it took almost 3.5 minutes and inference requests took an unreasonable amount of time. In general, there is a tradeoff between the memory and CPU we allocate to our pods and the performance: the more memory and CPU we allocate, the better the performance we can get.
+
+   After running multiple configurations of these values, we decided in this sample to use 9.5GB of memory and 12 CPUs for each replica, which gives reasonable response times. You can increase those numbers and may get even better response times. To modify the allocated resources, adjust the numbers in [cpu-deployment.yaml](https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml) as needed.
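+
+   For quick experiments, you can also patch the running deployment in place rather than editing the manifest. This is a minimal sketch: the deployment and container names below are placeholders, so substitute the ones defined in cpu-deployment.yaml.
+
+   ```bash
+   # Placeholder names; check cpu-deployment.yaml for the actual ones.
+   kubectl set resources deployment/my-vllm-deployment -c=vllm \
+     --requests=cpu=12,memory=9500Mi --limits=cpu=12,memory=9500Mi
+   # Wait for the new pods to roll out before sending traffic.
+   kubectl rollout status deployment/my-vllm-deployment
+   ```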
    Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
    ```bash
@@ -52,68 +54,180 @@ This quickstart guide is intended for engineers familiar with k8s and model serv

 ### Install the Inference Extension CRDs

-   ```bash
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml
-   ```
-
+=== "Latest Release"
+
+    ```bash
+    VERSION=v0.2.0
+    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$VERSION/manifests.yaml
+    ```
+
+=== "Dev Version"
+
+    ```bash
+    kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd
+    ```
+
 ### Deploy InferenceModel

    Deploy the sample InferenceModel which is configured to load balance traffic between the `food-review-0` and `food-review-1` [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server.
+
    ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml
    ```

-### Update Envoy Gateway Config to enable Patch Policy**
+### Deploy the InferencePool and Extension

-   Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run:
    ```bash
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml
-   kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml
    ```
-   Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again.
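+
+   To confirm the pool and its endpoint picker (EPP) came up, you can list the new resources. This is a sketch: the resource kinds come from the CRDs installed above, and the EPP Deployment is the one defined in inferencepool-resources.yaml.
+
+   ```bash
+   # Both custom resources should be listed.
+   kubectl get inferencepools,inferencemodels
+   # The endpoint picker Deployment from inferencepool-resources.yaml should become Available.
+   kubectl get deployments
+   ```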
-### Deploy Gateway
+### Deploy Inference Gateway

-   ```bash
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml
-   ```
-   > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./config/manifests/gateway/ext-proc.yaml` file, and an additional `./config/manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.***
+   Choose one of the following options to deploy an Inference Gateway.

-   Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status:
-   ```bash
-   $ kubectl get gateway inference-gateway
-   NAME                CLASS               ADDRESS   PROGRAMMED   AGE
-   inference-gateway   inference-gateway             True         22s
-   ```
-### Deploy the InferencePool and Extension
+=== "GKE"

-   ```bash
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml
-   ```
-### Deploy Envoy Gateway Custom Policies
+    1. Enable the Gateway API and configure proxy-only subnets when necessary. See [Deploy Gateways](https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-gateways)
+       for detailed instructions.
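+
+       For example, the Gateway API can be enabled on an existing cluster with a single command. This is a sketch with placeholder names: `my-cluster` and `us-central1` are assumptions, so substitute your own cluster and location.
+
+       ```bash
+       # Placeholder cluster/location; see the GKE docs linked above for details.
+       gcloud container clusters update my-cluster \
+         --location=us-central1 \
+         --gateway-api=standard
+       ```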
-   ```bash
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml
-   ```
-   > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further.
-
-### **OPTIONALLY**: Apply Traffic Policy
+    1. Deploy Gateway and HealthCheckPolicy resources

+       ```bash
+       kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml
+       kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/healthcheck.yaml
+       ```

+       Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status:
+       ```bash
+       $ kubectl get gateway inference-gateway
+       NAME                CLASS                              ADDRESS   PROGRAMMED   AGE
+       inference-gateway   gke-l7-regional-external-managed             True         22s
+       ```
+
+=== "Istio"
+
+    Please note that this feature is currently in an experimental phase and is not intended for production use.
+    The implementation and user experience are subject to change as we continue to iterate on this project.
+
+    1. Requirements
+
+       - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed.
+
+    1. Install Istio
+
+       ```bash
+       TAG=1.26-alpha.80c74f7f43482c226f4f4b10b4dda6261b67a71f
+       # on Linux
+       wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-linux-amd64.tar.gz
+       tar -xvf istioctl-$TAG-linux-amd64.tar.gz
+       # on macOS
+       wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-osx.tar.gz
+       tar -xvf istioctl-$TAG-osx.tar.gz
+       # on Windows
+       wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-win.zip
+       unzip istioctl-$TAG-win.zip
+
+       ./istioctl install --set tag=$TAG --set hub=gcr.io/istio-testing
+       ```
+
+    1. If you run the Endpoint Picker (EPP) with the `--secureServing` flag set to `true` (the default mode), it currently uses a self-signed certificate. As a security measure, Istio does not trust self-signed certificates by default. As a temporary workaround, you can apply the destination rule below to bypass TLS verification for the EPP. A more secure TLS implementation in EPP is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582).
+
+       ```bash
+       kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml
+       ```
+
+    1. Deploy Gateway
+
+       ```bash
+       kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml
+       ```
+
+    1. Label the gateway (note that the Gateway deployed above is named `inference-gateway`)
+
+       ```bash
+       kubectl label gateway inference-gateway istio.io/enable-inference-extproc=true
+       ```

    Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status:
    ```bash
    $ kubectl get gateway inference-gateway
    NAME                CLASS               ADDRESS   PROGRAMMED   AGE
    inference-gateway   inference-gateway             True         22s
    ```

+=== "Kgateway"
+
+    [Kgateway](https://kgateway.dev/) v2.0.0 adds support for inference extension as a **technical preview**. This means you should not
+    run Kgateway with the inference extension in production environments. Refer to [Issue 10411](https://github.com/kgateway-dev/kgateway/issues/10411)
+    for the list of caveats, supported features, etc.
+
+    1. Requirements
+
+       - [Helm](https://helm.sh/docs/intro/install/) installed.
+       - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed.
+
+    1. Install Kgateway CRDs
+
+       ```bash
+       # v2.0.0 is the Kgateway release referenced above.
+       VERSION=v2.0.0
+       helm upgrade -i --create-namespace --namespace kgateway-system --version $VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds
+       ```
+
+    1. Install Kgateway
+
+       ```bash
+       helm upgrade -i --namespace kgateway-system --version $VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway \
+         --set inferenceExtension.enabled=true
+       ```
+
+    1. Deploy Gateway
+
+       ```bash
+       kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml
+       ```
+
+       Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status:
+       ```bash
+       $ kubectl get gateway inference-gateway
+       NAME                CLASS      ADDRESS   PROGRAMMED   AGE
+       inference-gateway   kgateway             True         22s
+       ```
+
+### Deploy the HTTPRoute

    ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute.yaml
    ```

+### Configure Timeouts
+
+   Given that the default timeouts for the above implementations may be insufficient for most inference workloads, it is recommended to configure a timeout appropriate for your intended use case.
+
+=== "GKE"
+
+    ```bash
+    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gcp-backend-policy.yaml
+    ```
+
+=== "Istio"
+
+    ```bash
+    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute-with-timeout.yaml
+    ```
+
+=== "Kgateway"
+
+    ```bash
+    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute-with-timeout.yaml
+    ```

 ### Try it out

    Wait until the gateway is ready.
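+
+   Instead of polling manually, you can block until the Gateway reports readiness. A minimal sketch, assuming the Gateway is named `inference-gateway` in the current namespace, as in the manifests above:
+
+   ```bash
+   # Blocks until the Programmed condition becomes True (or the timeout elapses).
+   kubectl wait gateway/inference-gateway --for=condition=Programmed --timeout=180s
+   ```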
```bash IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') - PORT=8081 + PORT=80 curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ "model": "food-review", @@ -126,18 +240,32 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Cleanup The following cleanup assumes you would like to clean ALL resources that were created in this quickstart guide. - please be careful not to delete resources you'd like to keep. - ```bash - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/traffic_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found - kubectl delete secret hf-token --ignore-not-found - ``` + Please be careful not to delete resources you'd like to keep. + + 1. Uninstall the Inference Pool + + ```bash + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found + kubectl delete secret hf-token --ignore-not-found + ``` + + 1. 
Uninstall the Gateway + + ```bash + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/healthcheck.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gcp-backend-policy.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute.yaml --ignore-not-found + ``` + + 1. Uninstall the CRDs + + ```bash + kubectl delete -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd --ignore-not-found + ``` diff --git a/test/e2e/epp/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go index f9dea1ccf..643bbf753 100644 --- a/test/e2e/epp/e2e_suite_test.go +++ b/test/e2e/epp/e2e_suite_test.go @@ -75,7 +75,7 @@ const ( // inferModelManifest is the manifest for the inference model CRD. inferModelManifest = "../../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" // inferExtManifest is the manifest for the inference extension test resources. - inferExtManifest = "../../../config/manifests/inferencepool.yaml" + inferExtManifest = "../../../config/manifests/inferencepool-resources.yaml" // envoyManifest is the manifest for the envoy proxy test resources. envoyManifest = "../../testdata/envoy.yaml" // modelServerManifestFilepathEnvVar is the env var that holds absolute path to the manifest for the model server test resource.