diff --git a/examples/poc/README.md b/examples/poc/README.md index 6c1cbdc9..739084a9 100644 --- a/examples/poc/README.md +++ b/examples/poc/README.md @@ -17,29 +17,41 @@ This project sets up an Envoy gateway with a custom external processing which im ### Steps 1. **Deploy Sample vLLM Application** + NOTE: Create a HuggingFace API token and store it in a secret named `hf-token` with key `token`. This is configured in the `HUGGING_FACE_HUB_TOKEN` and `HF_TOKEN` environment variables in `./manifests/samples/vllm-lora-deployment.yaml`. ```bash - kubectl apply -f ./manifests/samples/vllm-lora-deployment.yaml - kubectl apply -f ./manifests/samples/vllm-lora-service.yaml + kubectl apply -f ./manifests/vllm/vllm-lora-deployment.yaml + kubectl apply -f ./manifests/vllm/vllm-lora-service.yaml ``` -2. **Install GatewayClass with Ext Proc** - A custom GatewayClass `llm-gateway` which is configured with the llm routing ext proc will be installed into the `llm-gateway` namespace. It's configured to listen on port 8081 for traffic through ext-proc (in addition to the default 8080), see the `EnvoyProxy` configuration in `installation.yaml`. When you create Gateways, make sure the `llm-gateway` GatewayClass is used. +1. **Update Envoy Gateway Config to enable Patch Policy** + + Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run: + ```bash + kubectl apply -f ./manifests/gateway/enable_patch_policy.yaml + kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system - NOTE: Ensure the `llm-route-ext-proc` deployment is updated with the pod names and internal IP addresses of the vLLM replicas. This step is crucial for the correct routing of requests based on headers. This won't be needed once we make ext proc dynamically read the pods. 
+ ``` + Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. + + +1. **Deploy Gateway** ```bash - kubectl apply -f ./manifests/installation.yaml + kubectl apply -f ./manifests/gateway/gateway.yaml ``` -3. **Deploy Gateway** +1. **Deploy Ext-Proc** ```bash - kubectl apply -f ./manifests/samples/gateway.yaml + kubectl apply -f ./manifests/gateway/ext_proc.yaml + kubectl apply -f ./manifests/gateway/patch_policy.yaml ``` + **NOTE**: Ensure the `instance-gateway-ext-proc` deployment is updated with the pod names and internal IP addresses of the vLLM replicas. This step is crucial for the correct routing of requests based on headers. This won't be needed once we make ext proc dynamically read the pods. + +1. **Try it out** -4. **Try it out** Wait until the gateway is ready. ```bash diff --git a/examples/poc/manifests/gateway/enable_patch_policy.yaml b/examples/poc/manifests/gateway/enable_patch_policy.yaml new file mode 100644 index 00000000..c1d00e9a --- /dev/null +++ b/examples/poc/manifests/gateway/enable_patch_policy.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: envoy-gateway-config + namespace: envoy-gateway-system +data: +# This manifest's main purpose is to set `enableEnvoyPatchPolicy` to `true`. +# Any field under `admin` is optional, and only for enabling the admin endpoints, for debugging. 
+# Admin Interface: https://www.envoyproxy.io/docs/envoy/latest/operations/admin +# PatchPolicy docs: https://gateway.envoyproxy.io/docs/tasks/extensibility/envoy-patch-policy/#enable-envoypatchpolicy + envoy-gateway.yaml: | + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyGateway + provider: + type: Kubernetes + gateway: + controllerName: gateway.envoyproxy.io/gatewayclass-controller + extensionApis: + enableEnvoyPatchPolicy: true + enableBackend: true +# admin: +# enablePprof: true +# address: +# host: 127.0.0.1 +# port: 19000 +# enableDumpConfig: true diff --git a/examples/poc/manifests/gateway/ext_proc.yaml b/examples/poc/manifests/gateway/ext_proc.yaml new file mode 100644 index 00000000..6112fa9e --- /dev/null +++ b/examples/poc/manifests/gateway/ext_proc.yaml @@ -0,0 +1,69 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: instance-gateway-ext-proc + namespace: default + labels: + app: instance-gateway-ext-proc +spec: + replicas: 1 + selector: + matchLabels: + app: instance-gateway-ext-proc + template: + metadata: + labels: + app: instance-gateway-ext-proc + spec: + containers: + - name: instance-gateway-ext-proc + image: ghcr.io/tomatillo-and-multiverse/ext-proc:demo + args: + #TODO: specify label selector and dynamically update pods + - -pods + - "vllm-78665f78c4-h4kx4,vllm-78665f78c4-hnz84" + - -podIPs + - "10.24.11.6:8000,10.24.5.7:8000" + - -enable-fairness + - "false" + ports: + - containerPort: 9002 + - name: curl + image: curlimages/curl + command: ["sleep", "3600"] +--- +apiVersion: v1 +kind: Service +metadata: + name: instance-gateway-ext-proc + namespace: default +spec: + selector: + app: instance-gateway-ext-proc + ports: + - protocol: TCP + port: 9002 + targetPort: 9002 + type: ClusterIP +--- +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyExtensionPolicy +metadata: + name: ext-proc-policy + namespace: default +spec: + extProc: + - backendRefs: + - group: "" + kind: Service + name: instance-gateway-ext-proc + port: 
9002 + processingMode: + request: + body: Buffered + response: + messageTimeout: 5s + targetRef: + group: gateway.networking.k8s.io + kind: HTTPRoute + name: llm-route \ No newline at end of file diff --git a/examples/poc/manifests/gateway/gateway.yaml b/examples/poc/manifests/gateway/gateway.yaml new file mode 100644 index 00000000..b964f911 --- /dev/null +++ b/examples/poc/manifests/gateway/gateway.yaml @@ -0,0 +1,47 @@ + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: +spec: + gatewayClassName: + listeners: + - name: http + protocol: HTTP + port: 8080 + - name: llm-gw + protocol: HTTP + port: 8081 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: GatewayClass +metadata: + name: +spec: + controllerName: gateway.envoyproxy.io/gatewayclass-controller +--- +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: Backend +metadata: + name: backend-dummy +spec: + endpoints: + - fqdn: + # Both these values are arbitrary and unused as the PatchPolicy redirects requests. + hostname: 'foo.bar.com' + port: 8080 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route +spec: + parentRefs: + - name: inference-gateway + sectionName: llm-gw + rules: + - backendRefs: + - group: gateway.envoyproxy.io + kind: Backend + name: backend-dummy \ No newline at end of file diff --git a/examples/poc/manifests/gateway/patch_policy.yaml b/examples/poc/manifests/gateway/patch_policy.yaml new file mode 100644 index 00000000..b7681954 --- /dev/null +++ b/examples/poc/manifests/gateway/patch_policy.yaml @@ -0,0 +1,38 @@ +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyPatchPolicy +metadata: + name: custom-response-patch-policy + namespace: default +spec: + targetRef: + group: gateway.networking.k8s.io + kind: Gateway + name: + type: JSONPatch + jsonPatches: + # Necessary to create a cluster of the type: ORIGINAL_DST to allow for + # direct pod scheduling. Which is heavily utilized in our scheduling. 
+ # Specifically the field `original_dst_lb_config` allows us to enable + # `use_http_header` and `http_header_name`. + # Source: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/cluster.proto + - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" + name: original_destination_cluster + operation: + op: add + path: "" + value: + name: original_destination_cluster + type: ORIGINAL_DST + original_dst_lb_config: + use_http_header: true + http_header_name: "target-pod" + connect_timeout: 6s + lb_policy: CLUSTER_PROVIDED + dns_lookup_family: V4_ONLY + + - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" + name: default//llm-gw + operation: + op: replace + path: "/virtual_hosts/1/routes/0/route/cluster" + value: original_destination_cluster \ No newline at end of file diff --git a/examples/poc/manifests/installation.yaml b/examples/poc/manifests/installation.yaml deleted file mode 100644 index 57ecd185..00000000 --- a/examples/poc/manifests/installation.yaml +++ /dev/null @@ -1,155 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: llm-gateway - ---- -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyProxy -metadata: - name: llm-route-envoy-config - namespace: llm-gateway -spec: - provider: - type: Kubernetes - kubernetes: - envoyService: - patch: - type: StrategicMerge - value: - spec: - ports: - - name: http-8081 - port: 8081 - protocol: TCP - targetPort: 8081 - bootstrap: - type: Merge - value: | - static_resources: - listeners: - - name: listener_0 - address: - socket_address: - address: 0.0.0.0 - port_value: 8081 - filter_chains: - - filters: - - name: envoy.filters.network.http_connection_manager - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager - stat_prefix: http - codec_type: AUTO - route_config: - name: local_route - virtual_hosts: - - name: backend - domains: ["*"] - routes: - - match: - prefix: "/" - route: - cluster: 
original_destination_cluster - timeout: 1000s # Increase route timeout - http_filters: - - name: envoy.filters.http.ext_proc - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor - failure_mode_allow: false - grpc_service: - envoy_grpc: - cluster_name: ext_proc_cluster - processing_mode: - request_header_mode: "SEND" - response_header_mode: "SEND" - request_body_mode: "BUFFERED" - response_body_mode: "NONE" - request_trailer_mode: "SKIP" - response_trailer_mode: "SKIP" - - name: envoy.filters.http.router - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - clusters: - - name: original_destination_cluster - type: ORIGINAL_DST - original_dst_lb_config: - use_http_header: true - http_header_name: "target-pod" - connect_timeout: 6s - lb_policy: CLUSTER_PROVIDED - dns_lookup_family: V4_ONLY - - name: ext_proc_cluster - connect_timeout: 1000s - type: LOGICAL_DNS - http2_protocol_options: {} - lb_policy: ROUND_ROBIN - load_assignment: - cluster_name: ext_proc_cluster - endpoints: - - lb_endpoints: - - endpoint: - address: - socket_address: - address: llm-route-ext-proc.llm-gateway.svc.cluster.local - port_value: 9002 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: GatewayClass -metadata: - name: llm-gateway -spec: - controllerName: gateway.envoyproxy.io/gatewayclass-controller - parametersRef: - group: gateway.envoyproxy.io - kind: EnvoyProxy - name: llm-route-envoy-config - namespace: llm-gateway - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: llm-route-ext-proc - namespace: llm-gateway - labels: - app: llm-route-ext-proc -spec: - replicas: 1 - selector: - matchLabels: - app: llm-route-ext-proc - template: - metadata: - labels: - app: llm-route-ext-proc - spec: - containers: - - name: llm-route-ext-proc - image: ghcr.io/tomatillo-and-multiverse/ext-proc:demo - args: - #TODO: specify label selector and dynamically update pods - - -pods - - 
"vllm-78665f78c4-h4kx4,vllm-78665f78c4-hnz84" - - -podIPs - - "10.24.11.6:8000,10.24.5.7:8000" - - -enable-fairness - - "false" - ports: - - containerPort: 9002 - - name: curl - image: curlimages/curl - command: ["sleep", "3600"] ---- -apiVersion: v1 -kind: Service -metadata: - name: llm-route-ext-proc - namespace: llm-gateway -spec: - selector: - app: llm-route-ext-proc - ports: - - protocol: TCP - port: 9002 - targetPort: 9002 - type: ClusterIP diff --git a/examples/poc/manifests/samples/gateway.yaml b/examples/poc/manifests/samples/gateway.yaml deleted file mode 100644 index 0f3f1803..00000000 --- a/examples/poc/manifests/samples/gateway.yaml +++ /dev/null @@ -1,12 +0,0 @@ - ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: llm-gateway -spec: - gatewayClassName: llm-gateway - listeners: - - name: http - protocol: HTTP - port: 8080 diff --git a/examples/poc/manifests/samples/vllm-lora-deployment.yaml b/examples/poc/manifests/vllm/vllm-lora-deployment.yaml similarity index 100% rename from examples/poc/manifests/samples/vllm-lora-deployment.yaml rename to examples/poc/manifests/vllm/vllm-lora-deployment.yaml diff --git a/examples/poc/manifests/samples/vllm-lora-service.yaml b/examples/poc/manifests/vllm/vllm-lora-service.yaml similarity index 100% rename from examples/poc/manifests/samples/vllm-lora-service.yaml rename to examples/poc/manifests/vllm/vllm-lora-service.yaml diff --git a/pkg/README.md b/pkg/README.md new file mode 100644 index 00000000..eee9a68e --- /dev/null +++ b/pkg/README.md @@ -0,0 +1,48 @@ +## Quickstart + +### Steps + +1. **Deploy Sample vLLM Application** + + A sample vLLM deployment with the proper protocol to work with LLM Instance Gateway can be found [here](https://github.com/kubernetes-sigs/llm-instance-gateway/blob/6f9869d6595d2d0f8e6febcbec0f348cb44a3012/examples/poc/manifests/samples/vllm-lora-deployment.yaml#L18). + +1. 
**Update Envoy Gateway Config to enable Patch Policy** + + Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run: + ```bash + kubectl apply -f ./manifests/enable_patch_policy.yaml + kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system + + ``` + Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. + + +1. **Deploy Gateway** + + ```bash + kubectl apply -f ./manifests/gateway.yaml + ``` + +1. **Deploy Ext-Proc** + + ```bash + kubectl apply -f ./manifests/ext_proc.yaml + kubectl apply -f ./manifests/patch_policy.yaml + ``` + **NOTE**: Ensure the `instance-gateway-ext-proc` deployment is updated with the pod names and internal IP addresses of the vLLM replicas. This step is crucial for the correct routing of requests based on headers. This won't be needed once we make ext proc dynamically read the pods. + +1. **Try it out** + + Wait until the gateway is ready. + + ```bash + IP=$(kubectl get gateway/llm-gateway -o jsonpath='{.status.addresses[0].value}') + PORT=8081 + + curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ + "model": "tweet-summary", + "prompt": "Write as if you were a critic: San Francisco", + "max_tokens": 100, + "temperature": 0 + }' + ``` \ No newline at end of file diff --git a/pkg/manifests/enable_patch_policy.yaml b/pkg/manifests/enable_patch_policy.yaml new file mode 100644 index 00000000..c1d00e9a --- /dev/null +++ b/pkg/manifests/enable_patch_policy.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: envoy-gateway-config + namespace: envoy-gateway-system +data: +# This manifest's main purpose is to set `enableEnvoyPatchPolicy` to `true`. +# Any field under `admin` is optional, and only for enabling the admin endpoints, for debugging. 
+# Admin Interface: https://www.envoyproxy.io/docs/envoy/latest/operations/admin +# PatchPolicy docs: https://gateway.envoyproxy.io/docs/tasks/extensibility/envoy-patch-policy/#enable-envoypatchpolicy + envoy-gateway.yaml: | + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyGateway + provider: + type: Kubernetes + gateway: + controllerName: gateway.envoyproxy.io/gatewayclass-controller + extensionApis: + enableEnvoyPatchPolicy: true + enableBackend: true +# admin: +# enablePprof: true +# address: +# host: 127.0.0.1 +# port: 19000 +# enableDumpConfig: true diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml new file mode 100644 index 00000000..07babb8a --- /dev/null +++ b/pkg/manifests/ext_proc.yaml @@ -0,0 +1,69 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: instance-gateway-ext-proc + namespace: default + labels: + app: instance-gateway-ext-proc +spec: + replicas: 1 + selector: + matchLabels: + app: instance-gateway-ext-proc + template: + metadata: + labels: + app: instance-gateway-ext-proc + spec: + containers: + - name: instance-gateway-ext-proc + image: + args: + #TODO: specify label selector and dynamically update pods + - -pods + - "vllm-78665f78c4-h4kx4,vllm-78665f78c4-hnz84" + - -podIPs + - "10.24.11.6:8000,10.24.5.7:8000" + - -enable-fairness + - "false" + ports: + - containerPort: 9002 + - name: curl + image: curlimages/curl + command: ["sleep", "3600"] +--- +apiVersion: v1 +kind: Service +metadata: + name: instance-gateway-ext-proc + namespace: default +spec: + selector: + app: instance-gateway-ext-proc + ports: + - protocol: TCP + port: 9002 + targetPort: 9002 + type: ClusterIP +--- +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyExtensionPolicy +metadata: + name: ext-proc-policy + namespace: default +spec: + extProc: + - backendRefs: + - group: "" + kind: Service + name: instance-gateway-ext-proc + port: 9002 + processingMode: + request: + body: Buffered + response: + messageTimeout: 5s + targetRef: + 
group: gateway.networking.k8s.io + kind: HTTPRoute + name: llm-route \ No newline at end of file diff --git a/pkg/manifests/gateway.yaml b/pkg/manifests/gateway.yaml new file mode 100644 index 00000000..b964f911 --- /dev/null +++ b/pkg/manifests/gateway.yaml @@ -0,0 +1,47 @@ + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: +spec: + gatewayClassName: + listeners: + - name: http + protocol: HTTP + port: 8080 + - name: llm-gw + protocol: HTTP + port: 8081 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: GatewayClass +metadata: + name: +spec: + controllerName: gateway.envoyproxy.io/gatewayclass-controller +--- +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: Backend +metadata: + name: backend-dummy +spec: + endpoints: + - fqdn: + # Both these values are arbitrary and unused as the PatchPolicy redirects requests. + hostname: 'foo.bar.com' + port: 8080 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route +spec: + parentRefs: + - name: inference-gateway + sectionName: llm-gw + rules: + - backendRefs: + - group: gateway.envoyproxy.io + kind: Backend + name: backend-dummy \ No newline at end of file diff --git a/pkg/manifests/patch_policy.yaml b/pkg/manifests/patch_policy.yaml new file mode 100644 index 00000000..b7681954 --- /dev/null +++ b/pkg/manifests/patch_policy.yaml @@ -0,0 +1,38 @@ +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyPatchPolicy +metadata: + name: custom-response-patch-policy + namespace: default +spec: + targetRef: + group: gateway.networking.k8s.io + kind: Gateway + name: + type: JSONPatch + jsonPatches: + # Necessary to create a cluster of the type: ORIGINAL_DST to allow for + # direct pod scheduling. Which is heavily utilized in our scheduling. + # Specifically the field `original_dst_lb_config` allows us to enable + # `use_http_header` and `http_header_name`. 
+ # Source: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/cluster.proto + - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" + name: original_destination_cluster + operation: + op: add + path: "" + value: + name: original_destination_cluster + type: ORIGINAL_DST + original_dst_lb_config: + use_http_header: true + http_header_name: "target-pod" + connect_timeout: 6s + lb_policy: CLUSTER_PROVIDED + dns_lookup_family: V4_ONLY + + - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" + name: default//llm-gw + operation: + op: replace + path: "/virtual_hosts/1/routes/0/route/cluster" + value: original_destination_cluster \ No newline at end of file diff --git a/pkg/placeholder.md b/pkg/placeholder.md deleted file mode 100644 index e69de29b..00000000