diff --git a/config/manifests/ext_proc.yaml b/config/manifests/ext_proc.yaml
index f96113e1c..60a0fc3e9 100644
--- a/config/manifests/ext_proc.yaml
+++ b/config/manifests/ext_proc.yaml
@@ -44,11 +44,11 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
   labels:
-  name: vllm-llama2-7b-pool
+  name: my-pool
 spec:
   targetPortNumber: 8000
   selector:
-    app: vllm-llama2-7b-pool
+    app: my-pool
   extensionRef:
     name: inference-gateway-ext-proc
 ---
@@ -75,7 +75,7 @@ spec:
         imagePullPolicy: Always
         args:
         - -poolName
-        - "vllm-llama2-7b-pool"
+        - "my-pool"
         - -v
         - "3"
         - -grpcPort
diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml
index 57240298a..94c36d845 100644
--- a/config/manifests/inferencemodel.yaml
+++ b/config/manifests/inferencemodel.yaml
@@ -6,7 +6,7 @@ spec:
   modelName: tweet-summary
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b-pool
+    name: my-pool
   targetModels:
   - name: tweet-summary-1
     weight: 100
diff --git a/config/manifests/vllm/cpu-deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml
new file mode 100644
index 000000000..a0925c837
--- /dev/null
+++ b/config/manifests/vllm/cpu-deployment.yaml
@@ -0,0 +1,101 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-pool
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: my-pool
+  template:
+    metadata:
+      labels:
+        app: my-pool
+    spec:
+      containers:
+      - name: lora
+        image: "seedjeffwan/vllm-cpu-env:bb392af4-20250203"
+        imagePullPolicy: Always
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args:
+        - "--model"
+        - "Qwen/Qwen2.5-1.5B-Instruct"
+        - "--port"
+        - "8000"
+        - "--enable-lora"
+        - "--lora-modules"
+        - '{"name": "tweet-summary-0", "path": "/adapters/hub/models--ai-blond--Qwen-Qwen2.5-Coder-1.5B-Instruct-lora/snapshots/9cde18d8ed964b0519fb481cca6acd936b2ca811"}'
+        - '{"name": "tweet-summary-1", "path": "/adapters/hub/models--ai-blond--Qwen-Qwen2.5-Coder-1.5B-Instruct-lora/snapshots/9cde18d8ed964b0519fb481cca6acd936b2ca811"}'
+        env:
+        - name: PORT
+          value: "8000"
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token
+              key: token
+        - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
+          value: "true"
+        ports:
+        - containerPort: 8000
+          name: http
+          protocol: TCP
+        livenessProbe:
+          failureThreshold: 240
+          httpGet:
+            path: /health
+            port: http
+            scheme: HTTP
+          initialDelaySeconds: 5
+          periodSeconds: 5
+          successThreshold: 1
+          timeoutSeconds: 1
+        readinessProbe:
+          failureThreshold: 600
+          httpGet:
+            path: /health
+            port: http
+            scheme: HTTP
+          initialDelaySeconds: 5
+          periodSeconds: 5
+          successThreshold: 1
+          timeoutSeconds: 1
+        volumeMounts:
+        - mountPath: /data
+          name: data
+        - mountPath: /dev/shm
+          name: shm
+        - name: adapters
+          mountPath: "/adapters"
+      initContainers:
+      - name: adapter-loader
+        image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
+        command: ["python"]
+        args:
+        - ./pull_adapters.py
+        - --adapter
+        - ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora
+        - --duplicate-count
+        - "4"
+        env:
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token
+              key: token
+        - name: HF_HOME
+          value: /adapters
+        volumeMounts:
+        - name: adapters
+          mountPath: "/adapters"
+      restartPolicy: Always
+      schedulerName: default-scheduler
+      terminationGracePeriodSeconds: 30
+      volumes:
+      - name: data
+        emptyDir: {}
+      - name: shm
+        emptyDir:
+          medium: Memory
+      - name: adapters
+        emptyDir: {}
diff --git a/config/manifests/vllm/deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml
similarity index 97%
rename from config/manifests/vllm/deployment.yaml
rename to config/manifests/vllm/gpu-deployment.yaml
index 51689c9f2..d16a46a45 100644
--- a/config/manifests/vllm/deployment.yaml
+++ b/config/manifests/vllm/gpu-deployment.yaml
@@ -1,16 +1,16 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama2-7b-pool
+  name: my-pool
 spec:
   replicas: 3
   selector:
     matchLabels:
-      app: vllm-llama2-7b-pool
+      app: my-pool
   template:
     metadata:
       labels:
-        app: vllm-llama2-7b-pool
+        app: my-pool
     spec:
       containers:
       - name: lora
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index 2949d387b..976368ac9 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -5,19 +5,40 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 ## **Prerequisites**
 - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
 - A cluster with:
-  - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
-  you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
-  - 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/deployment.yaml` as needed.
+  - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running).
+    For example, with Kind, you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
 
 ## **Steps**
 
 ### Deploy Sample Model Server
 
+   This quickstart guide contains two options for setting up a model server:
+
+   1. GPU-based model server.
+      Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf).
+
+   1. CPU-based model server (not using GPUs).
+      Requirements: a Hugging Face access token that grants access to the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
+
+   Choose one of these options and follow the steps below. Please do not deploy both, as the deployments have the same name and will overwrite each other.
+
+#### GPU-Based Model Server
+
+   For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed.
    Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model.
    Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
    ```bash
    kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/deployment.yaml
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml
+   ```
+
+#### CPU-Based Model Server
+
+   Create a Hugging Face secret to download the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct). Ensure that the token grants access to this model.
+   Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
+   ```bash
+   kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Qwen
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml
    ```
 
 ### Install the Inference Extension CRDs
@@ -49,7 +70,7 @@
    ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml
    ```
-   > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./manifests/gateway/ext-proc.yaml` file, and an additional `./manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.***
+   > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./config/manifests/gateway/ext-proc.yaml` file, and an additional `./config/manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy is very useful.***
 
    Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status:
    ```bash