# Patch for config/manifests/vllm/cpu-deployment.yaml, touching the "lora"
# container that runs `python3 -m vllm.entrypoints.openai.api_server`.
#
# The three hunks below make these changes:
#   1. Image: replaces seedjeffwan/vllm-cpu-env:bb392af4-20250203 with
#      public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.7.2.
#   2. Args: adds "--max-loras" "4" ahead of "--lora-modules", and repoints the
#      tweet-summary-0 / tweet-summary-1 adapters from the shared hub snapshot
#      path to /adapters/ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora_0 and
#      ..._1 respectively.
#   3. Env: adds VLLM_CPU_KVCACHE_SPACE="4" right after
#      VLLM_ALLOW_RUNTIME_LORA_UPDATING.
#
# NOTE(review): nothing visible in this patch creates the new /adapters/ai-blond/
#   directories - confirm that whatever populates /adapters writes to them.
# NOTE(review): VLLM_CPU_KVCACHE_SPACE is presumably sized in GiB - confirm
#   against the vLLM CPU backend documentation.
# NOTE(review): the patch text below appears collapsed onto one line; its
#   original line breaks and YAML indentation cannot be recovered from here, so
#   it is left untouched.
diff --git a/config/manifests/vllm/cpu-deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml index a0925c837..76865e4cd 100644 --- a/config/manifests/vllm/cpu-deployment.yaml +++ b/config/manifests/vllm/cpu-deployment.yaml @@ -14,7 +14,7 @@ spec: spec: containers: - name: lora - image: "seedjeffwan/vllm-cpu-env:bb392af4-20250203" + image: "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.7.2" # formal images can be found in https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo imagePullPolicy: Always command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] args: @@ -23,9 +23,11 @@ spec: - "--port" - "8000" - "--enable-lora" + - "--max-loras" + - "4" - "--lora-modules" - - '{"name": "tweet-summary-0", "path": "/adapters/hub/models--ai-blond--Qwen-Qwen2.5-Coder-1.5B-Instruct-lora/snapshots/9cde18d8ed964b0519fb481cca6acd936b2ca811"}' - - '{"name": "tweet-summary-1", "path": "/adapters/hub/models--ai-blond--Qwen-Qwen2.5-Coder-1.5B-Instruct-lora/snapshots/9cde18d8ed964b0519fb481cca6acd936b2ca811"}' + - '{"name": "tweet-summary-0", "path": "/adapters/ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora_0"}' + - '{"name": "tweet-summary-1", "path": "/adapters/ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora_1"}' env: - name: PORT value: "8000" @@ -36,6 +38,8 @@ spec: key: token - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING value: "true" + - name: VLLM_CPU_KVCACHE_SPACE + value: "4" ports: - containerPort: 8000 name: http