diff --git a/deploy/components/vllm-p2p/vllm-deployment.yaml b/deploy/components/vllm-p2p/vllm-deployment.yaml
index 19fd59c21..c9964962e 100644
--- a/deploy/components/vllm-p2p/vllm-deployment.yaml
+++ b/deploy/components/vllm-p2p/vllm-deployment.yaml
@@ -31,13 +31,12 @@ spec:
             - "-c"
           args:
             - |
-              export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}}:80 && \
+              export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}} && \
               vllm serve ${MODEL_NAME} \
               --host 0.0.0.0 \
               --port 8000 \
-              --enable-chunked-prefill false \
               --max-model-len ${MAX_MODEL_LEN} \
-              --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
+              --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
           ports:
             - name: http
               containerPort: 8000
@@ -78,6 +77,10 @@ spec:
                 secretKeyRef:
                   name: ${HF_SECRET_NAME}
                   key: ${HF_SECRET_KEY}
+            - name: VLLM_ENABLE_V1_MULTIPROCESSING
+              value: "1"
+            - name: VLLM_WORKER_MULTIPROC_METHOD
+              value: spawn
             - name: LMCACHE_LOOKUP_URL
               value: ${REDIS_HOST}:${REDIS_PORT}
             - name: LMCACHE_ENABLE_DEBUG
diff --git a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml
index 00c87fbbf..a6b1d4a2b 100644
--- a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml
+++ b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml
@@ -29,4 +29,12 @@ spec:
               valueFrom:
                 secretKeyRef:
                   name: hf-token
-                  key: ${HF_SECRET_KEY}
\ No newline at end of file
+                  key: ${HF_SECRET_KEY}
+            - name: ENABLE_KVCACHE_AWARE_SCORER
+              value: "true"
+            - name: KVCACHE_AWARE_SCORER_WEIGHT
+              value: "2.0"
+            - name: ENABLE_LOAD_AWARE_SCORER
+              value: "true"
+            - name: LOAD_AWARE_SCORER_WEIGHT
+              value: "1.0"
diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh
index 21564e9cc..e9d92c174 100755
--- a/scripts/kubernetes-dev-env.sh
+++ b/scripts/kubernetes-dev-env.sh
@@ -65,10 +65,10 @@ case "${VLLM_MODE}" in
     export LORA_ADAPTER_SYNCER_TAG="${LORA_ADAPTER_SYNCER_TAG:-v20250425-ddc3d69}"
 
 elif [[ "$VLLM_MODE" == "vllm-p2p" ]]; then
-  export VLLM_IMAGE="${VLLM_IMAGE:-lmcache/vllm-openai}"
-  export VLLM_TAG="${VLLM_TAG:-2025-03-10}"
-  export EPP_IMAGE="${EPP_IMAGE:- quay.io/vmaroon/gateway-api-inference-extension/epp}"
-  export EPP_TAG="${EPP_TAG:-kv-aware}"
+  export VLLM_IMAGE="${VLLM_IMAGE:-quay.io/llm-d/llm-d-dev}"
+  export VLLM_TAG="${VLLM_TAG:-lmcache-0.0.6-amd64}"
+  export EPP_IMAGE="${EPP_IMAGE:-quay.io/llm-d/llm-d-gateway-api-inference-extension-dev}"
+  export EPP_TAG="${EPP_TAG:-0.0.5-amd64}"
   export MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
   export PVC_NAME="${PVC_NAME:-vllm-p2p-storage-claim}"
   export PVC_ACCESS_MODE="${PVC_ACCESS_MODE:-ReadWriteOnce}"
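
A minimal post-apply sanity check for the changes above — a sketch only, assuming `scripts/kubernetes-dev-env.sh` is sourced for its exports and that the vLLM workload is deployed under the name `vllm` (an illustrative name, not taken from this diff):

# Render the dev environment with the new image/tag defaults (script in this repo).
export VLLM_MODE=vllm-p2p
source scripts/kubernetes-dev-env.sh

# Confirm the new multiprocessing env vars and the V1 connector landed in the
# deployed pod spec. "vllm" is an assumed deployment name for illustration.
kubectl get deployment vllm -o yaml | \
  grep -E 'VLLM_ENABLE_V1_MULTIPROCESSING|VLLM_WORKER_MULTIPROC_METHOD|LMCacheConnectorV1'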