diff --git a/examples/poc/manifests/llmservice.yaml b/examples/poc/manifests/llmservice.yaml
index 575cd8e2e..93a0e321d 100644
--- a/examples/poc/manifests/llmservice.yaml
+++ b/examples/poc/manifests/llmservice.yaml
@@ -1,4 +1,14 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: LLMServerPool
+metadata:
+  labels:
+  name: vllm-llama2-7b-pool
+spec:
+  targetPort: 8000
+  modelServerSelector:
+    "app": "vllm-llama2-7b-pool"
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: LLMService
 metadata:
   labels:
@@ -7,17 +17,84 @@ metadata:
   name: llmservice-sample
 spec:
   models:
-  - name: sql-code-assist
-  - name: npc-bot
+  - name: sql-lora
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora
+      weight: 100
+  - name: sql-lora-0
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-0
+      weight: 100
+  - name: sql-lora-1
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-1
+      weight: 100
+  - name: sql-lora-2
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-2
+      weight: 100
+  - name: sql-lora-3
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-3
+      weight: 100
+  - name: sql-lora-4
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-4
+      weight: 100
+  - name: tweet-summary
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary
+      weight: 100
+  - name: tweet-summary-0
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-0
+      weight: 100
+  - name: tweet-summary-1
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-1
+      weight: 100
+  - name: tweet-summary-2
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-2
+      weight: 100
+  - name: tweet-summary-3
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-3
+      weight: 100
+  - name: tweet-summary-4
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-4
+      weight: 100
+  - name: meta-llama/Llama-2-7b-hf
     objective:
       desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
     targetModels:
-    - name: npc-bot-v1
-      weight: 50
-    - name: npc-bot-v2
-      weight: 50
+    - name: meta-llama/Llama-2-7b-hf
+      weight: 100
   poolRef:
-  - kind: LLMServerPool
-    name: test-pool
-  - name: gemini-pool
-    kind: LLMServerPool
\ No newline at end of file
+    name: vllm-llama2-7b-pool
diff --git a/examples/poc/manifests/vllm/vllm-lora-deployment.yaml b/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
index fc9c1d6b0..6f9498b4f 100644
--- a/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
+++ b/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
@@ -1,17 +1,32 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-llama2-7b-pool
+spec:
+  selector:
+    app: vllm-llama2-7b-pool
+  ports:
+    - protocol: TCP
+      port: 8000
+      targetPort: 8000
+  type: ClusterIP
+
+---
+
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm
+  name: vllm-llama2-7b-pool
   namespace: default
 spec:
-  replicas: 6
+  replicas: 3
   selector:
     matchLabels:
-      app: vllm
+      app: vllm-llama2-7b-pool
   template:
     metadata:
       labels:
-        app: vllm
+        app: vllm-llama2-7b-pool
     spec:
       containers:
         - name: lora
diff --git a/examples/poc/manifests/vllm/vllm-lora-service.yaml b/examples/poc/manifests/vllm/vllm-lora-service.yaml
deleted file mode 100644
index ae55ec650..000000000
--- a/examples/poc/manifests/vllm/vllm-lora-service.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: vllm-lora
-  namespace: default
-spec:
-  selector:
-    app: vllm
-  ports:
-    - protocol: TCP
-      port: 8000
-      targetPort: 8000
\ No newline at end of file
diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml
index 462240b42..ffaeadd1e 100644
--- a/pkg/manifests/ext_proc.yaml
+++ b/pkg/manifests/ext_proc.yaml
@@ -1,32 +1,65 @@
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read
+rules:
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["llmservices"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["llmserverpools"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "watch", "list"]
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read-binding
+subjects:
+- kind: ServiceAccount
+  name: default
+  namespace: default
+roleRef:
+  kind: ClusterRole
+  name: pod-read
+---
+
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: instance-gateway-ext-proc
+  name: inference-gateway-ext-proc
   namespace: default
   labels:
-    app: instance-gateway-ext-proc
+    app: inference-gateway-ext-proc
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: instance-gateway-ext-proc
+      app: inference-gateway-ext-proc
   template:
     metadata:
       labels:
-        app: instance-gateway-ext-proc
+        app: inference-gateway-ext-proc
     spec:
       containers:
-      - name: instance-gateway-ext-proc
+      - name: inference-gateway-ext-proc
         # TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/34) Update the image and args.
         image:
         args:
-        # TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/12) Remove this once ext proc can dynamically reconcile on LLMServerPool.
-        - -pods
-        - "vllm-78665f78c4-h4kx4,vllm-78665f78c4-hnz84"
-        - -podIPs
-        - "10.24.11.6:8000,10.24.5.7:8000"
+        - -serverPoolName
+        - "vllm-llama2-7b-pool"
+        - -v
+        - "3"
+        - -serviceName
+        - "vllm-llama2-7b-pool"
         ports:
         - containerPort: 9002
+
       - name: curl
         image: curlimages/curl
         command: ["sleep", "3600"]
@@ -34,11 +67,11 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: instance-gateway-ext-proc
+  name: inference-gateway-ext-proc
   namespace: default
 spec:
   selector:
-    app: instance-gateway-ext-proc
+    app: inference-gateway-ext-proc
   ports:
   - protocol: TCP
     port: 9002
@@ -55,12 +88,12 @@ spec:
   - backendRefs:
     - group: ""
       kind: Service
-      name: instance-gateway-ext-proc
+      name: inference-gateway-ext-proc
       port: 9002
     processingMode:
       request:
         body: Buffered
-      response:
+      response: {}
     # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
     # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
     messageTimeout: 1000s
diff --git a/pkg/manifests/gateway.yaml b/pkg/manifests/gateway.yaml
index 621d73a5b..32f5d484d 100644
--- a/pkg/manifests/gateway.yaml
+++ b/pkg/manifests/gateway.yaml
@@ -3,9 +3,9 @@
 apiVersion: gateway.networking.k8s.io/v1
 kind: Gateway
 metadata:
-  name:
+  name: inference-gateway
 spec:
-  gatewayClassName:
+  gatewayClassName: inference-gateway
   listeners:
   - name: http
     protocol: HTTP
@@ -17,7 +17,7 @@ spec:
 apiVersion: gateway.networking.k8s.io/v1
 kind: GatewayClass
 metadata:
-  name:
+  name: inference-gateway
 spec:
   controllerName: gateway.envoyproxy.io/gatewayclass-controller
 ---
@@ -38,7 +38,7 @@ metadata:
   name: llm-route
 spec:
   parentRefs:
-  - name:
+  - name: inference-gateway
     sectionName: llm-gw
   rules:
   - backendRefs:
diff --git a/pkg/manifests/patch_policy.yaml b/pkg/manifests/patch_policy.yaml
index 6003dc1bf..00cb9857f 100644
--- a/pkg/manifests/patch_policy.yaml
+++ b/pkg/manifests/patch_policy.yaml
@@ -7,7 +7,7 @@ spec:
   targetRef:
     group: gateway.networking.k8s.io
     kind: Gateway
-    name:
+    name: inference-gateway
   type: JSONPatch
   jsonPatches:
   # Necessary to create a cluster of the type: ORIGINAL_DST to allow for
@@ -36,7 +36,7 @@ spec:
               max_requests: 40000

   - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"
-    name: default//llm-gw
+    name: default/inference-gateway/llm-gw
     operation:
       op: replace
       path: "/virtual_hosts/0/routes/0/route/cluster"