forked from kubernetes-sigs/gateway-api-inference-extension
-
Notifications
You must be signed in to change notification settings - Fork 0
/
deployment.yaml
91 lines (90 loc) · 2.33 KB
/
deployment.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-server
  template:
    metadata:
      labels:
        app: llama-server
        ai.gke.io/model: LLaMA2_7B
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: model-garden
    spec:
      # Shared PID namespace so the LoRA sidecar can observe the vLLM process.
      shareProcessNamespace: true
      containers:
      - name: inference-server
        image: vllm/vllm-openai:v0.6.3.post1
        resources:
          requests:
            cpu: 5
            memory: 20Gi
            ephemeral-storage: 40Gi
            # fixed: no space before ':' (yamllint `colons`)
            nvidia.com/gpu: 1
          limits:
            cpu: 5
            memory: 20Gi
            ephemeral-storage: 40Gi
            nvidia.com/gpu: 1
        command: ["/bin/sh", "-c"]
        # BUG FIX: with `sh -c`, only the FIRST args entry is executed as the
        # script; any further entries become positional parameters ($0, $1, ...)
        # and are silently ignored. The original listed every flag as a separate
        # entry, so none of them ever reached vLLM (it started with defaults on
        # port 8000 only by coincidence). Fold the whole command line into a
        # single folded scalar so every flag is part of the one script string.
        # Also fixed `--enable-loras` -> `--enable-lora` (the actual vLLM flag),
        # which the dropped-args bug had been masking.
        args:
        - >-
          vllm serve meta-llama/Llama-2-7b-hf
          --host=0.0.0.0
          --port=8000
          --tensor-parallel-size=1
          --swap-space=16
          --gpu-memory-utilization=0.95
          --max-model-len=2048
          --max-num-batched-tokens=4096
          --disable-log-stats
          --enable-lora
          --max-loras=5
        env:
        - name: DEPLOY_SOURCE
          value: UI_NATIVE_MODEL
        - name: MODEL_ID
          value: "Llama2-7B"
        - name: AIP_STORAGE_URI
          value: "gs://vertex-model-garden-public-us/llama2/llama2-7b-hf"
        # Quoted: an unquoted true would be a YAML boolean; env values must be strings.
        - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
          value: "true"
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      initContainers:
      - name: configmap-reader-1
        image: us-docker.pkg.dev/kunjanp-gke-dev-2/lora-sidecar/sidecar:latest
        # restartPolicy: Always makes this init container a sidecar
        # (SidecarContainers feature, GA in Kubernetes 1.29+).
        restartPolicy: Always
        # BUG FIX: `env` must be a LIST of {name, value} objects. The original
        # used a bare mapping (DYNAMIC_LORA_ROLLOUT_CONFIG: "..."), which fails
        # Kubernetes schema validation and would reject the whole manifest.
        env:
        - name: DYNAMIC_LORA_ROLLOUT_CONFIG
          value: "/config/configmap.yaml"
        volumeMounts:
        - name: config-volume
          mountPath: /config/configmap.yaml
          subPath: configmap.yaml
      volumes:
      # tmpfs backing /dev/shm; vLLM uses shared memory for tensor transport.
      - name: dshm
        emptyDir:
          medium: Memory
      - name: config-volume
        configMap:
          name: dynamic-lora-config
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
        cloud.google.com/gke-nodepool: dynamic-lora
---
# ClusterIP Service exposing the vLLM OpenAI-compatible API inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: llama-service
spec:
  # Routes to pods created by the llama-deployment Deployment above
  # (matches the `app: llama-server` pod template label).
  selector:
    app: llama-server
  type: ClusterIP
  ports:
  - protocol: TCP
    # Service port and container port are both 8000 (vLLM's --port).
    port: 8000
    targetPort: 8000