Skip to content

Commit c24ff35

Browse files
committed
Move under tools
Signed-off-by: Kunjan <[email protected]>
1 parent 100f636 commit c24ff35

13 files changed

+143
-0
lines changed
+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
---
# Dynamic LoRA rollout config consumed by the lora-adapter-syncer sidecar:
# `ensureExist` lists adapters the syncer must load into the vLLM server,
# `ensureNotExist` lists adapters it must unload if present.
vLLMLoRAConfig:
  name: sql-loras-llama
  ensureExist:
    models:
      - base-model: meta-llama/Llama-2-7b-hf
        id: sql-lora-v8
        source: yard1/llama-2-7b-sql-lora-test
      - base-model: meta-llama/Llama-2-7b-hf
        id: sql-lora-v12
        source: yard1/llama-2-7b-sql-lora-test
  ensureNotExist:
    models:
      - base-model: meta-llama/Llama-2-7b-hf
        id: sql-lora-v2
        source: yard1/llama-2-7b-sql-lora-test
      - base-model: meta-llama/Llama-2-7b-hf
        id: sql-lora-v7
        source: yard1/llama-2-7b-sql-lora-test
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-server
  template:
    metadata:
      labels:
        app: llama-server
        ai.gke.io/model: LLaMA2_7B
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: model-garden
    spec:
      shareProcessNamespace: true
      containers:
        - name: inference-server
          image: vllm/vllm-openai:v0.6.3.post1
          resources:
            requests:
              cpu: 5
              memory: 20Gi
              ephemeral-storage: 40Gi
              nvidia.com/gpu: 1
            limits:
              cpu: 5
              memory: 20Gi
              ephemeral-storage: 40Gi
              nvidia.com/gpu: 1
          command: ["/bin/sh", "-c"]
          # With `sh -c`, only the FIRST args entry is executed as the script;
          # any additional list items become positional parameters ($0, $1, ...)
          # and are silently ignored. The whole server command line must
          # therefore be ONE string — the original listed each flag as a
          # separate args item, so none of the flags ever reached `vllm serve`.
          # Also fixed: vLLM's flag is `--enable-lora` (singular), not
          # `--enable-loras`; `--max-loras` caps adapters per batch.
          args:
            - >-
              vllm serve meta-llama/Llama-2-7b-hf
              --host=0.0.0.0
              --port=8000
              --tensor-parallel-size=1
              --swap-space=16
              --gpu-memory-utilization=0.95
              --max-model-len=2048
              --max-num-batched-tokens=4096
              --disable-log-stats
              --enable-lora
              --max-loras=5
          env:
            - name: DEPLOY_SOURCE
              value: UI_NATIVE_MODEL
            - name: MODEL_ID
              value: "Llama2-7B"
            - name: AIP_STORAGE_URI
              value: "gs://vertex-model-garden-public-us/llama2/llama2-7b-hf"
            # Must be the string "true" for vLLM to accept runtime
            # /v1/load_lora_adapter requests from the syncer sidecar.
            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
              value: "true"
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token  # The name of your Kubernetes Secret
                  key: token  # The specific key within the Secret
            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
              value: "/config/configmap.yaml"
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
      initContainers:
        # Native sidecar pattern: an init container with restartPolicy: Always
        # (requires Kubernetes >= 1.28) that runs alongside the main container
        # and syncs LoRA adapters per the mounted ConfigMap.
        - name: lora-adapter-syncer
          tty: true
          stdin: true
          image: us-docker.pkg.dev/kunjanp-gke-dev-2/lora-sidecar/sidecar:latest
          restartPolicy: Always
          imagePullPolicy: Always
          env:
            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
              value: "/config/configmap.yaml"
          volumeMounts: # DO NOT USE subPath
            - name: config-volume
              mountPath: /config
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: config-volume
          configMap:
            name: dynamic-lora-config
---
apiVersion: v1
kind: Service
metadata:
  name: llama-service
spec:
  selector:
    app: llama-server
  type: ClusterIP
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: dynamic-lora-config
data:
  # Embedded rollout config read by the lora-adapter-syncer via the
  # DYNAMIC_LORA_ROLLOUT_CONFIG path (/config/configmap.yaml).
  configmap.yaml: |
    vLLMLoRAConfig:
      name: sql-loras-llama
      ensureExist:
        models:
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v1
            source: yard1/llama-2-7b-sql-lora-test
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v3
            source: yard1/llama-2-7b-sql-lora-test
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v4
            source: yard1/llama-2-7b-sql-lora-test
      ensureNotExist:
        models:
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v2
            source: yard1/llama-2-7b-sql-lora-test

0 commit comments

Comments
 (0)