Commit 7ef0ab1

1 parent 77f8564 commit 7ef0ab1

File tree: 8 files changed, +918 −19 lines changed

Lines changed: 237 additions & 0 deletions
@@ -0,0 +1,237 @@
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-0
spec:
  modelName: adapter-0
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-0
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-1
spec:
  modelName: adapter-1
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-1
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-2
spec:
  modelName: adapter-2
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-2
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-3
spec:
  modelName: adapter-3
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-3
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-4
spec:
  modelName: adapter-4
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-4
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-5
spec:
  modelName: adapter-5
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-5
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-6
spec:
  modelName: adapter-6
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-6
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-7
spec:
  modelName: adapter-7
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-7
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-8
spec:
  modelName: adapter-8
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-8
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-9
spec:
  modelName: adapter-9
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-9
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-10
spec:
  modelName: adapter-10
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-10
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-11
spec:
  modelName: adapter-11
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-11
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-12
spec:
  modelName: adapter-12
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-12
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-13
spec:
  modelName: adapter-13
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-13
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-14
spec:
  modelName: adapter-14
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-14
    weight: 100
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: base-model
spec:
  modelName: meta-llama/Llama-3.1-8B-Instruct
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
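
Together, these manifests register fifteen LoRA adapters (adapter-0 through adapter-14) plus the base model as Critical InferenceModel resources against the vllm-llama3-8b-instruct pool; each adapter routes 100% of its traffic to the target model of the same name. Assuming a cluster where the inference.networking.x-k8s.io/v1alpha2 CRDs are installed, the file applies as usual with kubectl apply -f.

The per-target weight only becomes interesting once a model lists more than one target. As a minimal sketch (the adapter-canary, adapter-stable, and adapter-new names are hypothetical, assuming the CRD's weighted traffic-splitting semantics), a 90/10 canary split would look like:

apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: adapter-canary        # hypothetical model name
spec:
  modelName: adapter-canary
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
  targetModels:
  - name: adapter-stable      # hypothetical adapter, receives ~90% of requests
    weight: 90
  - name: adapter-new         # hypothetical adapter, receives ~10% of requests
    weight: 10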
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: benchmark-tool
  name: benchmark-tool
spec:
  replicas: 1
  selector:
    matchLabels:
      app: benchmark-tool
  template:
    metadata:
      labels:
        app: benchmark-tool
    spec:
      containers:
      # Build image from this source: https://github.com/AI-Hypercomputer/inference-benchmark/tree/46d638262650a1928e47699d78ab2da062d4422d
      - image: '<DOCKER_IMAGE>'
        imagePullPolicy: Always
        name: benchmark-tool
        command:
        - bash
        - -c
        - ./latency_throughput_curve.sh
        env:
        - name: IP
          value: '<target-ip>'
        - name: REQUEST_RATES
          value: '20,40,60,80,100,120,140,160,180,200'
        - name: BENCHMARK_TIME_SECONDS
          value: '300'
        - name: TOKENIZER
          value: 'meta-llama/Llama-3.1-8B-Instruct'
        - name: MODELS
          value: 'adapter-0,adapter-1,adapter-2,adapter-3,adapter-4,adapter-5,adapter-6,adapter-7,adapter-8,adapter-9,adapter-10,adapter-11,adapter-12,adapter-13,adapter-14'
        - name: TRAFFIC_SPLIT
          value: '0.12,0.12,0.12,0.12,0.12,0.06,0.06,0.06,0.06,0.06,0.02,0.02,0.02,0.02,0.02'
        - name: BACKEND
          value: vllm
        - name: PORT
          value: '80'
        - name: INPUT_LENGTH
          value: '1024'
        - name: OUTPUT_LENGTH
          value: '1024'
        - name: FILE_PREFIX
          value: benchmark
        - name: PROMPT_DATASET_FILE
          value: Infinity-Instruct_conversations.json
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token
              key: token
        resources:
          limits:
            cpu: '2'
            memory: 20Gi
          requests:
            cpu: '2'
            memory: 20Gi
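
Here <DOCKER_IMAGE> and <target-ip> are placeholders for the image built from the linked inference-benchmark source and the address of the serving endpoint. Note that TRAFFIC_SPLIT carries one weight per entry in MODELS, and the fifteen weights sum to 1.0: the first five adapters receive 0.12 each (0.60 total), the next five 0.06 each (0.30), and the last five 0.02 each (0.10).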
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: benchmark-tool
  name: benchmark-tool
spec:
  replicas: 1
  selector:
    matchLabels:
      app: benchmark-tool
  template:
    metadata:
      labels:
        app: benchmark-tool
    spec:
      containers:
      # Build image from this source: https://github.com/AI-Hypercomputer/inference-benchmark/tree/46d638262650a1928e47699d78ab2da062d4422d
      - image: '<DOCKER_IMAGE>'
        imagePullPolicy: Always
        name: benchmark-tool
        command:
        - bash
        - -c
        - ./latency_throughput_curve.sh
        env:
        - name: IP
          value: '<target-ip>'
        - name: REQUEST_RATES
          value: '300,310,320,330,340,350'
        - name: BENCHMARK_TIME_SECONDS
          value: '300'
        - name: TOKENIZER
          value: 'meta-llama/Llama-3.1-8B-Instruct'
        - name: MODELS
          value: 'meta-llama/Llama-3.1-8B-Instruct'
        - name: BACKEND
          value: vllm
        - name: PORT
          value: '80'
        - name: INPUT_LENGTH
          value: '1024'
        - name: OUTPUT_LENGTH
          value: '1024'
        - name: FILE_PREFIX
          value: benchmark
        - name: PROMPT_DATASET_FILE
          value: billsum_conversations.json
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token
              key: token
        resources:
          limits:
            cpu: '2'
            memory: 20Gi
          requests:
            cpu: '2'
            memory: 20Gi
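
This second variant benchmarks the base model directly rather than the adapters: MODELS names only meta-llama/Llama-3.1-8B-Instruct, TRAFFIC_SPLIT is omitted, REQUEST_RATES sweeps the higher 300-350 range, and prompts come from billsum_conversations.json; the container image, resource requests, and HF token wiring are identical to the adapter benchmark above.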
