Skip to content

Commit 54ee723

Browse files
committed
add HorizontalPodAutoscaler stub to catalog
1 parent 07dd507 commit 54ee723

File tree

16 files changed

+149
-0
lines changed

16 files changed

+149
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
apiVersion: autoscaling/v2
2+
kind: HorizontalPodAutoscaler
3+
metadata:
4+
name: singlehost-inference-hpa
5+
spec:
6+
scaleTargetRef:
7+
apiVersion: apps/v1
8+
kind: Deployment
9+
name: singlehost-inference-deployment-blueprint
10+
# TODO: also include stabilization windows, tolerance threshold, etc.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# kustomization.yaml
2+
apiVersion: kustomize.config.k8s.io/v1alpha1
3+
kind: Component
4+
5+
resources:
6+
- hpa.yaml
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
apiVersion: autoscaling/v2
2+
kind: HorizontalPodAutoscaler
3+
metadata:
4+
name: singlehost-inference-hpa
5+
spec:
6+
# TODO: apply the autoscaling best practices described at:
7+
# - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling-tpu
8+
# - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling
9+
minReplicas: 1
10+
maxReplicas: 10
11+
metrics:
12+
- type: Pods
13+
pods:
14+
metric:
15+
name: jetstream-token-latency-ms
16+
target:
17+
type: AverageValue
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# kustomization.yaml
2+
apiVersion: kustomize.config.k8s.io/v1alpha1
3+
kind: Component
4+
5+
components:
6+
- ../../base
7+
8+
patches:
9+
- path: hpa.patch.yaml
10+
target:
11+
kind: HorizontalPodAutoscaler
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
apiVersion: autoscaling/v2
2+
kind: HorizontalPodAutoscaler
3+
metadata:
4+
name: singlehost-inference-hpa
5+
spec:
6+
minReplicas: 1
7+
maxReplicas: 10
8+
metrics:
9+
- type: Pods
10+
pods:
11+
metric:
12+
name: vllm-token-latency-ms
13+
target:
14+
type: AverageValue
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# kustomization.yaml
2+
apiVersion: kustomize.config.k8s.io/v1alpha1
3+
kind: Component
4+
5+
components:
6+
- ../../base
7+
8+
patches:
9+
- path: hpa.patch.yaml
10+
target:
11+
kind: HorizontalPodAutoscaler
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
- op: add
2+
path: /metadata/name
3+
value: gemma-7b-it-jetstream-hpa
4+
- op: add
5+
path: /metadata/labels
6+
value:
7+
app: gemma-7b-it-jetstream-inference-server
8+
- op: add
9+
path: /spec/metrics/0/pods/target/averageValue
10+
value: 50

Diff for: serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ resources:
77

88
components:
99
- ../../../components/gke/resources/tpu/v5e-2x4
10+
# - ../../../components/hpa/jetstream/token-latency # HPA is a work-in-progress
1011

1112
patches:
1213
- path: deployment.patch.yaml
@@ -15,3 +16,8 @@ patches:
1516
- path: job.patch.yaml
1617
target:
1718
kind: Job
19+
- options:
20+
allowNameChange: true
21+
path: hpa.patch.yaml
22+
target:
23+
kind: HorizontalPodAutoscaler
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
- op: add
2+
path: /metadata/name
3+
value: llama3-8b-jetstream-hpa
4+
- op: add
5+
path: /metadata/labels
6+
value:
7+
app: llama3-8b-jetstream-inference-server
8+
- op: add
9+
path: /spec/metrics/0/pods/target/averageValue
10+
value: 50

Diff for: serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ resources:
77

88
components:
99
- ../../../components/gke/resources/tpu/v5e-2x4
10+
# - ../../../components/hpa/jetstream/token-latency # HPA is a work-in-progress
1011

1112
patches:
1213
- path: deployment.patch.yaml
@@ -15,3 +16,8 @@ patches:
1516
- path: job.patch.yaml
1617
target:
1718
kind: Job
19+
- options:
20+
allowNameChange: true
21+
path: hpa.patch.yaml
22+
target:
23+
kind: HorizontalPodAutoscaler
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
- op: add
2+
path: /metadata/name
3+
value: gemma-2b-vllm-hpa
4+
- op: add
5+
path: /metadata/labels
6+
value:
7+
app: gemma-2b-vllm-inference-server
8+
- op: add
9+
path: /spec/metrics/0/pods/target/averageValue
10+
value: 50

Diff for: serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,14 @@ resources:
77

88
components:
99
- ../../../components/gke/resources/gpu/1-L4
10+
# - ../../../components/hpa/vllm/token-latency # HPA is a work-in-progress
1011

1112
patches:
1213
- path: deployment.patch.yaml
1314
target:
1415
kind: Deployment
16+
- options:
17+
allowNameChange: true
18+
path: hpa.patch.yaml
19+
target:
20+
kind: HorizontalPodAutoscaler
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
- op: add
2+
path: /metadata/name
3+
value: llama3-70b-vllm-hpa
4+
- op: add
5+
path: /metadata/labels
6+
value:
7+
app: llama3-70b-vllm-inference-server
8+
- op: add
9+
path: /spec/metrics/0/pods/target/averageValue
10+
value: 50

Diff for: serving-catalog/core/deployment/vllm/llama3-70b/gke/kustomization.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,14 @@ resources:
77

88
components:
99
- ../../../components/gke/resources/gpu/8-L4
10+
# - ../../../components/hpa/vllm/token-latency # HPA is a work-in-progress
1011

1112
patches:
1213
- path: deployment.patch.yaml
1314
target:
1415
kind: Deployment
16+
- options:
17+
allowNameChange: true
18+
path: hpa.patch.yaml
19+
target:
20+
kind: HorizontalPodAutoscaler
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
- op: add
2+
path: /metadata/name
3+
value: llama3-8b-vllm-hpa
4+
- op: add
5+
path: /metadata/labels
6+
value:
7+
app: llama3-8b-vllm-inference-server
8+
- op: add
9+
path: /spec/metrics/0/pods/target/averageValue
10+
value: 50

Diff for: serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,14 @@ resources:
77

88
components:
99
- ../../../components/gke/resources/gpu/1-L4
10+
# - ../../../components/hpa/vllm/token-latency # HPA is a work-in-progress
1011

1112
patches:
1213
- path: deployment.patch.yaml
1314
target:
1415
kind: Deployment
16+
- options:
17+
allowNameChange: true
18+
path: hpa.patch.yaml
19+
target:
20+
kind: HorizontalPodAutoscaler

0 commit comments

Comments
 (0)