
Updating Llama 2 7B to Llama 3.1 8B Instruct and adding new LoRA adapters #578

Merged 1 commit on Mar 27, 2025

14 changes: 7 additions & 7 deletions config/charts/inferencepool/README.md
@@ -5,12 +5,12 @@ A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) depl

## Install

-To install an InferencePool named `vllm-llama2-7b` that selects from endpoints with label `app: vllm-llama2-7b` and listening on port `8000`, you can run the following command:
+To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with the label `app: vllm-llama3-8b-instruct` listening on port `8000`, you can run the following command:

```txt
-$ helm install vllm-llama2-7b ./config/charts/inferencepool \
-  --set inferencePool.name=vllm-llama2-7b \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama2-7b \
+$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool \
+  --set inferencePool.name=vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
--set inferencePool.targetPortNumber=8000
```

@@ -19,9 +19,9 @@ where `inferencePool.targetPortNumber` is the port that the vLLM backends serve on
To install via the latest published chart in staging (--version v0 indicates latest dev version), you can run the following command:

```txt
-$ helm install vllm-llama2-7b \
-  --set inferencePool.name=vllm-llama2-7b \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama2-7b \
+$ helm install vllm-llama3-8b-instruct \
+  --set inferencePool.name=vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
--set inferencePool.targetPortNumber=8000 \
oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
```
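
After installing, a quick way to confirm the release and the resulting resource is a standard `helm`/`kubectl` check (a suggested verification step, not part of the chart docs):

```txt
$ helm status vllm-llama3-8b-instruct
$ kubectl get inferencepool vllm-llama3-8b-instruct -o yaml
```
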
2 changes: 1 addition & 1 deletion config/charts/inferencepool/values.yaml
@@ -12,4 +12,4 @@ inferencePool:
targetPortNumber: 8000
# modelServers: # REQUIRED
# matchLabels:
-#     app: vllm-llama2-7b
+#     app: vllm-llama3-8b-instruct
4 changes: 2 additions & 2 deletions config/manifests/benchmark/benchmark.yaml
@@ -31,9 +31,9 @@ spec:
- name: BENCHMARK_TIME_SECONDS
value: '60'
- name: TOKENIZER
-          value: 'meta-llama/Llama-2-7b-hf'
+          value: 'meta-llama/Llama-3.1-8B-Instruct'
- name: MODELS
-          value: 'meta-llama/Llama-2-7b-hf'
+          value: 'meta-llama/Llama-3.1-8B-Instruct'
- name: BACKEND
value: vllm
- name: PORT
2 changes: 1 addition & 1 deletion config/manifests/gateway/patch_policy.yaml
@@ -99,7 +99,7 @@ spec:
- backendRefs:
- group: ""
kind: Service
-          name: vllm-llama2-7b-epp
+          name: vllm-llama3-8b-instruct-epp
port: 9002
processingMode:
allowModeOverride: true
14 changes: 7 additions & 7 deletions config/manifests/inferencemodel.yaml
@@ -3,12 +3,12 @@ kind: InferenceModel
metadata:
name: inferencemodel-sample
spec:
-  modelName: tweet-summary
-  criticality: Critical
+  modelName: food-review
+  criticality: Standard
poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
targetModels:
-  - name: tweet-summary-1
+  - name: food-review-1
weight: 100

---
@@ -17,10 +17,10 @@ kind: InferenceModel
metadata:
name: inferencemodel-base-model
spec:
-  modelName: meta-llama/Llama-2-7b-hf
+  modelName: meta-llama/Llama-3.1-8B-Instruct
criticality: Critical
poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct

---
apiVersion: inference.networking.x-k8s.io/v1alpha2
@@ -31,4 +31,4 @@ spec:
modelName: Qwen/Qwen2.5-1.5B-Instruct
criticality: Critical
poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
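
Taken together, these InferenceModel objects map client-facing model names onto the pool: a request for `food-review` is routed to the target model `food-review-1` (weight 100), while the base-model and Qwen names pass through unchanged. A minimal request against the gateway would look like the following, where `$IP` and `$PORT` are placeholders for the gateway address, following the convention used in `hack/test-e2e.sh` later in this diff:

```txt
$ curl -i "$IP:$PORT/v1/completions" \
    -H 'Content-Type: application/json' \
    -d '{"model": "food-review", "prompt": "Write as if you were a critic: San Francisco", "max_tokens": 100, "temperature": 0}'
```
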
20 changes: 10 additions & 10 deletions config/manifests/inferencepool.yaml
@@ -2,22 +2,22 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
labels:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
spec:
targetPortNumber: 8000
selector:
-    app: vllm-llama2-7b
+    app: vllm-llama3-8b-instruct
extensionRef:
-    name: vllm-llama2-7b-epp
+    name: vllm-llama3-8b-instruct-epp
---
apiVersion: v1
kind: Service
metadata:
-  name: vllm-llama2-7b-epp
+  name: vllm-llama3-8b-instruct-epp
namespace: default
spec:
selector:
-    app: vllm-llama2-7b-epp
+    app: vllm-llama3-8b-instruct-epp
ports:
- protocol: TCP
port: 9002
@@ -27,27 +27,27 @@ spec:
apiVersion: apps/v1
kind: Deployment
metadata:
-  name: vllm-llama2-7b-epp
+  name: vllm-llama3-8b-instruct-epp
namespace: default
labels:
-    app: vllm-llama2-7b-epp
+    app: vllm-llama3-8b-instruct-epp
spec:
replicas: 1
selector:
matchLabels:
-      app: vllm-llama2-7b-epp
+      app: vllm-llama3-8b-instruct-epp
template:
metadata:
labels:
-        app: vllm-llama2-7b-epp
+        app: vllm-llama3-8b-instruct-epp
spec:
containers:
- name: epp
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
imagePullPolicy: Always
args:
- -poolName
-        - "vllm-llama2-7b"
+        - "vllm-llama3-8b-instruct"
- -v
- "4"
- --zap-encoder
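
Once these manifests are applied, the EndpointPicker wiring can be sanity-checked by confirming that the epp Service and Deployment exist and that the `-poolName` argument matches the InferencePool name (an illustrative check using standard tooling, not part of the manifests):

```txt
$ kubectl get inferencepool vllm-llama3-8b-instruct
$ kubectl get svc,deploy vllm-llama3-8b-instruct-epp -n default
```
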
14 changes: 7 additions & 7 deletions config/manifests/vllm/cpu-deployment.yaml
@@ -1,16 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
spec:
replicas: 3
selector:
matchLabels:
-      app: vllm-llama2-7b
+      app: vllm-llama3-8b-instruct
template:
metadata:
labels:
-        app: vllm-llama2-7b
+        app: vllm-llama3-8b-instruct
spec:
containers:
- name: lora
@@ -26,8 +26,8 @@ spec:
- "--max-loras"
- "4"
- "--lora-modules"
-        - '{"name": "tweet-summary-0", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
-        - '{"name": "tweet-summary-1", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
+        - '{"name": "food-review-0", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
+        - '{"name": "food-review-1", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
env:
- name: PORT
value: "8000"
@@ -108,10 +108,10 @@ metadata:
data:
configmap.yaml: |
vLLMLoRAConfig:
-      name: vllm-llama2-7b
+      name: vllm-llama3-8b-instruct
port: 8000
ensureExist:
models:
- base-model: Qwen/Qwen2.5-1.5B
-          id: tweet-summary-1
+          id: food-review-1
source: SriSanth2345/Qwen-1.5B-Tweet-Generations
31 changes: 15 additions & 16 deletions config/manifests/vllm/gpu-deployment.yaml
@@ -1,37 +1,34 @@
apiVersion: apps/v1
kind: Deployment
metadata:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
spec:
replicas: 3
selector:
matchLabels:
-      app: vllm-llama2-7b
+      app: vllm-llama3-8b-instruct
template:
metadata:
labels:
-        app: vllm-llama2-7b
+        app: vllm-llama3-8b-instruct
spec:
containers:
-      - name: lora
+      - name: vllm
image: "vllm/vllm-openai:latest"
imagePullPolicy: Always
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
args:
- "--model"
-        - "meta-llama/Llama-2-7b-hf"
+        - "meta-llama/Llama-3.1-8B-Instruct"
- "--tensor-parallel-size"
- "1"
- "--port"
- "8000"
- "--enable-lora"
- "--max-loras"
-        - "4"
+        - "2"
        - "--max-cpu-loras"
        - "12"
-        - "--lora-modules"
-        - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
env:
# Enabling LoRA support temporarily disables automatic v1, we want to force it on
# until 0.8.3 vLLM is released.
@@ -238,20 +235,22 @@ spec:
emptyDir: {}
- name: config-volume
configMap:
-        name: vllm-llama2-7b-adapters
+        name: vllm-llama3.1-8b-adapters
---
apiVersion: v1
kind: ConfigMap
metadata:
-  name: vllm-llama2-7b-adapters
+  name: vllm-llama3.1-8b-adapters
data:
configmap.yaml: |
vLLMLoRAConfig:
-      name: vllm-llama2-7b
+      name: vllm-llama3.1-8b-instruct
port: 8000
ensureExist:
models:
-        - base-model: meta-llama/Llama-2-7b-hf
-          id: tweet-summary-1
-          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
+          id: food-review
+          source: Kawon/llama3.1-food-finetune_v14_r8
+        - base-model: meta-llama/Llama-3.1-8B-Instruct
+          id: cad-fabricator
+          source: redcathode/fabricator
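
The `ensureExist` list declares adapters that the syncer should register on the running server, presumably via vLLM's runtime LoRA endpoints. Assuming runtime LoRA updates are enabled on the deployment (`VLLM_ALLOW_RUNTIME_LORA_UPDATING=True`, an assumption not shown in this diff), the equivalent manual call would look roughly like:

```txt
# Forward the vLLM port locally, then ask the server to load one adapter.
$ kubectl port-forward deploy/vllm-llama3-8b-instruct 8000:8000 &
$ curl -X POST localhost:8000/v1/load_lora_adapter \
    -H 'Content-Type: application/json' \
    -d '{"lora_name": "food-review", "lora_path": "Kawon/llama3.1-food-finetune_v14_r8"}'
```
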
4 changes: 2 additions & 2 deletions hack/test-e2e.sh
@@ -124,14 +124,14 @@ if [[ "$CURL_POD" == "true" ]]; then
while [ $SECONDS -lt $end ]; do
kubectl exec po/curl -- curl -i "$IP:$PORT/v1/completions" \
-H 'Content-Type: application/json' \
-      -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
+      -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
sleep 5
done
else
while [ $SECONDS -lt $end ]; do
curl -i "$IP:$PORT/v1/completions" \
-H 'Content-Type: application/json' \
-      -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
+      -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
sleep 5
done
fi
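
For ad-hoc runs outside the harness, `$IP` and `$PORT` can be resolved from the Gateway status in the usual Gateway API way (the gateway name and listener port below are illustrative assumptions; substitute whatever the e2e environment actually creates):

```txt
$ IP=$(kubectl get gateway inference-gateway -o jsonpath='{.status.addresses[0].value}')
$ PORT=80
$ curl -i "$IP:$PORT/v1/completions" \
    -H 'Content-Type: application/json' \
    -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
```
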
8 changes: 4 additions & 4 deletions pkg/epp/datastore/datastore_test.go
@@ -97,7 +97,7 @@ func TestPool(t *testing.T) {

func TestModel(t *testing.T) {
chatModel := "chat"
-	tsModel := "tweet-summary"
+	tsModel := "food-review"
model1ts := testutil.MakeInferenceModel("model1").
CreationTimestamp(metav1.Unix(1000, 0)).
ModelName(tsModel).ObjRef()
@@ -126,7 +126,7 @@ func TestModel(t *testing.T) {
wantModels []*v1alpha2.InferenceModel
}{
{
-			name: "Add model1 with tweet-summary as modelName",
+			name: "Add model1 with food-review as modelName",
op: func(ds Datastore) bool {
return ds.ModelSetIfOlder(model1ts)
},
@@ -161,7 +161,7 @@ func TestModel(t *testing.T) {
wantModels: []*v1alpha2.InferenceModel{model2ts},
},
{
-			name: "Set model1 with the tweet-summary modelName, both models should exist",
+			name: "Set model1 with the food-review modelName, both models should exist",
existingModels: []*v1alpha2.InferenceModel{model2chat},
op: func(ds Datastore) bool {
return ds.ModelSetIfOlder(model1ts)
@@ -170,7 +170,7 @@ func TestModel(t *testing.T) {
wantModels: []*v1alpha2.InferenceModel{model2chat, model1ts},
},
{
-			name: "Set model1 with the tweet-summary modelName, both models should exist",
+			name: "Set model1 with the food-review modelName, both models should exist",
existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts},
op: func(ds Datastore) bool {
return ds.ModelSetIfOlder(model1ts)
4 changes: 2 additions & 2 deletions pkg/epp/handlers/response.go
@@ -127,7 +127,7 @@ func (s *Server) HandleResponseHeaders(
	"id": "cmpl-573498d260f2423f9e42817bbba3743a",
	"object": "text_completion",
	"created": 1732563765,
-	"model": "meta-llama/Llama-2-7b-hf",
+	"model": "meta-llama/Llama-3.1-8B-Instruct",
"choices": [
{
"index": 0,
@@ -217,7 +217,7 @@ func (s *Server) HandleStreaming(
}

// Example message if "stream_options": {"include_usage": "true"} is included in the request:
-	// data: {"id":"...","object":"text_completion","created":1739400043,"model":"tweet-summary-0","choices":[],
+	// data: {"id":"...","object":"text_completion","created":1739400043,"model":"food-review-0","choices":[],
// "usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
//
// data: [DONE]
6 changes: 3 additions & 3 deletions pkg/epp/handlers/response_test.go
@@ -31,7 +31,7 @@ const (
	"id": "cmpl-573498d260f2423f9e42817bbba3743a",
	"object": "text_completion",
	"created": 1732563765,
-	"model": "meta-llama/Llama-2-7b-hf",
+	"model": "meta-llama/Llama-3.1-8B-Instruct",
"choices": [
{
"index": 0,
@@ -50,10 +50,10 @@ const (
}
`

-	streamingBodyWithoutUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"tweet-summary-0","choices":[],"usage":null}
+	streamingBodyWithoutUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":null}
`

-	streamingBodyWithUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"tweet-summary-0","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+	streamingBodyWithUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
data: [DONE]
`
)