diff --git a/README.md b/README.md
index 892ab8a5d..2ff00581a 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ This project is [alpha (0.2 release)](https://github.com/kubernetes-sigs/gateway
 
 ## Getting Started
 
-Follow our [Getting Started Guide](./pkg/README.md) to get the inference-extension up and running on your cluster!
+Follow our [Getting Started Guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/) to get the inference-extension up and running on your cluster!
 
 See our website at https://gateway-api-inference-extension.sigs.k8s.io/ for detailed API documentation on leveraging our Kubernetes-native declarative APIs
 
diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml
index eaf05c753..5edb60011 100644
--- a/config/manifests/inferencemodel.yaml
+++ b/config/manifests/inferencemodel.yaml
@@ -1,7 +1,7 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferenceModel
 metadata:
-  name: tweet-summarizer
+  name: food-review
 spec:
   modelName: food-review
   criticality: Standard
diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml
index beb19bbd3..3386a7916 100644
--- a/config/manifests/vllm/gpu-deployment.yaml
+++ b/config/manifests/vllm/gpu-deployment.yaml
@@ -235,12 +235,12 @@ spec:
         emptyDir: {}
       - name: config-volume
         configMap:
-          name: vllm-llama3.1-8b-adapters
+          name: vllm-llama3-8b-adapters
 ---
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: vllm-llama3.1-8b-adapters
+  name: vllm-llama3-8b-adapters
 data:
   configmap.yaml: |
     vLLMLoRAConfig:
diff --git a/site-src/guides/adapter-rollout.md b/site-src/guides/adapter-rollout.md
index a398c1246..fdf62c3a0 100644
--- a/site-src/guides/adapter-rollout.md
+++ b/site-src/guides/adapter-rollout.md
@@ -37,9 +37,9 @@ Change the ConfigMap to match the following (note the new entry under models):
           ensureExist:
             models:
             - id: food-review-1
-              source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+              source: Kawon/llama3.1-food-finetune_v14_r8
             - id: food-review-2
-              source: mahimairaja/tweet-summarization-llama-2-finetuned
+              source: Kawon/llama3.1-food-finetune_v14_r8
 ```
 
 The new adapter version is applied to the model servers live, without requiring a restart.
@@ -121,11 +121,11 @@ Unload the older versions from the servers by updating the LoRA syncer ConfigMap
           ensureExist:
             models:
             - id: food-review-2
-              source: mahimairaja/tweet-summarization-llama-2-finetuned
+              source: Kawon/llama3.1-food-finetune_v14_r8
           ensureNotExist:
             models:
             - id: food-review-1
-              source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+              source: Kawon/llama3.1-food-finetune_v14_r8
 ```
 
 With this, all requests should be served by the new adapter version.
diff --git a/tools/dynamic-lora-sidecar/README.md b/tools/dynamic-lora-sidecar/README.md
index bebaa8854..4e85fd92a 100644
--- a/tools/dynamic-lora-sidecar/README.md
+++ b/tools/dynamic-lora-sidecar/README.md
@@ -77,50 +77,27 @@ The sidecar supports the following command-line arguments:
 
 ## Example Configuration
 
-Here's an example of using the `defaultBaseModel` field to avoid repetition in your configuration:
+In this example, both adapters will use `meta-llama/Llama-3.1-8B-Instruct` as their base model:
 
 ```yaml
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: vllm-llama2-7b-adapters
+  name: vllm-llama3-8b-adapters
 data:
   configmap.yaml: |
     vLLMLoRAConfig:
-      name: vllm-llama2-7b
+      name: vllm-llama3-8b
       port: 8000
-      defaultBaseModel: meta-llama/Llama-2-7b-hf
+      defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct
       ensureExist:
         models:
-        - id: tweet-summary-1
-          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-        - id: tweet-summary-2
-          source: mahimairaja/tweet-summarization-llama-2-finetuned
+        - id: food-review-1
+          source: Kawon/llama3.1-food-finetune_v14_r8
+        - id: food-review-2
+          source: Kawon/llama3.1-food-finetune_v14_r8
 ```
 
-In this example, both adapters will use `meta-llama/Llama-2-7b-hf` as their base model without needing to specify it for each adapter individually.
-
-You can still override the default base model for specific adapters when needed:
-
-```yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: vllm-mixed-adapters
-data:
-  configmap.yaml: |
-    vLLMLoRAConfig:
-      name: vllm-mixed
-      port: 8000
-      defaultBaseModel: meta-llama/Llama-2-7b-hf
-      ensureExist:
-        models:
-        - id: tweet-summary-1
-          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-        - id: code-assistant
-          source: huggingface/code-assistant-lora
-          base-model: meta-llama/Llama-2-13b-hf # Override for this specific adapter
-```
 ## Example Deployment
 
 The [deployment.yaml](deployment.yaml) file shows an example of deploying the sidecar with custom parameters: