
Commit 14e4b10

Dynamic LoRA load/unload sidecar
1 parent 18bc3a2 commit 14e4b10

File tree

7 files changed: +560 -0 lines changed
examples/dynamic-lora-sidecar/Dockerfile

+16

@@ -0,0 +1,16 @@

FROM python:3.10-slim-buster

WORKDIR /dynamic-lora-reconciler

RUN python3 -m venv /opt/venv

ENV PATH="/opt/venv/bin:$PATH"

RUN pip install --upgrade pip
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY sidecar/sidecar.py .

CMD ["python", "sidecar.py"]

examples/dynamic-lora-sidecar/README

+98
@@ -0,0 +1,98 @@
# Dynamic LoRA Adapter Sidecar for vLLM

This directory contains the script for a sidecar container that dynamically manages LoRA adapters for a vLLM server running in the same Kubernetes pod, reconciling the server against a ConfigMap that lists the desired LoRA adapters.

## Overview

The sidecar continuously monitors a ConfigMap mounted as a YAML configuration file. This file defines the desired state of LoRA adapters, including:

- **Adapter ID:** Unique identifier for the adapter.
- **Source:** Path to the adapter's source files.
- **Base Model:** The base model to which the adapter should be applied.
- **toRemove:** (Optional) Indicates whether the adapter should be unloaded.
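
For example, a single adapter entry in the mounted file follows the same schema as the `configmap.yaml` in this directory:

```yaml
- base-model: meta-llama/Llama-2-7b-hf
  id: sql-lora-v1
  source: yard1/llama-2-7b-sql-lora-test
  toRemove: false
```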

The sidecar uses the vLLM server's API to load or unload adapters based on the configuration. It also periodically reconciles the adapters registered on the vLLM server with the desired state defined in the ConfigMap, ensuring consistency.
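
vLLM exposes runtime load/unload endpoints when `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` is set on the vLLM container (as in the deployment below). A minimal sketch of the calls the sidecar issues, using illustrative helper names rather than the actual `sidecar.py` API:

```python
import requests

BASE_URL = "http://localhost:8000"  # vLLM server in the same pod

def load_adapter(adapter_id: str, source: str) -> None:
    """Register a LoRA adapter with the running vLLM server."""
    resp = requests.post(
        f"{BASE_URL}/v1/load_lora_adapter",
        json={"lora_name": adapter_id, "lora_path": source},
    )
    resp.raise_for_status()

def unload_adapter(adapter_id: str) -> None:
    """Remove a previously registered LoRA adapter."""
    resp = requests.post(
        f"{BASE_URL}/v1/unload_lora_adapter",
        json={"lora_name": adapter_id},
    )
    resp.raise_for_status()
```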

## Features

- **Dynamic Loading and Unloading:** Load and unload LoRA adapters without restarting the vLLM server.
- **Continuous Reconciliation:** Ensures the vLLM server's state matches the desired configuration.
- **ConfigMap Integration:** Leverages Kubernetes ConfigMaps for easy configuration management.
- **Easy Deployment:** Provides a sample deployment YAML for quick setup.

## Repository Contents

- **`sidecar.py`:** Python script for the sidecar container.
- **`Dockerfile`:** Dockerfile to build the sidecar image.
- **`configmap.yaml`:** Example ConfigMap YAML file.
- **`deployment.yaml`:** Example Kubernetes deployment YAML.

## Usage

1. **Build the Docker image:**

   ```bash
   docker build -t <your-image-name> .
   ```
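
   If the pod pulls the sidecar image from a registry (as the sample deployment does), tag and push it there as well; the registry path below is a placeholder:

   ```bash
   docker tag <your-image-name> <your-registry>/<your-image-name>:latest
   docker push <your-registry>/<your-image-name>:latest
   ```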
2. **Create a ConfigMap:**

   ```bash
   kubectl create configmap name-of-your-configmap --from-file=your-file.yaml
   ```
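
   To change the set of desired adapters later, the ConfigMap can be regenerated in place with a standard kubectl idiom; the sidecar picks up the new desired state on a subsequent reconcile:

   ```bash
   kubectl create configmap name-of-your-configmap --from-file=your-file.yaml \
     --dry-run=client -o yaml | kubectl apply -f -
   ```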
3. **Mount the ConfigMap and configure the sidecar in your pod:**

   ```yaml
   spec:
     shareProcessNamespace: true
     containers:
     - name: inference-server
       image: vllm/vllm-openai:v0.6.3.post1
       resources:
         requests:
           cpu: 5
           memory: 20Gi
           ephemeral-storage: 40Gi
           nvidia.com/gpu: 1
         limits:
           cpu: 5
           memory: 20Gi
           ephemeral-storage: 40Gi
           nvidia.com/gpu: 1
       command: ["/bin/sh", "-c"]
       args:
       - >-
         vllm serve meta-llama/Llama-2-7b-hf
         --host=0.0.0.0
         --port=8000
         --tensor-parallel-size=1
         --swap-space=16
         --gpu-memory-utilization=0.95
         --max-model-len=2048
         --max-num-batched-tokens=4096
         --disable-log-stats
         --enable-lora
         --max-loras=5
       env:
       - name: DEPLOY_SOURCE
         value: UI_NATIVE_MODEL
       - name: MODEL_ID
         value: "Llama2-7B"
       - name: AIP_STORAGE_URI
         value: "gs://vertex-model-garden-public-us/llama2/llama2-7b-hf"
       - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
         value: "true"
       volumeMounts:
       - mountPath: /dev/shm
         name: dshm
     initContainers:
     - name: configmap-reader-1
       image: us-docker.pkg.dev/kunjanp-gke-dev-2/lora-sidecar/sidecar:latest
       restartPolicy: Always
       env:
       - name: DYNAMIC_LORA_ROLLOUT_CONFIG
         value: "/config/configmap.yaml"
       volumeMounts:
       - name: config-volume
         mountPath: /config/configmap.yaml
         subPath: configmap.yaml
     volumes:
     - name: dshm
       emptyDir:
         medium: Memory
     - name: config-volume
       configMap:
         name: dynamic-lora-config
   ```
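
Once the pod is running, you can verify which adapters are registered by querying vLLM's OpenAI-compatible model list; loaded LoRA adapters appear alongside the base model (the pod name below is a placeholder):

```bash
kubectl port-forward pod/<your-pod-name> 8000:8000 &
curl http://localhost:8000/v1/models
```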
examples/dynamic-lora-sidecar/deployment.yaml

+91
@@ -0,0 +1,91 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-server
  template:
    metadata:
      labels:
        app: llama-server
        ai.gke.io/model: LLaMA2_7B
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: model-garden
    spec:
      shareProcessNamespace: true
      containers:
      - name: inference-server
        image: vllm/vllm-openai:v0.6.3.post1
        resources:
          requests:
            cpu: 5
            memory: 20Gi
            ephemeral-storage: 40Gi
            nvidia.com/gpu: 1
          limits:
            cpu: 5
            memory: 20Gi
            ephemeral-storage: 40Gi
            nvidia.com/gpu: 1
        command: ["/bin/sh", "-c"]
        args:
        - >-
          vllm serve meta-llama/Llama-2-7b-hf
          --host=0.0.0.0
          --port=8000
          --tensor-parallel-size=1
          --swap-space=16
          --gpu-memory-utilization=0.95
          --max-model-len=2048
          --max-num-batched-tokens=4096
          --disable-log-stats
          --enable-lora
          --max-loras=5
        env:
        - name: DEPLOY_SOURCE
          value: UI_NATIVE_MODEL
        - name: MODEL_ID
          value: "Llama2-7B"
        - name: AIP_STORAGE_URI
          value: "gs://vertex-model-garden-public-us/llama2/llama2-7b-hf"
        - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
          value: "true"
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      initContainers:
      - name: configmap-reader-1
        image: us-docker.pkg.dev/kunjanp-gke-dev-2/lora-sidecar/sidecar:latest
        restartPolicy: Always
        env:
        - name: DYNAMIC_LORA_ROLLOUT_CONFIG
          value: "/config/configmap.yaml"
        volumeMounts:
        - name: config-volume
          mountPath: /config/configmap.yaml
          subPath: configmap.yaml
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: config-volume
        configMap:
          name: dynamic-lora-config
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
        cloud.google.com/gke-nodepool: dynamic-lora

---
apiVersion: v1
kind: Service
metadata:
  name: llama-service
spec:
  selector:
    app: llama-server
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
examples/dynamic-lora-sidecar/requirements.txt

+4
@@ -0,0 +1,4 @@
aiohttp==3.10.10
pyyaml==6.0.2
requests==2.32.3
watchdog==5.0.3
examples/dynamic-lora-sidecar/configmap.yaml

+23
@@ -0,0 +1,23 @@
deployment:
  host: localhost
  models:
  - base-model: meta-llama/Llama-2-7b-hf
    id: sql-lora-v1
    source: yard1/llama-2-7b-sql-lora-test
    status:
      errors:
      - ''
      operation: load
      timestamp: 2024-10-23 15:43:07 UTC+0000
    toRemove: false
  - base-model: meta-llama/Llama-2-7b-hf
    id: sql-lora-v2
    source: yard1/llama-2-7b-sql-lora-test
    status:
      errors:
      - already unloaded
      operation: unload
      timestamp: 2024-10-23 15:43:07 UTC+0000
    toRemove: true
  name: sql-loras-llama
  port: '8000'
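
A rough sketch of the reconcile pass this schema implies: read the file with PyYAML and route each entry to a load or unload call. `load_adapter` and `unload_adapter` are the hypothetical helpers from the README sketch above, not the actual `sidecar.py` functions:

```python
import os
import yaml

# Path injected via the DYNAMIC_LORA_ROLLOUT_CONFIG env var in the deployment.
CONFIG_PATH = os.environ.get("DYNAMIC_LORA_ROLLOUT_CONFIG", "/config/configmap.yaml")

def reconcile() -> None:
    """One reconcile pass: load desired adapters, unload those marked toRemove."""
    with open(CONFIG_PATH) as f:
        config = yaml.safe_load(f)
    for model in config["deployment"]["models"]:
        if model.get("toRemove"):
            unload_adapter(model["id"])
        else:
            load_adapter(model["id"], model["source"])
```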
