diff --git a/config/manifests/regression-testing/inferencemodel.yaml b/config/manifests/regression-testing/inferencemodel.yaml new file mode 100644 index 000000000..d8eada95a --- /dev/null +++ b/config/manifests/regression-testing/inferencemodel.yaml @@ -0,0 +1,237 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-0 +spec: + modelName: adapter-0 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-0 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-1 +spec: + modelName: adapter-1 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-1 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-2 +spec: + modelName: adapter-2 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-2 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-3 +spec: + modelName: adapter-3 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-3 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-4 +spec: + modelName: adapter-4 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-4 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-5 +spec: + modelName: adapter-5 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-5 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-6 +spec: + modelName: adapter-6 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-6 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-7 +spec: + modelName: adapter-7 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-7 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-8 +spec: + modelName: adapter-8 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-8 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-9 +spec: + modelName: adapter-9 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-9 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-10 +spec: + modelName: adapter-10 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-10 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-11 +spec: + modelName: adapter-11 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-11 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-12 +spec: + modelName: adapter-12 + criticality: Critical + poolRef: + 
name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-12 + weight: 100 + + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-13 +spec: + modelName: adapter-13 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-13 + weight: 100 + + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-14 +spec: + modelName: adapter-14 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-14 + weight: 100 + +--- + + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: base-model +spec: + modelName: meta-llama/Llama-3.1-8B-Instruct + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct \ No newline at end of file diff --git a/config/manifests/regression-testing/multi-lora-regression.yaml b/config/manifests/regression-testing/multi-lora-regression.yaml new file mode 100644 index 000000000..00b5d7d50 --- /dev/null +++ b/config/manifests/regression-testing/multi-lora-regression.yaml @@ -0,0 +1,62 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: benchmark-tool + name: benchmark-tool +spec: + replicas: 1 + selector: + matchLabels: + app: benchmark-tool + template: + metadata: + labels: + app: benchmark-tool + spec: + containers: + # Build image from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/46d638262650a1928e47699d78ab2da062d4422d + - image: '' + imagePullPolicy: Always + name: benchmark-tool + command: + - bash + - -c + - ./latency_throughput_curve.sh + env: + - name: IP + value: '' + - name: REQUEST_RATES + value: '20,40,60,80,100,120,140,160,180,200' + - name: BENCHMARK_TIME_SECONDS + value: '300' + - name: TOKENIZER + value: 'meta-llama/Llama-3.1-8B-Instruct' + - name: MODELS + value: 'adapter-0,adapter-1,adapter-2,adapter-3,adapter-4,adapter-5,adapter-6,adapter-7,adapter-8,adapter-9,adapter-10,adapter-11,adapter-12,adapter-13,adapter-14' + - name: TRAFFIC_SPLIT + value: '0.12,0.12,0.12,0.12,0.12,0.06,0.06,0.06,0.06,0.06,0.02,0.02,0.02,0.02,0.02' + - name: BACKEND + value: vllm + - name: PORT + value: "80" + - name: INPUT_LENGTH + value: "1024" + - name: OUTPUT_LENGTH + value: '1024' + - name: FILE_PREFIX + value: benchmark + - name: PROMPT_DATASET_FILE + value: Infinity-Instruct_conversations.json + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: token + name: hf-token + resources: + limits: + cpu: "2" + memory: 20Gi + requests: + cpu: "2" + memory: 20Gi \ No newline at end of file diff --git a/config/manifests/regression-testing/single-workload-regression.yaml b/config/manifests/regression-testing/single-workload-regression.yaml new file mode 100644 index 000000000..b13b7eed8 --- /dev/null +++ b/config/manifests/regression-testing/single-workload-regression.yaml @@ -0,0 +1,60 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: benchmark-tool + name: benchmark-tool +spec: + replicas: 1 + selector: + matchLabels: + app: benchmark-tool + template: + metadata: + labels: + app: benchmark-tool + spec: + containers: + # Build image from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/46d638262650a1928e47699d78ab2da062d4422d + - image: '' + imagePullPolicy: Always + name: benchmark-tool + command: + - bash + - -c + - ./latency_throughput_curve.sh + env: + - name: IP + value: '' + - name: REQUEST_RATES + value: 
'300,310,320,330,340,350' + - name: BENCHMARK_TIME_SECONDS + value: '300' + - name: TOKENIZER + value: 'meta-llama/Llama-3.1-8B-Instruct' + - name: MODELS + value: 'meta-llama/Llama-3.1-8B-Instruct' + - name: BACKEND + value: vllm + - name: PORT + value: "80" + - name: INPUT_LENGTH + value: "1024" + - name: OUTPUT_LENGTH + value: '1024' + - name: FILE_PREFIX + value: benchmark + - name: PROMPT_DATASET_FILE + value: billsum_conversations.json + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: token + name: hf-token + resources: + limits: + cpu: "2" + memory: 20Gi + requests: + cpu: "2" + memory: 20Gi \ No newline at end of file diff --git a/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml b/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml new file mode 100644 index 000000000..114cd9922 --- /dev/null +++ b/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml @@ -0,0 +1,289 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama3-8b-instruct +spec: + replicas: 10 + selector: + matchLabels: + app: vllm-llama3-8b-instruct + template: + metadata: + labels: + app: vllm-llama3-8b-instruct + spec: + containers: + - name: vllm + image: "vllm/vllm-openai:latest" + imagePullPolicy: Always + command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args: + - "--model" + - "meta-llama/Llama-3.1-8B-Instruct" + - "--tensor-parallel-size" + - "1" + - "--port" + - "8000" + - "--enable-lora" + - "--max-loras" + - "15" + - "--max-cpu-loras" + - "15" + - "--compilation-config" + - "3" + - "--max-lora-rank" + - "8" + - "--max-num-seqs" + - "2048" + - "--max-model-len" + - "2048" + - "--no-enable-prefix-caching" + env: + - name: PORT + value: "8000" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "false" + ports: + - containerPort: 8000 + name: http + protocol: TCP + lifecycle: + preStop: + # vLLM stops accepting connections when it receives SIGTERM, so we need to sleep + # to give upstream gateways a chance to take us out of rotation. The time we wait + # is dependent on the time it takes for all upstreams to completely remove us from + # rotation. Older or simpler load balancers might take upwards of 30s, but we expect + # our deployment to run behind a modern gateway like Envoy which is designed to + # probe for readiness aggressively. + sleep: + # Upstream gateway probers for health should be set on a low period, such as 5s, + # and the shorter we can tighten that bound the faster that we release + # accelerators during controlled shutdowns. However, we should expect variance, + # as load balancers may have internal delays, and we don't want to drop requests + # normally, so we're often aiming to set this value to a p99 propagation latency + # of readiness -> load balancer taking backend out of rotation, not the average. + # + # This value is generally stable and must often be experimentally determined on + # for a given load balancer and health check period. We set the value here to + # the highest value we observe on a supported load balancer, and we recommend + # tuning this value down and verifying no requests are dropped. + # + # If this value is updated, be sure to update terminationGracePeriodSeconds. + # + seconds: 30 + # + # IMPORTANT: preStop.sleep is beta as of Kubernetes 1.30 - for older versions + # replace with this exec action. 
+ #exec:
+ # command:
+ # - /usr/bin/sleep
+ # - "30"
+ livenessProbe:
+ httpGet:
+ path: /health
+ port: http
+ scheme: HTTP
+ # vLLM's health check is simple, so we can more aggressively probe it. Liveness
+ # check endpoints should always be suitable for aggressive probing.
+ periodSeconds: 1
+ successThreshold: 1
+ # vLLM has a very simple health implementation, which means that any failure is
+ # likely significant. However, any liveness-triggered restart requires the very
+ # large core model to be reloaded, and so we should bias towards ensuring the
+ # server is definitely unhealthy vs immediately restarting. Use 5 attempts as
+ # evidence of a serious problem.
+ failureThreshold: 5
+ timeoutSeconds: 1
+ readinessProbe:
+ httpGet:
+ path: /health
+ port: http
+ scheme: HTTP
+ # vLLM's health check is simple, so we can more aggressively probe it. Readiness
+ # check endpoints should always be suitable for aggressive probing, but may be
+ # slightly more expensive than liveness probes.
+ periodSeconds: 1
+ successThreshold: 1
+ # vLLM has a very simple health implementation, which means that any failure is
+ # likely significant.
+ failureThreshold: 1
+ timeoutSeconds: 1
+ # We set a startup probe so that we don't begin directing traffic or checking
+ # liveness to this instance until the model is loaded.
+ startupProbe:
+ # Failure threshold is when we believe startup will not happen at all, and is set
+ # to the maximum possible time we believe loading a model will take. In our
+ # default configuration we are downloading a model from HuggingFace, which may
+ # take a long time, then the model must load into the accelerator. We choose
+ # 10 minutes as a reasonable maximum startup time before giving up and attempting
+ # to restart the pod.
+ #
+ # IMPORTANT: If the core model takes more than 10 minutes to load, pods will crash
+ # loop forever. Be sure to set this appropriately.
+ failureThreshold: 600
+ # Set delay to start low so that if the base model changes to something smaller
+ # or an optimization is deployed, we don't wait unnecessarily.
+ initialDelaySeconds: 2
+ # As a startup probe, this stops running once it succeeds, and so we can more
+ # aggressively probe even a moderately complex startup - this is a very important
+ # workload.
+ periodSeconds: 1
+ httpGet:
+ # vLLM does not start the OpenAI server (and hence make /health available)
+ # until models are loaded. This may not be true for all model servers.
+ path: /health
+ port: http
+ scheme: HTTP
+ resources:
+ limits:
+ nvidia.com/gpu: 1
+ requests:
+ nvidia.com/gpu: 1
+ volumeMounts:
+ - mountPath: /data
+ name: data
+ - mountPath: /dev/shm
+ name: shm
+ - name: adapters
+ mountPath: "/adapters"
+ initContainers:
+ - name: lora-adapter-syncer
+ tty: true
+ stdin: true
+ image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
+ restartPolicy: Always
+ imagePullPolicy: Always
+ env:
+ - name: DYNAMIC_LORA_ROLLOUT_CONFIG
+ value: "/config/configmap.yaml"
+ volumeMounts: # DO NOT USE subPath, dynamic configmap updates don't work on subPaths
+ - name: config-volume
+ mountPath: /config
+ restartPolicy: Always
+
+ # vLLM allows VLLM_PORT to be specified as an environment variable, but a user might
+ # create a 'vllm' service in their namespace. That auto-injects VLLM_PORT in docker
+ # compatible form as `tcp://<ip>:<port>` instead of the numeric value vLLM accepts,
+ # causing CrashLoopBackoff. Set service environment injection off by default.
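+ #
+ # For illustration only (the cluster IP below is hypothetical): with service links
+ # enabled, a Service named 'vllm' would inject VLLM_PORT=tcp://10.96.0.12:8000 into
+ # this container, whereas vLLM expects a plain numeric value such as VLLM_PORT=8000.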
+ enableServiceLinks: false
+
+ # Generally, the termination grace period needs to last longer than the slowest request
+ # we expect to serve plus any extra time spent waiting for load balancers to take the
+ # model server out of rotation.
+ #
+ # An easy starting point is the p99 or max request latency measured for your workload,
+ # although LLM request latencies vary significantly if clients send longer inputs or
+ # trigger longer outputs. Since steady state p99 will be higher than the latency
+ # to drain a server, you may wish to slightly lower this value, either experimentally
+ # or via the calculation below.
+ #
+ # For most models you can derive an upper bound for the maximum drain latency as
+ # follows:
+ #
+ # 1. Identify the maximum context length the model was trained on, or the maximum
+ # allowed length of output tokens configured on vLLM (llama2-7b was trained to
+ # 4k context length, while llama3-8b was trained to 128k).
+ # 2. Output tokens are the more compute intensive to calculate, and the accelerator
+ # will have a maximum concurrency (batch size) - the time per output token at
+ # maximum batch with no prompt tokens being processed is the slowest an output
+ # token can be generated (for this model it would be about 100ms TPOT at a max
+ # batch size around 50)
+ # 3. Calculate the worst case request duration if a request starts immediately
+ # before the server stops accepting new connections - generally when it receives
+ # SIGTERM (for this model that is about 4096 / 10 ~ 40s)
+ # 4. If there are any requests generating prompt tokens, they will delay when those
+ # output tokens start; prompt token generation is roughly 6x faster than
+ # compute-bound output token generation, so add 20% to the time from above (40s +
+ # 16s ~ 55s)
+ #
+ # Thus we think it will take us at worst about 55s to complete the longest possible
+ # request the model is likely to receive at maximum concurrency (highest latency)
+ # once requests stop being sent.
+ #
+ # NOTE: This number will be lower than steady state p99 latency since we stop receiving
+ # new requests which require continuous prompt token computation.
+ # NOTE: The max timeout for backend connections from gateway to model servers should
+ # be configured based on steady state p99 latency, not drain p99 latency.
+ #
+ # 5. Add the time the pod takes in its preStop hook to ensure the load balancers have
+ # stopped sending us new requests (55s + 30s ~ 85s)
+ #
+ # Because termination grace period controls when the Kubelet forcibly terminates a
+ # stuck or hung process (a possibility due to a GPU crash), there is operational safety
+ # in keeping the value roughly proportional to the time to finish serving. There is also
+ # value in adding a bit of extra time to deal with unexpectedly long workloads.
+ #
+ # 6. Add a 50% safety buffer to this time since the operational impact should be low
+ # (85s * 1.5 ~ 130s)
+ #
+ # One additional source of drain latency is that some workloads may run close to
+ # saturation and have queued requests on each server. Since traffic in excess of the
+ # max sustainable QPS will result in timeouts as the queues grow, we assume that failure
+ # to drain in time due to excess queues at the time of shutdown is an expected failure
+ # mode of server overload. If your workload occasionally experiences high queue depths
+ # due to periodic traffic, consider increasing the safety margin above to account for
+ # time to drain queued requests.
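+ #
+ # Recap of the worked estimate above, using this manifest's assumptions (100ms TPOT,
+ # 30s preStop sleep, 50% buffer): ~40s worst-case decode -> +20% prompt overhead ~55s
+ # -> +30s preStop ~85s -> *1.5 safety buffer ~130s, which is the value set below.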
+ terminationGracePeriodSeconds: 130 + + volumes: + - name: data + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + - name: adapters + emptyDir: {} + - name: config-volume + configMap: + name: vllm-llama3-8b-instruct-adapters +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-llama3-8b-instruct-adapters +data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama3.1-8b-instruct + port: 8000 + defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + ensureExist: + models: + - id: adapter-0 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-1 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-2 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-3 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-4 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-5 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-6 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-7 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-8 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-9 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-10 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-11 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-12 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-13 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-14 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + + + + diff --git a/mkdocs.yml b/mkdocs.yml index e5927ed53..bf1536fe7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -67,6 +67,7 @@ nav: - Implementer's Guide: guides/implementers.md - Performance: - Benchmark: performance/benchmark/index.md + - Regression Testing: performance/regression-testing/index.md - Reference: - API Reference: reference/spec.md - API Types: diff --git a/site-src/performance/benchmark/index.md b/site-src/performance/benchmark/index.md index 160cc26fb..42d5e727b 100644 --- a/site-src/performance/benchmark/index.md +++ b/site-src/performance/benchmark/index.md @@ -106,7 +106,7 @@ This guide shows how to run the jupyter notebook using vscode after completing k pip install -r ./tools/benchmark/requirements.txt ``` -1. Open the notebook `./tools/benchmark/benchmark.ipynb`, and run each cell. At the end you should - see a bar chart like below where __"ie"__ represents inference extension. This chart is generated using this benchmarking tool with 6 vLLM (v1) model servers (H100 80 GB), [llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main) and the [ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json). - - ![alt text](example-bar-chart.png) +1. Open the notebook `./tools/benchmark/benchmark.ipynb`, and run each cell. In the last cell update the benchmark ids with`inference-extension` and `k8s-svc`. At the end you should + see a bar chart like below where **"ie"** represents inference extension. This chart is generated using this benchmarking tool with 6 vLLM (v1) model servers (H100 80 GB), [llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main) and the [ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json). 
+
+ ![alt text](example-bar-chart.png)
\ No newline at end of file
diff --git a/site-src/performance/regression-testing/index.md b/site-src/performance/regression-testing/index.md
new file mode 100644
index 000000000..16b5552f5
--- /dev/null
+++ b/site-src/performance/regression-testing/index.md
@@ -0,0 +1,103 @@
+# Regression Testing
+
+Regression testing verifies that recent code changes have not adversely affected the performance or stability of the Inference Gateway.
+
+This guide explains how to run regression tests against the Gateway API inference extension using the [Latency Profile Generator (LPG)](https://github.com/AI-Hypercomputer/inference-benchmark/) to simulate traffic and collect performance metrics.
+
+## Prerequisites
+
+Refer to the [benchmark guide](/site-src/performance/benchmark/index.md) for common setup steps, including deployment of the inference extension, model server setup, scaling the vLLM deployment, and obtaining the Gateway IP.
+
+## Create the LPG Docker Image
+
+Follow the detailed instructions [here](https://github.com/AI-Hypercomputer/inference-benchmark/blob/1c92df607751a7ddb04e2152ed7f6aaf85bd9ca7/README.md) to build the LPG Docker image:
+
+* Create an artifact repository:
+
+```bash
+gcloud artifacts repositories create ai-benchmark --location=us-central1 --repository-format=docker
+```
+
+* Prepare datasets for [Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct) and [billsum](https://huggingface.co/datasets/FiscalNote/billsum):
+
+```bash
+pip install datasets transformers numpy pandas tqdm matplotlib
+python datasets/import_dataset.py --hf_token YOUR_TOKEN
+```
+
+* Build the benchmark Docker image:
+
+```bash
+docker build -t inference-benchmark .
+```
+
+* Push the Docker image to your artifact registry:
+
+```bash
+docker tag inference-benchmark us-central1-docker.pkg.dev/{project-name}/ai-benchmark/inference-benchmark
+docker push us-central1-docker.pkg.dev/{project-name}/ai-benchmark/inference-benchmark
+```
+
+## Conduct Regression Tests
+
+Run benchmarks using the configurations below, which are optimized for NVIDIA H100 GPUs (80 GB). Adjust configurations for other hardware as necessary.
+
+### Test Case 1: Single Workload
+
+- **Dataset:** `billsum_conversations.json` (created from the [HuggingFace billsum dataset](https://huggingface.co/datasets/FiscalNote/billsum)).
+ * This dataset features long prompts, making it prefill-heavy and ideal for testing scenarios that emphasize initial token generation.
+- **Model:** [Llama 3 (8B)](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) (*critical*)
+- **Replicas:** 10 (vLLM)
+- **Request Rates:** 300–350 (increments of 10)
+
+Refer to example manifest:
+`./config/manifests/regression-testing/single-workload-regression.yaml`
+
+### Test Case 2: Multi-LoRA
+
+- **Dataset:** `Infinity-Instruct_conversations.json` (created from the [HuggingFace Infinity-Instruct dataset](https://huggingface.co/datasets/BAAI/Infinity-Instruct)).
+ * This dataset has long outputs, making it decode-heavy and useful for testing scenarios focusing on sustained token generation.
+- **Model:** [Llama 3 (8B)](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) +- **LoRA Adapters:** 15 adapters (`nvidia/llama-3.1-nemoguard-8b-topic-control`, rank 8, critical) +- **Hardware:** NVIDIA H100 GPUs (80 GB) +- **Traffic Distribution:** 60% (first 5 adapters, each 12%), 30% (next 5, each 6%), 10% (last 5, each 2%) simulating prod/dev/test tiers +- **Max LoRA:** 3 +- **Replicas:** 10 (vLLM) +- **Request Rates:** 20–200 (increments of 20) + +Optionally, you can also run benchmarks using the `ShareGPT` dataset for additional coverage. + +Update deployments for multi-LoRA support: +- vLLM Deployment: `./config/manifests/regression-testing/vllm/multi-lora-deployment.yaml` +- InferenceModel: `./config/manifests/inferencemodel.yaml` + +Refer to example manifest: +`./config/manifests/regression-testing/multi-lora-regression.yaml` + +### Execute Benchmarks + +Benchmark in two phases: before and after applying your changes: + +- **Before changes:** + +```bash +benchmark_id='regression-before' ./tools/benchmark/download-benchmark-results.bash +``` + +- **After changes:** + +```bash +benchmark_id='regression-after' ./tools/benchmark/download-benchmark-results.bash +``` + +## Analyze Benchmark Results + +Use the provided Jupyter notebook (`./tools/benchmark/benchmark.ipynb`) to analyze results: + +- Update benchmark IDs to `regression-before` and `regression-after`. +- Compare latency and throughput metrics, performing regression analysis. +- Check R² values specifically: + - **Prompts Attempted/Succeeded:** Expect R² ≈ 1 + - **Output Tokens per Minute, P90 per Output Token Latency, P90 Latency:** Expect R² close to 1 (allow minor variance). + +Identify significant deviations, investigate causes, and confirm performance meets expected standards. \ No newline at end of file diff --git a/tools/benchmark/benchmark.ipynb b/tools/benchmark/benchmark.ipynb index ffd4c455e..21723fbd7 100644 --- a/tools/benchmark/benchmark.ipynb +++ b/tools/benchmark/benchmark.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "executionInfo": { "elapsed": 391, @@ -21,16 +21,17 @@ "#@title Configuration. 
Edit this before running the rest.\n", "\n", "OUTPUT_DIR='output'\n", - "RUN_ID='default-run'\n", + "RUN_ID='example-run'\n", "# Path to the benchmark dir under `gateway-api-inference-extension/benchmark`\n", "BENCHMARK_DIR =\"./\"\n", "# A regex to match the model name, which matches the output file name.\n", - "MODEL_MATCHER='.*llama.*'" + "MODEL_MATCHER='.*llama.*'\n", + "INTERACTIVE_PLOT='False'" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "executionInfo": { "elapsed": 33, @@ -55,6 +56,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import math\n", + "from sklearn.metrics import r2_score\n", "import logging\n", "level = logging.INFO\n", "logger = logging.getLogger(__name__)\n", @@ -82,11 +84,11 @@ " XY(x = 'request_rate', x_label = 'QPS', y = 'output_tokens_per_min'),\n", " XY(x = \"request_rate\", x_label = 'QPS', y = \"p90_per_output_token_latency\"),\n", " XY(x = \"request_rate\", x_label = 'QPS', y = \"p90_latency\"),\n", + " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_attempted\"),\n", + " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_succeeded\"),\n", "]\n", "SANITY_CHECK_METRICS = [\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'benchmark_time'),\n", - " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_attempted\"),\n", - " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_succeeded\"),\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'throughput_rps'),\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'total_input_tokens'),\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'total_output_token'),\n", @@ -110,6 +112,8 @@ " self.interactive = interactive\n", " self.annotate = annotate\n", " self.output_dir = output_dir\n", + " self.data = load_data(self.labels, self.run_id, self.output_dir)\n", + " self.groups = group_data(self.data, self.metrics)\n", "\n", " def withRunId(self, run_id):\n", " return Plotter(run_id, self.labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, self.output_dir)\n", @@ -124,10 +128,16 @@ " return Plotter(self.run_id, self.labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, output_dir)\n", "\n", " def plot_bar(self):\n", - " data = load_data(self.labels, self.run_id, self.output_dir)\n", - " groups = group_data(data, self.metrics)\n", + " \n", " logger.debug(\"Plotting run id...\")\n", - " plot_bar(self.labels, groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", + " plot_bar(self.labels, self.groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", + "\n", + " def plot_delta(self):\n", + " \"\"\"\n", + " Plot the delta between two labels.\n", + " \"\"\"\n", + " logger.debug(\"Plotting delta for run id...\")\n", + " plot_delta(self.labels, self.groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", "\n", "def filepaths(root_dir):\n", " \"\"\"\n", @@ -201,6 +211,27 @@ " groups = data.groupby(by=['label'],sort=True)\n", " return groups\n", "\n", + "def compute_r2_for_metrics(groups, metrics, label_before, label_after):\n", + " print(\"\\nCoefficient of Determination (R^2) between before and after runs:\")\n", + " for m in metrics:\n", + " try:\n", + " df_b = groups.get_group(label_before).set_index('request_rate')\n", + " df_a = groups.get_group(label_after).set_index('request_rate')\n", + " except KeyError:\n", + " print(f\" Skipping {m.y}: missing group data for '{label_before}' 
or '{label_after}'\")\n", + " continue\n", + " common = sorted(set(df_b.index).intersection(df_a.index))\n", + " yb = df_b.loc[common, m.y].values\n", + " ya = df_a.loc[common, m.y].values\n", + " mask = ~np.isnan(yb) & ~np.isnan(ya)\n", + " yb, ya = yb[mask], ya[mask]\n", + " if len(yb) > 1 and np.any(yb != 0):\n", + " r2 = r2_score(yb, ya)\n", + " print(f\" {m.y:<30} R^2 = {r2:.4f}\")\n", + " else:\n", + " print(f\" {m.y:<30} insufficient data for R^2 calculation\")\n", + "\n", + "\n", "def init_plot(metrics, num_plots_per_row=NUM_PLOTS_PER_ROW):\n", " num_plots_per_row = min(num_plots_per_row, len(metrics))\n", " row_count = math.ceil(len(metrics) / num_plots_per_row)\n", @@ -229,7 +260,7 @@ " plot_func(curAx, m)\n", " return fig, axes\n", "\n", - "def plot_bar(labels, groups, metrics=CORE_METRICS, num_plots_per_row=NUM_PLOTS_PER_ROW, interactive=False, annotate=False):\n", + "def plot_bar(labels, groups, metrics=CORE_METRICS, num_plots_per_row=NUM_PLOTS_PER_ROW, interactive=INTERACTIVE_PLOT, annotate=False):\n", " labels = [label.alias for label in labels]\n", " logger.debug(f'Prnting bar chart for {labels}')\n", " logger.debug(f'groups: {groups}')\n", @@ -294,7 +325,106 @@ " fig, axes = plot_metrics(metrics, plot_func, num_plots_per_row)\n", " fig.tight_layout(rect=[0, 0.03, 1, 0.95])\n", " plt.show()\n", - "\n" + "\n", + "def plot_delta(labels, groups, metrics=CORE_METRICS, num_plots_per_row=NUM_PLOTS_PER_ROW, interactive=True, annotate=False):\n", + " \"\"\"\n", + " Plot the delta between base_label and compare_label for each metric.\n", + " A positive delta means compare_label has a higher value than base_label.\n", + " \"\"\"\n", + " base_label = labels[0].name\n", + " compare_label = labels[1].name\n", + " logger.debug(f'Printing delta chart for {base_label} vs {compare_label}')\n", + "\n", + " try:\n", + " base_df = groups.get_group((base_label,))\n", + " compare_df = groups.get_group((compare_label,))\n", + " except Exception as e:\n", + " logger.error(f\"Error getting data for labels {base_label} and {compare_label}: {e}\")\n", + " return\n", + "\n", + " y_columns = [m.y for m in metrics]\n", + "\n", + " # 1. Find common request rates\n", + " base_rates = set(base_df['request_rate'].astype(int))\n", + " compare_rates = set(compare_df['request_rate'].astype(int))\n", + " common_rates = sorted(list(base_rates.intersection(compare_rates)))[:6]\n", + "\n", + " if not common_rates:\n", + " logger.error(f\"No common request rates found between {base_label} and {compare_label}\")\n", + " return\n", + "\n", + " # 2. Prepare data for delta calculation\n", + " base_data = base_df.set_index('request_rate').to_dict()\n", + " compare_data = compare_df.set_index('request_rate').to_dict()\n", + "\n", + " # Calculate deltas (compare_label - base_label)\n", + " delta_data = {y_col: {} for y_col in y_columns}\n", + " for y_col in y_columns:\n", + " for rate in common_rates:\n", + " base_val = base_data.get(y_col, {}).get(rate, np.nan)\n", + " compare_val = compare_data.get(y_col, {}).get(rate, np.nan)\n", + "\n", + " if not np.isnan(base_val) and not np.isnan(compare_val):\n", + " delta_data[y_col][rate] = (compare_val - base_val)/base_val*100\n", + " else:\n", + " delta_data[y_col][rate] = np.nan\n", + "\n", + " # 3. 
Plotting\n", + " def plot_func(curAx, m):\n", + " x = np.arange(len(common_rates))\n", + " y_values = [delta_data[m.y].get(rr, np.nan) for rr in common_rates]\n", + "\n", + " # Determine colors based on positive/negative values\n", + " colors = ['green' if val > 0 else 'blue' for val in y_values]\n", + "\n", + " rects = curAx.bar(x, y_values, 0.6, color=colors)\n", + "\n", + " # Add a horizontal line at y=0\n", + " curAx.axhline(y=0, color='black', linestyle='-', linewidth=1)\n", + "\n", + " if annotate:\n", + " for rect, val in zip(rects, y_values):\n", + " if not np.isnan(val):\n", + " height = rect.get_height()\n", + " # For negative bars, put text above the bar\n", + " vert_align = 'bottom' if val >= 0 else 'top'\n", + " y_offset = 3 if val >= 0 else -3\n", + "\n", + " curAx.annotate(f'{val:.2f}',\n", + " xy=(rect.get_x() + rect.get_width() / 2, val),\n", + " xytext=(0, y_offset), # vertical offset\n", + " textcoords=\"offset points\",\n", + " ha='center', va=vert_align)\n", + "\n", + " # Create a title that shows what this delta represents\n", + " title = f\"Delta: {compare_label} - {base_label} ({m.y})\"\n", + " curAx.set_title(title, fontsize=12)\n", + "\n", + " # Add labels\n", + " curAx.set_xlabel(m.x_label, fontsize=axis_label_fontsize)\n", + " #curAx.set_ylabel(f\"% Delta in {m.y_label}\", fontsize=axis_label_fontsize)\n", + " curAx.set_xticks(x)\n", + " curAx.set_xticklabels(common_rates)\n", + " curAx.tick_params(axis='both', labelsize=tick_label_fontsize)\n", + "\n", + " # Create a dummy handle for the legend\n", + " legend_handle = [plt.Rectangle((0,0),1,1,color='green'),\n", + " plt.Rectangle((0,0),1,1,color='blue')]\n", + " legend_label = [f'{compare_label} > {base_label}',\n", + " f'{compare_label} < {base_label}']\n", + "\n", + " return legend_handle, legend_label\n", + "\n", + " # Create plot with metrics\n", + " fig, axes = plot_metrics(metrics, plot_func, num_plots_per_row)\n", + "\n", + " # Add an overall title for the figure\n", + " fig.suptitle(f\"% Delta Metrics: {compare_label} - {base_label}\",\n", + " fontsize=title_fontsize, y=0.98)\n", + "\n", + " plt.subplots_adjust(bottom=0.15, top=0.9) # Make room for legends\n", + " fig.tight_layout(rect=[0, 0.1, 1, 0.95]) # Adjust the rectangle in which the subplots fit\n", + " plt.show()" ] }, { @@ -320,9 +450,26 @@ "outputs": [], "source": [ "#@title Plot Result\n", - "\n", - "pl = Plotter(run_id=RUN_ID, labels=[Label('inference-extension'),Label('k8s-svc')], output_dir=OUTPUT_DIR)\n", - "pl.plot_bar()" + "# initialize the plotter with the run id and labels. 
\n", + "# Example labels are 'inference-extension' and 'k8s-svc' if comparing Inference Extension and K8s Service \n", + "# 'regression-before' and 'regression-after' if comparing two different runs of inference extension to see the regression\n", + "\n", + "benchmark_id1 = # eg 'regression-before' or 'inference-extension'\n", + "benchmark_id2 = # eg 'regression-after' or 'k8s-svc'\n", + "labels = [Label(benchmark_id1), Label(benchmark_id2,)]\n", + "\n", + "# Plot bar chart of metrics\n", + "pl = Plotter(run_id=RUN_ID, labels=labels, output_dir=OUTPUT_DIR)\n", + "pl.plot_bar()\n", + "pl.plot_delta()\n", + "\n", + "# Load & group data to compute R^2\n", + "all_data = load_data(labels, RUN_ID, OUTPUT_DIR)\n", + "groups = group_data(all_data)\n", + "compute_r2_for_metrics(groups, CORE_METRICS,\n", + " label_before=benchmark_id1,\n", + " label_after=benchmark_id2)\n", + "\n" ] } ], @@ -355,4 +502,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file