diff --git a/config/manifests/regression-testing/inferencemodel.yaml b/config/manifests/regression-testing/inferencemodel.yaml new file mode 100644 index 000000000..d8eada95a --- /dev/null +++ b/config/manifests/regression-testing/inferencemodel.yaml @@ -0,0 +1,237 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-0 +spec: + modelName: adapter-0 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-0 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-1 +spec: + modelName: adapter-1 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-1 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-2 +spec: + modelName: adapter-2 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-2 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-3 +spec: + modelName: adapter-3 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-3 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-4 +spec: + modelName: adapter-4 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-4 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-5 +spec: + modelName: adapter-5 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-5 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-6 +spec: + modelName: adapter-6 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-6 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-7 +spec: + modelName: adapter-7 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-7 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-8 +spec: + modelName: adapter-8 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-8 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-9 +spec: + modelName: adapter-9 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-9 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-10 +spec: + modelName: adapter-10 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-10 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-11 +spec: + modelName: adapter-11 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-11 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-12 +spec: + modelName: adapter-12 + criticality: Critical + poolRef: + 
name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-12 + weight: 100 + + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-13 +spec: + modelName: adapter-13 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-13 + weight: 100 + + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-14 +spec: + modelName: adapter-14 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-14 + weight: 100 + +--- + + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: base-model +spec: + modelName: meta-llama/Llama-3.1-8B-Instruct + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct \ No newline at end of file diff --git a/config/manifests/regression-testing/multi-lora-regression.yaml b/config/manifests/regression-testing/multi-lora-regression.yaml new file mode 100644 index 000000000..00b5d7d50 --- /dev/null +++ b/config/manifests/regression-testing/multi-lora-regression.yaml @@ -0,0 +1,62 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: benchmark-tool + name: benchmark-tool +spec: + replicas: 1 + selector: + matchLabels: + app: benchmark-tool + template: + metadata: + labels: + app: benchmark-tool + spec: + containers: + # Build image from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/46d638262650a1928e47699d78ab2da062d4422d + - image: '' + imagePullPolicy: Always + name: benchmark-tool + command: + - bash + - -c + - ./latency_throughput_curve.sh + env: + - name: IP + value: '' + - name: REQUEST_RATES + value: '20,40,60,80,100,120,140,160,180,200' + - name: BENCHMARK_TIME_SECONDS + value: '300' + - name: TOKENIZER + value: 'meta-llama/Llama-3.1-8B-Instruct' + - name: MODELS + value: 'adapter-0,adapter-1,adapter-2,adapter-3,adapter-4,adapter-5,adapter-6,adapter-7,adapter-8,adapter-9,adapter-10,adapter-11,adapter-12,adapter-13,adapter-14' + - name: TRAFFIC_SPLIT + value: '0.12,0.12,0.12,0.12,0.12,0.06,0.06,0.06,0.06,0.06,0.02,0.02,0.02,0.02,0.02' + - name: BACKEND + value: vllm + - name: PORT + value: "80" + - name: INPUT_LENGTH + value: "1024" + - name: OUTPUT_LENGTH + value: '1024' + - name: FILE_PREFIX + value: benchmark + - name: PROMPT_DATASET_FILE + value: Infinity-Instruct_conversations.json + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: token + name: hf-token + resources: + limits: + cpu: "2" + memory: 20Gi + requests: + cpu: "2" + memory: 20Gi \ No newline at end of file diff --git a/config/manifests/regression-testing/single-workload-regression.yaml b/config/manifests/regression-testing/single-workload-regression.yaml new file mode 100644 index 000000000..b13b7eed8 --- /dev/null +++ b/config/manifests/regression-testing/single-workload-regression.yaml @@ -0,0 +1,60 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: benchmark-tool + name: benchmark-tool +spec: + replicas: 1 + selector: + matchLabels: + app: benchmark-tool + template: + metadata: + labels: + app: benchmark-tool + spec: + containers: + # Build image from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/46d638262650a1928e47699d78ab2da062d4422d + - image: '' + imagePullPolicy: Always + name: benchmark-tool + command: + - bash + - -c + - ./latency_throughput_curve.sh + env: + - name: IP + value: '' + - name: REQUEST_RATES + value: 
'300,310,320,330,340,350' + - name: BENCHMARK_TIME_SECONDS + value: '300' + - name: TOKENIZER + value: 'meta-llama/Llama-3.1-8B-Instruct' + - name: MODELS + value: 'meta-llama/Llama-3.1-8B-Instruct' + - name: BACKEND + value: vllm + - name: PORT + value: "80" + - name: INPUT_LENGTH + value: "1024" + - name: OUTPUT_LENGTH + value: '1024' + - name: FILE_PREFIX + value: benchmark + - name: PROMPT_DATASET_FILE + value: billsum_conversations.json + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: token + name: hf-token + resources: + limits: + cpu: "2" + memory: 20Gi + requests: + cpu: "2" + memory: 20Gi \ No newline at end of file diff --git a/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml b/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml new file mode 100644 index 000000000..114cd9922 --- /dev/null +++ b/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml @@ -0,0 +1,289 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama3-8b-instruct +spec: + replicas: 10 + selector: + matchLabels: + app: vllm-llama3-8b-instruct + template: + metadata: + labels: + app: vllm-llama3-8b-instruct + spec: + containers: + - name: vllm + image: "vllm/vllm-openai:latest" + imagePullPolicy: Always + command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args: + - "--model" + - "meta-llama/Llama-3.1-8B-Instruct" + - "--tensor-parallel-size" + - "1" + - "--port" + - "8000" + - "--enable-lora" + - "--max-loras" + - "15" + - "--max-cpu-loras" + - "15" + - "--compilation-config" + - "3" + - "--max-lora-rank" + - "8" + - "--max-num-seqs" + - "2048" + - "--max-model-len" + - "2048" + - "--no-enable-prefix-caching" + env: + - name: PORT + value: "8000" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "false" + ports: + - containerPort: 8000 + name: http + protocol: TCP + lifecycle: + preStop: + # vLLM stops accepting connections when it receives SIGTERM, so we need to sleep + # to give upstream gateways a chance to take us out of rotation. The time we wait + # is dependent on the time it takes for all upstreams to completely remove us from + # rotation. Older or simpler load balancers might take upwards of 30s, but we expect + # our deployment to run behind a modern gateway like Envoy which is designed to + # probe for readiness aggressively. + sleep: + # Upstream gateway probers for health should be set on a low period, such as 5s, + # and the shorter we can tighten that bound the faster that we release + # accelerators during controlled shutdowns. However, we should expect variance, + # as load balancers may have internal delays, and we don't want to drop requests + # normally, so we're often aiming to set this value to a p99 propagation latency + # of readiness -> load balancer taking backend out of rotation, not the average. + # + # This value is generally stable and must often be experimentally determined on + # for a given load balancer and health check period. We set the value here to + # the highest value we observe on a supported load balancer, and we recommend + # tuning this value down and verifying no requests are dropped. + # + # If this value is updated, be sure to update terminationGracePeriodSeconds. + # + seconds: 30 + # + # IMPORTANT: preStop.sleep is beta as of Kubernetes 1.30 - for older versions + # replace with this exec action. 
+ #exec:
+ # command:
+ # - /usr/bin/sleep
+ # - "30"
+ livenessProbe:
+ httpGet:
+ path: /health
+ port: http
+ scheme: HTTP
+ # vLLM's health check is simple, so we can more aggressively probe it. Liveness
+ # check endpoints should always be suitable for aggressive probing.
+ periodSeconds: 1
+ successThreshold: 1
+ # vLLM has a very simple health implementation, which means that any failure is
+ # likely significant. However, any liveness-triggered restart requires the very
+ # large core model to be reloaded, and so we should bias towards ensuring the
+ # server is definitely unhealthy vs immediately restarting. Use 5 attempts as
+ # evidence of a serious problem.
+ failureThreshold: 5
+ timeoutSeconds: 1
+ readinessProbe:
+ httpGet:
+ path: /health
+ port: http
+ scheme: HTTP
+ # vLLM's health check is simple, so we can more aggressively probe it. Readiness
+ # check endpoints should always be suitable for aggressive probing, but may be
+ # slightly more expensive than liveness probes.
+ periodSeconds: 1
+ successThreshold: 1
+ # vLLM has a very simple health implementation, which means that any failure is
+ # likely significant.
+ failureThreshold: 1
+ timeoutSeconds: 1
+ # We set a startup probe so that we don't begin directing traffic or checking
+ # liveness to this instance until the model is loaded.
+ startupProbe:
+ # Failure threshold is when we believe startup will not happen at all, and is set
+ # to the maximum possible time we believe loading a model will take. In our
+ # default configuration we are downloading a model from HuggingFace, which may
+ # take a long time, then the model must load into the accelerator. We choose
+ # 10 minutes as a reasonable maximum startup time before giving up and attempting
+ # to restart the pod.
+ #
+ # IMPORTANT: If the core model takes more than 10 minutes to load, pods will crash
+ # loop forever. Be sure to set this appropriately.
+ failureThreshold: 600
+ # Set delay to start low so that if the base model changes to something smaller
+ # or an optimization is deployed, we don't wait unnecessarily.
+ initialDelaySeconds: 2
+ # As a startup probe, this stops running once it succeeds, and so we can more
+ # aggressively probe even a moderately complex startup - this is a very important
+ # workload.
+ periodSeconds: 1
+ httpGet:
+ # vLLM does not start the OpenAI server (and hence make /health available)
+ # until models are loaded. This may not be true for all model servers.
+ path: /health
+ port: http
+ scheme: HTTP
+ resources:
+ limits:
+ nvidia.com/gpu: 1
+ requests:
+ nvidia.com/gpu: 1
+ volumeMounts:
+ - mountPath: /data
+ name: data
+ - mountPath: /dev/shm
+ name: shm
+ - name: adapters
+ mountPath: "/adapters"
+ initContainers:
+ - name: lora-adapter-syncer
+ tty: true
+ stdin: true
+ image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
+ restartPolicy: Always
+ imagePullPolicy: Always
+ env:
+ - name: DYNAMIC_LORA_ROLLOUT_CONFIG
+ value: "/config/configmap.yaml"
+ volumeMounts: # DO NOT USE subPath, dynamic configmap updates don't work on subPaths
+ - name: config-volume
+ mountPath: /config
+ restartPolicy: Always
+
+ # vLLM allows VLLM_PORT to be specified as an environment variable, but a user might
+ # create a 'vllm' service in their namespace. That auto-injects VLLM_PORT in docker
+ # compatible form as `tcp://<ip>:<port>` instead of the numeric value vLLM accepts,
+ # causing CrashLoopBackoff. Set service environment injection off by default.
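+ #
+ # For illustration only (the cluster IP below is hypothetical): with service links
+ # enabled, a Service named 'vllm' would inject VLLM_PORT=tcp://10.96.0.12:8000 into
+ # this container, whereas vLLM expects a plain numeric value such as VLLM_PORT=8000.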
+ enableServiceLinks: false
+
+ # Generally, the termination grace period needs to last longer than the slowest request
+ # we expect to serve plus any extra time spent waiting for load balancers to take the
+ # model server out of rotation.
+ #
+ # An easy starting point is the p99 or max request latency measured for your workload,
+ # although LLM request latencies vary significantly if clients send longer inputs or
+ # trigger longer outputs. Since steady state p99 will be higher than the latency
+ # to drain a server, you may wish to slightly lower this value, either experimentally
+ # or via the calculation below.
+ #
+ # For most models you can derive an upper bound for the maximum drain latency as
+ # follows:
+ #
+ # 1. Identify the maximum context length the model was trained on, or the maximum
+ # allowed length of output tokens configured on vLLM (llama2-7b was trained to
+ # 4k context length, while llama3-8b was trained to 128k).
+ # 2. Output tokens are the more compute intensive to calculate, and the accelerator
+ # will have a maximum concurrency (batch size) - the time per output token at
+ # maximum batch with no prompt tokens being processed is the slowest an output
+ # token can be generated (for this model it would be about 100ms TPOT at a max
+ # batch size around 50)
+ # 3. Calculate the worst case request duration if a request starts immediately
+ # before the server stops accepting new connections - generally when it receives
+ # SIGTERM (for this model that is about 4096 / 10 ~ 40s)
+ # 4. If there are any requests generating prompt tokens, they will delay when those
+ # output tokens start; prompt token generation is roughly 6x faster than
+ # compute-bound output token generation, so add 20% to the time from above (40s +
+ # 16s ~ 55s)
+ #
+ # Thus we think it will take us at worst about 55s to complete the longest possible
+ # request the model is likely to receive at maximum concurrency (highest latency)
+ # once requests stop being sent.
+ #
+ # NOTE: This number will be lower than steady state p99 latency since we stop receiving
+ # new requests which require continuous prompt token computation.
+ # NOTE: The max timeout for backend connections from gateway to model servers should
+ # be configured based on steady state p99 latency, not drain p99 latency.
+ #
+ # 5. Add the time the pod takes in its preStop hook to ensure the load balancers have
+ # stopped sending us new requests (55s + 30s ~ 85s)
+ #
+ # Because termination grace period controls when the Kubelet forcibly terminates a
+ # stuck or hung process (a possibility due to a GPU crash), there is operational safety
+ # in keeping the value roughly proportional to the time to finish serving. There is also
+ # value in adding a bit of extra time to deal with unexpectedly long workloads.
+ #
+ # 6. Add a 50% safety buffer to this time since the operational impact should be low
+ # (85s * 1.5 ~ 130s)
+ #
+ # One additional source of drain latency is that some workloads may run close to
+ # saturation and have queued requests on each server. Since traffic in excess of the
+ # max sustainable QPS will result in timeouts as the queues grow, we assume that failure
+ # to drain in time due to excess queues at the time of shutdown is an expected failure
+ # mode of server overload. If your workload occasionally experiences high queue depths
+ # due to periodic traffic, consider increasing the safety margin above to account for
+ # time to drain queued requests.
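+ #
+ # Recap of the worked estimate above, using this manifest's assumptions (100ms TPOT,
+ # 30s preStop sleep, 50% buffer): ~40s worst-case decode -> +20% prompt overhead ~55s
+ # -> +30s preStop ~85s -> *1.5 safety buffer ~130s, which is the value set below.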
+ terminationGracePeriodSeconds: 130 + + volumes: + - name: data + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + - name: adapters + emptyDir: {} + - name: config-volume + configMap: + name: vllm-llama3-8b-instruct-adapters +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-llama3-8b-instruct-adapters +data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama3.1-8b-instruct + port: 8000 + defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + ensureExist: + models: + - id: adapter-0 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-1 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-2 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-3 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-4 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-5 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-6 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-7 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-8 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-9 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-10 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-11 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-12 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-13 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-14 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + + + + diff --git a/mkdocs.yml b/mkdocs.yml index e5927ed53..bf1536fe7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -67,6 +67,7 @@ nav: - Implementer's Guide: guides/implementers.md - Performance: - Benchmark: performance/benchmark/index.md + - Regression Testing: performance/regression-testing/index.md - Reference: - API Reference: reference/spec.md - API Types: diff --git a/site-src/performance/benchmark/index.md b/site-src/performance/benchmark/index.md index 160cc26fb..42d5e727b 100644 --- a/site-src/performance/benchmark/index.md +++ b/site-src/performance/benchmark/index.md @@ -106,7 +106,7 @@ This guide shows how to run the jupyter notebook using vscode after completing k pip install -r ./tools/benchmark/requirements.txt ``` -1. Open the notebook `./tools/benchmark/benchmark.ipynb`, and run each cell. At the end you should - see a bar chart like below where __"ie"__ represents inference extension. This chart is generated using this benchmarking tool with 6 vLLM (v1) model servers (H100 80 GB), [llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main) and the [ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json). - - ![alt text](example-bar-chart.png) +1. Open the notebook `./tools/benchmark/benchmark.ipynb`, and run each cell. In the last cell update the benchmark ids with`inference-extension` and `k8s-svc`. At the end you should + see a bar chart like below where **"ie"** represents inference extension. This chart is generated using this benchmarking tool with 6 vLLM (v1) model servers (H100 80 GB), [llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main) and the [ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json). 
+
+ ![alt text](example-bar-chart.png)
\ No newline at end of file
diff --git a/site-src/performance/regression-testing/index.md b/site-src/performance/regression-testing/index.md
new file mode 100644
index 000000000..16b5552f5
--- /dev/null
+++ b/site-src/performance/regression-testing/index.md
@@ -0,0 +1,103 @@
+# Regression Testing
+
+Regression testing verifies that recent code changes have not adversely affected the performance or stability of the Inference Gateway.
+
+This guide explains how to run regression tests against the Gateway API inference extension using the [Latency Profile Generator (LPG)](https://github.com/AI-Hypercomputer/inference-benchmark/) to simulate traffic and collect performance metrics.
+
+## Prerequisites
+
+Refer to the [benchmark guide](/site-src/performance/benchmark/index.md) for common setup steps, including deployment of the inference extension, model server setup, scaling the vLLM deployment, and obtaining the Gateway IP.
+
+## Create the LPG Docker Image
+
+Follow the detailed instructions [here](https://github.com/AI-Hypercomputer/inference-benchmark/blob/1c92df607751a7ddb04e2152ed7f6aaf85bd9ca7/README.md) to build the LPG Docker image:
+
+* Create an artifact repository:
+
+```bash
+gcloud artifacts repositories create ai-benchmark --location=us-central1 --repository-format=docker
+```
+
+* Prepare datasets for [Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct) and [billsum](https://huggingface.co/datasets/FiscalNote/billsum):
+
+```bash
+pip install datasets transformers numpy pandas tqdm matplotlib
+python datasets/import_dataset.py --hf_token YOUR_TOKEN
+```
+
+* Build the benchmark Docker image:
+
+```bash
+docker build -t inference-benchmark .
+```
+
+* Push the Docker image to your artifact registry:
+
+```bash
+docker tag inference-benchmark us-central1-docker.pkg.dev/{project-name}/ai-benchmark/inference-benchmark
+docker push us-central1-docker.pkg.dev/{project-name}/ai-benchmark/inference-benchmark
+```
+
+## Conduct Regression Tests
+
+Run benchmarks using the configurations below, which are optimized for NVIDIA H100 GPUs (80 GB). Adjust configurations for other hardware as necessary.
+
+### Test Case 1: Single Workload
+
+- **Dataset:** `billsum_conversations.json` (created from the [HuggingFace billsum dataset](https://huggingface.co/datasets/FiscalNote/billsum)).
+ * This dataset features long prompts, making it prefill-heavy and ideal for testing scenarios that emphasize initial token generation.
+- **Model:** [Llama 3 (8B)](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) (*critical*)
+- **Replicas:** 10 (vLLM)
+- **Request Rates:** 300–350 (increments of 10)
+
+Refer to example manifest:
+`./config/manifests/regression-testing/single-workload-regression.yaml`
+
+### Test Case 2: Multi-LoRA
+
+- **Dataset:** `Infinity-Instruct_conversations.json` (created from the [HuggingFace Infinity-Instruct dataset](https://huggingface.co/datasets/BAAI/Infinity-Instruct)).
+ * This dataset has long outputs, making it decode-heavy and useful for testing scenarios focusing on sustained token generation.
+- **Model:** [Llama 3 (8B)](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) +- **LoRA Adapters:** 15 adapters (`nvidia/llama-3.1-nemoguard-8b-topic-control`, rank 8, critical) +- **Hardware:** NVIDIA H100 GPUs (80 GB) +- **Traffic Distribution:** 60% (first 5 adapters, each 12%), 30% (next 5, each 6%), 10% (last 5, each 2%) simulating prod/dev/test tiers +- **Max LoRA:** 3 +- **Replicas:** 10 (vLLM) +- **Request Rates:** 20–200 (increments of 20) + +Optionally, you can also run benchmarks using the `ShareGPT` dataset for additional coverage. + +Update deployments for multi-LoRA support: +- vLLM Deployment: `./config/manifests/regression-testing/vllm/multi-lora-deployment.yaml` +- InferenceModel: `./config/manifests/inferencemodel.yaml` + +Refer to example manifest: +`./config/manifests/regression-testing/multi-lora-regression.yaml` + +### Execute Benchmarks + +Benchmark in two phases: before and after applying your changes: + +- **Before changes:** + +```bash +benchmark_id='regression-before' ./tools/benchmark/download-benchmark-results.bash +``` + +- **After changes:** + +```bash +benchmark_id='regression-after' ./tools/benchmark/download-benchmark-results.bash +``` + +## Analyze Benchmark Results + +Use the provided Jupyter notebook (`./tools/benchmark/benchmark.ipynb`) to analyze results: + +- Update benchmark IDs to `regression-before` and `regression-after`. +- Compare latency and throughput metrics, performing regression analysis. +- Check R² values specifically: + - **Prompts Attempted/Succeeded:** Expect R² ≈ 1 + - **Output Tokens per Minute, P90 per Output Token Latency, P90 Latency:** Expect R² close to 1 (allow minor variance). + +Identify significant deviations, investigate causes, and confirm performance meets expected standards. \ No newline at end of file diff --git a/tools/benchmark/benchmark.ipynb b/tools/benchmark/benchmark.ipynb index ffd4c455e..21723fbd7 100644 --- a/tools/benchmark/benchmark.ipynb +++ b/tools/benchmark/benchmark.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "executionInfo": { "elapsed": 391, @@ -21,16 +21,17 @@ "#@title Configuration. 
Edit this before running the rest.\n", "\n", "OUTPUT_DIR='output'\n", - "RUN_ID='default-run'\n", + "RUN_ID='example-run'\n", "# Path to the benchmark dir under `gateway-api-inference-extension/benchmark`\n", "BENCHMARK_DIR =\"./\"\n", "# A regex to match the model name, which matches the output file name.\n", - "MODEL_MATCHER='.*llama.*'" + "MODEL_MATCHER='.*llama.*'\n", + "INTERACTIVE_PLOT='False'" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "executionInfo": { "elapsed": 33, @@ -55,6 +56,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import math\n", + "from sklearn.metrics import r2_score\n", "import logging\n", "level = logging.INFO\n", "logger = logging.getLogger(__name__)\n", @@ -82,11 +84,11 @@ " XY(x = 'request_rate', x_label = 'QPS', y = 'output_tokens_per_min'),\n", " XY(x = \"request_rate\", x_label = 'QPS', y = \"p90_per_output_token_latency\"),\n", " XY(x = \"request_rate\", x_label = 'QPS', y = \"p90_latency\"),\n", + " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_attempted\"),\n", + " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_succeeded\"),\n", "]\n", "SANITY_CHECK_METRICS = [\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'benchmark_time'),\n", - " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_attempted\"),\n", - " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_succeeded\"),\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'throughput_rps'),\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'total_input_tokens'),\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'total_output_token'),\n", @@ -110,6 +112,8 @@ " self.interactive = interactive\n", " self.annotate = annotate\n", " self.output_dir = output_dir\n", + " self.data = load_data(self.labels, self.run_id, self.output_dir)\n", + " self.groups = group_data(self.data, self.metrics)\n", "\n", " def withRunId(self, run_id):\n", " return Plotter(run_id, self.labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, self.output_dir)\n", @@ -124,10 +128,16 @@ " return Plotter(self.run_id, self.labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, output_dir)\n", "\n", " def plot_bar(self):\n", - " data = load_data(self.labels, self.run_id, self.output_dir)\n", - " groups = group_data(data, self.metrics)\n", + " \n", " logger.debug(\"Plotting run id...\")\n", - " plot_bar(self.labels, groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", + " plot_bar(self.labels, self.groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", + "\n", + " def plot_delta(self):\n", + " \"\"\"\n", + " Plot the delta between two labels.\n", + " \"\"\"\n", + " logger.debug(\"Plotting delta for run id...\")\n", + " plot_delta(self.labels, self.groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", "\n", "def filepaths(root_dir):\n", " \"\"\"\n", @@ -201,6 +211,27 @@ " groups = data.groupby(by=['label'],sort=True)\n", " return groups\n", "\n", + "def compute_r2_for_metrics(groups, metrics, label_before, label_after):\n", + " print(\"\\nCoefficient of Determination (R^2) between before and after runs:\")\n", + " for m in metrics:\n", + " try:\n", + " df_b = groups.get_group(label_before).set_index('request_rate')\n", + " df_a = groups.get_group(label_after).set_index('request_rate')\n", + " except KeyError:\n", + " print(f\" Skipping {m.y}: missing group data for '{label_before}' 
or '{label_after}'\")\n", + " continue\n", + " common = sorted(set(df_b.index).intersection(df_a.index))\n", + " yb = df_b.loc[common, m.y].values\n", + " ya = df_a.loc[common, m.y].values\n", + " mask = ~np.isnan(yb) & ~np.isnan(ya)\n", + " yb, ya = yb[mask], ya[mask]\n", + " if len(yb) > 1 and np.any(yb != 0):\n", + " r2 = r2_score(yb, ya)\n", + " print(f\" {m.y:<30} R^2 = {r2:.4f}\")\n", + " else:\n", + " print(f\" {m.y:<30} insufficient data for R^2 calculation\")\n", + "\n", + "\n", "def init_plot(metrics, num_plots_per_row=NUM_PLOTS_PER_ROW):\n", " num_plots_per_row = min(num_plots_per_row, len(metrics))\n", " row_count = math.ceil(len(metrics) / num_plots_per_row)\n", @@ -229,7 +260,7 @@ " plot_func(curAx, m)\n", " return fig, axes\n", "\n", - "def plot_bar(labels, groups, metrics=CORE_METRICS, num_plots_per_row=NUM_PLOTS_PER_ROW, interactive=False, annotate=False):\n", + "def plot_bar(labels, groups, metrics=CORE_METRICS, num_plots_per_row=NUM_PLOTS_PER_ROW, interactive=INTERACTIVE_PLOT, annotate=False):\n", " labels = [label.alias for label in labels]\n", " logger.debug(f'Prnting bar chart for {labels}')\n", " logger.debug(f'groups: {groups}')\n", @@ -294,7 +325,106 @@ " fig, axes = plot_metrics(metrics, plot_func, num_plots_per_row)\n", " fig.tight_layout(rect=[0, 0.03, 1, 0.95])\n", " plt.show()\n", - "\n" + "\n", + "def plot_delta(labels, groups, metrics=CORE_METRICS, num_plots_per_row=NUM_PLOTS_PER_ROW, interactive=True, annotate=False):\n", + " \"\"\"\n", + " Plot the delta between base_label and compare_label for each metric.\n", + " A positive delta means compare_label has a higher value than base_label.\n", + " \"\"\"\n", + " base_label = labels[0].name\n", + " compare_label = labels[1].name\n", + " logger.debug(f'Printing delta chart for {base_label} vs {compare_label}')\n", + "\n", + " try:\n", + " base_df = groups.get_group((base_label,))\n", + " compare_df = groups.get_group((compare_label,))\n", + " except Exception as e:\n", + " logger.error(f\"Error getting data for labels {base_label} and {compare_label}: {e}\")\n", + " return\n", + "\n", + " y_columns = [m.y for m in metrics]\n", + "\n", + " # 1. Find common request rates\n", + " base_rates = set(base_df['request_rate'].astype(int))\n", + " compare_rates = set(compare_df['request_rate'].astype(int))\n", + " common_rates = sorted(list(base_rates.intersection(compare_rates)))[:6]\n", + "\n", + " if not common_rates:\n", + " logger.error(f\"No common request rates found between {base_label} and {compare_label}\")\n", + " return\n", + "\n", + " # 2. Prepare data for delta calculation\n", + " base_data = base_df.set_index('request_rate').to_dict()\n", + " compare_data = compare_df.set_index('request_rate').to_dict()\n", + "\n", + " # Calculate deltas (compare_label - base_label)\n", + " delta_data = {y_col: {} for y_col in y_columns}\n", + " for y_col in y_columns:\n", + " for rate in common_rates:\n", + " base_val = base_data.get(y_col, {}).get(rate, np.nan)\n", + " compare_val = compare_data.get(y_col, {}).get(rate, np.nan)\n", + "\n", + " if not np.isnan(base_val) and not np.isnan(compare_val):\n", + " delta_data[y_col][rate] = (compare_val - base_val)/base_val*100\n", + " else:\n", + " delta_data[y_col][rate] = np.nan\n", + "\n", + " # 3. 
Plotting\n", + " def plot_func(curAx, m):\n", + " x = np.arange(len(common_rates))\n", + " y_values = [delta_data[m.y].get(rr, np.nan) for rr in common_rates]\n", + "\n", + " # Determine colors based on positive/negative values\n", + " colors = ['green' if val > 0 else 'blue' for val in y_values]\n", + "\n", + " rects = curAx.bar(x, y_values, 0.6, color=colors)\n", + "\n", + " # Add a horizontal line at y=0\n", + " curAx.axhline(y=0, color='black', linestyle='-', linewidth=1)\n", + "\n", + " if annotate:\n", + " for rect, val in zip(rects, y_values):\n", + " if not np.isnan(val):\n", + " height = rect.get_height()\n", + " # For negative bars, put text above the bar\n", + " vert_align = 'bottom' if val >= 0 else 'top'\n", + " y_offset = 3 if val >= 0 else -3\n", + "\n", + " curAx.annotate(f'{val:.2f}',\n", + " xy=(rect.get_x() + rect.get_width() / 2, val),\n", + " xytext=(0, y_offset), # vertical offset\n", + " textcoords=\"offset points\",\n", + " ha='center', va=vert_align)\n", + "\n", + " # Create a title that shows what this delta represents\n", + " title = f\"Delta: {compare_label} - {base_label} ({m.y})\"\n", + " curAx.set_title(title, fontsize=12)\n", + "\n", + " # Add labels\n", + " curAx.set_xlabel(m.x_label, fontsize=axis_label_fontsize)\n", + " #curAx.set_ylabel(f\"% Delta in {m.y_label}\", fontsize=axis_label_fontsize)\n", + " curAx.set_xticks(x)\n", + " curAx.set_xticklabels(common_rates)\n", + " curAx.tick_params(axis='both', labelsize=tick_label_fontsize)\n", + "\n", + " # Create a dummy handle for the legend\n", + " legend_handle = [plt.Rectangle((0,0),1,1,color='green'),\n", + " plt.Rectangle((0,0),1,1,color='blue')]\n", + " legend_label = [f'{compare_label} > {base_label}',\n", + " f'{compare_label} < {base_label}']\n", + "\n", + " return legend_handle, legend_label\n", + "\n", + " # Create plot with metrics\n", + " fig, axes = plot_metrics(metrics, plot_func, num_plots_per_row)\n", + "\n", + " # Add an overall title for the figure\n", + " fig.suptitle(f\"% Delta Metrics: {compare_label} - {base_label}\",\n", + " fontsize=title_fontsize, y=0.98)\n", + "\n", + " plt.subplots_adjust(bottom=0.15, top=0.9) # Make room for legends\n", + " fig.tight_layout(rect=[0, 0.1, 1, 0.95]) # Adjust the rectangle in which the subplots fit\n", + " plt.show()" ] }, { @@ -320,9 +450,26 @@ "outputs": [], "source": [ "#@title Plot Result\n", - "\n", - "pl = Plotter(run_id=RUN_ID, labels=[Label('inference-extension'),Label('k8s-svc')], output_dir=OUTPUT_DIR)\n", - "pl.plot_bar()" + "# initialize the plotter with the run id and labels. 
\n", + "# Example labels are 'inference-extension' and 'k8s-svc' if comparing Inference Extension and K8s Service \n", + "# 'regression-before' and 'regression-after' if comparing two different runs of inference extension to see the regression\n", + "\n", + "benchmark_id1 = # eg 'regression-before' or 'inference-extension'\n", + "benchmark_id2 = # eg 'regression-after' or 'k8s-svc'\n", + "labels = [Label(benchmark_id1), Label(benchmark_id2,)]\n", + "\n", + "# Plot bar chart of metrics\n", + "pl = Plotter(run_id=RUN_ID, labels=labels, output_dir=OUTPUT_DIR)\n", + "pl.plot_bar()\n", + "pl.plot_delta()\n", + "\n", + "# Load & group data to compute R^2\n", + "all_data = load_data(labels, RUN_ID, OUTPUT_DIR)\n", + "groups = group_data(all_data)\n", + "compute_r2_for_metrics(groups, CORE_METRICS,\n", + " label_before=benchmark_id1,\n", + " label_after=benchmark_id2)\n", + "\n" ] } ], @@ -355,4 +502,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file