kubernetes-sigs · k8s-ci-robot · Sep 18, 2024 · Sep 4, 2024 · terrytangyuan · Sep 18, 2024
diff --git a/examples/poc/README.md b/examples/poc/README.md
@@ -0,0 +1,68 @@
+# Envoy Ext Proc Gateway with LoRA Integration
+
+This project sets up an Envoy gateway to handle gRPC calls with integration of LoRA (Low-Rank Adaptation). The configuration aims to manage gRPC traffic through Envoy's external processing and custom routing based on headers and load balancing rules. The setup includes Kubernetes services and deployments for both the gRPC server and the vllm-lora application.
+
+## Requirements
+- A vLLM based deployment (using the custom image provided below), with LoRA Adapters
+- Kubernetes cluster
+- Envoy Gateway v1.1 installed on your cluster: https://gateway.envoyproxy.io/v1.1/tasks/quickstart/
+- `kubectl` command-line tool
+- Go (for local development)
+
+## vLLM
+***This PoC uses a modified vLLM fork; the public image of the fork is `ghcr.io/tomatillo-and-multiverse/vllm:demo`.***
+
+The fork is here: https://github.com/kaushikmitr/vllm.
+
+The changes from standard vLLM are, in summary:
+- Active/Registered LoRA adapters are returned as a response header (used for lora-aware routing)
+- Queue size is returned as a response header
+- Active/Registered LoRA adapters are emitted as metrics (for out-of-band scraping during low traffic periods)
+
+
+## Overview
+
+This project contains the necessary configurations and code to set up and deploy a service using Kubernetes, Envoy, and Go. The service routes requests based on the model specified (using the OpenAI API format), collects metrics, and ensures efficient load balancing.
+
+![Envoy gateway bootstrap overview](./envoy-gateway-bootstrap.png)
+
+
+## Quickstart
+
+### Steps
+
+1. **Apply Kubernetes Manifests**
+   ```bash
+   cd manifests
+   kubectl apply -f ext_proc.yaml
+   kubectl apply -f vllm/vllm-lora-service.yaml
+   kubectl apply -f vllm/vllm-lora-deployment.yaml
+   ```
+
+2. **Update `ext_proc.yaml`**
+   - Ensure the `ext_proc.yaml` is updated with the pod names and internal IP addresses of the vLLM replicas, then re-apply it (`kubectl apply -f ext_proc.yaml`) so the change takes effect. This step is crucial for the correct routing of requests based on headers.
+
+3. **Update and apply `gateway.yaml`**
+   - Ensure the `gateway.yaml` is updated with the internal IP addresses of the ExtProc service. This step is also crucial for the correct routing of requests based on headers.
+   ```bash
+   cd manifests
+   kubectl apply -f gateway.yaml
+   ```
+
+### Monitoring and Metrics
+
+- The Go application collects metrics and saves the latest response headers in memory.
+- Ensure Envoy is configured to route based on the metrics collected from the `/metrics` endpoint of the different service pods.
+
+## Contributing
+
+1. Fork the repository.
+2. Create a new branch.
+3. Make your changes.
+4. Open a pull request.
+
+## License
+
+This project is licensed under the MIT License.
+
+---
diff --git a/examples/poc/envoy-gateway-bootstrap.png b/examples/poc/envoy-gateway-bootstrap.png
diff --git a/examples/poc/ext-proc/Dockerfile b/examples/poc/ext-proc/Dockerfile
@@ -0,0 +1,19 @@
+## Multistage build
+FROM golang:1.22.5-alpine AS build
+ENV CGO_ENABLED=0
+ENV GOOS=linux
+ENV GOARCH=amd64
+
+WORKDIR /src
+# Copy the module files first so the dependency-download layer stays cached
+# until go.mod or go.sum change.
+COPY go.mod go.sum ./
+RUN go mod download
+COPY . .
+RUN go build -o /ext-proc
+
+## Multistage deploy
+# Distroless images ship no shell or package manager; the binary is built with
+# CGO_ENABLED=0, so nothing else has to be installed in this stage.
+FROM gcr.io/distroless/base-debian12
+
+WORKDIR /
+COPY --from=build /ext-proc /ext-proc
+
+ENTRYPOINT ["/ext-proc"]
diff --git a/examples/poc/ext-proc/cache/cache.go b/examples/poc/ext-proc/cache/cache.go
@@ -0,0 +1,91 @@
+package cache
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/coocood/freecache"
+)
+
+// ActiveLoraModelMetrics is the per-pod, per-model record that
+// SetCacheActiveLoraModel stores (JSON-encoded) under the cache key
+// "<PodName>:<ModelName>" and that GetCacheActiveLoraModel decodes again.
+type ActiveLoraModelMetrics struct {
+	Date                    string // sample timestamp; format is chosen by the producer — TODO confirm
+	PodName                 string // first component of the cache key
+	ModelName               string // second component of the cache key
+	NumberOfPendingRequests int    // presumably requests queued for this model on this pod — verify against producer
+}
+
+// PendingRequestActiveAdaptersMetrics is the per-pod record that
+// SetCachePendingRequestActiveAdapters stores (JSON-encoded) under the cache
+// key "<PodName>:" and that GetCachePendingRequestActiveAdapters decodes again.
+type PendingRequestActiveAdaptersMetrics struct {
+	Date                   string // sample timestamp; format is chosen by the producer — TODO confirm
+	PodName                string // used to build the cache key
+	PendingRequests        int    // presumably the pod's total queue size — verify against producer
+	NumberOfActiveAdapters int    // presumably the count of LoRA adapters currently active on the pod — verify
+}
+
+// SetCacheActiveLoraModel JSON-encodes metric and stores it in cache under the
+// key "<PodName>:<ModelName>". The entry is written with expireSeconds = 0,
+// i.e. without a TTL.
+//
+// It returns an error when the metric cannot be marshaled or when the cache
+// rejects the entry. The underlying error is wrapped with %w so callers can
+// inspect it with errors.Is / errors.As; the message text is unchanged.
+func SetCacheActiveLoraModel(cache *freecache.Cache, metric ActiveLoraModelMetrics) error {
+	cacheKey := fmt.Sprintf("%s:%s", metric.PodName, metric.ModelName)
+	cacheValue, err := json.Marshal(metric)
+	if err != nil {
+		return fmt.Errorf("error marshaling ActiveLoraModelMetrics for key %s: %w", cacheKey, err)
+	}
+	if err := cache.Set([]byte(cacheKey), cacheValue, 0); err != nil {
+		return fmt.Errorf("error setting cacheActiveLoraModel for key %s: %w", cacheKey, err)
+	}
+	fmt.Printf("Set cacheActiveLoraModel - Key: %s, Value: %s\n", cacheKey, cacheValue)
+	return nil
+}
+
+// SetCachePendingRequestActiveAdapters JSON-encodes metric and stores it in
+// cache under the key "<PodName>:" (the trailing colon is part of the key).
+// The entry is written with expireSeconds = 0, i.e. without a TTL.
+//
+// It returns an error when the metric cannot be marshaled or when the cache
+// rejects the entry. The underlying error is wrapped with %w so callers can
+// inspect it with errors.Is / errors.As; the message text is unchanged.
+func SetCachePendingRequestActiveAdapters(cache *freecache.Cache, metric PendingRequestActiveAdaptersMetrics) error {
+	cacheKey := fmt.Sprintf("%s:", metric.PodName)
+	cacheValue, err := json.Marshal(metric)
+	if err != nil {
+		return fmt.Errorf("error marshaling PendingRequestActiveAdaptersMetrics for key %s: %w", cacheKey, err)
+	}
+	if err := cache.Set([]byte(cacheKey), cacheValue, 0); err != nil {
+		return fmt.Errorf("error setting cachePendingRequestActiveAdapters for key %s: %w", cacheKey, err)
+	}
+	fmt.Printf("Set cachePendingRequestActiveAdapters - Key: %s, Value: %s\n", cacheKey, cacheValue)
+	return nil
+}
+
+// GetCacheActiveLoraModel looks up the entry stored under
+// "<podName>:<modelName>" and decodes it into an ActiveLoraModelMetrics.
+//
+// On any failure the returned pointer is nil. The underlying error is wrapped
+// with %w, so a cache miss can be told apart from a decode failure with
+// errors.Is(err, freecache.ErrNotFound); the message text is unchanged.
+func GetCacheActiveLoraModel(cache *freecache.Cache, podName, modelName string) (*ActiveLoraModelMetrics, error) {
+	cacheKey := fmt.Sprintf("%s:%s", podName, modelName)
+
+	value, err := cache.Get([]byte(cacheKey))
+	if err != nil {
+		return nil, fmt.Errorf("error fetching cacheActiveLoraModel for key %s: %w", cacheKey, err)
+	}
+	var metric ActiveLoraModelMetrics
+	if err := json.Unmarshal(value, &metric); err != nil {
+		return nil, fmt.Errorf("error unmarshaling ActiveLoraModelMetrics for key %s: %w", cacheKey, err)
+	}
+	fmt.Printf("Got cacheActiveLoraModel - Key: %s, Value: %s\n", cacheKey, value)
+	return &metric, nil
+}
+
+// GetCachePendingRequestActiveAdapters looks up the entry stored under
+// "<podName>:" and decodes it into a PendingRequestActiveAdaptersMetrics.
+//
+// On any failure the returned pointer is nil. The underlying error is wrapped
+// with %w, so a cache miss can be told apart from a decode failure with
+// errors.Is(err, freecache.ErrNotFound); the message text is unchanged.
+func GetCachePendingRequestActiveAdapters(cache *freecache.Cache, podName string) (*PendingRequestActiveAdaptersMetrics, error) {
+	cacheKey := fmt.Sprintf("%s:", podName)
+
+	value, err := cache.Get([]byte(cacheKey))
+	if err != nil {
+		return nil, fmt.Errorf("error fetching cachePendingRequestActiveAdapters for key %s: %w", cacheKey, err)
+	}
+	var metric PendingRequestActiveAdaptersMetrics
+	if err := json.Unmarshal(value, &metric); err != nil {
+		return nil, fmt.Errorf("error unmarshaling PendingRequestActiveAdaptersMetrics for key %s: %w", cacheKey, err)
+	}
+	fmt.Printf("Got cachePendingRequestActiveAdapters - Key: %s, Value: %s\n", cacheKey, value)
+	return &metric, nil
+}
+
+// PodCache holds two string-to-string lookup tables. Nothing in the visible
+// part of this file populates or reads it.
+type PodCache struct {
+	PodIPMap map[string]string // presumably pod name -> pod IP — TODO confirm
+	IpPodMap map[string]string // presumably pod IP -> pod name — TODO confirm
+}
+
+// SetPodCache is a placeholder that currently does nothing: neither cache nor
+// pods is read or modified. The signature is kept so callers compile while
+// the pod-mapping logic is still to be written.
+func SetPodCache(cache *freecache.Cache, pods []string) {
+	// TODO: store the pod mapping (see PodCache) in cache. The body is left
+	// empty on purpose; a declared-but-unused local would not compile.
+}
diff --git a/examples/poc/ext-proc/go.mod b/examples/poc/ext-proc/go.mod
@@ -0,0 +1,24 @@
+module ext-proc
+
+go 1.21
+
+require (
+	github.com/coocood/freecache v1.2.4
+	github.com/envoyproxy/go-control-plane v0.12.0
+	github.com/prometheus/client_model v0.6.1
+	github.com/prometheus/common v0.55.0
+	google.golang.org/grpc v1.65.0
+)
+
+require (
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
+	github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b // indirect
+	github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect
+	github.com/golang/protobuf v1.5.4 // indirect
+	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+	golang.org/x/net v0.26.0 // indirect
+	golang.org/x/sys v0.21.0 // indirect
+	golang.org/x/text v0.16.0 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 // indirect
+	google.golang.org/protobuf v1.34.2 // indirect
+)
diff --git a/examples/poc/ext-proc/go.sum b/examples/poc/ext-proc/go.sum
@@ -0,0 +1,33 @@
+github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b h1:ga8SEFjZ60pxLcmhnThWgvH2wg8376yUJmPhEH4H3kw=
+github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8=
+github.com/coocood/freecache v1.2.4 h1:UdR6Yz/X1HW4fZOuH0Z94KwG851GWOSknua5VUbb/5M=
+github.com/coocood/freecache v1.2.4/go.mod h1:RBUWa/Cy+OHdfTGFEhEuE1pMCMX51Ncizj7rthiQ3vk=
+github.com/envoyproxy/go-control-plane v0.12.0 h1:4X+VP1GHd1Mhj6IB5mMeGbLCleqxjletLK6K0rbxyZI=
+github.com/envoyproxy/go-control-plane v0.12.0/go.mod h1:ZBTaoJ23lqITozF0M6G4/IragXCQKCnYbmlmtHvwRG0=
+github.com/envoyproxy/protoc-gen-validate v1.0.4 h1:gVPz/FMfvh57HdSJQyvBtF00j8JU4zdyUgIUNhlgg0A=
+github.com/envoyproxy/protoc-gen-validate v1.0.4/go.mod h1:qys6tmnRsYrQqIhm2bvKZH4Blx/1gTIZ2UKVY1M+Yew=
+github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
+github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
+github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
+github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
+github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
+github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
+golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
+golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
+golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
+golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
+golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 h1:Zy9XzmMEflZ/MAaA7vNcoebnRAld7FsPW1EeBB7V0m8=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157/go.mod h1:EfXuqaE1J41VCDicxHzUDm+8rk+7ZdXzHV0IhO/I6s0=
+google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc=
+google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ=
+google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
+google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=