Commit 5d48994

Merge branch 'kubernetes-sigs:main' into main
2 parents 58d2c77 + 9270ff6

21 files changed: +1031 -229 lines

examples/poc/manifests/vllm/vllm-lora-deployment.yaml (+2 -2)

```diff
@@ -78,9 +78,9 @@ spec:
         timeoutSeconds: 1
       resources:
         limits:
-          nvidia.com/gpu: 2
+          nvidia.com/gpu: 1
         requests:
-          nvidia.com/gpu: 2
+          nvidia.com/gpu: 1
       volumeMounts:
       - mountPath: /data
         name: data
```

pkg/README.md (+4 -4)

````diff
@@ -10,7 +10,7 @@
 
 Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run:
 ```bash
-kubectl apply -f ./manifests/gateway/enable_patch_policy.yaml
+kubectl apply -f ./manifests/enable_patch_policy.yaml
 kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system
 
 ```
@@ -20,14 +20,14 @@
 1. **Deploy Gateway**
 
    ```bash
-   kubectl apply -f ./manifests/gateway/gateway.yaml
+   kubectl apply -f ./manifests/gateway.yaml
    ```
 
 1. **Deploy Ext-Proc**
 
    ```bash
-   kubectl apply -f ./manifests/gateway/ext_proc.yaml
-   kubectl apply -f ./manifests/gateway/patch_policy.yaml
+   kubectl apply -f ./manifests/ext_proc.yaml
+   kubectl apply -f ./manifests/patch_policy.yaml
    ```
 **NOTE**: Ensure the `instance-gateway-ext-proc` deployment is updated with the pod names and internal IP addresses of the vLLM replicas. This step is crucial for the correct routing of requests based on headers. This won't be needed once we make ext proc dynamically read the pods.
 
````

pkg/ext-proc/backend/fake.go (+3 -1)

```diff
@@ -1,5 +1,7 @@
 package backend
 
+import "context"
+
 type FakePodLister struct {
 	Err  error
 	Pods PodSet
@@ -10,7 +12,7 @@ type FakePodMetricsClient struct {
 	Res map[Pod]*PodMetrics
 }
 
-func (f *FakePodMetricsClient) FetchMetrics(pod Pod, existing *PodMetrics) (*PodMetrics, error) {
+func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod Pod, existing *PodMetrics) (*PodMetrics, error) {
 	if err, ok := f.Err[pod]; ok {
 		return nil, err
 	}
```
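
The fake above simply threads the new `ctx` parameter through. As a hedged illustration of why the signature changed (this client, its `/metrics` URL, and the `Address` field are assumptions, not part of this commit), a real `PodMetricsClient` could bind its scrape request to the context so the provider's timeout actually cancels slow fetches:

```go
package backend

import (
	"context"
	"fmt"
	"io"
	"net/http"
)

// httpPodMetricsClient is a hypothetical context-aware PodMetricsClient
// sketch; it is NOT part of this commit.
type httpPodMetricsClient struct{}

func (c *httpPodMetricsClient) FetchMetrics(ctx context.Context, pod Pod, existing *PodMetrics) (*PodMetrics, error) {
	// pod.Address is an assumed field holding the pod's scrape endpoint.
	url := fmt.Sprintf("http://%s/metrics", pod.Address)
	// Binding the request to ctx aborts the scrape once the caller's
	// deadline (fetchMetricsTimeout in provider.go) elapses.
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err // Wraps context.DeadlineExceeded on timeout.
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	// Parsing the Prometheus text format in body into a *PodMetrics is omitted.
	_ = body
	return existing.Clone(), nil
}
```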

pkg/ext-proc/backend/provider.go (+30 -11)

```diff
@@ -1,6 +1,7 @@
 package backend
 
 import (
+	"context"
 	"fmt"
 	"sync"
 	"time"
@@ -9,6 +10,10 @@ import (
 	klog "k8s.io/klog/v2"
 )
 
+const (
+	fetchMetricsTimeout = 5 * time.Second
+)
+
 func NewProvider(pmc PodMetricsClient, pl PodLister) *Provider {
 	p := &Provider{
 		podMetrics: sync.Map{},
@@ -27,7 +32,7 @@ type Provider struct {
 }
 
 type PodMetricsClient interface {
-	FetchMetrics(pod Pod, existing *PodMetrics) (*PodMetrics, error)
+	FetchMetrics(ctx context.Context, pod Pod, existing *PodMetrics) (*PodMetrics, error)
 }
 
 type PodLister interface {
@@ -60,7 +65,8 @@ func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval time.Duratio
 	if err := p.refreshPodsOnce(); err != nil {
 		return fmt.Errorf("failed to init pods: %v", err)
 	}
-	if err := p.refreshMetricsOnce(); err != nil {
+	err := p.refreshMetricsOnce()
+	if err != nil {
 		return fmt.Errorf("failed to init metrics: %v", err)
 	}
 
@@ -113,7 +119,7 @@ func (p *Provider) refreshPodsOnce() error {
 		new := &PodMetrics{
 			Pod: pod,
 			Metrics: Metrics{
-				CachedModels: make(map[string]int),
+				ActiveModels: make(map[string]int),
 			},
 		}
 		p.podMetrics.Store(pod, new)
@@ -132,35 +138,48 @@ func (p *Provider) refreshPodsOnce() error {
 }
 
 func (p *Provider) refreshMetricsOnce() error {
+	ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout)
+	defer cancel()
 	start := time.Now()
 	defer func() {
 		d := time.Since(start)
 		// TODO: add a metric instead of logging
 		klog.V(4).Infof("Refreshed metrics in %v", d)
 	}()
 	var wg sync.WaitGroup
-	var errs error
+	errCh := make(chan error)
 	processOnePod := func(key, value any) bool {
 		klog.V(4).Infof("Processing pod %v and metric %v", key, value)
 		pod := key.(Pod)
 		existing := value.(*PodMetrics)
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			updated, err := p.pmc.FetchMetrics(pod, existing)
+			updated, err := p.pmc.FetchMetrics(ctx, pod, existing)
 			if err != nil {
-				multierr.Append(errs, fmt.Errorf("failed to parse metrics from %s: %v", pod, err))
+				errCh <- fmt.Errorf("failed to parse metrics from %s: %v", pod, err)
 				return
 			}
-			klog.V(4).Infof("Updated metrics for pod %s: %v", pod, updated.Metrics)
-			if err != nil {
-				multierr.Append(errs, fmt.Errorf("failed to get all pod metrics updated from prometheus: %v", err))
-			}
 			p.UpdatePodMetrics(pod, updated)
+			klog.V(4).Infof("Updated metrics for pod %s: %v", pod, updated.Metrics)
 		}()
 		return true
 	}
 	p.podMetrics.Range(processOnePod)
-	wg.Wait()
+
+	// Wait for metric collection for all pods to complete and close the error channel in a
+	// goroutine so this is unblocking, allowing the code to proceed to the error collection code
+	// below.
+	// Note we couldn't use a buffered error channel with a size because the size of the podMetrics
+	// sync.Map is unknown beforehand.
+	go func() {
+		wg.Wait()
+		close(errCh)
+	}()
+
+	var errs error
+	for err := range errCh {
+		errs = multierr.Append(errs, err)
+	}
	return errs
 }
```
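
The heart of this change is a standard Go fan-out pattern: workers send into an unbuffered error channel, a helper goroutine closes the channel after `wg.Wait()`, and the caller drains it with a plain `range` loop. (It also fixes a bug in the old code, which discarded the result of `multierr.Append` and raced on `errs` across goroutines.) A self-contained sketch of the pattern, with illustrative names, follows:

```go
package main

import (
	"fmt"
	"sync"

	"go.uber.org/multierr"
)

// collect fans tasks out to goroutines and aggregates their errors,
// mirroring refreshMetricsOnce above. All names here are illustrative.
func collect(tasks []func() error) error {
	var wg sync.WaitGroup
	// Unbuffered on purpose: the number of senders is unknown up front,
	// just as the size of the podMetrics sync.Map is.
	errCh := make(chan error)
	for _, task := range tasks {
		wg.Add(1)
		go func(task func() error) {
			defer wg.Done()
			if err := task(); err != nil {
				errCh <- err // Blocks until the drain loop below receives it.
			}
		}(task)
	}
	// Close errCh only after every sender has finished, in a separate
	// goroutine so the drain loop can start receiving immediately.
	go func() {
		wg.Wait()
		close(errCh)
	}()
	var errs error
	for err := range errCh { // Terminates once errCh is closed.
		errs = multierr.Append(errs, err)
	}
	return errs
}

func main() {
	err := collect([]func() error{
		func() error { return nil },
		func() error { return fmt.Errorf("injected error") },
	})
	fmt.Println(err) // Prints: injected error
}
```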

pkg/ext-proc/backend/provider_test.go (+111, new file)

```go
package backend

import (
	"errors"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
)

var (
	pod1 = &PodMetrics{
		Pod: Pod{Name: "pod1"},
		Metrics: Metrics{
			WaitingQueueSize:    0,
			KVCacheUsagePercent: 0.2,
			MaxActiveModels:     2,
			ActiveModels: map[string]int{
				"foo": 1,
				"bar": 1,
			},
		},
	}
	pod2 = &PodMetrics{
		Pod: Pod{Name: "pod2"},
		Metrics: Metrics{
			WaitingQueueSize:    1,
			KVCacheUsagePercent: 0.2,
			MaxActiveModels:     2,
			ActiveModels: map[string]int{
				"foo1": 1,
				"bar1": 1,
			},
		},
	}
)

func TestProvider(t *testing.T) {
	tests := []struct {
		name    string
		pmc     PodMetricsClient
		pl      PodLister
		initErr bool
		want    []*PodMetrics
	}{
		{
			name: "Init success",
			pl: &FakePodLister{
				Pods: map[Pod]bool{
					pod1.Pod: true,
					pod2.Pod: true,
				},
			},
			pmc: &FakePodMetricsClient{
				Res: map[Pod]*PodMetrics{
					pod1.Pod: pod1,
					pod2.Pod: pod2,
				},
			},
			want: []*PodMetrics{pod1, pod2},
		},
		{
			name: "Fetch metrics error",
			pl: &FakePodLister{
				Pods: map[Pod]bool{
					pod1.Pod: true,
					pod2.Pod: true,
				},
			},
			pmc: &FakePodMetricsClient{
				Err: map[Pod]error{
					pod2.Pod: errors.New("injected error"),
				},
				Res: map[Pod]*PodMetrics{
					pod1.Pod: pod1,
				},
			},
			initErr: true,
			want: []*PodMetrics{
				pod1,
				// Failed to fetch pod2 metrics so it remains the default values.
				&PodMetrics{
					Pod: Pod{Name: "pod2"},
					Metrics: Metrics{
						WaitingQueueSize:    0,
						KVCacheUsagePercent: 0,
						MaxActiveModels:     0,
						ActiveModels:        map[string]int{},
					},
				},
			},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			p := NewProvider(test.pmc, test.pl)
			err := p.Init(time.Millisecond, time.Millisecond)
			if test.initErr != (err != nil) {
				t.Fatalf("Unexpected error, got: %v, want: %v", err, test.initErr)
			}
			metrics := p.AllPodMetrics()
			lessFunc := func(a, b *PodMetrics) bool {
				return a.String() < b.String()
			}
			if diff := cmp.Diff(test.want, metrics, cmpopts.SortSlices(lessFunc)); diff != "" {
				t.Errorf("Unexpected output (-want +got): %v", diff)
			}
		})
	}
}
```

pkg/ext-proc/backend/types.go (+8 -6)

```diff
@@ -12,12 +12,14 @@ type Pod struct {
 }
 
 func (p Pod) String() string {
-	return p.Namespace + "." + p.Name
+	return p.Namespace + "/" + p.Name
 }
 
 type Metrics struct {
-	// CachedModels is a set of models(including LoRA adapters) that are currently cached to GPU.
-	CachedModels map[string]int
+	// ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU.
+	ActiveModels map[string]int
+	// MaxActiveModels is the maximum number of models that can be loaded to GPU.
+	MaxActiveModels int
 	RunningQueueSize    int
 	WaitingQueueSize    int
 	KVCacheUsagePercent float64
@@ -34,14 +36,14 @@ func (pm *PodMetrics) String() string {
 }
 
 func (pm *PodMetrics) Clone() *PodMetrics {
-	cm := make(map[string]int, len(pm.CachedModels))
-	for k, v := range pm.CachedModels {
+	cm := make(map[string]int, len(pm.ActiveModels))
+	for k, v := range pm.ActiveModels {
 		cm[k] = v
 	}
 	clone := &PodMetrics{
 		Pod: pm.Pod,
 		Metrics: Metrics{
-			CachedModels: cm,
+			ActiveModels: cm,
 			RunningQueueSize:    pm.RunningQueueSize,
 			WaitingQueueSize:    pm.WaitingQueueSize,
 			KVCacheUsagePercent: pm.KVCacheUsagePercent,
```
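
The renamed `Clone` method keeps its element-by-element copy of the map because Go maps are reference types: copying the struct alone would leave two `PodMetrics` values sharing one `ActiveModels` map. A minimal standalone sketch, using trimmed-down stand-ins for the real types, shows the difference:

```go
package main

import "fmt"

// Trimmed-down stand-ins for the backend types, for illustration only.
type Metrics struct {
	ActiveModels map[string]int
}

type PodMetrics struct {
	Metrics
}

// Clone deep-copies ActiveModels, as in the diff above.
func (pm *PodMetrics) Clone() *PodMetrics {
	cm := make(map[string]int, len(pm.ActiveModels))
	for k, v := range pm.ActiveModels {
		cm[k] = v
	}
	return &PodMetrics{Metrics{ActiveModels: cm}}
}

func main() {
	pm := &PodMetrics{Metrics{ActiveModels: map[string]int{"foo": 1}}}

	shallow := *pm                  // copies the struct, but not the map it points to
	shallow.ActiveModels["bar"] = 1 // mutates pm's map too
	fmt.Println(pm.ActiveModels["bar"]) // 1: shallow copies share the map

	deep := pm.Clone()
	deep.ActiveModels["baz"] = 1 // isolated: Clone rebuilt the map
	_, shared := pm.ActiveModels["baz"]
	fmt.Println(shared) // false
}
```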
