@@ -98,7 +98,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 	tests := []struct {
 		name          string
 		requests      []*extProcPb.ProcessingRequest
-		pods          map[backend.Pod]*backendmetrics.Metrics
+		pods          map[*backend.Pod]*backendmetrics.Metrics
 		wantResponses []*extProcPb.ProcessingResponse
 		wantMetrics   map[string]string
 		wantErr       bool
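The key-type change repeated in every hunk below follows from the new `Labels` field added to `backend.Pod` at the bottom of this diff: a Go struct containing a map is not comparable, so it can no longer be used as a map key, while a pointer key always can. A minimal sketch of the constraint, using an assumed, reduced `Pod` stand-in rather than the real `backend.Pod`:

```go
package main

import "fmt"

// Assumed, reduced stand-in for backend.Pod after this change.
type Pod struct {
	Name   string
	Labels map[string]string // a map field makes Pod non-comparable
}

func main() {
	// m := map[Pod]int{} // compile error: invalid map key type Pod
	m := map[*Pod]int{} // pointers are always comparable, so this compiles
	p := &Pod{Name: "pod-0", Labels: map[string]string{}}
	m[p] = 1
	fmt.Println(m[p]) // 1: lookup by the same pointer succeeds
}
```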
@@ -109,7 +109,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 			name:     "select lower queue and kv cache, no active lora",
 			requests: integrationutils.GenerateStreamedRequestSet(logger, "test1", "my-model"),
 			// pod-1 will be picked because it has relatively low queue size and low KV cache.
-			pods: map[backend.Pod]*backendmetrics.Metrics{
+			pods: map[*backend.Pod]*backendmetrics.Metrics{
 				fakePod(0): {
 					WaitingQueueSize:    3,
 					KVCacheUsagePercent: 0.2,
@@ -184,7 +184,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 			requests: integrationutils.GenerateStreamedRequestSet(logger, "test2", "sql-lora"),
 			// pod-1 will be picked because it has relatively low queue size, with the requested
 			// model being active, and has low KV cache.
-			pods: map[backend.Pod]*backendmetrics.Metrics{
+			pods: map[*backend.Pod]*backendmetrics.Metrics{
 				fakePod(0): {
 					WaitingQueueSize:    0,
 					KVCacheUsagePercent: 0.2,
@@ -269,7 +269,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 			// pod-2 will be picked despite it NOT having the requested model being active
 			// as it's above the affinity for queue size. Also is critical, so we should
 			// still honor request despite all queues > 5
-			pods: map[backend.Pod]*backendmetrics.Metrics{
+			pods: map[*backend.Pod]*backendmetrics.Metrics{
 				fakePod(0): {
 					WaitingQueueSize:    10,
 					KVCacheUsagePercent: 0.2,
@@ -352,7 +352,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 			requests: integrationutils.GenerateStreamedRequestSet(logger, "test4", "sql-lora-sheddable"),
 			// no pods will be picked as all models are either above kv threshold,
 			// queue threshold, or both.
-			pods: map[backend.Pod]*backendmetrics.Metrics{
+			pods: map[*backend.Pod]*backendmetrics.Metrics{
 				fakePod(0): {
 					WaitingQueueSize:    6,
 					KVCacheUsagePercent: 0.2,
@@ -400,7 +400,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 			name:     "noncritical, but one server has capacity, do not shed",
 			requests: integrationutils.GenerateStreamedRequestSet(logger, "test5", "sql-lora-sheddable"),
 			// pod 0 will be picked as all other models are above threshold
-			pods: map[backend.Pod]*backendmetrics.Metrics{
+			pods: map[*backend.Pod]*backendmetrics.Metrics{
 				fakePod(0): {
 					WaitingQueueSize:    4,
 					KVCacheUsagePercent: 0.2,
@@ -511,7 +511,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 
 			//
 			// pod 0 will be picked as all other models are above threshold
-			pods: map[backend.Pod]*backendmetrics.Metrics{
+			pods: map[*backend.Pod]*backendmetrics.Metrics{
 				fakePod(0): {
 					WaitingQueueSize:    4,
 					KVCacheUsagePercent: 0.2,
@@ -622,7 +622,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 
 			//
 			// pod 0 will be picked as all other models are above threshold
-			pods: map[backend.Pod]*backendmetrics.Metrics{
+			pods: map[*backend.Pod]*backendmetrics.Metrics{
 				fakePod(0): {
 					WaitingQueueSize:    4,
 					KVCacheUsagePercent: 0.2,
@@ -734,7 +734,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 
 			//
 			// pod 0 will be picked as all other models are above threshold
-			pods: map[backend.Pod]*backendmetrics.Metrics{
+			pods: map[*backend.Pod]*backendmetrics.Metrics{
 				fakePod(0): {
 					WaitingQueueSize:    4,
 					KVCacheUsagePercent: 0.2,
@@ -833,7 +833,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 
 			//
 			// pod 0 will be picked as all other models are above threshold
-			pods: map[backend.Pod]*backendmetrics.Metrics{
+			pods: map[*backend.Pod]*backendmetrics.Metrics{
 				fakePod(0): {
 					WaitingQueueSize:    4,
 					KVCacheUsagePercent: 0.2,
@@ -1181,7 +1181,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					DynamicMetadata: makeMetadata("192.168.1.1:8000"),
 				},
 			},
-			pods: map[backend.Pod]*backendmetrics.Metrics{
+			pods: map[*backend.Pod]*backendmetrics.Metrics{
 				fakePod(0): {
 					WaitingQueueSize:    4,
 					KVCacheUsagePercent: 0.2,
@@ -1227,7 +1227,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 	}
 }
 
-func setUpHermeticServer(t *testing.T, podAndMetrics map[backend.Pod]*backendmetrics.Metrics, streamed bool) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) {
+func setUpHermeticServer(t *testing.T, podAndMetrics map[*backend.Pod]*backendmetrics.Metrics, streamed bool) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) {
 	// Reconfigure the TestPodMetricsClient.
 	res := map[types.NamespacedName]*backendmetrics.Metrics{}
 	for pod, metrics := range podAndMetrics {
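With pointer keys, map equality becomes pointer identity: a value-equal `backend.Pod` built elsewhere would never hit an entry in `podAndMetrics`. The hunk above sidesteps that by immediately re-keying the metrics by `types.NamespacedName`, a comparable value type that lookups can reconstruct. A small sketch of the pitfall being avoided, reusing the assumed `Pod` stand-in from the earlier sketch:

```go
a := &Pod{Name: "pod-0", Labels: map[string]string{}}
b := &Pod{Name: "pod-0", Labels: map[string]string{}} // same field values, separate allocation

m := map[*Pod]int{a: 1}
fmt.Println(m[a]) // 1: same pointer, hit
fmt.Println(m[b]) // 0: value-equal but different pointer, miss
```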
@@ -1305,10 +1305,11 @@ func setUpHermeticServer(t *testing.T, podAndMetrics map[backend.Pod]*backendmetrics.Metrics, streamed bool) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) {
 	}
 }
 
-func fakePod(index int) backend.Pod {
-	return backend.Pod{
+func fakePod(index int) *backend.Pod {
+	return &backend.Pod{
 		NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index), Namespace: "default"},
 		Address:        fmt.Sprintf("192.168.1.%d", index+1),
+		Labels:         make(map[string]string, 0),
 	}
 }
 
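Since `fakePod` now returns a fresh allocation on every call, two calls with the same index produce distinct map keys; the test tables stay correct because each key is built exactly once per map literal and the server only ranges over the map. Eagerly initializing `Labels` also lets callers write into it directly, e.g. (illustrative label key and value):

```go
p := fakePod(0)
p.Labels["app"] = "vllm-sim" // safe: Labels was allocated by make; writing to a nil map panics
```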