kubernetes-sigs
diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go
Lines changed: 6 additions & 0 deletions
diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go
Lines changed: 3 additions & 3 deletions
diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go
Lines changed: 1 addition & 1 deletion
diff --git a/pkg/epp/scheduling/config/config.go b/pkg/epp/scheduling/config/config.go
Lines changed: 58 additions & 0 deletions
@@ -79,6 +79,9 @@ func (p *Pod) String() string {
 }
 
 func (p *Pod) Clone() *Pod {
+	if p == nil {
+		return nil
+	}
 	return &Pod{
 		NamespacedName: types.NamespacedName{
 			Name:      p.NamespacedName.Name,
@@ -118,6 +121,9 @@ func (m *Metrics) String() string {
 }
 
 func (m *Metrics) Clone() *Metrics {
+	if m == nil {
+		return nil
+	}
 	cm := make(map[string]int, len(m.ActiveModels))
 	for k, v := range m.ActiveModels {
 		cm[k] = v
 
@@ -67,7 +67,7 @@ func (s *StreamingServer) HandleRequestBody(
 		ResolvedTargetModel: modelName,
 		Critical:            modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical,
 	}
-	logger.V(logutil.DEBUG).Info("LLM request assembled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "critical", llmReq.Critical)
+	logger.V(logutil.DEBUG).Info("LLM request assembled", "request", llmReq)
 
 	var err error
 	// Update target models in the body.
@@ -81,11 +81,11 @@ func (s *StreamingServer) HandleRequestBody(
 		return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)}
 	}
 
-	target, err := s.scheduler.Schedule(ctx, llmReq)
+	res, err := s.scheduler.Schedule(ctx, llmReq)
 	if err != nil {
 		return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
 	}
-	targetPod := target.GetPod()
+	targetPod := res.TargetPod.GetPod()
 
 	// Insert target endpoint to instruct Envoy to route requests to the specified target pod.
 	// Attach the port number
 
@@ -65,7 +65,7 @@ type StreamingServer struct {
 }
 
 type Scheduler interface {
-	Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (targetPod schedulingtypes.Pod, err error)
+	Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (result *schedulingtypes.Result, err error)
 }
 
 // RequestContext stores context information during the life time of an HTTP request.
 
@@ -0,0 +1,58 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package config
+
+import (
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+// Config groups the scheduler's tunable thresholds; LoadConfig fills each field from the environment variable named beside it.
+type Config struct {
+	KVCacheThreshold       float64 // read from KV_CACHE_THRESHOLD; fallback 0.8
+	QueueThresholdCritical int     // read from QUEUE_THRESHOLD_CRITICAL; fallback 5
+	QueueingThresholdLoRA  int     // read from QUEUING_THRESHOLD_LORA; fallback 128
+	LoraAffinityThreshold  float64 // read from LORA_AFFINITY_THRESHOLD; fallback 0.999
+}
+
+const (
+	// Fallback values LoadConfig hands to the envutil helpers, used when the matching environment variable is not set.
+	defaultKVCacheThreshold       = 0.8   // fallback for KV_CACHE_THRESHOLD
+	defaultQueueThresholdCritical = 5     // fallback for QUEUE_THRESHOLD_CRITICAL
+	defaultQueueingThresholdLoRA  = 128   // fallback for QUEUING_THRESHOLD_LORA
+	defaultLoraAffinityThreshold  = 0.999 // fallback for LORA_AFFINITY_THRESHOLD
+)
+
+// LoadConfig builds a Config from four environment variables via the envutil helpers (each given a package default as fallback), logs the resulting struct once, and returns it by value.
+func LoadConfig() Config {
+	// Named child of controller-runtime's global log.Log; passed to every env helper and reused for the summary line below.
+	baseLogger := log.Log.WithName("scheduling-config")
+
+	config := Config{
+		KVCacheThreshold:       envutil.GetEnvFloat("KV_CACHE_THRESHOLD", defaultKVCacheThreshold, baseLogger),
+		QueueThresholdCritical: envutil.GetEnvInt("QUEUE_THRESHOLD_CRITICAL", defaultQueueThresholdCritical, baseLogger),
+		QueueingThresholdLoRA:  envutil.GetEnvInt("QUEUING_THRESHOLD_LORA", defaultQueueingThresholdLoRA, baseLogger), // NOTE(review): env var is spelled "QUEUING" while the field says "Queueing" — confirm this is intentional
+		LoraAffinityThreshold:  envutil.GetEnvFloat("LORA_AFFINITY_THRESHOLD", defaultLoraAffinityThreshold, baseLogger),
+	}
+
+	baseLogger.V(logutil.DEFAULT).Info("Scheduler configuration loaded", "config", config) // emits the whole struct under the "config" key
+
+	return config
+}
+
+var Conf = LoadConfig() // Conf is the package-wide scheduler configuration; its initializer runs during package initialization, so the environment is read and the summary is logged as soon as this package is imported.
Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,7 @@ func (s *StreamingServer) HandleRequestBody(`
`67`	`67`	`ResolvedTargetModel: modelName,`
`68`	`68`	`Critical: modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical,`
`69`	`69`	`}`
`70`		`- logger.V(logutil.DEBUG).Info("LLM request assembled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "critical", llmReq.Critical)`
	`70`	`+ logger.V(logutil.DEBUG).Info("LLM request assembled", "request", llmReq)`
`71`	`71`
`72`	`72`	`var err error`
`73`	`73`	`// Update target models in the body.`
`@@ -81,11 +81,11 @@ func (s *StreamingServer) HandleRequestBody(`
`81`	`81`	`return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)}`
`82`	`82`	`}`
`83`	`83`
`84`		`- target, err := s.scheduler.Schedule(ctx, llmReq)`
	`84`	`+ res, err := s.scheduler.Schedule(ctx, llmReq)`
`85`	`85`	`if err != nil {`
`86`	`86`	`return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}`
`87`	`87`	`}`
`88`		`- targetPod := target.GetPod()`
	`88`	`+ targetPod := res.TargetPod.GetPod()`
`89`	`89`
`90`	`90`	`// Insert target endpoint to instruct Envoy to route requests to the specified target pod.`
`91`	`91`	`// Attach the port number`
Original file line number	Diff line number	Diff line change
`@@ -65,7 +65,7 @@ type StreamingServer struct {`
`65`	`65`	`}`
`66`	`66`
`67`	`67`	`type Scheduler interface {`
`68`		`- Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (targetPod schedulingtypes.Pod, err error)`
	`68`	`+ Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (result *schedulingtypes.Result, err error)`
`69`	`69`	`}`
`70`	`70`
`71`	`71`	`// RequestContext stores context information during the life time of an HTTP request.`