@@ -14,56 +14,55 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

-package plugins
+package filter

import (
-    "errors"
    "math"
    "math/rand"
    "time"

    "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config"
+    "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins"
    "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
-    errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
    logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
)

-type Filter struct {
+type baseFilter struct {
    name   string
    filter filterFunc
}

-func (bf *Filter) Name() string {
-    if bf == nil {
+func (f *baseFilter) Name() string {
+    if f == nil {
        return "nil"
    }
-    return bf.name
+    return f.name
}

-func (bf *Filter) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) {
+func (f *baseFilter) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
    loggerTrace := ctx.Logger.V(logutil.TRACE)
-    loggerTrace.Info("Running a filter", "name", bf.Name(), "podCount", len(pods))
+    loggerTrace.Info("Running a filter", "name", f.Name(), "podCount", len(pods))

-    return bf.filter(ctx, pods)
+    return f.filter(ctx, pods)
}

// DecisionTreeFilter applies current filterFunc, and then recursively applies next filters
// depending success or failure of the current filter.
// It can be used to construct a flow chart algorithm.
type DecisionTreeFilter struct {
-    Current types.Filter
+    Current plugins.Filter
    // NextOnSuccess filter will be applied after successfully applying the current filter.
    // The filtered results will be passed to the next filter.
-    NextOnSuccess types.Filter
+    NextOnSuccess plugins.Filter
    // NextOnFailure filter will be applied if current filter fails.
    // The original input will be passed to the next filter.
-    NextOnFailure types.Filter
+    NextOnFailure plugins.Filter
    // NextOnSuccessOrFailure is a convenience field to configure the next filter regardless of the
    // success or failure of the current filter.
    // NOTE: When using NextOnSuccessOrFailure, both nextOnSuccess and nextOnFailure SHOULD be nil.
    // However if that's not the case, nextOnSuccess and nextOnFailure will be used, instead of
    // NextOnSuccessOrFailure, in the success and failure scenarios, respectively.
-    NextOnSuccessOrFailure types.Filter
+    NextOnSuccessOrFailure plugins.Filter
}

func (f *DecisionTreeFilter) Name() string {
@@ -73,15 +72,15 @@ func (f *DecisionTreeFilter) Name() string {
    return f.Current.Name()
}

-func (f *DecisionTreeFilter) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) {
+func (f *DecisionTreeFilter) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
    loggerTrace := ctx.Logger.V(logutil.TRACE)
-    filtered, err := f.Current.Filter(ctx, pods)
+    filtered := f.Current.Filter(ctx, pods)

    next := f.NextOnSuccessOrFailure
-    if err == nil && len(filtered) > 0 {
+    if len(filtered) > 0 {
        if f.NextOnSuccess == nil && f.NextOnSuccessOrFailure == nil {
            // No succeeding filters to run, return.
-            return filtered, err
+            return filtered
        }
        if f.NextOnSuccess != nil {
            next = f.NextOnSuccess
@@ -92,7 +91,7 @@ func (f *DecisionTreeFilter) Filter(ctx *types.Context, pods []types.Pod) ([]typ
    } else {
        if f.NextOnFailure == nil && f.NextOnSuccessOrFailure == nil {
            // No succeeding filters to run, return.
-            return filtered, err
+            return filtered
        }
        if f.NextOnFailure != nil {
            next = f.NextOnFailure
@@ -104,26 +103,24 @@ func (f *DecisionTreeFilter) Filter(ctx *types.Context, pods []types.Pod) ([]typ
}

// filterFunc filters a set of input pods to a subset.
-type filterFunc func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error)
+type filterFunc func(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod

// toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc.
func toFilterFunc(pp podPredicate) filterFunc {
-    return func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) {
+    return func(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
        filtered := []types.Pod{}
        for _, pod := range pods {
            pass := pp(ctx.Req, pod)
            if pass {
                filtered = append(filtered, pod)
            }
        }
-        if len(filtered) == 0 {
-            return nil, errors.New("no pods left")
-        }
-        return filtered, nil
+
+        return filtered
    }
}

-var LeastQueueFilter = &Filter{
+var LeastQueueFilter = &baseFilter{
    name:   "least queuing",
    filter: leastQueuingFilterFunc,
}
@@ -135,7 +132,7 @@ var LeastQueueFilter = &Filter{
// the least one as it gives more choices for the next filter, which on aggregate gave better
// results.
// TODO: Compare this strategy with other strategies such as top K.
-func leastQueuingFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) {
+func leastQueuingFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
    min := math.MaxInt
    max := 0
    filtered := []types.Pod{}
@@ -154,15 +151,15 @@ func leastQueuingFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod,
            filtered = append(filtered, pod)
        }
    }
-    return filtered, nil
+    return filtered
}

-var LowQueueFilter = &Filter{
+var LowQueueFilter = &baseFilter{
    name:   "low queueing filter",
    filter: toFilterFunc((queueThresholdPredicate(config.Conf.QueueingThresholdLoRA))),
}

-var LeastKVCacheFilter = &Filter{
+var LeastKVCacheFilter = &baseFilter{
    name:   "least KV cache percent",
    filter: leastKVCacheFilterFunc,
}
@@ -173,7 +170,7 @@ var LeastKVCacheFilter = &Filter{
// should consider them all instead of the absolute minimum one. This worked better than picking the
// least one as it gives more choices for the next filter, which on aggregate gave better results.
// TODO: Compare this strategy with other strategies such as top K.
-func leastKVCacheFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) {
+func leastKVCacheFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
    min := math.MaxFloat64
    var max float64 = 0
    filtered := []types.Pod{}
@@ -192,10 +189,10 @@ func leastKVCacheFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod,
            filtered = append(filtered, pod)
        }
    }
-    return filtered, nil
+    return filtered
}

-var LoRAAffinityFilter = &Filter{
+var LoRAAffinityFilter = &baseFilter{
    name:   "affinity LoRA",
    filter: loRASoftAffinityFilterFunc,
}
@@ -216,7 +213,7 @@ var LoRAAffinityFilter = &Filter{
// Returns:
//   - Filtered slice of pod metrics based on affinity and availability
//   - Error if any issues occur during filtering
-func loRASoftAffinityFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) {
+func loRASoftAffinityFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {

    // Pre-allocate slices with estimated capacity
    filtered_affinity := make([]types.Pod, 0, len(pods))
@@ -241,34 +238,24 @@ func loRASoftAffinityFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.P
    // If both groups have pods, use probability to select which group to return
    if len(filtered_affinity) > 0 && len(filtered_available) > 0 {
        if randGen.Float64() < config.Conf.LoraAffinityThreshold {
-            return filtered_affinity, nil
+            return filtered_affinity
        }
-        return filtered_available, nil
+        return filtered_available
    }

    // Return whichever group has pods
    if len(filtered_affinity) > 0 {
-        return filtered_affinity, nil
+        return filtered_affinity
    }

-    return filtered_available, nil
+    return filtered_available
}

-var HasCapacityFilter = &Filter{
+var HasCapacityFilter = &baseFilter{
    name:   "has capacity for sheddable requests",
    filter: toFilterFunc(queueThresholdPredicate(config.Conf.QueueThresholdCritical).and(kvCacheThresholdPredicate(config.Conf.KVCacheThreshold))),
}

-var DropRequestFilter = &Filter{
-    name: "drop request",
-    filter: func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) {
-        ctx.Logger.V(logutil.DEFAULT).Info("Request dropped", "request", ctx.Req)
-        return []types.Pod{}, errutil.Error{
-            Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources",
-        }
-    },
-}
-

// podPredicate is a filter function to check whether a pod is desired.
type podPredicate func(req *types.LLMRequest, pod types.Pod) bool
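Note (not part of the diff): after this change both *baseFilter and *DecisionTreeFilter can satisfy the plugins.Filter interface directly, since Filter no longer returns an error. Below is a minimal sketch of how the refactored, package-level filters could be composed into a decision tree inside this package; the variable name and the exact chain are illustrative assumptions, not wiring taken from this PR.

// Illustrative sketch only (assumed wiring, not from this PR): compose the
// package-level filters into a decision tree. Each field accepts a
// plugins.Filter, which *baseFilter and *DecisionTreeFilter are assumed to
// satisfy after this change.
var exampleTree = &DecisionTreeFilter{
    // Try pods below the LoRA queueing threshold first.
    Current: LowQueueFilter,
    // Among those, prefer LoRA affinity, then the least-queued pods, then the
    // lowest KV-cache utilization.
    NextOnSuccess: &DecisionTreeFilter{
        Current: LoRAAffinityFilter,
        NextOnSuccessOrFailure: &DecisionTreeFilter{
            Current: LeastQueueFilter,
            NextOnSuccessOrFailure: &DecisionTreeFilter{
                Current: LeastKVCacheFilter,
            },
        },
    },
    // If no pod passes the low-queue check, fall back to the capacity check
    // used for sheddable requests.
    NextOnFailure: HasCapacityFilter,
}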