Skip to content

Allow bodyless requests to passthrough EPP #555

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into the base branch
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 0 additions & 31 deletions pkg/epp/datastore/datastore.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,8 @@ import (
"context"
"errors"
"fmt"
"math/rand"
"sync"

"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -304,35 +302,6 @@ func stripLabelKeyAliasFromLabelMap(labels map[v1alpha2.LabelKey]v1alpha2.LabelV
return outMap
}

// RandomWeightedDraw picks a target model name from the InferenceModel,
// weighting the choice by each target's Weight. When no weights are set,
// the draw is uniform over all target models. A positive seed makes the
// draw deterministic (used by tests); otherwise a fresh random source is
// used. Returns "" only if the weighted selection falls through, which
// cannot happen when the weights sum consistently.
func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed int64) string {
	source := rand.NewSource(rand.Int63())
	if seed > 0 {
		source = rand.NewSource(seed)
	}
	r := rand.New(source)

	// Guard against an empty target list so the draws below cannot panic.
	if len(model.Spec.TargetModels) == 0 {
		return ""
	}

	// If no target has a weight set, fall back to a uniform random pick.
	// Checking every entry (not just the first) avoids a nil dereference
	// when weights are set on only some of the targets.
	allWeightsNil := true
	for _, tm := range model.Spec.TargetModels {
		if tm.Weight != nil {
			allWeightsNil = false
			break
		}
	}
	if allWeightsNil {
		index := r.Int31n(int32(len(model.Spec.TargetModels)))
		return model.Spec.TargetModels[index].Name
	}

	// Sum the weights, treating an unset weight as 0.
	var weights int32
	for _, tm := range model.Spec.TargetModels {
		if tm.Weight != nil {
			weights += *tm.Weight
		}
	}
	logger.V(logutil.TRACE).Info("Weights for model computed", "model", model.Name, "weights", weights)

	// Walk the targets, subtracting each weight until the random value is
	// covered. TODO: optimize this without using a loop.
	randomVal := r.Int31n(weights)
	for _, tm := range model.Spec.TargetModels {
		if tm.Weight == nil {
			continue // zero weight: can never be selected
		}
		if randomVal < *tm.Weight {
			return tm.Name
		}
		randomVal -= *tm.Weight
	}
	return ""
}

func IsCritical(model *v1alpha2.InferenceModel) bool {
if model.Spec.Criticality != nil && *model.Spec.Criticality == v1alpha2.Critical {
return true
Expand Down
108 changes: 0 additions & 108 deletions pkg/epp/datastore/datastore_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ import (
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
testutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing"
)

Expand Down Expand Up @@ -223,113 +222,6 @@ func TestModel(t *testing.T) {
}
}

// TestRandomWeightedDraw verifies that, for a fixed seed, RandomWeightedDraw
// deterministically returns the expected target model across repeated draws,
// including the fallback case where no weights are set.
func TestRandomWeightedDraw(t *testing.T) {
	logger := logutil.NewTestLogger()
	// Each case needs a distinct name: t.Run with duplicate names makes Go
	// append #01/#02 suffixes, which makes failures hard to attribute.
	tests := []struct {
		name  string
		model *v1alpha2.InferenceModel
		want  string
	}{
		{
			name: "even weights, two models",
			model: &v1alpha2.InferenceModel{
				Spec: v1alpha2.InferenceModelSpec{
					TargetModels: []v1alpha2.TargetModel{
						{
							Name:   "canary",
							Weight: pointer(50),
						},
						{
							Name:   "v1",
							Weight: pointer(50),
						},
					},
				},
			},
			want: "canary",
		},
		{
			name: "uneven weights, three models",
			model: &v1alpha2.InferenceModel{
				Spec: v1alpha2.InferenceModelSpec{
					TargetModels: []v1alpha2.TargetModel{
						{
							Name:   "canary",
							Weight: pointer(25),
						},
						{
							Name:   "v1.1",
							Weight: pointer(55),
						},
						{
							Name:   "v1",
							Weight: pointer(50),
						},
					},
				},
			},
			want: "v1",
		},
		{
			name: "low total weight, three models",
			model: &v1alpha2.InferenceModel{
				Spec: v1alpha2.InferenceModelSpec{
					TargetModels: []v1alpha2.TargetModel{
						{
							Name:   "canary",
							Weight: pointer(20),
						},
						{
							Name:   "v1.1",
							Weight: pointer(20),
						},
						{
							Name:   "v1",
							Weight: pointer(10),
						},
					},
				},
			},
			want: "v1.1",
		},
		{
			name: "weighted distribution with weight unset",
			model: &v1alpha2.InferenceModel{
				Spec: v1alpha2.InferenceModelSpec{
					TargetModels: []v1alpha2.TargetModel{
						{
							Name: "canary",
						},
						{
							Name: "v1.1",
						},
						{
							Name: "v1",
						},
					},
				},
			},
			want: "canary",
		},
	}
	var seedVal int64 = 420
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			// With a fixed seed every draw must return the same model.
			for range 10000 {
				model := RandomWeightedDraw(logger, test.model, seedVal)
				if model != test.want {
					t.Errorf("Model returned: %v != %v", model, test.want)
					break
				}
			}
		})
	}
}

// pointer returns a pointer to a copy of the supplied int32 value,
// for populating optional *int32 fields in test fixtures.
func pointer(v int32) *int32 {
	p := v
	return &p
}

var (
pod1 = &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Expand Down
2 changes: 1 addition & 1 deletion pkg/epp/handlers/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func (s *Server) HandleRequestBody(
return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)}
}
if len(modelObj.Spec.TargetModels) > 0 {
modelName = datastore.RandomWeightedDraw(logger, modelObj, 0)
modelName = RandomWeightedDraw(logger, modelObj, 0)
if modelName == "" {
return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)}
}
Expand Down
10 changes: 4 additions & 6 deletions pkg/epp/handlers/response.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,7 @@ func (s *Server) HandleResponseHeaders(
if header.Key == "content-type" {
contentType := header.RawValue
if strings.Contains(string(contentType), "text/event-stream") {
reqCtx.Streaming = true
} else {
reqCtx.Streaming = false
reqCtx.modelServerStreaming = true
}
typeFound = true
}
Expand Down Expand Up @@ -155,7 +153,7 @@ func (s *Server) HandleResponseBody(
loggerVerbose := logger.V(logutil.VERBOSE)
body := req.Request.(*extProcPb.ProcessingRequest_ResponseBody)

if reqCtx.Streaming {
if reqCtx.modelServerStreaming {
logger.V(logutil.DEBUG).Info("Processing HandleResponseBody")
if err := s.HandleStreaming(ctx, reqCtx, body, loggerVerbose); err != nil {
return nil, err
Expand Down Expand Up @@ -189,7 +187,7 @@ func (s *Server) HandleNonStreaming(
if err := json.Unmarshal(body.ResponseBody.Body, &res); err != nil {
return errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("unmarshaling response body: %v", err)}
}
reqCtx.Response = res
reqCtx.Usage = res.Usage
reqCtx.ResponseSize = len(body.ResponseBody.Body)
reqCtx.ResponseComplete = true
loggerVerbose.Info("Response generated", "response", res)
Expand All @@ -205,7 +203,7 @@ func (s *Server) HandleStreaming(
responseText := string(body.ResponseBody.Body)
if strings.Contains(responseText, streamingEndMsg) {
parsedResp := ParseRespForUsage(ctx, responseText, loggerVerbose)
reqCtx.Response = parsedResp
reqCtx.Usage = parsedResp.Usage
}

if body.ResponseBody.EndOfStream {
Expand Down
28 changes: 12 additions & 16 deletions pkg/epp/handlers/response_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func TestHandleResponseBody(t *testing.T) {
name string
req *extProcPb.ProcessingRequest_ResponseBody
reqCtx *RequestContext
want Response
want Usage
wantErr bool
}{
{
Expand All @@ -75,12 +75,10 @@ func TestHandleResponseBody(t *testing.T) {
Body: []byte(body),
},
},
want: Response{
Usage: Usage{
PromptTokens: 11,
TotalTokens: 111,
CompletionTokens: 100,
},
want: Usage{
PromptTokens: 11,
TotalTokens: 111,
CompletionTokens: 100,
},
},
{
Expand All @@ -100,7 +98,7 @@ func TestHandleResponseBody(t *testing.T) {
},
},
reqCtx: &RequestContext{
Streaming: true,
modelServerStreaming: true,
},
wantErr: false,
// In the middle of streaming response, so request context response is not set yet.
Expand All @@ -113,15 +111,13 @@ func TestHandleResponseBody(t *testing.T) {
},
},
reqCtx: &RequestContext{
Streaming: true,
modelServerStreaming: true,
},
wantErr: false,
want: Response{
Usage: Usage{
PromptTokens: 7,
TotalTokens: 17,
CompletionTokens: 10,
},
want: Usage{
PromptTokens: 7,
TotalTokens: 17,
CompletionTokens: 10,
},
},
}
Expand All @@ -141,7 +137,7 @@ func TestHandleResponseBody(t *testing.T) {
return
}

if diff := cmp.Diff(test.want, reqCtx.Response); diff != "" {
if diff := cmp.Diff(test.want, reqCtx.Usage); diff != "" {
t.Errorf("HandleResponseBody returned unexpected response, diff(-want, +got): %v", diff)
}
})
Expand Down
35 changes: 29 additions & 6 deletions pkg/epp/handlers/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,10 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
reqCtx.ResponseCompleteTimestamp = time.Now()
metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.PromptTokens)
metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.CompletionTokens)
metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
}
if reqCtx.Streaming {
if reqCtx.modelServerStreaming {
logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx)
} else {
loggerVerbose.Info("Request context after HandleResponseBody", "context", reqCtx)
Expand All @@ -149,7 +149,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
}
}

if !reqCtx.Streaming {
if !reqCtx.modelServerStreaming {
loggerVerbose.Info("Response generated", "response", resp)
} else {
logger.V(logutil.DEBUG).Info("Response generated", "response", resp)
Expand Down Expand Up @@ -224,9 +224,32 @@ type RequestContext struct {
RequestReceivedTimestamp time.Time
ResponseCompleteTimestamp time.Time
RequestSize int
Response Response
Usage Usage
ResponseSize int
ResponseComplete bool
ResponseStatusCode string
Streaming bool

RequestState StreamRequestState
modelServerStreaming bool

reqHeaderResp *extProcPb.ProcessingResponse
reqBodyResp *extProcPb.ProcessingResponse
reqTrailerResp *extProcPb.ProcessingResponse

respHeaderResp *extProcPb.ProcessingResponse
respBodyResp *extProcPb.ProcessingResponse
respTrailerResp *extProcPb.ProcessingResponse
}

// StreamRequestState tracks how far an ext-proc request/response stream has
// progressed through its lifecycle phases.
type StreamRequestState int

const (
	RequestReceived StreamRequestState = iota // 0
	HeaderRequestResponseComplete             // 1
	BodyRequestResponsesComplete              // 2
	TrailerRequestResponsesComplete           // 3
	// NOTE(review): "Recieved" is a typo for "Received"; the name is kept
	// unchanged for API compatibility with existing callers.
	ResponseRecieved                 // 4
	HeaderResponseResponseComplete   // 5
	BodyResponseResponsesComplete    // 6
	TrailerResponseResponsesComplete // 7
)
Loading