Merge pull request #137 from vMaroon/chat-completions

vMaroon · web-flow · commit 47e32bf2b947 · 2025-05-08T00:41:11.000+03:00
Add Support for OpenAI ChatCompletions API - PrefixAware Scoring
diff --git a/Makefile b/Makefile
@@ -512,7 +512,6 @@ image-build: check-container-tool load-version-json ## Build container image usi
 		--build-arg TARGETARCH=$(TARGETARCH) \
 		--build-arg GIT_NM_USER=$(GIT_NM_USER)\
         --build-arg NM_TOKEN=$(NM_TOKEN) \
-		--progress=plain \
  		-t $(IMG) .
 
 .PHONY: image-push
diff --git a/go.mod b/go.mod
@@ -5,16 +5,19 @@ go 1.24.1
 toolchain go1.24.2
 
 require (
+	github.com/cespare/xxhash/v2 v2.3.0
 	github.com/elastic/crd-ref-docs v0.1.0
 	github.com/envoyproxy/go-control-plane/envoy v1.32.4
 	github.com/go-logr/logr v1.4.2
 	github.com/google/go-cmp v0.7.0
+	github.com/hashicorp/golang-lru/v2 v2.0.7
 	github.com/neuralmagic/llm-d-kv-cache-manager v0.0.0-20250430102735-86595011431d
 	github.com/onsi/ginkgo/v2 v2.23.4
 	github.com/onsi/gomega v1.37.0
 	github.com/prometheus/client_golang v1.22.0
 	github.com/prometheus/client_model v0.6.2
 	github.com/prometheus/common v0.63.0
+	github.com/sashabaranov/go-openai v1.39.1
 	github.com/stretchr/testify v1.10.0
 	go.uber.org/multierr v1.11.0
 	go.uber.org/zap v1.27.0
@@ -42,7 +45,6 @@ require (
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/blang/semver/v4 v4.0.0 // indirect
 	github.com/cenkalti/backoff/v4 v4.3.0 // indirect
-	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 // indirect
 	github.com/daulet/tokenizers v1.20.2 // indirect
 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
@@ -74,7 +76,6 @@ require (
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/gorilla/websocket v1.5.0 // indirect
 	github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect
-	github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
 	github.com/huandu/xstrings v1.3.3 // indirect
 	github.com/imdario/mergo v0.3.11 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
diff --git a/go.sum b/go.sum
@@ -189,6 +189,8 @@ github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRl
 github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
 github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/sashabaranov/go-openai v1.39.1 h1:TMD4w77Iy9WTFlgnjNaxbAASdsCJ9R/rMdzL+SN14oU=
+github.com/sashabaranov/go-openai v1.39.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
 github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
 github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
 github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go
@@ -79,9 +79,15 @@ func (s *StreamingServer) HandleRequestBody(
 	if llmReq.Model != llmReq.ResolvedTargetModel {
 		requestBodyMap["model"] = llmReq.ResolvedTargetModel
 	}
-	// Extract prompt from the request body.
+	// Extract prompt/messages from the request body.
 	if prompt, ok := requestBodyMap["prompt"].(string); ok {
 		llmReq.Prompt = prompt
+	} else if _, ok := requestBodyMap["messages"]; ok { // check for chat completion request
+		if chatRequest, err := schedulingtypes.NewKVCacheChatCompletionRequest(requestBodyMap); err == nil {
+			llmReq.ChatCompletionRequest = chatRequest
+		} else {
+			logger.Error(err, "Error creating chat completion request")
+		}
 	}
 
 	requestBodyBytes, err = json.Marshal(requestBodyMap)
diff --git a/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go b/pkg/epp/scheduling/plugins/scorer/prefix_aware_scorer.go
@@ -54,7 +54,14 @@ func (s *PrefixAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Pod
 		return nil
 	}
 
-	scores := s.prefixStore.FindMatchingPods(ctx.Req.Prompt, ctx.Req.Model)
+	var prompt string
+	if ctx.Req.ChatCompletionRequest != nil {
+		prompt = ctx.Req.ChatCompletionRequest.ToString()
+	} else {
+		prompt = ctx.Req.Prompt
+	}
+
+	scores := s.prefixStore.FindMatchingPods(prompt, ctx.Req.Model)
 	loggerDebug.Info("Got pod scores", "scores", scores)
 
 	if len(scores) == 0 {
@@ -92,7 +99,14 @@ func (s *PrefixAwareScorer) PostSchedule(ctx *types.SchedulingContext, res *type
 		return
 	}
 
-	if err := s.prefixStore.AddEntry(ctx.Req.Model, ctx.Req.Prompt, &pod.GetPod().NamespacedName); err != nil {
+	var prompt string
+	if ctx.Req.ChatCompletionRequest != nil {
+		prompt = ctx.Req.ChatCompletionRequest.ToString()
+	} else {
+		prompt = ctx.Req.Prompt
+	}
+
+	if err := s.prefixStore.AddEntry(ctx.Req.Model, prompt, &pod.GetPod().NamespacedName); err != nil {
 		debugLogger.Error(err, "Failed to add entry to prefix store", "req", ctx.Req, "pod", pod)
 		return
 	}
diff --git a/pkg/epp/scheduling/types/chat_completions.go b/pkg/epp/scheduling/types/chat_completions.go
@@ -0,0 +1,108 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package types
+
+import (
+	"encoding/json"
+	"strings"
+
+	"github.com/sashabaranov/go-openai"
+)
+
+// KVCacheChatCompletionRequest is a struct that represents the fields from an
+// OpenAI API ChatCompletionRequest that are relevant for KV cache generation.
+// Model is not included as it is contained in the LLMRequest struct.
+//
+// Fields:
+//   - Messages: the chat messages, encoded under the "messages" JSON key.
+//   - Tools: optional tool definitions, encoded under the "tools" JSON key.
+//   - ToolChoices: optional tool choices, encoded under the "tool_choices"
+//     JSON key.
+//
+// NOTE(review): the OpenAI Chat Completions API names the tool-selection
+// field "tool_choice" (singular; a string or a single object). Confirm that a
+// "tool_choices" list is what callers actually send; otherwise ToolChoices is
+// never populated from real requests.
+//
+// Multimodal requests are not supported in the current implementation.
+type KVCacheChatCompletionRequest struct {
+	Messages    []openai.ChatCompletionMessage `json:"messages"`
+	Tools       []openai.Tool                  `json:"tools,omitempty"`
+	ToolChoices []openai.ToolChoice            `json:"tool_choices,omitempty"`
+}
+
+// NewKVCacheChatCompletionRequest builds a KVCacheChatCompletionRequest from
+// an already-decoded JSON request body.
+//
+// Only the "messages", "tools" and "tool_choices" keys are consulted. Each key
+// that is present is re-encoded to JSON and decoded into the matching typed
+// field; absent keys leave the field at its zero value. The first encoding or
+// decoding failure is returned as-is together with a nil request.
+func NewKVCacheChatCompletionRequest(input map[string]interface{}) (*KVCacheChatCompletionRequest, error) {
+	req := new(KVCacheChatCompletionRequest)
+
+	// Keys are visited in a fixed order so the first reported error is
+	// deterministic.
+	fields := []struct {
+		key string
+		dst any
+	}{
+		{"messages", &req.Messages},
+		{"tools", &req.Tools},
+		{"tool_choices", &req.ToolChoices},
+	}
+
+	for _, f := range fields {
+		raw, present := input[f.key]
+		if !present {
+			continue
+		}
+
+		// Round-trip through JSON to turn the generic value into the typed field.
+		encoded, err := json.Marshal(raw)
+		if err != nil {
+			return nil, err
+		}
+		if err = json.Unmarshal(encoded, f.dst); err != nil {
+			return nil, err
+		}
+	}
+
+	return req, nil
+}
+
+// ToString flattens the request into a single string.
+//
+// Every message contributes one "<role>:<content>\n" line, in order. When
+// Tools is non-empty a "tools:<json>\n" line follows, and when ToolChoices is
+// non-empty a "tool_choices:<json>\n" line follows. A section whose JSON
+// encoding fails is left out rather than reported.
+func (r *KVCacheChatCompletionRequest) ToString() string {
+	var sb strings.Builder
+
+	for i := range r.Messages {
+		m := &r.Messages[i]
+		sb.WriteString(m.Role)
+		sb.WriteString(":")
+		sb.WriteString(m.Content)
+		sb.WriteString("\n")
+	}
+
+	// writeJSON appends label, the JSON form of v and a newline; it writes
+	// nothing at all if v cannot be encoded.
+	writeJSON := func(label string, v any) {
+		encoded, err := json.Marshal(v)
+		if err != nil {
+			return
+		}
+		sb.WriteString(label)
+		sb.Write(encoded)
+		sb.WriteString("\n")
+	}
+
+	if len(r.Tools) != 0 {
+		writeJSON("tools:", r.Tools)
+	}
+	if len(r.ToolChoices) != 0 {
+		writeJSON("tool_choices:", r.ToolChoices)
+	}
+
+	return sb.String()
+}
diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go
@@ -27,8 +27,9 @@ import (
 
 // LLMRequest is a structured representation of the fields we parse out of the LLMRequest body.
 type LLMRequest struct {
-	Model  string
-	Prompt string
+	Model                 string
+	Prompt                string
+	ChatCompletionRequest *KVCacheChatCompletionRequest
 	// Target models is a map of target model name to weight.
 	TargetModels map[string]int
 	Headers      map[string]string
@@ -39,7 +40,13 @@ type LLMRequest struct {
 }
 
 func (r *LLMRequest) String() string {
-	return fmt.Sprintf("Model: %s, TargetModels: %v, ResolvedTargetModel: %s, Critical: %t, PromptLength: %v", r.Model, r.TargetModels, r.ResolvedTargetModel, r.Critical, len(r.Prompt))
+	// Start from the byte length of the plain completion prompt.
+	promptLength := len(r.Prompt)
+	if r.ChatCompletionRequest != nil {
+		// When a chat completion request is attached, report the byte length
+		// of its flattened ToString form instead.
+		promptLength = len(r.ChatCompletionRequest.ToString())
+	}
+
+	// The prompt text itself is not included in the output, only its length.
+	return fmt.Sprintf("Model: %s, TargetModels: %v, ResolvedTargetModel: %s, Critical: %t, PromptLength: %v",
+		r.Model, r.TargetModels, r.ResolvedTargetModel, r.Critical, promptLength)
 }
 
 type Pod interface {

-Original file line number
+Diff line change
 toolchain go1.24.2
 require (
 +	github.com/cespare/xxhash/v2 v2.3.0
 	github.com/elastic/crd-ref-docs v0.1.0
 	github.com/envoyproxy/go-control-plane/envoy v1.32.4
 	github.com/go-logr/logr v1.4.2
 	github.com/google/go-cmp v0.7.0
 +	github.com/hashicorp/golang-lru/v2 v2.0.7
 	github.com/neuralmagic/llm-d-kv-cache-manager v0.0.0-20250430102735-86595011431d
 	github.com/onsi/ginkgo/v2 v2.23.4
 	github.com/onsi/gomega v1.37.0
 	github.com/prometheus/client_golang v1.22.0
 	github.com/prometheus/client_model v0.6.2
 	github.com/prometheus/common v0.63.0
 +	github.com/sashabaranov/go-openai v1.39.1
 	github.com/stretchr/testify v1.10.0
 	go.uber.org/multierr v1.11.0
 	go.uber.org/zap v1.27.0
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/blang/semver/v4 v4.0.0 // indirect
 	github.com/cenkalti/backoff/v4 v4.3.0 // indirect
 -	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 // indirect
 	github.com/daulet/tokenizers v1.20.2 // indirect
 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/gorilla/websocket v1.5.0 // indirect
 	github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect
 -	github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
 	github.com/huandu/xstrings v1.3.3 // indirect
 	github.com/imdario/mergo v0.3.11 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect