From 3590f78e7d7664dae29e875fcb643620653474b4 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 5 Sep 2024 06:24:15 +0000 Subject: [PATCH 01/24] Big ol proposal commit --- .../proposals/002-api-proposal/images/bep.svg | 1 + .../002-api-proposal/images/gw_w_bep.svg | 1 + docs/proposals/002-api-proposal/proposal.md | 254 ++++++++++++++++++ 3 files changed, 256 insertions(+) create mode 100644 docs/proposals/002-api-proposal/images/bep.svg create mode 100644 docs/proposals/002-api-proposal/images/gw_w_bep.svg create mode 100644 docs/proposals/002-api-proposal/proposal.md diff --git a/docs/proposals/002-api-proposal/images/bep.svg b/docs/proposals/002-api-proposal/images/bep.svg new file mode 100644 index 00000000..cfac8994 --- /dev/null +++ b/docs/proposals/002-api-proposal/images/bep.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/proposals/002-api-proposal/images/gw_w_bep.svg b/docs/proposals/002-api-proposal/images/gw_w_bep.svg new file mode 100644 index 00000000..077158ea --- /dev/null +++ b/docs/proposals/002-api-proposal/images/gw_w_bep.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md new file mode 100644 index 00000000..722f7f29 --- /dev/null +++ b/docs/proposals/002-api-proposal/proposal.md @@ -0,0 +1,254 @@ + +# LLM Instance Gateway + +## Proposal Status + ***Draft*** + +## Table of Contents + + + +- [Summary](#summary) +- [Goals](#goals) +- [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [Personas](#personas) + - [Inference Platform Admin](#inference-platform-admin) + - [LLM Use Case Owner](#llm-use-case-owner) + - [Axioms](#axioms) + - [BackendPool](#backendpool) + - [LLMUseCase](#llmusecase) + - [Spec](#spec) + - [Diagrams](#diagrams) + - [Alternatives](#alternatives) +- [FAQ](#faq) +- [Open Questions](#open-questions) + + + +## Summary + +This proposal presents 2 new CRD objects to express the needs of the LLM Instance Gateway. **BackendPool** and **LLMUseCase** (names up for debate). The BackendPool is the logical grouping of compute, owned by the Inference Platform Admin persona. While the LLMUseCase is used to define objectives, the LoRA Adapter(s) used by the Use Case, and is owned by the LLM Use Case Owner. + +## Goals + +- Drive concensus on direction of LLM Instance Gateway Solution +- Documentation of API decisions for posterity + +## Non-Goals + +- Hash out every implementation detail +- Be a formal KEP + +## Proposal + +### Personas + +Before diving into the details of the API, decriptions of the personas will help shape the thought process of the API design. + +#### Inference Platform Admin + +The Inference Platform Admin creates and manages the infrastructure necessary to run LLM workloads. Including handling Ops for: + - Hardware + - Model Server + - Base Model + - Resource Allocation for Workloads + - Gateway configuration + - etc + +#### LLM Use Case Owner + +Owns... An LLM based... Use Case... + +Okay, seriously. An LLM Use Case Owner persona owns and manages 1 or many Generative AI Workloads (LLM focused *currently*). 
This includes: +- Defining SLO +- Deploying LoRA Adapters (or other fine-tune) +- Managing rollout of adapters + +### Axioms + +The API design is based on these axioms: + +- Pools of shared compute should be *discrete* for scheduling to properly work +- Pod-level scheduling should not be handled by a high-level gateway +- Simple use cases should be simple to define (or are implicitly defined via reasonable defaults) +- This solution should be composable with other Gateway solutions and flexible to fit customer needs +- The MVP will heavily assume requests are done using the OpenAI spec, but open to extension in the future + +The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused on lower-level scheduling. And the API follows that similar logic, which lead to the proposal of the **BackendPool**. + +### BackendPool + +The BackendPool at its core is a logical grouping of compute, expressed in the form of Pods (typically model servers), akin to a K8s Service. The BackendPool would deploy its own routing, and offer administrative configuration to the Platform Admin. + + It is expected for the BackendPool to: + - Enforce fair consumption of resources across competing use cases + - Efficiently route requests across shared compute (as displayed by the PoC) + +It is _not_ expected for the BackendPool to: + - Enforce any common set of adapters or base models are available on the Pods + - Manage Deployments of Pods within the Pool + - Manage Pod lifecycle of pods within the pool + +Additionally, any Pod that seeks to join a BackendPool would need to support a protocol, defined by LLM Instance Gateway, to ensure the Pool has adequate information to intelligently route requests. + +### LLMUseCase + +An LLMUseCase allows the UseCaseOwner to define: +- Which LoRA adapter(s) to consume + - LLMUseCase allows for traffic splitting between adapters _in the same pool_ to allow for new LoRA adapter versions to be easily rolled out +- SLO objectives for the UseCAse +- The Pools this UseCase is relevant to + +### Spec + +**BackendPool** +```golang +// A grouping of model serving instances that are expected to serve the same model(s) and speak the same API protocol +// The LLMBackendPool is also the scope for enforcing priority and fairness across different use cases. +// When used with the Gateway API, the LLMBackendPool can serve as the BackendRefs of an HTTPRoute. +// Generally the LLMBackendPool is owned by the "Inference Platform Admin" persona. +type LLMBackendPool struct { + metav1.ObjectMeta + metav1.TypeMeta + Spec LLMBackendSpec +} + +type LLMBackendPoolSpec struct { + // Pod selector, similar to k8s Service. + Selector map[string]string `json:"selector,omitempty"` + // Allows names within the `model` param that don't have an explicitly defined UseCase to pass through. Defaults to `false`. + AllowUndefinedModels bool + // Admin-defined minimum objective value that can be requested, will cause an error in LLMUseCase if limit is broken. + MinTPOT float32 +} +``` + + +**LLMUseCase** +```golang +// LLMUseCase represents a use case of an LLM, which is multiplexed onto one or more LLMBackendPools. +// It allows mapping a use case to backend pools, and models or adapters that backend model servers understand. +// The LLMUseCase defines request routing behavior within an LLMBackendPool. +// Generally the LLMUseCase is owned by the "LLM Use Case Owner" persona, which can be teams in an organization. 
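//
// Illustrative flow (an assumption drawn from the Diagrams section below, not
// a normative part of the spec): a request whose body carries
// `"model": "interestingName"` matches the LLMUseCaseRule with that ModelName;
// ext-proc selects a TargetModel by Weight (e.g. "veryCreativeNameGen" at 80%),
// rewrites the model in the request body, and the request is then scheduled
// onto a Pod in the referenced LLMBackendPool.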
+type LLMUseCase struct { + metav1.ObjectMeta + metav1.TypeMeta + + Spec LLMUseCaseSpec +} + +type LLMUseCaseSpec struct { + // Map use case to one or more backend pools. + // In the most common case, this should be a single backend pool. + // Multiple backend pools can be used for traffic splitting while migrating to a new backend pool. + Rules []LLMUseCaseRule +} + +// LLMUseCaseRule represents a mapping from a LLMUseCase to a backend pool and adapters/models in that pool. +type LLMUseCaseRule struct { + // The name used in the `model` param of incoming requests + ModelName string + // Optional + Objective *Objective + // Required. + // Reference to an LLMBackendPool. This allows registering a use case + // as valid on a pool. + // NOTE: Allowing multiple pools is a configuration convenience. + PoolRef []corev1.ObjectReference + // Optional. + // Allow multiple versions of a model for traffic splitting. + // If not specified, the target model name is defaulted to the + // modelName parameter. + TargetModels []common.TargetModel +} + + +// TargetModel represents a deployed model or a LoRA adapter. +type TargetModel struct { + // The name of the adapter expected by the ModelServer. + TargetModelName string + // Weight is used to determine the percentage of traffic that should be + // sent to this target model when + // multiple versions of the models are specified. + Weight int +} + +// Objective defines the performance targets of a LLM use case. +// NOTE: Objectives are best effort +type Objective struct { + // Only one target can be set. + TPOT []LatencyTarget + FairnessWeight int +} + + +type LatencyTarget struct { + Percentile float64 `json:"percentile,omitempty"` + Target time.Duration `json:"target,omitempty"` +} +``` + +### Diagrams + +Much of this is better explained visually: + +Below is a detailed view of the BackendPool + +![BackendPool](./images/bep.svg) + +This diagram lightly follows the example request for a model `interestingName`. +The flow can be described as: +- The request comes in to our routing solution(Ext-Proc) +- ExtProc looks up the UseCases affiliated with this pool `examplePool` +- `interestingName` is currently undergoing a change of LoRA adapters from `creativeNameGen-v3` (20% traffic split) to `veryCreativeNameGen` (80% traffic split) +- `veryCreativeNameGen` is selected as the LoRA adapter, and replaces `interestingName` in the body of the request (mutated by ext-proc) +- the request is then efficiently scheduled onto one of the valid Pods +- metrics are sent back to the BEP, aggregated and re-emitted via sidecar (following the metric standardization) + +How Multiple BackendPools might integrate together: + +![K8s Gateway with BackendPools](./images/gw_w_bep.svg) + +Here we see that we can have: +- Multiple Routes pointing to the same pool +- Routes splitting traffic across multiple pools + +The functionality of the Kubernetes Gateway is unchanged with this proposal, allowing seamless integration with the BackendPool. + + +### Alternatives + +#### Key Decisions + +Our alternatives hinge on some key decisions: +- Allowing HTTPRoute to treat the BackendPool as the backendRef +- Creating a separate layer of abstraction, instead of extending HTTPRoute + +#### LLMUseCase as a backend ref + +We toyed with the idea of allowing an LLMUsecase be the target of an HTTPRouteRules backend ref. 
However, doing so would require the Kubernetes Gateway to be able to interpret body level parameters, and require that the HTTPRoute also specify the backend the UseCase is intended to run on. All of which would require substantial work on the Kubernetes Gateway, while not providing much flexibility. + +#### LLMRoute + +Our original idea was to define all UseCase config at the Kubernetes Gateway layer, and have no BackendPool. This is inherently challenging, as LLMRoute would become a superset of HTTPRoute, or the Gateway would become bespoke, and work only for the LLMRoute use case. + +## FAQ +- Why 2 layers of weighting? (HttpRoute & UseCase) + - Feasibly done - No extension of HttpRoute. Just works, as BackendPool operates like a service. + - Complexity is only expressed during transition states (model version upgrade) + - Keeps Pools self contained - multiple K8s gateways can direct traffic to the same pool without needing to re-express Pool-level behavior +- What is a backend pool attempting to define? + - BackendPool groups resources that should be shared over the UseCases that are affiliated with the pool + - Best practice would also suggest keeping the same base model for all ModelServers in the pool, but that is not enforced + + +## Open Questions + +- Reasonable defaults + - Should use cases be required? Or can a customer simply create a pool, and direct requests to the pool, and expect even fairness/priority across the different LoRA adapters that are requested? + - If so? How should we handle the mix between explicit and implicit use cases? Are implicit usecases just default everything? (and inherently lower prio). + - NOTE: Current thinking is this is yes we should allow non-use case defined requests, but is a security risk if on by default. So pools should opt-in +- Configuration control + - How many routing decisions should we make on behalf of the user vs allow for configuration? + - Do we decide that SLO adherence is stricter than Fariness adherence? Do we allow for configuration of such tooling? (would be expressed in the BackendPool API) From b5d0c05d4f5f4f0ee12a568333f7cb0f1c53b748 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Mon, 9 Sep 2024 20:57:02 +0000 Subject: [PATCH 02/24] 1st round of addressing comments --- docs/proposals/002-api-proposal/proposal.md | 30 ++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 722f7f29..4f6b69d3 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -28,7 +28,7 @@ ## Summary -This proposal presents 2 new CRD objects to express the needs of the LLM Instance Gateway. **BackendPool** and **LLMUseCase** (names up for debate). The BackendPool is the logical grouping of compute, owned by the Inference Platform Admin persona. While the LLMUseCase is used to define objectives, the LoRA Adapter(s) used by the Use Case, and is owned by the LLM Use Case Owner. +This proposal presents 2 new CRD objects to express the needs of the LLM Instance Gateway. **BackendPool** and **LLMUseCase** (names up for debate). The BackendPool is the logical grouping of compute, owned by the Inference Platform Admin persona. While the LLMUseCase defines the serving objectives of a specific model or LoRA adapter, and is owned by the LLM Use Case Owner. 
## Goals @@ -58,9 +58,7 @@ The Inference Platform Admin creates and manages the infrastructure necessary to #### LLM Use Case Owner -Owns... An LLM based... Use Case... - -Okay, seriously. An LLM Use Case Owner persona owns and manages 1 or many Generative AI Workloads (LLM focused *currently*). This includes: +An LLM Use Case Owner persona owns and manages 1 or many Generative AI Workloads (LLM focused *currently*). This includes: - Defining SLO - Deploying LoRA Adapters (or other fine-tune) - Managing rollout of adapters @@ -96,8 +94,8 @@ Additionally, any Pod that seeks to join a BackendPool would need to support a p An LLMUseCase allows the UseCaseOwner to define: - Which LoRA adapter(s) to consume - - LLMUseCase allows for traffic splitting between adapters _in the same pool_ to allow for new LoRA adapter versions to be easily rolled out -- SLO objectives for the UseCAse + - LLMUseCase allows for traffic splitting between adapters _in the same backendpool_ to allow for new LoRA adapter versions to be easily rolled out +- SLO objectives for the UseCase - The Pools this UseCase is relevant to ### Spec @@ -148,6 +146,7 @@ type LLMUseCaseSpec struct { // LLMUseCaseRule represents a mapping from a LLMUseCase to a backend pool and adapters/models in that pool. type LLMUseCaseRule struct { // The name used in the `model` param of incoming requests + // https://platform.openai.com/docs/api-reference/making-requests ModelName string // Optional Objective *Objective @@ -156,9 +155,9 @@ type LLMUseCaseRule struct { // as valid on a pool. // NOTE: Allowing multiple pools is a configuration convenience. PoolRef []corev1.ObjectReference - // Optional. - // Allow multiple versions of a model for traffic splitting. - // If not specified, the target model name is defaulted to the + // Optional + // Allow multiple versions of a model for traffic splitting. + // If not specified, the target model name is defaulted to the // modelName parameter. TargetModels []common.TargetModel } @@ -166,11 +165,10 @@ type LLMUseCaseRule struct { // TargetModel represents a deployed model or a LoRA adapter. type TargetModel struct { - // The name of the adapter expected by the ModelServer. + // The name of the adapter as expected by the ModelServer. TargetModelName string // Weight is used to determine the percentage of traffic that should be - // sent to this target model when - // multiple versions of the models are specified. + // sent to this target model when multiple versions of the model are specified. Weight int } @@ -223,11 +221,13 @@ The functionality of the Kubernetes Gateway is unchanged with this proposal, all Our alternatives hinge on some key decisions: - Allowing HTTPRoute to treat the BackendPool as the backendRef + - Whereas the alternatives might have the LLMUseCase as the backend ref - Creating a separate layer of abstraction, instead of extending HTTPRoute + - Explained in more detail in the LLMRoute section #### LLMUseCase as a backend ref -We toyed with the idea of allowing an LLMUsecase be the target of an HTTPRouteRules backend ref. However, doing so would require the Kubernetes Gateway to be able to interpret body level parameters, and require that the HTTPRoute also specify the backend the UseCase is intended to run on. All of which would require substantial work on the Kubernetes Gateway, while not providing much flexibility. +We toyed with the idea of allowing an LLMUsecase be the target of an HTTPRouteRules backend ref. 
However, doing so would require the Kubernetes Gateway to be able to interpret body level parameters (assuming OpenAI protocol continues to require the model param in the body), and require that the HTTPRoute also specify the backend the UseCase is intended to run on. Since we our primary proposal already specifies the backend, packing this functionality would require substantial work on the Kubernetes Gateway, while not providing much flexibility. #### LLMRoute @@ -245,10 +245,10 @@ Our original idea was to define all UseCase config at the Kubernetes Gateway lay ## Open Questions -- Reasonable defaults +- Reasonable defaults (how do we behave in the absence of user-specified values in optional fields) - Should use cases be required? Or can a customer simply create a pool, and direct requests to the pool, and expect even fairness/priority across the different LoRA adapters that are requested? - If so? How should we handle the mix between explicit and implicit use cases? Are implicit usecases just default everything? (and inherently lower prio). - NOTE: Current thinking is this is yes we should allow non-use case defined requests, but is a security risk if on by default. So pools should opt-in - Configuration control - How many routing decisions should we make on behalf of the user vs allow for configuration? - - Do we decide that SLO adherence is stricter than Fariness adherence? Do we allow for configuration of such tooling? (would be expressed in the BackendPool API) + - Do we decide that SLO adherence is stricter than Fairness adherence? Do we allow for configuration of such tooling? (would be expressed in the BackendPool API) From 1dabf98376afb388c194ba29e0c242162c0193f5 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 12 Sep 2024 16:47:16 +0000 Subject: [PATCH 03/24] Partial glossary implementation --- docs/proposals/002-api-proposal/glossary.md | 46 +++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 docs/proposals/002-api-proposal/glossary.md diff --git a/docs/proposals/002-api-proposal/glossary.md b/docs/proposals/002-api-proposal/glossary.md new file mode 100644 index 00000000..08a37f66 --- /dev/null +++ b/docs/proposals/002-api-proposal/glossary.md @@ -0,0 +1,46 @@ +# Glossary + +This is a glossary that deep-dives on terms used within the api proposal, in an effort to give context to the API decisions + + +- [API Terms](#api-terms) + - [BackendPool](#backendpool) +- [Priority](#priority) +- [Fairness](#fairness) +- [Lora Affinity](#lora-affinity) +- [Latency Based Routing](#latency-based-routing) + + + + +## API Terms +This is a very brief description of terms used to describe API objects, this is included only if the glossary is the first doc you are reading. + +### BackendPool +A grouping of model servers that serve the same set of fine-tunes (LoRA as a primary example). + +### UseCase +An LLM workload that is defined and runs on a BackendPool with other use cases. + +## Priority + +### Summary +Priority specifies the importance of a UseCase relative to other usecases within a BackendPool. + +### Description + +For our purposes, priority can be thought of in two classes: +- Critical +- Non-Critical + +The primary difference is that non-critical UseCase requests will be rejected in favor of Critical UseCases the face of resource scarcity. + +Example: + +Your current request load is using 80 Arbitrary Compute Units(ACU) of your pools total of 100ACU capacity. 40ACU are critical workload requests, 45 are non-critical. 
If you were to lose 30 ACU due to an unforseen outage. Priority would dictate that of the 10 surplus ACU to be rejected the entirety of them would be from the non-critical requests. + +## Fairness + +## Lora Affinity + +## Latency Based Routing \ No newline at end of file From 6ef1add2fcc48dc4c22642c288eaa5baddc3f8dc Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 12 Sep 2024 17:00:15 +0000 Subject: [PATCH 04/24] API updates --- docs/proposals/002-api-proposal/proposal.md | 95 ++++++++------------- 1 file changed, 37 insertions(+), 58 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 4f6b69d3..75c48d6b 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -77,6 +77,8 @@ The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused o ### BackendPool +*** FOR MVP THE BACKEND IS PROPOSED TO BE IMPLICIT *** + The BackendPool at its core is a logical grouping of compute, expressed in the form of Pods (typically model servers), akin to a K8s Service. The BackendPool would deploy its own routing, and offer administrative configuration to the Platform Admin. It is expected for the BackendPool to: @@ -100,69 +102,46 @@ An LLMUseCase allows the UseCaseOwner to define: ### Spec -**BackendPool** -```golang -// A grouping of model serving instances that are expected to serve the same model(s) and speak the same API protocol -// The LLMBackendPool is also the scope for enforcing priority and fairness across different use cases. -// When used with the Gateway API, the LLMBackendPool can serve as the BackendRefs of an HTTPRoute. -// Generally the LLMBackendPool is owned by the "Inference Platform Admin" persona. -type LLMBackendPool struct { - metav1.ObjectMeta - metav1.TypeMeta - Spec LLMBackendSpec -} - -type LLMBackendPoolSpec struct { - // Pod selector, similar to k8s Service. - Selector map[string]string `json:"selector,omitempty"` - // Allows names within the `model` param that don't have an explicitly defined UseCase to pass through. Defaults to `false`. - AllowUndefinedModels bool - // Admin-defined minimum objective value that can be requested, will cause an error in LLMUseCase if limit is broken. - MinTPOT float32 -} -``` - - **LLMUseCase** ```golang -// LLMUseCase represents a use case of an LLM, which is multiplexed onto one or more LLMBackendPools. -// It allows mapping a use case to backend pools, and models or adapters that backend model servers understand. -// The LLMUseCase defines request routing behavior within an LLMBackendPool. -// Generally the LLMUseCase is owned by the "LLM Use Case Owner" persona, which can be teams in an organization. -type LLMUseCase struct { +// LLMUseCaseSet represents a set of LLM use cases that are multiplexed onto one or more backend pools. +// This is generally owned by the "LLM Use Case Owner" persona, which can be teams in an organization. +type LLMUseCaseSet struct { metav1.ObjectMeta metav1.TypeMeta - Spec LLMUseCaseSpec + Spec LLMUseCaseSetSpec } -type LLMUseCaseSpec struct { - // Map use case to one or more backend pools. - // In the most common case, this should be a single backend pool. - // Multiple backend pools can be used for traffic splitting while migrating to a new backend pool. - Rules []LLMUseCaseRule +type LLMUseCaseSetSpec struct { + // Defines the use cases in the set. + UseCases []LLMUseCase + // Reference to the backend pools that the use cases registers to. 
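  // As noted earlier in this proposal, allowing multiple pool references is a
  // configuration convenience; the most common case is a single pool.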
+ PoolRef []corev1.ObjectReference } -// LLMUseCaseRule represents a mapping from a LLMUseCase to a backend pool and adapters/models in that pool. -type LLMUseCaseRule struct { - // The name used in the `model` param of incoming requests - // https://platform.openai.com/docs/api-reference/making-requests +// LLMUseCase defines the policies for routing the traffic of a use case, this includes performance objectives +// and traffic splitting between different versions of the model. +type LLMUseCase struct { + // The name of the model as the users set in the "model" parameter in the requests. + // The model name should be unique among the use cases that reference the same backend pool. + // This is the parameter that will be used to match the request with. In the future, we may + // allow to match on other request parameters. The other approach to support matching on + // on other request parameters is to use a different ModelName f HTTPFilter ModelName string // Optional + // Use cases with an objective have higher priority than use cases without. + // IMPORTANT: By specifying an objective, this places the UseCase in a higher priority class than UseCases without a defined priority class. + // In the face of resource-scarcity. Higher priority requests will be preserved, and lower priority class requests will be rejected. Objective *Objective - // Required. - // Reference to an LLMBackendPool. This allows registering a use case - // as valid on a pool. - // NOTE: Allowing multiple pools is a configuration convenience. - PoolRef []corev1.ObjectReference - // Optional - // Allow multiple versions of a model for traffic splitting. - // If not specified, the target model name is defaulted to the - // modelName parameter. + // Optional. + // Allow multiple versions of a model for traffic splitting. + // If not specified, the target model name is defaulted to the modelName parameter. TargetModels []common.TargetModel } + // TargetModel represents a deployed model or a LoRA adapter. type TargetModel struct { // The name of the adapter as expected by the ModelServer. @@ -172,18 +151,18 @@ type TargetModel struct { Weight int } -// Objective defines the performance targets of a LLM use case. -// NOTE: Objectives are best effort +// Objective captures the latency SLO of a LLM use case. +// In MVP, meeting the SLO is on a best effort basis. +// Future: Extend the API for different behaviors of meeting the SLO. +// The gateway will perform best-effort load balancing, and work with other components (e.g., autoscaler) to meet the +// objectives. type Objective struct { - // Only one target can be set. - TPOT []LatencyTarget - FairnessWeight int -} - - -type LatencyTarget struct { - Percentile float64 `json:"percentile,omitempty"` - Target time.Duration `json:"target,omitempty"` + // The AverageLatencyPerOutputToken is calculated as the e2e request latency divided by output token + // length. Note that this is different from what is known as TPOT (time per output token) which only + // takes decode time into account. + // The P95 is calculated over a fixed time window defined at the operator level. 
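  // Illustrative example (the window length is an assumption, e.g. 5 min):
  // a request that takes 2s end to end and produces 100 output tokens
  // contributes 20ms per output token; the objective targets the P95 of
  // these per-request values over the window.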
+ DesiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests + *time.Duration } ``` From 620c834ac7bf8cf2aa98dacc9979b367a7ddcad5 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Mon, 16 Sep 2024 21:06:12 +0000 Subject: [PATCH 05/24] glossary additions --- docs/proposals/002-api-proposal/glossary.md | 70 +++++++++++++++++---- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/docs/proposals/002-api-proposal/glossary.md b/docs/proposals/002-api-proposal/glossary.md index 08a37f66..55955adc 100644 --- a/docs/proposals/002-api-proposal/glossary.md +++ b/docs/proposals/002-api-proposal/glossary.md @@ -1,27 +1,34 @@ # Glossary -This is a glossary that deep-dives on terms used within the api proposal, in an effort to give context to the API decisions +This is a glossary that attempts to more thoroughly emplain terms used within the api proposal, in an effort to give context to API decisions. -- [API Terms](#api-terms) +- [API Terms](#api) - [BackendPool](#backendpool) -- [Priority](#priority) -- [Fairness](#fairness) -- [Lora Affinity](#lora-affinity) -- [Latency Based Routing](#latency-based-routing) + - [UseCase](#UseCase) +- [Capacity Constrained Routing](#capacity-constrained-routing) + - [Priority](#priority) + - [Fairness](#fairness) +- [General Routing](#general-routing) + - [Latency Based Routing](#latency-based-routing) + - [Lora Affinity](#lora-affinity) -## API Terms -This is a very brief description of terms used to describe API objects, this is included only if the glossary is the first doc you are reading. +## API +This is a very brief description of terms used to describe API objects, included for completeness. ### BackendPool -A grouping of model servers that serve the same set of fine-tunes (LoRA as a primary example). +A grouping of model servers that serve the same set of fine-tunes (LoRA as a primary example). + +Shortened to: `BEP` ### UseCase An LLM workload that is defined and runs on a BackendPool with other use cases. - + +# Capacity Constrained Routing + ## Priority ### Summary @@ -41,6 +48,47 @@ Your current request load is using 80 Arbitrary Compute Units(ACU) of your pools ## Fairness +### Summary +Fairness specifies how resources are shared among different UseCases, in a way that is most acceptable to the user. + +### Description + +Fairness, like priority, is only used in resource scarcity events. + +Fairness is utilized when requests of the same priority class need to be rejected, or queued. There are many dimensions that could be considered when considering shared resources. To name a few: +- KV-cache utilization +- Total request count +- SLO adherence + +For the v1 MVP, the only objective a User can specify is the SLO objective they would like to meet. So, in following that pattern, fairness in MVP will simply be considered for SLO adherence. SLO Adherence is only being considered over a rolling time window of data. + +The TTL we are currently assuming is: `5 min` + +### Example + +**Assumption:** Services have equally weighted fairness for this example. + +- Service A has been meeting its SLO 98% of the requests made in the time window, and Service B has met the SLO 94% of the time. + +- A request for both Service A and Service B come in at the same time, and there is only capacity to start a single new request in the BEP, this capacity would meet the SLO for both services. The other request would be queued (potentially causing that request to not meet SLO). + +- To fairly share these resources. 
Service B *must* be selected to begin the request immediately as Service A has had its SLO met a larger percentage of the time. + +# General Routing +Different from the previous definitons, these terms are used to describe methods of routing that are constant, and seek to better utilize compute resources to avoid capacity constraints as much as possible. + +## Latency Based Routing + +### Summary +Latency Based Routing uses data to ensure UseCases meet their specified SLO. + +### Description +Data collected from the model servers and data collected from the request is used to predict the time a request will take on a *specific* model server, and route in a way that will best satisfy the SLO of the incoming requests. + ## Lora Affinity -## Latency Based Routing \ No newline at end of file +### Summary +LoRA Affinity describes the routing strategy displayed in the [demo](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458), to better utilize Model Servers within the BEP. + +### Description +Model Servers that support multi-LoRA handle requests in a FCFS basis. By utilizing the data provided by the model server (the state of loaded LoRA adapters), a routing system can route requests for a given LoRA adapter, to a model server that already has that adapter loaded, to create larger batches than a naive route, which better utilizes the model server hardware. \ No newline at end of file From ecc4015e772e736f05952690d082728c05481725 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Wed, 18 Sep 2024 14:55:05 +0000 Subject: [PATCH 06/24] adding axioms and faq --- docs/proposals/002-api-proposal/proposal.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 75c48d6b..9a9ac244 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -72,13 +72,12 @@ The API design is based on these axioms: - Simple use cases should be simple to define (or are implicitly defined via reasonable defaults) - This solution should be composable with other Gateway solutions and flexible to fit customer needs - The MVP will heavily assume requests are done using the OpenAI spec, but open to extension in the future +- The Gateway should route in a way that does not generate a queue of requests at the model server level The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused on lower-level scheduling. And the API follows that similar logic, which lead to the proposal of the **BackendPool**. ### BackendPool -*** FOR MVP THE BACKEND IS PROPOSED TO BE IMPLICIT *** - The BackendPool at its core is a logical grouping of compute, expressed in the form of Pods (typically model servers), akin to a K8s Service. The BackendPool would deploy its own routing, and offer administrative configuration to the Platform Admin. It is expected for the BackendPool to: @@ -213,13 +212,20 @@ We toyed with the idea of allowing an LLMUsecase be the target of an HTTPRouteRu Our original idea was to define all UseCase config at the Kubernetes Gateway layer, and have no BackendPool. This is inherently challenging, as LLMRoute would become a superset of HTTPRoute, or the Gateway would become bespoke, and work only for the LLMRoute use case. ## FAQ -- Why 2 layers of weighting? (HttpRoute & UseCase) +- **Why 2 layers of weighting?** (HttpRoute & UseCase) - Feasibly done - No extension of HttpRoute. Just works, as BackendPool operates like a service. 
- Complexity is only expressed during transition states (model version upgrade) - Keeps Pools self contained - multiple K8s gateways can direct traffic to the same pool without needing to re-express Pool-level behavior -- What is a backend pool attempting to define? +- **What is a BEP attempting to define?** - BackendPool groups resources that should be shared over the UseCases that are affiliated with the pool - Best practice would also suggest keeping the same base model for all ModelServers in the pool, but that is not enforced +- **Can a UseCase reference multiple BEPs?** +- **How is this deployed?** + - We will follow [common patterns](https://gateway.envoyproxy.io/docs/tasks/quickstart/#installation) to install the CRDs & Controllers +- **Are all controllers necessary for this solution going to be provided by Instance Gateway(this repo)?** + - Yes + + ## Open Questions From 9f241033cd1996940a3b38b9748664ae3eb1cc4b Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 19 Sep 2024 16:01:41 +0000 Subject: [PATCH 07/24] Addressing review comments --- docs/proposals/002-api-proposal/glossary.md | 4 ++-- docs/proposals/002-api-proposal/proposal.md | 11 ++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/proposals/002-api-proposal/glossary.md b/docs/proposals/002-api-proposal/glossary.md index 55955adc..1ecd42cb 100644 --- a/docs/proposals/002-api-proposal/glossary.md +++ b/docs/proposals/002-api-proposal/glossary.md @@ -3,10 +3,10 @@ This is a glossary that attempts to more thoroughly emplain terms used within the api proposal, in an effort to give context to API decisions. -- [API Terms](#api) +- [API Terms](#api) - [BackendPool](#backendpool) - [UseCase](#UseCase) -- [Capacity Constrained Routing](#capacity-constrained-routing) +- [Capacity Constrained Routing](#capacity-constrained-routing) - [Priority](#priority) - [Fairness](#fairness) - [General Routing](#general-routing) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 9a9ac244..4168b165 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -30,6 +30,8 @@ This proposal presents 2 new CRD objects to express the needs of the LLM Instance Gateway. **BackendPool** and **LLMUseCase** (names up for debate). The BackendPool is the logical grouping of compute, owned by the Inference Platform Admin persona. While the LLMUseCase defines the serving objectives of a specific model or LoRA adapter, and is owned by the LLM Use Case Owner. +**NOTE: Some routing terms are defined in the [glossary](./glossary.md) file, to more deeply describe how we will handle behaviors like priority and fairness** + ## Goals - Drive concensus on direction of LLM Instance Gateway Solution @@ -60,7 +62,11 @@ The Inference Platform Admin creates and manages the infrastructure necessary to An LLM Use Case Owner persona owns and manages 1 or many Generative AI Workloads (LLM focused *currently*). This includes: - Defining SLO -- Deploying LoRA Adapters (or other fine-tune) +- Managing fine-tunes + - LoRA Adapters + - System Prompts + - Prompt Cache + - etc. - Managing rollout of adapters ### Axioms @@ -114,6 +120,9 @@ type LLMUseCaseSet struct { type LLMUseCaseSetSpec struct { // Defines the use cases in the set. + // UseCases can be in 2 priority classes, CRITICAL and NONCRITICAL. + // Priority class is implicit, and by specifying an Objective, + // places the UseCase in the CRITICAL priority class. 
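  // Under resource scarcity, NONCRITICAL requests are rejected (or queued)
  // before CRITICAL ones; see glossary.md#priority for details.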
UseCases []LLMUseCase // Reference to the backend pools that the use cases registers to. PoolRef []corev1.ObjectReference From 8d26e3b2d913c77e442ba49c33de256400920083 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Mon, 23 Sep 2024 17:24:55 +0000 Subject: [PATCH 08/24] more proposal updates --- docs/proposals/002-api-proposal/proposal.md | 32 ++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 4168b165..79b4a41b 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -17,7 +17,7 @@ - [LLM Use Case Owner](#llm-use-case-owner) - [Axioms](#axioms) - [BackendPool](#backendpool) - - [LLMUseCase](#llmusecase) + - [ModelGroup](#modelgroup) - [Spec](#spec) - [Diagrams](#diagrams) - [Alternatives](#alternatives) @@ -28,7 +28,7 @@ ## Summary -This proposal presents 2 new CRD objects to express the needs of the LLM Instance Gateway. **BackendPool** and **LLMUseCase** (names up for debate). The BackendPool is the logical grouping of compute, owned by the Inference Platform Admin persona. While the LLMUseCase defines the serving objectives of a specific model or LoRA adapter, and is owned by the LLM Use Case Owner. +This proposal presents 2 new CRD objects to express the needs of the LLM Instance Gateway. **BackendPool** and **ModelGroup** (names up for debate). The BackendPool is the logical grouping of compute, owned by the Inference Platform Admin persona. While the ModelGroup defines the serving objectives of a specific model or LoRA adapter, and is owned by the LLM Use Case Owner. **NOTE: Some routing terms are defined in the [glossary](./glossary.md) file, to more deeply describe how we will handle behaviors like priority and fairness** @@ -97,40 +97,40 @@ It is _not_ expected for the BackendPool to: Additionally, any Pod that seeks to join a BackendPool would need to support a protocol, defined by LLM Instance Gateway, to ensure the Pool has adequate information to intelligently route requests. -### LLMUseCase +### ModelGroup -An LLMUseCase allows the UseCaseOwner to define: +A ModelGroup allows the UseCaseOwner to define: - Which LoRA adapter(s) to consume - - LLMUseCase allows for traffic splitting between adapters _in the same backendpool_ to allow for new LoRA adapter versions to be easily rolled out + - ModelGroup allows for traffic splitting between adapters _in the same BackendPool_ to allow for new LoRA adapter versions to be easily rolled out - SLO objectives for the UseCase - The Pools this UseCase is relevant to ### Spec -**LLMUseCase** +**ModelGroup** ```golang -// LLMUseCaseSet represents a set of LLM use cases that are multiplexed onto one or more backend pools. +// ModelGroupSet represents a set of LLM use cases that are multiplexed onto one or more backend pools. // This is generally owned by the "LLM Use Case Owner" persona, which can be teams in an organization. -type LLMUseCaseSet struct { +type ModelGroupSet struct { metav1.ObjectMeta metav1.TypeMeta - Spec LLMUseCaseSetSpec + Spec ModelGroupSetSpec } -type LLMUseCaseSetSpec struct { +type ModelGroupSetSpec struct { // Defines the use cases in the set. // UseCases can be in 2 priority classes, CRITICAL and NONCRITICAL. // Priority class is implicit, and by specifying an Objective, // places the UseCase in the CRITICAL priority class. 
- UseCases []LLMUseCase + UseCases []ModelGroup // Reference to the backend pools that the use cases registers to. PoolRef []corev1.ObjectReference } -// LLMUseCase defines the policies for routing the traffic of a use case, this includes performance objectives +// ModelGroup defines the policies for routing the traffic of a use case, this includes performance objectives // and traffic splitting between different versions of the model. -type LLMUseCase struct { +type ModelGroup struct { // The name of the model as the users set in the "model" parameter in the requests. // The model name should be unique among the use cases that reference the same backend pool. // This is the parameter that will be used to match the request with. In the future, we may @@ -208,13 +208,13 @@ The functionality of the Kubernetes Gateway is unchanged with this proposal, all Our alternatives hinge on some key decisions: - Allowing HTTPRoute to treat the BackendPool as the backendRef - - Whereas the alternatives might have the LLMUseCase as the backend ref + - Whereas the alternatives might have the ModelGroup as the backend ref - Creating a separate layer of abstraction, instead of extending HTTPRoute - Explained in more detail in the LLMRoute section -#### LLMUseCase as a backend ref +#### ModelGroup as a backend ref -We toyed with the idea of allowing an LLMUsecase be the target of an HTTPRouteRules backend ref. However, doing so would require the Kubernetes Gateway to be able to interpret body level parameters (assuming OpenAI protocol continues to require the model param in the body), and require that the HTTPRoute also specify the backend the UseCase is intended to run on. Since we our primary proposal already specifies the backend, packing this functionality would require substantial work on the Kubernetes Gateway, while not providing much flexibility. +We toyed with the idea of allowing an ModelGroup be the target of an HTTPRouteRules backend ref. However, doing so would require the Kubernetes Gateway to be able to interpret body level parameters (assuming OpenAI protocol continues to require the model param in the body), and require that the HTTPRoute also specify the backend the UseCase is intended to run on. Since we our primary proposal already specifies the backend, packing this functionality would require substantial work on the Kubernetes Gateway, while not providing much flexibility. #### LLMRoute From 1f3abf2ad9529545f7d81ef73b1d64d515c60dfa Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 24 Sep 2024 15:42:51 +0000 Subject: [PATCH 09/24] clarifying plural modelgroup --- docs/proposals/002-api-proposal/proposal.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 79b4a41b..371c6769 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -109,28 +109,31 @@ A ModelGroup allows the UseCaseOwner to define: **ModelGroup** ```golang -// ModelGroupSet represents a set of LLM use cases that are multiplexed onto one or more backend pools. +// ModelGroup represents a set of LLM use cases that are multiplexed onto one or more backend pools. // This is generally owned by the "LLM Use Case Owner" persona, which can be teams in an organization. -type ModelGroupSet struct { +// Plural ModelUseCases are allowed as a configuration convenience to the user. 
ModelUseCase names are +// unique for a given BackendPool, if the name is reused, an error will be shown on the status of a +// ModelGroup that attempted to reuse. +type ModelGroup struct { metav1.ObjectMeta metav1.TypeMeta - Spec ModelGroupSetSpec + Spec ModelGroupSpec } -type ModelGroupSetSpec struct { +type ModelGroupSpec struct { // Defines the use cases in the set. // UseCases can be in 2 priority classes, CRITICAL and NONCRITICAL. // Priority class is implicit, and by specifying an Objective, // places the UseCase in the CRITICAL priority class. - UseCases []ModelGroup + UseCases []ModelUseCases // Reference to the backend pools that the use cases registers to. PoolRef []corev1.ObjectReference } // ModelGroup defines the policies for routing the traffic of a use case, this includes performance objectives // and traffic splitting between different versions of the model. -type ModelGroup struct { +type ModelUseCases struct { // The name of the model as the users set in the "model" parameter in the requests. // The model name should be unique among the use cases that reference the same backend pool. // This is the parameter that will be used to match the request with. In the future, we may From 84186728753522c354dcce7d9deb9518b7619cad Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 24 Sep 2024 15:46:12 +0000 Subject: [PATCH 10/24] editing comment to max col 80 len --- docs/proposals/002-api-proposal/proposal.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 371c6769..c65e3fbd 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -109,11 +109,12 @@ A ModelGroup allows the UseCaseOwner to define: **ModelGroup** ```golang -// ModelGroup represents a set of LLM use cases that are multiplexed onto one or more backend pools. -// This is generally owned by the "LLM Use Case Owner" persona, which can be teams in an organization. -// Plural ModelUseCases are allowed as a configuration convenience to the user. ModelUseCase names are -// unique for a given BackendPool, if the name is reused, an error will be shown on the status of a -// ModelGroup that attempted to reuse. +// ModelGroup represents a set of LLM use cases that are multiplexed onto one +// or more backend pools. This is generally owned by the "LLM Use Case Owner" +// persona, which can be teams in an organization. Plural ModelUseCases are +// allowed as a configuration convenience to the user. ModelUseCase names are +// unique for a given BackendPool, if the name is reused, an error will be +// shown on the status of a ModelGroup that attempted to reuse. type ModelGroup struct { metav1.ObjectMeta metav1.TypeMeta From 3e1a5bfecedaad8a8f06aef7462b07d5aa9ac584 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 24 Sep 2024 16:28:43 +0000 Subject: [PATCH 11/24] More explicit documentation on plural usecases --- docs/proposals/002-api-proposal/proposal.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index c65e3fbd..83e3a299 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -112,9 +112,13 @@ A ModelGroup allows the UseCaseOwner to define: // ModelGroup represents a set of LLM use cases that are multiplexed onto one // or more backend pools. 
This is generally owned by the "LLM Use Case Owner" // persona, which can be teams in an organization. Plural ModelUseCases are -// allowed as a configuration convenience to the user. ModelUseCase names are -// unique for a given BackendPool, if the name is reused, an error will be -// shown on the status of a ModelGroup that attempted to reuse. +// allowed as a configuration convenience to the user. Allowing a user who +// has multiple usecases across multiple pools (with the same config) to +// specify the configuration exactly once, and deploy to many pools +// simultaneously, allowing for a simple config and single source of truth +// for a given user. ModelUseCase names are unique for a given BackendPool, +// if the name is reused, an error will be shown on the status of a +// ModelGroup that attempted to reuse. type ModelGroup struct { metav1.ObjectMeta metav1.TypeMeta From 97131ef0954805b9786fdcaf733fc2ab7371833e Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 24 Sep 2024 17:07:43 +0000 Subject: [PATCH 12/24] Adding examples and word clarification --- docs/proposals/002-api-proposal/proposal.md | 64 +++++++++++++++++++-- 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 83e3a299..dbd611fa 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -111,14 +111,15 @@ A ModelGroup allows the UseCaseOwner to define: ```golang // ModelGroup represents a set of LLM use cases that are multiplexed onto one // or more backend pools. This is generally owned by the "LLM Use Case Owner" -// persona, which can be teams in an organization. Plural ModelUseCases are -// allowed as a configuration convenience to the user. Allowing a user who -// has multiple usecases across multiple pools (with the same config) to +// persona, which can be teams in an organization. We allow a user who +// has multiple UseCases across multiple pools (with the same config) to // specify the configuration exactly once, and deploy to many pools -// simultaneously, allowing for a simple config and single source of truth +// simultaneously. Enabling a simpler config and single source of truth // for a given user. ModelUseCase names are unique for a given BackendPool, // if the name is reused, an error will be shown on the status of a -// ModelGroup that attempted to reuse. +// ModelGroup that attempted to reuse. The oldest ModelUseCase, based on +// creation timestamp, will be selected to remain valid. In the event of a race +// condition, one will be selected at random. type ModelGroup struct { metav1.ObjectMeta metav1.TypeMeta @@ -158,7 +159,13 @@ type ModelUseCases struct { -// TargetModel represents a deployed model or a LoRA adapter. +// TargetModel represents a deployed model or a LoRA adapter. The +// TargetModelName is expected to match the name of the LoRA adapter +// (or base model) as it is registered within the model server. Inference +// Gateway assumes that the model exists on the model server and is the +// responsibility of the user to validate a correct match. Should a model fail +// to exist at request time, the error is processed by the Instance Gateway, +// and then emitted on the appropriate ModelGroup object. type TargetModel struct { // The name of the adapter as expected by the ModelServer. 
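  // e.g. "npc-bot-v1" in the YAML example below; this must match the adapter
  // name registered on the model server.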
TargetModelName string @@ -182,6 +189,51 @@ type Objective struct { } ``` +### Yaml Examples + +#### BackendPool(s) +Here we create 2 BEPs that subscribe to services to collect the appropriate pods +```yaml +apiVersion: inference.x-k8s.io/v1alpha1 +kind: BackendPool +metadata: + name: llama-2-pool + services: + - llama-2-vllm +--- +apiVersion: inference.x-k8s.io/v1alpha1 +kind: BackendPool +metadata: + name: gemini-pool + services: + - gemini-jetstream-tpu-v5e + - gemini-vllm-a100 +``` + +#### Model Group + +Here we consume both pools with a single ModelGroup, while also specifying 2 useCases. Where `sql-code-assist` is both the name of the ModelUseCase, and the name of the LoRA adapter on the model server. And `npc-bot` has a layer of indirection for those names, as well as a specified objective. +```yaml +apiVersion: inference.x-k8s.io/v1alpha1 +kind: ModelGroup +metadata: + name: my-model-group +spec: + useCases: + - modelName: sql-code-assist + - modelName: npc-bot + objective: + desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50ms + targetModels: + targetModelName: npc-bot-v1 + weight: 50 + targetModelName: npc-bot-v2 + weight: 50 + poolRef: + - name: llama-2-pool + - name: gemini-pool +``` + ### Diagrams Much of this is better explained visually: From 7fc7879d784cbed90c00cacf53eaa245cd4d568e Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 24 Sep 2024 18:22:10 +0000 Subject: [PATCH 13/24] Adding persona summary to ModelGroup obj --- docs/proposals/002-api-proposal/proposal.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index dbd611fa..45c4a5cc 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -110,8 +110,13 @@ A ModelGroup allows the UseCaseOwner to define: **ModelGroup** ```golang // ModelGroup represents a set of LLM use cases that are multiplexed onto one -// or more backend pools. This is generally owned by the "LLM Use Case Owner" -// persona, which can be teams in an organization. We allow a user who +// or more backend pools. This resource is managed by the "Model Use Case Owner" +// persona. The model use case owner a team that trains, verifies, and +// leverages a large language model from a model frontend, drives the lifecycle +// and rollout of new versions of those models, and defines the specific +// performance and latency goals for the model. These model use cases are +// expected to operate within a BackendPool sharing compute capacity with other +// ModelUseCases, defined by the Inference Platform Admin. We allow a user who // has multiple UseCases across multiple pools (with the same config) to // specify the configuration exactly once, and deploy to many pools // simultaneously. Enabling a simpler config and single source of truth From c3bb56ba336e16fa846da3dbca1a08e4f68b6e32 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 24 Sep 2024 20:16:27 +0000 Subject: [PATCH 14/24] Changing Constants to adhere to style guide --- docs/proposals/002-api-proposal/proposal.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 45c4a5cc..158f0681 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -134,9 +134,9 @@ type ModelGroup struct { type ModelGroupSpec struct { // Defines the use cases in the set. 
- // UseCases can be in 2 priority classes, CRITICAL and NONCRITICAL. + // UseCases can be in 2 priority classes, Critical and Noncritical. // Priority class is implicit, and by specifying an Objective, - // places the UseCase in the CRITICAL priority class. + // places the UseCase in the Critical priority class. UseCases []ModelUseCases // Reference to the backend pools that the use cases registers to. PoolRef []corev1.ObjectReference From 5ad7e8f34e7ac6db4e21010b7d93c4e02229d670 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Wed, 25 Sep 2024 18:31:31 +0000 Subject: [PATCH 15/24] Grammatical fixes --- docs/proposals/002-api-proposal/glossary.md | 2 +- docs/proposals/002-api-proposal/proposal.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/proposals/002-api-proposal/glossary.md b/docs/proposals/002-api-proposal/glossary.md index 1ecd42cb..743f7a5c 100644 --- a/docs/proposals/002-api-proposal/glossary.md +++ b/docs/proposals/002-api-proposal/glossary.md @@ -44,7 +44,7 @@ The primary difference is that non-critical UseCase requests will be rejected in Example: -Your current request load is using 80 Arbitrary Compute Units(ACU) of your pools total of 100ACU capacity. 40ACU are critical workload requests, 45 are non-critical. If you were to lose 30 ACU due to an unforseen outage. Priority would dictate that of the 10 surplus ACU to be rejected the entirety of them would be from the non-critical requests. +Your current request load is using 80 Arbitrary Compute Units(ACU) of your pools total of 100ACU capacity. 40ACU are critical workload requests, 40 are non-critical. If you were to lose 30 ACU due to an unforseen outage. Priority would dictate that of the 10 surplus ACU to be rejected, the entirety of them would be from the _non-critical_ requests. ## Fairness diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 158f0681..1797fd46 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -135,8 +135,8 @@ type ModelGroup struct { type ModelGroupSpec struct { // Defines the use cases in the set. // UseCases can be in 2 priority classes, Critical and Noncritical. - // Priority class is implicit, and by specifying an Objective, - // places the UseCase in the Critical priority class. + // Priority class is implicitly set to Critical by specifying an Objective. + // Otherwise the UseCase is considered Noncritical. UseCases []ModelUseCases // Reference to the backend pools that the use cases registers to. 
PoolRef []corev1.ObjectReference @@ -254,7 +254,7 @@ The flow can be described as: - `interestingName` is currently undergoing a change of LoRA adapters from `creativeNameGen-v3` (20% traffic split) to `veryCreativeNameGen` (80% traffic split) - `veryCreativeNameGen` is selected as the LoRA adapter, and replaces `interestingName` in the body of the request (mutated by ext-proc) - the request is then efficiently scheduled onto one of the valid Pods -- metrics are sent back to the BEP, aggregated and re-emitted via sidecar (following the metric standardization) +- Prometheus metrics are sent back to the BEP, aggregated and re-emitted via sidecar (following the metric standardization) How Multiple BackendPools might integrate together: From 6bacaf1a1f20bd963a9ed5be2b5fd8acd7c117b7 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 26 Sep 2024 15:50:50 +0000 Subject: [PATCH 16/24] Changing BackendPool acronym --- docs/proposals/002-api-proposal/glossary.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/proposals/002-api-proposal/glossary.md b/docs/proposals/002-api-proposal/glossary.md index 743f7a5c..4720955a 100644 --- a/docs/proposals/002-api-proposal/glossary.md +++ b/docs/proposals/002-api-proposal/glossary.md @@ -22,7 +22,7 @@ This is a very brief description of terms used to describe API objects, included ### BackendPool A grouping of model servers that serve the same set of fine-tunes (LoRA as a primary example). -Shortened to: `BEP` +Shortened to: `BP` ### UseCase An LLM workload that is defined and runs on a BackendPool with other use cases. @@ -70,7 +70,7 @@ The TTL we are currently assuming is: `5 min` - Service A has been meeting its SLO 98% of the requests made in the time window, and Service B has met the SLO 94% of the time. -- A request for both Service A and Service B come in at the same time, and there is only capacity to start a single new request in the BEP, this capacity would meet the SLO for both services. The other request would be queued (potentially causing that request to not meet SLO). +- A request for both Service A and Service B come in at the same time, and there is only capacity to start a single new request in the BP, this capacity would meet the SLO for both services. The other request would be queued (potentially causing that request to not meet SLO). - To fairly share these resources. Service B *must* be selected to begin the request immediately as Service A has had its SLO met a larger percentage of the time. @@ -88,7 +88,7 @@ Data collected from the model servers and data collected from the request is use ## Lora Affinity ### Summary -LoRA Affinity describes the routing strategy displayed in the [demo](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458), to better utilize Model Servers within the BEP. +LoRA Affinity describes the routing strategy displayed in the [demo](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458), to better utilize Model Servers within the BP. ### Description Model Servers that support multi-LoRA handle requests in a FCFS basis. By utilizing the data provided by the model server (the state of loaded LoRA adapters), a routing system can route requests for a given LoRA adapter, to a model server that already has that adapter loaded, to create larger batches than a naive route, which better utilizes the model server hardware. 
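As an illustration of the LoRA-affinity idea described above (this sketch is not part of the patch series, and the `endpoint` type, its fields, and the selection rule are assumptions about one possible implementation):

```golang
package main

import "fmt"

// Illustrative sketch only; not defined by the proposal. All names are hypothetical.
// endpoint is a snapshot of a single model-server Pod, built from whatever state
// the routing layer scrapes (loaded adapters, queue depth, ...).
type endpoint struct {
	name           string
	loadedAdapters map[string]bool // adapters currently resident on the server
	queueDepth     int             // pending requests, used as a simple load signal
}

// pickEndpoint prefers servers that already have the requested adapter loaded
// (LoRA affinity); ties are broken by the shallower queue.
func pickEndpoint(pool []endpoint, adapter string) *endpoint {
	var best *endpoint
	bestAffinity := false
	for i := range pool {
		ep := &pool[i]
		affinity := ep.loadedAdapters[adapter]
		better := false
		switch {
		case best == nil:
			better = true
		case affinity != bestAffinity:
			better = affinity // prefer an endpoint with the adapter loaded
		default:
			better = ep.queueDepth < best.queueDepth // otherwise prefer less load
		}
		if better {
			best, bestAffinity = ep, affinity
		}
	}
	return best
}

func main() {
	pool := []endpoint{
		{name: "pod-a", loadedAdapters: map[string]bool{"sql-code-assist": true}, queueDepth: 4},
		{name: "pod-b", loadedAdapters: map[string]bool{}, queueDepth: 1},
	}
	// pod-a wins despite its deeper queue because the adapter is already loaded,
	// which is the larger-batch benefit the description above is after.
	fmt.Println(pickEndpoint(pool, "sql-code-assist").name)
}
```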
\ No newline at end of file From e6e43601ea0381caf63d12c4cb8d896647e19039 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 26 Sep 2024 20:35:50 +0000 Subject: [PATCH 17/24] CUJ description clarification --- docs/proposals/002-api-proposal/proposal.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 1797fd46..629eb051 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -217,7 +217,7 @@ metadata: #### Model Group -Here we consume both pools with a single ModelGroup, while also specifying 2 useCases. Where `sql-code-assist` is both the name of the ModelUseCase, and the name of the LoRA adapter on the model server. And `npc-bot` has a layer of indirection for those names, as well as a specified objective. +Here we consume both pools with a single ModelGroup, while also specifying 2 useCases. Where `sql-code-assist` is both the name of the ModelUseCase, and the name of the LoRA adapter on the model server. And `npc-bot` has a layer of indirection for those names, as well as a specified objective. Both `sql-code-assist` and `npc-bot` have available LoRA adapters on both BackendPools and routing to each BackendPool happens earlier(at the K8s Gateway). So traffic splitting between separate pools happens at the K8s Gateway. ```yaml apiVersion: inference.x-k8s.io/v1alpha1 kind: ModelGroup From d385c80d17f531e5ad155ed3da09de665834f02b Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Fri, 27 Sep 2024 15:32:28 +0000 Subject: [PATCH 18/24] Adding back BackendPool with description --- docs/proposals/002-api-proposal/proposal.md | 26 +++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 629eb051..d5da9b79 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -194,6 +194,32 @@ type Objective struct { } ``` +**BackendPool** +```golang +// The BackendPool is a construct for pooling compute (often model servers) to +// serve large models, that have the ability to share capacity across multiple +// use cases (such as through prompt engineering, LoRA adapters, etc). +// BackendPools have a dependency on a Gateway that is compatible with ext-proc +// (External Processing). When a new BP object is created, a new ext proc +// deployment is created. BackendPools require at minimum a single UseCase to +// be subscribed to them to accept traffic, any traffic with a model not +// definied within a UseCase will be rejected. +type BackendPool struct { + metav1.ObjectMeta + metav1.TypeMeta + + Spec BackendPoolSpec +} + +type BackendPoolSpec struct { + // Select the distinct services to include in the backend pool. These + // services should be consumed by only the backendpool they are part + // of. Should this behavior be breached, routing behavior is not + // guaranteed. 
+ ServiceRef []corev1.ObjectReference +} +``` + ### Yaml Examples #### BackendPool(s) From 439e7efb83d137a62b7ecd9124bf183ec1206b5e Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 3 Oct 2024 22:36:50 +0000 Subject: [PATCH 19/24] Updating names to LLM Service and LLMServerPool --- docs/proposals/002-api-proposal/glossary.md | 26 ++-- .../proposals/002-api-proposal/images/bep.svg | 1 - .../002-api-proposal/images/gw_w_bep.svg | 1 - .../002-api-proposal/images/gw_w_lsp.svg | 1 + .../proposals/002-api-proposal/images/lsp.svg | 1 + docs/proposals/002-api-proposal/proposal.md | 140 +++++++++--------- 6 files changed, 85 insertions(+), 85 deletions(-) delete mode 100644 docs/proposals/002-api-proposal/images/bep.svg delete mode 100644 docs/proposals/002-api-proposal/images/gw_w_bep.svg create mode 100644 docs/proposals/002-api-proposal/images/gw_w_lsp.svg create mode 100644 docs/proposals/002-api-proposal/images/lsp.svg diff --git a/docs/proposals/002-api-proposal/glossary.md b/docs/proposals/002-api-proposal/glossary.md index 4720955a..33e0e617 100644 --- a/docs/proposals/002-api-proposal/glossary.md +++ b/docs/proposals/002-api-proposal/glossary.md @@ -1,11 +1,11 @@ # Glossary -This is a glossary that attempts to more thoroughly emplain terms used within the api proposal, in an effort to give context to API decisions. +This is a glossary that attempts to more thoroughly explain terms used within the api proposal, in an effort to give context to API decisions. - [API Terms](#api) - - [BackendPool](#backendpool) - - [UseCase](#UseCase) + - [LLMServerPool](#llmserverpool) + - [LLMService](#llmservice) - [Capacity Constrained Routing](#capacity-constrained-routing) - [Priority](#priority) - [Fairness](#fairness) @@ -19,20 +19,20 @@ This is a glossary that attempts to more thoroughly emplain terms used within th ## API This is a very brief description of terms used to describe API objects, included for completeness. -### BackendPool +### LLMServerPool A grouping of model servers that serve the same set of fine-tunes (LoRA as a primary example). -Shortened to: `BP` +Shortened to: `LSP` -### UseCase -An LLM workload that is defined and runs on a BackendPool with other use cases. +### LLMService +An LLM workload that is defined and runs on a LLMServerPool with other use cases. # Capacity Constrained Routing ## Priority ### Summary -Priority specifies the importance of a UseCase relative to other usecases within a BackendPool. +Priority specifies the importance of a LLMService relative to other services within a LLMServerPool. ### Description @@ -40,7 +40,7 @@ For our purposes, priority can be thought of in two classes: - Critical - Non-Critical -The primary difference is that non-critical UseCase requests will be rejected in favor of Critical UseCases the face of resource scarcity. +The primary difference is that non-critical LLMService requests will be rejected in favor of Critical LLMServices the face of resource scarcity. Example: @@ -49,7 +49,7 @@ Your current request load is using 80 Arbitrary Compute Units(ACU) of your pools ## Fairness ### Summary -Fairness specifies how resources are shared among different UseCases, in a way that is most acceptable to the user. +Fairness specifies how resources are shared among different LLMServices, in a way that is most acceptable to the user. ### Description @@ -70,7 +70,7 @@ The TTL we are currently assuming is: `5 min` - Service A has been meeting its SLO 98% of the requests made in the time window, and Service B has met the SLO 94% of the time. 
-- A request for both Service A and Service B come in at the same time, and there is only capacity to start a single new request in the BP, this capacity would meet the SLO for both services. The other request would be queued (potentially causing that request to not meet SLO).
+- A request for both Service A and Service B comes in at the same time, and there is only capacity to start a single new request in the LSP; this capacity would meet the SLO for both services. The other request would be queued (potentially causing that request to not meet SLO).
 
 - To fairly share these resources. Service B *must* be selected to begin the request immediately as Service A has had its SLO met a larger percentage of the time.
 
@@ -80,7 +80,7 @@ Different from the previous definitons, these terms are used to describe methods
 
 ## Latency Based Routing
 ### Summary
-Latency Based Routing uses data to ensure UseCases meet their specified SLO.
+Latency Based Routing uses data to ensure LLMServices meet their specified SLO.
 
 ### Description
 Data collected from the model servers and data collected from the request is used to predict the time a request will take on a *specific* model server, and route in a way that will best satisfy the SLO of the incoming requests. 
 
@@ -88,7 +88,7 @@ Data collected from the model servers and data collected from the request is use
 
 ## Lora Affinity
 ### Summary
-LoRA Affinity describes the routing strategy displayed in the [demo](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458), to better utilize Model Servers within the BP.
+LoRA Affinity describes the routing strategy displayed in the [demo](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458), to better utilize Model Servers within the LSP.
 
 ### Description
 Model Servers that support multi-LoRA handle requests in a FCFS basis. By utilizing the data provided by the model server (the state of loaded LoRA adapters), a routing system can route requests for a given LoRA adapter, to a model server that already has that adapter loaded, to create larger batches than a naive route, which better utilizes the model server hardware.
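To make the latency-based routing idea described above concrete, here is a small, purely illustrative Go sketch. The linear latency model, the field names, and the SLO check are assumptions about one possible implementation, not behavior defined by this proposal:

```golang
package main

import (
	"fmt"
	"time"
)

// Illustrative sketch only; not defined by the proposal. All names and the
// latency model are hypothetical.
// serverStats is one view of a model server, derived from scraped metrics.
type serverStats struct {
	name            string
	queuedTokens    int           // tokens already waiting on this server
	perTokenLatency time.Duration // observed average time per output token
}

// predictLatency is a deliberately simple linear model: time to drain the
// queue plus time to generate the new request's expected output tokens.
func predictLatency(s serverStats, expectedOutputTokens int) time.Duration {
	return time.Duration(s.queuedTokens+expectedOutputTokens) * s.perTokenLatency
}

// pickServer returns the server with the lowest predicted latency, and whether
// that prediction fits within the request's SLO.
func pickServer(servers []serverStats, expectedOutputTokens int, slo time.Duration) (serverStats, bool) {
	best := servers[0]
	bestLatency := predictLatency(best, expectedOutputTokens)
	for _, s := range servers[1:] {
		if l := predictLatency(s, expectedOutputTokens); l < bestLatency {
			best, bestLatency = s, l
		}
	}
	return best, bestLatency <= slo
}

func main() {
	servers := []serverStats{
		{name: "pod-a", queuedTokens: 4000, perTokenLatency: 30 * time.Millisecond},
		{name: "pod-b", queuedTokens: 500, perTokenLatency: 45 * time.Millisecond},
	}
	best, withinSLO := pickServer(servers, 200, 60*time.Second)
	fmt.Println(best.name, withinSLO) // pod-b true
}
```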
\ No newline at end of file diff --git a/docs/proposals/002-api-proposal/images/bep.svg b/docs/proposals/002-api-proposal/images/bep.svg deleted file mode 100644 index cfac8994..00000000 --- a/docs/proposals/002-api-proposal/images/bep.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/proposals/002-api-proposal/images/gw_w_bep.svg b/docs/proposals/002-api-proposal/images/gw_w_bep.svg deleted file mode 100644 index 077158ea..00000000 --- a/docs/proposals/002-api-proposal/images/gw_w_bep.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/proposals/002-api-proposal/images/gw_w_lsp.svg b/docs/proposals/002-api-proposal/images/gw_w_lsp.svg new file mode 100644 index 00000000..c7abbd0e --- /dev/null +++ b/docs/proposals/002-api-proposal/images/gw_w_lsp.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/proposals/002-api-proposal/images/lsp.svg b/docs/proposals/002-api-proposal/images/lsp.svg new file mode 100644 index 00000000..f30ad6a5 --- /dev/null +++ b/docs/proposals/002-api-proposal/images/lsp.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index d5da9b79..d042e9d2 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -16,8 +16,8 @@ - [Inference Platform Admin](#inference-platform-admin) - [LLM Use Case Owner](#llm-use-case-owner) - [Axioms](#axioms) - - [BackendPool](#backendpool) - - [ModelGroup](#modelgroup) + - [LLMServerPool](#llmServerPool) + - [LLMService](#LLMService) - [Spec](#spec) - [Diagrams](#diagrams) - [Alternatives](#alternatives) @@ -28,7 +28,7 @@ ## Summary -This proposal presents 2 new CRD objects to express the needs of the LLM Instance Gateway. **BackendPool** and **ModelGroup** (names up for debate). The BackendPool is the logical grouping of compute, owned by the Inference Platform Admin persona. While the ModelGroup defines the serving objectives of a specific model or LoRA adapter, and is owned by the LLM Use Case Owner. +This proposal presents 2 new CRD objects to express the needs of the LLM Instance Gateway. **LLMServerPool** and **LLMService** (names up for debate). The LLMServerPool is the logical grouping of compute, owned by the Inference Platform Admin persona. While the LLMService defines the serving objectives of a specific model or LoRA adapter, and is owned by the LLM Use Case Owner. **NOTE: Some routing terms are defined in the [glossary](./glossary.md) file, to more deeply describe how we will handle behaviors like priority and fairness** @@ -80,71 +80,71 @@ The API design is based on these axioms: - The MVP will heavily assume requests are done using the OpenAI spec, but open to extension in the future - The Gateway should route in a way that does not generate a queue of requests at the model server level -The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused on lower-level scheduling. And the API follows that similar logic, which lead to the proposal of the **BackendPool**. +The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused on lower-level scheduling. And the API follows that similar logic, which lead to the proposal of the **LLMServerPool**. -### BackendPool +### LLMServerPool -The BackendPool at its core is a logical grouping of compute, expressed in the form of Pods (typically model servers), akin to a K8s Service. 
The BackendPool would deploy its own routing, and offer administrative configuration to the Platform Admin. +The LLMServerPool at its core is a logical grouping of compute, expressed in the form of Pods (typically model servers), akin to a K8s Service. The LLMServerPool would deploy its own routing, and offer administrative configuration to the Platform Admin. - It is expected for the BackendPool to: + It is expected for the LLMServerPool to: - Enforce fair consumption of resources across competing use cases - Efficiently route requests across shared compute (as displayed by the PoC) -It is _not_ expected for the BackendPool to: +It is _not_ expected for the LLMServerPool to: - Enforce any common set of adapters or base models are available on the Pods - Manage Deployments of Pods within the Pool - Manage Pod lifecycle of pods within the pool -Additionally, any Pod that seeks to join a BackendPool would need to support a protocol, defined by LLM Instance Gateway, to ensure the Pool has adequate information to intelligently route requests. +Additionally, any Pod that seeks to join a LLMServerPool would need to support a protocol, defined by LLM Instance Gateway, to ensure the Pool has adequate information to intelligently route requests. -### ModelGroup +### LLMService -A ModelGroup allows the UseCaseOwner to define: +A LLMService allows the LLMServiceOwner to define: - Which LoRA adapter(s) to consume - - ModelGroup allows for traffic splitting between adapters _in the same BackendPool_ to allow for new LoRA adapter versions to be easily rolled out -- SLO objectives for the UseCase -- The Pools this UseCase is relevant to + - LLMService allows for traffic splitting between adapters _in the same LLMServerPool_ to allow for new LoRA adapter versions to be easily rolled out +- SLO objectives for the LLMService +- The Pools this LLMService is relevant to ### Spec -**ModelGroup** +**LLMService** ```golang -// ModelGroup represents a set of LLM use cases that are multiplexed onto one -// or more backend pools. This resource is managed by the "Model Use Case Owner" +// LLMService represents a set of LLM use cases that are multiplexed onto one +// or more backend pools. This resource is managed by the "LLM Service Owner" // persona. The model use case owner a team that trains, verifies, and // leverages a large language model from a model frontend, drives the lifecycle // and rollout of new versions of those models, and defines the specific // performance and latency goals for the model. These model use cases are -// expected to operate within a BackendPool sharing compute capacity with other -// ModelUseCases, defined by the Inference Platform Admin. We allow a user who -// has multiple UseCases across multiple pools (with the same config) to +// expected to operate within a LLMServerPool sharing compute capacity with other +// LLMServices, defined by the Inference Platform Admin. We allow a user who +// has multiple LLMServices across multiple pools (with the same config) to // specify the configuration exactly once, and deploy to many pools // simultaneously. Enabling a simpler config and single source of truth -// for a given user. ModelUseCase names are unique for a given BackendPool, +// for a given user. LLMService names are unique for a given LLMServerPool, // if the name is reused, an error will be shown on the status of a -// ModelGroup that attempted to reuse. The oldest ModelUseCase, based on +// LLMService that attempted to reuse. 
The oldest LLMService, based on // creation timestamp, will be selected to remain valid. In the event of a race // condition, one will be selected at random. -type ModelGroup struct { +type LLMService struct { metav1.ObjectMeta metav1.TypeMeta - Spec ModelGroupSpec + Spec LLMServiceSpec } -type ModelGroupSpec struct { +type LLMServiceSpec struct { // Defines the use cases in the set. - // UseCases can be in 2 priority classes, Critical and Noncritical. + // LLMServices can be in 2 priority classes, Critical and Noncritical. // Priority class is implicitly set to Critical by specifying an Objective. - // Otherwise the UseCase is considered Noncritical. - UseCases []ModelUseCases + // Otherwise the LLMService is considered Noncritical. + LLMServices []ModelLLMServices // Reference to the backend pools that the use cases registers to. PoolRef []corev1.ObjectReference } -// ModelGroup defines the policies for routing the traffic of a use case, this includes performance objectives +// LLMService defines the policies for routing the traffic of a use case, this includes performance objectives // and traffic splitting between different versions of the model. -type ModelUseCases struct { +type ModelLLMServices struct { // The name of the model as the users set in the "model" parameter in the requests. // The model name should be unique among the use cases that reference the same backend pool. // This is the parameter that will be used to match the request with. In the future, we may @@ -153,7 +153,7 @@ type ModelUseCases struct { ModelName string // Optional // Use cases with an objective have higher priority than use cases without. - // IMPORTANT: By specifying an objective, this places the UseCase in a higher priority class than UseCases without a defined priority class. + // IMPORTANT: By specifying an objective, this places the LLMService in a higher priority class than LLMServices without a defined priority class. // In the face of resource-scarcity. Higher priority requests will be preserved, and lower priority class requests will be rejected. Objective *Objective // Optional. @@ -170,7 +170,7 @@ type ModelUseCases struct { // Gateway assumes that the model exists on the model server and is the // responsibility of the user to validate a correct match. Should a model fail // to exist at request time, the error is processed by the Instance Gateway, -// and then emitted on the appropriate ModelGroup object. +// and then emitted on the appropriate LLMService object. type TargetModel struct { // The name of the adapter as expected by the ModelServer. TargetModelName string @@ -194,26 +194,26 @@ type Objective struct { } ``` -**BackendPool** +**LLMServerPool** ```golang -// The BackendPool is a construct for pooling compute (often model servers) to +// The LLMServerPool is a construct for pooling compute (often model servers) to // serve large models, that have the ability to share capacity across multiple // use cases (such as through prompt engineering, LoRA adapters, etc). -// BackendPools have a dependency on a Gateway that is compatible with ext-proc -// (External Processing). When a new BP object is created, a new ext proc -// deployment is created. BackendPools require at minimum a single UseCase to +// LLMServerPools have a dependency on a Gateway that is compatible with ext-proc +// (External Processing). When a new LSP object is created, a new ext proc +// deployment is created. 
LLMServerPools require at minimum a single LLMService to // be subscribed to them to accept traffic, any traffic with a model not -// definied within a UseCase will be rejected. -type BackendPool struct { +// definied within a LLMService will be rejected. +type LLMServerPool struct { metav1.ObjectMeta metav1.TypeMeta - Spec BackendPoolSpec + Spec LLMServerPoolSpec } -type BackendPoolSpec struct { +type LLMServerPoolSpec struct { // Select the distinct services to include in the backend pool. These - // services should be consumed by only the backendpool they are part + // services should be consumed by only the llmServerPool they are part // of. Should this behavior be breached, routing behavior is not // guaranteed. ServiceRef []corev1.ObjectReference @@ -222,18 +222,18 @@ type BackendPoolSpec struct { ### Yaml Examples -#### BackendPool(s) -Here we create 2 BEPs that subscribe to services to collect the appropriate pods +#### LLMServerPool(s) +Here we create 2 LSPs that subscribe to services to collect the appropriate pods ```yaml apiVersion: inference.x-k8s.io/v1alpha1 -kind: BackendPool +kind: LLMServerPool metadata: name: llama-2-pool services: - llama-2-vllm --- apiVersion: inference.x-k8s.io/v1alpha1 -kind: BackendPool +kind: LLMServerPool metadata: name: gemini-pool services: @@ -241,16 +241,16 @@ metadata: - gemini-vllm-a100 ``` -#### Model Group +#### LLMService -Here we consume both pools with a single ModelGroup, while also specifying 2 useCases. Where `sql-code-assist` is both the name of the ModelUseCase, and the name of the LoRA adapter on the model server. And `npc-bot` has a layer of indirection for those names, as well as a specified objective. Both `sql-code-assist` and `npc-bot` have available LoRA adapters on both BackendPools and routing to each BackendPool happens earlier(at the K8s Gateway). So traffic splitting between separate pools happens at the K8s Gateway. +Here we consume both pools with a single LLMService, while also specifying 2 LLMServices. Where `sql-code-assist` is both the name of the ModelLLMService, and the name of the LoRA adapter on the model server. And `npc-bot` has a layer of indirection for those names, as well as a specified objective. Both `sql-code-assist` and `npc-bot` have available LoRA adapters on both LLMServerPools and routing to each LLMServerPool happens earlier(at the K8s Gateway). So traffic splitting between separate pools happens at the K8s Gateway. ```yaml apiVersion: inference.x-k8s.io/v1alpha1 -kind: ModelGroup +kind: LLMService metadata: - name: my-model-group + name: my-llm-service spec: - useCases: + LLMServices: - modelName: sql-code-assist - modelName: npc-bot objective: @@ -269,28 +269,28 @@ spec: Much of this is better explained visually: -Below is a detailed view of the BackendPool +Below is a detailed view of the LLMServerPool -![BackendPool](./images/bep.svg) +![LLMServerPool](./images/lsp.svg) This diagram lightly follows the example request for a model `interestingName`. 
The flow can be described as: - The request comes in to our routing solution(Ext-Proc) -- ExtProc looks up the UseCases affiliated with this pool `examplePool` +- ExtProc looks up the LLMServices affiliated with this pool `examplePool` - `interestingName` is currently undergoing a change of LoRA adapters from `creativeNameGen-v3` (20% traffic split) to `veryCreativeNameGen` (80% traffic split) - `veryCreativeNameGen` is selected as the LoRA adapter, and replaces `interestingName` in the body of the request (mutated by ext-proc) - the request is then efficiently scheduled onto one of the valid Pods -- Prometheus metrics are sent back to the BEP, aggregated and re-emitted via sidecar (following the metric standardization) +- Prometheus metrics are sent back to the LSP, aggregated and re-emitted via sidecar (following the metric standardization) -How Multiple BackendPools might integrate together: +How Multiple LLMServerPools might integrate together: -![K8s Gateway with BackendPools](./images/gw_w_bep.svg) +![K8s Gateway with LLMServerPools](./images/gw_w_lsp.svg) Here we see that we can have: - Multiple Routes pointing to the same pool - Routes splitting traffic across multiple pools -The functionality of the Kubernetes Gateway is unchanged with this proposal, allowing seamless integration with the BackendPool. +The functionality of the Kubernetes Gateway is unchanged with this proposal, allowing seamless integration with the LLMServerPool. ### Alternatives @@ -298,28 +298,28 @@ The functionality of the Kubernetes Gateway is unchanged with this proposal, all #### Key Decisions Our alternatives hinge on some key decisions: -- Allowing HTTPRoute to treat the BackendPool as the backendRef - - Whereas the alternatives might have the ModelGroup as the backend ref +- Allowing HTTPRoute to treat the LLMServerPool as the backendRef + - Whereas the alternatives might have the LLMService as the backend ref - Creating a separate layer of abstraction, instead of extending HTTPRoute - Explained in more detail in the LLMRoute section -#### ModelGroup as a backend ref +#### LLMService as a backend ref -We toyed with the idea of allowing an ModelGroup be the target of an HTTPRouteRules backend ref. However, doing so would require the Kubernetes Gateway to be able to interpret body level parameters (assuming OpenAI protocol continues to require the model param in the body), and require that the HTTPRoute also specify the backend the UseCase is intended to run on. Since we our primary proposal already specifies the backend, packing this functionality would require substantial work on the Kubernetes Gateway, while not providing much flexibility. +We toyed with the idea of allowing an LLMService be the target of an HTTPRouteRules backend ref. However, doing so would require the Kubernetes Gateway to be able to interpret body level parameters (assuming OpenAI protocol continues to require the model param in the body), and require that the HTTPRoute also specify the backend the LLMService is intended to run on. Since we our primary proposal already specifies the backend, packing this functionality would require substantial work on the Kubernetes Gateway, while not providing much flexibility. #### LLMRoute -Our original idea was to define all UseCase config at the Kubernetes Gateway layer, and have no BackendPool. This is inherently challenging, as LLMRoute would become a superset of HTTPRoute, or the Gateway would become bespoke, and work only for the LLMRoute use case. 
+Our original idea was to define all LLMService config at the Kubernetes Gateway layer, and have no LLMServerPool. This is inherently challenging, as LLMRoute would become a superset of HTTPRoute, or the Gateway would become bespoke, and work only for the LLMRoute use case. ## FAQ -- **Why 2 layers of weighting?** (HttpRoute & UseCase) - - Feasibly done - No extension of HttpRoute. Just works, as BackendPool operates like a service. +- **Why 2 layers of weighting?** (HttpRoute & LLMService) + - Feasibly done - No extension of HttpRoute. Just works, as LLMServerPool operates like a service. - Complexity is only expressed during transition states (model version upgrade) - Keeps Pools self contained - multiple K8s gateways can direct traffic to the same pool without needing to re-express Pool-level behavior -- **What is a BEP attempting to define?** - - BackendPool groups resources that should be shared over the UseCases that are affiliated with the pool +- **What is a LSP attempting to define?** + - LLMServerPool groups resources that should be shared over the LLMServices that are affiliated with the pool - Best practice would also suggest keeping the same base model for all ModelServers in the pool, but that is not enforced -- **Can a UseCase reference multiple BEPs?** +- **Can a LLMService reference multiple LSPs?** - **How is this deployed?** - We will follow [common patterns](https://gateway.envoyproxy.io/docs/tasks/quickstart/#installation) to install the CRDs & Controllers - **Are all controllers necessary for this solution going to be provided by Instance Gateway(this repo)?** @@ -332,8 +332,8 @@ Our original idea was to define all UseCase config at the Kubernetes Gateway lay - Reasonable defaults (how do we behave in the absence of user-specified values in optional fields) - Should use cases be required? Or can a customer simply create a pool, and direct requests to the pool, and expect even fairness/priority across the different LoRA adapters that are requested? - - If so? How should we handle the mix between explicit and implicit use cases? Are implicit usecases just default everything? (and inherently lower prio). + - If so? How should we handle the mix between explicit and implicit use cases? Are implicit LLMServices just default everything? (and inherently lower prio). - NOTE: Current thinking is this is yes we should allow non-use case defined requests, but is a security risk if on by default. So pools should opt-in - Configuration control - How many routing decisions should we make on behalf of the user vs allow for configuration? - - Do we decide that SLO adherence is stricter than Fairness adherence? Do we allow for configuration of such tooling? (would be expressed in the BackendPool API) + - Do we decide that SLO adherence is stricter than Fairness adherence? Do we allow for configuration of such tooling? 
(would be expressed in the LLMServerPool API) From 2285b69212250095a65c8b56da496bebd65f1fc1 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Fri, 4 Oct 2024 04:06:50 +0000 Subject: [PATCH 20/24] Typos, rewording, and small fixes --- docs/proposals/002-api-proposal/proposal.md | 61 +++++++++++---------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index d042e9d2..63babcde 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -14,7 +14,7 @@ - [Proposal](#proposal) - [Personas](#personas) - [Inference Platform Admin](#inference-platform-admin) - - [LLM Use Case Owner](#llm-use-case-owner) + - [LLM Service Owner](#llm-use-case-owner) - [Axioms](#axioms) - [LLMServerPool](#llmServerPool) - [LLMService](#LLMService) @@ -28,7 +28,7 @@ ## Summary -This proposal presents 2 new CRD objects to express the needs of the LLM Instance Gateway. **LLMServerPool** and **LLMService** (names up for debate). The LLMServerPool is the logical grouping of compute, owned by the Inference Platform Admin persona. While the LLMService defines the serving objectives of a specific model or LoRA adapter, and is owned by the LLM Use Case Owner. +This proposal presents 2 new CRD objects to express the needs of the LLM Instance Gateway. **LLMServerPool** and **LLMService** (names up for debate). The LLMServerPool is the logical grouping of compute, owned by the Inference Platform Admin persona. While the LLMService defines the serving objectives of a specific model or LoRA adapter, and is owned by the LLM Service Owner. **NOTE: Some routing terms are defined in the [glossary](./glossary.md) file, to more deeply describe how we will handle behaviors like priority and fairness** @@ -58,9 +58,9 @@ The Inference Platform Admin creates and manages the infrastructure necessary to - Gateway configuration - etc -#### LLM Use Case Owner +#### LLM Service Owner -An LLM Use Case Owner persona owns and manages 1 or many Generative AI Workloads (LLM focused *currently*). This includes: +An LLM Service Owner persona owns and manages 1 or many Generative AI Workloads (LLM focused *currently*). This includes: - Defining SLO - Managing fine-tunes - LoRA Adapters @@ -75,7 +75,7 @@ The API design is based on these axioms: - Pools of shared compute should be *discrete* for scheduling to properly work - Pod-level scheduling should not be handled by a high-level gateway -- Simple use cases should be simple to define (or are implicitly defined via reasonable defaults) +- Simple services should be simple to define (or are implicitly defined via reasonable defaults) - This solution should be composable with other Gateway solutions and flexible to fit customer needs - The MVP will heavily assume requests are done using the OpenAI spec, but open to extension in the future - The Gateway should route in a way that does not generate a queue of requests at the model server level @@ -87,7 +87,7 @@ The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused o The LLMServerPool at its core is a logical grouping of compute, expressed in the form of Pods (typically model servers), akin to a K8s Service. The LLMServerPool would deploy its own routing, and offer administrative configuration to the Platform Admin. 
It is expected for the LLMServerPool to: - - Enforce fair consumption of resources across competing use cases + - Enforce fair consumption of resources across competing services - Efficiently route requests across shared compute (as displayed by the PoC) It is _not_ expected for the LLMServerPool to: @@ -109,12 +109,12 @@ A LLMService allows the LLMServiceOwner to define: **LLMService** ```golang -// LLMService represents a set of LLM use cases that are multiplexed onto one +// LLMService represents a set of LLM services that are multiplexed onto one // or more backend pools. This resource is managed by the "LLM Service Owner" -// persona. The model use case owner a team that trains, verifies, and +// persona. The Service Owner persona is: a team that trains, verifies, and // leverages a large language model from a model frontend, drives the lifecycle // and rollout of new versions of those models, and defines the specific -// performance and latency goals for the model. These model use cases are +// performance and latency goals for the model. These services are // expected to operate within a LLMServerPool sharing compute capacity with other // LLMServices, defined by the Inference Platform Admin. We allow a user who // has multiple LLMServices across multiple pools (with the same config) to @@ -133,33 +133,38 @@ type LLMService struct { } type LLMServiceSpec struct { - // Defines the use cases in the set. - // LLMServices can be in 2 priority classes, Critical and Noncritical. + // Defines the distinct services. + // Model can be in 2 priority classes, Critical and Noncritical. // Priority class is implicitly set to Critical by specifying an Objective. - // Otherwise the LLMService is considered Noncritical. - LLMServices []ModelLLMServices - // Reference to the backend pools that the use cases registers to. + // Otherwise the Model is considered Noncritical. + Models []Model + // Reference to the backend pools that the services registers to. PoolRef []corev1.ObjectReference } -// LLMService defines the policies for routing the traffic of a use case, this includes performance objectives +// Model defines the policies for routing the traffic of a use case, this includes performance objectives // and traffic splitting between different versions of the model. -type ModelLLMServices struct { +type Model struct { // The name of the model as the users set in the "model" parameter in the requests. - // The model name should be unique among the use cases that reference the same backend pool. + // The model name should be unique among the services that reference the same backend pool. // This is the parameter that will be used to match the request with. In the future, we may // allow to match on other request parameters. The other approach to support matching on - // on other request parameters is to use a different ModelName f HTTPFilter + // on other request parameters is to use a different ModelName per HTTPFilter. + // Due to these properties. ModelNames must be unique across an LSP. + // ModelNames can be reserved without implementing an actual model in the pool. + // This can be done by specifying a target model and setting the weight to zero, + // an error will be returned specifying that no valid target model is found. ModelName string // Optional - // Use cases with an objective have higher priority than use cases without. + // Use cases with an objective have higher priority than services without. 
// IMPORTANT: By specifying an objective, this places the LLMService in a higher priority class than LLMServices without a defined priority class. // In the face of resource-scarcity. Higher priority requests will be preserved, and lower priority class requests will be rejected. Objective *Objective // Optional. - // Allow multiple versions of a model for traffic splitting. - // If not specified, the target model name is defaulted to the modelName parameter. - TargetModels []common.TargetModel + // Allow multiple versions of a model for traffic splitting. + // If not specified, the target model name is defaulted to the modelName parameter. + // modelName is often in reference to a LoRA adapter. + TargetModels []TargetModel } @@ -198,7 +203,7 @@ type Objective struct { ```golang // The LLMServerPool is a construct for pooling compute (often model servers) to // serve large models, that have the ability to share capacity across multiple -// use cases (such as through prompt engineering, LoRA adapters, etc). +// services (such as through prompt engineering, LoRA adapters, etc). // LLMServerPools have a dependency on a Gateway that is compatible with ext-proc // (External Processing). When a new LSP object is created, a new ext proc // deployment is created. LLMServerPools require at minimum a single LLMService to @@ -273,12 +278,12 @@ Below is a detailed view of the LLMServerPool ![LLMServerPool](./images/lsp.svg) -This diagram lightly follows the example request for a model `interestingName`. +This diagram lightly follows the example request for a model `name-generator`. The flow can be described as: - The request comes in to our routing solution(Ext-Proc) - ExtProc looks up the LLMServices affiliated with this pool `examplePool` -- `interestingName` is currently undergoing a change of LoRA adapters from `creativeNameGen-v3` (20% traffic split) to `veryCreativeNameGen` (80% traffic split) -- `veryCreativeNameGen` is selected as the LoRA adapter, and replaces `interestingName` in the body of the request (mutated by ext-proc) +- `name-generator` is currently undergoing a change of LoRA adapters from `name-generator-v3` (20% traffic split) to `name-generator-v2` (80% traffic split) +- `name-generator-v2` is selected as the LoRA adapter, and replaces `name-generator` in the body of the request (mutated by ext-proc) - the request is then efficiently scheduled onto one of the valid Pods - Prometheus metrics are sent back to the LSP, aggregated and re-emitted via sidecar (following the metric standardization) @@ -331,8 +336,8 @@ Our original idea was to define all LLMService config at the Kubernetes Gateway ## Open Questions - Reasonable defaults (how do we behave in the absence of user-specified values in optional fields) - - Should use cases be required? Or can a customer simply create a pool, and direct requests to the pool, and expect even fairness/priority across the different LoRA adapters that are requested? - - If so? How should we handle the mix between explicit and implicit use cases? Are implicit LLMServices just default everything? (and inherently lower prio). + - Should services be required? Or can a customer simply create a pool, and direct requests to the pool, and expect even fairness/priority across the different LoRA adapters that are requested? + - If so? How should we handle the mix between explicit and implicit services? Are implicit LLMServices just default everything? (and inherently lower prio). 
- NOTE: Current thinking is this is yes we should allow non-use case defined requests, but is a security risk if on by default. So pools should opt-in - Configuration control - How many routing decisions should we make on behalf of the user vs allow for configuration? From 649d2c387abdb9da1635e6da4a282c4def749787 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Fri, 4 Oct 2024 15:44:21 +0000 Subject: [PATCH 21/24] another review pass --- docs/proposals/002-api-proposal/proposal.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 63babcde..35bbe0d2 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -14,7 +14,7 @@ - [Proposal](#proposal) - [Personas](#personas) - [Inference Platform Admin](#inference-platform-admin) - - [LLM Service Owner](#llm-use-case-owner) + - [LLM Service Owner](#llm-service-owner) - [Axioms](#axioms) - [LLMServerPool](#llmServerPool) - [LLMService](#LLMService) @@ -99,7 +99,7 @@ Additionally, any Pod that seeks to join a LLMServerPool would need to support a ### LLMService -A LLMService allows the LLMServiceOwner to define: +A LLMService allows the LLM Service Owner to define: - Which LoRA adapter(s) to consume - LLMService allows for traffic splitting between adapters _in the same LLMServerPool_ to allow for new LoRA adapter versions to be easily rolled out - SLO objectives for the LLMService @@ -156,7 +156,7 @@ type Model struct { // an error will be returned specifying that no valid target model is found. ModelName string // Optional - // Use cases with an objective have higher priority than services without. + // LLM Services with an objective have higher priority than services without. // IMPORTANT: By specifying an objective, this places the LLMService in a higher priority class than LLMServices without a defined priority class. // In the face of resource-scarcity. Higher priority requests will be preserved, and lower priority class requests will be rejected. Objective *Objective @@ -184,7 +184,7 @@ type TargetModel struct { Weight int } -// Objective captures the latency SLO of a LLM use case. +// Objective captures the latency SLO of a LLM service. // In MVP, meeting the SLO is on a best effort basis. // Future: Extend the API for different behaviors of meeting the SLO. 
// The gateway will perform best-effort load balancing, and work with other components (e.g., autoscaler) to meet the From 063a80d674b118da03f059be43da174dd133aa9b Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Fri, 4 Oct 2024 15:45:53 +0000 Subject: [PATCH 22/24] link fixes --- docs/proposals/002-api-proposal/proposal.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 35bbe0d2..f4b04add 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -16,8 +16,8 @@ - [Inference Platform Admin](#inference-platform-admin) - [LLM Service Owner](#llm-service-owner) - [Axioms](#axioms) - - [LLMServerPool](#llmServerPool) - - [LLMService](#LLMService) + - [LLMServerPool](#llmserverpool) + - [LLMService](#llmservice) - [Spec](#spec) - [Diagrams](#diagrams) - [Alternatives](#alternatives) From 54d0543e55ab1902850c638cfe7a71691f068614 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Fri, 4 Oct 2024 15:47:23 +0000 Subject: [PATCH 23/24] fixing wording, removing duplication --- docs/proposals/002-api-proposal/proposal.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index f4b04add..262383b9 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -146,15 +146,14 @@ type LLMServiceSpec struct { // and traffic splitting between different versions of the model. type Model struct { // The name of the model as the users set in the "model" parameter in the requests. - // The model name should be unique among the services that reference the same backend pool. + // The name should be unique among the services that reference the same backend pool. // This is the parameter that will be used to match the request with. In the future, we may // allow to match on other request parameters. The other approach to support matching on // on other request parameters is to use a different ModelName per HTTPFilter. - // Due to these properties. ModelNames must be unique across an LSP. - // ModelNames can be reserved without implementing an actual model in the pool. + // Names can be reserved without implementing an actual model in the pool. // This can be done by specifying a target model and setting the weight to zero, // an error will be returned specifying that no valid target model is found. - ModelName string + Name string // Optional // LLM Services with an objective have higher priority than services without. // IMPORTANT: By specifying an objective, this places the LLMService in a higher priority class than LLMServices without a defined priority class. From 75861c2bea8a11879b010f9f74604f5d712f6bef Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Fri, 4 Oct 2024 17:10:49 +0000 Subject: [PATCH 24/24] shortining targetmodel name field --- docs/proposals/002-api-proposal/proposal.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md index 262383b9..edf71f49 100644 --- a/docs/proposals/002-api-proposal/proposal.md +++ b/docs/proposals/002-api-proposal/proposal.md @@ -169,7 +169,7 @@ type Model struct { // TargetModel represents a deployed model or a LoRA adapter. 
The -// TargetModelName is expected to match the name of the LoRA adapter +// Name field is expected to match the name of the LoRA adapter // (or base model) as it is registered within the model server. Inference // Gateway assumes that the model exists on the model server and is the // responsibility of the user to validate a correct match. Should a model fail @@ -177,7 +177,7 @@ type Model struct { // and then emitted on the appropriate LLMService object. type TargetModel struct { // The name of the adapter as expected by the ModelServer. - TargetModelName string + Name string // Weight is used to determine the percentage of traffic that should be // sent to this target model when multiple versions of the model are specified. Weight int
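As a closing illustration of the traffic splitting that the `TargetModel` / `Weight` fields above describe, here is a minimal Go sketch of weight-proportional selection. It is not part of the proposal: the types, the validation behavior, and the use of `math/rand` are assumptions about one possible implementation.

```golang
package main

import (
	"fmt"
	"math/rand"
)

// Illustrative sketch only; not defined by the proposal.
// targetModel mirrors the idea of the proposal's TargetModel: a backend
// model/adapter name plus the share of traffic it should receive.
type targetModel struct {
	Name   string
	Weight int
}

// pickTargetModel selects one target model with probability proportional to
// its weight, e.g. for rolling out npc-bot-v2 at 50% alongside npc-bot-v1.
func pickTargetModel(targets []targetModel, r *rand.Rand) string {
	total := 0
	for _, t := range targets {
		total += t.Weight
	}
	n := r.Intn(total) // panics if total == 0; real code would validate weights
	for _, t := range targets {
		if n < t.Weight {
			return t.Name
		}
		n -= t.Weight
	}
	return targets[len(targets)-1].Name // unreachable with positive weights
}

func main() {
	targets := []targetModel{
		{Name: "npc-bot-v1", Weight: 50},
		{Name: "npc-bot-v2", Weight: 50},
	}
	r := rand.New(rand.NewSource(1))
	counts := map[string]int{}
	for i := 0; i < 1000; i++ {
		counts[pickTargetModel(targets, r)]++
	}
	fmt.Println(counts) // roughly a 50/50 split
}
```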