From 634ce4a1f07e47e224b70526fca07589bcb9f748 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 16 Jan 2025 23:37:34 +0000 Subject: [PATCH 1/7] Documentation improvement --- api/v1alpha1/inferencemodel_types.go | 35 ++++++++++++++++------------ 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha1/inferencemodel_types.go index 3661820d8..97cfdbde4 100644 --- a/api/v1alpha1/inferencemodel_types.go +++ b/api/v1alpha1/inferencemodel_types.go @@ -21,9 +21,13 @@ import ( ) // InferenceModel is the Schema for the InferenceModels API. +// The InferenceModel is intended to represent a model workload within Kubernetes. // // +kubebuilder:object:root=true // +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="ModelName",type=string,JSONPath=`.spec.modelName` +// +kubebuilder:printcolumn:name="Accepted",type=string,JSONPath=`.status.conditions[?(@.type=="Accepted")].status` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` // +genclient type InferenceModel struct { metav1.TypeMeta `json:",inline"` @@ -51,20 +55,11 @@ type InferenceModelList struct { // performance and latency goals for the model. These workloads are // expected to operate within an InferencePool sharing compute capacity with other // InferenceModels, defined by the Inference Platform Admin. -// -// InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, -// if the name is reused, an error will be shown on the status of a -// InferenceModel that attempted to reuse. The oldest InferenceModel, based on -// creation timestamp, will be selected to remain valid. In the event of a race -// condition, one will be selected at random. type InferenceModelSpec struct { - // ModelName is the name of the model as it will be set in the "model" parameter for an incoming request. - // ModelNames must be unique for a referencing InferencePool - // (names can be reused for a different pool in the same cluster). - // The modelName with the oldest creation timestamp is retained, and the incoming - // InferenceModel is sets the Ready status to false with a corresponding reason. - // In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. - // Names can be reserved without an underlying model configured in the pool. + // ModelName is the name of the model as the users set in the "model" parameter in the requests. + // The name should be unique among the workloads that reference the same backend pool. + // This is the parameter that will be used to match the request with. + // Names can be reserved without implementing an actual model in the pool. // This can be done by specifying a target model and setting the weight to zero, // an error will be returned specifying that no valid target model is found. // @@ -84,9 +79,19 @@ type InferenceModelSpec struct { Criticality *Criticality `json:"criticality,omitempty"` // TargetModels allow multiple versions of a model for traffic splitting. - // If not specified, the target model name is defaulted to the modelName parameter. + // Traffic splitting is handled via weights. The targetModel field is optional, however, + // if not specified, the target model name is defaulted to the modelName parameter. // modelName is often in reference to a LoRA adapter. 
// + // Examples: + // - A model server serving `llama2-7b` may be represented by: + // - setting the modelName to `llama2-7b` and setting no targetModels + // - setting the modelName to `hello-world` and setting a single targetModel to `llama2-7b`, and setting no weights + // - setting modelName to 'my-fine-tune' setting 2 targetModels 'fine-tune-v1' & 'fine-tune-v2' and setting no weights. + // This has the effect of weighing the two models equally + // - setting modelName to 'my-fine-tune' setting 2 targetModels 'fine-tune-v1' w/weight: 10 & 'fine-tune-v2' w/weight: 1. + // This has the effect of the fine-tune-v1 being selected 10x as often as v2 + // // +optional // +kubebuilder:validation:MaxItems=10 // +kubebuilder:validation:XValidation:message="Weights should be set for all models, or none of the models.",rule="self.all(model, has(model.weight)) || self.all(model, !has(model.weight))" @@ -154,7 +159,7 @@ const ( // to exist at request time, the error is processed by the Inference Gateway // and emitted on the appropriate InferenceModel object. type TargetModel struct { - // Name is the name of the adapter or base model, as expected by the ModelServer. + // Name is the name of the LoRA adapter or base model, as expected by the ModelServer. // // +kubebuilder:validation:MaxLength=253 // +kubebuilder:validation:Required From eca40f41b7bf4963c37c50aed84fb370eee11dfb Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 16 Jan 2025 23:40:16 +0000 Subject: [PATCH 2/7] generated manifests --- ...e.networking.x-k8s.io_inferencemodels.yaml | 47 ++++++++++++------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml index bca196059..52a608d6b 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml @@ -14,10 +14,22 @@ spec: singular: inferencemodel scope: Namespaced versions: - - name: v1alpha1 + - additionalPrinterColumns: + - jsonPath: .spec.modelName + name: ModelName + type: string + - jsonPath: .status.conditions[?(@.type=="Accepted")].status + name: Accepted + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 schema: openAPIV3Schema: - description: InferenceModel is the Schema for the InferenceModels API. + description: |- + InferenceModel is the Schema for the InferenceModels API. + The InferenceModel is intended to represent a model workload within Kubernetes. properties: apiVersion: description: |- @@ -47,12 +59,6 @@ spec: performance and latency goals for the model. These workloads are expected to operate within an InferencePool sharing compute capacity with other InferenceModels, defined by the Inference Platform Admin. - - InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, - if the name is reused, an error will be shown on the status of a - InferenceModel that attempted to reuse. The oldest InferenceModel, based on - creation timestamp, will be selected to remain valid. In the event of a race - condition, one will be selected at random. properties: criticality: description: |- @@ -71,13 +77,10 @@ spec: type: string modelName: description: |- - ModelName is the name of the model as it will be set in the "model" parameter for an incoming request. 
- ModelNames must be unique for a referencing InferencePool - (names can be reused for a different pool in the same cluster). - The modelName with the oldest creation timestamp is retained, and the incoming - InferenceModel is sets the Ready status to false with a corresponding reason. - In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. - Names can be reserved without an underlying model configured in the pool. + ModelName is the name of the model as the users set in the "model" parameter in the requests. + The name should be unique among the workloads that reference the same backend pool. + This is the parameter that will be used to match the request with. + Names can be reserved without implementing an actual model in the pool. This can be done by specifying a target model and setting the weight to zero, an error will be returned specifying that no valid target model is found. maxLength: 256 @@ -110,8 +113,18 @@ spec: targetModels: description: |- TargetModels allow multiple versions of a model for traffic splitting. - If not specified, the target model name is defaulted to the modelName parameter. + Traffic splitting is handled via weights. The targetModel field is optional, however, + if not specified, the target model name is defaulted to the modelName parameter. modelName is often in reference to a LoRA adapter. + + Examples: + - A model server serving `llama2-7b` may be represented by: + - setting the modelName to `llama2-7b` and setting no targetModels + - setting the modelName to `hello-world` and setting a single targetModel to `llama2-7b`, and setting no weights + - setting modelName to 'my-fine-tune' setting 2 targetModels 'fine-tune-v1' & 'fine-tune-v2' and setting no weights. + This has the effect of weighing the two models equally + - setting modelName to 'my-fine-tune' setting 2 targetModels 'fine-tune-v1' w/weight: 10 & 'fine-tune-v2' w/weight: 1. + This has the effect of the fine-tune-v1 being selected 10x as often as v2 items: description: |- TargetModel represents a deployed model or a LoRA adapter. The @@ -123,7 +136,7 @@ spec: and emitted on the appropriate InferenceModel object. properties: name: - description: Name is the name of the adapter or base model, + description: Name is the name of the LoRA adapter or base model, as expected by the ModelServer. maxLength: 253 type: string From 22f16f8544eb3896b8a6a1332f246ac963dfe4cf Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Sat, 18 Jan 2025 01:20:15 +0000 Subject: [PATCH 3/7] wording updates --- api/v1alpha1/inferencemodel_types.go | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha1/inferencemodel_types.go index 97cfdbde4..64d914b6a 100644 --- a/api/v1alpha1/inferencemodel_types.go +++ b/api/v1alpha1/inferencemodel_types.go @@ -21,7 +21,12 @@ import ( ) // InferenceModel is the Schema for the InferenceModels API. -// The InferenceModel is intended to represent a model workload within Kubernetes. +// The InferenceModel is intended to represent a model workload (also referred to as a model use case) within Kubernetes. +// The management of the model server is not done by the InferenceModel. Instead, the +// focus of the InferenceModel is to provide the tools needed to effectively manage multiple models +// that share the same base model (currently the focus is LoRA adapters). 
Fields such as TargetModel +// are intended to simplify A/B testing and version rollout of adapters. While Criticality assists with +// governance of multiplexing many usecases over shared hardware. // // +kubebuilder:object:root=true // +kubebuilder:subresource:status @@ -46,15 +51,16 @@ type InferenceModelList struct { Items []InferenceModel `json:"items"` } -// InferenceModelSpec represents the desired state of a specific model use case. This resource is +// InferenceModelSpec represents the desired state of an InferenceModel. This resource is // managed by the "Inference Workload Owner" persona. // // The Inference Workload Owner persona is someone that trains, verifies, and -// leverages a large language model from a model frontend, drives the lifecycle -// and rollout of new versions of those models, and defines the specific +// leverages a large language model focusing on model fidelity performance, and +// less on inference performance (which is managed by the Inference Platform Admin). +// They also drive the lifecycle and rollout of new versions of those models, and defines the specific // performance and latency goals for the model. These workloads are // expected to operate within an InferencePool sharing compute capacity with other -// InferenceModels, defined by the Inference Platform Admin. +// InferenceModels, with specific governance defined by the Inference Platform Admin. type InferenceModelSpec struct { // ModelName is the name of the model as the users set in the "model" parameter in the requests. // The name should be unique among the workloads that reference the same backend pool. From b8f5479f4c58b1df36970325983d929d0df4924a Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 21 Jan 2025 15:57:04 +0000 Subject: [PATCH 4/7] generate new manifests --- ...ence.networking.x-k8s.io_inferencemodels.yaml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml index 52a608d6b..b69120f04 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml @@ -29,7 +29,12 @@ spec: openAPIV3Schema: description: |- InferenceModel is the Schema for the InferenceModels API. - The InferenceModel is intended to represent a model workload within Kubernetes. + The InferenceModel is intended to represent a model workload (also referred to as a model use case) within Kubernetes. + The management of the model server is not done by the InferenceModel. Instead, the + focus of the InferenceModel is to provide the tools needed to effectively manage multiple models + that share the same base model (currently the focus is LoRA adapters). Fields such as TargetModel + are intended to simplify A/B testing and version rollout of adapters. While Criticality assists with + governance of multiplexing many usecases over shared hardware. properties: apiVersion: description: |- @@ -50,15 +55,16 @@ spec: type: object spec: description: |- - InferenceModelSpec represents the desired state of a specific model use case. This resource is + InferenceModelSpec represents the desired state of an InferenceModel. This resource is managed by the "Inference Workload Owner" persona. 
The Inference Workload Owner persona is someone that trains, verifies, and - leverages a large language model from a model frontend, drives the lifecycle - and rollout of new versions of those models, and defines the specific + leverages a large language model focusing on model fidelity performance, and + less on inference performance (which is managed by the Inference Platform Admin). + They also drive the lifecycle and rollout of new versions of those models, and defines the specific performance and latency goals for the model. These workloads are expected to operate within an InferencePool sharing compute capacity with other - InferenceModels, defined by the Inference Platform Admin. + InferenceModels, with specific governance defined by the Inference Platform Admin. properties: criticality: description: |- From 1c9786f2c6956c259c406349701092fc7f0cbb96 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 21 Jan 2025 16:06:46 +0000 Subject: [PATCH 5/7] updating InferencePool wording --- api/v1alpha1/inferencepool_types.go | 4 ++++ .../inference.networking.x-k8s.io_inferencepools.yaml | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/api/v1alpha1/inferencepool_types.go b/api/v1alpha1/inferencepool_types.go index d89b8df53..6f5a4731d 100644 --- a/api/v1alpha1/inferencepool_types.go +++ b/api/v1alpha1/inferencepool_types.go @@ -21,6 +21,10 @@ import ( ) // InferencePool is the Schema for the InferencePools API. +// The InferencePool object is intended to allow for easy maintenance of a set of model servers. +// Best practice is for every model server to share a base model, or, for every model server to be able to serve every 'modelName' that will be available. +// The InferencePool was made for the Inference Platform Admin: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/docs/proposals/002-api-proposal/proposal.md#inference-platform-admin +// The InferencePool depends on the K8s Gateway, and relies on the gateway controller to manage reconciliation. // // +kubebuilder:object:root=true // +kubebuilder:subresource:status diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml index 8e0ff54d2..e0d2340d7 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml @@ -17,7 +17,12 @@ spec: - name: v1alpha1 schema: openAPIV3Schema: - description: InferencePool is the Schema for the InferencePools API. + description: |- + InferencePool is the Schema for the InferencePools API. + The InferencePool object is intended to allow for easy maintenance of a set of model servers. + Best practice is for every model server to share a base model, or, for every model server to be able to serve every 'modelName' that will be available. + The InferencePool was made for the Inference Platform Admin: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/docs/proposals/002-api-proposal/proposal.md#inference-platform-admin + The InferencePool depends on the K8s Gateway, and relies on the gateway controller to manage reconciliation. 
properties: apiVersion: description: |- From bb516eb0223c4c75ff011d77d38380425b065801 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 21 Jan 2025 16:14:54 +0000 Subject: [PATCH 6/7] updating criticality comment to link to discussion issue --- api/v1alpha1/inferencemodel_types.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha1/inferencemodel_types.go index 64d914b6a..4ef7e307f 100644 --- a/api/v1alpha1/inferencemodel_types.go +++ b/api/v1alpha1/inferencemodel_types.go @@ -74,10 +74,7 @@ type InferenceModelSpec struct { ModelName string `json:"modelName"` // Criticality defines how important it is to serve the model compared to other models referencing the same pool. - // Criticality impacts how traffic is handled in resource constrained situations. It handles this by - // queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will - // fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness, - // and the proportionality of fairness will be configurable. + // TODO: Update field upon resolution of: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/213 // // Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field. // Any implementations that may consume this field may treat an unset value as the 'Standard' range. From 5895cc33251b7952a6d6fc2c06bb112a98cfaf1d Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 23 Jan 2025 00:41:15 +0000 Subject: [PATCH 7/7] grammatical fixes --- api/v1alpha1/inferencemodel_types.go | 4 ++-- .../inference.networking.x-k8s.io_inferencemodels.yaml | 8 ++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha1/inferencemodel_types.go index 4ef7e307f..83657c341 100644 --- a/api/v1alpha1/inferencemodel_types.go +++ b/api/v1alpha1/inferencemodel_types.go @@ -90,9 +90,9 @@ type InferenceModelSpec struct { // - A model server serving `llama2-7b` may be represented by: // - setting the modelName to `llama2-7b` and setting no targetModels // - setting the modelName to `hello-world` and setting a single targetModel to `llama2-7b`, and setting no weights - // - setting modelName to 'my-fine-tune' setting 2 targetModels 'fine-tune-v1' & 'fine-tune-v2' and setting no weights. + // - setting modelName to 'my-fine-tune', setting 2 targetModels 'fine-tune-v1' & 'fine-tune-v2', and setting no weights. // This has the effect of weighing the two models equally - // - setting modelName to 'my-fine-tune' setting 2 targetModels 'fine-tune-v1' w/weight: 10 & 'fine-tune-v2' w/weight: 1. + // - setting modelName to 'my-fine-tune', setting 2 targetModels 'fine-tune-v1' w/weight: 10 & 'fine-tune-v2' w/weight: 1. // This has the effect of the fine-tune-v1 being selected 10x as often as v2 // // +optional diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml index b69120f04..aeefc2ce8 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml @@ -69,10 +69,6 @@ spec: criticality: description: |- Criticality defines how important it is to serve the model compared to other models referencing the same pool. 
- Criticality impacts how traffic is handled in resource constrained situations. It handles this by - queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will - fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness, - and the proportionality of fairness will be configurable. Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field. Any implementations that may consume this field may treat an unset value as the 'Standard' range. @@ -127,9 +123,9 @@ spec: - A model server serving `llama2-7b` may be represented by: - setting the modelName to `llama2-7b` and setting no targetModels - setting the modelName to `hello-world` and setting a single targetModel to `llama2-7b`, and setting no weights - - setting modelName to 'my-fine-tune' setting 2 targetModels 'fine-tune-v1' & 'fine-tune-v2' and setting no weights. + - setting modelName to 'my-fine-tune', setting 2 targetModels 'fine-tune-v1' & 'fine-tune-v2', and setting no weights. This has the effect of weighing the two models equally - - setting modelName to 'my-fine-tune' setting 2 targetModels 'fine-tune-v1' w/weight: 10 & 'fine-tune-v2' w/weight: 1. + - setting modelName to 'my-fine-tune', setting 2 targetModels 'fine-tune-v1' w/weight: 10 & 'fine-tune-v2' w/weight: 1. This has the effect of the fine-tune-v1 being selected 10x as often as v2 items: description: |-
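For illustration, a minimal InferenceModel manifest for the 'my-fine-tune' traffic-splitting example described in the comments above might look like the following sketch. It assumes only the group/version and fields shown in these diffs (inference.networking.x-k8s.io/v1alpha1, modelName, targetModels with name and weight); any other fields the API may require, such as a reference to the serving InferencePool, are omitted here.

---
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: InferenceModel
metadata:
  name: my-fine-tune
spec:
  # Incoming requests whose "model" parameter is "my-fine-tune" match this object.
  modelName: my-fine-tune
  # Criticality is left unset; implementations may treat an unset value as the 'Standard' range.
  targetModels:
    # Per the CRD validation rule, weights must be set for all target models or for none of them.
    # With weights 10 and 1, fine-tune-v1 is selected 10x as often as fine-tune-v2.
    - name: fine-tune-v1
      weight: 10
    - name: fine-tune-v2
      weight: 1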