diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha1/inferencemodel_types.go
index 847483036..8e81b4e8f 100644
--- a/api/v1alpha1/inferencemodel_types.go
+++ b/api/v1alpha1/inferencemodel_types.go
@@ -20,12 +20,32 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
-// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
+// InferenceModel is the Schema for the InferenceModels API.
+//
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+// +genclient
+type InferenceModel struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	Spec   InferenceModelSpec   `json:"spec,omitempty"`
+	Status InferenceModelStatus `json:"status,omitempty"`
+}
+
+// InferenceModelList contains a list of InferenceModel.
+//
+// +kubebuilder:object:root=true
+type InferenceModelList struct {
+	metav1.TypeMeta `json:",inline"`
+	metav1.ListMeta `json:"metadata,omitempty"`
+	Items           []InferenceModel `json:"items"`
+}
 
-// InferenceModelSpec represents a specific model use case. This resource is
+// InferenceModelSpec represents the desired state of a specific model use case. This resource is
 // managed by the "Inference Workload Owner" persona.
 //
-// The Inference Workload Owner persona is: a team that trains, verifies, and
+// The Inference Workload Owner persona is someone who trains, verifies, and
 // leverages a large language model from a model frontend, drives the lifecycle
 // and rollout of new versions of those models, and defines the specific
 // performance and latency goals for the model. These workloads are
@@ -38,7 +58,7 @@ import (
 // creation timestamp, will be selected to remain valid. In the event of a race
 // condition, one will be selected at random.
 type InferenceModelSpec struct {
-	// The name of the model as the users set in the "model" parameter in the requests.
+	// ModelName is the name of the model as users set it in the "model" parameter in requests.
 	// The name should be unique among the workloads that reference the same backend pool.
 	// This is the parameter that will be used to match the request with. In the future, we may
 	// allow to match on other request parameters. The other approach to support matching
@@ -47,22 +67,25 @@ type InferenceModelSpec struct {
 	// This can be done by specifying a target model and setting the weight to zero,
 	// an error will be returned specifying that no valid target model is found.
 	//
-	// +optional
 	// +kubebuilder:validation:MaxLength=253
-	ModelName string `json:"modelName,omitempty"`
-	// Defines how important it is to serve the model compared to other models referencing the same pool.
+	// +kubebuilder:validation:Required
+	ModelName string `json:"modelName"`
+
+	// Criticality defines how important it is to serve the model compared to other models referencing the same pool.
 	//
 	// +optional
 	// +kubebuilder:default="Default"
 	Criticality *Criticality `json:"criticality,omitempty"`
-	// Allow multiple versions of a model for traffic splitting.
+
+	// TargetModels allow multiple versions of a model for traffic splitting.
 	// If not specified, the target model name is defaulted to the modelName parameter.
 	// modelName is often in reference to a LoRA adapter.
 	//
 	// +optional
 	// +kubebuilder:validation:MaxItems=10
 	TargetModels []TargetModel `json:"targetModels,omitempty"`
-	// Reference to the inference pool, the pool must exist in the same namespace.
+
+	// PoolRef is a reference to the inference pool; the pool must exist in the same namespace.
 	//
 	// +kubebuilder:validation:Required
 	PoolRef PoolObjectReference `json:"poolRef"`
@@ -93,39 +116,54 @@ type PoolObjectReference struct {
 	// +kubebuilder:validation:MinLength=1
 	// +kubebuilder:validation:MaxLength=253
 	// +kubebuilder:validation:Required
-	Name string `json:"name,omitempty"`
+	Name string `json:"name"`
 }
 
-// Defines how important it is to serve the model compared to other models.
+// Criticality defines how important it is to serve the model compared to other models.
 // +kubebuilder:validation:Enum=Critical;Default;Sheddable
type Criticality string
 
 const (
-	// Most important. Requests to this band will be shed last.
+	// Critical defines the highest level of criticality. Requests to this band will be shed last.
 	Critical Criticality = "Critical"
-	// More important than Sheddable, less important than Critical.
-	// Requests in this band will be shed before critical traffic.
-	// +kubebuilder:default=Default
+
+	// Default defines the default criticality level and is more important than Sheddable but less
+	// important than Critical. Requests in this band will be shed before critical traffic.
 	Default Criticality = "Default"
-	// Least important. Requests to this band will be shed before all other bands.
+
+	// Sheddable defines the lowest level of criticality. Requests to this band will be shed before
+	// all other bands.
 	Sheddable Criticality = "Sheddable"
 )
 
 // TargetModel represents a deployed model or a LoRA adapter. The
 // Name field is expected to match the name of the LoRA adapter
 // (or base model) as it is registered within the model server. Inference
-// Gateway assumes that the model exists on the model server and is the
+// Gateway assumes that the model exists on the model server and it's the
 // responsibility of the user to validate a correct match. Should a model fail
-// to exist at request time, the error is processed by the Instance Gateway,
-// and then emitted on the appropriate InferenceModel object.
+// to exist at request time, the error is processed by the Inference Gateway
+// and emitted on the appropriate InferenceModel object.
 type TargetModel struct {
-	// The name of the adapter as expected by the ModelServer.
+	// Name is the name of the adapter as expected by the ModelServer.
 	//
-	// +optional
 	// +kubebuilder:validation:MaxLength=253
-	Name string `json:"name,omitempty"`
+	// +kubebuilder:validation:Required
+	Name string `json:"name"`
+
 	// Weight is used to determine the proportion of traffic that should be
-	// sent to this target model when multiple versions of the model are specified.
+	// sent to this model when multiple target models are specified.
+	//
+	// Weight defines the proportion of requests forwarded to the specified
+	// model. This is computed as weight/(sum of all weights in this
+	// TargetModels list). For non-zero values, there may be some epsilon from
+	// the exact proportion defined here depending on the precision an
+	// implementation supports. Weight is not a percentage and the sum of
+	// weights does not need to equal 100.
+	//
+	// If only one model is specified and it has a weight greater than 0, 100%
+	// of the traffic is forwarded to that model. If weight is set to 0, no
+	// traffic should be forwarded for this model. If unspecified, weight
+	// defaults to 1.
 	//
 	// +optional
 	// +kubebuilder:default=1
@@ -140,28 +178,6 @@ type InferenceModelStatus struct {
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 }
 
-// +kubebuilder:object:root=true
-// +kubebuilder:subresource:status
-// +genclient
-
-// InferenceModel is the Schema for the InferenceModels API
-type InferenceModel struct {
-	metav1.TypeMeta   `json:",inline"`
-	metav1.ObjectMeta `json:"metadata,omitempty"`
-
-	Spec   InferenceModelSpec   `json:"spec,omitempty"`
-	Status InferenceModelStatus `json:"status,omitempty"`
-}
-
-// +kubebuilder:object:root=true
-
-// InferenceModelList contains a list of InferenceModel
-type InferenceModelList struct {
-	metav1.TypeMeta `json:",inline"`
-	metav1.ListMeta `json:"metadata,omitempty"`
-	Items           []InferenceModel `json:"items"`
-}
-
 func init() {
 	SchemeBuilder.Register(&InferenceModel{}, &InferenceModelList{})
 }
diff --git a/api/v1alpha1/inferencepool_types.go b/api/v1alpha1/inferencepool_types.go
index 7be27c163..666d0ac1c 100644
--- a/api/v1alpha1/inferencepool_types.go
+++ b/api/v1alpha1/inferencepool_types.go
@@ -20,12 +20,31 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
-// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
+// InferencePool is the Schema for the InferencePools API.
+//
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+// +genclient
+type InferencePool struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	Spec   InferencePoolSpec   `json:"spec,omitempty"`
+	Status InferencePoolStatus `json:"status,omitempty"`
+}
+
+// InferencePoolList contains a list of InferencePool.
+//
+// +kubebuilder:object:root=true
+type InferencePoolList struct {
+	metav1.TypeMeta `json:",inline"`
+	metav1.ListMeta `json:"metadata,omitempty"`
+	Items           []InferencePool `json:"items"`
+}
 
 // InferencePoolSpec defines the desired state of InferencePool
 type InferencePoolSpec struct {
-
-	// Selector uses a map of label to watch model server pods
+	// Selector defines a map of labels to watch model server pods
 	// that should be included in the InferencePool. ModelServers should not
 	// be with any other Service or InferencePool, that behavior is not supported
 	// and will result in sub-optimal utilization.
@@ -33,16 +52,15 @@ type InferencePoolSpec struct {
 	// map used for Service selectors instead of the full Kubernetes LabelSelector type.
 	//
 	// +kubebuilder:validation:Required
-	Selector map[LabelKey]LabelValue `json:"selector,omitempty"`
+	Selector map[LabelKey]LabelValue `json:"selector"`
 
-	// TargetPortNumber is the port number that the model servers within the pool expect
-	// to receive traffic from.
-	// This maps to the TargetPort in: https://pkg.go.dev/k8s.io/api/core/v1#ServicePort
+	// TargetPortNumber defines the port number to access the selected model servers.
+	// The number must be in the range 1 to 65535.
 	//
-	// +kubebuilder:validation:Minimum=0
+	// +kubebuilder:validation:Minimum=1
 	// +kubebuilder:validation:Maximum=65535
 	// +kubebuilder:validation:Required
-	TargetPortNumber int32 `json:"targetPortNumber,omitempty"`
+	TargetPortNumber int32 `json:"targetPortNumber"`
 }
 
 // Originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731
@@ -87,33 +105,10 @@ type LabelValue string
 
 // InferencePoolStatus defines the observed state of InferencePool
 type InferencePoolStatus struct {
-
 	// Conditions track the state of the InferencePool.
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 }
 
-// +kubebuilder:object:root=true
-// +kubebuilder:subresource:status
-// +genclient
-
-// InferencePool is the Schema for the Inferencepools API
-type InferencePool struct {
-	metav1.TypeMeta   `json:",inline"`
-	metav1.ObjectMeta `json:"metadata,omitempty"`
-
-	Spec   InferencePoolSpec   `json:"spec,omitempty"`
-	Status InferencePoolStatus `json:"status,omitempty"`
-}
-
-// +kubebuilder:object:root=true
-
-// InferencePoolList contains a list of InferencePool
-type InferencePoolList struct {
-	metav1.TypeMeta `json:",inline"`
-	metav1.ListMeta `json:"metadata,omitempty"`
-	Items           []InferencePool `json:"items"`
-}
-
 func init() {
 	SchemeBuilder.Register(&InferencePool{}, &InferencePoolList{})
 }
diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml
index 757cf0d09..7fe1d561a 100644
--- a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml
+++ b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml
@@ -17,7 +17,7 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: InferenceModel is the Schema for the InferenceModels API
+        description: InferenceModel is the Schema for the InferenceModels API.
         properties:
           apiVersion:
             description: |-
@@ -38,10 +38,10 @@ spec:
             type: object
           spec:
             description: |-
-              InferenceModelSpec represents a specific model use case. This resource is
+              InferenceModelSpec represents the desired state of a specific model use case. This resource is
               managed by the "Inference Workload Owner" persona.
 
-              The Inference Workload Owner persona is: a team that trains, verifies, and
+              The Inference Workload Owner persona is someone who trains, verifies, and
               leverages a large language model from a model frontend, drives the lifecycle
               and rollout of new versions of those models, and defines the specific
               performance and latency goals for the model. These workloads are
@@ -56,8 +56,8 @@ spec:
             properties:
               criticality:
                 default: Default
-                description: Defines how important it is to serve the model compared
-                  to other models referencing the same pool.
+                description: Criticality defines how important it is to serve the
+                  model compared to other models referencing the same pool.
                 enum:
                 - Critical
                 - Default
@@ -65,7 +65,7 @@ spec:
                 type: string
               modelName:
                 description: |-
-                  The name of the model as the users set in the "model" parameter in the requests.
+                  ModelName is the name of the model as users set it in the "model" parameter in requests.
                   The name should be unique among the workloads that reference the same backend pool.
                   This is the parameter that will be used to match the request with. In the future, we may
                   allow to match on other request parameters. The other approach to support matching
@@ -76,8 +76,8 @@ spec:
                 maxLength: 253
                 type: string
               poolRef:
-                description: Reference to the inference pool, the pool must exist
-                  in the same namespace.
+                description: PoolRef is a reference to the inference pool; the pool
+                  must exist in the same namespace.
                 properties:
                   group:
                     default: inference.networking.x-k8s.io
@@ -102,7 +102,7 @@ spec:
                 type: object
               targetModels:
                 description: |-
-                  Allow multiple versions of a model for traffic splitting.
+                  TargetModels allow multiple versions of a model for traffic splitting.
                   If not specified, the target model name is defaulted to the modelName parameter.
                   modelName is often in reference to a LoRA adapter.
                 items:
                   description: |-
                     TargetModel represents a deployed model or a LoRA adapter. The
                     Name field is expected to match the name of the LoRA adapter
                     (or base model) as it is registered within the model server. Inference
-                    Gateway assumes that the model exists on the model server and is the
+                    Gateway assumes that the model exists on the model server and it's the
                     responsibility of the user to validate a correct match. Should a model fail
-                    to exist at request time, the error is processed by the Instance Gateway,
-                    and then emitted on the appropriate InferenceModel object.
+                    to exist at request time, the error is processed by the Inference Gateway
+                    and emitted on the appropriate InferenceModel object.
                   properties:
                     name:
-                      description: The name of the adapter as expected by the ModelServer.
+                      description: Name is the name of the adapter as expected by
+                        the ModelServer.
                       maxLength: 253
                       type: string
                     weight:
                       default: 1
                       description: |-
                         Weight is used to determine the proportion of traffic that should be
-                        sent to this target model when multiple versions of the model are specified.
+                        sent to this model when multiple target models are specified.
+
+                        Weight defines the proportion of requests forwarded to the specified
+                        model. This is computed as weight/(sum of all weights in this
+                        TargetModels list). For non-zero values, there may be some epsilon from
+                        the exact proportion defined here depending on the precision an
+                        implementation supports. Weight is not a percentage and the sum of
+                        weights does not need to equal 100.
+
+                        If only one model is specified and it has a weight greater than 0, 100%
+                        of the traffic is forwarded to that model. If weight is set to 0, no
+                        traffic should be forwarded for this model. If unspecified, weight
+                        defaults to 1.
                       format: int32
                       maximum: 1000000
                       minimum: 0
                       type: integer
+                  required:
+                  - name
                   type: object
                 maxItems: 10
                 type: array
             required:
+            - modelName
             - poolRef
             type: object
           status:
diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
index c006a79f0..d4500a135 100644
--- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
+++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
@@ -17,7 +17,7 @@ spec:
   - name: v1alpha1
     schema:
      openAPIV3Schema:
-        description: InferencePool is the Schema for the Inferencepools API
+        description: InferencePool is the Schema for the InferencePools API.
         properties:
           apiVersion:
             description: |-
@@ -58,7 +58,7 @@ spec:
                  pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$
                  type: string
                description: |-
-                  Selector uses a map of label to watch model server pods
+                  Selector defines a map of labels to watch model server pods
                   that should be included in the InferencePool. ModelServers should not
                   be with any other Service or InferencePool, that behavior is not supported
                   and will result in sub-optimal utilization.
@@ -67,12 +67,11 @@ spec:
                 type: object
               targetPortNumber:
                 description: |-
-                  TargetPortNumber is the port number that the model servers within the pool expect
-                  to receive traffic from.
-                  This maps to the TargetPort in: https://pkg.go.dev/k8s.io/api/core/v1#ServicePort
+                  TargetPortNumber defines the port number to access the selected model servers.
+                  The number must be in the range 1 to 65535.
                 format: int32
                 maximum: 65535
-                minimum: 0
+                minimum: 1
                 type: integer
             required:
             - selector
diff --git a/config/samples/gateway_v1alpha1_inferencemodel.yaml b/config/samples/gateway_v1alpha1_inferencemodel.yaml
index 120ed6bce..f1f46a2f1 100644
--- a/config/samples/gateway_v1alpha1_inferencemodel.yaml
+++ b/config/samples/gateway_v1alpha1_inferencemodel.yaml
@@ -6,16 +6,12 @@ metadata:
     app.kubernetes.io/managed-by: kustomize
   name: inferencemodel-sample
 spec:
-  InferenceModels:
-  - modelName: sql-code-assist
-  - modelName: npc-bot
-    objective:
-      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50ms
-    targetModels:
-      targetModelName: npc-bot-v1
-      weight: 50
-      targetModelName: npc-bot-v2
-      weight: 50
-    poolRefs:
-    - name: llama-2-pool
-    - name: gemini-pool
+  criticality: Critical
+  modelName: sql-code-assist
+  poolRef:
+    name: inferencepool-sample
+  targetModels:
+  - name: npc-bot-v1
+    weight: 50
+  - name: npc-bot-v2
+    weight: 50
diff --git a/config/samples/gateway_v1alpha1_inferencepool.yaml b/config/samples/gateway_v1alpha1_inferencepool.yaml
index 54046489d..42ac62965 100644
--- a/config/samples/gateway_v1alpha1_inferencepool.yaml
+++ b/config/samples/gateway_v1alpha1_inferencepool.yaml
@@ -6,6 +6,6 @@ metadata:
     app.kubernetes.io/managed-by: kustomize
   name: inferencepool-sample
 spec:
-  serviceRefs:
-  - gemini-jetstream-tpu-v5e-service
-  - gemini-vllm-a100-service
+  selector:
+    app: npc-bot
+  targetPortNumber: 8000
diff --git a/examples/poc/manifests/inferencepool-with-model.yaml b/examples/poc/manifests/inferencepool-with-model.yaml
index f05823eaa..b5980e2ea 100644
--- a/examples/poc/manifests/inferencepool-with-model.yaml
+++ b/examples/poc/manifests/inferencepool-with-model.yaml
@@ -6,7 +6,7 @@ metadata:
 spec:
   targetPortNumber: 8000
   selector:
-    "app": "vllm-llama2-7b-pool"
+    app: vllm-llama2-7b-pool
 ---
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: InferenceModel
 metadata:
@@ -29,4 +29,3 @@ spec:
     weight: 50
   - name: tweet-summary-1
     weight: 50
-
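
Note: the Weight semantics documented on TargetModel above amount to a standard
weighted random pick. Each target model receives weight/(sum of all weights in
the TargetModels list) of the traffic, and an unspecified weight counts as 1. A
minimal Go sketch of that selection logic, using hypothetical names
(targetModel, weightOf, pick) rather than anything defined in this repository:

	package main

	import (
		"fmt"
		"math/rand"
	)

	type targetModel struct {
		name   string
		weight *int32 // nil means unspecified; the API defaults it to 1
	}

	func weightOf(m targetModel) int32 {
		if m.weight == nil {
			return 1 // if unspecified, weight defaults to 1
		}
		return *m.weight
	}

	// pick returns a model name with probability weight/(sum of all weights).
	func pick(models []targetModel, r *rand.Rand) string {
		var total int32
		for _, m := range models {
			total += weightOf(m)
		}
		if total == 0 {
			return "" // all weights are zero: no traffic should be forwarded
		}
		n := r.Int31n(total)
		for _, m := range models {
			w := weightOf(m)
			if n < w {
				return m.name
			}
			n -= w
		}
		return "" // unreachable while total > 0
	}

	func main() {
		// Mirrors the inferencemodel-sample above: two targets, weight 50 each.
		fifty := int32(50)
		models := []targetModel{
			{name: "npc-bot-v1", weight: &fifty},
			{name: "npc-bot-v2", weight: &fifty},
		}
		r := rand.New(rand.NewSource(1))
		counts := map[string]int{}
		for i := 0; i < 10000; i++ {
			counts[pick(models, r)]++
		}
		fmt.Println(counts) // expect roughly a 50/50 split between the two models
	}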