[v0.1 API Review] Cleaning up optional fields/clearer wording (#185)
* feedback updates
* updating the actual API obj as well
* Generating new manifests/ Additional Criticality wording
* More feedback updates
* generating new manifests
* feedback updates
* generated code
* build fix
* test fixes
* Renaming the 'base' Criticality band to 'Standard'
* detailing how to handle unset criticality
```diff
@@ -83,6 +89,7 @@ type InferenceModelSpec struct {
     // TargetModels allow multiple versions of a model for traffic splitting.
     //
     // +optional
     // +kubebuilder:validation:MaxItems=10
+    // +kubebuilder:validation:XValidation:message="Weights should be set for all models, or none of the models.",rule="self.all(model, has(model.weight)) || self.all(model, !has(model.weight))"
```
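The new XValidation line enforces that weights are either set on every target model or on none of them. Purely as an illustration of that invariant (not code from this PR), the same check in plain Go might look like the sketch below; `TargetModel` is a trimmed stand-in for the API type and the model names are made up:

```go
package main

import "fmt"

// TargetModel is a trimmed stand-in for the API type; Weight is a pointer
// because the field is optional, which is what lets the CEL rule use has().
type TargetModel struct {
	Name   string
	Weight *int
}

// weightsAllOrNone reports whether every target model sets a weight, or none
// do — the same invariant the XValidation rule expresses in CEL.
func weightsAllOrNone(models []TargetModel) bool {
	withWeight := 0
	for _, m := range models {
		if m.Weight != nil {
			withWeight++
		}
	}
	return withWeight == 0 || withWeight == len(models)
}

func main() {
	w := 50
	// One weighted model and one unweighted model violates the rule.
	mixed := []TargetModel{{Name: "llama-base"}, {Name: "llama-lora", Weight: &w}}
	fmt.Println(weightsAllOrNone(mixed)) // false
}
```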
The same file also gains the Criticality wording (around line 131):

```diff
@@ ... @@
+    // Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional(use a pointer), and set no default.
+    // This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
```
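Since any reference to Criticality must stay an optional pointer with no default, every consumer has to decide what an unset value means. Below is a minimal sketch of that decision point, assuming for illustration that nil is treated like the "Standard" band mentioned in the commit list; the band names themselves are assumptions here, since this diff only shows the comment for the most important band:

```go
package main

import "fmt"

// Criticality mirrors the bounded enum from the diff. The concrete band
// names below are assumptions for this sketch (Standard comes from the
// "Renaming the 'base' Criticality band to 'Standard'" commit).
type Criticality string

const (
	Critical  Criticality = "Critical"
	Standard  Criticality = "Standard"
	Sheddable Criticality = "Sheddable"
)

// effectiveCriticality resolves the optional field: nil is interpreted as
// Standard in this sketch — the API itself deliberately sets no default,
// so this choice belongs to the consumer, not to the API.
func effectiveCriticality(c *Criticality) Criticality {
	if c == nil {
		return Standard
	}
	return *c
}

func main() {
	var unset *Criticality
	crit := Critical
	fmt.Println(effectiveCriticality(unset)) // Standard
	fmt.Println(effectiveCriticality(&crit)) // Critical
}
```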
docs/proposals/002-api-proposal/proposal.md (+21 −19)
```diff
@@ -78,6 +78,7 @@ The API design is based on these axioms:
 - This solution should be composable with other Gateway solutions and flexible to fit customer needs
 - The MVP will heavily assume requests are done using the OpenAI spec, but open to extension in the future
 - The Gateway should route in a way that does not generate a queue of requests at the model server level
+- Model serving differs from web-serving in critical ways. One of these is the existence of multiple models for the same service, which can materially impact behavior, depending on the model served. As opposed to a web-service that has mechanisms to render implementation changes invisible to an end user
 
 The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused on lower-level scheduling. And the API follows that similar logic, which lead to the proposal of the **InferencePool**.
 
```
```diff
@@ -126,27 +127,21 @@ type InferencePool struct {
 
 type InferencePoolSpec struct {
     // ModelServerSelector uses label selection to watch model server pods
-    // that should be included in the InferencePool. ModelServers should not
-    // be with any other Service or InferencePool, that behavior is not supported
...
 // InferenceModel represents a set of Models/Adapters that are multiplexed onto one
-// or more Inferencepools. This resource is managed by the "Inference Workload Owner"
+// or more InferencePools. This resource is managed by the "Inference Workload Owner"
 // persona. The Inference Workload Owner persona is: a team that trains, verifies, and
 // leverages a large language model from a model frontend, drives the lifecycle
 // and rollout of new versions of those models, and defines the specific
 // performance and latency goals for the model. These workloads are
-// expected to operate within an InferencePool sharing compute capacity with other
-// InferenceModels, defined by the Inference Platform Admin. We allow a user who
-// has multiple InferenceModels across multiple pools (with the same config) to
-// specify the configuration exactly once, and deploy to many pools
-// simultaneously. Enabling a simpler config and single source of truth
-// for a given user. InferenceModel ModelNames are unique for a given InferencePool,
+// expected to coexist within an InferencePool: sharing compute capacity with other
+// InferenceModels, with sharing limitations defined by the Inference Platform Admin.
 type InferenceModel struct {
     metav1.ObjectMeta
     metav1.TypeMeta
```
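The ModelServerSelector comment above only says "label selection", so the concrete field type is not visible in this hunk. As a rough sketch, selecting a pool's model server pods could look like the following, assuming a standard metav1.LabelSelector and purely illustrative labels:

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// InferencePoolSpec is a trimmed stand-in for the API type in the diff; the
// LabelSelector type is an assumption based on the "label selection" wording.
type InferencePoolSpec struct {
	ModelServerSelector metav1.LabelSelector
}

func main() {
	// Select the pods that serve this pool; the label key/value are made up.
	spec := InferencePoolSpec{
		ModelServerSelector: metav1.LabelSelector{
			MatchLabels: map[string]string{"app": "vllm-llama2-7b"},
		},
	}
	fmt.Println(spec.ModelServerSelector.MatchLabels)
}
```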
```diff
@@ -155,28 +150,35 @@ type InferenceModel struct {
 }
 
 type InferenceModelSpec struct {
-    // The name of the model as the users set in the "model" parameter in the requests.
-    // The name should be unique among the workloads that reference the same backend pool.
-    // This is the parameter that will be used to match the request with. In the future, we may
-    // allow to match on other request parameters. The other approach to support matching on
-    // on other request parameters is to use a different ModelName per HTTPFilter.
-    // Names can be reserved without implementing an actual model in the pool.
+    // The name of the model as it will be set in the "model" parameter for an incoming request.
+    // ModelNames are expected to be unique for a specific InferencePool
+    // (names can be reused for a different pool in the same cluster).
+    // The modelName with the oldest creation timestamp is retained, and the incoming
+    // InferenceModel is sets the Ready status to false with a corresponding reason.
+    // In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
+    // Names can be reserved without an underlying model configured in the pool.
     // This can be done by specifying a target model and setting the weight to zero,
     // an error will be returned specifying that no valid target model is found.
     ModelName string
     // Optional
     // Defines how important it is to serve the model compared to other models referencing the same pool.
+    // Criticality impacts how traffic is handled in resource constrained situations. It handles this by
+    // queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
+    // fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
+    // and the proportionality of fairness will be configurable.
     Criticality *Criticality
     // Optional.
-    // Allow multiple versions of a model for traffic splitting.
-    // If not specified, the target model name is defaulted to the ModelName parameter.
+    // Allow multiple versions of a model for traffic splitting.
+    // If not specified, the target model name is defaulted to the ModelName parameter.
     // ModelName is often in reference to a LoRA adapter.
     TargetModels []TargetModel
     // Reference to the InferencePool that the model registers to. It must exist in the same namespace.
     PoolReference *LocalObjectReference
 }
 
 // Defines how important it is to serve the model compared to other models.
+// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field should ALWAYS be optional(use a pointer), and set no default.
+// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
 type Criticality string
 const (
     // Most important. Requests to this band will be shed last.
```
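Putting the pieces of this hunk together, a spec that splits traffic between two target models, opts into a criticality band, and points at a pool might look roughly like the sketch below; the types are trimmed stand-ins for the ones in the diff, the band name follows the earlier sketch's assumption, and the model, adapter, and pool names are invented:

```go
package main

import "fmt"

// Trimmed stand-ins for the API types shown in the diff above.
type Criticality string

type TargetModel struct {
	Name   string
	Weight *int
}

type LocalObjectReference struct {
	Name string
}

type InferenceModelSpec struct {
	ModelName     string
	Criticality   *Criticality
	TargetModels  []TargetModel
	PoolReference *LocalObjectReference
}

func main() {
	critical := Criticality("Critical")
	stable, canary := 90, 10

	// One client-facing ModelName, split 90/10 between a stable and a canary
	// adapter; both weights are set, satisfying the all-or-none rule.
	spec := InferenceModelSpec{
		ModelName:   "sql-assistant",
		Criticality: &critical,
		TargetModels: []TargetModel{
			{Name: "sql-assistant-v1", Weight: &stable},
			{Name: "sql-assistant-v2-canary", Weight: &canary},
		},
		PoolReference: &LocalObjectReference{Name: "llama-pool"},
	}
	fmt.Printf("%s -> %d target models\n", spec.ModelName, len(spec.TargetModels))
}
```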
```diff
@@ -200,7 +202,7 @@ type TargetModel struct {
     Name string
     // Weight is used to determine the percentage of traffic that should be
     // sent to this target model when multiple versions of the model are specified.
-    Weight int
+    Weight *int
 }
 
 // LocalObjectReference identifies an API object within the namespace of the
```
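With Weight now a *int, consumers can no longer read the zero value as "unset". The sketch below shows one way of resolving optional weights into a traffic split; the even split when no weights are set is an assumption for illustration, not something this PR specifies:

```go
package main

import "fmt"

// TargetModel mirrors the post-change field shape: Weight is optional.
type TargetModel struct {
	Name   string
	Weight *int
}

// trafficSplit returns each target model's share of traffic as a fraction.
// Per the validation rule earlier in this PR, weights are either set on all
// models or on none; the even split for the "none" case is an assumption.
func trafficSplit(models []TargetModel) map[string]float64 {
	split := make(map[string]float64, len(models))
	total := 0
	for _, m := range models {
		if m.Weight == nil {
			// No weights anywhere: divide traffic evenly across the models.
			for _, n := range models {
				split[n.Name] = 1.0 / float64(len(models))
			}
			return split
		}
		total += *m.Weight
	}
	if total == 0 {
		// All weights zero: the name is reserved with no valid target.
		return split
	}
	for _, m := range models {
		split[m.Name] = float64(*m.Weight) / float64(total)
	}
	return split
}

func main() {
	stable, canary := 90, 10
	weighted := []TargetModel{
		{Name: "stable", Weight: &stable},
		{Name: "canary", Weight: &canary},
	}
	fmt.Println(trafficSplit(weighted)) // map[canary:0.1 stable:0.9]
}
```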