
Commit 271b638

[v0.1 API Review] Cleaning up optional fields/clearer wording (#185)
* feedback updates
* updating the actual API obj as well
* Generating new manifests / Additional Criticality wording
* More feedback updates
* generating new manifests
* feedback updates
* generated code
* build fix
* test fixes
* Renaming the 'base' Criticality band to 'Standard'
* detailing how to handle unset criticality
1 parent 5d000fa commit 271b638

9 files changed, +102 −75 lines

api/v1alpha1/inferencemodel_types.go (+24 −17)
@@ -58,23 +58,29 @@ type InferenceModelList struct {
 // creation timestamp, will be selected to remain valid. In the event of a race
 // condition, one will be selected at random.
 type InferenceModelSpec struct {
-	// ModelName is the name of the model as the users set in the "model" parameter in the requests.
-	// The name should be unique among the workloads that reference the same backend pool.
-	// This is the parameter that will be used to match the request with. In the future, we may
-	// allow to match on other request parameters. The other approach to support matching
-	// on other request parameters is to use a different ModelName per HTTPFilter.
-	// Names can be reserved without implementing an actual model in the pool.
+	// ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
+	// ModelNames must be unique for a referencing InferencePool
+	// (names can be reused for a different pool in the same cluster).
+	// The modelName with the oldest creation timestamp is retained, and the incoming
+	// InferenceModel sets the Ready status to false with a corresponding reason.
+	// In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
+	// Names can be reserved without an underlying model configured in the pool.
 	// This can be done by specifying a target model and setting the weight to zero,
 	// an error will be returned specifying that no valid target model is found.
 	//
-	// +kubebuilder:validation:MaxLength=253
+	// +kubebuilder:validation:MaxLength=256
 	// +kubebuilder:validation:Required
 	ModelName string `json:"modelName"`
 
 	// Criticality defines how important it is to serve the model compared to other models referencing the same pool.
+	// Criticality impacts how traffic is handled in resource constrained situations. It handles this by
+	// queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
+	// fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
+	// and the proportionality of fairness will be configurable.
 	//
+	// Default values for this field will not be set, to allow for future additions of new fields that may 'one of' with this field.
+	// Any implementations that may consume this field may treat an unset value as the 'Standard' band.
 	// +optional
-	// +kubebuilder:default="Default"
 	Criticality *Criticality `json:"criticality,omitempty"`
 
 	// TargetModels allow multiple versions of a model for traffic splitting.
@@ -83,6 +89,7 @@ type InferenceModelSpec struct {
 	//
 	// +optional
 	// +kubebuilder:validation:MaxItems=10
+	// +kubebuilder:validation:XValidation:message="Weights should be set for all models, or none of the models.",rule="self.all(model, has(model.weight)) || self.all(model, !has(model.weight))"
 	TargetModels []TargetModel `json:"targetModels,omitempty"`
 
 	// PoolRef is a reference to the inference pool, the pool must exist in the same namespace.
@@ -120,16 +127,19 @@ type PoolObjectReference struct {
 }
 
 // Criticality defines how important it is to serve the model compared to other models.
-// +kubebuilder:validation:Enum=Critical;Default;Sheddable
+// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default.
+// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
+// +kubebuilder:validation:Enum=Critical;Standard;Sheddable
 type Criticality string
 
 const (
 	// Critical defines the highest level of criticality. Requests to this band will be shed last.
 	Critical Criticality = "Critical"
 
-	// Default defines the default criticality level and is more important than Sheddable but less
+	// Standard defines the base criticality level and is more important than Sheddable but less
 	// important than Critical. Requests in this band will be shed before critical traffic.
-	Default Criticality = "Default"
+	// Most models are expected to fall within this band.
+	Standard Criticality = "Standard"
 
 	// Sheddable defines the lowest level of criticality. Requests to this band will be shed before
 	// all other bands.
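
Because Criticality now has no kubebuilder default and is referenced through a pointer, every consumer has to decide what an unset value means. Below is a minimal, self-contained sketch of normalizing a nil Criticality to the Standard band, as the field comments permit; the helper name and local enum mirror are assumptions for illustration, not part of the API.

package main

import "fmt"

// Local mirror of the API's Criticality enum so this sketch stands alone.
type Criticality string

const (
	Critical  Criticality = "Critical"
	Standard  Criticality = "Standard"
	Sheddable Criticality = "Sheddable"
)

// effectiveCriticality is a hypothetical helper: the API comments allow
// implementations to treat an unset (nil) Criticality as Standard.
func effectiveCriticality(c *Criticality) Criticality {
	if c == nil {
		return Standard
	}
	return *c
}

func main() {
	crit := Critical
	fmt.Println(effectiveCriticality(nil))   // Standard
	fmt.Println(effectiveCriticality(&crit)) // Critical
}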
@@ -160,16 +170,13 @@ type TargetModel struct {
 	// implementation supports. Weight is not a percentage and the sum of
 	// weights does not need to equal 100.
 	//
-	// If only one model is specified and it has a weight greater than 0, 100%
-	// of the traffic is forwarded to that model. If weight is set to 0, no
-	// traffic should be forwarded for this model. If unspecified, weight
-	// defaults to 1.
+	// If a weight is set for any targetModel, it must be set for all targetModels.
+	// Conversely, weights are optional, so long as ALL targetModels do not specify a weight.
 	//
 	// +optional
-	// +kubebuilder:default=1
 	// +kubebuilder:validation:Minimum=0
 	// +kubebuilder:validation:Maximum=1000000
-	Weight int32 `json:"weight,omitempty"`
+	Weight *int32 `json:"weight,omitempty"`
 }
 
 // InferenceModelStatus defines the observed state of InferenceModel
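
The new XValidation rule enforces an all-or-none contract on weights. A small sketch of the same check in Go, for anyone validating objects outside the API server; the function name and local struct are illustrative assumptions.

package main

import "fmt"

type TargetModel struct {
	Name   string
	Weight *int32
}

// allOrNoneWeights mirrors the CEL rule:
// self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
func allOrNoneWeights(models []TargetModel) bool {
	set, unset := 0, 0
	for _, m := range models {
		if m.Weight != nil {
			set++
		} else {
			unset++
		}
	}
	return set == 0 || unset == 0
}

func main() {
	w := int32(50)
	fmt.Println(allOrNoneWeights([]TargetModel{{Name: "a", Weight: &w}, {Name: "b"}})) // false: mixed
	fmt.Println(allOrNoneWeights([]TargetModel{{Name: "a"}, {Name: "b"}}))             // true: none set
}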

api/v1alpha1/inferencepool_types.go (+3 −5)
@@ -44,11 +44,9 @@ type InferencePoolList struct {
 
 // InferencePoolSpec defines the desired state of InferencePool
 type InferencePoolSpec struct {
-	// Selector defines a map of label to watch model server pods
-	// that should be included in the InferencePool. ModelServers should not
-	// be with any other Service or InferencePool, that behavior is not supported
-	// and will result in sub-optimal utilization.
-	// In some cases, implementations may translate this to a Service selector, so this matches the simple
+	// Selector defines a map of labels to watch model server pods
+	// that should be included in the InferencePool.
+	// In some cases, implementations may translate this field to a Service selector, so this matches the simple
 	// map used for Service selectors instead of the full Kubernetes LabelSelector type.
 	//
 	// +kubebuilder:validation:Required
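
The Selector stays a plain map precisely so it can translate to the shape Service selectors use. A sketch of how an implementation might turn it into a pod label selector with k8s.io/apimachinery; the selector values are made up.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/labels"
)

func main() {
	// The simple map form used by InferencePool's Selector field.
	poolSelector := map[string]string{"app": "model-server"}

	// Equivalent label selector an implementation could watch pods with.
	sel := labels.SelectorFromSet(labels.Set(poolSelector))

	fmt.Println(sel.Matches(labels.Set{"app": "model-server", "pod-hash": "abc"})) // true
	fmt.Println(sel.Matches(labels.Set{"app": "other"}))                           // false
}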

api/v1alpha1/zz_generated.deepcopy.go (+8 −1)

Some generated files are not rendered by default.

config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml (+23 −16)
@@ -55,25 +55,32 @@ spec:
                 condition, one will be selected at random.
               properties:
                 criticality:
-                  default: Default
-                  description: Criticality defines how important it is to serve the
-                    model compared to other models referencing the same pool.
+                  description: |-
+                    Criticality defines how important it is to serve the model compared to other models referencing the same pool.
+                    Criticality impacts how traffic is handled in resource constrained situations. It handles this by
+                    queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
+                    fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
+                    and the proportionality of fairness will be configurable.
+
+                    Default values for this field will not be set, to allow for future additions of new fields that may 'one of' with this field.
+                    Any implementations that may consume this field may treat an unset value as the 'Standard' band.
                   enum:
                   - Critical
-                  - Default
+                  - Standard
                   - Sheddable
                   type: string
                 modelName:
                   description: |-
-                    ModelName is the name of the model as the users set in the "model" parameter in the requests.
-                    The name should be unique among the workloads that reference the same backend pool.
-                    This is the parameter that will be used to match the request with. In the future, we may
-                    allow to match on other request parameters. The other approach to support matching
-                    on other request parameters is to use a different ModelName per HTTPFilter.
-                    Names can be reserved without implementing an actual model in the pool.
+                    ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
+                    ModelNames must be unique for a referencing InferencePool
+                    (names can be reused for a different pool in the same cluster).
+                    The modelName with the oldest creation timestamp is retained, and the incoming
+                    InferenceModel sets the Ready status to false with a corresponding reason.
+                    In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
+                    Names can be reserved without an underlying model configured in the pool.
                     This can be done by specifying a target model and setting the weight to zero,
                     an error will be returned specifying that no valid target model is found.
-                  maxLength: 253
+                  maxLength: 256
                   type: string
                 poolRef:
                   description: PoolRef is a reference to the inference pool, the pool
@@ -121,7 +128,6 @@ spec:
                       maxLength: 253
                       type: string
                     weight:
-                      default: 1
                       description: |-
                         Weight is used to determine the proportion of traffic that should be
                         sent to this model when multiple target models are specified.
@@ -133,10 +139,8 @@ spec:
                         implementation supports. Weight is not a percentage and the sum of
                         weights does not need to equal 100.
 
-                        If only one model is specified and it has a weight greater than 0, 100%
-                        of the traffic is forwarded to that model. If weight is set to 0, no
-                        traffic should be forwarded for this model. If unspecified, weight
-                        defaults to 1.
+                        If a weight is set for any targetModel, it must be set for all targetModels.
+                        Conversely, weights are optional, so long as ALL targetModels do not specify a weight.
                       format: int32
                       maximum: 1000000
                       minimum: 0
@@ -146,6 +150,9 @@ spec:
                     type: object
                   maxItems: 10
                   type: array
+                  x-kubernetes-validations:
+                  - message: Weights should be set for all models, or none of the models.
+                    rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
               required:
               - modelName
               - poolRef

config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml (+3 −5)
@@ -58,11 +58,9 @@ spec:
                     pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$
                   type: string
                 description: |-
-                  Selector defines a map of label to watch model server pods
-                  that should be included in the InferencePool. ModelServers should not
-                  be with any other Service or InferencePool, that behavior is not supported
-                  and will result in sub-optimal utilization.
-                  In some cases, implementations may translate this to a Service selector, so this matches the simple
+                  Selector defines a map of labels to watch model server pods
+                  that should be included in the InferencePool.
+                  In some cases, implementations may translate this field to a Service selector, so this matches the simple
                   map used for Service selectors instead of the full Kubernetes LabelSelector type.
                 type: object
               targetPortNumber:

docs/proposals/002-api-proposal/proposal.md (+21 −19)
@@ -78,6 +78,7 @@ The API design is based on these axioms:
 - This solution should be composable with other Gateway solutions and flexible to fit customer needs
 - The MVP will heavily assume requests are done using the OpenAI spec, but open to extension in the future
 - The Gateway should route in a way that does not generate a queue of requests at the model server level
+- Model serving differs from web-serving in critical ways. One of these is the existence of multiple models for the same service, which can materially impact behavior depending on the model served, as opposed to a web service, which has mechanisms to render implementation changes invisible to an end user
 
 The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused on lower-level scheduling. And the API follows that similar logic, which led to the proposal of the **InferencePool**.
 
@@ -126,27 +127,21 @@ type InferencePool struct {
 
 type InferencePoolSpec struct {
 	// ModelServerSelector uses label selection to watch model server pods
-	// that should be included in the InferencePool. ModelServers should not
-	// be with any other Service or InferencePool, that behavior is not supported
-	// and will result in sub-optimal utilization.
+	// that should be included in the InferencePool.
 	ModelServerSelector map[string]string `json:"modelServerSelector,omitempty"`
 }
 ```
 
 **InferenceModel**
 ```golang
 // InferenceModel represents a set of Models/Adapters that are multiplexed onto one
-// or more Inferencepools. This resource is managed by the "Inference Workload Owner"
+// or more InferencePools. This resource is managed by the "Inference Workload Owner"
 // persona. The Inference Workload Owner persona is: a team that trains, verifies, and
 // leverages a large language model from a model frontend, drives the lifecycle
 // and rollout of new versions of those models, and defines the specific
 // performance and latency goals for the model. These workloads are
-// expected to operate within an InferencePool sharing compute capacity with other
-// InferenceModels, defined by the Inference Platform Admin. We allow a user who
-// has multiple InferenceModels across multiple pools (with the same config) to
-// specify the configuration exactly once, and deploy to many pools
-// simultaneously. Enabling a simpler config and single source of truth
-// for a given user. InferenceModel ModelNames are unique for a given InferencePool,
+// expected to coexist within an InferencePool: sharing compute capacity with other
+// InferenceModels, with sharing limitations defined by the Inference Platform Admin.
 type InferenceModel struct {
 	metav1.ObjectMeta
 	metav1.TypeMeta
@@ -155,28 +150,35 @@ type InferenceModel struct {
 }
 
 type InferenceModelSpec struct {
-	// The name of the model as the users set in the "model" parameter in the requests.
-	// The name should be unique among the workloads that reference the same backend pool.
-	// This is the parameter that will be used to match the request with. In the future, we may
-	// allow to match on other request parameters. The other approach to support matching on
-	// on other request parameters is to use a different ModelName per HTTPFilter.
-	// Names can be reserved without implementing an actual model in the pool.
+	// The name of the model as it will be set in the "model" parameter for an incoming request.
+	// ModelNames are expected to be unique for a specific InferencePool
+	// (names can be reused for a different pool in the same cluster).
+	// The modelName with the oldest creation timestamp is retained, and the incoming
+	// InferenceModel sets the Ready status to false with a corresponding reason.
+	// In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
+	// Names can be reserved without an underlying model configured in the pool.
 	// This can be done by specifying a target model and setting the weight to zero,
 	// an error will be returned specifying that no valid target model is found.
 	ModelName string
 	// Optional
 	// Defines how important it is to serve the model compared to other models referencing the same pool.
+	// Criticality impacts how traffic is handled in resource constrained situations. It handles this by
+	// queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
+	// fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
+	// and the proportionality of fairness will be configurable.
 	Criticality *Criticality
 	// Optional.
-	// Allow multiple versions of a model for traffic splitting.
-	// If not specified, the target model name is defaulted to the ModelName parameter.
+	// Allow multiple versions of a model for traffic splitting.
+	// If not specified, the target model name is defaulted to the ModelName parameter.
 	// ModelName is often in reference to a LoRA adapter.
 	TargetModels []TargetModel
 	// Reference to the InferencePool that the model registers to. It must exist in the same namespace.
 	PoolReference *LocalObjectReference
 }
 
 // Defines how important it is to serve the model compared to other models.
+// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field should ALWAYS be optional (use a pointer), and set no default.
+// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
 type Criticality string
 const (
 	// Most important. Requests to this band will be shed last.
@@ -200,7 +202,7 @@ type TargetModel struct {
 	Name string
 	// Weight is used to determine the percentage of traffic that should be
 	// sent to this target model when multiple versions of the model are specified.
-	Weight int
+	Weight *int
 }
 
 // LocalObjectReference identifies an API object within the namespace of the
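
The proposal keeps the ability to reserve a ModelName without serving it: point the name at a target model with weight zero, and requests for the name fail with a "no valid target model found" error. A self-contained sketch of what such a spec could look like; the type mirrors and the names "gpt-5" and "placeholder" are hypothetical.

package main

import "fmt"

// Local mirrors of the proposal's types so the sketch stands alone.
type TargetModel struct {
	Name   string
	Weight *int
}

type InferenceModelSpec struct {
	ModelName    string
	TargetModels []TargetModel
}

func main() {
	zero := 0
	// Reserving a name: weight zero means no traffic is ever forwarded,
	// so requests for "gpt-5" are rejected rather than served.
	reserved := InferenceModelSpec{
		ModelName:    "gpt-5",
		TargetModels: []TargetModel{{Name: "placeholder", Weight: &zero}},
	}
	fmt.Printf("%+v\n", reserved)
}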

pkg/ext-proc/backend/datastore.go (+3 −3)
@@ -91,15 +91,15 @@ func RandomWeightedDraw(model *v1alpha1.InferenceModel, seed int64) string {
 	}
 	r := rand.New(source)
 	for _, model := range model.Spec.TargetModels {
-		weights += model.Weight
+		weights += *model.Weight
 	}
 	klog.V(3).Infof("Weights for Model(%v) total to: %v", model.Name, weights)
 	randomVal := r.Int31n(weights)
 	for _, model := range model.Spec.TargetModels {
-		if randomVal < model.Weight {
+		if randomVal < *model.Weight {
 			return model.Name
 		}
-		randomVal -= model.Weight
+		randomVal -= *model.Weight
 	}
 	return ""
 }
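
With Weight now a pointer, the draw dereferences each weight unconditionally; that is safe only when weights are actually set, which the all-or-none CEL rule helps guarantee for the "all set" case. A self-contained sketch of the same draw logic plus a usage example; the local types and model names are illustrative, not the repo's code.

package main

import (
	"fmt"
	"math/rand"
)

type TargetModel struct {
	Name   string
	Weight *int32
}

// randomWeightedDraw mirrors the updated logic: it assumes every Weight is
// non-nil, so callers must only invoke it when weights are populated.
func randomWeightedDraw(models []TargetModel, seed int64) string {
	var total int32
	for _, m := range models {
		total += *m.Weight
	}
	r := rand.New(rand.NewSource(seed))
	v := r.Int31n(total) // panics if total <= 0, so weights must sum positive
	for _, m := range models {
		if v < *m.Weight {
			return m.Name
		}
		v -= *m.Weight
	}
	return ""
}

func main() {
	w1, w2 := int32(80), int32(20)
	models := []TargetModel{{Name: "llama-base", Weight: &w1}, {Name: "llama-canary", Weight: &w2}}
	fmt.Println(randomWeightedDraw(models, 42)) // deterministic for a fixed seed
}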

0 commit comments