diff --git a/api/v1alpha1/groupversion_info.go b/api/v1alpha1/groupversion_info.go
index 1b8782a0b..7ff9c3990 100644
--- a/api/v1alpha1/groupversion_info.go
+++ b/api/v1alpha1/groupversion_info.go
@@ -16,7 +16,7 @@ limitations under the License.
 
 // Package v1alpha1 contains API Schema definitions for the gateway v1alpha1 API group
 // +kubebuilder:object:generate=true
-// +groupName=gateway.inference.networking.x-k8s.io
+// +groupName=inference.networking.x-k8s.io
 package v1alpha1
 
 import (
diff --git a/config/crd/bases/gateway.inference.k8s.io_llmserverpools.yaml b/config/crd/bases/gateway.inference.k8s.io_llmserverpools.yaml
deleted file mode 100644
index 56bb3ff02..000000000
--- a/config/crd/bases/gateway.inference.k8s.io_llmserverpools.yaml
+++ /dev/null
@@ -1,158 +0,0 @@
----
-apiVersion: apiextensions.k8s.io/v1
-kind: CustomResourceDefinition
-metadata:
-  annotations:
-    controller-gen.kubebuilder.io/version: v0.16.1
-  name: llmserverpools.gateway.inference.networking.x-k8s.io
-spec:
-  group: gateway.inference.networking.x-k8s.io
-  names:
-    kind: LLMServerPool
-    listKind: LLMServerPoolList
-    plural: llmserverpools
-    singular: llmserverpool
-  scope: Namespaced
-  versions:
-  - name: v1alpha1
-    schema:
-      openAPIV3Schema:
-        description: LLMServerPool is the Schema for the llmserverpools API
-        properties:
-          apiVersion:
-            description: |-
-              APIVersion defines the versioned schema of this representation of an object.
-              Servers should convert recognized schemas to the latest internal value, and
-              may reject unrecognized values.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-            type: string
-          kind:
-            description: |-
-              Kind is a string value representing the REST resource this object represents.
-              Servers may infer this from the endpoint the client submits requests to.
-              Cannot be updated.
-              In CamelCase.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-            type: string
-          metadata:
-            type: object
-          spec:
-            description: LLMServerPoolSpec defines the desired state of LLMServerPool
-            properties:
-              modelServerSelector:
-                description: |-
-                  ModelServerSelector uses label selection to watch model server pods
-                  that should be included in the LLMServerPool. ModelServers should not
-                  be with any other Service or LLMServerPool, that behavior is not supported
-                  and will result in sub-optimal utilization.
-                properties:
-                  matchExpressions:
-                    description: matchExpressions is a list of label selector requirements.
-                      The requirements are ANDed.
-                    items:
-                      description: |-
-                        A label selector requirement is a selector that contains values, a key, and an operator that
-                        relates the key and values.
-                      properties:
-                        key:
-                          description: key is the label key that the selector applies
-                            to.
-                          type: string
-                        operator:
-                          description: |-
-                            operator represents a key's relationship to a set of values.
-                            Valid operators are In, NotIn, Exists and DoesNotExist.
-                          type: string
-                        values:
-                          description: |-
-                            values is an array of string values. If the operator is In or NotIn,
-                            the values array must be non-empty. If the operator is Exists or DoesNotExist,
-                            the values array must be empty. This array is replaced during a strategic
-                            merge patch.
-                          items:
-                            type: string
-                          type: array
-                          x-kubernetes-list-type: atomic
-                      required:
-                      - key
-                      - operator
-                      type: object
-                    type: array
-                    x-kubernetes-list-type: atomic
-                  matchLabels:
-                    additionalProperties:
-                      type: string
-                    description: |-
-                      matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
-                      map is equivalent to an element of matchExpressions, whose key field is "key", the
-                      operator is "In", and the values array contains only "value". The requirements are ANDed.
-                    type: object
-                type: object
-                x-kubernetes-map-type: atomic
-            type: object
-          status:
-            description: LLMServerPoolStatus defines the observed state of LLMServerPool
-            properties:
-              conditions:
-                description: Conditions track the state of the LLMServerPool.
-                items:
-                  description: Condition contains details for one aspect of the current
-                    state of this API Resource.
-                  properties:
-                    lastTransitionTime:
-                      description: |-
-                        lastTransitionTime is the last time the condition transitioned from one status to another.
-                        This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
-                      format: date-time
-                      type: string
-                    message:
-                      description: |-
-                        message is a human readable message indicating details about the transition.
-                        This may be an empty string.
-                      maxLength: 32768
-                      type: string
-                    observedGeneration:
-                      description: |-
-                        observedGeneration represents the .metadata.generation that the condition was set based upon.
-                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
-                        with respect to the current state of the instance.
-                      format: int64
-                      minimum: 0
-                      type: integer
-                    reason:
-                      description: |-
-                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
-                        Producers of specific condition types may define expected values and meanings for this field,
-                        and whether the values are considered a guaranteed API.
-                        The value should be a CamelCase string.
-                        This field may not be empty.
-                      maxLength: 1024
-                      minLength: 1
-                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
-                      type: string
-                    status:
-                      description: status of the condition, one of True, False, Unknown.
-                      enum:
-                      - "True"
-                      - "False"
-                      - Unknown
-                      type: string
-                    type:
-                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
-                      maxLength: 316
-                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
-                      type: string
-                  required:
-                  - lastTransitionTime
-                  - message
-                  - reason
-                  - status
-                  - type
-                  type: object
-                type: array
-            type: object
-        type: object
-    served: true
-    storage: true
-    subresources:
-      status: {}
diff --git a/config/crd/bases/gateway.inference.networking.x-k8s.io_llmservices.yaml b/config/crd/bases/gateway.inference.networking.x-k8s.io_llmservices.yaml
deleted file mode 100644
index b9a1a80ee..000000000
--- a/config/crd/bases/gateway.inference.networking.x-k8s.io_llmservices.yaml
+++ /dev/null
@@ -1,239 +0,0 @@
----
-apiVersion: apiextensions.k8s.io/v1
-kind: CustomResourceDefinition
-metadata:
-  annotations:
-    controller-gen.kubebuilder.io/version: v0.16.1
-  name: llmservices.gateway.inference.networking.x-k8s.io
-spec:
-  group: gateway.inference.networking.x-k8s.io
-  names:
-    kind: LLMService
-    listKind: LLMServiceList
-    plural: llmservices
-    singular: llmservice
-  scope: Namespaced
-  versions:
-  - name: v1alpha1
-    schema:
-      openAPIV3Schema:
-        description: LLMService is the Schema for the llmservices API
-        properties:
-          apiVersion:
-            description: |-
-              APIVersion defines the versioned schema of this representation of an object.
-              Servers should convert recognized schemas to the latest internal value, and
-              may reject unrecognized values.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-            type: string
-          kind:
-            description: |-
-              Kind is a string value representing the REST resource this object represents.
-              Servers may infer this from the endpoint the client submits requests to.
-              Cannot be updated.
-              In CamelCase.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-            type: string
-          metadata:
-            type: object
-          spec:
-            description: |-
-              LLMService represents a set of LLM services that are multiplexed onto one
-              or more LLMServerPools. This resource is managed by the "LLM Service Owner"
-              persona. The Service Owner persona is: a team that trains, verifies, and
-              leverages a large language model from a model frontend, drives the lifecycle
-              and rollout of new versions of those models, and defines the specific
-              performance and latency goals for the model. These services are
-              expected to operate within a LLMServerPool sharing compute capacity with other
-              LLMServices, defined by the Inference Platform Admin. We allow a user who
-              has multiple LLMServices across multiple pools (with the same config) to
-              specify the configuration exactly once, and deploy to many pools
-              simultaneously. Enabling a simpler config and single source of truth
-              for a given user. LLMService names are unique for a given LLMServerPool,
-              if the name is reused, an error will be shown on the status of a
-              LLMService that attempted to reuse. The oldest LLMService, based on
-              creation timestamp, will be selected to remain valid. In the event of a race
-              condition, one will be selected at random.
-            properties:
-              models:
-                description: |-
-                  Model defines the distinct services.
-                  Model can be in 2 priority classes, Critical and Noncritical.
-                  Priority class is implicitly set to Critical by specifying an Objective.
-                  Otherwise the Model is considered Noncritical.
-                items:
-                  description: |-
-                    Model defines the policies for routing the traffic of a use case, this includes performance objectives
-                    and traffic splitting between different versions of the model.
-                  properties:
-                    name:
-                      description: |-
-                        The name of the model as the users set in the "model" parameter in the requests.
-                        The name should be unique among the services that reference the same backend pool.
-                        This is the parameter that will be used to match the request with. In the future, we may
-                        allow to match on other request parameters. The other approach to support matching on
-                        on other request parameters is to use a different ModelName per HTTPFilter.
-                        Names can be reserved without implementing an actual model in the pool.
-                        This can be done by specifying a target model and setting the weight to zero,
-                        an error will be returned specifying that no valid target model is found.
-                      type: string
-                    objective:
-                      description: |-
-                        Optional
-                        LLM Services with an objective have higher priority than services without.
-                        IMPORTANT: By specifying an objective, this places the LLMService in a higher priority class than LLMServices without a defined priority class.
-                        In the face of resource-scarcity. Higher priority requests will be preserved, and lower priority class requests will be rejected.
-                      properties:
-                        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests:
-                          description: |-
-                            The AverageLatencyPerOutputToken is calculated as the e2e request latency divided by output token
-                            length. Note that this is different from what is known as TPOT (time per output token) which only
-                            takes decode time into account.
-                            The P95 is calculated over a fixed time window defined at the operator level.
-                          format: int64
-                          type: integer
-                      type: object
-                    targetModels:
-                      description: |-
-                        Optional.
-                        Allow multiple versions of a model for traffic splitting.
-                        If not specified, the target model name is defaulted to the modelName parameter.
-                        modelName is often in reference to a LoRA adapter.
-                      items:
-                        description: |-
-                          TargetModel represents a deployed model or a LoRA adapter. The
-                          Name field is expected to match the name of the LoRA adapter
-                          (or base model) as it is registered within the model server. Inference
-                          Gateway assumes that the model exists on the model server and is the
-                          responsibility of the user to validate a correct match. Should a model fail
-                          to exist at request time, the error is processed by the Instance Gateway,
-                          and then emitted on the appropriate LLMService object.
-                        properties:
-                          name:
-                            description: The name of the adapter as expected by the
-                              ModelServer.
-                            type: string
-                          weight:
-                            description: |-
-                              Weight is used to determine the percentage of traffic that should be
-                              sent to this target model when multiple versions of the model are specified.
-                            type: integer
-                        type: object
-                      type: array
-                  type: object
-                type: array
-              poolRef:
-                description: PoolRef are references to the backend pools that the
-                  LLMService registers to.
-                items:
-                  description: ObjectReference contains enough information to let
-                    you inspect or modify the referred object.
-                  properties:
-                    apiVersion:
-                      description: API version of the referent.
-                      type: string
-                    fieldPath:
-                      description: |-
-                        If referring to a piece of an object instead of an entire object, this string
-                        should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2].
-                        For example, if the object reference is to a container within a pod, this would take on a value like:
-                        "spec.containers{name}" (where "name" refers to the name of the container that triggered
-                        the event) or if no container name is specified "spec.containers[2]" (container with
-                        index 2 in this pod). This syntax is chosen only to have some well-defined way of
-                        referencing a part of an object.
-                      type: string
-                    kind:
-                      description: |-
-                        Kind of the referent.
-                        More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-                      type: string
-                    name:
-                      description: |-
-                        Name of the referent.
-                        More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
-                      type: string
-                    namespace:
-                      description: |-
-                        Namespace of the referent.
-                        More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/
-                      type: string
-                    resourceVersion:
-                      description: |-
-                        Specific resourceVersion to which this reference is made, if any.
-                        More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency
-                      type: string
-                    uid:
-                      description: |-
-                        UID of the referent.
-                        More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids
-                      type: string
-                  type: object
-                  x-kubernetes-map-type: atomic
-                type: array
-            type: object
-          status:
-            description: LLMServiceStatus defines the observed state of LLMService
-            properties:
-              conditions:
-                description: Conditions track the state of the LLMServerPool.
-                items:
-                  description: Condition contains details for one aspect of the current
-                    state of this API Resource.
-                  properties:
-                    lastTransitionTime:
-                      description: |-
-                        lastTransitionTime is the last time the condition transitioned from one status to another.
-                        This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
-                      format: date-time
-                      type: string
-                    message:
-                      description: |-
-                        message is a human readable message indicating details about the transition.
-                        This may be an empty string.
-                      maxLength: 32768
-                      type: string
-                    observedGeneration:
-                      description: |-
-                        observedGeneration represents the .metadata.generation that the condition was set based upon.
-                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
-                        with respect to the current state of the instance.
-                      format: int64
-                      minimum: 0
-                      type: integer
-                    reason:
-                      description: |-
-                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
-                        Producers of specific condition types may define expected values and meanings for this field,
-                        and whether the values are considered a guaranteed API.
-                        The value should be a CamelCase string.
-                        This field may not be empty.
-                      maxLength: 1024
-                      minLength: 1
-                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
-                      type: string
-                    status:
-                      description: status of the condition, one of True, False, Unknown.
-                      enum:
-                      - "True"
-                      - "False"
-                      - Unknown
-                      type: string
-                    type:
-                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
-                      maxLength: 316
-                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
-                      type: string
-                  required:
-                  - lastTransitionTime
-                  - message
-                  - reason
-                  - status
-                  - type
-                  type: object
-                type: array
-            type: object
-        type: object
-    served: true
-    storage: true
-    subresources:
-      status: {}
diff --git a/config/crd/bases/gateway.inference.networking.x-k8s.io_llmserverpools.yaml b/config/crd/bases/inference.networking.x-k8s.io_llmserverpools.yaml
similarity index 98%
rename from config/crd/bases/gateway.inference.networking.x-k8s.io_llmserverpools.yaml
rename to config/crd/bases/inference.networking.x-k8s.io_llmserverpools.yaml
index 02ac00ac0..eb4d6c0a1 100644
--- a/config/crd/bases/gateway.inference.networking.x-k8s.io_llmserverpools.yaml
+++ b/config/crd/bases/inference.networking.x-k8s.io_llmserverpools.yaml
@@ -4,9 +4,9 @@ kind: CustomResourceDefinition
 metadata:
   annotations:
     controller-gen.kubebuilder.io/version: v0.16.1
-  name: llmserverpools.gateway.inference.networking.x-k8s.io
+  name: llmserverpools.inference.networking.x-k8s.io
 spec:
-  group: gateway.inference.networking.x-k8s.io
+  group: inference.networking.x-k8s.io
   names:
     kind: LLMServerPool
     listKind: LLMServerPoolList
diff --git a/config/crd/bases/gateway.inference.k8s.io_llmservices.yaml b/config/crd/bases/inference.networking.x-k8s.io_llmservices.yaml
similarity index 99%
rename from config/crd/bases/gateway.inference.k8s.io_llmservices.yaml
rename to config/crd/bases/inference.networking.x-k8s.io_llmservices.yaml
index b9a1a80ee..73e87bee3 100644
--- a/config/crd/bases/gateway.inference.k8s.io_llmservices.yaml
+++ b/config/crd/bases/inference.networking.x-k8s.io_llmservices.yaml
@@ -4,9 +4,9 @@ kind: CustomResourceDefinition
 metadata:
   annotations:
     controller-gen.kubebuilder.io/version: v0.16.1
-  name: llmservices.gateway.inference.networking.x-k8s.io
+  name: llmservices.inference.networking.x-k8s.io
 spec:
-  group: gateway.inference.networking.x-k8s.io
+  group: inference.networking.x-k8s.io
   names:
     kind: LLMService
     listKind: LLMServiceList
diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml
index 2a0bcfcbb..76a4574e8 100644
--- a/config/crd/kustomization.yaml
+++ b/config/crd/kustomization.yaml
@@ -2,8 +2,8 @@
 # since it depends on service name and namespace that are out of this kustomize package.
 # It should be run by config/default
 resources:
-- bases/gateway.inference.networking.x-k8s.io_llmserverpools.yaml
-- bases/gateway.inference.networking.x-k8s.io_llmservices.yaml
+- bases/inference.networking.x-k8s.io_llmserverpools.yaml
+- bases/inference.networking.x-k8s.io_llmservices.yaml
 # +kubebuilder:scaffold:crdkustomizeresource
 
 patches:
diff --git a/config/rbac/llmserverpool_editor_role.yaml b/config/rbac/llmserverpool_editor_role.yaml
index ef8ad5c14..54139d22d 100644
--- a/config/rbac/llmserverpool_editor_role.yaml
+++ b/config/rbac/llmserverpool_editor_role.yaml
@@ -8,7 +8,7 @@ metadata:
   name: llmserverpool-editor-role
 rules:
 - apiGroups:
-  - gateway.inference.networking.x-k8s.io
+  - inference.networking.x-k8s.io
   resources:
   - llmserverpools
   verbs:
@@ -20,7 +20,7 @@ rules:
   - update
   - watch
 - apiGroups:
-  - gateway.inference.networking.x-k8s.io
+  - inference.networking.x-k8s.io
   resources:
   - llmserverpools/status
   verbs:
diff --git a/config/rbac/llmserverpool_viewer_role.yaml b/config/rbac/llmserverpool_viewer_role.yaml
index 35a4ade89..c3355ba27 100644
--- a/config/rbac/llmserverpool_viewer_role.yaml
+++ b/config/rbac/llmserverpool_viewer_role.yaml
@@ -8,7 +8,7 @@ metadata:
   name: llmserverpool-viewer-role
 rules:
 - apiGroups:
-  - gateway.inference.networking.x-k8s.io
+  - inference.networking.x-k8s.io
   resources:
   - llmserverpools
   verbs:
@@ -16,7 +16,7 @@ rules:
   - list
   - watch
 - apiGroups:
-  - gateway.inference.networking.x-k8s.io
+  - inference.networking.x-k8s.io
   resources:
   - llmserverpools/status
   verbs:
diff --git a/config/rbac/llmservice_editor_role.yaml b/config/rbac/llmservice_editor_role.yaml
index 092151884..300216acc 100644
--- a/config/rbac/llmservice_editor_role.yaml
+++ b/config/rbac/llmservice_editor_role.yaml
@@ -8,7 +8,7 @@ metadata:
   name: llmservice-editor-role
 rules:
 - apiGroups:
-  - gateway.inference.networking.x-k8s.io
+  - inference.networking.x-k8s.io
   resources:
   - llmservices
   verbs:
@@ -20,7 +20,7 @@ rules:
   - update
   - watch
 - apiGroups:
-  - gateway.inference.networking.x-k8s.io
+  - inference.networking.x-k8s.io
   resources:
   - llmservices/status
   verbs:
diff --git a/config/rbac/llmservice_viewer_role.yaml b/config/rbac/llmservice_viewer_role.yaml
index 9c26d7c8b..cfdec182f 100644
--- a/config/rbac/llmservice_viewer_role.yaml
+++ b/config/rbac/llmservice_viewer_role.yaml
@@ -8,7 +8,7 @@ metadata:
   name: llmservice-viewer-role
 rules:
 - apiGroups:
-  - gateway.inference.networking.x-k8s.io
+  - inference.networking.x-k8s.io
   resources:
   - llmservices
   verbs:
@@ -16,7 +16,7 @@ rules:
   - list
   - watch
 - apiGroups:
-  - gateway.inference.networking.x-k8s.io
+  - inference.networking.x-k8s.io
   resources:
   - llmservices/status
   verbs:
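
After this rename, the CRDs and RBAC rules address the API under the inference.networking.x-k8s.io group rather than gateway.inference.networking.x-k8s.io, so the apiVersion used in manifests changes accordingly. A minimal sketch of an LLMServerPool against the renamed group, based on the modelServerSelector schema in the CRD above (the metadata name and labels are hypothetical placeholders):

apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMServerPool
metadata:
  name: example-pool                  # hypothetical name
spec:
  modelServerSelector:                # standard label selector, per the CRD schema above
    matchLabels:
      app: example-model-server       # hypothetical label; must match the model server pods

And a corresponding LLMService sketch registering one model against that pool, using the models and poolRef fields from the llmservices CRD (again with hypothetical names and a hypothetical adapter):

apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMService
metadata:
  name: example-llm-service           # hypothetical name
spec:
  poolRef:
  - kind: LLMServerPool
    name: example-pool                # references the pool sketched above
  models:
  - name: my-model                    # the name clients send in the "model" request parameter
    targetModels:
    - name: my-model-lora-v1          # hypothetical adapter name as registered in the model server
      weight: 100

Because the group itself changes, objects created under the old gateway.inference.networking.x-k8s.io group are not migrated automatically and would need to be re-created under the new group, and any RBAC rules outside this repository that reference the old group need the same apiGroups update as the roles patched above.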