diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha1/inferencemodel_types.go
index 3414b797..766ecfef 100644
--- a/api/v1alpha1/inferencemodel_types.go
+++ b/api/v1alpha1/inferencemodel_types.go
@@ -65,7 +65,7 @@ type InferenceModelSpec struct {
 	// Reference to the inference pool, the pool must exist in the same namespace.
 	//
 	// +kubebuilder:validation:Required
-	PoolRef *PoolObjectReference `json:"poolRef,omitempty"`
+	PoolRef PoolObjectReference `json:"poolRef"`
 }
 
 // PoolObjectReference identifies an API object within the namespace of the
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index fd866b35..4f17fbd0 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -97,11 +97,7 @@ func (in *InferenceModelSpec) DeepCopyInto(out *InferenceModelSpec) {
 		*out = make([]TargetModel, len(*in))
 		copy(*out, *in)
 	}
-	if in.PoolRef != nil {
-		in, out := &in.PoolRef, &out.PoolRef
-		*out = new(PoolObjectReference)
-		**out = **in
-	}
+	out.PoolRef = in.PoolRef
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceModelSpec.
diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
index 2c1202c4..17135c67 100644
--- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
+++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
@@ -67,7 +67,7 @@ spec:
             type: object
           targetPortNumber:
             description: |-
-              TargetPort is the port number that the model servers within the pool expect
+              TargetPortNumber is the port number that the model servers within the pool expect
               to recieve traffic from.
               This maps to the TargetPort in: https://pkg.go.dev/k8s.io/api/core/v1#ServicePort
             format: int32
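The three hunks above move `PoolRef` from a pointer to a plain value: a required field no longer needs nil-handling, the generated deepcopy collapses to a simple assignment, and `omitempty` is dropped so the field always serializes. A minimal sketch of building the updated spec, using the import path and fixture names that appear in the test hunks later in this diff:

```go
package main

import (
	"fmt"

	"inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1"
)

func main() {
	// PoolRef is now a value: the zero PoolObjectReference always exists,
	// and "required" is enforced by CRD validation instead of pointer
	// nil-ness.
	spec := v1alpha1.InferenceModelSpec{
		ModelName: "fake model1",
		PoolRef:   v1alpha1.PoolObjectReference{Name: "test-pool"},
	}
	fmt.Println(spec.PoolRef.Name) // no nil check needed
}
```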
diff --git a/config/crd/bases/inference.networking.x-k8s.io_llmserverpools.yaml b/config/crd/bases/inference.networking.x-k8s.io_llmserverpools.yaml
deleted file mode 100644
index 3ef34c19..00000000
--- a/config/crd/bases/inference.networking.x-k8s.io_llmserverpools.yaml
+++ /dev/null
@@ -1,127 +0,0 @@
----
-apiVersion: apiextensions.k8s.io/v1
-kind: CustomResourceDefinition
-metadata:
-  annotations:
-    controller-gen.kubebuilder.io/version: v0.16.1
-  name: llmserverpools.inference.networking.x-k8s.io
-spec:
-  group: inference.networking.x-k8s.io
-  names:
-    kind: InferencePool
-    listKind: InferencePoolList
-    plural: llmserverpools
-    singular: llmserverpool
-  scope: Namespaced
-  versions:
-  - name: v1alpha1
-    schema:
-      openAPIV3Schema:
-        description: InferencePool is the Schema for the llmserverpools API
-        properties:
-          apiVersion:
-            description: |-
-              APIVersion defines the versioned schema of this representation of an object.
-              Servers should convert recognized schemas to the latest internal value, and
-              may reject unrecognized values.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-            type: string
-          kind:
-            description: |-
-              Kind is a string value representing the REST resource this object represents.
-              Servers may infer this from the endpoint the client submits requests to.
-              Cannot be updated.
-              In CamelCase.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-            type: string
-          metadata:
-            type: object
-          spec:
-            description: InferencePoolSpec defines the desired state of InferencePool
-            properties:
-              modelServerSelector:
-                additionalProperties:
-                  type: string
-                description: |-
-                  ModelServerSelector uses a map of label to watch model server pods
-                  that should be included in the InferencePool. ModelServers should not
-                  be with any other Service or InferencePool, that behavior is not supported
-                  and will result in sub-optimal utilization.
-                  Due to this selector being translated to a service a simple map is used instead
-                  of: https://pkg.go.dev/k8s.io/apimachinery/pkg/apis/meta/v1#LabelSelector
-                  To avoid footshoot errors when the https://pkg.go.dev/k8s.io/apimachinery/pkg/apis/meta/v1#LabelSelectorAsMap would be used.
-                type: object
-              targetPort:
-                description: |-
-                  TargetPort is the port number that the model servers within the pool expect
-                  to recieve traffic from.
-                  This maps to the TargetPort in: https://pkg.go.dev/k8s.io/api/core/v1#ServicePort
-                format: int32
-                type: integer
-            type: object
-          status:
-            description: InferencePoolStatus defines the observed state of InferencePool
-            properties:
-              conditions:
-                description: Conditions track the state of the InferencePool.
-                items:
-                  description: Condition contains details for one aspect of the current
-                    state of this API Resource.
-                  properties:
-                    lastTransitionTime:
-                      description: |-
-                        lastTransitionTime is the last time the condition transitioned from one status to another.
-                        This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
-                      format: date-time
-                      type: string
-                    message:
-                      description: |-
-                        message is a human readable message indicating details about the transition.
-                        This may be an empty string.
-                      maxLength: 32768
-                      type: string
-                    observedGeneration:
-                      description: |-
-                        observedGeneration represents the .metadata.generation that the condition was set based upon.
-                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
-                        with respect to the current state of the instance.
-                      format: int64
-                      minimum: 0
-                      type: integer
-                    reason:
-                      description: |-
-                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
-                        Producers of specific condition types may define expected values and meanings for this field,
-                        and whether the values are considered a guaranteed API.
-                        The value should be a CamelCase string.
-                        This field may not be empty.
-                      maxLength: 1024
-                      minLength: 1
-                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
-                      type: string
-                    status:
-                      description: status of the condition, one of True, False, Unknown.
-                      enum:
-                      - "True"
-                      - "False"
-                      - Unknown
-                      type: string
-                    type:
-                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
-                      maxLength: 316
-                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
-                      type: string
-                  required:
-                  - lastTransitionTime
-                  - message
-                  - reason
-                  - status
-                  - type
-                  type: object
-                type: array
-            type: object
-        type: object
-    served: true
-    storage: true
-    subresources:
-      status: {}
diff --git a/config/crd/bases/inference.networking.x-k8s.io_llmservices.yaml b/config/crd/bases/inference.networking.x-k8s.io_llmservices.yaml
deleted file mode 100644
index 390affa8..00000000
--- a/config/crd/bases/inference.networking.x-k8s.io_llmservices.yaml
+++ /dev/null
@@ -1,239 +0,0 @@
----
-apiVersion: apiextensions.k8s.io/v1
-kind: CustomResourceDefinition
-metadata:
-  annotations:
-    controller-gen.kubebuilder.io/version: v0.16.1
-  name: inferencemodels.inference.networking.x-k8s.io
-spec:
-  group: inference.networking.x-k8s.io
-  names:
-    kind: InferenceModel
-    listKind: InferenceModelList
-    plural: inferencemodels
-    singular: inferencemodel
-  scope: Namespaced
-  versions:
-  - name: v1alpha1
-    schema:
-      openAPIV3Schema:
-        description: InferenceModel is the Schema for the inferencemodels API
-        properties:
-          apiVersion:
-            description: |-
-              APIVersion defines the versioned schema of this representation of an object.
-              Servers should convert recognized schemas to the latest internal value, and
-              may reject unrecognized values.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-            type: string
-          kind:
-            description: |-
-              Kind is a string value representing the REST resource this object represents.
-              Servers may infer this from the endpoint the client submits requests to.
-              Cannot be updated.
-              In CamelCase.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-            type: string
-          metadata:
-            type: object
-          spec:
-            description: |-
-              InferenceModel represents a set of LLM services that are multiplexed onto one
-              or more InferencePools. This resource is managed by the "LLM Service Owner"
-              persona. The Service Owner persona is: a team that trains, verifies, and
-              leverages a large language model from a model frontend, drives the lifecycle
-              and rollout of new versions of those models, and defines the specific
-              performance and latency goals for the model. These services are
-              expected to operate within a InferencePool sharing compute capacity with other
-              InferenceModels, defined by the Inference Platform Admin. We allow a user who
-              has multiple InferenceModels across multiple pools (with the same config) to
-              specify the configuration exactly once, and deploy to many pools
-              simultaneously. Enabling a simpler config and single source of truth
-              for a given user. InferenceModel names are unique for a given InferencePool,
-              if the name is reused, an error will be shown on the status of a
-              InferenceModel that attempted to reuse. The oldest InferenceModel, based on
-              creation timestamp, will be selected to remain valid. In the event of a race
-              condition, one will be selected at random.
-            properties:
-              models:
-                description: |-
-                  Model defines the distinct services.
-                  Model can be in 2 priority classes, Critical and Noncritical.
-                  Priority class is implicitly set to Critical by specifying an Objective.
-                  Otherwise the Model is considered Noncritical.
-                items:
-                  description: |-
-                    Model defines the policies for routing the traffic of a use case, this includes performance objectives
-                    and traffic splitting between different versions of the model.
-                  properties:
-                    name:
-                      description: |-
-                        The name of the model as the users set in the "model" parameter in the requests.
-                        The name should be unique among the services that reference the same backend pool.
-                        This is the parameter that will be used to match the request with. In the future, we may
-                        allow to match on other request parameters. The other approach to support matching on
-                        on other request parameters is to use a different ModelName per HTTPFilter.
-                        Names can be reserved without implementing an actual model in the pool.
-                        This can be done by specifying a target model and setting the weight to zero,
-                        an error will be returned specifying that no valid target model is found.
-                      type: string
-                    objective:
-                      description: |-
-                        Optional
-                        LLM Services with an objective have higher priority than services without.
-                        IMPORTANT: By specifying an objective, this places the InferenceModel in a higher priority class than InferenceModels without a defined priority class.
-                        In the face of resource-scarcity. Higher priority requests will be preserved, and lower priority class requests will be rejected.
-                      properties:
-                        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests:
-                          description: |-
-                            The AverageLatencyPerOutputToken is calculated as the e2e request latency divided by output token
-                            length. Note that this is different from what is known as TPOT (time per output token) which only
-                            takes decode time into account.
-                            The P95 is calculated over a fixed time window defined at the operator level.
-                          format: int64
-                          type: integer
-                      type: object
-                    targetModels:
-                      description: |-
-                        Optional.
-                        Allow multiple versions of a model for traffic splitting.
-                        If not specified, the target model name is defaulted to the modelName parameter.
-                        modelName is often in reference to a LoRA adapter.
-                      items:
-                        description: |-
-                          TargetModel represents a deployed model or a LoRA adapter. The
-                          Name field is expected to match the name of the LoRA adapter
-                          (or base model) as it is registered within the model server. Inference
-                          Gateway assumes that the model exists on the model server and is the
-                          responsibility of the user to validate a correct match. Should a model fail
-                          to exist at request time, the error is processed by the Instance Gateway,
-                          and then emitted on the appropriate InferenceModel object.
-                        properties:
-                          name:
-                            description: The name of the adapter as expected by the
-                              ModelServer.
-                            type: string
-                          weight:
-                            description: |-
-                              Weight is used to determine the percentage of traffic that should be
-                              sent to this target model when multiple versions of the model are specified.
-                            type: integer
-                        type: object
-                      type: array
-                  type: object
-                type: array
-              poolRef:
-                description: PoolRef are references to the backend pools that the
-                  InferenceModel registers to.
-                items:
-                  description: ObjectReference contains enough information to let
-                    you inspect or modify the referred object.
-                  properties:
-                    apiVersion:
-                      description: API version of the referent.
-                      type: string
-                    fieldPath:
-                      description: |-
-                        If referring to a piece of an object instead of an entire object, this string
-                        should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2].
-                        For example, if the object reference is to a container within a pod, this would take on a value like:
-                        "spec.containers{name}" (where "name" refers to the name of the container that triggered
-                        the event) or if no container name is specified "spec.containers[2]" (container with
-                        index 2 in this pod). This syntax is chosen only to have some well-defined way of
-                        referencing a part of an object.
-                      type: string
-                    kind:
-                      description: |-
-                        Kind of the referent.
-                        More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-                      type: string
-                    name:
-                      description: |-
-                        Name of the referent.
-                        More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
-                      type: string
-                    namespace:
-                      description: |-
-                        Namespace of the referent.
-                        More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/
-                      type: string
-                    resourceVersion:
-                      description: |-
-                        Specific resourceVersion to which this reference is made, if any.
-                        More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency
-                      type: string
-                    uid:
-                      description: |-
-                        UID of the referent.
-                        More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids
-                      type: string
-                  type: object
-                  x-kubernetes-map-type: atomic
-                type: array
-            type: object
-          status:
-            description: InferenceModelStatus defines the observed state of InferenceModel
-            properties:
-              conditions:
-                description: Conditions track the state of the InferencePool.
-                items:
-                  description: Condition contains details for one aspect of the current
-                    state of this API Resource.
-                  properties:
-                    lastTransitionTime:
-                      description: |-
-                        lastTransitionTime is the last time the condition transitioned from one status to another.
-                        This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
-                      format: date-time
-                      type: string
-                    message:
-                      description: |-
-                        message is a human readable message indicating details about the transition.
-                        This may be an empty string.
-                      maxLength: 32768
-                      type: string
-                    observedGeneration:
-                      description: |-
-                        observedGeneration represents the .metadata.generation that the condition was set based upon.
-                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
-                        with respect to the current state of the instance.
-                      format: int64
-                      minimum: 0
-                      type: integer
-                    reason:
-                      description: |-
-                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
-                        Producers of specific condition types may define expected values and meanings for this field,
-                        and whether the values are considered a guaranteed API.
-                        The value should be a CamelCase string.
-                        This field may not be empty.
-                      maxLength: 1024
-                      minLength: 1
-                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
-                      type: string
-                    status:
-                      description: status of the condition, one of True, False, Unknown.
-                      enum:
-                      - "True"
-                      - "False"
-                      - Unknown
-                      type: string
-                    type:
-                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
-                      maxLength: 316
-                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
-                      type: string
-                  required:
-                  - lastTransitionTime
-                  - message
-                  - reason
-                  - status
-                  - type
-                  type: object
-                type: array
-            type: object
-        type: object
-    served: true
-    storage: true
-    subresources:
-      status: {}
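The deleted `llmservices.yaml` schema above (recreated under the `inferencemodels` name) describes weight-based traffic splitting across `targetModels`. Assuming the Go types mirror that schema — a `TargetModels` slice on the spec and a `TargetModel` with `Name` and `Weight` — a split between a base model and a canary adapter might be declared like this sketch (the adapter names are hypothetical):

```go
package main

import (
	"fmt"

	"inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1"
)

func main() {
	// Weights are relative shares, not strict percentages: 90/10 here sends
	// roughly nine of every ten requests to the base model.
	model := v1alpha1.InferenceModel{
		Spec: v1alpha1.InferenceModelSpec{
			ModelName: "fake model1",
			PoolRef:   v1alpha1.PoolObjectReference{Name: "test-pool"},
			TargetModels: []v1alpha1.TargetModel{
				{Name: "base-model", Weight: 90},     // hypothetical base model
				{Name: "canary-adapter", Weight: 10}, // hypothetical LoRA adapter
			},
		},
	}
	fmt.Println(len(model.Spec.TargetModels))
}
```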
diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml
index 9f3d8c42..60bac40b 100644
--- a/config/crd/kustomization.yaml
+++ b/config/crd/kustomization.yaml
@@ -2,7 +2,7 @@
 # since it depends on service name and namespace that are out of this kustomize package.
 # It should be run by config/default
 resources:
-- bases/inference.networking.x-k8s.io_llmserverpools.yaml
+- bases/inference.networking.x-k8s.io_inferencepools.yaml
 - bases/inference.networking.x-k8s.io_inferencemodels.yaml
 # +kubebuilder:scaffold:crdkustomizeresource
 
@@ -13,7 +13,7 @@ patches:
 
 # [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix.
 # patches here are for enabling the CA injection for each CRD
-#- path: patches/cainjection_in_llmserverpools.yaml
+#- path: patches/cainjection_in_inferencepools.yaml
 #- path: patches/cainjection_in_inferencemodels.yaml
 # +kubebuilder:scaffold:crdkustomizecainjectionpatch
 
diff --git a/config/rbac/llmservice_editor_role.yaml b/config/rbac/inferencemodel_editor_role.yaml
similarity index 100%
rename from config/rbac/llmservice_editor_role.yaml
rename to config/rbac/inferencemodel_editor_role.yaml
diff --git a/config/rbac/llmservice_viewer_role.yaml b/config/rbac/inferencemodel_viewer_role.yaml
similarity index 100%
rename from config/rbac/llmservice_viewer_role.yaml
rename to config/rbac/inferencemodel_viewer_role.yaml
diff --git a/config/rbac/llmserverpool_editor_role.yaml b/config/rbac/inferencepool_editor_role.yaml
similarity index 74%
rename from config/rbac/llmserverpool_editor_role.yaml
rename to config/rbac/inferencepool_editor_role.yaml
index 54139d22..cc1f7c35 100644
--- a/config/rbac/llmserverpool_editor_role.yaml
+++ b/config/rbac/inferencepool_editor_role.yaml
@@ -1,16 +1,16 @@
-# permissions for end users to edit llmserverpools.
+# permissions for end users to edit inferencepools.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
     app.kubernetes.io/name: api
     app.kubernetes.io/managed-by: kustomize
-  name: llmserverpool-editor-role
+  name: inferencepool-editor-role
 rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - llmserverpools
+  - inferencepools
   verbs:
   - create
   - delete
@@ -22,6 +22,6 @@ rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - llmserverpools/status
+  - inferencepools/status
   verbs:
   - get
diff --git a/config/rbac/llmserverpool_viewer_role.yaml b/config/rbac/inferencepool_viewer_role.yaml
similarity index 71%
rename from config/rbac/llmserverpool_viewer_role.yaml
rename to config/rbac/inferencepool_viewer_role.yaml
index c3355ba2..828e0022 100644
--- a/config/rbac/llmserverpool_viewer_role.yaml
+++ b/config/rbac/inferencepool_viewer_role.yaml
@@ -1,16 +1,16 @@
-# permissions for end users to view llmserverpools.
+# permissions for end users to view inferencepools.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
     app.kubernetes.io/name: api
     app.kubernetes.io/managed-by: kustomize
-  name: llmserverpool-viewer-role
+  name: inferencepool-viewer-role
 rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - llmserverpools
+  - inferencepools
   verbs:
   - get
   - list
@@ -18,6 +18,6 @@ rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - llmserverpools/status
+  - inferencepools/status
   verbs:
   - get
diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml
index 45cadfd7..c3a52137 100644
--- a/config/rbac/kustomization.yaml
+++ b/config/rbac/kustomization.yaml
@@ -24,6 +24,6 @@ resources:
 # if you do not want those helpers be installed with your Project.
 - inferencemodel_editor_role.yaml
 - inferencemodel_viewer_role.yaml
-- llmserverpool_editor_role.yaml
-- llmserverpool_viewer_role.yaml
+- inferencepool_editor_role.yaml
+- inferencepool_viewer_role.yaml
 
diff --git a/config/samples/gateway_v1alpha1_llmservice.yaml b/config/samples/gateway_v1alpha1_inferencemodel.yaml
similarity index 100%
rename from config/samples/gateway_v1alpha1_llmservice.yaml
rename to config/samples/gateway_v1alpha1_inferencemodel.yaml
diff --git a/config/samples/gateway_v1alpha1_llmserverpool.yaml b/config/samples/gateway_v1alpha1_inferencepool.yaml
similarity index 89%
rename from config/samples/gateway_v1alpha1_llmserverpool.yaml
rename to config/samples/gateway_v1alpha1_inferencepool.yaml
index b734e2bd..54046489 100644
--- a/config/samples/gateway_v1alpha1_llmserverpool.yaml
+++ b/config/samples/gateway_v1alpha1_inferencepool.yaml
@@ -4,7 +4,7 @@ metadata:
   labels:
     app.kubernetes.io/name: api
     app.kubernetes.io/managed-by: kustomize
-  name: llmserverpool-sample
+  name: inferencepool-sample
 spec:
   serviceRefs:
   - gemini-jetstream-tpu-v5e-service
diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml
index e6def51a..e4b9f2e8 100644
--- a/config/samples/kustomization.yaml
+++ b/config/samples/kustomization.yaml
@@ -1,5 +1,5 @@
 ## Append samples of your project ##
 resources:
-- gateway_v1alpha1_llmserverpool.yaml
+- gateway_v1alpha1_inferencepool.yaml
 - gateway_v1alpha1_inferencemodel.yaml
 # +kubebuilder:scaffold:manifestskustomizesamples
diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/backend/datastore.go
index 2291b9ec..b6d46f43 100644
--- a/pkg/ext-proc/backend/datastore.go
+++ b/pkg/ext-proc/backend/datastore.go
@@ -76,7 +76,7 @@ func (s *K8sDatastore) FetchModelData(modelName string) (returnModel *v1alpha1.I
 }
 
 func RandomWeightedDraw(model *v1alpha1.InferenceModel, seed int64) string {
-	weights := 0
+	var weights int32
 
 	source := rand.NewSource(rand.Int63())
 	if seed > 0 {
@@ -87,7 +87,7 @@ func RandomWeightedDraw(model *v1alpha1.InferenceModel, seed int64) string {
 		weights += model.Weight
 	}
 	klog.V(3).Infof("Weights for Model(%v) total to: %v", model.Name, weights)
-	randomVal := r.Intn(weights)
+	randomVal := r.Int31n(weights)
 	for _, model := range model.Spec.TargetModels {
 		if randomVal < model.Weight {
 			return model.Name
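The `datastore.go` hunks switch the accumulator and the draw to `int32` so they line up with the `Weight` field's type, replacing `r.Intn` with `r.Int31n`. A self-contained sketch of the same cumulative-draw technique, using plain local types rather than the package's own:

```go
package main

import (
	"fmt"
	"math/rand"
)

type targetModel struct {
	Name   string
	Weight int32
}

// weightedDraw sums the int32 weights, draws one value in [0, total) with
// Int31n, and returns the first model whose weight bucket contains the draw.
// Int31n panics on a non-positive total, so weights must sum above zero.
func weightedDraw(models []targetModel, seed int64) string {
	var total int32
	for _, m := range models {
		total += m.Weight
	}
	r := rand.New(rand.NewSource(seed))
	v := r.Int31n(total)
	for _, m := range models {
		if v < m.Weight {
			return m.Name
		}
		v -= m.Weight // move to the next bucket
	}
	return ""
}

func main() {
	models := []targetModel{{"v1", 60}, {"v1.1", 40}}
	fmt.Println(weightedDraw(models, 420))
}
```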
"v1.1", }, } - var seedVal int64 - seedVal = 420 + var seedVal int64 = 420 for _, test := range tests { t.Run(test.name, func(t *testing.T) { for range 10000 { diff --git a/pkg/ext-proc/backend/endpointslice_reconciler.go b/pkg/ext-proc/backend/endpointslice_reconciler.go index 3beb0105..f99ff61f 100644 --- a/pkg/ext-proc/backend/endpointslice_reconciler.go +++ b/pkg/ext-proc/backend/endpointslice_reconciler.go @@ -47,13 +47,18 @@ func (c *EndpointSliceReconciler) Reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, nil } -func (c *EndpointSliceReconciler) updateDatastore(slice *discoveryv1.EndpointSlice, inferencePool *v1alpha1.InferencePool) { +func (c *EndpointSliceReconciler) updateDatastore( + slice *discoveryv1.EndpointSlice, + inferencePool *v1alpha1.InferencePool) { podMap := make(map[Pod]bool) for _, endpoint := range slice.Endpoints { klog.V(4).Infof("Zone: %v \n endpoint: %+v \n", c.Zone, endpoint) if c.validPod(endpoint) { - pod := Pod{Name: *&endpoint.TargetRef.Name, Address: endpoint.Addresses[0] + ":" + fmt.Sprint(inferencePool.Spec.TargetPort)} + pod := Pod{ + Name: endpoint.TargetRef.Name, + Address: endpoint.Addresses[0] + ":" + fmt.Sprint(inferencePool.Spec.TargetPortNumber), + } podMap[pod] = true c.Datastore.pods.Store(pod, true) } @@ -93,12 +98,14 @@ func (c *EndpointSliceReconciler) SetupWithManager(mgr ctrl.Manager) error { } return ctrl.NewControllerManagedBy(mgr). - For(&discoveryv1.EndpointSlice{}, builder.WithPredicates(predicate.NewPredicateFuncs(inferencePoolAvailable), predicate.NewPredicateFuncs(ownsEndPointSlice))). + For(&discoveryv1.EndpointSlice{}, + builder.WithPredicates(predicate.NewPredicateFuncs(inferencePoolAvailable), + predicate.NewPredicateFuncs(ownsEndPointSlice))). Complete(c) } func (c *EndpointSliceReconciler) validPod(endpoint discoveryv1.Endpoint) bool { validZone := c.Zone == "" || c.Zone != "" && *endpoint.Zone == c.Zone - return validZone && *endpoint.Conditions.Ready == true + return validZone && *endpoint.Conditions.Ready } diff --git a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go index 0199905f..16bcd8c2 100644 --- a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go +++ b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go @@ -28,7 +28,7 @@ func TestUpdateDatastore_EndpointSliceReconciler(t *testing.T) { pods: populateMap(basePod1, basePod2), inferencePool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ - TargetPort: int32(8000), + TargetPortNumber: int32(8000), }, }, }, @@ -74,7 +74,7 @@ func TestUpdateDatastore_EndpointSliceReconciler(t *testing.T) { pods: populateMap(basePod1, basePod2), inferencePool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ - TargetPort: int32(8000), + TargetPortNumber: int32(8000), }, }, }, @@ -120,7 +120,7 @@ func TestUpdateDatastore_EndpointSliceReconciler(t *testing.T) { pods: populateMap(basePod1, basePod2), inferencePool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ - TargetPort: int32(8000), + TargetPortNumber: int32(8000), }, }, }, @@ -167,7 +167,9 @@ func TestUpdateDatastore_EndpointSliceReconciler(t *testing.T) { endpointSliceReconciler.updateDatastore(test.incomingSlice, test.datastore.inferencePool) if mapsEqual(endpointSliceReconciler.Datastore.pods, test.wantPods) { - t.Errorf("Unexpected output pod mismatch. \n Got %v \n Want: %v \n", endpointSliceReconciler.Datastore.pods, test.wantPods) + t.Errorf("Unexpected output pod mismatch. 
diff --git a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go
index 0199905f..16bcd8c2 100644
--- a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go
+++ b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go
@@ -28,7 +28,7 @@ func TestUpdateDatastore_EndpointSliceReconciler(t *testing.T) {
 			pods: populateMap(basePod1, basePod2),
 			inferencePool: &v1alpha1.InferencePool{
 				Spec: v1alpha1.InferencePoolSpec{
-					TargetPort: int32(8000),
+					TargetPortNumber: int32(8000),
 				},
 			},
 		},
@@ -74,7 +74,7 @@
 			pods: populateMap(basePod1, basePod2),
 			inferencePool: &v1alpha1.InferencePool{
 				Spec: v1alpha1.InferencePoolSpec{
-					TargetPort: int32(8000),
+					TargetPortNumber: int32(8000),
 				},
 			},
 		},
@@ -120,7 +120,7 @@
 			pods: populateMap(basePod1, basePod2),
 			inferencePool: &v1alpha1.InferencePool{
 				Spec: v1alpha1.InferencePoolSpec{
-					TargetPort: int32(8000),
+					TargetPortNumber: int32(8000),
 				},
 			},
 		},
@@ -167,7 +167,9 @@
 			endpointSliceReconciler.updateDatastore(test.incomingSlice, test.datastore.inferencePool)
 
 			if mapsEqual(endpointSliceReconciler.Datastore.pods, test.wantPods) {
-				t.Errorf("Unexpected output pod mismatch. \n Got %v \n Want: %v \n", endpointSliceReconciler.Datastore.pods, test.wantPods)
+				t.Errorf("Unexpected output pod mismatch. \n Got %v \n Want: %v \n",
+					endpointSliceReconciler.Datastore.pods,
+					test.wantPods)
 			}
 		})
 	}
diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go
index a616c899..9f1ef6ed 100644
--- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go
+++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go
@@ -12,7 +12,7 @@ var (
 	service1 = &v1alpha1.InferenceModel{
 		Spec: v1alpha1.InferenceModelSpec{
 			ModelName: "fake model1",
-			PoolRef:   &v1alpha1.PoolObjectReference{Name: "test-pool"},
+			PoolRef:   v1alpha1.PoolObjectReference{Name: "test-pool"},
 		},
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "test-service",
@@ -21,7 +21,7 @@
 	service1Modified = &v1alpha1.InferenceModel{
 		Spec: v1alpha1.InferenceModelSpec{
 			ModelName: "fake model1",
-			PoolRef:   &v1alpha1.PoolObjectReference{Name: "test-poolio"},
+			PoolRef:   v1alpha1.PoolObjectReference{Name: "test-poolio"},
 		},
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "test-service",
@@ -30,7 +30,7 @@
 	service2 = &v1alpha1.InferenceModel{
 		Spec: v1alpha1.InferenceModelSpec{
 			ModelName: "fake model",
-			PoolRef:   &v1alpha1.PoolObjectReference{Name: "test-pool"},
+			PoolRef:   v1alpha1.PoolObjectReference{Name: "test-pool"},
 		},
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "test-service-2",
@@ -50,7 +50,7 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) {
 			datastore: &K8sDatastore{
 				inferencePool: &v1alpha1.InferencePool{
 					Spec: v1alpha1.InferencePoolSpec{
-						Selector: map[v1alpha1.LabelString]v1alpha1.LabelString{"app": "vllm"},
+						Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"},
 					},
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test-pool",
@@ -67,7 +67,7 @@
 			datastore: &K8sDatastore{
 				inferencePool: &v1alpha1.InferencePool{
 					Spec: v1alpha1.InferencePoolSpec{
-						Selector: map[v1alpha1.LabelString]v1alpha1.LabelString{"app": "vllm"},
+						Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"},
 					},
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test-pool",
@@ -84,7 +84,7 @@
 			datastore: &K8sDatastore{
 				inferencePool: &v1alpha1.InferencePool{
 					Spec: v1alpha1.InferencePoolSpec{
-						Selector: map[v1alpha1.LabelString]v1alpha1.LabelString{"app": "vllm"},
+						Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"},
 					},
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test-pool",
@@ -96,7 +96,7 @@
 			incomingService: &v1alpha1.InferenceModel{
 				Spec: v1alpha1.InferenceModelSpec{
 					ModelName: "fake model",
-					PoolRef:   &v1alpha1.PoolObjectReference{Name: "test-poolio"},
+					PoolRef:   v1alpha1.PoolObjectReference{Name: "test-poolio"},
 				},
 				ObjectMeta: metav1.ObjectMeta{
 					Name: "unrelated-service",
@@ -109,7 +109,7 @@
 			datastore: &K8sDatastore{
 				inferencePool: &v1alpha1.InferencePool{
 					Spec: v1alpha1.InferencePoolSpec{
-						Selector: map[v1alpha1.LabelString]v1alpha1.LabelString{"app": "vllm"},
+						Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"},
 					},
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test-pool",
@@ -124,7 +124,10 @@
 	}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			InferenceModelReconciler := &InferenceModelReconciler{Datastore: test.datastore, ServerPoolName: test.datastore.inferencePool.Name}
+			InferenceModelReconciler := &InferenceModelReconciler{
+				Datastore:      test.datastore,
+				ServerPoolName: test.datastore.inferencePool.Name,
+			}
 			InferenceModelReconciler.updateDatastore(test.incomingService)
 
 			if ok := mapsEqual(InferenceModelReconciler.Datastore.InferenceModels, test.wantInferenceModels); !ok {
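The test updates above also reflect the selector map's move from a single `LabelString` type to distinct `LabelKey` and `LabelValue` types, which lets the compiler catch swapped keys and values and lets each side carry its own validation markers. A sketch with stand-in local types (the conversion helper is hypothetical, shown only to illustrate crossing back to a plain string map at the boundary):

```go
package main

import "fmt"

// LabelKey and LabelValue mirror the split seen in the hunks: distinct types
// mean a map that swaps keys and values fails to compile.
type LabelKey string
type LabelValue string

// toServiceSelector converts the typed map to the plain map[string]string
// shape that pod-label comparisons ultimately need.
func toServiceSelector(in map[LabelKey]LabelValue) map[string]string {
	out := make(map[string]string, len(in))
	for k, v := range in {
		out[string(k)] = string(v)
	}
	return out
}

func main() {
	selector := map[LabelKey]LabelValue{"app": "vllm"}
	fmt.Println(toServiceSelector(selector)) // map[app:vllm]
}
```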
diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go
index b53912ff..662b6f41 100644
--- a/pkg/ext-proc/backend/inferencepool_reconciler.go
+++ b/pkg/ext-proc/backend/inferencepool_reconciler.go
@@ -12,10 +12,6 @@ import (
 	ctrl "sigs.k8s.io/controller-runtime"
 )
 
-const (
-	reconcilerNamePrefix = "instance-gateway-"
-)
-
 // InferencePoolReconciler utilizes the controller runtime to reconcile Instance Gateway resources
 // This implementation is just used for reading & maintaining data sync. The Gateway implementation
 // will have the proper controller that will create/manage objects on behalf of the server pool.
@@ -47,7 +43,8 @@
 }
 
 func (c *InferencePoolReconciler) updateDatastore(serverPool *v1alpha1.InferencePool) {
-	if c.Datastore.inferencePool == nil || serverPool.ObjectMeta.ResourceVersion != c.Datastore.inferencePool.ObjectMeta.ResourceVersion {
+	if c.Datastore.inferencePool == nil ||
+		serverPool.ObjectMeta.ResourceVersion != c.Datastore.inferencePool.ObjectMeta.ResourceVersion {
 		c.Datastore.setInferencePool(serverPool)
 	}
 }
diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go
index f074741e..27a29d9a 100644
--- a/pkg/ext-proc/backend/vllm/metrics.go
+++ b/pkg/ext-proc/backend/vllm/metrics.go
@@ -35,7 +35,11 @@ type PodMetricsClientImpl struct {
 }
 
 // FetchMetrics fetches metrics from a given pod.
-func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod backend.Pod, existing *backend.PodMetrics) (*backend.PodMetrics, error) {
+func (p *PodMetricsClientImpl) FetchMetrics(
+	ctx context.Context,
+	pod backend.Pod,
+	existing *backend.PodMetrics,
+) (*backend.PodMetrics, error) {
 	// Currently the metrics endpoint is hard-coded, which works with vLLM.
 	// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16): Consume this from InferencePool config.
 	url := fmt.Sprintf("http://%s/metrics", pod.Address)
@@ -66,7 +70,10 @@ func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod backend.Pod
 // promToPodMetrics updates internal pod metrics with scraped prometheus metrics.
 // A combined error is returned if errors occur in one or more metric processing.
 // it returns a new PodMetrics pointer which can be used to atomically update the pod metrics map.
-func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *backend.PodMetrics) (*backend.PodMetrics, error) {
+func promToPodMetrics(
+	metricFamilies map[string]*dto.MetricFamily,
+	existing *backend.PodMetrics,
+) (*backend.PodMetrics, error) {
 	var errs error
 	updated := existing.Clone()
 	runningQueueSize, _, err := getLatestMetric(metricFamilies, RunningQueueSizeMetricName)
diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go
index f6ac403f..6121fa11 100644
--- a/pkg/ext-proc/backend/vllm/metrics_test.go
+++ b/pkg/ext-proc/backend/vllm/metrics_test.go
@@ -178,7 +178,7 @@ func TestPromToPodMetrics(t *testing.T) {
 				Value: proto.String("lora3,lora4"),
 			},
 			{
-				Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
+				Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
 				Value: proto.String("2a"),
 			},
 		},
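`FetchMetrics` and `promToPodMetrics` consume a `map[string]*dto.MetricFamily`. One plausible way to produce that map from a scraped `/metrics` body is the Prometheus text parser; the metric name below is a stand-in for whatever the model server actually exports:

```go
package main

import (
	"fmt"
	"strings"

	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"
)

func main() {
	// A tiny Prometheus text-format payload standing in for a pod's
	// /metrics response.
	body := `# TYPE vllm:num_requests_running gauge
vllm:num_requests_running 4
`
	var parser expfmt.TextParser
	// TextToMetricFamilies yields map[string]*dto.MetricFamily, the shape
	// promToPodMetrics takes as input.
	families, err := parser.TextToMetricFamilies(strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	mf := families["vllm:num_requests_running"]
	value := mf.GetMetric()[0].GetGauge().GetValue()
	fmt.Println("running queue size:", value)
}
```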