From 4931640fb098e61c20c5313b535335a60b6ee662 Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Thu, 27 Feb 2025 10:53:55 +0800 Subject: [PATCH 1/9] initialize helm template Signed-off-by: Kuromesi --- .../.helmignore | 23 + .../Chart.yaml | 9 + .../crds/crds.yaml | 917 ++++++++++++++++++ .../generated.yaml | 300 ++++++ .../templates/NOTES.txt | 1 + .../templates/_helpers.tpl | 42 + .../templates/enable_patch_policy.yaml | 18 + .../templates/ext_proc.yaml | 73 ++ .../templates/extension_policy.yaml | 29 + .../templates/gateway.yaml | 51 + .../templates/patch_policy.yaml | 47 + .../templates/rbac.yaml | 49 + .../templates/traffic_policy.yaml | 17 + .../values.yaml | 25 + 14 files changed, 1601 insertions(+) create mode 100644 config/manifests/gateway-api-inference-extension/.helmignore create mode 100644 config/manifests/gateway-api-inference-extension/Chart.yaml create mode 100644 config/manifests/gateway-api-inference-extension/crds/crds.yaml create mode 100644 config/manifests/gateway-api-inference-extension/generated.yaml create mode 100644 config/manifests/gateway-api-inference-extension/templates/NOTES.txt create mode 100644 config/manifests/gateway-api-inference-extension/templates/_helpers.tpl create mode 100644 config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml create mode 100644 config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml create mode 100644 config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml create mode 100644 config/manifests/gateway-api-inference-extension/templates/gateway.yaml create mode 100644 config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml create mode 100644 config/manifests/gateway-api-inference-extension/templates/rbac.yaml create mode 100644 config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml create mode 100644 config/manifests/gateway-api-inference-extension/values.yaml diff --git 
a/config/manifests/gateway-api-inference-extension/.helmignore b/config/manifests/gateway-api-inference-extension/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/config/manifests/gateway-api-inference-extension/Chart.yaml b/config/manifests/gateway-api-inference-extension/Chart.yaml new file mode 100644 index 000000000..b6cecc408 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: gateway-api-inference-extension +description: A Helm chart for gateway-api-inference-extension + +type: application + +version: 0.1.0 + +appVersion: "1.16.0" diff --git a/config/manifests/gateway-api-inference-extension/crds/crds.yaml b/config/manifests/gateway-api-inference-extension/crds/crds.yaml new file mode 100644 index 000000000..31e654baf --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/crds/crds.yaml @@ -0,0 +1,917 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: inferencemodels.inference.networking.x-k8s.io +spec: + group: inference.networking.x-k8s.io + names: + kind: InferenceModel + listKind: InferenceModelList + plural: inferencemodels + singular: inferencemodel + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: InferenceModel is the Schema for the InferenceModels API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. 
+ Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + InferenceModelSpec represents the desired state of a specific model use case. This resource is + managed by the "Inference Workload Owner" persona. + + The Inference Workload Owner persona is someone that trains, verifies, and + leverages a large language model from a model frontend, drives the lifecycle + and rollout of new versions of those models, and defines the specific + performance and latency goals for the model. These workloads are + expected to operate within an InferencePool sharing compute capacity with other + InferenceModels, defined by the Inference Platform Admin. + + InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, + if the name is reused, an error will be shown on the status of a + InferenceModel that attempted to reuse. The oldest InferenceModel, based on + creation timestamp, will be selected to remain valid. In the event of a race + condition, one will be selected at random. + properties: + criticality: + description: |- + Criticality defines how important it is to serve the model compared to other models referencing the same pool. + Criticality impacts how traffic is handled in resource constrained situations. It handles this by + queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will + fairly share resources over throughput of tokens. 
In the future, the metric used to calculate fairness, + and the proportionality of fairness will be configurable. + + Default values for this field will not be set, to allow for future additions of new fields that may form a 'one of' with this field. + Any implementations that may consume this field may treat an unset value as the 'Standard' range. + enum: + - Critical + - Standard + - Sheddable + type: string + modelName: + description: |- + ModelName is the name of the model as it will be set in the "model" parameter for an incoming request. + ModelNames must be unique for a referencing InferencePool + (names can be reused for a different pool in the same cluster). + The modelName with the oldest creation timestamp is retained, and the incoming + InferenceModel's Ready status is set to false with a corresponding reason. + In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. + Names can be reserved without an underlying model configured in the pool. + This can be done by specifying a target model and setting the weight to zero, + an error will be returned specifying that no valid target model is found. + maxLength: 256 + type: string + poolRef: + description: PoolRef is a reference to the inference pool, the pool + must exist in the same namespace. + properties: + group: + default: inference.networking.x-k8s.io + description: Group is the group of the referent. + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: InferencePool + description: Kind is the kind of the referent. For example "InferencePool". + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + description: Name is the name of the referent. 
+ maxLength: 253 + minLength: 1 + type: string + required: + - name + type: object + targetModels: + description: |- + TargetModels allow multiple versions of a model for traffic splitting. + If not specified, the target model name is defaulted to the modelName parameter. + modelName is often in reference to a LoRA adapter. + items: + description: |- + TargetModel represents a deployed model or a LoRA adapter. The + Name field is expected to match the name of the LoRA adapter + (or base model) as it is registered within the model server. Inference + Gateway assumes that the model exists on the model server and it's the + responsibility of the user to validate a correct match. Should a model fail + to exist at request time, the error is processed by the Inference Gateway + and emitted on the appropriate InferenceModel object. + properties: + name: + description: Name is the name of the adapter or base model, + as expected by the ModelServer. + maxLength: 253 + type: string + weight: + description: |- + Weight is used to determine the proportion of traffic that should be + sent to this model when multiple target models are specified. + + Weight defines the proportion of requests forwarded to the specified + model. This is computed as weight/(sum of all weights in this + TargetModels list). For non-zero values, there may be some epsilon from + the exact proportion defined here depending on the precision an + implementation supports. Weight is not a percentage and the sum of + weights does not need to equal 100. + + If a weight is set for any targetModel, it must be set for all targetModels. + Conversely weights are optional, so long as ALL targetModels do not specify a weight. + format: int32 + maximum: 1000000 + minimum: 0 + type: integer + required: + - name + type: object + maxItems: 10 + type: array + x-kubernetes-validations: + - message: Weights should be set for all models, or none of the models. 
+ rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight)) + required: + - modelName + - poolRef + type: object + status: + description: InferenceModelStatus defines the observed state of InferenceModel + properties: + conditions: + default: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Ready + description: |- + Conditions track the state of the InferenceModel. + + Known condition types are: + + * "Accepted" + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. 
+ maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + type: object + type: object + served: true + storage: false + subresources: + status: {} + - name: v1alpha2 + schema: + openAPIV3Schema: + description: InferenceModel is the Schema for the InferenceModels API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + InferenceModelSpec represents the desired state of a specific model use case. This resource is + managed by the "Inference Workload Owner" persona. 
+ + The Inference Workload Owner persona is someone that trains, verifies, and + leverages a large language model from a model frontend, drives the lifecycle + and rollout of new versions of those models, and defines the specific + performance and latency goals for the model. These workloads are + expected to operate within an InferencePool sharing compute capacity with other + InferenceModels, defined by the Inference Platform Admin. + + InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, + if the name is reused, an error will be shown on the status of an + InferenceModel that attempted to reuse. The oldest InferenceModel, based on + creation timestamp, will be selected to remain valid. In the event of a race + condition, one will be selected at random. + properties: + criticality: + description: |- + Criticality defines how important it is to serve the model compared to other models referencing the same pool. + Criticality impacts how traffic is handled in resource constrained situations. It handles this by + queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will + fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness, + and the proportionality of fairness will be configurable. + + Default values for this field will not be set, to allow for future additions of new fields that may form a 'one of' with this field. + Any implementations that may consume this field may treat an unset value as the 'Standard' range. + enum: + - Critical + - Standard + - Sheddable + type: string + modelName: + description: |- + ModelName is the name of the model as it will be set in the "model" parameter for an incoming request. + ModelNames must be unique for a referencing InferencePool + (names can be reused for a different pool in the same cluster). 
+ The modelName with the oldest creation timestamp is retained, and the incoming + InferenceModel's Ready status is set to false with a corresponding reason. + In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. + Names can be reserved without an underlying model configured in the pool. + This can be done by specifying a target model and setting the weight to zero, + an error will be returned specifying that no valid target model is found. + maxLength: 256 + type: string + poolRef: + description: PoolRef is a reference to the inference pool, the pool + must exist in the same namespace. + properties: + group: + default: inference.networking.x-k8s.io + description: Group is the group of the referent. + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: InferencePool + description: Kind is the kind of the referent. For example "InferencePool". + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + description: Name is the name of the referent. + maxLength: 253 + minLength: 1 + type: string + required: + - name + type: object + targetModels: + description: |- + TargetModels allow multiple versions of a model for traffic splitting. + If not specified, the target model name is defaulted to the modelName parameter. + modelName is often in reference to a LoRA adapter. + items: + description: |- + TargetModel represents a deployed model or a LoRA adapter. The + Name field is expected to match the name of the LoRA adapter + (or base model) as it is registered within the model server. Inference + Gateway assumes that the model exists on the model server and it's the + responsibility of the user to validate a correct match. Should a model fail + to exist at request time, the error is processed by the Inference Gateway + and emitted on the appropriate InferenceModel object. 
+ properties: + name: + description: Name is the name of the adapter or base model, + as expected by the ModelServer. + maxLength: 253 + type: string + weight: + description: |- + Weight is used to determine the proportion of traffic that should be + sent to this model when multiple target models are specified. + + Weight defines the proportion of requests forwarded to the specified + model. This is computed as weight/(sum of all weights in this + TargetModels list). For non-zero values, there may be some epsilon from + the exact proportion defined here depending on the precision an + implementation supports. Weight is not a percentage and the sum of + weights does not need to equal 100. + + If a weight is set for any targetModel, it must be set for all targetModels. + Conversely weights are optional, so long as ALL targetModels do not specify a weight. + format: int32 + maximum: 1000000 + minimum: 0 + type: integer + required: + - name + type: object + maxItems: 10 + type: array + x-kubernetes-validations: + - message: Weights should be set for all models, or none of the models. + rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight)) + required: + - modelName + - poolRef + type: object + status: + description: InferenceModelStatus defines the observed state of InferenceModel + properties: + conditions: + default: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Ready + description: |- + Conditions track the state of the InferenceModel. + + Known condition types are: + + * "Accepted" + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. 
If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: inferencepools.inference.networking.x-k8s.io +spec: + group: inference.networking.x-k8s.io + names: + kind: InferencePool + listKind: InferencePoolList + plural: inferencepools + singular: inferencepool + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: InferencePool is the Schema for the InferencePools API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: InferencePoolSpec defines the desired state of InferencePool + properties: + extensionRef: + description: Extension configures an endpoint picker as an extension + service. + properties: + failureMode: + default: FailClose + description: |- + Configures how the gateway handles the case when the extension is not responsive. + Defaults to FailClose. 
+ enum: + - FailOpen + - FailClose + type: string + group: + default: "" + description: |- + Group is the group of the referent. + When unspecified or empty string, core API group is inferred. + type: string + kind: + default: Service + description: |- + Kind is the Kubernetes resource kind of the referent. For example + "Service". + + Defaults to "Service" when not specified. + + ExternalName services can refer to CNAME DNS records that may live + outside of the cluster and as such are difficult to reason about in + terms of conformance. They also may not be safe to forward to (see + CVE-2021-25740 for more information). Implementations MUST NOT + support ExternalName Services. + type: string + name: + description: Name is the name of the referent. + type: string + targetPortNumber: + description: |- + The port number on the pods running the extension. When unspecified, implementations SHOULD infer a + default value of 9002 when the Kind is Service. + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - name + type: object + selector: + additionalProperties: + description: |- + LabelValue is the value of a label. This is used for validation + of maps. This matches the Kubernetes label validation rules: + * must be 63 characters or less (can be empty), + * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), + * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. + + Valid values include: + + * MyValue + * my.name + * 123-my-value + maxLength: 63 + minLength: 0 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + type: string + description: |- + Selector defines a map of labels to watch model server pods + that should be included in the InferencePool. + In some cases, implementations may translate this field to a Service selector, so this matches the simple + map used for Service selectors instead of the full Kubernetes LabelSelector type. 
+ type: object + targetPortNumber: + description: |- + TargetPortNumber defines the port number to access the selected model servers. + The number must be in the range 1 to 65535. + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - extensionRef + - selector + - targetPortNumber + type: object + status: + description: InferencePoolStatus defines the observed state of InferencePool + properties: + conditions: + default: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Ready + description: |- + Conditions track the state of the InferencePool. + + Known condition types are: + + * "Ready" + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. 
+ The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + type: object + type: object + served: true + storage: false + subresources: + status: {} + - name: v1alpha2 + schema: + openAPIV3Schema: + description: InferencePool is the Schema for the InferencePools API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: InferencePoolSpec defines the desired state of InferencePool + properties: + extensionRef: + description: Extension configures an endpoint picker as an extension + service. 
+ properties: + failureMode: + default: FailClose + description: |- + Configures how the gateway handles the case when the extension is not responsive. + Defaults to FailClose. + enum: + - FailOpen + - FailClose + type: string + group: + default: "" + description: |- + Group is the group of the referent. + When unspecified or empty string, core API group is inferred. + type: string + kind: + default: Service + description: |- + Kind is the Kubernetes resource kind of the referent. For example + "Service". + + Defaults to "Service" when not specified. + + ExternalName services can refer to CNAME DNS records that may live + outside of the cluster and as such are difficult to reason about in + terms of conformance. They also may not be safe to forward to (see + CVE-2021-25740 for more information). Implementations MUST NOT + support ExternalName Services. + type: string + name: + description: Name is the name of the referent. + type: string + targetPortNumber: + description: |- + The port number on the service running the extension. When unspecified, implementations SHOULD infer a + default value of 9002 when the Kind is Service. + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - name + type: object + selector: + additionalProperties: + description: |- + LabelValue is the value of a label. This is used for validation + of maps. This matches the Kubernetes label validation rules: + * must be 63 characters or less (can be empty), + * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), + * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. + + Valid values include: + + * MyValue + * my.name + * 123-my-value + maxLength: 63 + minLength: 0 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + type: string + description: |- + Selector defines a map of labels to watch model server pods + that should be included in the InferencePool. 
+ In some cases, implementations may translate this field to a Service selector, so this matches the simple + map used for Service selectors instead of the full Kubernetes LabelSelector type. + type: object + targetPortNumber: + description: |- + TargetPortNumber defines the port number to access the selected model servers. + The number must be in the range 1 to 65535. + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - extensionRef + - selector + - targetPortNumber + type: object + status: + description: InferencePoolStatus defines the observed state of InferencePool + properties: + parent: + description: |- + Parents is a list of parent resources (usually Gateways) that are + associated with the route, and the status of the InferencePool with respect to + each parent. + + A maximum of 32 Gateways will be represented in this list. An empty list + means the route has not been attached to any Gateway. + items: + description: PoolStatus defines the observed state of InferencePool + from a gateway. + properties: + conditions: + default: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Ready + description: |- + Conditions track the state of the InferencePool. + + Known condition types are: + + * "Ready" + items: + description: Condition contains details for one aspect of + the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. 
+ maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, + Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + parentRef: + description: GatewayRef indicates the gateway that observed + state of InferencePool. + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: |- + If referring to a piece of an object instead of an entire object, this string + should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2]. 
+ For example, if the object reference is to a container within a pod, this would take on a value like: + "spec.containers{name}" (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" (container with + index 2 in this pod). This syntax is chosen only to have some well-defined way of + referencing a part of an object. + type: string + kind: + description: |- + Kind of the referent. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + name: + description: |- + Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + namespace: + description: |- + Namespace of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ + type: string + resourceVersion: + description: |- + Specific resourceVersion to which this reference is made, if any. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency + type: string + uid: + description: |- + UID of the referent. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids + type: string + type: object + x-kubernetes-map-type: atomic + required: + - parentRef + type: object + maxItems: 32 + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/generated.yaml b/config/manifests/gateway-api-inference-extension/generated.yaml new file mode 100644 index 000000000..3a1980294 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/generated.yaml @@ -0,0 +1,300 @@ +--- +# Source: gateway-api-inference-extension/templates/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: inference-gateway-ext-proc-release-name + namespace: default + labels: + app: inference-gateway-ext-proc-release-name +--- +# Source: gateway-api-inference-extension/templates/enable_patch_policy.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: envoy-gateway-config + namespace: envoy-gateway-system +data: + envoy-gateway.yaml: | + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyGateway + provider: + type: Kubernetes + gateway: + controllerName: gateway.envoyproxy.io/gatewayclass-controller + extensionApis: + enableEnvoyPatchPolicy: true + enableBackend: true +--- +# Source: gateway-api-inference-extension/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: inference-extension-default-release-name +rules: +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencemodels"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools"] + verbs: ["get", "watch", "list"] +- apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["get", "watch", "list"] +- apiGroups: + - authentication.k8s.io + resources: + - 
tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +# Source: gateway-api-inference-extension/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: inference-extension-default-release-name +subjects: +- kind: ServiceAccount + name: inference-gateway-ext-proc-release-name + namespace: default +roleRef: + kind: ClusterRole + name: inference-extension-default-release-name +--- +# Source: gateway-api-inference-extension/templates/ext_proc.yaml +apiVersion: v1 +kind: Service +metadata: + name: inference-gateway-ext-proc-release-name + namespace: default +spec: + selector: + app: inference-gateway-ext-proc-release-name + ports: + - name: grpc + protocol: TCP + port: 9002 + targetPort: 9002 + - name: http-metrics + protocol: TCP + port: 9090 + targetPort: 9090 + type: ClusterIP +--- +# Source: gateway-api-inference-extension/templates/ext_proc.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference-gateway-ext-proc-release-name + namespace: default + labels: + app: inference-gateway-ext-proc-release-name +spec: + replicas: 1 + selector: + matchLabels: + app: inference-gateway-ext-proc-release-name + template: + metadata: + labels: + app: inference-gateway-ext-proc-release-name + spec: + serviceAccountName: inference-gateway-ext-proc-release-name + containers: + - name: inference-gateway-ext-proc + image: registry-cn-hangzhou.ack.aliyuncs.com/dev/gateway-api-inference-extension/epp:main + imagePullPolicy: Always + args: + - -poolName + - vllm-llama2-7b-pool + - -poolNamespace + - default + - -v + - "3" + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + - -metricsPort + - "9090" + ports: + - name: grpc + containerPort: 9002 + - name: grpc-health + containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 
10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 +--- +# Source: gateway-api-inference-extension/templates/gateway.yaml +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: Backend +metadata: + name: backend-release-name +spec: + endpoints: + - fqdn: + hostname: 'foo.bar.com' + port: 8080 +--- +# Source: gateway-api-inference-extension/templates/traffic_policy.yaml +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: BackendTrafficPolicy +metadata: + name: high-connection-route-policy-release-name # make sure the reference has the "." prefix + namespace: +spec: + targetRefs: + - group: gateway.networking.k8s.io + kind: HTTPRoute + name: llm-route-release-name + circuitBreaker: + maxConnections: 40000 + maxPendingRequests: 40000 + maxParallelRequests: 40000 + timeout: + tcp: + connectTimeout: 24h +--- +# Source: gateway-api-inference-extension/templates/extension_policy.yaml +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyExtensionPolicy +metadata: + name: ext-proc-policy-release-name + namespace: default +spec: + extProc: + - backendRefs: + - group: "" + kind: Service + name: inference-gateway-ext-proc-release-name + port: 9002 + processingMode: + request: + body: Buffered + response: + messageTimeout: 1000s + backendSettings: + circuitBreaker: + maxConnections: 40000 + maxPendingRequests: 40000 + maxParallelRequests: 40000 + timeout: + tcp: + connectTimeout: 24h + targetRef: + group: gateway.networking.k8s.io + kind: HTTPRoute + name: llm-route-release-name +--- +# Source: gateway-api-inference-extension/templates/patch_policy.yaml +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyPatchPolicy +metadata: + name: custom-response-patch-policy-release-name + namespace: default +spec: + targetRef: + group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway-release-name + type: JSONPatch + jsonPatches: + - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" + name: original_destination_cluster + 
operation: + op: add + path: "" + value: + name: original_destination_cluster + type: ORIGINAL_DST + original_dst_lb_config: + use_http_header: true + http_header_name: "x-gateway-destination-endpoint" + connect_timeout: 1000s + lb_policy: CLUSTER_PROVIDED + dns_lookup_family: V4_ONLY + circuit_breakers: + thresholds: + - max_connections: 40000 + max_pending_requests: 40000 + max_requests: 40000 + - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" + name: "envoyextensionpolicy/default/ext-proc-policy-release-name/extproc/0" + operation: + op: add + path: "/transport_socket" + value: + name: "envoy.transport_sockets.tls" + typed_config: + "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext" + common_tls_context: {} + - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" + name: default/inference-gateway-release-name/llm-gw + operation: + op: replace + path: "/virtual_hosts/0/routes/0/route/cluster" + value: original_destination_cluster +--- +# Source: gateway-api-inference-extension/templates/gateway.yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: inference-gateway-release-name + namespace: default +spec: + gatewayClassName: inference-gateway-release-name + listeners: + - name: http + protocol: HTTP + port: 8080 + - name: llm-gw + protocol: HTTP + port: 8081 +--- +# Source: gateway-api-inference-extension/templates/gateway.yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: GatewayClass +metadata: + name: inference-gateway-release-name +spec: + controllerName: gateway.envoyproxy.io/gatewayclass-controller +--- +# Source: gateway-api-inference-extension/templates/gateway.yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route-release-name + namespace: default +spec: + parentRefs: + - name: inference-gateway-release-name + sectionName: llm-gw + rules: + - backendRefs: + - group: gateway.envoyproxy.io + kind: Backend + name: 
backend-release-name + timeouts: + request: "24h" + backendRequest: "24h" diff --git a/config/manifests/gateway-api-inference-extension/templates/NOTES.txt b/config/manifests/gateway-api-inference-extension/templates/NOTES.txt new file mode 100644 index 000000000..5d5ea8794 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/templates/NOTES.txt @@ -0,0 +1 @@ +Gateway api inference extension deployed. \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl new file mode 100644 index 000000000..7294f7f99 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl @@ -0,0 +1,42 @@ +{{- define "httpRoute.name" -}} +llm-route-{{ .Release.Name }} +{{- end -}} + +{{- define "backend.name" -}} +backend-{{ .Release.Name }} +{{- end -}} + +{{- define "gatewayClass.name" -}} +inference-gateway-{{ .Release.Name }} +{{- end -}} + +{{- define "gateway.name" -}} +inference-gateway-{{ .Release.Name }} +{{- end -}} + +{{- define "envoyExtensionPolicy.name" -}} +ext-proc-policy-{{ .Release.Name }} +{{- end -}} + +{{- define "envoyPatchPolicy.name" -}} +custom-response-patch-policy-{{ .Release.Name }} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "gateway-api-inference-extension.selectorLabels" -}} +app: {{ include "gateway-api-inference-extension.name" . 
}} +{{- end -}} + +{{- define "clusterRole.name" -}} +inference-extension-{{ .Release.Namespace }}-{{ .Release.Name }} +{{- end -}} + +{{- define "backendTrafficPolicy.name" -}} +high-connection-route-policy-{{ .Release.Name }} +{{- end -}} + +{{- define "gateway-api-inference-extension.name" -}} +inference-gateway-ext-proc-{{ .Release.Name }} +{{- end -}} diff --git a/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml new file mode 100644 index 000000000..21b0aa866 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml @@ -0,0 +1,18 @@ +{{ if .Values.envoy.enablePatchPolicy }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: envoy-gateway-config + namespace: {{ .Values.envoy.namespace | default "envoy-gateway-system" }} +data: + envoy-gateway.yaml: | + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyGateway + provider: + type: Kubernetes + gateway: + controllerName: gateway.envoyproxy.io/gatewayclass-controller + extensionApis: + enableEnvoyPatchPolicy: true + enableBackend: true +{{ end }} \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml new file mode 100644 index 000000000..bd53c9334 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml @@ -0,0 +1,73 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "gateway-api-inference-extension.name" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ include "gateway-api-inference-extension.name" . }} +spec: + replicas: {{ .Values.inferenceExtension.replicas | default 1 }} + selector: + matchLabels: + {{- include "gateway-api-inference-extension.selectorLabels" . 
| nindent 6 }} + template: + metadata: + labels: + {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }} + spec: + serviceAccountName: {{ include "gateway-api-inference-extension.name" . }} + containers: + - name: inference-gateway-ext-proc + image: {{ .Values.inferenceExtension.image.hub }}:{{ .Values.inferenceExtension.image.tag }} + imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }} + args: + - -poolName + - {{ .Values.inferencePool.name }} + - -poolNamespace + - {{ .Release.Namespace }} + - -v + - {{ .Values.inferenceExtension.logLevel | default 3 | quote }} + - -grpcPort + - {{ .Values.inferenceExtension.grpcPort | default 9002 | quote }} + - -grpcHealthPort + - "9003" + - -metricsPort + - {{ .Values.inferenceExtension.metricsPort | default 9090 | quote }} + ports: + - name: grpc + containerPort: {{ .Values.inferenceExtension.grpcPort | default 9002 }} + - name: grpc-health + containerPort: 9003 + - name: metrics + containerPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ include "gateway-api-inference-extension.name" . }} + namespace: {{ .Release.Namespace }} +spec: + selector: + {{- include "gateway-api-inference-extension.selectorLabels" . 
| nindent 4 }} + ports: + - name: grpc + protocol: TCP + port: {{ .Values.inferenceExtension.grpcPort | default 9002 }} + targetPort: {{ .Values.inferenceExtension.grpcPort | default 9002 }} + - name: http-metrics + protocol: TCP + port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + targetPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + type: ClusterIP diff --git a/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml new file mode 100644 index 000000000..ed84e6f5c --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml @@ -0,0 +1,29 @@ +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyExtensionPolicy +metadata: + name: {{ include "envoyExtensionPolicy.name" . }} + namespace: {{ .Release.Namespace }} +spec: + extProc: + - backendRefs: + - group: "" + kind: Service + name: {{ include "gateway-api-inference-extension.name" . }} + port: {{ .Values.inferenceExtension.grpcPort | default 9002 }} + processingMode: + request: + body: Buffered + response: + messageTimeout: 1000s + backendSettings: + circuitBreaker: + maxConnections: 40000 + maxPendingRequests: 40000 + maxParallelRequests: 40000 + timeout: + tcp: + connectTimeout: 24h + targetRef: + group: gateway.networking.k8s.io + kind: HTTPRoute + name: {{ include "httpRoute.name" . }} diff --git a/config/manifests/gateway-api-inference-extension/templates/gateway.yaml b/config/manifests/gateway-api-inference-extension/templates/gateway.yaml new file mode 100644 index 000000000..f0259f527 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/templates/gateway.yaml @@ -0,0 +1,51 @@ + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: {{ include "gateway.name" . }} + namespace: {{ .Release.Namespace }} +spec: + gatewayClassName: {{ include "gatewayClass.name" . 
}} + listeners: + - name: http + protocol: HTTP + port: 8080 + - name: llm-gw + protocol: HTTP + port: {{ .Values.gateway.port }} +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: GatewayClass +metadata: + name: {{ include "gatewayClass.name" . }} +spec: + controllerName: gateway.envoyproxy.io/gatewayclass-controller +--- +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: Backend +metadata: + name: {{ include "backend.name" . }} +spec: + endpoints: + - fqdn: + hostname: 'foo.bar.com' + port: 8080 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: {{ include "httpRoute.name" . }} + namespace: {{ .Release.Namespace }} +spec: + parentRefs: + - name: {{ include "gateway.name" . }} + sectionName: llm-gw + rules: + - backendRefs: + - group: gateway.envoyproxy.io + kind: Backend + name: {{ include "backend.name" . }} + timeouts: + request: "24h" + backendRequest: "24h" diff --git a/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml new file mode 100644 index 000000000..e789b0e2f --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml @@ -0,0 +1,47 @@ +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyPatchPolicy +metadata: + name: {{ include "envoyPatchPolicy.name" . }} + namespace: {{ .Release.Namespace }} +spec: + targetRef: + group: gateway.networking.k8s.io + kind: Gateway + name: {{ include "gateway.name" . 
}} + type: JSONPatch + jsonPatches: + - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" + name: original_destination_cluster + operation: + op: add + path: "" + value: + name: original_destination_cluster + type: ORIGINAL_DST + original_dst_lb_config: + use_http_header: true + http_header_name: "x-gateway-destination-endpoint" + connect_timeout: 1000s + lb_policy: CLUSTER_PROVIDED + dns_lookup_family: V4_ONLY + circuit_breakers: + thresholds: + - max_connections: 40000 + max_pending_requests: 40000 + max_requests: 40000 + - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" + name: "envoyextensionpolicy/{{ .Release.Namespace }}/{{ include "envoyExtensionPolicy.name" . }}/extproc/0" + operation: + op: add + path: "/transport_socket" + value: + name: "envoy.transport_sockets.tls" + typed_config: + "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext" + common_tls_context: {} + - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" + name: {{ .Release.Namespace }}/{{ include "gateway.name" . }}/llm-gw + operation: + op: replace + path: "/virtual_hosts/0/routes/0/route/cluster" + value: original_destination_cluster diff --git a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml new file mode 100644 index 000000000..73ff0aa6c --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml @@ -0,0 +1,49 @@ +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ include "clusterRole.name" . 
}} +rules: +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencemodels"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools"] + verbs: ["get", "watch", "list"] +- apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["get", "watch", "list"] +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ include "clusterRole.name" . }} +subjects: +- kind: ServiceAccount + name: {{ include "gateway-api-inference-extension.name" . }} + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: {{ include "clusterRole.name" . }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "gateway-api-inference-extension.name" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ include "gateway-api-inference-extension.name" . }} \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml new file mode 100644 index 000000000..92ba989c3 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml @@ -0,0 +1,17 @@ +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: BackendTrafficPolicy +metadata: + name: {{ include "backendTrafficPolicy.name" . }} + namespace: {{ .Release.Namespace }} +spec: + targetRefs: + - group: gateway.networking.k8s.io + kind: HTTPRoute + name: {{ include "httpRoute.name" . 
}} + circuitBreaker: + maxConnections: 40000 + maxPendingRequests: 40000 + maxParallelRequests: 40000 + timeout: + tcp: + connectTimeout: 24h \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/values.yaml b/config/manifests/gateway-api-inference-extension/values.yaml new file mode 100644 index 000000000..cbda8e573 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/values.yaml @@ -0,0 +1,25 @@ +inferenceExtension: + replicas: 1 + image: + hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp + tag: main + pullPolicy: Always + + name: inference-gateway-ext-proc + serviceName: inference-gateway-ext-proc + grpcPort: 9002 + metricsPort: 9090 + logLevel: 3 + +inferencePool: + name: vllm-llama2-7b-pool + +gateway: + port: 8081 + +envoy: + # envoy gateway system namespace + namespace: envoy-gateway-system + + # enabling the Envoy Patch Policy feature + enablePatchPolicy: true From 23664605b40cb4176c5eddb2e165a4824b7e7171 Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Fri, 28 Feb 2025 16:24:05 +0800 Subject: [PATCH 2/9] tidy template Signed-off-by: Kuromesi --- .../Chart.yaml | 2 +- .../crds/crds.yaml | 917 ------------------ .../generated.yaml | 300 ------ .../templates/_helpers.tpl | 48 +- .../templates/enable_patch_policy.yaml | 18 - .../templates/ext_proc.yaml | 24 +- .../templates/extension_policy.yaml | 29 - .../templates/gateway.yaml | 51 - .../templates/patch_policy.yaml | 47 - .../templates/rbac.yaml | 14 +- .../templates/traffic_policy.yaml | 17 - .../values.yaml | 20 +- config/manifests/install.yaml | 137 +++ 13 files changed, 174 insertions(+), 1450 deletions(-) delete mode 100644 config/manifests/gateway-api-inference-extension/crds/crds.yaml delete mode 100644 config/manifests/gateway-api-inference-extension/generated.yaml delete mode 100644 config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml delete mode 100644 
config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml delete mode 100644 config/manifests/gateway-api-inference-extension/templates/gateway.yaml delete mode 100644 config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml delete mode 100644 config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml create mode 100644 config/manifests/install.yaml diff --git a/config/manifests/gateway-api-inference-extension/Chart.yaml b/config/manifests/gateway-api-inference-extension/Chart.yaml index b6cecc408..dd194a652 100644 --- a/config/manifests/gateway-api-inference-extension/Chart.yaml +++ b/config/manifests/gateway-api-inference-extension/Chart.yaml @@ -6,4 +6,4 @@ type: application version: 0.1.0 -appVersion: "1.16.0" +appVersion: "0.1.0" diff --git a/config/manifests/gateway-api-inference-extension/crds/crds.yaml b/config/manifests/gateway-api-inference-extension/crds/crds.yaml deleted file mode 100644 index 31e654baf..000000000 --- a/config/manifests/gateway-api-inference-extension/crds/crds.yaml +++ /dev/null @@ -1,917 +0,0 @@ -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: inferencemodels.inference.networking.x-k8s.io -spec: - group: inference.networking.x-k8s.io - names: - kind: InferenceModel - listKind: InferenceModelList - plural: inferencemodels - singular: inferencemodel - scope: Namespaced - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: InferenceModel is the Schema for the InferenceModels API. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - InferenceModelSpec represents the desired state of a specific model use case. This resource is - managed by the "Inference Workload Owner" persona. - - The Inference Workload Owner persona is someone that trains, verifies, and - leverages a large language model from a model frontend, drives the lifecycle - and rollout of new versions of those models, and defines the specific - performance and latency goals for the model. These workloads are - expected to operate within an InferencePool sharing compute capacity with other - InferenceModels, defined by the Inference Platform Admin. - - InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, - if the name is reused, an error will be shown on the status of a - InferenceModel that attempted to reuse. The oldest InferenceModel, based on - creation timestamp, will be selected to remain valid. In the event of a race - condition, one will be selected at random. - properties: - criticality: - description: |- - Criticality defines how important it is to serve the model compared to other models referencing the same pool. - Criticality impacts how traffic is handled in resource constrained situations. It handles this by - queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will - fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness, - and the proportionality of fairness will be configurable. 
- - Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field. - Any implementations that may consume this field may treat an unset value as the 'Standard' range. - enum: - - Critical - - Standard - - Sheddable - type: string - modelName: - description: |- - ModelName is the name of the model as it will be set in the "model" parameter for an incoming request. - ModelNames must be unique for a referencing InferencePool - (names can be reused for a different pool in the same cluster). - The modelName with the oldest creation timestamp is retained, and the incoming - InferenceModel is sets the Ready status to false with a corresponding reason. - In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. - Names can be reserved without an underlying model configured in the pool. - This can be done by specifying a target model and setting the weight to zero, - an error will be returned specifying that no valid target model is found. - maxLength: 256 - type: string - poolRef: - description: PoolRef is a reference to the inference pool, the pool - must exist in the same namespace. - properties: - group: - default: inference.networking.x-k8s.io - description: Group is the group of the referent. - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: InferencePool - description: Kind is kind of the referent. For example "InferencePool". - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - description: Name is the name of the referent. - maxLength: 253 - minLength: 1 - type: string - required: - - name - type: object - targetModels: - description: |- - TargetModels allow multiple versions of a model for traffic splitting. - If not specified, the target model name is defaulted to the modelName parameter. 
- modelName is often in reference to a LoRA adapter. - items: - description: |- - TargetModel represents a deployed model or a LoRA adapter. The - Name field is expected to match the name of the LoRA adapter - (or base model) as it is registered within the model server. Inference - Gateway assumes that the model exists on the model server and it's the - responsibility of the user to validate a correct match. Should a model fail - to exist at request time, the error is processed by the Inference Gateway - and emitted on the appropriate InferenceModel object. - properties: - name: - description: Name is the name of the adapter or base model, - as expected by the ModelServer. - maxLength: 253 - type: string - weight: - description: |- - Weight is used to determine the proportion of traffic that should be - sent to this model when multiple target models are specified. - - Weight defines the proportion of requests forwarded to the specified - model. This is computed as weight/(sum of all weights in this - TargetModels list). For non-zero values, there may be some epsilon from - the exact proportion defined here depending on the precision an - implementation supports. Weight is not a percentage and the sum of - weights does not need to equal 100. - - If a weight is set for any targetModel, it must be set for all targetModels. - Conversely weights are optional, so long as ALL targetModels do not specify a weight. - format: int32 - maximum: 1000000 - minimum: 0 - type: integer - required: - - name - type: object - maxItems: 10 - type: array - x-kubernetes-validations: - - message: Weights should be set for all models, or none of the models. 
- rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight)) - required: - - modelName - - poolRef - type: object - status: - description: InferenceModelStatus defines the observed state of InferenceModel - properties: - conditions: - default: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Ready - description: |- - Conditions track the state of the InferenceModel. - - Known condition types are: - - * "Accepted" - items: - description: Condition contains details for one aspect of the current - state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. 
- maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - type: object - type: object - served: true - storage: false - subresources: - status: {} - - name: v1alpha2 - schema: - openAPIV3Schema: - description: InferenceModel is the Schema for the InferenceModels API. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - InferenceModelSpec represents the desired state of a specific model use case. This resource is - managed by the "Inference Workload Owner" persona. 
- - The Inference Workload Owner persona is someone that trains, verifies, and - leverages a large language model from a model frontend, drives the lifecycle - and rollout of new versions of those models, and defines the specific - performance and latency goals for the model. These workloads are - expected to operate within an InferencePool sharing compute capacity with other - InferenceModels, defined by the Inference Platform Admin. - - InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, - if the name is reused, an error will be shown on the status of a - InferenceModel that attempted to reuse. The oldest InferenceModel, based on - creation timestamp, will be selected to remain valid. In the event of a race - condition, one will be selected at random. - properties: - criticality: - description: |- - Criticality defines how important it is to serve the model compared to other models referencing the same pool. - Criticality impacts how traffic is handled in resource constrained situations. It handles this by - queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will - fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness, - and the proportionality of fairness will be configurable. - - Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field. - Any implementations that may consume this field may treat an unset value as the 'Standard' range. - enum: - - Critical - - Standard - - Sheddable - type: string - modelName: - description: |- - ModelName is the name of the model as it will be set in the "model" parameter for an incoming request. - ModelNames must be unique for a referencing InferencePool - (names can be reused for a different pool in the same cluster). 
- The modelName with the oldest creation timestamp is retained, and the incoming - InferenceModel is sets the Ready status to false with a corresponding reason. - In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. - Names can be reserved without an underlying model configured in the pool. - This can be done by specifying a target model and setting the weight to zero, - an error will be returned specifying that no valid target model is found. - maxLength: 256 - type: string - poolRef: - description: PoolRef is a reference to the inference pool, the pool - must exist in the same namespace. - properties: - group: - default: inference.networking.x-k8s.io - description: Group is the group of the referent. - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: InferencePool - description: Kind is kind of the referent. For example "InferencePool". - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - description: Name is the name of the referent. - maxLength: 253 - minLength: 1 - type: string - required: - - name - type: object - targetModels: - description: |- - TargetModels allow multiple versions of a model for traffic splitting. - If not specified, the target model name is defaulted to the modelName parameter. - modelName is often in reference to a LoRA adapter. - items: - description: |- - TargetModel represents a deployed model or a LoRA adapter. The - Name field is expected to match the name of the LoRA adapter - (or base model) as it is registered within the model server. Inference - Gateway assumes that the model exists on the model server and it's the - responsibility of the user to validate a correct match. Should a model fail - to exist at request time, the error is processed by the Inference Gateway - and emitted on the appropriate InferenceModel object. 
- properties: - name: - description: Name is the name of the adapter or base model, - as expected by the ModelServer. - maxLength: 253 - type: string - weight: - description: |- - Weight is used to determine the proportion of traffic that should be - sent to this model when multiple target models are specified. - - Weight defines the proportion of requests forwarded to the specified - model. This is computed as weight/(sum of all weights in this - TargetModels list). For non-zero values, there may be some epsilon from - the exact proportion defined here depending on the precision an - implementation supports. Weight is not a percentage and the sum of - weights does not need to equal 100. - - If a weight is set for any targetModel, it must be set for all targetModels. - Conversely weights are optional, so long as ALL targetModels do not specify a weight. - format: int32 - maximum: 1000000 - minimum: 0 - type: integer - required: - - name - type: object - maxItems: 10 - type: array - x-kubernetes-validations: - - message: Weights should be set for all models, or none of the models. - rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight)) - required: - - modelName - - poolRef - type: object - status: - description: InferenceModelStatus defines the observed state of InferenceModel - properties: - conditions: - default: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Ready - description: |- - Conditions track the state of the InferenceModel. - - Known condition types are: - - * "Accepted" - items: - description: Condition contains details for one aspect of the current - state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. 
If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. 
- maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - type: object - type: object - served: true - storage: true - subresources: - status: {} ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: inferencepools.inference.networking.x-k8s.io -spec: - group: inference.networking.x-k8s.io - names: - kind: InferencePool - listKind: InferencePoolList - plural: inferencepools - singular: inferencepool - scope: Namespaced - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: InferencePool is the Schema for the InferencePools API. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: InferencePoolSpec defines the desired state of InferencePool - properties: - extensionRef: - description: Extension configures an endpoint picker as an extension - service. - properties: - failureMode: - default: FailClose - description: |- - Configures how the gateway handles the case when the extension is not responsive. - Defaults to failClose. 
- enum: - - FailOpen - - FailClose - type: string - group: - default: "" - description: |- - Group is the group of the referent. - When unspecified or empty string, core API group is inferred. - type: string - kind: - default: Service - description: |- - Kind is the Kubernetes resource kind of the referent. For example - "Service". - - Defaults to "Service" when not specified. - - ExternalName services can refer to CNAME DNS records that may live - outside of the cluster and as such are difficult to reason about in - terms of conformance. They also may not be safe to forward to (see - CVE-2021-25740 for more information). Implementations MUST NOT - support ExternalName Services. - type: string - name: - description: Name is the name of the referent. - type: string - targetPortNumber: - description: |- - The port number on the pods running the extension. When unspecified, implementations SHOULD infer a - default value of 9002 when the Kind is Service. - format: int32 - maximum: 65535 - minimum: 1 - type: integer - required: - - name - type: object - selector: - additionalProperties: - description: |- - LabelValue is the value of a label. This is used for validation - of maps. This matches the Kubernetes label validation rules: - * must be 63 characters or less (can be empty), - * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), - * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. - - Valid values include: - - * MyValue - * my.name - * 123-my-value - maxLength: 63 - minLength: 0 - pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ - type: string - description: |- - Selector defines a map of labels to watch model server pods - that should be included in the InferencePool. - In some cases, implementations may translate this field to a Service selector, so this matches the simple - map used for Service selectors instead of the full Kubernetes LabelSelector type. 
- type: object - targetPortNumber: - description: |- - TargetPortNumber defines the port number to access the selected model servers. - The number must be in the range 1 to 65535. - format: int32 - maximum: 65535 - minimum: 1 - type: integer - required: - - extensionRef - - selector - - targetPortNumber - type: object - status: - description: InferencePoolStatus defines the observed state of InferencePool - properties: - conditions: - default: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Ready - description: |- - Conditions track the state of the InferencePool. - - Known condition types are: - - * "Ready" - items: - description: Condition contains details for one aspect of the current - state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. 
- The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - type: object - type: object - served: true - storage: false - subresources: - status: {} - - name: v1alpha2 - schema: - openAPIV3Schema: - description: InferencePool is the Schema for the InferencePools API. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: InferencePoolSpec defines the desired state of InferencePool - properties: - extensionRef: - description: Extension configures an endpoint picker as an extension - service. 
- properties: - failureMode: - default: FailClose - description: |- - Configures how the gateway handles the case when the extension is not responsive. - Defaults to failClose. - enum: - - FailOpen - - FailClose - type: string - group: - default: "" - description: |- - Group is the group of the referent. - When unspecified or empty string, core API group is inferred. - type: string - kind: - default: Service - description: |- - Kind is the Kubernetes resource kind of the referent. For example - "Service". - - Defaults to "Service" when not specified. - - ExternalName services can refer to CNAME DNS records that may live - outside of the cluster and as such are difficult to reason about in - terms of conformance. They also may not be safe to forward to (see - CVE-2021-25740 for more information). Implementations MUST NOT - support ExternalName Services. - type: string - name: - description: Name is the name of the referent. - type: string - targetPortNumber: - description: |- - The port number on the service running the extension. When unspecified, implementations SHOULD infer a - default value of 9002 when the Kind is Service. - format: int32 - maximum: 65535 - minimum: 1 - type: integer - required: - - name - type: object - selector: - additionalProperties: - description: |- - LabelValue is the value of a label. This is used for validation - of maps. This matches the Kubernetes label validation rules: - * must be 63 characters or less (can be empty), - * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), - * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. - - Valid values include: - - * MyValue - * my.name - * 123-my-value - maxLength: 63 - minLength: 0 - pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ - type: string - description: |- - Selector defines a map of labels to watch model server pods - that should be included in the InferencePool. 
- In some cases, implementations may translate this field to a Service selector, so this matches the simple - map used for Service selectors instead of the full Kubernetes LabelSelector type. - type: object - targetPortNumber: - description: |- - TargetPortNumber defines the port number to access the selected model servers. - The number must be in the range 1 to 65535. - format: int32 - maximum: 65535 - minimum: 1 - type: integer - required: - - extensionRef - - selector - - targetPortNumber - type: object - status: - description: InferencePoolStatus defines the observed state of InferencePool - properties: - parent: - description: |- - Parents is a list of parent resources (usually Gateways) that are - associated with the route, and the status of the InferencePool with respect to - each parent. - - A maximum of 32 Gateways will be represented in this list. An empty list - means the route has not been attached to any Gateway. - items: - description: PoolStatus defines the observed state of InferencePool - from a gateway. - properties: - conditions: - default: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Ready - description: |- - Conditions track the state of the InferencePool. - - Known condition types are: - - * "Ready" - items: - description: Condition contains details for one aspect of - the current state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. 
- maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, - Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - parentRef: - description: GatewayRef indicates the gateway that observed - state of InferencePool. - properties: - apiVersion: - description: API version of the referent. - type: string - fieldPath: - description: |- - If referring to a piece of an object instead of an entire object, this string - should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2]. 
- For example, if the object reference is to a container within a pod, this would take on a value like: - "spec.containers{name}" (where "name" refers to the name of the container that triggered - the event) or if no container name is specified "spec.containers[2]" (container with - index 2 in this pod). This syntax is chosen only to have some well-defined way of - referencing a part of an object. - type: string - kind: - description: |- - Kind of the referent. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - name: - description: |- - Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - type: string - namespace: - description: |- - Namespace of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ - type: string - resourceVersion: - description: |- - Specific resourceVersion to which this reference is made, if any. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency - type: string - uid: - description: |- - UID of the referent. 
- More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids - type: string - type: object - x-kubernetes-map-type: atomic - required: - - parentRef - type: object - maxItems: 32 - type: array - type: object - type: object - served: true - storage: true - subresources: - status: {} \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/generated.yaml b/config/manifests/gateway-api-inference-extension/generated.yaml deleted file mode 100644 index 3a1980294..000000000 --- a/config/manifests/gateway-api-inference-extension/generated.yaml +++ /dev/null @@ -1,300 +0,0 @@ ---- -# Source: gateway-api-inference-extension/templates/rbac.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: inference-gateway-ext-proc-release-name - namespace: default - labels: - app: inference-gateway-ext-proc-release-name ---- -# Source: gateway-api-inference-extension/templates/enable_patch_policy.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: envoy-gateway-config - namespace: envoy-gateway-system -data: - envoy-gateway.yaml: | - apiVersion: gateway.envoyproxy.io/v1alpha1 - kind: EnvoyGateway - provider: - type: Kubernetes - gateway: - controllerName: gateway.envoyproxy.io/gatewayclass-controller - extensionApis: - enableEnvoyPatchPolicy: true - enableBackend: true ---- -# Source: gateway-api-inference-extension/templates/rbac.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: inference-extension-default-release-name -rules: -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencemodels"] - verbs: ["get", "watch", "list"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["get", "watch", "list"] -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencepools"] - verbs: ["get", "watch", "list"] -- apiGroups: ["discovery.k8s.io"] - resources: ["endpointslices"] - verbs: ["get", "watch", "list"] -- apiGroups: - - authentication.k8s.io - resources: 
- - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -# Source: gateway-api-inference-extension/templates/rbac.yaml -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: inference-extension-default-release-name -subjects: -- kind: ServiceAccount - name: inference-gateway-ext-proc-release-name - namespace: default -roleRef: - kind: ClusterRole - name: inference-extension-default-release-name ---- -# Source: gateway-api-inference-extension/templates/ext_proc.yaml -apiVersion: v1 -kind: Service -metadata: - name: inference-gateway-ext-proc-release-name - namespace: default -spec: - selector: - app: inference-gateway-ext-proc-release-name - ports: - - name: grpc - protocol: TCP - port: 9002 - targetPort: 9002 - - name: http-metrics - protocol: TCP - port: 9090 - targetPort: 9090 - type: ClusterIP ---- -# Source: gateway-api-inference-extension/templates/ext_proc.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: inference-gateway-ext-proc-release-name - namespace: default - labels: - app: inference-gateway-ext-proc-release-name -spec: - replicas: 1 - selector: - matchLabels: - app: inference-gateway-ext-proc-release-name - template: - metadata: - labels: - app: inference-gateway-ext-proc-release-name - spec: - serviceAccountName: inference-gateway-ext-proc-release-name - containers: - - name: inference-gateway-ext-proc - image: registry-cn-hangzhou.ack.aliyuncs.com/dev/gateway-api-inference-extension/epp:main - imagePullPolicy: Always - args: - - -poolName - - vllm-llama2-7b-pool - - -poolNamespace - - default - - -v - - "3" - - -grpcPort - - "9002" - - -grpcHealthPort - - "9003" - - -metricsPort - - "9090" - ports: - - name: grpc - containerPort: 9002 - - name: grpc-health - containerPort: 9003 - - name: metrics - containerPort: 9090 - livenessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - 
periodSeconds: 10 - readinessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 ---- -# Source: gateway-api-inference-extension/templates/gateway.yaml -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: Backend -metadata: - name: backend-release-name -spec: - endpoints: - - fqdn: - hostname: 'foo.bar.com' - port: 8080 ---- -# Source: gateway-api-inference-extension/templates/traffic_policy.yaml -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: BackendTrafficPolicy -metadata: - name: high-connection-route-policy-release-name # 确保引用有 . 前缀 - namespace: -spec: - targetRefs: - - group: gateway.networking.k8s.io - kind: HTTPRoute - name: llm-route-release-name - circuitBreaker: - maxConnections: 40000 - maxPendingRequests: 40000 - maxParallelRequests: 40000 - timeout: - tcp: - connectTimeout: 24h ---- -# Source: gateway-api-inference-extension/templates/extension_policy.yaml -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyExtensionPolicy -metadata: - name: ext-proc-policy-release-name - namespace: default -spec: - extProc: - - backendRefs: - - group: "" - kind: Service - name: inference-gateway-ext-proc-release-name - port: 9002 - processingMode: - request: - body: Buffered - response: - messageTimeout: 1000s - backendSettings: - circuitBreaker: - maxConnections: 40000 - maxPendingRequests: 40000 - maxParallelRequests: 40000 - timeout: - tcp: - connectTimeout: 24h - targetRef: - group: gateway.networking.k8s.io - kind: HTTPRoute - name: llm-route-release-name ---- -# Source: gateway-api-inference-extension/templates/patch_policy.yaml -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyPatchPolicy -metadata: - name: custom-response-patch-policy-release-name - namespace: default -spec: - targetRef: - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway-release-name - type: JSONPatch - jsonPatches: - - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" - name: 
original_destination_cluster - operation: - op: add - path: "" - value: - name: original_destination_cluster - type: ORIGINAL_DST - original_dst_lb_config: - use_http_header: true - http_header_name: "x-gateway-destination-endpoint" - connect_timeout: 1000s - lb_policy: CLUSTER_PROVIDED - dns_lookup_family: V4_ONLY - circuit_breakers: - thresholds: - - max_connections: 40000 - max_pending_requests: 40000 - max_requests: 40000 - - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" - name: "envoyextensionpolicy/default/ext-proc-policy-release-name/extproc/0" - operation: - op: add - path: "/transport_socket" - value: - name: "envoy.transport_sockets.tls" - typed_config: - "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext" - common_tls_context: {} - - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" - name: default/inference-gateway-release-name/llm-gw - operation: - op: replace - path: "/virtual_hosts/0/routes/0/route/cluster" - value: original_destination_cluster ---- -# Source: gateway-api-inference-extension/templates/gateway.yaml -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: inference-gateway-release-name - namespace: default -spec: - gatewayClassName: inference-gateway-release-name - listeners: - - name: http - protocol: HTTP - port: 8080 - - name: llm-gw - protocol: HTTP - port: 8081 ---- -# Source: gateway-api-inference-extension/templates/gateway.yaml -apiVersion: gateway.networking.k8s.io/v1 -kind: GatewayClass -metadata: - name: inference-gateway-release-name -spec: - controllerName: gateway.envoyproxy.io/gatewayclass-controller ---- -# Source: gateway-api-inference-extension/templates/gateway.yaml -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: llm-route-release-name - namespace: default -spec: - parentRefs: - - name: inference-gateway-release-name - sectionName: llm-gw - rules: - - backendRefs: - - group: gateway.envoyproxy.io 
- kind: Backend - name: backend-release-name - timeouts: - request: "24h" - backendRequest: "24h" diff --git a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl index 7294f7f99..c1e40133f 100644 --- a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl +++ b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl @@ -1,42 +1,16 @@ -{{- define "httpRoute.name" -}} -llm-route-{{ .Release.Name }} -{{- end -}} - -{{- define "backend.name" -}} -backend-{{ .Release.Name }} -{{- end -}} - -{{- define "gatewayClass.name" -}} -inference-gateway-{{ .Release.Name }} -{{- end -}} - -{{- define "gateway.name" -}} -inference-gateway-{{ .Release.Name }} -{{- end -}} - -{{- define "envoyExtensionPolicy.name" -}} -ext-proc-policy-{{ .Release.Name }} -{{- end -}} - -{{- define "envoyPatchPolicy.name" -}} -custom-response-patch-policy-{{ .Release.Name }} -{{- end -}} +{{/* +Common labels +*/}} +{{- define "gateway-api-inference-extension.labels" -}} +app.kubernetes.io/name: {{ .Values.inferenceExtension.name }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +{{- end }} {{/* Selector labels */}} {{- define "gateway-api-inference-extension.selectorLabels" -}} -app: {{ include "gateway-api-inference-extension.name" . 
}} -{{- end -}} - -{{- define "clusterRole.name" -}} -inference-extension-{{ .Release.Namespace }}-{{ .Release.Name }} -{{- end -}} - -{{- define "backendTrafficPolicy.name" -}} -high-connection-route-policy-{{ .Release.Name }} -{{- end -}} - -{{- define "gateway-api-inference-extension.name" -}} -inference-gateway-ext-proc-{{ .Release.Name }} -{{- end -}} +app: {{ .Values.inferenceExtension.name }} +{{- end -}} \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml deleted file mode 100644 index 21b0aa866..000000000 --- a/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml +++ /dev/null @@ -1,18 +0,0 @@ -{{ if .Values.envoy.enablePatchPolicy }} -apiVersion: v1 -kind: ConfigMap -metadata: - name: envoy-gateway-config - namespace: {{ .Values.envoy.namespace | default "envoy-gateway-system" }} -data: - envoy-gateway.yaml: | - apiVersion: gateway.envoyproxy.io/v1alpha1 - kind: EnvoyGateway - provider: - type: Kubernetes - gateway: - controllerName: gateway.envoyproxy.io/gatewayclass-controller - extensionApis: - enableEnvoyPatchPolicy: true - enableBackend: true -{{ end }} \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml index bd53c9334..0bfde2db4 100644 --- a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml +++ b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml @@ -1,10 +1,10 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ include "gateway-api-inference-extension.name" . }} + name: inference-gateway-ext-proc namespace: {{ .Release.Namespace }} labels: - app: {{ include "gateway-api-inference-extension.name" . }} + {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} spec: replicas: {{ .Values.inferenceExtension.replicas | default 1 }} selector: @@ -15,31 +15,31 @@ spec: labels: {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }} spec: - serviceAccountName: {{ include "gateway-api-inference-extension.name" . }} + serviceAccountName: inference-gateway-ext-proc containers: - name: inference-gateway-ext-proc - image: {{ .Values.inferenceExtension.image.hub }}:{{ .Values.inferenceExtension.image.tag }} + image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }} imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }} args: - -poolName - {{ .Values.inferencePool.name }} - -poolNamespace - - {{ .Release.Namespace }} + - {{ .Values.inferencePool.namespace }} - -v - - {{ .Values.inferenceExtension.logLevel | default 3 | quote }} + - "3" - -grpcPort - - {{ .Values.inferenceExtension.grpcPort | default 9002 | quote }} + - "9002" - -grpcHealthPort - "9003" - -metricsPort - - {{ .Values.inferenceExtension.metricsPort | default 9090 | quote }} + - "9090" ports: - name: grpc - containerPort: {{ .Values.inferenceExtension.grpcPort | default 9002 }} + containerPort: 9002 - name: grpc-health containerPort: 9003 - name: metrics - containerPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + containerPort: 9090 livenessProbe: grpc: port: 9003 @@ -56,8 +56,10 @@ spec: apiVersion: v1 kind: Service metadata: - name: {{ include "gateway-api-inference-extension.name" . }} + name: {{ .Values.inferenceExtension.name }} namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} spec: selector: {{- include "gateway-api-inference-extension.selectorLabels" . 
| nindent 4 }} diff --git a/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml deleted file mode 100644 index ed84e6f5c..000000000 --- a/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyExtensionPolicy -metadata: - name: {{ include "envoyExtensionPolicy.name" . }} - namespace: {{ .Release.Namespace }} -spec: - extProc: - - backendRefs: - - group: "" - kind: Service - name: {{ include "gateway-api-inference-extension.name" . }} - port: {{ .Values.inferenceExtension.port | default 9002 }} - processingMode: - request: - body: Buffered - response: - messageTimeout: 1000s - backendSettings: - circuitBreaker: - maxConnections: 40000 - maxPendingRequests: 40000 - maxParallelRequests: 40000 - timeout: - tcp: - connectTimeout: 24h - targetRef: - group: gateway.networking.k8s.io - kind: HTTPRoute - name: {{ include "httpRoute.name" . }} diff --git a/config/manifests/gateway-api-inference-extension/templates/gateway.yaml b/config/manifests/gateway-api-inference-extension/templates/gateway.yaml deleted file mode 100644 index f0259f527..000000000 --- a/config/manifests/gateway-api-inference-extension/templates/gateway.yaml +++ /dev/null @@ -1,51 +0,0 @@ - ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: {{ include "gateway.name" . }} - namespace: {{ .Release.Namespace }} -spec: - gatewayClassName: {{ include "gatewayClass.name" . }} - listeners: - - name: http - protocol: HTTP - port: 8080 - - name: llm-gw - protocol: HTTP - port: {{ .Values.gateway.port }} ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: GatewayClass -metadata: - name: {{ include "gatewayClass.name" . 
}} -spec: - controllerName: gateway.envoyproxy.io/gatewayclass-controller ---- -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: Backend -metadata: - name: {{ include "backend.name" . }} -spec: - endpoints: - - fqdn: - hostname: 'foo.bar.com' - port: 8080 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: {{ include "httpRoute.name" . }} - namespace: {{ .Release.Namespace }} -spec: - parentRefs: - - name: {{ include "gateway.name" . }} - sectionName: llm-gw - rules: - - backendRefs: - - group: gateway.envoyproxy.io - kind: Backend - name: {{ include "backend.name" . }} - timeouts: - request: "24h" - backendRequest: "24h" diff --git a/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml deleted file mode 100644 index e789b0e2f..000000000 --- a/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml +++ /dev/null @@ -1,47 +0,0 @@ -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyPatchPolicy -metadata: - name: {{ include "envoyPatchPolicy.name" . }} - namespace: {{ .Release.Namespace }} -spec: - targetRef: - group: gateway.networking.k8s.io - kind: Gateway - name: {{ include "gateway.name" . }} - type: JSONPatch - jsonPatches: - - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" - name: original_destination_cluster - operation: - op: add - path: "" - value: - name: original_destination_cluster - type: ORIGINAL_DST - original_dst_lb_config: - use_http_header: true - http_header_name: "x-gateway-destination-endpoint" - connect_timeout: 1000s - lb_policy: CLUSTER_PROVIDED - dns_lookup_family: V4_ONLY - circuit_breakers: - thresholds: - - max_connections: 40000 - max_pending_requests: 40000 - max_requests: 40000 - - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" - name: "envoyextensionpolicy/{{ .Release.Namespace }}/{{ include "envoyExtensionPolicy.name" . 
}}/extproc/0" - operation: - op: add - path: "/transport_socket" - value: - name: "envoy.transport_sockets.tls" - typed_config: - "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext" - common_tls_context: {} - - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" - name: {{ .Release.Namespace }}/{{ include "gateway.name" . }}/llm-gw - operation: - op: replace - path: "/virtual_hosts/0/routes/0/route/cluster" - value: original_destination_cluster diff --git a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml index 73ff0aa6c..b2c21f674 100644 --- a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml +++ b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml @@ -1,7 +1,9 @@ kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: {{ include "clusterRole.name" . }} + name: {{ .Values.inferenceExtension.name }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} rules: - apiGroups: ["inference.networking.x-k8s.io"] resources: ["inferencemodels"] @@ -31,19 +33,19 @@ rules: kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: {{ include "clusterRole.name" . }} + name: {{ .Values.inferenceExtension.name }} subjects: - kind: ServiceAccount - name: {{ include "gateway-api-inference-extension.name" . }} + name: {{ .Values.inferenceExtension.name }} namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole - name: {{ include "clusterRole.name" . }} + name: {{ .Values.inferenceExtension.name }} --- apiVersion: v1 kind: ServiceAccount metadata: - name: {{ include "gateway-api-inference-extension.name" . }} + name: {{ .Values.inferenceExtension.name }} namespace: {{ .Release.Namespace }} labels: - app: {{ include "gateway-api-inference-extension.name" . 
}} \ No newline at end of file + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml deleted file mode 100644 index 92ba989c3..000000000 --- a/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: BackendTrafficPolicy -metadata: - name: {{ include "backendTrafficPolicy.name" . }} - namespace: {{ .Release.namespace }} -spec: - targetRefs: - - group: gateway.networking.k8s.io - kind: HTTPRoute - name: {{ include "httpRoute.name" . }} - circuitBreaker: - maxConnections: 40000 - maxPendingRequests: 40000 - maxParallelRequests: 40000 - timeout: - tcp: - connectTimeout: 24h \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/values.yaml b/config/manifests/gateway-api-inference-extension/values.yaml index cbda8e573..6a5137485 100644 --- a/config/manifests/gateway-api-inference-extension/values.yaml +++ b/config/manifests/gateway-api-inference-extension/values.yaml @@ -1,25 +1,13 @@ inferenceExtension: replicas: 1 image: - hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp + name: epp + hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension tag: main pullPolicy: Always name: inference-gateway-ext-proc - serviceName: inference-gateway-ext-proc - grpcPort: 9002 - metricsPort: 9090 - logLevel: 3 inferencePool: - name: vllm-llama2-7b-pool - -gateway: - port: 8081 - -envoy: - # envoy gateway system namespace - namespace: envoy-gateway-system - - # enabling the Envoy Patch Policy feature - enablePatchPolicy: true + namespace: default + name: vllm-llama2-7b-pool \ No newline at end of file diff --git a/config/manifests/install.yaml 
b/config/manifests/install.yaml new file mode 100644 index 000000000..976075560 --- /dev/null +++ b/config/manifests/install.yaml @@ -0,0 +1,137 @@ +--- +# Source: gateway-api-inference-extension/templates/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: inference-gateway-ext-proc + namespace: default + labels: + app.kubernetes.io/name: inference-gateway-ext-proc + app.kubernetes.io/version: "0.1.0" +--- +# Source: gateway-api-inference-extension/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: inference-gateway-ext-proc + labels: + app.kubernetes.io/name: inference-gateway-ext-proc + app.kubernetes.io/version: "0.1.0" +rules: +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencemodels"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools"] + verbs: ["get", "watch", "list"] +- apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["get", "watch", "list"] +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +# Source: gateway-api-inference-extension/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: inference-gateway-ext-proc +subjects: +- kind: ServiceAccount + name: inference-gateway-ext-proc + namespace: default +roleRef: + kind: ClusterRole + name: inference-gateway-ext-proc +--- +# Source: gateway-api-inference-extension/templates/ext_proc.yaml +apiVersion: v1 +kind: Service +metadata: + name: inference-gateway-ext-proc + namespace: default + labels: + app.kubernetes.io/name: inference-gateway-ext-proc + app.kubernetes.io/version: "0.1.0" +spec: + selector: + app: inference-gateway-ext-proc + ports: + - name: grpc + protocol: TCP + 
port: 9002 + targetPort: 9002 + - name: http-metrics + protocol: TCP + port: 9090 + targetPort: 9090 + type: ClusterIP +--- +# Source: gateway-api-inference-extension/templates/ext_proc.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference-gateway-ext-proc + namespace: default + labels: + app.kubernetes.io/name: inference-gateway-ext-proc + app.kubernetes.io/version: "0.1.0" +spec: + replicas: 1 + selector: + matchLabels: + app: inference-gateway-ext-proc + template: + metadata: + labels: + app: inference-gateway-ext-proc + spec: + serviceAccountName: inference-gateway-ext-proc + containers: + - name: inference-gateway-ext-proc + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main + imagePullPolicy: Always + args: + - -poolName + - vllm-llama2-7b-pool + - -poolNamespace + - default + - -v + - "3" + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + - -metricsPort + - "9090" + ports: + - name: grpc + containerPort: 9002 + - name: grpc-health + containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 From dcd3bd56d14a3f9afe810c82cc450b9bdb132971 Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Tue, 18 Mar 2025 08:50:38 +0800 Subject: [PATCH 3/9] nit and add inference pool Signed-off-by: Kuromesi --- .../Chart.yaml | 6 +++--- .../templates/_helpers.tpl | 14 ++++++++++--- .../templates/ext_proc.yaml | 9 ++++----- .../templates/inferencepool.yaml | 12 +++++++++++ .../templates/rbac.yaml | 20 +++++++------------ .../values.yaml | 8 +++++--- 6 files changed, 42 insertions(+), 27 deletions(-) create mode 100644 config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml diff --git a/config/manifests/gateway-api-inference-extension/Chart.yaml 
b/config/manifests/gateway-api-inference-extension/Chart.yaml index dd194a652..5e46737ca 100644 --- a/config/manifests/gateway-api-inference-extension/Chart.yaml +++ b/config/manifests/gateway-api-inference-extension/Chart.yaml @@ -1,9 +1,9 @@ apiVersion: v2 -name: gateway-api-inference-extension -description: A Helm chart for gateway-api-inference-extension +name: InferencePool +description: A Helm chart for InferencePool type: application version: 0.1.0 -appVersion: "0.1.0" +appVersion: "0.2.0" diff --git a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl index c1e40133f..4068e7ea6 100644 --- a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl +++ b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl @@ -2,15 +2,23 @@ Common labels */}} {{- define "gateway-api-inference-extension.labels" -}} -app.kubernetes.io/name: {{ .Values.inferenceExtension.name }} +app.kubernetes.io/name: epp-{{ .Values.inferencePool.name }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} {{- end }} +{{/* +Inference extension name +*/}} +{{- define "gateway-api-inference-extension.name" -}} +{{- $base := .Values.inferencePool.name | default "default-pool" | lower | trim | trunc 40 -}} +epp-{{ $base }} +{{- end -}} + {{/* Selector labels */}} {{- define "gateway-api-inference-extension.selectorLabels" -}} -app: {{ .Values.inferenceExtension.name }} -{{- end -}} \ No newline at end of file +app: epp-{{ .Values.inferencePool.name }} +{{- end -}} diff --git a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml index 0bfde2db4..a80bcbdd9 100644 --- a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml +++ b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml @@ -1,7 +1,7 @@ 
apiVersion: apps/v1 kind: Deployment metadata: - name: inference-gateway-ext-proc + name: {{ include "gateway-api-inference-extension.name" . }} namespace: {{ .Release.Namespace }} labels: {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} @@ -56,7 +56,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: {{ .Values.inferenceExtension.name }} + name: {{ include "gateway-api-inference-extension.name" . }} namespace: {{ .Release.Namespace }} labels: {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} @@ -64,10 +64,9 @@ spec: selector: {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }} ports: - - name: grpc + - name: grpc-ext-proc protocol: TCP - port: {{ .Values.inferenceExtension.grpcPort | default 9002 }} - targetPort: {{ .Values.inferenceExtension.grpcPort | default 9002 }} + port: {{ .Values.inferenceExtension.extProcPort | default 9002 }} - name: http-metrics protocol: TCP port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} diff --git a/config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml b/config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml new file mode 100644 index 000000000..8662c9f86 --- /dev/null +++ b/config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml @@ -0,0 +1,12 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: {{ .Values.inferencePool.name }} +spec: + targetPortNumber: 8000 + selector: + {{- range $key, $value := .Values.inferencePool.selector }} + {{ $key }}: {{ quote $value }} + {{- end }} + extensionRef: + name: {{ include "gateway-api-inference-extension.name" . 
}} \ No newline at end of file diff --git a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml index b2c21f674..7a98e8206 100644 --- a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml +++ b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml @@ -1,22 +1,16 @@ kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: {{ .Values.inferenceExtension.name }} + name: {{ include "gateway-api-inference-extension.name" . }} labels: {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} rules: - apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencemodels"] + resources: ["inferencemodels, inferencepools"] verbs: ["get", "watch", "list"] - apiGroups: [""] resources: ["pods"] verbs: ["get", "watch", "list"] -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencepools"] - verbs: ["get", "watch", "list"] -- apiGroups: ["discovery.k8s.io"] - resources: ["endpointslices"] - verbs: ["get", "watch", "list"] - apiGroups: - authentication.k8s.io resources: @@ -33,19 +27,19 @@ rules: kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: {{ .Values.inferenceExtension.name }} + name: {{ include "gateway-api-inference-extension.name" . }} subjects: - kind: ServiceAccount - name: {{ .Values.inferenceExtension.name }} + name: {{ include "gateway-api-inference-extension.name" . }} namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole - name: {{ .Values.inferenceExtension.name }} + name: {{ include "gateway-api-inference-extension.name" . }} --- apiVersion: v1 kind: ServiceAccount metadata: - name: {{ .Values.inferenceExtension.name }} + name: {{ include "gateway-api-inference-extension.name" . }} namespace: {{ .Release.Namespace }} labels: - {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} \ No newline at end of file + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} diff --git a/config/manifests/gateway-api-inference-extension/values.yaml b/config/manifests/gateway-api-inference-extension/values.yaml index 6a5137485..0f20a3e66 100644 --- a/config/manifests/gateway-api-inference-extension/values.yaml +++ b/config/manifests/gateway-api-inference-extension/values.yaml @@ -5,9 +5,11 @@ inferenceExtension: hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension tag: main pullPolicy: Always - - name: inference-gateway-ext-proc + extProcPort: 9002 inferencePool: namespace: default - name: vllm-llama2-7b-pool \ No newline at end of file + name: pool-1 + targetPortNumber: 8000 + selector: + app: vllm-llama2-7b \ No newline at end of file From 154f67068bbb3600854ccd3ebd545129b78fae3c Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Tue, 18 Mar 2025 09:00:58 +0800 Subject: [PATCH 4/9] relocate Signed-off-by: Kuromesi --- .../.helmignore | 0 .../Chart.yaml | 0 .../generated.yaml | 145 ++++++++++++++++++ .../templates/NOTES.txt | 0 .../templates/_helpers.tpl | 0 .../templates/ext_proc.yaml | 7 +- .../templates/inferencepool.yaml | 5 +- .../templates/rbac.yaml | 0 .../values.yaml | 1 - 9 files changed, 152 insertions(+), 6 deletions(-) rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/.helmignore (100%) rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/Chart.yaml (100%) create mode 100644 config/charts/inferencepool/gateway-api-inference-extension/generated.yaml rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/templates/NOTES.txt (100%) rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/templates/_helpers.tpl (100%) rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/templates/ext_proc.yaml (90%) rename config/{manifests => 
charts/inferencepool}/gateway-api-inference-extension/templates/inferencepool.yaml (65%) rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/templates/rbac.yaml (100%) rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/values.yaml (93%) diff --git a/config/manifests/gateway-api-inference-extension/.helmignore b/config/charts/inferencepool/gateway-api-inference-extension/.helmignore similarity index 100% rename from config/manifests/gateway-api-inference-extension/.helmignore rename to config/charts/inferencepool/gateway-api-inference-extension/.helmignore diff --git a/config/manifests/gateway-api-inference-extension/Chart.yaml b/config/charts/inferencepool/gateway-api-inference-extension/Chart.yaml similarity index 100% rename from config/manifests/gateway-api-inference-extension/Chart.yaml rename to config/charts/inferencepool/gateway-api-inference-extension/Chart.yaml diff --git a/config/charts/inferencepool/gateway-api-inference-extension/generated.yaml b/config/charts/inferencepool/gateway-api-inference-extension/generated.yaml new file mode 100644 index 000000000..16b3bf4ef --- /dev/null +++ b/config/charts/inferencepool/gateway-api-inference-extension/generated.yaml @@ -0,0 +1,145 @@ +--- +# Source: InferencePool/templates/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: epp-pool-1 + namespace: default + labels: + app.kubernetes.io/name: epp-pool-1 + app.kubernetes.io/version: "0.2.0" +--- +# Source: InferencePool/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: epp-pool-1 + labels: + app.kubernetes.io/name: epp-pool-1 + app.kubernetes.io/version: "0.2.0" +rules: +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencemodels, inferencepools"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +- apiGroups: + - authentication.k8s.io + resources: + - 
tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +# Source: InferencePool/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: epp-pool-1 +subjects: +- kind: ServiceAccount + name: epp-pool-1 + namespace: default +roleRef: + kind: ClusterRole + name: epp-pool-1 +--- +# Source: InferencePool/templates/ext_proc.yaml +apiVersion: v1 +kind: Service +metadata: + name: epp-pool-1 + namespace: default + labels: + app.kubernetes.io/name: epp-pool-1 + app.kubernetes.io/version: "0.2.0" +spec: + selector: + app: epp-pool-1 + ports: + - name: grpc-ext-proc + protocol: TCP + port: 9002 + - name: http-metrics + protocol: TCP + port: 9090 + type: ClusterIP +--- +# Source: InferencePool/templates/ext_proc.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: epp-pool-1 + namespace: default + labels: + app.kubernetes.io/name: epp-pool-1 + app.kubernetes.io/version: "0.2.0" +spec: + replicas: 1 + selector: + matchLabels: + app: epp-pool-1 + template: + metadata: + labels: + app: epp-pool-1 + spec: + serviceAccountName: epp-pool-1 + containers: + - name: epp + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main + imagePullPolicy: Always + args: + - -poolName + - pool-1 + - -poolNamespace + - default + - -v + - "3" + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + - -metricsPort + - "9090" + ports: + - name: grpc + containerPort: 9002 + - name: grpc-health + containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 +--- +# Source: InferencePool/templates/inferencepool.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + 
name: pool-1 + namespace: default + labels: + app.kubernetes.io/name: epp-pool-1 + app.kubernetes.io/version: "0.2.0" +spec: + targetPortNumber: + selector: + app: "vllm-llama2-7b" + extensionRef: + name: epp-pool-1 diff --git a/config/manifests/gateway-api-inference-extension/templates/NOTES.txt b/config/charts/inferencepool/gateway-api-inference-extension/templates/NOTES.txt similarity index 100% rename from config/manifests/gateway-api-inference-extension/templates/NOTES.txt rename to config/charts/inferencepool/gateway-api-inference-extension/templates/NOTES.txt diff --git a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl b/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl similarity index 100% rename from config/manifests/gateway-api-inference-extension/templates/_helpers.tpl rename to config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl diff --git a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml b/config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml similarity index 90% rename from config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml rename to config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml index a80bcbdd9..cf68ab872 100644 --- a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml +++ b/config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml @@ -15,16 +15,16 @@ spec: labels: {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }} spec: - serviceAccountName: inference-gateway-ext-proc + serviceAccountName: {{ include "gateway-api-inference-extension.name" . 
}} containers: - - name: inference-gateway-ext-proc + - name: epp image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }} imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }} args: - -poolName - {{ .Values.inferencePool.name }} - -poolNamespace - - {{ .Values.inferencePool.namespace }} + - {{ .Release.Namespace }} - -v - "3" - -grpcPort @@ -70,5 +70,4 @@ spec: - name: http-metrics protocol: TCP port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} - targetPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }} type: ClusterIP diff --git a/config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml b/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml similarity index 65% rename from config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml rename to config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml index 8662c9f86..9700711d7 100644 --- a/config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml +++ b/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml @@ -2,8 +2,11 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: name: {{ .Values.inferencePool.name }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} spec: - targetPortNumber: 8000 + targetPortNumber: {{ .Values.inferencePool.targetPort }} selector: {{- range $key, $value := .Values.inferencePool.selector }} {{ $key }}: {{ quote $value }} diff --git a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml b/config/charts/inferencepool/gateway-api-inference-extension/templates/rbac.yaml similarity index 100% rename from config/manifests/gateway-api-inference-extension/templates/rbac.yaml rename to config/charts/inferencepool/gateway-api-inference-extension/templates/rbac.yaml diff --git a/config/manifests/gateway-api-inference-extension/values.yaml b/config/charts/inferencepool/gateway-api-inference-extension/values.yaml similarity index 93% rename from config/manifests/gateway-api-inference-extension/values.yaml rename to config/charts/inferencepool/gateway-api-inference-extension/values.yaml index 0f20a3e66..c4a0fb934 100644 --- a/config/manifests/gateway-api-inference-extension/values.yaml +++ b/config/charts/inferencepool/gateway-api-inference-extension/values.yaml @@ -8,7 +8,6 @@ inferenceExtension: extProcPort: 9002 inferencePool: - namespace: default name: pool-1 targetPortNumber: 8000 selector: From 2490c28a48ff5e52cbce85bed65f57ff56c55e76 Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Tue, 18 Mar 2025 09:13:56 +0800 Subject: [PATCH 5/9] fix Signed-off-by: Kuromesi --- .../templates/_helpers.tpl | 6 +- .../templates/ext_proc.yaml | 73 ----------------- .../templates/inferencepool.yaml | 78 ++++++++++++++++++- .../generated.yaml | 40 +++++----- 4 files changed, 99 insertions(+), 98 deletions(-) delete mode 100644 config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml rename config/{charts/inferencepool/gateway-api-inference-extension => manifests}/generated.yaml (81%) diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl b/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl 
index 4068e7ea6..bb15f9e4e 100644 --- a/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl +++ b/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl @@ -2,7 +2,7 @@ Common labels */}} {{- define "gateway-api-inference-extension.labels" -}} -app.kubernetes.io/name: epp-{{ .Values.inferencePool.name }} +app.kubernetes.io/name: {{ include "gateway-api-inference-extension.name" . }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} @@ -13,12 +13,12 @@ Inference extension name */}} {{- define "gateway-api-inference-extension.name" -}} {{- $base := .Values.inferencePool.name | default "default-pool" | lower | trim | trunc 40 -}} -epp-{{ $base }} +{{ $base }}-epp {{- end -}} {{/* Selector labels */}} {{- define "gateway-api-inference-extension.selectorLabels" -}} -app: epp-{{ .Values.inferencePool.name }} +app: {{ include "gateway-api-inference-extension.name" . }} {{- end -}} diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml b/config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml deleted file mode 100644 index cf68ab872..000000000 --- a/config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml +++ /dev/null @@ -1,73 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "gateway-api-inference-extension.name" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} -spec: - replicas: {{ .Values.inferenceExtension.replicas | default 1 }} - selector: - matchLabels: - {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 6 }} - template: - metadata: - labels: - {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }} - spec: - serviceAccountName: {{ include "gateway-api-inference-extension.name" . 
}} - containers: - - name: epp - image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }} - imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }} - args: - - -poolName - - {{ .Values.inferencePool.name }} - - -poolNamespace - - {{ .Release.Namespace }} - - -v - - "3" - - -grpcPort - - "9002" - - -grpcHealthPort - - "9003" - - -metricsPort - - "9090" - ports: - - name: grpc - containerPort: 9002 - - name: grpc-health - containerPort: 9003 - - name: metrics - containerPort: 9090 - livenessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 - readinessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 ---- -apiVersion: v1 -kind: Service -metadata: - name: {{ include "gateway-api-inference-extension.name" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} -spec: - selector: - {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }} - ports: - - name: grpc-ext-proc - protocol: TCP - port: {{ .Values.inferenceExtension.extProcPort | default 9002 }} - - name: http-metrics - protocol: TCP - port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} - type: ClusterIP diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml b/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml index 9700711d7..8fc974965 100644 --- a/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml +++ b/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml @@ -6,10 +6,84 @@ metadata: labels: {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} spec: - targetPortNumber: {{ .Values.inferencePool.targetPort }} + targetPortNumber: {{ .Values.inferencePool.targetPortNumber }} selector: {{- range $key, $value := .Values.inferencePool.selector }} {{ $key }}: {{ quote $value }} {{- end }} extensionRef: - name: {{ include "gateway-api-inference-extension.name" . }} \ No newline at end of file + name: {{ include "gateway-api-inference-extension.name" . }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "gateway-api-inference-extension.name" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.inferenceExtension.replicas | default 1 }} + selector: + matchLabels: + {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }} + spec: + serviceAccountName: {{ include "gateway-api-inference-extension.name" . }} + containers: + - name: epp + image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }} + imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }} + args: + - -poolName + - {{ .Values.inferencePool.name }} + - -poolNamespace + - {{ .Release.Namespace }} + - -v + - "3" + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + - -metricsPort + - "9090" + ports: + - name: grpc + containerPort: 9002 + - name: grpc-health + containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ include "gateway-api-inference-extension.name" . 
}} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} +spec: + selector: + {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }} + ports: + - name: grpc-ext-proc + protocol: TCP + port: {{ .Values.inferenceExtension.extProcPort | default 9002 }} + - name: http-metrics + protocol: TCP + port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + type: ClusterIP diff --git a/config/charts/inferencepool/gateway-api-inference-extension/generated.yaml b/config/manifests/generated.yaml similarity index 81% rename from config/charts/inferencepool/gateway-api-inference-extension/generated.yaml rename to config/manifests/generated.yaml index 16b3bf4ef..f615e25a1 100644 --- a/config/charts/inferencepool/gateway-api-inference-extension/generated.yaml +++ b/config/manifests/generated.yaml @@ -3,19 +3,19 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: epp-pool-1 + name: pool-1-epp namespace: default labels: - app.kubernetes.io/name: epp-pool-1 + app.kubernetes.io/name: pool-1-epp app.kubernetes.io/version: "0.2.0" --- # Source: InferencePool/templates/rbac.yaml kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: epp-pool-1 + name: pool-1-epp labels: - app.kubernetes.io/name: epp-pool-1 + app.kubernetes.io/name: pool-1-epp app.kubernetes.io/version: "0.2.0" rules: - apiGroups: ["inference.networking.x-k8s.io"] @@ -41,27 +41,27 @@ rules: kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: epp-pool-1 + name: pool-1-epp subjects: - kind: ServiceAccount - name: epp-pool-1 + name: pool-1-epp namespace: default roleRef: kind: ClusterRole - name: epp-pool-1 + name: pool-1-epp --- -# Source: InferencePool/templates/ext_proc.yaml +# Source: InferencePool/templates/inferencepool.yaml apiVersion: v1 kind: Service metadata: - name: epp-pool-1 + name: pool-1-epp namespace: default labels: - app.kubernetes.io/name: epp-pool-1 + 
app.kubernetes.io/name: pool-1-epp app.kubernetes.io/version: "0.2.0" spec: selector: - app: epp-pool-1 + app: pool-1-epp ports: - name: grpc-ext-proc protocol: TCP @@ -71,26 +71,26 @@ spec: port: 9090 type: ClusterIP --- -# Source: InferencePool/templates/ext_proc.yaml +# Source: InferencePool/templates/inferencepool.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: epp-pool-1 + name: pool-1-epp namespace: default labels: - app.kubernetes.io/name: epp-pool-1 + app.kubernetes.io/name: pool-1-epp app.kubernetes.io/version: "0.2.0" spec: replicas: 1 selector: matchLabels: - app: epp-pool-1 + app: pool-1-epp template: metadata: labels: - app: epp-pool-1 + app: pool-1-epp spec: - serviceAccountName: epp-pool-1 + serviceAccountName: pool-1-epp containers: - name: epp image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main @@ -135,11 +135,11 @@ metadata: name: pool-1 namespace: default labels: - app.kubernetes.io/name: epp-pool-1 + app.kubernetes.io/name: pool-1-epp app.kubernetes.io/version: "0.2.0" spec: - targetPortNumber: + targetPortNumber: 8000 selector: app: "vllm-llama2-7b" extensionRef: - name: epp-pool-1 + name: pool-1-epp From 814bec38ef789b8aafb3dd51bdc211014828bde6 Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Tue, 18 Mar 2025 09:24:03 +0800 Subject: [PATCH 6/9] fix --- config/manifests/install.yaml | 137 ---------------------------------- 1 file changed, 137 deletions(-) delete mode 100644 config/manifests/install.yaml diff --git a/config/manifests/install.yaml b/config/manifests/install.yaml deleted file mode 100644 index 976075560..000000000 --- a/config/manifests/install.yaml +++ /dev/null @@ -1,137 +0,0 @@ ---- -# Source: gateway-api-inference-extension/templates/rbac.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: inference-gateway-ext-proc - namespace: default - labels: - app.kubernetes.io/name: inference-gateway-ext-proc - app.kubernetes.io/version: "0.1.0" ---- -# Source: 
gateway-api-inference-extension/templates/rbac.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: inference-gateway-ext-proc - labels: - app.kubernetes.io/name: inference-gateway-ext-proc - app.kubernetes.io/version: "0.1.0" -rules: -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencemodels"] - verbs: ["get", "watch", "list"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["get", "watch", "list"] -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencepools"] - verbs: ["get", "watch", "list"] -- apiGroups: ["discovery.k8s.io"] - resources: ["endpointslices"] - verbs: ["get", "watch", "list"] -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -# Source: gateway-api-inference-extension/templates/rbac.yaml -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: inference-gateway-ext-proc -subjects: -- kind: ServiceAccount - name: inference-gateway-ext-proc - namespace: default -roleRef: - kind: ClusterRole - name: inference-gateway-ext-proc ---- -# Source: gateway-api-inference-extension/templates/ext_proc.yaml -apiVersion: v1 -kind: Service -metadata: - name: inference-gateway-ext-proc - namespace: default - labels: - app.kubernetes.io/name: inference-gateway-ext-proc - app.kubernetes.io/version: "0.1.0" -spec: - selector: - app: inference-gateway-ext-proc - ports: - - name: grpc - protocol: TCP - port: 9002 - targetPort: 9002 - - name: http-metrics - protocol: TCP - port: 9090 - targetPort: 9090 - type: ClusterIP ---- -# Source: gateway-api-inference-extension/templates/ext_proc.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: inference-gateway-ext-proc - namespace: default - labels: - app.kubernetes.io/name: inference-gateway-ext-proc - app.kubernetes.io/version: "0.1.0" -spec: - replicas: 1 - selector: - 
matchLabels: - app: inference-gateway-ext-proc - template: - metadata: - labels: - app: inference-gateway-ext-proc - spec: - serviceAccountName: inference-gateway-ext-proc - containers: - - name: inference-gateway-ext-proc - image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main - imagePullPolicy: Always - args: - - -poolName - - vllm-llama2-7b-pool - - -poolNamespace - - default - - -v - - "3" - - -grpcPort - - "9002" - - -grpcHealthPort - - "9003" - - -metricsPort - - "9090" - ports: - - name: grpc - containerPort: 9002 - - name: grpc-health - containerPort: 9003 - - name: metrics - containerPort: 9090 - livenessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 - readinessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 From 6712198853a55afff23e7e1e17f8c6eb8b63bba3 Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Wed, 19 Mar 2025 08:23:16 +0800 Subject: [PATCH 7/9] add readme Signed-off-by: Kuromesi --- .../.helmignore | 0 .../Chart.yaml | 0 config/charts/inferencepool/README.md | 61 +++++++++++++++++++ .../templates/NOTES.txt | 0 .../templates/_helpers.tpl | 0 .../templates/inferencepool.yaml | 0 .../templates/rbac.yaml | 0 .../values.yaml | 0 8 files changed, 61 insertions(+) rename config/charts/inferencepool/{gateway-api-inference-extension => }/.helmignore (100%) rename config/charts/inferencepool/{gateway-api-inference-extension => }/Chart.yaml (100%) create mode 100644 config/charts/inferencepool/README.md rename config/charts/inferencepool/{gateway-api-inference-extension => }/templates/NOTES.txt (100%) rename config/charts/inferencepool/{gateway-api-inference-extension => }/templates/_helpers.tpl (100%) rename config/charts/inferencepool/{gateway-api-inference-extension => }/templates/inferencepool.yaml (100%) rename config/charts/inferencepool/{gateway-api-inference-extension => }/templates/rbac.yaml (100%) 
rename config/charts/inferencepool/{gateway-api-inference-extension => }/values.yaml (100%) diff --git a/config/charts/inferencepool/gateway-api-inference-extension/.helmignore b/config/charts/inferencepool/.helmignore similarity index 100% rename from config/charts/inferencepool/gateway-api-inference-extension/.helmignore rename to config/charts/inferencepool/.helmignore diff --git a/config/charts/inferencepool/gateway-api-inference-extension/Chart.yaml b/config/charts/inferencepool/Chart.yaml similarity index 100% rename from config/charts/inferencepool/gateway-api-inference-extension/Chart.yaml rename to config/charts/inferencepool/Chart.yaml diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md new file mode 100644 index 000000000..10ed808b0 --- /dev/null +++ b/config/charts/inferencepool/README.md @@ -0,0 +1,61 @@ +# Gateway Api Inference Extension + +A chart to deploy the inference extension and a InferencePool managed by the extension. + +## Install + +Suppose now a vllm service with label `app: vllm-llama2-7b` and served on port `8000` is deployed in `default` namespace in the cluster. + +To deploy the inference extension, you can run the following command: + +```txt +$ helm install my-release . -n default \ + --set inferencePool.targetPortNumber=8000 \ + --set inferencePool.selector.app=vllm-llama2-7b +``` + +Or you can change the `values.yaml` to: + +```yaml +inferencePool: + name: pool-1 + targetPortNumber: 8000 + selector: + app: vllm-llama2-7b +``` + +where `inferencePool.targetPortNumber` is the pod that vllm backends served on and `inferencePool.selector` is the selector to match the vllm backends. And then run: + +```txt +$ helm install my-release . +``` + +## Uninstall + +Run the following command to uninstall the chart: + +```txt +$ helm uninstall my-release +``` + +## Configuration + +The following table list the configurable parameters of the chart. 
+ +| **Parameter Name** | **Description** | +|---------------------------------------------|-------------------------------------------------------------------------------------------------------------------| +| `inferenceExtension.replicas` | Number of replicas for the inference extension service. Defaults to `1`. | +| `inferenceExtension.image.name` | Name of the container image used for the inference extension. | +| `inferenceExtension.image.hub` | Registry URL where the inference extension image is hosted. | +| `inferenceExtension.image.tag` | Image tag of the inference extension. | +| `inferenceExtension.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. | +| `inferenceExtension.extProcPort` | Port where the inference extension service is served for external processing. Defaults to `9002`. | +| `inferencePool.name` | Name for the InferencePool, and inference extension will be named as `${inferencePool.name}-epp`. | +| `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. | +| `inferencePool.selector` | Label selector to match vllm backends managed by the inference pool. | + +## Notes + +This chart will only deploy the inference extension and InferencePool, before install the chart, please make sure that the inference extension CRDs have already been installed in the cluster. And You need to apply traffic policies to route traffic to the inference extension from the gateway after the inference extension is deployed. + +For more details, please refer to the [website](https://gateway-api-inference-extension.sigs.k8s.io/guides/). 
\ No newline at end of file diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/NOTES.txt b/config/charts/inferencepool/templates/NOTES.txt similarity index 100% rename from config/charts/inferencepool/gateway-api-inference-extension/templates/NOTES.txt rename to config/charts/inferencepool/templates/NOTES.txt diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl b/config/charts/inferencepool/templates/_helpers.tpl similarity index 100% rename from config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl rename to config/charts/inferencepool/templates/_helpers.tpl diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml b/config/charts/inferencepool/templates/inferencepool.yaml similarity index 100% rename from config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml rename to config/charts/inferencepool/templates/inferencepool.yaml diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/rbac.yaml b/config/charts/inferencepool/templates/rbac.yaml similarity index 100% rename from config/charts/inferencepool/gateway-api-inference-extension/templates/rbac.yaml rename to config/charts/inferencepool/templates/rbac.yaml diff --git a/config/charts/inferencepool/gateway-api-inference-extension/values.yaml b/config/charts/inferencepool/values.yaml similarity index 100% rename from config/charts/inferencepool/gateway-api-inference-extension/values.yaml rename to config/charts/inferencepool/values.yaml From a885ea9fa3df81ad5e94d6416f755207d4d056da Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Wed, 19 Mar 2025 10:45:38 +0800 Subject: [PATCH 8/9] nit Signed-off-by: Kuromesi --- config/charts/inferencepool/README.md | 42 ++--- .../charts/inferencepool/templates/NOTES.txt | 2 +- config/charts/inferencepool/values.yaml | 2 +- config/manifests/generated.yaml | 145 
------------------ 4 files changed, 15 insertions(+), 176 deletions(-) delete mode 100644 config/manifests/generated.yaml diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md index 10ed808b0..761c9a9dc 100644 --- a/config/charts/inferencepool/README.md +++ b/config/charts/inferencepool/README.md @@ -1,34 +1,20 @@ -# Gateway Api Inference Extension +# InferencePool -A chart to deploy the inference extension and a InferencePool managed by the extension. +A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) deployment. -## Install -Suppose now a vllm service with label `app: vllm-llama2-7b` and served on port `8000` is deployed in `default` namespace in the cluster. +## Install -To deploy the inference extension, you can run the following command: +To install an InferencePool named `pool-1` that selects from endpoints with label `app: vllm-llama2-7b` and listening on port `8000`, you can run the following command: ```txt -$ helm install my-release . -n default \ - --set inferencePool.targetPortNumber=8000 \ - --set inferencePool.selector.app=vllm-llama2-7b -``` - -Or you can change the `values.yaml` to: - -```yaml -inferencePool: - name: pool-1 - targetPortNumber: 8000 - selector: - app: vllm-llama2-7b +$ helm install my-release ./config/charts/inferencepool \ + --set inferencePool.name=pool-1 \ + --set inferencePool.selector.app=vllm-llama2-7b \ + --set inferencePool.targetPortNumber=8000 ``` -where `inferencePool.targetPortNumber` is the pod that vllm backends served on and `inferencePool.selector` is the selector to match the vllm backends. And then run: - -```txt -$ helm install my-release . -``` +where `inferencePool.targetPortNumber` is the port that vllm backends are served on and `inferencePool.selector` is the selector to match the vllm backends. ## Uninstall @@ -44,18 +30,16 @@ The following table list the configurable parameters of the chart. 
| **Parameter Name** | **Description** | |---------------------------------------------|-------------------------------------------------------------------------------------------------------------------| +| `inferencePool.name` | Name for the InferencePool, and inference extension will be named as `${inferencePool.name}-epp`. | +| `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. | +| `inferencePool.selector` | Label selector to match vllm backends managed by the inference pool. | | `inferenceExtension.replicas` | Number of replicas for the inference extension service. Defaults to `1`. | | `inferenceExtension.image.name` | Name of the container image used for the inference extension. | | `inferenceExtension.image.hub` | Registry URL where the inference extension image is hosted. | | `inferenceExtension.image.tag` | Image tag of the inference extension. | | `inferenceExtension.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. | | `inferenceExtension.extProcPort` | Port where the inference extension service is served for external processing. Defaults to `9002`. | -| `inferencePool.name` | Name for the InferencePool, and inference extension will be named as `${inferencePool.name}-epp`. | -| `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. | -| `inferencePool.selector` | Label selector to match vllm backends managed by the inference pool. | ## Notes -This chart will only deploy the inference extension and InferencePool, before install the chart, please make sure that the inference extension CRDs have already been installed in the cluster. And You need to apply traffic policies to route traffic to the inference extension from the gateway after the inference extension is deployed. 
- -For more details, please refer to the [website](https://gateway-api-inference-extension.sigs.k8s.io/guides/). \ No newline at end of file +This chart will only deploy an InferencePool and its corresponding EndpointPicker extension. Before installing the chart, please make sure that the inference extension CRDs are installed in the cluster. For more details, please refer to the [getting started guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/). diff --git a/config/charts/inferencepool/templates/NOTES.txt b/config/charts/inferencepool/templates/NOTES.txt index 5d5ea8794..3d8221659 100644 --- a/config/charts/inferencepool/templates/NOTES.txt +++ b/config/charts/inferencepool/templates/NOTES.txt @@ -1 +1 @@ -Gateway api inference extension deployed. \ No newline at end of file +InferencePool {{ .Values.inferencePool.name }} deployed. diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index c4a0fb934..7d3e868dd 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -11,4 +11,4 @@ inferencePool: name: pool-1 targetPortNumber: 8000 selector: - app: vllm-llama2-7b \ No newline at end of file + app: vllm-llama2-7b diff --git a/config/manifests/generated.yaml b/config/manifests/generated.yaml deleted file mode 100644 index f615e25a1..000000000 --- a/config/manifests/generated.yaml +++ /dev/null @@ -1,145 +0,0 @@ ---- -# Source: InferencePool/templates/rbac.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: pool-1-epp - namespace: default - labels: - app.kubernetes.io/name: pool-1-epp - app.kubernetes.io/version: "0.2.0" ---- -# Source: InferencePool/templates/rbac.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pool-1-epp - labels: - app.kubernetes.io/name: pool-1-epp - app.kubernetes.io/version: "0.2.0" -rules: -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencemodels, inferencepools"] - verbs: 
["get", "watch", "list"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["get", "watch", "list"] -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -# Source: InferencePool/templates/rbac.yaml -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pool-1-epp -subjects: -- kind: ServiceAccount - name: pool-1-epp - namespace: default -roleRef: - kind: ClusterRole - name: pool-1-epp ---- -# Source: InferencePool/templates/inferencepool.yaml -apiVersion: v1 -kind: Service -metadata: - name: pool-1-epp - namespace: default - labels: - app.kubernetes.io/name: pool-1-epp - app.kubernetes.io/version: "0.2.0" -spec: - selector: - app: pool-1-epp - ports: - - name: grpc-ext-proc - protocol: TCP - port: 9002 - - name: http-metrics - protocol: TCP - port: 9090 - type: ClusterIP ---- -# Source: InferencePool/templates/inferencepool.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: pool-1-epp - namespace: default - labels: - app.kubernetes.io/name: pool-1-epp - app.kubernetes.io/version: "0.2.0" -spec: - replicas: 1 - selector: - matchLabels: - app: pool-1-epp - template: - metadata: - labels: - app: pool-1-epp - spec: - serviceAccountName: pool-1-epp - containers: - - name: epp - image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main - imagePullPolicy: Always - args: - - -poolName - - pool-1 - - -poolNamespace - - default - - -v - - "3" - - -grpcPort - - "9002" - - -grpcHealthPort - - "9003" - - -metricsPort - - "9090" - ports: - - name: grpc - containerPort: 9002 - - name: grpc-health - containerPort: 9003 - - name: metrics - containerPort: 9090 - livenessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 - readinessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - 
periodSeconds: 10 ---- -# Source: InferencePool/templates/inferencepool.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferencePool -metadata: - name: pool-1 - namespace: default - labels: - app.kubernetes.io/name: pool-1-epp - app.kubernetes.io/version: "0.2.0" -spec: - targetPortNumber: 8000 - selector: - app: "vllm-llama2-7b" - extensionRef: - name: pool-1-epp From bf51f9a3e1b58827614578d5516709ffe0e97b46 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Tue, 18 Mar 2025 21:09:01 -0700 Subject: [PATCH 9/9] Apply suggestions from code review --- config/charts/inferencepool/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md index 761c9a9dc..ee0481d30 100644 --- a/config/charts/inferencepool/README.md +++ b/config/charts/inferencepool/README.md @@ -8,7 +8,7 @@ A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) depl To install an InferencePool named `pool-1` that selects from endpoints with label `app: vllm-llama2-7b` and listening on port `8000`, you can run the following command: ```txt -$ helm install my-release ./config/charts/inferencepool \ +$ helm install pool-1 ./config/charts/inferencepool \ --set inferencePool.name=pool-1 \ --set inferencePool.selector.app=vllm-llama2-7b \ --set inferencePool.targetPortNumber=8000 @@ -21,7 +21,7 @@ where `inferencePool.targetPortNumber` is the pod that vllm backends served on a Run the following command to uninstall the chart: ```txt -$ helm uninstall my-release +$ helm uninstall pool-1 ``` ## Configuration