From 4931640fb098e61c20c5313b535335a60b6ee662 Mon Sep 17 00:00:00 2001
From: Kuromesi <blackfacepan@163.com>
Date: Thu, 27 Feb 2025 10:53:55 +0800
Subject: [PATCH 1/9] initialize helm template

Signed-off-by: Kuromesi <blackfacepan@163.com>
---
 .../.helmignore                               |  23 +
 .../Chart.yaml                                |   9 +
 .../crds/crds.yaml                            | 917 ++++++++++++++++++
 .../generated.yaml                            | 300 ++++++
 .../templates/NOTES.txt                       |   1 +
 .../templates/_helpers.tpl                    |  42 +
 .../templates/enable_patch_policy.yaml        |  18 +
 .../templates/ext_proc.yaml                   |  73 ++
 .../templates/extension_policy.yaml           |  29 +
 .../templates/gateway.yaml                    |  51 +
 .../templates/patch_policy.yaml               |  47 +
 .../templates/rbac.yaml                       |  49 +
 .../templates/traffic_policy.yaml             |  17 +
 .../values.yaml                               |  25 +
 14 files changed, 1601 insertions(+)
 create mode 100644 config/manifests/gateway-api-inference-extension/.helmignore
 create mode 100644 config/manifests/gateway-api-inference-extension/Chart.yaml
 create mode 100644 config/manifests/gateway-api-inference-extension/crds/crds.yaml
 create mode 100644 config/manifests/gateway-api-inference-extension/generated.yaml
 create mode 100644 config/manifests/gateway-api-inference-extension/templates/NOTES.txt
 create mode 100644 config/manifests/gateway-api-inference-extension/templates/_helpers.tpl
 create mode 100644 config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml
 create mode 100644 config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
 create mode 100644 config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml
 create mode 100644 config/manifests/gateway-api-inference-extension/templates/gateway.yaml
 create mode 100644 config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml
 create mode 100644 config/manifests/gateway-api-inference-extension/templates/rbac.yaml
 create mode 100644 config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml
 create mode 100644 config/manifests/gateway-api-inference-extension/values.yaml

diff --git a/config/manifests/gateway-api-inference-extension/.helmignore b/config/manifests/gateway-api-inference-extension/.helmignore
new file mode 100644
index 000000000..0e8a0eb36
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/.helmignore
@@ -0,0 +1,23 @@
+# Paths matching the patterns below are left out when the chart is packaged.
+# Supported syntax: shell glob matching, relative path matching, and
+# negation (prefix the pattern with !). Write exactly one pattern per line.
+.DS_Store
+# Version-control metadata: directories and their ignore files
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Backup, swap, and temporary files left behind by editors and merge tools
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# IDE and editor project metadata
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/config/manifests/gateway-api-inference-extension/Chart.yaml b/config/manifests/gateway-api-inference-extension/Chart.yaml
new file mode 100644
index 000000000..b6cecc408
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/Chart.yaml
@@ -0,0 +1,9 @@
+apiVersion: v2  # Helm chart API version (chart metadata schema), not a Kubernetes apiVersion
+name: gateway-api-inference-extension
+description: A Helm chart for gateway-api-inference-extension
+
+type: application
+
+version: 0.1.0  # version of this chart itself; bump on every chart change
+
+appVersion: "1.16.0"  # NOTE(review): looks like the unchanged `helm create` scaffold value - confirm it matches the extension release this chart deploys
diff --git a/config/manifests/gateway-api-inference-extension/crds/crds.yaml b/config/manifests/gateway-api-inference-extension/crds/crds.yaml
new file mode 100644
index 000000000..31e654baf
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/crds/crds.yaml
@@ -0,0 +1,917 @@
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: inferencemodels.inference.networking.x-k8s.io
+spec:
+  group: inference.networking.x-k8s.io
+  names:
+    kind: InferenceModel
+    listKind: InferenceModelList
+    plural: inferencemodels
+    singular: inferencemodel
+  scope: Namespaced
+  versions:
+  - name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: InferenceModel is the Schema for the InferenceModels API.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: |-
+              InferenceModelSpec represents the desired state of a specific model use case. This resource is
+              managed by the "Inference Workload Owner" persona.
+
+              The Inference Workload Owner persona is someone that trains, verifies, and
+              leverages a large language model from a model frontend, drives the lifecycle
+              and rollout of new versions of those models, and defines the specific
+              performance and latency goals for the model. These workloads are
+              expected to operate within an InferencePool sharing compute capacity with other
+              InferenceModels, defined by the Inference Platform Admin.
+
+              InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool,
+              if the name is reused, an error will be shown on the status of an
+              InferenceModel that attempted to reuse. The oldest InferenceModel, based on
+              creation timestamp, will be selected to remain valid. In the event of a race
+              condition, one will be selected at random.
+            properties:
+              criticality:
+                description: |-
+                  Criticality defines how important it is to serve the model compared to other models referencing the same pool.
+                  Criticality impacts how traffic is handled in resource constrained situations. It handles this by
+                  queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
+                  fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
+                  and the proportionality of fairness will be configurable.
+
+                  Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
+                  Any implementations that may consume this field may treat an unset value as the 'Standard' range.
+                enum:
+                - Critical
+                - Standard
+                - Sheddable
+                type: string
+              modelName:
+                description: |-
+                  ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
+                  ModelNames must be unique for a referencing InferencePool
+                  (names can be reused for a different pool in the same cluster).
+                  The modelName with the oldest creation timestamp is retained, and the incoming
+                  InferenceModel's Ready status is set to false with a corresponding reason.
+                  In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
+                  Names can be reserved without an underlying model configured in the pool.
+                  This can be done by specifying a target model and setting the weight to zero,
+                  an error will be returned specifying that no valid target model is found.
+                maxLength: 256
+                type: string
+              poolRef:
+                description: PoolRef is a reference to the inference pool, the pool
+                  must exist in the same namespace.
+                properties:
+                  group:
+                    default: inference.networking.x-k8s.io
+                    description: Group is the group of the referent.
+                    maxLength: 253
+                    pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$
+                    type: string
+                  kind:
+                    default: InferencePool
+                    description: Kind is kind of the referent. For example "InferencePool".
+                    maxLength: 63
+                    minLength: 1
+                    pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$
+                    type: string
+                  name:
+                    description: Name is the name of the referent.
+                    maxLength: 253
+                    minLength: 1
+                    type: string
+                required:
+                - name
+                type: object
+              targetModels:
+                description: |-
+                  TargetModels allow multiple versions of a model for traffic splitting.
+                  If not specified, the target model name is defaulted to the modelName parameter.
+                  modelName is often in reference to a LoRA adapter.
+                items:
+                  description: |-
+                    TargetModel represents a deployed model or a LoRA adapter. The
+                    Name field is expected to match the name of the LoRA adapter
+                    (or base model) as it is registered within the model server. Inference
+                    Gateway assumes that the model exists on the model server and it's the
+                    responsibility of the user to validate a correct match. Should a model fail
+                    to exist at request time, the error is processed by the Inference Gateway
+                    and emitted on the appropriate InferenceModel object.
+                  properties:
+                    name:
+                      description: Name is the name of the adapter or base model,
+                        as expected by the ModelServer.
+                      maxLength: 253
+                      type: string
+                    weight:
+                      description: |-
+                        Weight is used to determine the proportion of traffic that should be
+                        sent to this model when multiple target models are specified.
+
+                        Weight defines the proportion of requests forwarded to the specified
+                        model. This is computed as weight/(sum of all weights in this
+                        TargetModels list). For non-zero values, there may be some epsilon from
+                        the exact proportion defined here depending on the precision an
+                        implementation supports. Weight is not a percentage and the sum of
+                        weights does not need to equal 100.
+
+                        If a weight is set for any targetModel, it must be set for all targetModels.
+                        Conversely weights are optional, so long as ALL targetModels do not specify a weight.
+                      format: int32
+                      maximum: 1000000
+                      minimum: 0
+                      type: integer
+                  required:
+                  - name
+                  type: object
+                maxItems: 10
+                type: array
+                x-kubernetes-validations:
+                - message: Weights should be set for all models, or none of the models.
+                  rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
+            required:
+            - modelName
+            - poolRef
+            type: object
+          status:
+            description: InferenceModelStatus defines the observed state of InferenceModel
+            properties:
+              conditions:
+                default:
+                - lastTransitionTime: "1970-01-01T00:00:00Z"
+                  message: Waiting for controller
+                  reason: Pending
+                  status: Unknown
+                  type: Ready
+                description: |-
+                  Conditions track the state of the InferenceModel.
+
+                  Known condition types are:
+
+                  * "Accepted"
+                items:
+                  description: Condition contains details for one aspect of the current
+                    state of this API Resource.
+                  properties:
+                    lastTransitionTime:
+                      description: |-
+                        lastTransitionTime is the last time the condition transitioned from one status to another.
+                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
+                      format: date-time
+                      type: string
+                    message:
+                      description: |-
+                        message is a human readable message indicating details about the transition.
+                        This may be an empty string.
+                      maxLength: 32768
+                      type: string
+                    observedGeneration:
+                      description: |-
+                        observedGeneration represents the .metadata.generation that the condition was set based upon.
+                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                        with respect to the current state of the instance.
+                      format: int64
+                      minimum: 0
+                      type: integer
+                    reason:
+                      description: |-
+                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                        Producers of specific condition types may define expected values and meanings for this field,
+                        and whether the values are considered a guaranteed API.
+                        The value should be a CamelCase string.
+                        This field may not be empty.
+                      maxLength: 1024
+                      minLength: 1
+                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                      type: string
+                    status:
+                      description: status of the condition, one of True, False, Unknown.
+                      enum:
+                      - "True"
+                      - "False"
+                      - Unknown
+                      type: string
+                    type:
+                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
+                      maxLength: 316
+                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                      type: string
+                  required:
+                  - lastTransitionTime
+                  - message
+                  - reason
+                  - status
+                  - type
+                  type: object
+                maxItems: 8
+                type: array
+                x-kubernetes-list-map-keys:
+                - type
+                x-kubernetes-list-type: map
+            type: object
+        type: object
+    served: true
+    storage: false
+    subresources:
+      status: {}
+  - name: v1alpha2
+    schema:
+      openAPIV3Schema:
+        description: InferenceModel is the Schema for the InferenceModels API.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: |-
+              InferenceModelSpec represents the desired state of a specific model use case. This resource is
+              managed by the "Inference Workload Owner" persona.
+
+              The Inference Workload Owner persona is someone that trains, verifies, and
+              leverages a large language model from a model frontend, drives the lifecycle
+              and rollout of new versions of those models, and defines the specific
+              performance and latency goals for the model. These workloads are
+              expected to operate within an InferencePool sharing compute capacity with other
+              InferenceModels, defined by the Inference Platform Admin.
+
+              InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool,
+              if the name is reused, an error will be shown on the status of an
+              InferenceModel that attempted to reuse. The oldest InferenceModel, based on
+              creation timestamp, will be selected to remain valid. In the event of a race
+              condition, one will be selected at random.
+            properties:
+              criticality:
+                description: |-
+                  Criticality defines how important it is to serve the model compared to other models referencing the same pool.
+                  Criticality impacts how traffic is handled in resource constrained situations. It handles this by
+                  queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
+                  fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
+                  and the proportionality of fairness will be configurable.
+
+                  Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
+                  Any implementations that may consume this field may treat an unset value as the 'Standard' range.
+                enum:
+                - Critical
+                - Standard
+                - Sheddable
+                type: string
+              modelName:
+                description: |-
+                  ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
+                  ModelNames must be unique for a referencing InferencePool
+                  (names can be reused for a different pool in the same cluster).
+                  The modelName with the oldest creation timestamp is retained, and the incoming
+                  InferenceModel's Ready status is set to false with a corresponding reason.
+                  In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
+                  Names can be reserved without an underlying model configured in the pool.
+                  This can be done by specifying a target model and setting the weight to zero,
+                  an error will be returned specifying that no valid target model is found.
+                maxLength: 256
+                type: string
+              poolRef:
+                description: PoolRef is a reference to the inference pool, the pool
+                  must exist in the same namespace.
+                properties:
+                  group:
+                    default: inference.networking.x-k8s.io
+                    description: Group is the group of the referent.
+                    maxLength: 253
+                    pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$
+                    type: string
+                  kind:
+                    default: InferencePool
+                    description: Kind is kind of the referent. For example "InferencePool".
+                    maxLength: 63
+                    minLength: 1
+                    pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$
+                    type: string
+                  name:
+                    description: Name is the name of the referent.
+                    maxLength: 253
+                    minLength: 1
+                    type: string
+                required:
+                - name
+                type: object
+              targetModels:
+                description: |-
+                  TargetModels allow multiple versions of a model for traffic splitting.
+                  If not specified, the target model name is defaulted to the modelName parameter.
+                  modelName is often in reference to a LoRA adapter.
+                items:
+                  description: |-
+                    TargetModel represents a deployed model or a LoRA adapter. The
+                    Name field is expected to match the name of the LoRA adapter
+                    (or base model) as it is registered within the model server. Inference
+                    Gateway assumes that the model exists on the model server and it's the
+                    responsibility of the user to validate a correct match. Should a model fail
+                    to exist at request time, the error is processed by the Inference Gateway
+                    and emitted on the appropriate InferenceModel object.
+                  properties:
+                    name:
+                      description: Name is the name of the adapter or base model,
+                        as expected by the ModelServer.
+                      maxLength: 253
+                      type: string
+                    weight:
+                      description: |-
+                        Weight is used to determine the proportion of traffic that should be
+                        sent to this model when multiple target models are specified.
+
+                        Weight defines the proportion of requests forwarded to the specified
+                        model. This is computed as weight/(sum of all weights in this
+                        TargetModels list). For non-zero values, there may be some epsilon from
+                        the exact proportion defined here depending on the precision an
+                        implementation supports. Weight is not a percentage and the sum of
+                        weights does not need to equal 100.
+
+                        If a weight is set for any targetModel, it must be set for all targetModels.
+                        Conversely weights are optional, so long as ALL targetModels do not specify a weight.
+                      format: int32
+                      maximum: 1000000
+                      minimum: 0
+                      type: integer
+                  required:
+                  - name
+                  type: object
+                maxItems: 10
+                type: array
+                x-kubernetes-validations:
+                - message: Weights should be set for all models, or none of the models.
+                  rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
+            required:
+            - modelName
+            - poolRef
+            type: object
+          status:
+            description: InferenceModelStatus defines the observed state of InferenceModel
+            properties:
+              conditions:
+                default:
+                - lastTransitionTime: "1970-01-01T00:00:00Z"
+                  message: Waiting for controller
+                  reason: Pending
+                  status: Unknown
+                  type: Ready
+                description: |-
+                  Conditions track the state of the InferenceModel.
+
+                  Known condition types are:
+
+                  * "Accepted"
+                items:
+                  description: Condition contains details for one aspect of the current
+                    state of this API Resource.
+                  properties:
+                    lastTransitionTime:
+                      description: |-
+                        lastTransitionTime is the last time the condition transitioned from one status to another.
+                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
+                      format: date-time
+                      type: string
+                    message:
+                      description: |-
+                        message is a human readable message indicating details about the transition.
+                        This may be an empty string.
+                      maxLength: 32768
+                      type: string
+                    observedGeneration:
+                      description: |-
+                        observedGeneration represents the .metadata.generation that the condition was set based upon.
+                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                        with respect to the current state of the instance.
+                      format: int64
+                      minimum: 0
+                      type: integer
+                    reason:
+                      description: |-
+                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                        Producers of specific condition types may define expected values and meanings for this field,
+                        and whether the values are considered a guaranteed API.
+                        The value should be a CamelCase string.
+                        This field may not be empty.
+                      maxLength: 1024
+                      minLength: 1
+                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                      type: string
+                    status:
+                      description: status of the condition, one of True, False, Unknown.
+                      enum:
+                      - "True"
+                      - "False"
+                      - Unknown
+                      type: string
+                    type:
+                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
+                      maxLength: 316
+                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                      type: string
+                  required:
+                  - lastTransitionTime
+                  - message
+                  - reason
+                  - status
+                  - type
+                  type: object
+                maxItems: 8
+                type: array
+                x-kubernetes-list-map-keys:
+                - type
+                x-kubernetes-list-type: map
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: inferencepools.inference.networking.x-k8s.io
+spec:
+  group: inference.networking.x-k8s.io
+  names:
+    kind: InferencePool
+    listKind: InferencePoolList
+    plural: inferencepools
+    singular: inferencepool
+  scope: Namespaced
+  versions:
+  - name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: InferencePool is the Schema for the InferencePools API.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: InferencePoolSpec defines the desired state of InferencePool
+            properties:
+              extensionRef:
+                description: Extension configures an endpoint picker as an extension
+                  service.
+                properties:
+                  failureMode:
+                    default: FailClose
+                    description: |-
+                      Configures how the gateway handles the case when the extension is not responsive.
+                      Defaults to failClose.
+                    enum:
+                    - FailOpen
+                    - FailClose
+                    type: string
+                  group:
+                    default: ""
+                    description: |-
+                      Group is the group of the referent.
+                      When unspecified or empty string, core API group is inferred.
+                    type: string
+                  kind:
+                    default: Service
+                    description: |-
+                      Kind is the Kubernetes resource kind of the referent. For example
+                      "Service".
+
+                      Defaults to "Service" when not specified.
+
+                      ExternalName services can refer to CNAME DNS records that may live
+                      outside of the cluster and as such are difficult to reason about in
+                      terms of conformance. They also may not be safe to forward to (see
+                      CVE-2021-25740 for more information). Implementations MUST NOT
+                      support ExternalName Services.
+                    type: string
+                  name:
+                    description: Name is the name of the referent.
+                    type: string
+                  targetPortNumber:
+                    description: |-
+                      The port number on the pods running the extension. When unspecified, implementations SHOULD infer a
+                      default value of 9002 when the Kind is Service.
+                    format: int32
+                    maximum: 65535
+                    minimum: 1
+                    type: integer
+                required:
+                - name
+                type: object
+              selector:
+                additionalProperties:
+                  description: |-
+                    LabelValue is the value of a label. This is used for validation
+                    of maps. This matches the Kubernetes label validation rules:
+                    * must be 63 characters or less (can be empty),
+                    * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+                    * could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+                    Valid values include:
+
+                    * MyValue
+                    * my.name
+                    * 123-my-value
+                  maxLength: 63
+                  minLength: 0
+                  pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$
+                  type: string
+                description: |-
+                  Selector defines a map of labels to watch model server pods
+                  that should be included in the InferencePool.
+                  In some cases, implementations may translate this field to a Service selector, so this matches the simple
+                  map used for Service selectors instead of the full Kubernetes LabelSelector type.
+                type: object
+              targetPortNumber:
+                description: |-
+                  TargetPortNumber defines the port number to access the selected model servers.
+                  The number must be in the range 1 to 65535.
+                format: int32
+                maximum: 65535
+                minimum: 1
+                type: integer
+            required:
+            - extensionRef
+            - selector
+            - targetPortNumber
+            type: object
+          status:
+            description: InferencePoolStatus defines the observed state of InferencePool
+            properties:
+              conditions:
+                default:
+                - lastTransitionTime: "1970-01-01T00:00:00Z"
+                  message: Waiting for controller
+                  reason: Pending
+                  status: Unknown
+                  type: Ready
+                description: |-
+                  Conditions track the state of the InferencePool.
+
+                  Known condition types are:
+
+                  * "Ready"
+                items:
+                  description: Condition contains details for one aspect of the current
+                    state of this API Resource.
+                  properties:
+                    lastTransitionTime:
+                      description: |-
+                        lastTransitionTime is the last time the condition transitioned from one status to another.
+                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
+                      format: date-time
+                      type: string
+                    message:
+                      description: |-
+                        message is a human readable message indicating details about the transition.
+                        This may be an empty string.
+                      maxLength: 32768
+                      type: string
+                    observedGeneration:
+                      description: |-
+                        observedGeneration represents the .metadata.generation that the condition was set based upon.
+                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                        with respect to the current state of the instance.
+                      format: int64
+                      minimum: 0
+                      type: integer
+                    reason:
+                      description: |-
+                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                        Producers of specific condition types may define expected values and meanings for this field,
+                        and whether the values are considered a guaranteed API.
+                        The value should be a CamelCase string.
+                        This field may not be empty.
+                      maxLength: 1024
+                      minLength: 1
+                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                      type: string
+                    status:
+                      description: status of the condition, one of True, False, Unknown.
+                      enum:
+                      - "True"
+                      - "False"
+                      - Unknown
+                      type: string
+                    type:
+                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
+                      maxLength: 316
+                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                      type: string
+                  required:
+                  - lastTransitionTime
+                  - message
+                  - reason
+                  - status
+                  - type
+                  type: object
+                maxItems: 8
+                type: array
+                x-kubernetes-list-map-keys:
+                - type
+                x-kubernetes-list-type: map
+            type: object
+        type: object
+    served: true
+    storage: false
+    subresources:
+      status: {}
+  - name: v1alpha2
+    schema:
+      openAPIV3Schema:
+        description: InferencePool is the Schema for the InferencePools API.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: InferencePoolSpec defines the desired state of InferencePool
+            properties:
+              extensionRef:
+                description: Extension configures an endpoint picker as an extension
+                  service.
+                properties:
+                  failureMode:
+                    default: FailClose
+                    description: |-
+                      Configures how the gateway handles the case when the extension is not responsive.
+                      Defaults to failClose.
+                    enum:
+                    - FailOpen
+                    - FailClose
+                    type: string
+                  group:
+                    default: ""
+                    description: |-
+                      Group is the group of the referent.
+                      When unspecified or empty string, core API group is inferred.
+                    type: string
+                  kind:
+                    default: Service
+                    description: |-
+                      Kind is the Kubernetes resource kind of the referent. For example
+                      "Service".
+
+                      Defaults to "Service" when not specified.
+
+                      ExternalName services can refer to CNAME DNS records that may live
+                      outside of the cluster and as such are difficult to reason about in
+                      terms of conformance. They also may not be safe to forward to (see
+                      CVE-2021-25740 for more information). Implementations MUST NOT
+                      support ExternalName Services.
+                    type: string
+                  name:
+                    description: Name is the name of the referent.
+                    type: string
+                  targetPortNumber:
+                    description: |-
+                      The port number on the service running the extension. When unspecified, implementations SHOULD infer a
+                      default value of 9002 when the Kind is Service.
+                    format: int32
+                    maximum: 65535
+                    minimum: 1
+                    type: integer
+                required:
+                - name
+                type: object
+              selector:
+                additionalProperties:
+                  description: |-
+                    LabelValue is the value of a label. This is used for validation
+                    of maps. This matches the Kubernetes label validation rules:
+                    * must be 63 characters or less (can be empty),
+                    * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+                    * could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+                    Valid values include:
+
+                    * MyValue
+                    * my.name
+                    * 123-my-value
+                  maxLength: 63
+                  minLength: 0
+                  pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$
+                  type: string
+                description: |-
+                  Selector defines a map of labels to watch model server pods
+                  that should be included in the InferencePool.
+                  In some cases, implementations may translate this field to a Service selector, so this matches the simple
+                  map used for Service selectors instead of the full Kubernetes LabelSelector type.
+                type: object
+              targetPortNumber:
+                description: |-
+                  TargetPortNumber defines the port number to access the selected model servers.
+                  The number must be in the range 1 to 65535.
+                format: int32
+                maximum: 65535
+                minimum: 1
+                type: integer
+            required:
+            - extensionRef
+            - selector
+            - targetPortNumber
+            type: object
+          status:
+            description: InferencePoolStatus defines the observed state of InferencePool
+            properties:
+              parent:
+                description: |-
+                  Parents is a list of parent resources (usually Gateways) that are
+                  associated with the route, and the status of the InferencePool with respect to
+                  each parent.
+
+                  A maximum of 32 Gateways will be represented in this list. An empty list
+                  means the route has not been attached to any Gateway.
+                items:
+                  description: PoolStatus defines the observed state of InferencePool
+                    from a gateway.
+                  properties:
+                    conditions:
+                      default:
+                      - lastTransitionTime: "1970-01-01T00:00:00Z"
+                        message: Waiting for controller
+                        reason: Pending
+                        status: Unknown
+                        type: Ready
+                      description: |-
+                        Conditions track the state of the InferencePool.
+
+                        Known condition types are:
+
+                        * "Ready"
+                      items:
+                        description: Condition contains details for one aspect of
+                          the current state of this API Resource.
+                        properties:
+                          lastTransitionTime:
+                            description: |-
+                              lastTransitionTime is the last time the condition transitioned from one status to another.
+                              This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
+                            format: date-time
+                            type: string
+                          message:
+                            description: |-
+                              message is a human readable message indicating details about the transition.
+                              This may be an empty string.
+                            maxLength: 32768
+                            type: string
+                          observedGeneration:
+                            description: |-
+                              observedGeneration represents the .metadata.generation that the condition was set based upon.
+                              For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                              with respect to the current state of the instance.
+                            format: int64
+                            minimum: 0
+                            type: integer
+                          reason:
+                            description: |-
+                              reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                              Producers of specific condition types may define expected values and meanings for this field,
+                              and whether the values are considered a guaranteed API.
+                              The value should be a CamelCase string.
+                              This field may not be empty.
+                            maxLength: 1024
+                            minLength: 1
+                            pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                            type: string
+                          status:
+                            description: status of the condition, one of True, False,
+                              Unknown.
+                            enum:
+                            - "True"
+                            - "False"
+                            - Unknown
+                            type: string
+                          type:
+                            description: type of condition in CamelCase or in foo.example.com/CamelCase.
+                            maxLength: 316
+                            pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                            type: string
+                        required:
+                        - lastTransitionTime
+                        - message
+                        - reason
+                        - status
+                        - type
+                        type: object
+                      maxItems: 8
+                      type: array
+                      x-kubernetes-list-map-keys:
+                      - type
+                      x-kubernetes-list-type: map
+                    parentRef:
+                      description: GatewayRef indicates the gateway that observed
+                        state of InferencePool.
+                      properties:
+                        apiVersion:
+                          description: API version of the referent.
+                          type: string
+                        fieldPath:
+                          description: |-
+                            If referring to a piece of an object instead of an entire object, this string
+                            should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2].
+                            For example, if the object reference is to a container within a pod, this would take on a value like:
+                            "spec.containers{name}" (where "name" refers to the name of the container that triggered
+                            the event) or if no container name is specified "spec.containers[2]" (container with
+                            index 2 in this pod). This syntax is chosen only to have some well-defined way of
+                            referencing a part of an object.
+                          type: string
+                        kind:
+                          description: |-
+                            Kind of the referent.
+                            More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+                          type: string
+                        name:
+                          description: |-
+                            Name of the referent.
+                            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                          type: string
+                        namespace:
+                          description: |-
+                            Namespace of the referent.
+                            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/
+                          type: string
+                        resourceVersion:
+                          description: |-
+                            Specific resourceVersion to which this reference is made, if any.
+                            More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency
+                          type: string
+                        uid:
+                          description: |-
+                            UID of the referent.
+                            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids
+                          type: string
+                      type: object
+                      x-kubernetes-map-type: atomic
+                  required:
+                  - parentRef
+                  type: object
+                maxItems: 32
+                type: array
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/generated.yaml b/config/manifests/gateway-api-inference-extension/generated.yaml
new file mode 100644
index 000000000..3a1980294
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/generated.yaml
@@ -0,0 +1,300 @@
+---
+# Source: gateway-api-inference-extension/templates/rbac.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: inference-gateway-ext-proc-release-name  # same name the ext-proc Deployment sets as serviceAccountName
+  namespace: default
+  labels:
+    app: inference-gateway-ext-proc-release-name
+---
+# Source: gateway-api-inference-extension/templates/enable_patch_policy.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: envoy-gateway-config  # NOTE(review): fixed name, not release-scoped; presumably replaces Envoy Gateway's own config - confirm
+  namespace: envoy-gateway-system
+data:
+  envoy-gateway.yaml: |
+    apiVersion: gateway.envoyproxy.io/v1alpha1
+    kind: EnvoyGateway
+    provider:
+      type: Kubernetes
+    gateway:
+      controllerName: gateway.envoyproxy.io/gatewayclass-controller
+    extensionApis:
+      enableEnvoyPatchPolicy: true
+      enableBackend: true
+---
+# Source: gateway-api-inference-extension/templates/rbac.yaml
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: inference-extension-default-release-name  # cluster-scoped, so the name embeds the namespace ("default") and the release name
+rules:
+- apiGroups: ["inference.networking.x-k8s.io"]  # read-only access to InferenceModel objects
+  resources: ["inferencemodels"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: [""]  # "" is the core API group
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["inference.networking.x-k8s.io"]  # read-only access to InferencePool objects
+  resources: ["inferencepools"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "watch", "list"]
+- apiGroups:  # NOTE(review): presumably for delegated authentication of incoming requests (e.g. metrics) - confirm
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:  # NOTE(review): presumably for delegated authorization checks - confirm
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
+---
+# Source: gateway-api-inference-extension/templates/rbac.yaml
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: inference-extension-default-release-name
+subjects:  # binds the ClusterRole below to the ext-proc ServiceAccount in "default"
+- kind: ServiceAccount
+  name: inference-gateway-ext-proc-release-name
+  namespace: default
+roleRef:  # NOTE(review): no apiGroup here; RoleRef normally carries apiGroup: rbac.authorization.k8s.io - confirm in templates/rbac.yaml
+  kind: ClusterRole
+  name: inference-extension-default-release-name
+---
+# Source: gateway-api-inference-extension/templates/ext_proc.yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: inference-gateway-ext-proc-release-name  # referenced by the EnvoyExtensionPolicy extProc backendRef
+  namespace: default
+spec:
+  selector:  # same app label the ext-proc Deployment puts on its pods
+    app: inference-gateway-ext-proc-release-name
+  ports:
+    - name: grpc  # same number as the Deployment's -grpcPort argument
+      protocol: TCP
+      port: 9002
+      targetPort: 9002
+    - name: http-metrics  # same number as the Deployment's -metricsPort argument
+      protocol: TCP
+      port: 9090
+      targetPort: 9090
+  type: ClusterIP
+---
+# Source: gateway-api-inference-extension/templates/ext_proc.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: inference-gateway-ext-proc-release-name
+  namespace: default
+  labels:
+    app: inference-gateway-ext-proc-release-name
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: inference-gateway-ext-proc-release-name
+  template:
+    metadata:
+      labels:
+        app: inference-gateway-ext-proc-release-name  # must stay equal to spec.selector.matchLabels above
+    spec:
+      serviceAccountName: inference-gateway-ext-proc-release-name
+      containers:
+      - name: inference-gateway-ext-proc
+        image: registry-cn-hangzhou.ack.aliyuncs.com/dev/gateway-api-inference-extension/epp:main  # NOTE(review): mutable "main" tag - consider pinning
+        imagePullPolicy: Always
+        args:
+        - -poolName
+        - vllm-llama2-7b-pool  # rendered from .Values.inferencePool.name
+        - -poolNamespace
+        - default  # rendered from the release namespace
+        - -v
+        - "3"
+        - -grpcPort
+        - "9002"
+        - -grpcHealthPort
+        - "9003"
+        - -metricsPort
+        - "9090"
+        ports:  # container ports repeat the numbers passed in args above
+        - name: grpc
+          containerPort: 9002
+        - name: grpc-health
+          containerPort: 9003
+        - name: metrics
+          containerPort: 9090
+        livenessProbe:
+          grpc:
+            port: 9003  # same number as -grpcHealthPort
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        readinessProbe:
+          grpc:
+            port: 9003  # same number as -grpcHealthPort
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+---
+# Source: gateway-api-inference-extension/templates/gateway.yaml
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: Backend
+metadata:
+  name: backend-release-name  # referenced by the HTTPRoute backendRefs
+spec:
+  endpoints:
+    - fqdn:
+        hostname: 'foo.bar.com'  # NOTE(review): looks like a placeholder; the EnvoyPatchPolicy repoints the route cluster - confirm
+        port: 8080
+---
+# Source: gateway-api-inference-extension/templates/traffic_policy.yaml
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: BackendTrafficPolicy
+metadata:
+  name: high-connection-route-policy-release-name  # template note: make sure the reference has the leading "." prefix
+  namespace: default  # was rendered empty; same namespace as the targeted HTTPRoute (llm-route-release-name)
+spec:
+  targetRefs:
+  - group: gateway.networking.k8s.io
+    kind: HTTPRoute
+    name: llm-route-release-name
+  circuitBreaker:
+    maxConnections: 40000
+    maxPendingRequests: 40000
+    maxParallelRequests: 40000
+  timeout:
+    tcp:
+      connectTimeout: 24h
+---
+# Source: gateway-api-inference-extension/templates/extension_policy.yaml
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyExtensionPolicy
+metadata:
+  name: ext-proc-policy-release-name  # this name is embedded in a cluster name patched by the EnvoyPatchPolicy
+  namespace: default
+spec:
+  extProc:
+    - backendRefs:  # the ext-proc Service, on its grpc port
+      - group: ""
+        kind: Service
+        name: inference-gateway-ext-proc-release-name
+        port: 9002
+      processingMode:
+        request:
+          body: Buffered
+        response:  # NOTE(review): empty value parses as null, which is not the same as {} - confirm this is intended
+      messageTimeout: 1000s
+      backendSettings:  # same limits as the BackendTrafficPolicy on the route
+        circuitBreaker:
+          maxConnections: 40000
+          maxPendingRequests: 40000
+          maxParallelRequests: 40000
+        timeout:
+          tcp:
+            connectTimeout: 24h
+  targetRef:
+    group: gateway.networking.k8s.io
+    kind: HTTPRoute
+    name: llm-route-release-name
+---
+# Source: gateway-api-inference-extension/templates/patch_policy.yaml
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyPatchPolicy
+metadata:
+  name: custom-response-patch-policy-release-name
+  namespace: default
+spec:
+  targetRef:
+    group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway-release-name
+  type: JSONPatch
+  jsonPatches:
+    - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"  # patch 1: add a whole new cluster
+      name: original_destination_cluster
+      operation:
+        op: add
+        path: ""  # empty path: the value below is the entire Cluster resource
+        value:
+          name: original_destination_cluster
+          type: ORIGINAL_DST
+          original_dst_lb_config:
+            use_http_header: true  # presumably the upstream address is taken from the header named below - confirm
+            http_header_name: "x-gateway-destination-endpoint"
+          connect_timeout: 1000s
+          lb_policy: CLUSTER_PROVIDED
+          dns_lookup_family: V4_ONLY
+          circuit_breakers:
+            thresholds:
+            - max_connections: 40000
+              max_pending_requests: 40000
+              max_requests: 40000
+    - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"  # patch 2: add a TLS transport socket to an existing cluster
+      name: "envoyextensionpolicy/default/ext-proc-policy-release-name/extproc/0"  # presumably the cluster generated for extProc[0] of the EnvoyExtensionPolicy - confirm
+      operation:
+        op: add
+        path: "/transport_socket"
+        value:
+          name: "envoy.transport_sockets.tls"
+          typed_config:
+            "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext"
+            common_tls_context: {}
+    - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"  # patch 3: repoint the first route at the cluster added by patch 1
+      name: default/inference-gateway-release-name/llm-gw  # presumably <namespace>/<gateway>/<listener> - confirm
+      operation:
+        op: replace
+        path: "/virtual_hosts/0/routes/0/route/cluster"
+        value: original_destination_cluster
+---
+# Source: gateway-api-inference-extension/templates/gateway.yaml
+apiVersion: gateway.networking.k8s.io/v1
+kind: Gateway
+metadata:
+  name: inference-gateway-release-name  # targeted by the EnvoyPatchPolicy and by the HTTPRoute parentRefs
+  namespace: default
+spec:
+  gatewayClassName: inference-gateway-release-name  # the GatewayClass rendered from the same template
+  listeners:
+    - name: http
+      protocol: HTTP
+      port: 8080
+    - name: llm-gw  # the listener the HTTPRoute attaches to via sectionName
+      protocol: HTTP
+      port: 8081
+---
+# Source: gateway-api-inference-extension/templates/gateway.yaml
+apiVersion: gateway.networking.k8s.io/v1
+kind: GatewayClass
+metadata:
+  name: inference-gateway-release-name
+spec:
+  controllerName: gateway.envoyproxy.io/gatewayclass-controller  # same controllerName as in the envoy-gateway-config ConfigMap
+---
+# Source: gateway-api-inference-extension/templates/gateway.yaml
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-route-release-name  # targeted by the BackendTrafficPolicy and the EnvoyExtensionPolicy
+  namespace: default
+spec:
+  parentRefs:
+    - name: inference-gateway-release-name
+      sectionName: llm-gw  # attach to the Gateway's llm-gw listener only
+  rules:
+  - backendRefs:
+      - group: gateway.envoyproxy.io
+        kind: Backend
+        name: backend-release-name
+    timeouts:
+      request: "24h"
+      backendRequest: "24h"
diff --git a/config/manifests/gateway-api-inference-extension/templates/NOTES.txt b/config/manifests/gateway-api-inference-extension/templates/NOTES.txt
new file mode 100644
index 000000000..5d5ea8794
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/templates/NOTES.txt
@@ -0,0 +1 @@
+Gateway API Inference Extension deployed.
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl
new file mode 100644
index 000000000..7294f7f99
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl
@@ -0,0 +1,42 @@
+{{- define "httpRoute.name" -}}
+{{- printf "llm-route-%s" .Release.Name -}}
+{{- end -}}
+
+{{- define "backend.name" -}}
+{{- printf "backend-%s" .Release.Name -}}
+{{- end -}}
+
+{{- define "gatewayClass.name" -}}
+{{- printf "inference-gateway-%s" .Release.Name -}}
+{{- end -}}
+
+{{- define "gateway.name" -}}
+{{- printf "inference-gateway-%s" .Release.Name -}}
+{{- end -}}
+
+{{- define "envoyExtensionPolicy.name" -}}
+{{- printf "ext-proc-policy-%s" .Release.Name -}}
+{{- end -}}
+
+{{- define "envoyPatchPolicy.name" -}}
+{{- printf "custom-response-patch-policy-%s" .Release.Name -}}
+{{- end -}}
+
+{{/*
+Selector labels: the single `app` label shared by the Deployment selector and its pod template.
+*/}}
+{{- define "gateway-api-inference-extension.selectorLabels" -}}
+{{- printf "app: %s" (include "gateway-api-inference-extension.name" .) -}}
+{{- end -}}
+
+{{- define "clusterRole.name" -}}
+{{- printf "inference-extension-%s-%s" .Release.Namespace .Release.Name -}}
+{{- end -}}
+
+{{- define "backendTrafficPolicy.name" -}}
+{{- printf "high-connection-route-policy-%s" .Release.Name -}}
+{{- end -}}
+
+{{- define "gateway-api-inference-extension.name" -}}{{- /* Name of the ext-proc Deployment and ServiceAccount, also used as the `app` label value; capped at 63 characters (label-value limit), with any trailing "-" left by the cut removed. */ -}}
+{{- printf "inference-gateway-ext-proc-%s" .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
diff --git a/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml
new file mode 100644
index 000000000..21b0aa866
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml
@@ -0,0 +1,18 @@
+{{ if .Values.envoy.enablePatchPolicy }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: envoy-gateway-config  # NOTE(review): fixed name, not release-scoped; presumably replaces Envoy Gateway's own config - confirm
+  namespace: {{ .Values.envoy.namespace | default "envoy-gateway-system" | quote }}
+data:
+  envoy-gateway.yaml: |
+    apiVersion: gateway.envoyproxy.io/v1alpha1
+    kind: EnvoyGateway
+    provider:
+      type: Kubernetes
+    gateway:
+      controllerName: gateway.envoyproxy.io/gatewayclass-controller
+    extensionApis:
+      enableEnvoyPatchPolicy: true
+      enableBackend: true
+{{ end }}
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
new file mode 100644
index 000000000..bd53c9334
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
@@ -0,0 +1,73 @@
+apiVersion: apps/v1
+kind: Deployment  # runs the inference-extension ext-proc container
+metadata:
+  name: {{ include "gateway-api-inference-extension.name" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app: {{ include "gateway-api-inference-extension.name" . }}
+spec:
+  replicas: {{ ternary 1 .Values.inferenceExtension.replicas (kindIs "invalid" .Values.inferenceExtension.replicas) }}  # only an unset value falls back to 1; an explicit 0 is kept
+  selector:
+    matchLabels:
+      {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }}
+    spec:
+      serviceAccountName: {{ include "gateway-api-inference-extension.name" . }}
+      containers:
+      - name: inference-gateway-ext-proc
+        image: "{{ .Values.inferenceExtension.image.hub }}:{{ .Values.inferenceExtension.image.tag }}"
+        imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }}
+        args:  # every element must render as a string, hence the quoting below
+        - -poolName
+        - {{ required "inferencePool.name must be set" .Values.inferencePool.name | quote }}
+        - -poolNamespace
+        - {{ .Release.Namespace | quote }}
+        - -v
+        - {{ ternary 3 .Values.inferenceExtension.logLevel (kindIs "invalid" .Values.inferenceExtension.logLevel) | quote }}
+        - -grpcPort
+        - {{ .Values.inferenceExtension.grpcPort | default 9002 | quote }}
+        - -grpcHealthPort
+        - "9003"  # fixed; the grpc-health container port and both probes below use the same value
+        - -metricsPort
+        - {{ .Values.inferenceExtension.metricsPort | default 9090 | quote }}
+        ports:
+        - name: grpc
+          containerPort: {{ .Values.inferenceExtension.grpcPort | default 9002 }}
+        - name: grpc-health
+          containerPort: 9003
+        - name: metrics
+          containerPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
+        livenessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        readinessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+---
+apiVersion: v1
+kind: Service  # exposes the ext-proc gRPC and metrics ports inside the cluster
+metadata:
+  namespace: {{ .Release.Namespace }}
+  name: {{ include "gateway-api-inference-extension.name" . }}
+spec:
+  type: ClusterIP
+  selector:
+    {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }}
+  ports:
+    - name: grpc
+      port: {{ .Values.inferenceExtension.grpcPort | default 9002 }}
+      targetPort: {{ .Values.inferenceExtension.grpcPort | default 9002 }}
+      protocol: TCP
+    - name: http-metrics
+      port: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
+      targetPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
+      protocol: TCP
diff --git a/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml
new file mode 100644
index 000000000..ed84e6f5c
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml
@@ -0,0 +1,29 @@
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyExtensionPolicy  # attaches the ext-proc Service as an external processor to the HTTPRoute
+metadata:
+  name: {{ include "envoyExtensionPolicy.name" . }}
+  namespace: {{ .Release.Namespace }}
+spec:
+  extProc:
+    - backendRefs:
+      - group: ""
+        kind: Service
+        name: {{ include "gateway-api-inference-extension.name" . }}
+        port: {{ .Values.inferenceExtension.grpcPort | default 9002 }}  # same value as the Service's grpc port
+      processingMode:
+        request:
+          body: Buffered
+        response: null  # NOTE(review): was a bare `response:` (null, not {}) — confirm response processing is meant to stay unset
+      messageTimeout: 1000s
+      backendSettings:
+        circuitBreaker:
+          maxConnections: 40000
+          maxPendingRequests: 40000
+          maxParallelRequests: 40000
+        timeout:
+          tcp:
+            connectTimeout: 24h
+  targetRef:
+    group: gateway.networking.k8s.io
+    kind: HTTPRoute
+    name: {{ include "httpRoute.name" . }}
diff --git a/config/manifests/gateway-api-inference-extension/templates/gateway.yaml b/config/manifests/gateway-api-inference-extension/templates/gateway.yaml
new file mode 100644
index 000000000..f0259f527
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/templates/gateway.yaml
@@ -0,0 +1,51 @@
+
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: Gateway
+metadata:
+  namespace: {{ .Release.Namespace }}
+  name: {{ include "gateway.name" . }}
+spec:
+  gatewayClassName: {{ include "gatewayClass.name" . }}
+  listeners:
+    - port: 8080
+      protocol: HTTP
+      name: http
+    - port: {{ .Values.gateway.port }}  # listener the HTTPRoute attaches to through sectionName llm-gw
+      protocol: HTTP
+      name: llm-gw
+---
+kind: GatewayClass
+apiVersion: gateway.networking.k8s.io/v1
+metadata:
+  name: {{ include "gatewayClass.name" . }}
+spec:
+  controllerName: gateway.envoyproxy.io/gatewayclass-controller  # same controller name as in the EnvoyGateway config of this chart
+---
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: Backend
+metadata:
+  name: {{ include "backend.name" . }}
+  namespace: {{ .Release.Namespace }}  # explicit, like the HTTPRoute that references this Backend by name only
+spec:
+  endpoints:
+    # NOTE(review): foo.bar.com looks like a placeholder; presumably unused once the patch policy rewrites the route cluster — confirm
+    - fqdn: {hostname: 'foo.bar.com', port: 8080}
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  namespace: {{ .Release.Namespace }}
+  name: {{ include "httpRoute.name" . }}
+spec:
+  parentRefs:
+    - sectionName: llm-gw  # attaches to the llm-gw listener of the Gateway only
+      name: {{ include "gateway.name" . }}
+  rules:
+    - timeouts:
+        backendRequest: "24h"
+        request: "24h"
+      backendRefs:
+        - name: {{ include "backend.name" . }}
+          kind: Backend
+          group: gateway.envoyproxy.io
diff --git a/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml
new file mode 100644
index 000000000..e789b0e2f
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml
@@ -0,0 +1,47 @@
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyPatchPolicy
+metadata:
+  namespace: {{ .Release.Namespace }}
+  name: {{ include "envoyPatchPolicy.name" . }}
+spec:
+  type: JSONPatch
+  targetRef:
+    name: {{ include "gateway.name" . }}
+    kind: Gateway
+    group: gateway.networking.k8s.io
+  jsonPatches:
+    - name: original_destination_cluster  # adds an ORIGINAL_DST cluster whose upstream address comes from the x-gateway-destination-endpoint header
+      type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"
+      operation:
+        op: add
+        path: ""
+        value:
+          name: original_destination_cluster
+          type: ORIGINAL_DST
+          lb_policy: CLUSTER_PROVIDED
+          dns_lookup_family: V4_ONLY
+          connect_timeout: 1000s
+          original_dst_lb_config:
+            use_http_header: true
+            http_header_name: "x-gateway-destination-endpoint"
+          circuit_breakers:
+            thresholds:
+            - max_connections: 40000
+              max_pending_requests: 40000
+              max_requests: 40000
+    - name: "envoyextensionpolicy/{{ .Release.Namespace }}/{{ include "envoyExtensionPolicy.name" . }}/extproc/0"
+      type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"  # NOTE(review): presumably the cluster generated for the policy's first extProc — confirm
+      operation:
+        op: add
+        path: "/transport_socket"
+        value:
+          name: "envoy.transport_sockets.tls"
+          typed_config:
+            "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext"
+            common_tls_context: {}
+    - name: {{ .Release.Namespace }}/{{ include "gateway.name" . }}/llm-gw
+      type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"
+      operation:
+        op: replace  # points the first route of the first virtual host at the cluster added by the first patch
+        path: "/virtual_hosts/0/routes/0/route/cluster"
+        value: original_destination_cluster
diff --git a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml
new file mode 100644
index 000000000..73ff0aa6c
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml
@@ -0,0 +1,49 @@
+# Read access for the ext-proc to inference CRs, pods and endpoint slices, plus
+# permission to create token and subject-access reviews.
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ include "clusterRole.name" . }}
+rules:
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencemodels"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencepools"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["authentication.k8s.io"]
+  resources: ["tokenreviews"]
+  verbs: ["create"]
+- apiGroups: ["authorization.k8s.io"]
+  resources: ["subjectaccessreviews"]
+  verbs: ["create"]
+---
+# Grants the ClusterRole above to the ext-proc ServiceAccount of this release.
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ include "clusterRole.name" . }}
+subjects:
+- kind: ServiceAccount
+  name: {{ include "gateway-api-inference-extension.name" . }}
+  namespace: {{ .Release.Namespace }}
+roleRef:
+  # apiGroup is a required RoleRef field in the RBAC v1 schema; set it explicitly.
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "clusterRole.name" . }}
+---
+# ServiceAccount used by the ext-proc Deployment (its serviceAccountName).
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "gateway-api-inference-extension.name" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app: {{ include "gateway-api-inference-extension.name" . }}
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml
new file mode 100644
index 000000000..92ba989c3
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml
@@ -0,0 +1,17 @@
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: BackendTrafficPolicy  # circuit-breaker and TCP connect-timeout settings for the inference HTTPRoute
+metadata:
+  name: {{ include "backendTrafficPolicy.name" . }}
+  namespace: {{ .Release.Namespace }}  # same namespace as the targeted HTTPRoute
+spec:
+  targetRefs:
+  - group: gateway.networking.k8s.io
+    kind: HTTPRoute
+    name: {{ include "httpRoute.name" . }}
+  circuitBreaker:
+    maxConnections: 40000
+    maxPendingRequests: 40000
+    maxParallelRequests: 40000
+  timeout:
+    tcp:
+      connectTimeout: 24h
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/values.yaml b/config/manifests/gateway-api-inference-extension/values.yaml
new file mode 100644
index 000000000..cbda8e573
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/values.yaml
@@ -0,0 +1,25 @@
+inferenceExtension:
+  replicas: 1  # replica count of the ext-proc Deployment
+  image:
+    hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp  # image repository; rendered as hub:tag
+    tag: main
+    pullPolicy: Always
+
+  name: inference-gateway-ext-proc  # NOTE(review): not referenced by the templates shown in this patch — confirm it is used
+  serviceName: inference-gateway-ext-proc  # NOTE(review): not referenced by the templates shown in this patch — confirm it is used
+  grpcPort: 9002  # ext-proc gRPC port: -grpcPort flag, container port and Service port
+  metricsPort: 9090  # ext-proc metrics port: -metricsPort flag, container port and Service port
+  logLevel: 3  # passed to the ext-proc as -v
+
+inferencePool:
+  name: vllm-llama2-7b-pool  # passed to the ext-proc as -poolName
+
+gateway:
+  port: 8081  # port of the llm-gw listener on the Gateway
+
+envoy:
+  # Namespace in which the envoy-gateway-config ConfigMap is created.
+  namespace: envoy-gateway-system
+
+  # When true, render the ConfigMap that enables the EnvoyPatchPolicy and Backend extension APIs.
+  enablePatchPolicy: true

From 23664605b40cb4176c5eddb2e165a4824b7e7171 Mon Sep 17 00:00:00 2001
From: Kuromesi <blackfacepan@163.com>
Date: Fri, 28 Feb 2025 16:24:05 +0800
Subject: [PATCH 2/9] tidy template

Signed-off-by: Kuromesi <blackfacepan@163.com>
---
 .../Chart.yaml                                |   2 +-
 .../crds/crds.yaml                            | 917 ------------------
 .../generated.yaml                            | 300 ------
 .../templates/_helpers.tpl                    |  48 +-
 .../templates/enable_patch_policy.yaml        |  18 -
 .../templates/ext_proc.yaml                   |  24 +-
 .../templates/extension_policy.yaml           |  29 -
 .../templates/gateway.yaml                    |  51 -
 .../templates/patch_policy.yaml               |  47 -
 .../templates/rbac.yaml                       |  14 +-
 .../templates/traffic_policy.yaml             |  17 -
 .../values.yaml                               |  20 +-
 config/manifests/install.yaml                 | 137 +++
 13 files changed, 174 insertions(+), 1450 deletions(-)
 delete mode 100644 config/manifests/gateway-api-inference-extension/crds/crds.yaml
 delete mode 100644 config/manifests/gateway-api-inference-extension/generated.yaml
 delete mode 100644 config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml
 delete mode 100644 config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml
 delete mode 100644 config/manifests/gateway-api-inference-extension/templates/gateway.yaml
 delete mode 100644 config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml
 delete mode 100644 config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml
 create mode 100644 config/manifests/install.yaml

diff --git a/config/manifests/gateway-api-inference-extension/Chart.yaml b/config/manifests/gateway-api-inference-extension/Chart.yaml
index b6cecc408..dd194a652 100644
--- a/config/manifests/gateway-api-inference-extension/Chart.yaml
+++ b/config/manifests/gateway-api-inference-extension/Chart.yaml
@@ -6,4 +6,4 @@ type: application
 
 version: 0.1.0
 
-appVersion: "1.16.0"
+appVersion: "0.1.0"
diff --git a/config/manifests/gateway-api-inference-extension/crds/crds.yaml b/config/manifests/gateway-api-inference-extension/crds/crds.yaml
deleted file mode 100644
index 31e654baf..000000000
--- a/config/manifests/gateway-api-inference-extension/crds/crds.yaml
+++ /dev/null
@@ -1,917 +0,0 @@
-apiVersion: apiextensions.k8s.io/v1
-kind: CustomResourceDefinition
-metadata:
-  name: inferencemodels.inference.networking.x-k8s.io
-spec:
-  group: inference.networking.x-k8s.io
-  names:
-    kind: InferenceModel
-    listKind: InferenceModelList
-    plural: inferencemodels
-    singular: inferencemodel
-  scope: Namespaced
-  versions:
-  - name: v1alpha1
-    schema:
-      openAPIV3Schema:
-        description: InferenceModel is the Schema for the InferenceModels API.
-        properties:
-          apiVersion:
-            description: |-
-              APIVersion defines the versioned schema of this representation of an object.
-              Servers should convert recognized schemas to the latest internal value, and
-              may reject unrecognized values.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-            type: string
-          kind:
-            description: |-
-              Kind is a string value representing the REST resource this object represents.
-              Servers may infer this from the endpoint the client submits requests to.
-              Cannot be updated.
-              In CamelCase.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-            type: string
-          metadata:
-            type: object
-          spec:
-            description: |-
-              InferenceModelSpec represents the desired state of a specific model use case. This resource is
-              managed by the "Inference Workload Owner" persona.
-
-              The Inference Workload Owner persona is someone that trains, verifies, and
-              leverages a large language model from a model frontend, drives the lifecycle
-              and rollout of new versions of those models, and defines the specific
-              performance and latency goals for the model. These workloads are
-              expected to operate within an InferencePool sharing compute capacity with other
-              InferenceModels, defined by the Inference Platform Admin.
-
-              InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool,
-              if the name is reused, an error will be shown on the status of a
-              InferenceModel that attempted to reuse. The oldest InferenceModel, based on
-              creation timestamp, will be selected to remain valid. In the event of a race
-              condition, one will be selected at random.
-            properties:
-              criticality:
-                description: |-
-                  Criticality defines how important it is to serve the model compared to other models referencing the same pool.
-                  Criticality impacts how traffic is handled in resource constrained situations. It handles this by
-                  queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
-                  fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
-                  and the proportionality of fairness will be configurable.
-
-                  Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
-                  Any implementations that may consume this field may treat an unset value as the 'Standard' range.
-                enum:
-                - Critical
-                - Standard
-                - Sheddable
-                type: string
-              modelName:
-                description: |-
-                  ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
-                  ModelNames must be unique for a referencing InferencePool
-                  (names can be reused for a different pool in the same cluster).
-                  The modelName with the oldest creation timestamp is retained, and the incoming
-                  InferenceModel is sets the Ready status to false with a corresponding reason.
-                  In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
-                  Names can be reserved without an underlying model configured in the pool.
-                  This can be done by specifying a target model and setting the weight to zero,
-                  an error will be returned specifying that no valid target model is found.
-                maxLength: 256
-                type: string
-              poolRef:
-                description: PoolRef is a reference to the inference pool, the pool
-                  must exist in the same namespace.
-                properties:
-                  group:
-                    default: inference.networking.x-k8s.io
-                    description: Group is the group of the referent.
-                    maxLength: 253
-                    pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$
-                    type: string
-                  kind:
-                    default: InferencePool
-                    description: Kind is kind of the referent. For example "InferencePool".
-                    maxLength: 63
-                    minLength: 1
-                    pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$
-                    type: string
-                  name:
-                    description: Name is the name of the referent.
-                    maxLength: 253
-                    minLength: 1
-                    type: string
-                required:
-                - name
-                type: object
-              targetModels:
-                description: |-
-                  TargetModels allow multiple versions of a model for traffic splitting.
-                  If not specified, the target model name is defaulted to the modelName parameter.
-                  modelName is often in reference to a LoRA adapter.
-                items:
-                  description: |-
-                    TargetModel represents a deployed model or a LoRA adapter. The
-                    Name field is expected to match the name of the LoRA adapter
-                    (or base model) as it is registered within the model server. Inference
-                    Gateway assumes that the model exists on the model server and it's the
-                    responsibility of the user to validate a correct match. Should a model fail
-                    to exist at request time, the error is processed by the Inference Gateway
-                    and emitted on the appropriate InferenceModel object.
-                  properties:
-                    name:
-                      description: Name is the name of the adapter or base model,
-                        as expected by the ModelServer.
-                      maxLength: 253
-                      type: string
-                    weight:
-                      description: |-
-                        Weight is used to determine the proportion of traffic that should be
-                        sent to this model when multiple target models are specified.
-
-                        Weight defines the proportion of requests forwarded to the specified
-                        model. This is computed as weight/(sum of all weights in this
-                        TargetModels list). For non-zero values, there may be some epsilon from
-                        the exact proportion defined here depending on the precision an
-                        implementation supports. Weight is not a percentage and the sum of
-                        weights does not need to equal 100.
-
-                        If a weight is set for any targetModel, it must be set for all targetModels.
-                        Conversely weights are optional, so long as ALL targetModels do not specify a weight.
-                      format: int32
-                      maximum: 1000000
-                      minimum: 0
-                      type: integer
-                  required:
-                  - name
-                  type: object
-                maxItems: 10
-                type: array
-                x-kubernetes-validations:
-                - message: Weights should be set for all models, or none of the models.
-                  rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
-            required:
-            - modelName
-            - poolRef
-            type: object
-          status:
-            description: InferenceModelStatus defines the observed state of InferenceModel
-            properties:
-              conditions:
-                default:
-                - lastTransitionTime: "1970-01-01T00:00:00Z"
-                  message: Waiting for controller
-                  reason: Pending
-                  status: Unknown
-                  type: Ready
-                description: |-
-                  Conditions track the state of the InferenceModel.
-
-                  Known condition types are:
-
-                  * "Accepted"
-                items:
-                  description: Condition contains details for one aspect of the current
-                    state of this API Resource.
-                  properties:
-                    lastTransitionTime:
-                      description: |-
-                        lastTransitionTime is the last time the condition transitioned from one status to another.
-                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
-                      format: date-time
-                      type: string
-                    message:
-                      description: |-
-                        message is a human readable message indicating details about the transition.
-                        This may be an empty string.
-                      maxLength: 32768
-                      type: string
-                    observedGeneration:
-                      description: |-
-                        observedGeneration represents the .metadata.generation that the condition was set based upon.
-                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
-                        with respect to the current state of the instance.
-                      format: int64
-                      minimum: 0
-                      type: integer
-                    reason:
-                      description: |-
-                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
-                        Producers of specific condition types may define expected values and meanings for this field,
-                        and whether the values are considered a guaranteed API.
-                        The value should be a CamelCase string.
-                        This field may not be empty.
-                      maxLength: 1024
-                      minLength: 1
-                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
-                      type: string
-                    status:
-                      description: status of the condition, one of True, False, Unknown.
-                      enum:
-                      - "True"
-                      - "False"
-                      - Unknown
-                      type: string
-                    type:
-                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
-                      maxLength: 316
-                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
-                      type: string
-                  required:
-                  - lastTransitionTime
-                  - message
-                  - reason
-                  - status
-                  - type
-                  type: object
-                maxItems: 8
-                type: array
-                x-kubernetes-list-map-keys:
-                - type
-                x-kubernetes-list-type: map
-            type: object
-        type: object
-    served: true
-    storage: false
-    subresources:
-      status: {}
-  - name: v1alpha2
-    schema:
-      openAPIV3Schema:
-        description: InferenceModel is the Schema for the InferenceModels API.
-        properties:
-          apiVersion:
-            description: |-
-              APIVersion defines the versioned schema of this representation of an object.
-              Servers should convert recognized schemas to the latest internal value, and
-              may reject unrecognized values.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-            type: string
-          kind:
-            description: |-
-              Kind is a string value representing the REST resource this object represents.
-              Servers may infer this from the endpoint the client submits requests to.
-              Cannot be updated.
-              In CamelCase.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-            type: string
-          metadata:
-            type: object
-          spec:
-            description: |-
-              InferenceModelSpec represents the desired state of a specific model use case. This resource is
-              managed by the "Inference Workload Owner" persona.
-
-              The Inference Workload Owner persona is someone that trains, verifies, and
-              leverages a large language model from a model frontend, drives the lifecycle
-              and rollout of new versions of those models, and defines the specific
-              performance and latency goals for the model. These workloads are
-              expected to operate within an InferencePool sharing compute capacity with other
-              InferenceModels, defined by the Inference Platform Admin.
-
-              InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool,
-              if the name is reused, an error will be shown on the status of a
-              InferenceModel that attempted to reuse. The oldest InferenceModel, based on
-              creation timestamp, will be selected to remain valid. In the event of a race
-              condition, one will be selected at random.
-            properties:
-              criticality:
-                description: |-
-                  Criticality defines how important it is to serve the model compared to other models referencing the same pool.
-                  Criticality impacts how traffic is handled in resource constrained situations. It handles this by
-                  queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
-                  fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
-                  and the proportionality of fairness will be configurable.
-
-                  Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
-                  Any implementations that may consume this field may treat an unset value as the 'Standard' range.
-                enum:
-                - Critical
-                - Standard
-                - Sheddable
-                type: string
-              modelName:
-                description: |-
-                  ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
-                  ModelNames must be unique for a referencing InferencePool
-                  (names can be reused for a different pool in the same cluster).
-                  The modelName with the oldest creation timestamp is retained, and the incoming
-                  InferenceModel is sets the Ready status to false with a corresponding reason.
-                  In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
-                  Names can be reserved without an underlying model configured in the pool.
-                  This can be done by specifying a target model and setting the weight to zero,
-                  an error will be returned specifying that no valid target model is found.
-                maxLength: 256
-                type: string
-              poolRef:
-                description: PoolRef is a reference to the inference pool, the pool
-                  must exist in the same namespace.
-                properties:
-                  group:
-                    default: inference.networking.x-k8s.io
-                    description: Group is the group of the referent.
-                    maxLength: 253
-                    pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$
-                    type: string
-                  kind:
-                    default: InferencePool
-                    description: Kind is kind of the referent. For example "InferencePool".
-                    maxLength: 63
-                    minLength: 1
-                    pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$
-                    type: string
-                  name:
-                    description: Name is the name of the referent.
-                    maxLength: 253
-                    minLength: 1
-                    type: string
-                required:
-                - name
-                type: object
-              targetModels:
-                description: |-
-                  TargetModels allow multiple versions of a model for traffic splitting.
-                  If not specified, the target model name is defaulted to the modelName parameter.
-                  modelName is often in reference to a LoRA adapter.
-                items:
-                  description: |-
-                    TargetModel represents a deployed model or a LoRA adapter. The
-                    Name field is expected to match the name of the LoRA adapter
-                    (or base model) as it is registered within the model server. Inference
-                    Gateway assumes that the model exists on the model server and it's the
-                    responsibility of the user to validate a correct match. Should a model fail
-                    to exist at request time, the error is processed by the Inference Gateway
-                    and emitted on the appropriate InferenceModel object.
-                  properties:
-                    name:
-                      description: Name is the name of the adapter or base model,
-                        as expected by the ModelServer.
-                      maxLength: 253
-                      type: string
-                    weight:
-                      description: |-
-                        Weight is used to determine the proportion of traffic that should be
-                        sent to this model when multiple target models are specified.
-
-                        Weight defines the proportion of requests forwarded to the specified
-                        model. This is computed as weight/(sum of all weights in this
-                        TargetModels list). For non-zero values, there may be some epsilon from
-                        the exact proportion defined here depending on the precision an
-                        implementation supports. Weight is not a percentage and the sum of
-                        weights does not need to equal 100.
-
-                        If a weight is set for any targetModel, it must be set for all targetModels.
-                        Conversely weights are optional, so long as ALL targetModels do not specify a weight.
-                      format: int32
-                      maximum: 1000000
-                      minimum: 0
-                      type: integer
-                  required:
-                  - name
-                  type: object
-                maxItems: 10
-                type: array
-                x-kubernetes-validations:
-                - message: Weights should be set for all models, or none of the models.
-                  rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
-            required:
-            - modelName
-            - poolRef
-            type: object
-          status:
-            description: InferenceModelStatus defines the observed state of InferenceModel
-            properties:
-              conditions:
-                default:
-                - lastTransitionTime: "1970-01-01T00:00:00Z"
-                  message: Waiting for controller
-                  reason: Pending
-                  status: Unknown
-                  type: Ready
-                description: |-
-                  Conditions track the state of the InferenceModel.
-
-                  Known condition types are:
-
-                  * "Accepted"
-                items:
-                  description: Condition contains details for one aspect of the current
-                    state of this API Resource.
-                  properties:
-                    lastTransitionTime:
-                      description: |-
-                        lastTransitionTime is the last time the condition transitioned from one status to another.
-                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
-                      format: date-time
-                      type: string
-                    message:
-                      description: |-
-                        message is a human readable message indicating details about the transition.
-                        This may be an empty string.
-                      maxLength: 32768
-                      type: string
-                    observedGeneration:
-                      description: |-
-                        observedGeneration represents the .metadata.generation that the condition was set based upon.
-                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
-                        with respect to the current state of the instance.
-                      format: int64
-                      minimum: 0
-                      type: integer
-                    reason:
-                      description: |-
-                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
-                        Producers of specific condition types may define expected values and meanings for this field,
-                        and whether the values are considered a guaranteed API.
-                        The value should be a CamelCase string.
-                        This field may not be empty.
-                      maxLength: 1024
-                      minLength: 1
-                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
-                      type: string
-                    status:
-                      description: status of the condition, one of True, False, Unknown.
-                      enum:
-                      - "True"
-                      - "False"
-                      - Unknown
-                      type: string
-                    type:
-                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
-                      maxLength: 316
-                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
-                      type: string
-                  required:
-                  - lastTransitionTime
-                  - message
-                  - reason
-                  - status
-                  - type
-                  type: object
-                maxItems: 8
-                type: array
-                x-kubernetes-list-map-keys:
-                - type
-                x-kubernetes-list-type: map
-            type: object
-        type: object
-    served: true
-    storage: true
-    subresources:
-      status: {}
----
-apiVersion: apiextensions.k8s.io/v1
-kind: CustomResourceDefinition
-metadata:
-  name: inferencepools.inference.networking.x-k8s.io
-spec:
-  group: inference.networking.x-k8s.io
-  names:
-    kind: InferencePool
-    listKind: InferencePoolList
-    plural: inferencepools
-    singular: inferencepool
-  scope: Namespaced
-  versions:
-  - name: v1alpha1
-    schema:
-      openAPIV3Schema:
-        description: InferencePool is the Schema for the InferencePools API.
-        properties:
-          apiVersion:
-            description: |-
-              APIVersion defines the versioned schema of this representation of an object.
-              Servers should convert recognized schemas to the latest internal value, and
-              may reject unrecognized values.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-            type: string
-          kind:
-            description: |-
-              Kind is a string value representing the REST resource this object represents.
-              Servers may infer this from the endpoint the client submits requests to.
-              Cannot be updated.
-              In CamelCase.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-            type: string
-          metadata:
-            type: object
-          spec:
-            description: InferencePoolSpec defines the desired state of InferencePool
-            properties:
-              extensionRef:
-                description: Extension configures an endpoint picker as an extension
-                  service.
-                properties:
-                  failureMode:
-                    default: FailClose
-                    description: |-
-                      Configures how the gateway handles the case when the extension is not responsive.
-                      Defaults to failClose.
-                    enum:
-                    - FailOpen
-                    - FailClose
-                    type: string
-                  group:
-                    default: ""
-                    description: |-
-                      Group is the group of the referent.
-                      When unspecified or empty string, core API group is inferred.
-                    type: string
-                  kind:
-                    default: Service
-                    description: |-
-                      Kind is the Kubernetes resource kind of the referent. For example
-                      "Service".
-
-                      Defaults to "Service" when not specified.
-
-                      ExternalName services can refer to CNAME DNS records that may live
-                      outside of the cluster and as such are difficult to reason about in
-                      terms of conformance. They also may not be safe to forward to (see
-                      CVE-2021-25740 for more information). Implementations MUST NOT
-                      support ExternalName Services.
-                    type: string
-                  name:
-                    description: Name is the name of the referent.
-                    type: string
-                  targetPortNumber:
-                    description: |-
-                      The port number on the pods running the extension. When unspecified, implementations SHOULD infer a
-                      default value of 9002 when the Kind is Service.
-                    format: int32
-                    maximum: 65535
-                    minimum: 1
-                    type: integer
-                required:
-                - name
-                type: object
-              selector:
-                additionalProperties:
-                  description: |-
-                    LabelValue is the value of a label. This is used for validation
-                    of maps. This matches the Kubernetes label validation rules:
-                    * must be 63 characters or less (can be empty),
-                    * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
-                    * could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
-
-                    Valid values include:
-
-                    * MyValue
-                    * my.name
-                    * 123-my-value
-                  maxLength: 63
-                  minLength: 0
-                  pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$
-                  type: string
-                description: |-
-                  Selector defines a map of labels to watch model server pods
-                  that should be included in the InferencePool.
-                  In some cases, implementations may translate this field to a Service selector, so this matches the simple
-                  map used for Service selectors instead of the full Kubernetes LabelSelector type.
-                type: object
-              targetPortNumber:
-                description: |-
-                  TargetPortNumber defines the port number to access the selected model servers.
-                  The number must be in the range 1 to 65535.
-                format: int32
-                maximum: 65535
-                minimum: 1
-                type: integer
-            required:
-            - extensionRef
-            - selector
-            - targetPortNumber
-            type: object
-          status:
-            description: InferencePoolStatus defines the observed state of InferencePool
-            properties:
-              conditions:
-                default:
-                - lastTransitionTime: "1970-01-01T00:00:00Z"
-                  message: Waiting for controller
-                  reason: Pending
-                  status: Unknown
-                  type: Ready
-                description: |-
-                  Conditions track the state of the InferencePool.
-
-                  Known condition types are:
-
-                  * "Ready"
-                items:
-                  description: Condition contains details for one aspect of the current
-                    state of this API Resource.
-                  properties:
-                    lastTransitionTime:
-                      description: |-
-                        lastTransitionTime is the last time the condition transitioned from one status to another.
-                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
-                      format: date-time
-                      type: string
-                    message:
-                      description: |-
-                        message is a human readable message indicating details about the transition.
-                        This may be an empty string.
-                      maxLength: 32768
-                      type: string
-                    observedGeneration:
-                      description: |-
-                        observedGeneration represents the .metadata.generation that the condition was set based upon.
-                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
-                        with respect to the current state of the instance.
-                      format: int64
-                      minimum: 0
-                      type: integer
-                    reason:
-                      description: |-
-                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
-                        Producers of specific condition types may define expected values and meanings for this field,
-                        and whether the values are considered a guaranteed API.
-                        The value should be a CamelCase string.
-                        This field may not be empty.
-                      maxLength: 1024
-                      minLength: 1
-                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
-                      type: string
-                    status:
-                      description: status of the condition, one of True, False, Unknown.
-                      enum:
-                      - "True"
-                      - "False"
-                      - Unknown
-                      type: string
-                    type:
-                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
-                      maxLength: 316
-                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
-                      type: string
-                  required:
-                  - lastTransitionTime
-                  - message
-                  - reason
-                  - status
-                  - type
-                  type: object
-                maxItems: 8
-                type: array
-                x-kubernetes-list-map-keys:
-                - type
-                x-kubernetes-list-type: map
-            type: object
-        type: object
-    served: true
-    storage: false
-    subresources:
-      status: {}
-  - name: v1alpha2
-    schema:
-      openAPIV3Schema:
-        description: InferencePool is the Schema for the InferencePools API.
-        properties:
-          apiVersion:
-            description: |-
-              APIVersion defines the versioned schema of this representation of an object.
-              Servers should convert recognized schemas to the latest internal value, and
-              may reject unrecognized values.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-            type: string
-          kind:
-            description: |-
-              Kind is a string value representing the REST resource this object represents.
-              Servers may infer this from the endpoint the client submits requests to.
-              Cannot be updated.
-              In CamelCase.
-              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-            type: string
-          metadata:
-            type: object
-          spec:
-            description: InferencePoolSpec defines the desired state of InferencePool
-            properties:
-              extensionRef:
-                description: Extension configures an endpoint picker as an extension
-                  service.
-                properties:
-                  failureMode:
-                    default: FailClose
-                    description: |-
-                      Configures how the gateway handles the case when the extension is not responsive.
-                      Defaults to failClose.
-                    enum:
-                    - FailOpen
-                    - FailClose
-                    type: string
-                  group:
-                    default: ""
-                    description: |-
-                      Group is the group of the referent.
-                      When unspecified or empty string, core API group is inferred.
-                    type: string
-                  kind:
-                    default: Service
-                    description: |-
-                      Kind is the Kubernetes resource kind of the referent. For example
-                      "Service".
-
-                      Defaults to "Service" when not specified.
-
-                      ExternalName services can refer to CNAME DNS records that may live
-                      outside of the cluster and as such are difficult to reason about in
-                      terms of conformance. They also may not be safe to forward to (see
-                      CVE-2021-25740 for more information). Implementations MUST NOT
-                      support ExternalName Services.
-                    type: string
-                  name:
-                    description: Name is the name of the referent.
-                    type: string
-                  targetPortNumber:
-                    description: |-
-                      The port number on the service running the extension. When unspecified, implementations SHOULD infer a
-                      default value of 9002 when the Kind is Service.
-                    format: int32
-                    maximum: 65535
-                    minimum: 1
-                    type: integer
-                required:
-                - name
-                type: object
-              selector:
-                additionalProperties:
-                  description: |-
-                    LabelValue is the value of a label. This is used for validation
-                    of maps. This matches the Kubernetes label validation rules:
-                    * must be 63 characters or less (can be empty),
-                    * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
-                    * could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
-
-                    Valid values include:
-
-                    * MyValue
-                    * my.name
-                    * 123-my-value
-                  maxLength: 63
-                  minLength: 0
-                  pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$
-                  type: string
-                description: |-
-                  Selector defines a map of labels to watch model server pods
-                  that should be included in the InferencePool.
-                  In some cases, implementations may translate this field to a Service selector, so this matches the simple
-                  map used for Service selectors instead of the full Kubernetes LabelSelector type.
-                type: object
-              targetPortNumber:
-                description: |-
-                  TargetPortNumber defines the port number to access the selected model servers.
-                  The number must be in the range 1 to 65535.
-                format: int32
-                maximum: 65535
-                minimum: 1
-                type: integer
-            required:
-            - extensionRef
-            - selector
-            - targetPortNumber
-            type: object
-          status:
-            description: InferencePoolStatus defines the observed state of InferencePool
-            properties:
-              parent:
-                description: |-
-                  Parents is a list of parent resources (usually Gateways) that are
-                  associated with the route, and the status of the InferencePool with respect to
-                  each parent.
-
-                  A maximum of 32 Gateways will be represented in this list. An empty list
-                  means the route has not been attached to any Gateway.
-                items:
-                  description: PoolStatus defines the observed state of InferencePool
-                    from a gateway.
-                  properties:
-                    conditions:
-                      default:
-                      - lastTransitionTime: "1970-01-01T00:00:00Z"
-                        message: Waiting for controller
-                        reason: Pending
-                        status: Unknown
-                        type: Ready
-                      description: |-
-                        Conditions track the state of the InferencePool.
-
-                        Known condition types are:
-
-                        * "Ready"
-                      items:
-                        description: Condition contains details for one aspect of
-                          the current state of this API Resource.
-                        properties:
-                          lastTransitionTime:
-                            description: |-
-                              lastTransitionTime is the last time the condition transitioned from one status to another.
-                              This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
-                            format: date-time
-                            type: string
-                          message:
-                            description: |-
-                              message is a human readable message indicating details about the transition.
-                              This may be an empty string.
-                            maxLength: 32768
-                            type: string
-                          observedGeneration:
-                            description: |-
-                              observedGeneration represents the .metadata.generation that the condition was set based upon.
-                              For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
-                              with respect to the current state of the instance.
-                            format: int64
-                            minimum: 0
-                            type: integer
-                          reason:
-                            description: |-
-                              reason contains a programmatic identifier indicating the reason for the condition's last transition.
-                              Producers of specific condition types may define expected values and meanings for this field,
-                              and whether the values are considered a guaranteed API.
-                              The value should be a CamelCase string.
-                              This field may not be empty.
-                            maxLength: 1024
-                            minLength: 1
-                            pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
-                            type: string
-                          status:
-                            description: status of the condition, one of True, False,
-                              Unknown.
-                            enum:
-                            - "True"
-                            - "False"
-                            - Unknown
-                            type: string
-                          type:
-                            description: type of condition in CamelCase or in foo.example.com/CamelCase.
-                            maxLength: 316
-                            pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
-                            type: string
-                        required:
-                        - lastTransitionTime
-                        - message
-                        - reason
-                        - status
-                        - type
-                        type: object
-                      maxItems: 8
-                      type: array
-                      x-kubernetes-list-map-keys:
-                      - type
-                      x-kubernetes-list-type: map
-                    parentRef:
-                      description: GatewayRef indicates the gateway that observed
-                        state of InferencePool.
-                      properties:
-                        apiVersion:
-                          description: API version of the referent.
-                          type: string
-                        fieldPath:
-                          description: |-
-                            If referring to a piece of an object instead of an entire object, this string
-                            should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2].
-                            For example, if the object reference is to a container within a pod, this would take on a value like:
-                            "spec.containers{name}" (where "name" refers to the name of the container that triggered
-                            the event) or if no container name is specified "spec.containers[2]" (container with
-                            index 2 in this pod). This syntax is chosen only to have some well-defined way of
-                            referencing a part of an object.
-                          type: string
-                        kind:
-                          description: |-
-                            Kind of the referent.
-                            More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-                          type: string
-                        name:
-                          description: |-
-                            Name of the referent.
-                            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
-                          type: string
-                        namespace:
-                          description: |-
-                            Namespace of the referent.
-                            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/
-                          type: string
-                        resourceVersion:
-                          description: |-
-                            Specific resourceVersion to which this reference is made, if any.
-                            More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency
-                          type: string
-                        uid:
-                          description: |-
-                            UID of the referent.
-                            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids
-                          type: string
-                      type: object
-                      x-kubernetes-map-type: atomic
-                  required:
-                  - parentRef
-                  type: object
-                maxItems: 32
-                type: array
-            type: object
-        type: object
-    served: true
-    storage: true
-    subresources:
-      status: {}
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/generated.yaml b/config/manifests/gateway-api-inference-extension/generated.yaml
deleted file mode 100644
index 3a1980294..000000000
--- a/config/manifests/gateway-api-inference-extension/generated.yaml
+++ /dev/null
@@ -1,300 +0,0 @@
----
-# Source: gateway-api-inference-extension/templates/rbac.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: inference-gateway-ext-proc-release-name
-  namespace: default
-  labels:
-    app: inference-gateway-ext-proc-release-name
----
-# Source: gateway-api-inference-extension/templates/enable_patch_policy.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: envoy-gateway-config
-  namespace: envoy-gateway-system
-data:
-  envoy-gateway.yaml: |
-    apiVersion: gateway.envoyproxy.io/v1alpha1
-    kind: EnvoyGateway
-    provider:
-      type: Kubernetes
-    gateway:
-      controllerName: gateway.envoyproxy.io/gatewayclass-controller
-    extensionApis:
-      enableEnvoyPatchPolicy: true      
-      enableBackend: true
----
-# Source: gateway-api-inference-extension/templates/rbac.yaml
-kind: ClusterRole
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: inference-extension-default-release-name
-rules:
-- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencemodels"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: [""]
-  resources: ["pods"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencepools"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: ["discovery.k8s.io"]
-  resources: ["endpointslices"]
-  verbs: ["get", "watch", "list"]
-- apiGroups:
-  - authentication.k8s.io
-  resources:
-  - tokenreviews
-  verbs:
-  - create
-- apiGroups:
-  - authorization.k8s.io
-  resources:
-  - subjectaccessreviews
-  verbs:
-  - create
----
-# Source: gateway-api-inference-extension/templates/rbac.yaml
-kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: inference-extension-default-release-name
-subjects:
-- kind: ServiceAccount
-  name: inference-gateway-ext-proc-release-name
-  namespace: default
-roleRef:
-  kind: ClusterRole
-  name: inference-extension-default-release-name
----
-# Source: gateway-api-inference-extension/templates/ext_proc.yaml
-apiVersion: v1
-kind: Service
-metadata:
-  name: inference-gateway-ext-proc-release-name
-  namespace: default
-spec:
-  selector:
-    app: inference-gateway-ext-proc-release-name
-  ports:
-    - name: grpc
-      protocol: TCP
-      port: 9002
-      targetPort: 9002
-    - name: http-metrics
-      protocol: TCP
-      port: 9090
-      targetPort: 9090
-  type: ClusterIP
----
-# Source: gateway-api-inference-extension/templates/ext_proc.yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: inference-gateway-ext-proc-release-name
-  namespace: default
-  labels:
-    app: inference-gateway-ext-proc-release-name
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: inference-gateway-ext-proc-release-name
-  template:
-    metadata:
-      labels:
-        app: inference-gateway-ext-proc-release-name
-    spec:
-      serviceAccountName: inference-gateway-ext-proc-release-name
-      containers:
-      - name: inference-gateway-ext-proc
-        image: registry-cn-hangzhou.ack.aliyuncs.com/dev/gateway-api-inference-extension/epp:main
-        imagePullPolicy: Always
-        args:
-        - -poolName
-        - vllm-llama2-7b-pool
-        - -poolNamespace
-        - default
-        - -v
-        - "3"
-        - -grpcPort
-        - "9002"
-        - -grpcHealthPort
-        - "9003"
-        - -metricsPort
-        - "9090"
-        ports:
-        - name: grpc
-          containerPort: 9002
-        - name: grpc-health
-          containerPort: 9003
-        - name: metrics
-          containerPort: 9090
-        livenessProbe:
-          grpc:
-            port: 9003
-            service: inference-extension
-          initialDelaySeconds: 5
-          periodSeconds: 10
-        readinessProbe:
-          grpc:
-            port: 9003
-            service: inference-extension
-          initialDelaySeconds: 5
-          periodSeconds: 10
----
-# Source: gateway-api-inference-extension/templates/gateway.yaml
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: Backend
-metadata:
-  name: backend-release-name
-spec:
-  endpoints:
-    - fqdn:
-        hostname: 'foo.bar.com'
-        port: 8080
----
-# Source: gateway-api-inference-extension/templates/traffic_policy.yaml
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: BackendTrafficPolicy
-metadata:
-  name: high-connection-route-policy-release-name  # 确保引用有 . 前缀
-  namespace: 
-spec:
-  targetRefs:
-  - group: gateway.networking.k8s.io
-    kind: HTTPRoute
-    name: llm-route-release-name
-  circuitBreaker:
-    maxConnections: 40000
-    maxPendingRequests: 40000
-    maxParallelRequests: 40000 
-  timeout:
-    tcp:
-      connectTimeout: 24h
----
-# Source: gateway-api-inference-extension/templates/extension_policy.yaml
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: EnvoyExtensionPolicy
-metadata:
-  name: ext-proc-policy-release-name
-  namespace: default
-spec:
-  extProc:
-    - backendRefs:
-      - group: ""
-        kind: Service
-        name: inference-gateway-ext-proc-release-name
-        port: 9002
-      processingMode:
-        request:
-          body: Buffered
-        response:
-      messageTimeout: 1000s
-      backendSettings:
-        circuitBreaker:
-          maxConnections: 40000
-          maxPendingRequests: 40000
-          maxParallelRequests: 40000
-        timeout:
-          tcp:
-            connectTimeout: 24h
-  targetRef:
-    group: gateway.networking.k8s.io
-    kind: HTTPRoute
-    name: llm-route-release-name
----
-# Source: gateway-api-inference-extension/templates/patch_policy.yaml
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: EnvoyPatchPolicy
-metadata:
-  name: custom-response-patch-policy-release-name
-  namespace: default
-spec:
-  targetRef:
-    group: gateway.networking.k8s.io
-    kind: Gateway
-    name: inference-gateway-release-name
-  type: JSONPatch
-  jsonPatches:
-    - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"
-      name: original_destination_cluster
-      operation:
-        op: add
-        path: ""
-        value:
-          name: original_destination_cluster
-          type: ORIGINAL_DST
-          original_dst_lb_config:
-            use_http_header: true
-            http_header_name: "x-gateway-destination-endpoint"
-          connect_timeout: 1000s
-          lb_policy: CLUSTER_PROVIDED
-          dns_lookup_family: V4_ONLY
-          circuit_breakers:
-            thresholds:
-            - max_connections: 40000
-              max_pending_requests: 40000
-              max_requests: 40000
-    - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"
-      name: "envoyextensionpolicy/default/ext-proc-policy-release-name/extproc/0"
-      operation:
-        op: add
-        path: "/transport_socket"
-        value:
-          name: "envoy.transport_sockets.tls"
-          typed_config:
-            "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext"
-            common_tls_context: {}
-    - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"
-      name: default/inference-gateway-release-name/llm-gw
-      operation:
-        op: replace
-        path: "/virtual_hosts/0/routes/0/route/cluster"
-        value: original_destination_cluster
----
-# Source: gateway-api-inference-extension/templates/gateway.yaml
-apiVersion: gateway.networking.k8s.io/v1
-kind: Gateway
-metadata:
-  name: inference-gateway-release-name
-  namespace: default
-spec:
-  gatewayClassName: inference-gateway-release-name
-  listeners:
-    - name: http
-      protocol: HTTP
-      port: 8080
-    - name: llm-gw
-      protocol: HTTP
-      port: 8081
----
-# Source: gateway-api-inference-extension/templates/gateway.yaml
-apiVersion: gateway.networking.k8s.io/v1
-kind: GatewayClass
-metadata:
-  name: inference-gateway-release-name
-spec:
-  controllerName: gateway.envoyproxy.io/gatewayclass-controller
----
-# Source: gateway-api-inference-extension/templates/gateway.yaml
-apiVersion: gateway.networking.k8s.io/v1
-kind: HTTPRoute
-metadata:
-  name: llm-route-release-name
-  namespace: default
-spec:
-  parentRefs:
-    - name: inference-gateway-release-name
-      sectionName: llm-gw
-  rules:
-  - backendRefs:
-      - group: gateway.envoyproxy.io
-        kind: Backend
-        name: backend-release-name
-    timeouts:
-      request: "24h"
-      backendRequest: "24h"
diff --git a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl
index 7294f7f99..c1e40133f 100644
--- a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl
+++ b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl
@@ -1,42 +1,16 @@
-{{- define "httpRoute.name" -}}
-llm-route-{{ .Release.Name }}
-{{- end -}}
-
-{{- define "backend.name" -}}
-backend-{{ .Release.Name }}
-{{- end -}}
-
-{{- define "gatewayClass.name" -}}
-inference-gateway-{{ .Release.Name }}
-{{- end -}}
-
-{{- define "gateway.name" -}}
-inference-gateway-{{ .Release.Name }}
-{{- end -}}
-
-{{- define "envoyExtensionPolicy.name" -}}
-ext-proc-policy-{{ .Release.Name }}
-{{- end -}}
-
-{{- define "envoyPatchPolicy.name" -}}
-custom-response-patch-policy-{{ .Release.Name }}
-{{- end -}}
+{{/*
+Common labels
+*/}}
+{{- define "gateway-api-inference-extension.labels" -}}
+app.kubernetes.io/name: {{ .Values.inferenceExtension.name }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+{{- end }}
 
 {{/*
 Selector labels
 */}}
 {{- define "gateway-api-inference-extension.selectorLabels" -}}
-app: {{ include "gateway-api-inference-extension.name" . }}
-{{- end -}}
-
-{{- define "clusterRole.name" -}}
-inference-extension-{{ .Release.Namespace }}-{{ .Release.Name }}
-{{- end -}}
-
-{{- define "backendTrafficPolicy.name" -}}
-high-connection-route-policy-{{ .Release.Name }}
-{{- end -}}
-
-{{- define "gateway-api-inference-extension.name" -}}
-inference-gateway-ext-proc-{{ .Release.Name }}
-{{- end -}}
+app: {{ .Values.inferenceExtension.name }}
+{{- end -}}
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml
deleted file mode 100644
index 21b0aa866..000000000
--- a/config/manifests/gateway-api-inference-extension/templates/enable_patch_policy.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-{{ if .Values.envoy.enablePatchPolicy }}
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: envoy-gateway-config
-  namespace: {{ .Values.envoy.namespace | default "envoy-gateway-system" }}
-data:
-  envoy-gateway.yaml: |
-    apiVersion: gateway.envoyproxy.io/v1alpha1
-    kind: EnvoyGateway
-    provider:
-      type: Kubernetes
-    gateway:
-      controllerName: gateway.envoyproxy.io/gatewayclass-controller
-    extensionApis:
-      enableEnvoyPatchPolicy: true      
-      enableBackend: true
-{{ end }}
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
index bd53c9334..0bfde2db4 100644
--- a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
+++ b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
@@ -1,10 +1,10 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: {{ include "gateway-api-inference-extension.name" . }}
+  name: inference-gateway-ext-proc
   namespace: {{ .Release.Namespace }}
   labels:
-    app: {{ include "gateway-api-inference-extension.name" . }}
+    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 spec:
   replicas: {{ .Values.inferenceExtension.replicas | default 1 }}
   selector:
@@ -15,31 +15,31 @@ spec:
       labels:
         {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }}
     spec:
-      serviceAccountName: {{ include "gateway-api-inference-extension.name" . }}
+      serviceAccountName: inference-gateway-ext-proc
       containers:
       - name: inference-gateway-ext-proc
-        image: {{ .Values.inferenceExtension.image.hub }}:{{ .Values.inferenceExtension.image.tag }}
+        image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }}
         imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }}
         args:
         - -poolName
         - {{ .Values.inferencePool.name }}
         - -poolNamespace
-        - {{ .Release.Namespace }}
+        - {{ .Values.inferencePool.namespace }}
         - -v
-        - {{ .Values.inferenceExtension.logLevel | default 3 | quote }}
+        - "3"
         - -grpcPort
-        - {{ .Values.inferenceExtension.grpcPort | default 9002 | quote }}
+        - "9002"
         - -grpcHealthPort
         - "9003"
         - -metricsPort
-        - {{ .Values.inferenceExtension.metricsPort | default 9090 | quote }}
+        - "9090"
         ports:
         - name: grpc
-          containerPort: {{ .Values.inferenceExtension.grpcPort | default 9002 }}
+          containerPort: 9002
         - name: grpc-health
           containerPort: 9003
         - name: metrics
-          containerPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
+          containerPort: 9090
         livenessProbe:
           grpc:
             port: 9003
@@ -56,8 +56,10 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: {{ include "gateway-api-inference-extension.name" . }}
+  name: {{ .Values.inferenceExtension.name }}
   namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 spec:
   selector:
     {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }}
diff --git a/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml
deleted file mode 100644
index ed84e6f5c..000000000
--- a/config/manifests/gateway-api-inference-extension/templates/extension_policy.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: EnvoyExtensionPolicy
-metadata:
-  name: {{ include "envoyExtensionPolicy.name" . }}
-  namespace: {{ .Release.Namespace }}
-spec:
-  extProc:
-    - backendRefs:
-      - group: ""
-        kind: Service
-        name: {{ include "gateway-api-inference-extension.name" . }}
-        port: {{ .Values.inferenceExtension.port | default 9002 }}
-      processingMode:
-        request:
-          body: Buffered
-        response:
-      messageTimeout: 1000s
-      backendSettings:
-        circuitBreaker:
-          maxConnections: 40000
-          maxPendingRequests: 40000
-          maxParallelRequests: 40000
-        timeout:
-          tcp:
-            connectTimeout: 24h
-  targetRef:
-    group: gateway.networking.k8s.io
-    kind: HTTPRoute
-    name: {{ include "httpRoute.name" . }}
diff --git a/config/manifests/gateway-api-inference-extension/templates/gateway.yaml b/config/manifests/gateway-api-inference-extension/templates/gateway.yaml
deleted file mode 100644
index f0259f527..000000000
--- a/config/manifests/gateway-api-inference-extension/templates/gateway.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-
----
-apiVersion: gateway.networking.k8s.io/v1
-kind: Gateway
-metadata:
-  name: {{ include "gateway.name" . }}
-  namespace: {{ .Release.Namespace }}
-spec:
-  gatewayClassName: {{ include "gatewayClass.name" . }}
-  listeners:
-    - name: http
-      protocol: HTTP
-      port: 8080
-    - name: llm-gw
-      protocol: HTTP
-      port: {{ .Values.gateway.port }}
----
-apiVersion: gateway.networking.k8s.io/v1
-kind: GatewayClass
-metadata:
-  name: {{ include "gatewayClass.name" . }}
-spec:
-  controllerName: gateway.envoyproxy.io/gatewayclass-controller
----
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: Backend
-metadata:
-  name: {{ include "backend.name" . }}
-spec:
-  endpoints:
-    - fqdn:
-        hostname: 'foo.bar.com'
-        port: 8080
----
-apiVersion: gateway.networking.k8s.io/v1
-kind: HTTPRoute
-metadata:
-  name: {{ include "httpRoute.name" . }}
-  namespace: {{ .Release.Namespace }}
-spec:
-  parentRefs:
-    - name: {{ include "gateway.name" . }}
-      sectionName: llm-gw
-  rules:
-  - backendRefs:
-      - group: gateway.envoyproxy.io
-        kind: Backend
-        name: {{ include "backend.name" . }}
-    timeouts:
-      request: "24h"
-      backendRequest: "24h"
diff --git a/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml
deleted file mode 100644
index e789b0e2f..000000000
--- a/config/manifests/gateway-api-inference-extension/templates/patch_policy.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: EnvoyPatchPolicy
-metadata:
-  name: {{ include "envoyPatchPolicy.name" . }}
-  namespace: {{ .Release.Namespace }}
-spec:
-  targetRef:
-    group: gateway.networking.k8s.io
-    kind: Gateway
-    name: {{ include "gateway.name" . }}
-  type: JSONPatch
-  jsonPatches:
-    - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"
-      name: original_destination_cluster
-      operation:
-        op: add
-        path: ""
-        value:
-          name: original_destination_cluster
-          type: ORIGINAL_DST
-          original_dst_lb_config:
-            use_http_header: true
-            http_header_name: "x-gateway-destination-endpoint"
-          connect_timeout: 1000s
-          lb_policy: CLUSTER_PROVIDED
-          dns_lookup_family: V4_ONLY
-          circuit_breakers:
-            thresholds:
-            - max_connections: 40000
-              max_pending_requests: 40000
-              max_requests: 40000
-    - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"
-      name: "envoyextensionpolicy/{{ .Release.Namespace }}/{{ include "envoyExtensionPolicy.name" . }}/extproc/0"
-      operation:
-        op: add
-        path: "/transport_socket"
-        value:
-          name: "envoy.transport_sockets.tls"
-          typed_config:
-            "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext"
-            common_tls_context: {}
-    - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"
-      name: {{ .Release.Namespace }}/{{ include "gateway.name" . }}/llm-gw
-      operation:
-        op: replace
-        path: "/virtual_hosts/0/routes/0/route/cluster"
-        value: original_destination_cluster
diff --git a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml
index 73ff0aa6c..b2c21f674 100644
--- a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml
+++ b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml
@@ -1,7 +1,9 @@
 kind: ClusterRole
 apiVersion: rbac.authorization.k8s.io/v1
 metadata:
-  name: {{ include "clusterRole.name" . }}
+  name: {{ .Values.inferenceExtension.name }}
+  labels:
+    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 rules:
 - apiGroups: ["inference.networking.x-k8s.io"]
   resources: ["inferencemodels"]
@@ -31,19 +33,19 @@ rules:
 kind: ClusterRoleBinding
 apiVersion: rbac.authorization.k8s.io/v1
 metadata:
-  name: {{ include "clusterRole.name" . }}
+  name: {{ .Values.inferenceExtension.name }}
 subjects:
 - kind: ServiceAccount
-  name: {{ include "gateway-api-inference-extension.name" . }}
+  name: {{ .Values.inferenceExtension.name }}
   namespace: {{ .Release.Namespace }}
 roleRef:
   kind: ClusterRole
-  name: {{ include "clusterRole.name" . }}
+  name: {{ .Values.inferenceExtension.name }}
 ---
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: {{ include "gateway-api-inference-extension.name" . }}
+  name: {{ .Values.inferenceExtension.name }}
   namespace: {{ .Release.Namespace }}
   labels:
-    app: {{ include "gateway-api-inference-extension.name" . }}
\ No newline at end of file
+    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml b/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml
deleted file mode 100644
index 92ba989c3..000000000
--- a/config/manifests/gateway-api-inference-extension/templates/traffic_policy.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: BackendTrafficPolicy
-metadata:
-  name: {{ include "backendTrafficPolicy.name" . }}
-  namespace: {{ .Release.namespace }}
-spec:
-  targetRefs:
-  - group: gateway.networking.k8s.io
-    kind: HTTPRoute
-    name: {{ include "httpRoute.name" . }}
-  circuitBreaker:
-    maxConnections: 40000
-    maxPendingRequests: 40000
-    maxParallelRequests: 40000 
-  timeout:
-    tcp:
-      connectTimeout: 24h
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/values.yaml b/config/manifests/gateway-api-inference-extension/values.yaml
index cbda8e573..6a5137485 100644
--- a/config/manifests/gateway-api-inference-extension/values.yaml
+++ b/config/manifests/gateway-api-inference-extension/values.yaml
@@ -1,25 +1,13 @@
 inferenceExtension:
   replicas: 1
   image:
-    hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp
+    name: epp
+    hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
     tag: main
     pullPolicy: Always
 
   name: inference-gateway-ext-proc
-  serviceName: inference-gateway-ext-proc
-  grpcPort: 9002
-  metricsPort: 9090
-  logLevel: 3
 
 inferencePool:
-  name: vllm-llama2-7b-pool
-
-gateway:
-  port: 8081
-  
-envoy:
-  # envoy gateway system namespace
-  namespace: envoy-gateway-system
-
-  # enabling the Envoy Patch Policy feature
-  enablePatchPolicy: true
+  namespace: default
+  name: vllm-llama2-7b-pool
\ No newline at end of file
diff --git a/config/manifests/install.yaml b/config/manifests/install.yaml
new file mode 100644
index 000000000..976075560
--- /dev/null
+++ b/config/manifests/install.yaml
@@ -0,0 +1,137 @@
+---
+# Source: gateway-api-inference-extension/templates/rbac.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: inference-gateway-ext-proc
+  namespace: default
+  labels:
+    app.kubernetes.io/name: inference-gateway-ext-proc
+    app.kubernetes.io/version: "0.1.0"
+---
+# Source: gateway-api-inference-extension/templates/rbac.yaml
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: inference-gateway-ext-proc
+  labels:
+    app.kubernetes.io/name: inference-gateway-ext-proc
+    app.kubernetes.io/version: "0.1.0"
+rules:
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencemodels"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencepools"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "watch", "list"]
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
+---
+# Source: gateway-api-inference-extension/templates/rbac.yaml
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: inference-gateway-ext-proc
+subjects:
+- kind: ServiceAccount
+  name: inference-gateway-ext-proc
+  namespace: default
+roleRef:
+  kind: ClusterRole
+  name: inference-gateway-ext-proc
+---
+# Source: gateway-api-inference-extension/templates/ext_proc.yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: inference-gateway-ext-proc
+  namespace: default
+  labels:
+    app.kubernetes.io/name: inference-gateway-ext-proc
+    app.kubernetes.io/version: "0.1.0"
+spec:
+  selector:
+    app: inference-gateway-ext-proc
+  ports:
+    - name: grpc
+      protocol: TCP
+      port: 9002
+      targetPort: 9002
+    - name: http-metrics
+      protocol: TCP
+      port: 9090
+      targetPort: 9090
+  type: ClusterIP
+---
+# Source: gateway-api-inference-extension/templates/ext_proc.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: inference-gateway-ext-proc
+  namespace: default
+  labels:
+    app.kubernetes.io/name: inference-gateway-ext-proc
+    app.kubernetes.io/version: "0.1.0"
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: inference-gateway-ext-proc
+  template:
+    metadata:
+      labels:
+        app: inference-gateway-ext-proc
+    spec:
+      serviceAccountName: inference-gateway-ext-proc
+      containers:
+      - name: inference-gateway-ext-proc
+        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
+        imagePullPolicy: Always
+        args:
+        - -poolName
+        - vllm-llama2-7b-pool
+        - -poolNamespace
+        - default
+        - -v
+        - "3"
+        - -grpcPort
+        - "9002"
+        - -grpcHealthPort
+        - "9003"
+        - -metricsPort
+        - "9090"
+        ports:
+        - name: grpc
+          containerPort: 9002
+        - name: grpc-health
+          containerPort: 9003
+        - name: metrics
+          containerPort: 9090
+        livenessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        readinessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10

From dcd3bd56d14a3f9afe810c82cc450b9bdb132971 Mon Sep 17 00:00:00 2001
From: Kuromesi <blackfacepan@163.com>
Date: Tue, 18 Mar 2025 08:50:38 +0800
Subject: [PATCH 3/9] nit and add inference pool

Signed-off-by: Kuromesi <blackfacepan@163.com>
---
 .../Chart.yaml                                |  6 +++---
 .../templates/_helpers.tpl                    | 14 ++++++++++---
 .../templates/ext_proc.yaml                   |  9 ++++-----
 .../templates/inferencepool.yaml              | 12 +++++++++++
 .../templates/rbac.yaml                       | 20 +++++++------------
 .../values.yaml                               |  8 +++++---
 6 files changed, 42 insertions(+), 27 deletions(-)
 create mode 100644 config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml

diff --git a/config/manifests/gateway-api-inference-extension/Chart.yaml b/config/manifests/gateway-api-inference-extension/Chart.yaml
index dd194a652..5e46737ca 100644
--- a/config/manifests/gateway-api-inference-extension/Chart.yaml
+++ b/config/manifests/gateway-api-inference-extension/Chart.yaml
@@ -1,9 +1,9 @@
 apiVersion: v2
-name: gateway-api-inference-extension
-description: A Helm chart for gateway-api-inference-extension
+name: InferencePool
+description: A Helm chart for InferencePool
 
 type: application
 
 version: 0.1.0
 
-appVersion: "0.1.0"
+appVersion: "0.2.0"
diff --git a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl
index c1e40133f..4068e7ea6 100644
--- a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl
+++ b/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl
@@ -2,15 +2,23 @@
 Common labels
 */}}
 {{- define "gateway-api-inference-extension.labels" -}}
-app.kubernetes.io/name: {{ .Values.inferenceExtension.name }}
+app.kubernetes.io/name: {{ include "gateway-api-inference-extension.name" . }}
 {{- if .Chart.AppVersion }}
 app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 {{- end }}
 {{- end }}
 
+{{/*
+Inference extension name: "epp-" + pool name (defaults to "default-pool"; lower-cased, trimmed, max 40 chars, no trailing "-")
+*/}}
+{{- define "gateway-api-inference-extension.name" -}}
+{{- $base := .Values.inferencePool.name | default "default-pool" | lower | trim | trunc 40 | trimSuffix "-" -}}
+epp-{{ $base }}
+{{- end -}}
+
 {{/*
 Selector labels
 */}}
 {{- define "gateway-api-inference-extension.selectorLabels" -}}
-app: {{ .Values.inferenceExtension.name }}
-{{- end -}}
\ No newline at end of file
+app: {{ include "gateway-api-inference-extension.name" . }}
+{{- end -}}
diff --git a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
index 0bfde2db4..a80bcbdd9 100644
--- a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
+++ b/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
@@ -1,7 +1,7 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: inference-gateway-ext-proc
+  name: {{ include "gateway-api-inference-extension.name" . }}
   namespace: {{ .Release.Namespace }}
   labels:
     {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
@@ -56,7 +56,7 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: {{ .Values.inferenceExtension.name }}
+  name: {{ include "gateway-api-inference-extension.name" . }}
   namespace: {{ .Release.Namespace }}
   labels:
     {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
@@ -64,10 +64,9 @@ spec:
   selector:
     {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }}
   ports:
-    - name: grpc
+    - name: grpc-ext-proc
       protocol: TCP
-      port: {{ .Values.inferenceExtension.grpcPort | default 9002 }}
-      targetPort: {{ .Values.inferenceExtension.grpcPort | default 9002 }}
+      port: {{ .Values.inferenceExtension.extProcPort | default 9002 }}
     - name: http-metrics
       protocol: TCP
       port: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
diff --git a/config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml b/config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml
new file mode 100644
index 000000000..8662c9f86
--- /dev/null
+++ b/config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml
@@ -0,0 +1,12 @@
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferencePool
+metadata:
+  name: {{ .Values.inferencePool.name }}
+spec:
+  targetPortNumber: {{ .Values.inferencePool.targetPortNumber | default 8000 }}
+  selector:
+    {{- range $key, $value := .Values.inferencePool.selector }}
+    {{ $key }}: {{ quote $value }}
+    {{- end }}
+  extensionRef:
+    name: {{ include "gateway-api-inference-extension.name" . }}
\ No newline at end of file
diff --git a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml
index b2c21f674..7a98e8206 100644
--- a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml
+++ b/config/manifests/gateway-api-inference-extension/templates/rbac.yaml
@@ -1,22 +1,16 @@
 kind: ClusterRole
 apiVersion: rbac.authorization.k8s.io/v1
 metadata:
-  name: {{ .Values.inferenceExtension.name }}
+  name: {{ include "gateway-api-inference-extension.name" . }}
   labels:
     {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 rules:
 - apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencemodels"]
+  resources: ["inferencemodels", "inferencepools"]
   verbs: ["get", "watch", "list"]
 - apiGroups: [""]
   resources: ["pods"]
   verbs: ["get", "watch", "list"]
-- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencepools"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: ["discovery.k8s.io"]
-  resources: ["endpointslices"]
-  verbs: ["get", "watch", "list"]
 - apiGroups:
   - authentication.k8s.io
   resources:
@@ -33,19 +27,19 @@ rules:
 kind: ClusterRoleBinding
 apiVersion: rbac.authorization.k8s.io/v1
 metadata:
-  name: {{ .Values.inferenceExtension.name }}
+  name: {{ include "gateway-api-inference-extension.name" . }}
 subjects:
 - kind: ServiceAccount
-  name: {{ .Values.inferenceExtension.name }}
+  name: {{ include "gateway-api-inference-extension.name" . }}
   namespace: {{ .Release.Namespace }}
 roleRef:
   kind: ClusterRole
-  name: {{ .Values.inferenceExtension.name }}
+  name: {{ include "gateway-api-inference-extension.name" . }}
 ---
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: {{ .Values.inferenceExtension.name }}
+  name: {{ include "gateway-api-inference-extension.name" . }}
   namespace: {{ .Release.Namespace }}
   labels:
-    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
\ No newline at end of file
+    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
diff --git a/config/manifests/gateway-api-inference-extension/values.yaml b/config/manifests/gateway-api-inference-extension/values.yaml
index 6a5137485..0f20a3e66 100644
--- a/config/manifests/gateway-api-inference-extension/values.yaml
+++ b/config/manifests/gateway-api-inference-extension/values.yaml
@@ -5,9 +5,11 @@ inferenceExtension:
     hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
     tag: main
     pullPolicy: Always
-
-  name: inference-gateway-ext-proc
+  extProcPort: 9002
 
 inferencePool:
   namespace: default
-  name: vllm-llama2-7b-pool
\ No newline at end of file
+  name: pool-1
+  targetPortNumber: 8000
+  selector:
+    app: vllm-llama2-7b
\ No newline at end of file

From 154f67068bbb3600854ccd3ebd545129b78fae3c Mon Sep 17 00:00:00 2001
From: Kuromesi <blackfacepan@163.com>
Date: Tue, 18 Mar 2025 09:00:58 +0800
Subject: [PATCH 4/9] relocate

Signed-off-by: Kuromesi <blackfacepan@163.com>
---
 .../.helmignore                               |   0
 .../Chart.yaml                                |   0
 .../generated.yaml                            | 145 ++++++++++++++++++
 .../templates/NOTES.txt                       |   0
 .../templates/_helpers.tpl                    |   0
 .../templates/ext_proc.yaml                   |   7 +-
 .../templates/inferencepool.yaml              |   5 +-
 .../templates/rbac.yaml                       |   0
 .../values.yaml                               |   1 -
 9 files changed, 152 insertions(+), 6 deletions(-)
 rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/.helmignore (100%)
 rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/Chart.yaml (100%)
 create mode 100644 config/charts/inferencepool/gateway-api-inference-extension/generated.yaml
 rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/templates/NOTES.txt (100%)
 rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/templates/_helpers.tpl (100%)
 rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/templates/ext_proc.yaml (90%)
 rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/templates/inferencepool.yaml (65%)
 rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/templates/rbac.yaml (100%)
 rename config/{manifests => charts/inferencepool}/gateway-api-inference-extension/values.yaml (93%)

diff --git a/config/manifests/gateway-api-inference-extension/.helmignore b/config/charts/inferencepool/gateway-api-inference-extension/.helmignore
similarity index 100%
rename from config/manifests/gateway-api-inference-extension/.helmignore
rename to config/charts/inferencepool/gateway-api-inference-extension/.helmignore
diff --git a/config/manifests/gateway-api-inference-extension/Chart.yaml b/config/charts/inferencepool/gateway-api-inference-extension/Chart.yaml
similarity index 100%
rename from config/manifests/gateway-api-inference-extension/Chart.yaml
rename to config/charts/inferencepool/gateway-api-inference-extension/Chart.yaml
diff --git a/config/charts/inferencepool/gateway-api-inference-extension/generated.yaml b/config/charts/inferencepool/gateway-api-inference-extension/generated.yaml
new file mode 100644
index 000000000..16b3bf4ef
--- /dev/null
+++ b/config/charts/inferencepool/gateway-api-inference-extension/generated.yaml
@@ -0,0 +1,145 @@
+---
+# Source: InferencePool/templates/rbac.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: epp-pool-1
+  namespace: default
+  labels:
+    app.kubernetes.io/name: epp-pool-1
+    app.kubernetes.io/version: "0.2.0"
+---
+# Source: InferencePool/templates/rbac.yaml
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: epp-pool-1
+  labels:
+    app.kubernetes.io/name: epp-pool-1
+    app.kubernetes.io/version: "0.2.0"
+rules:
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencemodels, inferencepools"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
+---
+# Source: InferencePool/templates/rbac.yaml
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: epp-pool-1
+subjects:
+- kind: ServiceAccount
+  name: epp-pool-1
+  namespace: default
+roleRef:
+  kind: ClusterRole
+  name: epp-pool-1
+---
+# Source: InferencePool/templates/ext_proc.yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: epp-pool-1
+  namespace: default
+  labels:
+    app.kubernetes.io/name: epp-pool-1
+    app.kubernetes.io/version: "0.2.0"
+spec:
+  selector:
+    app: epp-pool-1
+  ports:
+    - name: grpc-ext-proc
+      protocol: TCP
+      port: 9002
+    - name: http-metrics
+      protocol: TCP
+      port: 9090
+  type: ClusterIP
+---
+# Source: InferencePool/templates/ext_proc.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: epp-pool-1
+  namespace: default
+  labels:
+    app.kubernetes.io/name: epp-pool-1
+    app.kubernetes.io/version: "0.2.0"
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: epp-pool-1
+  template:
+    metadata:
+      labels:
+        app: epp-pool-1
+    spec:
+      serviceAccountName: epp-pool-1
+      containers:
+      - name: epp
+        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
+        imagePullPolicy: Always
+        args:
+        - -poolName
+        - pool-1
+        - -poolNamespace
+        - default
+        - -v
+        - "3"
+        - -grpcPort
+        - "9002"
+        - -grpcHealthPort
+        - "9003"
+        - -metricsPort
+        - "9090"
+        ports:
+        - name: grpc
+          containerPort: 9002
+        - name: grpc-health
+          containerPort: 9003
+        - name: metrics
+          containerPort: 9090
+        livenessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        readinessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+---
+# Source: InferencePool/templates/inferencepool.yaml
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferencePool
+metadata:
+  name: pool-1
+  namespace: default
+  labels:
+    app.kubernetes.io/name: epp-pool-1
+    app.kubernetes.io/version: "0.2.0"
+spec:
+  targetPortNumber: 
+  selector:
+      app: "vllm-llama2-7b"
+  extensionRef:
+    name: epp-pool-1
diff --git a/config/manifests/gateway-api-inference-extension/templates/NOTES.txt b/config/charts/inferencepool/gateway-api-inference-extension/templates/NOTES.txt
similarity index 100%
rename from config/manifests/gateway-api-inference-extension/templates/NOTES.txt
rename to config/charts/inferencepool/gateway-api-inference-extension/templates/NOTES.txt
diff --git a/config/manifests/gateway-api-inference-extension/templates/_helpers.tpl b/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl
similarity index 100%
rename from config/manifests/gateway-api-inference-extension/templates/_helpers.tpl
rename to config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl
diff --git a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml b/config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml
similarity index 90%
rename from config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
rename to config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml
index a80bcbdd9..cf68ab872 100644
--- a/config/manifests/gateway-api-inference-extension/templates/ext_proc.yaml
+++ b/config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml
@@ -15,16 +15,16 @@ spec:
       labels:
         {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }}
     spec:
-      serviceAccountName: inference-gateway-ext-proc
+      serviceAccountName: {{ include "gateway-api-inference-extension.name" . }}
       containers:
-      - name: inference-gateway-ext-proc
+      - name: epp
         image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }}
         imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }}
         args:
         - -poolName
         - {{ .Values.inferencePool.name }}
         - -poolNamespace
-        - {{ .Values.inferencePool.namespace }}
+        - {{ .Release.Namespace }}
         - -v
         - "3"
         - -grpcPort
@@ -70,5 +70,4 @@ spec:
     - name: http-metrics
       protocol: TCP
       port: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
-      targetPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
   type: ClusterIP
diff --git a/config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml b/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml
similarity index 65%
rename from config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml
rename to config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml
index 8662c9f86..9700711d7 100644
--- a/config/manifests/gateway-api-inference-extension/templates/inferencepool.yaml
+++ b/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml
@@ -2,8 +2,11 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
   name: {{ .Values.inferencePool.name }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 spec:
-  targetPortNumber: 8000
+  targetPortNumber: {{ .Values.inferencePool.targetPort }}
   selector:
       {{- range $key, $value := .Values.inferencePool.selector }}
       {{ $key }}: {{ quote $value }}
diff --git a/config/manifests/gateway-api-inference-extension/templates/rbac.yaml b/config/charts/inferencepool/gateway-api-inference-extension/templates/rbac.yaml
similarity index 100%
rename from config/manifests/gateway-api-inference-extension/templates/rbac.yaml
rename to config/charts/inferencepool/gateway-api-inference-extension/templates/rbac.yaml
diff --git a/config/manifests/gateway-api-inference-extension/values.yaml b/config/charts/inferencepool/gateway-api-inference-extension/values.yaml
similarity index 93%
rename from config/manifests/gateway-api-inference-extension/values.yaml
rename to config/charts/inferencepool/gateway-api-inference-extension/values.yaml
index 0f20a3e66..c4a0fb934 100644
--- a/config/manifests/gateway-api-inference-extension/values.yaml
+++ b/config/charts/inferencepool/gateway-api-inference-extension/values.yaml
@@ -8,7 +8,6 @@ inferenceExtension:
   extProcPort: 9002
 
 inferencePool:
-  namespace: default
   name: pool-1
   targetPortNumber: 8000
   selector:

From 2490c28a48ff5e52cbce85bed65f57ff56c55e76 Mon Sep 17 00:00:00 2001
From: Kuromesi <blackfacepan@163.com>
Date: Tue, 18 Mar 2025 09:13:56 +0800
Subject: [PATCH 5/9] fix

Signed-off-by: Kuromesi <blackfacepan@163.com>
---
 .../templates/_helpers.tpl                    |  6 +-
 .../templates/ext_proc.yaml                   | 73 -----------------
 .../templates/inferencepool.yaml              | 78 ++++++++++++++++++-
 .../generated.yaml                            | 40 +++++-----
 4 files changed, 99 insertions(+), 98 deletions(-)
 delete mode 100644 config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml
 rename config/{charts/inferencepool/gateway-api-inference-extension => manifests}/generated.yaml (81%)

diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl b/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl
index 4068e7ea6..bb15f9e4e 100644
--- a/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl
+++ b/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl
@@ -2,7 +2,7 @@
 Common labels
 */}}
 {{- define "gateway-api-inference-extension.labels" -}}
-app.kubernetes.io/name: epp-{{ .Values.inferencePool.name }}
+app.kubernetes.io/name: {{ include "gateway-api-inference-extension.name" . }}
 {{- if .Chart.AppVersion }}
 app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 {{- end }}
@@ -13,12 +13,12 @@ Inference extension name
 */}}
 {{- define "gateway-api-inference-extension.name" -}}
 {{- $base := .Values.inferencePool.name | default "default-pool" | lower | trim | trunc 40 -}}
-epp-{{ $base }}
+{{ $base }}-epp
 {{- end -}}
 
 {{/*
 Selector labels
 */}}
 {{- define "gateway-api-inference-extension.selectorLabels" -}}
-app: epp-{{ .Values.inferencePool.name }}
+app: {{ include "gateway-api-inference-extension.name" . }}
 {{- end -}}
diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml b/config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml
deleted file mode 100644
index cf68ab872..000000000
--- a/config/charts/inferencepool/gateway-api-inference-extension/templates/ext_proc.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: {{ include "gateway-api-inference-extension.name" . }}
-  namespace: {{ .Release.Namespace }}
-  labels:
-    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
-spec:
-  replicas: {{ .Values.inferenceExtension.replicas | default 1 }}
-  selector:
-    matchLabels:
-      {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 6 }}
-  template:
-    metadata:
-      labels:
-        {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }}
-    spec:
-      serviceAccountName: {{ include "gateway-api-inference-extension.name" . }}
-      containers:
-      - name: epp
-        image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }}
-        imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }}
-        args:
-        - -poolName
-        - {{ .Values.inferencePool.name }}
-        - -poolNamespace
-        - {{ .Release.Namespace }}
-        - -v
-        - "3"
-        - -grpcPort
-        - "9002"
-        - -grpcHealthPort
-        - "9003"
-        - -metricsPort
-        - "9090"
-        ports:
-        - name: grpc
-          containerPort: 9002
-        - name: grpc-health
-          containerPort: 9003
-        - name: metrics
-          containerPort: 9090
-        livenessProbe:
-          grpc:
-            port: 9003
-            service: inference-extension
-          initialDelaySeconds: 5
-          periodSeconds: 10
-        readinessProbe:
-          grpc:
-            port: 9003
-            service: inference-extension
-          initialDelaySeconds: 5
-          periodSeconds: 10
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: {{ include "gateway-api-inference-extension.name" . }}
-  namespace: {{ .Release.Namespace }}
-  labels:
-    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
-spec:
-  selector:
-    {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }}
-  ports:
-    - name: grpc-ext-proc
-      protocol: TCP
-      port: {{ .Values.inferenceExtension.extProcPort | default 9002 }}
-    - name: http-metrics
-      protocol: TCP
-      port: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
-  type: ClusterIP
diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml b/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml
index 9700711d7..8fc974965 100644
--- a/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml
+++ b/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml
@@ -6,10 +6,84 @@ metadata:
   labels:
     {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 spec:
-  targetPortNumber: {{ .Values.inferencePool.targetPort }}
+  targetPortNumber: {{ .Values.inferencePool.targetPortNumber }}
   selector:
       {{- range $key, $value := .Values.inferencePool.selector }}
       {{ $key }}: {{ quote $value }}
       {{- end }}
   extensionRef:
-    name: {{ include "gateway-api-inference-extension.name" . }}
\ No newline at end of file
+    name: {{ include "gateway-api-inference-extension.name" . }}
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "gateway-api-inference-extension.name" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
+spec:
+  replicas: {{ .Values.inferenceExtension.replicas | default 1 }}
+  selector:
+    matchLabels:
+      {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }}
+    spec:
+      serviceAccountName: {{ include "gateway-api-inference-extension.name" . }}
+      containers:
+      - name: epp
+        image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }}
+        imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }}
+        args:  # container args must be strings, hence the quoting below
+        - -poolName
+        - {{ .Values.inferencePool.name | quote }}
+        - -poolNamespace
+        - {{ .Release.Namespace | quote }}
+        - -v
+        - "3"
+        - -grpcPort  # same value the Service publishes as grpc-ext-proc
+        - {{ .Values.inferenceExtension.extProcPort | default 9002 | quote }}
+        - -grpcHealthPort
+        - "9003"
+        - -metricsPort  # same value the Service publishes as http-metrics
+        - {{ .Values.inferenceExtension.metricsPort | default 9090 | quote }}
+        ports:
+        - name: grpc
+          containerPort: {{ .Values.inferenceExtension.extProcPort | default 9002 }}
+        - name: grpc-health
+          containerPort: 9003
+        - name: metrics
+          containerPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
+        livenessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        readinessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "gateway-api-inference-extension.name" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
+spec:
+  selector:  # same selector labels as the EPP Deployment's pod template
+    {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }}
+  ports:  # no targetPort is set, so each port forwards to the same port number on the pod
+    - name: grpc-ext-proc
+      protocol: TCP
+      port: {{ .Values.inferenceExtension.extProcPort | default 9002 }}
+    - name: http-metrics
+      protocol: TCP
+      port: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
+  type: ClusterIP
diff --git a/config/charts/inferencepool/gateway-api-inference-extension/generated.yaml b/config/manifests/generated.yaml
similarity index 81%
rename from config/charts/inferencepool/gateway-api-inference-extension/generated.yaml
rename to config/manifests/generated.yaml
index 16b3bf4ef..f615e25a1 100644
--- a/config/charts/inferencepool/gateway-api-inference-extension/generated.yaml
+++ b/config/manifests/generated.yaml
@@ -3,19 +3,19 @@
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: epp-pool-1
+  name: pool-1-epp
   namespace: default
   labels:
-    app.kubernetes.io/name: epp-pool-1
+    app.kubernetes.io/name: pool-1-epp
     app.kubernetes.io/version: "0.2.0"
 ---
 # Source: InferencePool/templates/rbac.yaml
 kind: ClusterRole
 apiVersion: rbac.authorization.k8s.io/v1
 metadata:
-  name: epp-pool-1
+  name: pool-1-epp
   labels:
-    app.kubernetes.io/name: epp-pool-1
+    app.kubernetes.io/name: pool-1-epp
     app.kubernetes.io/version: "0.2.0"
 rules:
 - apiGroups: ["inference.networking.x-k8s.io"]
@@ -41,27 +41,27 @@ rules:
 kind: ClusterRoleBinding
 apiVersion: rbac.authorization.k8s.io/v1
 metadata:
-  name: epp-pool-1
+  name: pool-1-epp
 subjects:
 - kind: ServiceAccount
-  name: epp-pool-1
+  name: pool-1-epp
   namespace: default
 roleRef:
   kind: ClusterRole
-  name: epp-pool-1
+  name: pool-1-epp
 ---
-# Source: InferencePool/templates/ext_proc.yaml
+# Source: InferencePool/templates/inferencepool.yaml
 apiVersion: v1
 kind: Service
 metadata:
-  name: epp-pool-1
+  name: pool-1-epp
   namespace: default
   labels:
-    app.kubernetes.io/name: epp-pool-1
+    app.kubernetes.io/name: pool-1-epp
     app.kubernetes.io/version: "0.2.0"
 spec:
   selector:
-    app: epp-pool-1
+    app: pool-1-epp
   ports:
     - name: grpc-ext-proc
       protocol: TCP
@@ -71,26 +71,26 @@ spec:
       port: 9090
   type: ClusterIP
 ---
-# Source: InferencePool/templates/ext_proc.yaml
+# Source: InferencePool/templates/inferencepool.yaml
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: epp-pool-1
+  name: pool-1-epp
   namespace: default
   labels:
-    app.kubernetes.io/name: epp-pool-1
+    app.kubernetes.io/name: pool-1-epp
     app.kubernetes.io/version: "0.2.0"
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: epp-pool-1
+      app: pool-1-epp
   template:
     metadata:
       labels:
-        app: epp-pool-1
+        app: pool-1-epp
     spec:
-      serviceAccountName: epp-pool-1
+      serviceAccountName: pool-1-epp
       containers:
       - name: epp
         image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
@@ -135,11 +135,11 @@ metadata:
   name: pool-1
   namespace: default
   labels:
-    app.kubernetes.io/name: epp-pool-1
+    app.kubernetes.io/name: pool-1-epp
     app.kubernetes.io/version: "0.2.0"
 spec:
-  targetPortNumber: 
+  targetPortNumber: 8000
   selector:
       app: "vllm-llama2-7b"
   extensionRef:
-    name: epp-pool-1
+    name: pool-1-epp

From 814bec38ef789b8aafb3dd51bdc211014828bde6 Mon Sep 17 00:00:00 2001
From: Kuromesi <blackfacepan@163.com>
Date: Tue, 18 Mar 2025 09:24:03 +0800
Subject: [PATCH 6/9] fix

---
 config/manifests/install.yaml | 137 ----------------------------------
 1 file changed, 137 deletions(-)
 delete mode 100644 config/manifests/install.yaml

diff --git a/config/manifests/install.yaml b/config/manifests/install.yaml
deleted file mode 100644
index 976075560..000000000
--- a/config/manifests/install.yaml
+++ /dev/null
@@ -1,137 +0,0 @@
----
-# Source: gateway-api-inference-extension/templates/rbac.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: inference-gateway-ext-proc
-  namespace: default
-  labels:
-    app.kubernetes.io/name: inference-gateway-ext-proc
-    app.kubernetes.io/version: "0.1.0"
----
-# Source: gateway-api-inference-extension/templates/rbac.yaml
-kind: ClusterRole
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: inference-gateway-ext-proc
-  labels:
-    app.kubernetes.io/name: inference-gateway-ext-proc
-    app.kubernetes.io/version: "0.1.0"
-rules:
-- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencemodels"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: [""]
-  resources: ["pods"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencepools"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: ["discovery.k8s.io"]
-  resources: ["endpointslices"]
-  verbs: ["get", "watch", "list"]
-- apiGroups:
-  - authentication.k8s.io
-  resources:
-  - tokenreviews
-  verbs:
-  - create
-- apiGroups:
-  - authorization.k8s.io
-  resources:
-  - subjectaccessreviews
-  verbs:
-  - create
----
-# Source: gateway-api-inference-extension/templates/rbac.yaml
-kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: inference-gateway-ext-proc
-subjects:
-- kind: ServiceAccount
-  name: inference-gateway-ext-proc
-  namespace: default
-roleRef:
-  kind: ClusterRole
-  name: inference-gateway-ext-proc
----
-# Source: gateway-api-inference-extension/templates/ext_proc.yaml
-apiVersion: v1
-kind: Service
-metadata:
-  name: inference-gateway-ext-proc
-  namespace: default
-  labels:
-    app.kubernetes.io/name: inference-gateway-ext-proc
-    app.kubernetes.io/version: "0.1.0"
-spec:
-  selector:
-    app: inference-gateway-ext-proc
-  ports:
-    - name: grpc
-      protocol: TCP
-      port: 9002
-      targetPort: 9002
-    - name: http-metrics
-      protocol: TCP
-      port: 9090
-      targetPort: 9090
-  type: ClusterIP
----
-# Source: gateway-api-inference-extension/templates/ext_proc.yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: inference-gateway-ext-proc
-  namespace: default
-  labels:
-    app.kubernetes.io/name: inference-gateway-ext-proc
-    app.kubernetes.io/version: "0.1.0"
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: inference-gateway-ext-proc
-  template:
-    metadata:
-      labels:
-        app: inference-gateway-ext-proc
-    spec:
-      serviceAccountName: inference-gateway-ext-proc
-      containers:
-      - name: inference-gateway-ext-proc
-        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
-        imagePullPolicy: Always
-        args:
-        - -poolName
-        - vllm-llama2-7b-pool
-        - -poolNamespace
-        - default
-        - -v
-        - "3"
-        - -grpcPort
-        - "9002"
-        - -grpcHealthPort
-        - "9003"
-        - -metricsPort
-        - "9090"
-        ports:
-        - name: grpc
-          containerPort: 9002
-        - name: grpc-health
-          containerPort: 9003
-        - name: metrics
-          containerPort: 9090
-        livenessProbe:
-          grpc:
-            port: 9003
-            service: inference-extension
-          initialDelaySeconds: 5
-          periodSeconds: 10
-        readinessProbe:
-          grpc:
-            port: 9003
-            service: inference-extension
-          initialDelaySeconds: 5
-          periodSeconds: 10

From 6712198853a55afff23e7e1e17f8c6eb8b63bba3 Mon Sep 17 00:00:00 2001
From: Kuromesi <blackfacepan@163.com>
Date: Wed, 19 Mar 2025 08:23:16 +0800
Subject: [PATCH 7/9] add readme

Signed-off-by: Kuromesi <blackfacepan@163.com>
---
 .../.helmignore                               |  0
 .../Chart.yaml                                |  0
 config/charts/inferencepool/README.md         | 61 +++++++++++++++++++
 .../templates/NOTES.txt                       |  0
 .../templates/_helpers.tpl                    |  0
 .../templates/inferencepool.yaml              |  0
 .../templates/rbac.yaml                       |  0
 .../values.yaml                               |  0
 8 files changed, 61 insertions(+)
 rename config/charts/inferencepool/{gateway-api-inference-extension => }/.helmignore (100%)
 rename config/charts/inferencepool/{gateway-api-inference-extension => }/Chart.yaml (100%)
 create mode 100644 config/charts/inferencepool/README.md
 rename config/charts/inferencepool/{gateway-api-inference-extension => }/templates/NOTES.txt (100%)
 rename config/charts/inferencepool/{gateway-api-inference-extension => }/templates/_helpers.tpl (100%)
 rename config/charts/inferencepool/{gateway-api-inference-extension => }/templates/inferencepool.yaml (100%)
 rename config/charts/inferencepool/{gateway-api-inference-extension => }/templates/rbac.yaml (100%)
 rename config/charts/inferencepool/{gateway-api-inference-extension => }/values.yaml (100%)

diff --git a/config/charts/inferencepool/gateway-api-inference-extension/.helmignore b/config/charts/inferencepool/.helmignore
similarity index 100%
rename from config/charts/inferencepool/gateway-api-inference-extension/.helmignore
rename to config/charts/inferencepool/.helmignore
diff --git a/config/charts/inferencepool/gateway-api-inference-extension/Chart.yaml b/config/charts/inferencepool/Chart.yaml
similarity index 100%
rename from config/charts/inferencepool/gateway-api-inference-extension/Chart.yaml
rename to config/charts/inferencepool/Chart.yaml
diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
new file mode 100644
index 000000000..10ed808b0
--- /dev/null
+++ b/config/charts/inferencepool/README.md
@@ -0,0 +1,61 @@
+# Gateway Api Inference Extension
+
+A chart to deploy the inference extension and a InferencePool managed by the extension.
+
+## Install
+
+Suppose now a vllm service with label `app: vllm-llama2-7b` and served on port `8000` is deployed in `default` namespace in the cluster.
+
+To deploy the inference extension, you can run the following command:
+
+```txt
+$ helm install my-release . -n default \
+    --set inferencePool.targetPortNumber=8000 \
+    --set inferencePool.selector.app=vllm-llama2-7b
+```
+
+Or you can change the `values.yaml` to:
+
+```yaml
+inferencePool:
+  name: pool-1
+  targetPortNumber: 8000
+  selector:
+    app: vllm-llama2-7b
+```
+
+where `inferencePool.targetPortNumber` is the pod that vllm backends served on and `inferencePool.selector` is the selector to match the vllm backends. And then run:
+
+```txt
+$ helm install my-release .
+```
+
+## Uninstall
+
+Run the following command to uninstall the chart:
+
+```txt
+$ helm uninstall my-release
+```
+
+## Configuration
+
+The following table lists the configurable parameters of the chart.
+
+| **Parameter Name**                          | **Description**                                                                                                   |
+|---------------------------------------------|-------------------------------------------------------------------------------------------------------------------|
+| `inferenceExtension.replicas`               | Number of replicas for the inference extension service. Defaults to `1`.                                           |
+| `inferenceExtension.image.name`             | Name of the container image used for the inference extension.                                                    |
+| `inferenceExtension.image.hub`              | Registry URL where the inference extension image is hosted.                                                     |
+| `inferenceExtension.image.tag`              | Image tag of the inference extension.                                                                             |
+| `inferenceExtension.image.pullPolicy`       | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. |
+| `inferenceExtension.extProcPort`            | Port where the inference extension service is served for external processing. Defaults to `9002`.                  |
+| `inferencePool.name`                        | Name for the InferencePool, and inference extension will be named as `${inferencePool.name}-epp`.                |
+| `inferencePool.targetPortNumber`            | Target port number for the vllm backends, will be used to scrape metrics by the inference extension.             |
+| `inferencePool.selector`                     | Label selector to match vllm backends managed by the inference pool.                                             |
+
+## Notes
+
+This chart will only deploy the inference extension and InferencePool, before install the chart, please make sure that the inference extension CRDs have already been installed in the cluster. And You need to apply traffic policies to route traffic to the inference extension from the gateway after the inference extension is deployed.
+
+For more details, please refer to the [website](https://gateway-api-inference-extension.sigs.k8s.io/guides/).
\ No newline at end of file
diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/NOTES.txt b/config/charts/inferencepool/templates/NOTES.txt
similarity index 100%
rename from config/charts/inferencepool/gateway-api-inference-extension/templates/NOTES.txt
rename to config/charts/inferencepool/templates/NOTES.txt
diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl b/config/charts/inferencepool/templates/_helpers.tpl
similarity index 100%
rename from config/charts/inferencepool/gateway-api-inference-extension/templates/_helpers.tpl
rename to config/charts/inferencepool/templates/_helpers.tpl
diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml b/config/charts/inferencepool/templates/inferencepool.yaml
similarity index 100%
rename from config/charts/inferencepool/gateway-api-inference-extension/templates/inferencepool.yaml
rename to config/charts/inferencepool/templates/inferencepool.yaml
diff --git a/config/charts/inferencepool/gateway-api-inference-extension/templates/rbac.yaml b/config/charts/inferencepool/templates/rbac.yaml
similarity index 100%
rename from config/charts/inferencepool/gateway-api-inference-extension/templates/rbac.yaml
rename to config/charts/inferencepool/templates/rbac.yaml
diff --git a/config/charts/inferencepool/gateway-api-inference-extension/values.yaml b/config/charts/inferencepool/values.yaml
similarity index 100%
rename from config/charts/inferencepool/gateway-api-inference-extension/values.yaml
rename to config/charts/inferencepool/values.yaml

From a885ea9fa3df81ad5e94d6416f755207d4d056da Mon Sep 17 00:00:00 2001
From: Kuromesi <blackfacepan@163.com>
Date: Wed, 19 Mar 2025 10:45:38 +0800
Subject: [PATCH 8/9] nit

Signed-off-by: Kuromesi <blackfacepan@163.com>
---
 config/charts/inferencepool/README.md         |  42 ++---
 .../charts/inferencepool/templates/NOTES.txt  |   2 +-
 config/charts/inferencepool/values.yaml       |   2 +-
 config/manifests/generated.yaml               | 145 ------------------
 4 files changed, 15 insertions(+), 176 deletions(-)
 delete mode 100644 config/manifests/generated.yaml

diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index 10ed808b0..761c9a9dc 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -1,34 +1,20 @@
-# Gateway Api Inference Extension
+# InferencePool
 
-A chart to deploy the inference extension and a InferencePool managed by the extension.
+A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) deployment.  
 
-## Install
 
-Suppose now a vllm service with label `app: vllm-llama2-7b` and served on port `8000` is deployed in `default` namespace in the cluster.
+## Install
 
-To deploy the inference extension, you can run the following command:
+To install an InferencePool named `pool-1`  that selects from endpoints with label `app: vllm-llama2-7b` and listening on port `8000`, you can run the following command:
 
 ```txt
-$ helm install my-release . -n default \
-    --set inferencePool.targetPortNumber=8000 \
-    --set inferencePool.selector.app=vllm-llama2-7b
-```
-
-Or you can change the `values.yaml` to:
-
-```yaml
-inferencePool:
-  name: pool-1
-  targetPortNumber: 8000
-  selector:
-    app: vllm-llama2-7b
+$ helm install my-release ./config/charts/inferencepool \
+  --set inferencePool.name=pool-1 \
+  --set inferencePool.selector.app=vllm-llama2-7b \
+  --set inferencePool.targetPortNumber=8000
 ```
 
-where `inferencePool.targetPortNumber` is the pod that vllm backends served on and `inferencePool.selector` is the selector to match the vllm backends. And then run:
-
-```txt
-$ helm install my-release .
-```
+where `inferencePool.targetPortNumber` is the port that the vllm backends serve on and `inferencePool.selector` is the label selector that matches the vllm backends.
 
 ## Uninstall
 
@@ -44,18 +30,16 @@ The following table list the configurable parameters of the chart.
 
 | **Parameter Name**                          | **Description**                                                                                                   |
 |---------------------------------------------|-------------------------------------------------------------------------------------------------------------------|
+| `inferencePool.name`                        | Name of the InferencePool; the inference extension is named `${inferencePool.name}-epp`.                          |
+| `inferencePool.targetPortNumber`            | Target port number of the vllm backends; the inference extension uses it to scrape metrics.                      |
+| `inferencePool.selector`                    | Label selector to match vllm backends managed by the inference pool.                                             |
 | `inferenceExtension.replicas`               | Number of replicas for the inference extension service. Defaults to `1`.                                           |
 | `inferenceExtension.image.name`             | Name of the container image used for the inference extension.                                                    |
 | `inferenceExtension.image.hub`              | Registry URL where the inference extension image is hosted.                                                     |
 | `inferenceExtension.image.tag`              | Image tag of the inference extension.                                                                             |
 | `inferenceExtension.image.pullPolicy`       | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. |
 | `inferenceExtension.extProcPort`            | Port where the inference extension service is served for external processing. Defaults to `9002`.                  |
-| `inferencePool.name`                        | Name for the InferencePool, and inference extension will be named as `${inferencePool.name}-epp`.                |
-| `inferencePool.targetPortNumber`            | Target port number for the vllm backends, will be used to scrape metrics by the inference extension.             |
-| `inferencePool.selector`                     | Label selector to match vllm backends managed by the inference pool.                                             |
 
 ## Notes
 
-This chart will only deploy the inference extension and InferencePool, before install the chart, please make sure that the inference extension CRDs have already been installed in the cluster. And You need to apply traffic policies to route traffic to the inference extension from the gateway after the inference extension is deployed.
-
-For more details, please refer to the [website](https://gateway-api-inference-extension.sigs.k8s.io/guides/).
\ No newline at end of file
+This chart will only deploy an InferencePool and its corresponding EndpointPicker extension. Before installing the chart, please make sure that the inference extension CRDs are installed in the cluster. For more details, please refer to the [getting started guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/).
diff --git a/config/charts/inferencepool/templates/NOTES.txt b/config/charts/inferencepool/templates/NOTES.txt
index 5d5ea8794..3d8221659 100644
--- a/config/charts/inferencepool/templates/NOTES.txt
+++ b/config/charts/inferencepool/templates/NOTES.txt
@@ -1 +1 @@
-Gateway api inference extension deployed.
\ No newline at end of file
+InferencePool {{ .Values.inferencePool.name }} deployed.
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
index c4a0fb934..7d3e868dd 100644
--- a/config/charts/inferencepool/values.yaml
+++ b/config/charts/inferencepool/values.yaml
@@ -11,4 +11,4 @@ inferencePool:
   name: pool-1
   targetPortNumber: 8000
   selector:
-    app: vllm-llama2-7b
\ No newline at end of file
+    app: vllm-llama2-7b
diff --git a/config/manifests/generated.yaml b/config/manifests/generated.yaml
deleted file mode 100644
index f615e25a1..000000000
--- a/config/manifests/generated.yaml
+++ /dev/null
@@ -1,145 +0,0 @@
----
-# Source: InferencePool/templates/rbac.yaml
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: pool-1-epp
-  namespace: default
-  labels:
-    app.kubernetes.io/name: pool-1-epp
-    app.kubernetes.io/version: "0.2.0"
----
-# Source: InferencePool/templates/rbac.yaml
-kind: ClusterRole
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: pool-1-epp
-  labels:
-    app.kubernetes.io/name: pool-1-epp
-    app.kubernetes.io/version: "0.2.0"
-rules:
-- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencemodels, inferencepools"]
-  verbs: ["get", "watch", "list"]
-- apiGroups: [""]
-  resources: ["pods"]
-  verbs: ["get", "watch", "list"]
-- apiGroups:
-  - authentication.k8s.io
-  resources:
-  - tokenreviews
-  verbs:
-  - create
-- apiGroups:
-  - authorization.k8s.io
-  resources:
-  - subjectaccessreviews
-  verbs:
-  - create
----
-# Source: InferencePool/templates/rbac.yaml
-kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: pool-1-epp
-subjects:
-- kind: ServiceAccount
-  name: pool-1-epp
-  namespace: default
-roleRef:
-  kind: ClusterRole
-  name: pool-1-epp
----
-# Source: InferencePool/templates/inferencepool.yaml
-apiVersion: v1
-kind: Service
-metadata:
-  name: pool-1-epp
-  namespace: default
-  labels:
-    app.kubernetes.io/name: pool-1-epp
-    app.kubernetes.io/version: "0.2.0"
-spec:
-  selector:
-    app: pool-1-epp
-  ports:
-    - name: grpc-ext-proc
-      protocol: TCP
-      port: 9002
-    - name: http-metrics
-      protocol: TCP
-      port: 9090
-  type: ClusterIP
----
-# Source: InferencePool/templates/inferencepool.yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: pool-1-epp
-  namespace: default
-  labels:
-    app.kubernetes.io/name: pool-1-epp
-    app.kubernetes.io/version: "0.2.0"
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: pool-1-epp
-  template:
-    metadata:
-      labels:
-        app: pool-1-epp
-    spec:
-      serviceAccountName: pool-1-epp
-      containers:
-      - name: epp
-        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
-        imagePullPolicy: Always
-        args:
-        - -poolName
-        - pool-1
-        - -poolNamespace
-        - default
-        - -v
-        - "3"
-        - -grpcPort
-        - "9002"
-        - -grpcHealthPort
-        - "9003"
-        - -metricsPort
-        - "9090"
-        ports:
-        - name: grpc
-          containerPort: 9002
-        - name: grpc-health
-          containerPort: 9003
-        - name: metrics
-          containerPort: 9090
-        livenessProbe:
-          grpc:
-            port: 9003
-            service: inference-extension
-          initialDelaySeconds: 5
-          periodSeconds: 10
-        readinessProbe:
-          grpc:
-            port: 9003
-            service: inference-extension
-          initialDelaySeconds: 5
-          periodSeconds: 10
----
-# Source: InferencePool/templates/inferencepool.yaml
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferencePool
-metadata:
-  name: pool-1
-  namespace: default
-  labels:
-    app.kubernetes.io/name: pool-1-epp
-    app.kubernetes.io/version: "0.2.0"
-spec:
-  targetPortNumber: 8000
-  selector:
-      app: "vllm-llama2-7b"
-  extensionRef:
-    name: pool-1-epp

From bf51f9a3e1b58827614578d5516709ffe0e97b46 Mon Sep 17 00:00:00 2001
From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com>
Date: Tue, 18 Mar 2025 21:09:01 -0700
Subject: [PATCH 9/9] Apply suggestions from code review

---
 config/charts/inferencepool/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index 761c9a9dc..ee0481d30 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -8,7 +8,7 @@ A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) depl
 To install an InferencePool named `pool-1`  that selects from endpoints with label `app: vllm-llama2-7b` and listening on port `8000`, you can run the following command:
 
 ```txt
-$ helm install my-release ./config/charts/inferencepool \
+$ helm install pool-1 ./config/charts/inferencepool \
   --set inferencePool.name=pool-1 \
   --set inferencePool.selector.app=vllm-llama2-7b \
   --set inferencePool.targetPortNumber=8000
@@ -21,7 +21,7 @@ where `inferencePool.targetPortNumber` is the pod that vllm backends served on a
 Run the following command to uninstall the chart:
 
 ```txt
-$ helm uninstall my-release
+$ helm uninstall pool-1
 ```
 
 ## Configuration