Commit 0d5107b

Add unit tests for request body
1 parent 1ba13f3 commit 0d5107b


58 files changed: +2162 −501 lines changed

.github/ISSUE_TEMPLATE/blank_issue.md

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+---
+name: Blank Issue
+about: Create a new issue from scratch
+title: ''
+labels: needs-triage
+assignees: ''
+
+---

.github/ISSUE_TEMPLATE/bug_request.md

Lines changed: 3 additions & 1 deletion

@@ -1,7 +1,9 @@
 ---
 name: Bug Report
 about: Report a bug you encountered
-labels: kind/bug
+title: ''
+labels: kind/bug, needs-triage
+assignees: ''
 
 ---
 

.github/ISSUE_TEMPLATE/config.yml

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+blank_issues_enabled: false

.github/ISSUE_TEMPLATE/feature_request.md

Lines changed: 1 addition & 2 deletions

@@ -2,7 +2,7 @@
 name: Feature request
 about: Suggest an idea for this project
 title: ''
-labels: ''
+labels: needs-triage
 assignees: ''
 
 ---
@@ -12,4 +12,3 @@ assignees: ''
 **What would you like to be added**:
 
 **Why is this needed**:
-

.github/ISSUE_TEMPLATE/new-release.md

Lines changed: 1 addition & 0 deletions

@@ -4,6 +4,7 @@ about: Propose a new release
 title: Release v0.x.0
 labels: ''
 assignees: ''
+
 ---
 
 - [Introduction](#introduction)

Makefile

Lines changed: 5 additions & 1 deletion

@@ -123,8 +123,12 @@ vet: ## Run go vet against code.
 test: manifests generate fmt vet envtest image-build ## Run tests.
 	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -race -coverprofile cover.out
 
+.PHONY: test-unit
+test-unit: ## Run unit tests.
+	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./pkg/... -race -coverprofile cover.out
+
 .PHONY: test-integration
-test-integration: ## Run tests.
+test-integration: ## Run integration tests.
 	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./test/integration/epp/... -race -coverprofile cover.out
 
 .PHONY: test-e2e
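The commit's subject — unit tests for request-body handling — is what the new `test-unit` target runs. Below is a minimal sketch of the table-driven style such a test typically takes in Go; `parseModelName` and its JSON shape are hypothetical stand-ins for illustration, not the repository's actual request-body API.

```go
package epp_test

import (
	"encoding/json"
	"testing"
)

// parseModelName is a hypothetical helper standing in for the code under
// test: it extracts the "model" field from an OpenAI-style request body.
func parseModelName(body []byte) (string, error) {
	var req struct {
		Model string `json:"model"`
	}
	if err := json.Unmarshal(body, &req); err != nil {
		return "", err
	}
	return req.Model, nil
}

func TestParseModelName(t *testing.T) {
	tests := []struct {
		name    string
		body    string
		want    string
		wantErr bool
	}{
		{name: "valid body", body: `{"model":"llama3-8b"}`, want: "llama3-8b"},
		{name: "missing model field", body: `{}`, want: ""},
		{name: "malformed JSON", body: `{"model":`, wantErr: true},
	}
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			got, err := parseModelName([]byte(tc.body))
			if (err != nil) != tc.wantErr {
				t.Fatalf("error = %v, wantErr = %v", err, tc.wantErr)
			}
			if got != tc.want {
				t.Errorf("got %q, want %q", got, tc.want)
			}
		})
	}
}
```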

README.md

Lines changed: 54 additions & 1 deletion

@@ -1,4 +1,57 @@
-# Gateway API Inference Extension
+[![Go Report Card](https://goreportcard.com/badge/sigs.k8s.io/gateway-api-inference-extension)](https://goreportcard.com/report/sigs.k8s.io/gateway-api-inference-extension)
+[![Go Reference](https://pkg.go.dev/badge/sigs.k8s.io/gateway-api-inference-extension.svg)](https://pkg.go.dev/sigs.k8s.io/gateway-api-inference-extension)
+[![License](https://img.shields.io/github/license/kubernetes-sigs/gateway-api-inference-extension)](/LICENSE)
+
+# Gateway API Inference Extension (GIE)
+
+This project offers tools for AI Inference, enabling developers to build [Inference Gateways].
+
+[Inference Gateways]:#concepts-and-definitions
+
+## Concepts and Definitions
+
+The following are some key industry terms that are important to understand for
+this project:
+
+- **Model**: A generative AI model that has learned patterns from data and is
+  used for inference. Models vary in size and architecture, from smaller
+  domain-specific models to massive multi-billion-parameter neural networks that
+  are optimized for diverse language tasks.
+- **Inference**: The process of running a generative AI model, such as a large
+  language model or diffusion model, to generate text, embeddings, or other
+  outputs from input data.
+- **Model server**: A service (in our case, containerized) responsible for
+  receiving inference requests and returning predictions from a model.
+- **Accelerator**: Specialized hardware, such as Graphics Processing Units
+  (GPUs), that can be attached to Kubernetes nodes to speed up computations,
+  particularly for training and inference tasks.
+
+And the following are terms more specific to this project:
+
+- **Scheduler**: Makes decisions about which endpoint is optimal (best cost /
+  best performance) for an inference request based on `Metrics and Capabilities`
+  from [Model Serving](/docs/proposals/003-model-server-protocol/README.md).
+- **Metrics and Capabilities**: Data provided by model serving platforms about
+  performance, availability and capabilities to optimize routing. Includes
+  things like [Prefix Cache] status or [LoRA Adapters] availability.
+- **Endpoint Selector**: A `Scheduler` combined with `Metrics and Capabilities`
+  systems is often referred to as an [Endpoint Selection Extension]
+  (also sometimes called an "endpoint picker", or "EPP").
+- **Inference Gateway**: A proxy/load-balancer which has been coupled with an
+  `Endpoint Selector`. It provides optimized routing and load balancing for
+  serving Kubernetes self-hosted generative Artificial Intelligence (AI)
+  workloads. It simplifies the deployment, management, and observability of AI
+  inference workloads.
+
+For deeper insights and more advanced concepts, refer to our [proposals](/docs/proposals).
+
+[Inference]:https://www.digitalocean.com/community/tutorials/llm-inference-optimization
+[Gateway API]:https://github.com/kubernetes-sigs/gateway-api
+[Prefix Cache]:https://docs.vllm.ai/en/stable/design/v1/prefix_caching.html
+[LoRA Adapters]:https://docs.vllm.ai/en/stable/features/lora.html
+[Endpoint Selection Extension]:https://gateway-api-inference-extension.sigs.k8s.io/#endpoint-selection-extension
+
+## Technical Overview
 
 This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee.

api/v1alpha2/inferencemodel_types.go

Lines changed: 1 addition & 1 deletion

@@ -126,7 +126,7 @@ type PoolObjectReference struct {
 }
 
 // Criticality defines how important it is to serve the model compared to other models.
-// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional(use a pointer), and set no default.
+// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default.
 // This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
 // +kubebuilder:validation:Enum=Critical;Standard;Sheddable
 type Criticality string

cmd/epp/main.go

Lines changed: 9 additions & 5 deletions

@@ -30,6 +30,7 @@ import (
 	"go.uber.org/zap/zapcore"
 	"google.golang.org/grpc"
 	healthPb "google.golang.org/grpc/health/grpc_health_v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/rest"
 	"k8s.io/component-base/metrics/legacyregistry"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -140,14 +141,16 @@ func run() error {
 		return err
 	}
 
-	mgr, err := runserver.NewDefaultManager(*poolNamespace, *poolName, cfg)
+	poolNamespacedName := types.NamespacedName{
+		Name:      *poolName,
+		Namespace: *poolNamespace,
+	}
+	mgr, err := runserver.NewDefaultManager(poolNamespacedName, cfg)
 	if err != nil {
 		setupLog.Error(err, "Failed to create controller manager")
 		return err
 	}
 
-	ctx := ctrl.SetupSignalHandler()
-
 	// Set up mapper for metric scraping.
 	mapping, err := backendmetrics.NewMetricMapping(
 		*totalQueuedRequestsMetric,
@@ -162,14 +165,15 @@ func run() error {
 
 	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval)
 	// Setup runner.
+	ctx := ctrl.SetupSignalHandler()
+
 	datastore := datastore.NewDatastore(ctx, pmf)
 
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
 		DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace,
 		DestinationEndpointHintKey:               *destinationEndpointHintKey,
-		PoolName:                                 *poolName,
-		PoolNamespace:                            *poolNamespace,
+		PoolNamespacedName:                       poolNamespacedName,
 		Datastore:                                datastore,
 		SecureServing:                            *secureServing,
 		CertPath:                                 *certPath,
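The refactor above replaces the separate `PoolName`/`PoolNamespace` strings with a single `types.NamespacedName`. A small standalone sketch of why that type is the idiomatic carrier for the pair; it uses only the upstream `k8s.io/apimachinery` API:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/types"
)

func main() {
	nn := types.NamespacedName{Namespace: "default", Name: "vllm-llama3-8b-instruct"}
	// String() renders the canonical "<namespace>/<name>" key that
	// controller-runtime uses for cache lookups and client.Get calls.
	fmt.Println(nn.String()) // default/vllm-llama3-8b-instruct
}
```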

config/charts/inferencepool/README.md

Lines changed: 13 additions & 1 deletion

@@ -2,7 +2,6 @@
 
 A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) deployment.
 
-
 ## Install
 
 To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:
@@ -23,6 +22,18 @@
 
 Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed.
 
+### Install for Triton TensorRT-LLM
+
+Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Triton TensorRT-LLM, e.g.,
+
+```txt
+$ helm install triton-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=triton-llama3-8b-instruct \
+  --set inferencePool.modelServerType=triton-tensorrt-llm \
+  --set provider.name=[none|gke] \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
+
 ## Uninstall
 
 Run the following command to uninstall the chart:
@@ -38,6 +49,7 @@ The following table list the configurable parameters of the chart.
 | **Parameter Name** | **Description** |
 |---------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
 | `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. |
+| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm. |
 | `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. |
 | `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. Defaults to `1`. |
 | `inferenceExtension.image.name` | Name of the container image used for the endpoint picker. |

config/charts/inferencepool/templates/epp-deployment.yaml

Lines changed: 8 additions & 1 deletion

@@ -35,6 +35,14 @@ spec:
         - "9003"
         - -metricsPort
         - "9090"
+        {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
+        - -totalQueuedRequestsMetric
+        - "nv_trt_llm_request_metrics{request_type=waiting}"
+        - -kvCacheUsagePercentageMetric
+        - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
+        - -loraInfoMetric
+        - "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet.
+        {{- end }}
         ports:
         - name: grpc
           containerPort: 9002
@@ -54,4 +62,3 @@ spec:
             service: inference-extension
           initialDelaySeconds: 5
          periodSeconds: 10
-
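The Triton overrides above use a flag format of a Prometheus metric name with an optional `{label=value}` qualifier. The EPP's actual `MetricMapping` implementation is not shown in this diff; the standalone sketch below only illustrates, under that assumption, how such a spec string decomposes:

```go
package main

import (
	"fmt"
	"regexp"
)

// metricSpec holds a metric name plus an optional single-label filter,
// e.g. nv_trt_llm_request_metrics{request_type=waiting}.
type metricSpec struct {
	Name       string
	LabelKey   string // empty when no {label=value} qualifier is present
	LabelValue string
}

var specRe = regexp.MustCompile(`^([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{([a-zA-Z_][a-zA-Z0-9_]*)=([^}]*)\})?$`)

func parseMetricSpec(raw string) (metricSpec, error) {
	m := specRe.FindStringSubmatch(raw)
	if m == nil {
		return metricSpec{}, fmt.Errorf("invalid metric spec %q", raw)
	}
	return metricSpec{Name: m[1], LabelKey: m[2], LabelValue: m[3]}, nil
}

func main() {
	spec, err := parseMetricSpec("nv_trt_llm_request_metrics{request_type=waiting}")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", spec) // {Name:nv_trt_llm_request_metrics LabelKey:request_type LabelValue:waiting}
}
```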

config/charts/inferencepool/values.yaml

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ inferenceExtension:
 
 inferencePool:
   targetPortNumber: 8000
+  modelServerType: vllm # vllm, triton-tensorrt-llm
   # modelServers: # REQUIRED
   #   matchLabels:
   #     app: vllm-llama3-8b-instruct

config/manifests/inferencepool-resources.yaml

Lines changed: 2 additions & 1 deletion

@@ -4,7 +4,6 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
-  labels:
   name: vllm-llama3-8b-instruct
 spec:
   targetPortNumber: 8000
@@ -54,6 +53,8 @@ spec:
       args:
       - -poolName
      - "vllm-llama3-8b-instruct"
+      - "-poolNamespace"
+      - "default"
      - -v
      - "4"
      - --zap-encoder
docs/proposals/0683-epp-architecture-proposal/README.md

Lines changed: 99 additions & 0 deletions (new file)

# Gateway API Inference Extension

Author(s): @kfswain

## Proposal Status
***Draft***

## Table of Contents

<!-- toc -->

- [Summary](#summary)
- [Goals](#goals)
- [Non-Goals](#non-goals)
- [Proposal](#proposal)
- [Personas](#personas)
- [Inference Platform Admin](#inference-platform-admin)
- [Inference Workload Owner](#workload-owner)
- [Axioms](#axioms)
- [InferencePool](#inferencepool)
- [InferenceModel](#inferencemodel)
- [Spec](#spec)
- [Diagrams](#diagrams)
- [Alternatives](#alternatives)
- [Open Questions](#open-questions)

<!-- /toc -->

## Summary

This proposal seeks to standardize the implementation of an EPP (End-point Picker) for the Inference Gateway extension (also known as Gateway API Inference Extension). Additionally, it proposes restructuring the current implementation of the EPP to be more modular and approachable.

## Goals

- Set a standard on how the EPP & APIs interact
- Settle on common nomenclature for clearer communication
- Allow for modularization of the EPP, to be extended to a user's specific needs

## Non-Goals

- Reshaping the current API
- A change in scope of the current project

## Proposal

This proposal is not proposing any net-new features; instead, we are refactoring our current implementation to better handle more devs, more features, etc. At the time of writing, GIE is at v0.3, and that stronger experimental context (along with external feedback) made clear the need for this restructure. The image below gives a high-level view of how our components work together.

<img src="./images/epp_arch.svg" alt="Scheduling Algorithm" width="1000" />

## Overview

At a quick glance, the EPP is being broken into specific layers. The `Data Layer` is of note, as it is a vertical that will be accessed by all the others. The data layer manages the k8s data, metric & usage data, as well as processing of the above data to determine resource scarcity regimes.

The other layers are handled in a sequential process, starting with the **Ext-Proc** call. The request is buffered and then sent to the **Routing Layer**, which processes any user-defined per-InferenceModel routing rules & request enrichment first (at the time of writing that is currently just translating the InferenceModel name to a weight-split actual model). Then _all_ requests pass through the to-be-implemented [**Flow Controller**](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/674) to ensure that any request entering the pool adheres to the guidelines set by the Priority, Fairness, & Queueing configuration. And finally, the **Scheduling Layer** is the load balancing algorithm that intelligently routes requests based on the current state of the InferencePool.
## Components

To further expand upon these component layers, we will first break them into `extensible` and `non-extensible` layers. `Non-extensible` layers are intended to be static, and handled on behalf of the user, typically implementing low-opinion infrastructure.

The `Extensible` layers are:
- Data Layer
- Routing Layer
- Flow Controller
- Scheduling Layer

The `Non-Extensible` layer(s) are:
- The Ext-Proc Server

### `Extensible`

#### Data Layer

The data layer will consume and store: the InferencePool/InferenceModel config and the pre-defined [Model Server Protocol](../003-model-server-protocol/README.md). Additionally, the data fed from the model servers will be processed and digested to provide resource scarcity regime hints and autoscaling recommendations.

Many extensions to scheduling will require changes to ingested metrics; as such, the data layer will be built to be extended, but extenders accept that the Model Server Protocol will no longer provide guarantees on portability of a model server out of the box.
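To make the shape of this layer concrete, here is an assumption-level Go sketch of what an extensible data-layer contract could look like; none of these names come from the repository:

```go
package datalayer

// ScarcityRegime is a digest of pool-wide pressure derived from scraped
// model-server metrics (queue depth, KV-cache utilization, and so on).
type ScarcityRegime int

const (
	Abundant ScarcityRegime = iota
	Constrained
	Saturated
)

// PodMetrics is one endpoint's latest scrape per the Model Server Protocol.
type PodMetrics struct {
	Address         string
	WaitingRequests int
	KVCacheUsage    float64 // fraction in [0, 1]
}

// DataLayer is the vertical consumed by the routing, flow-control, and
// scheduling layers.
type DataLayer interface {
	// Endpoints returns the current snapshot of pool members.
	Endpoints() []PodMetrics
	// Regime digests the snapshot into a scarcity hint for admission control.
	Regime() ScarcityRegime
}
```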
#### Routing Layer

The routing layer is likely to be the most opinion-heavy section, as the scope of what constitutes a 'Route Rule' is somewhat broad. The current examples we expect would be:

- System Prompt injection
- RAG callout
- Per-InferenceModel request validation (such as safety/on-topic, etc.)

Due to the possibility of this becoming a bit of a dumping ground, the API will keep a _very_ tight scope on which of these route rules are included in the spec. A standard method of extension will be provided if the need to define a custom rule arises.

#### Flow Controller (WIP - implementation tracked in [#674](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/674))

The flow controller will consume resource regime data, and enforce proper resource sharing between workloads. This will primarily be done through a queuing mechanism [as described here](https://docs.google.com/document/d/1VZL7opFWuwgWquvgiOzLlXAJ633qZ9U-A0ZixGjBgaI/edit?usp=sharing).
#### Scheduling Layer

As the Scheduling Layer is the final interface to the entirety of the pool, all configuration will be at the _pool_ level. The default scheduling layer will be an experimentally-backed LB algorithm, with exposed config values.

The Scheduler will define a strong interface API, so that new scheduling algorithms may be plugged in & dark-launched to test in production traffic without impacting said traffic. Extension is expected to adhere to the [Scheduler Subsystem definition](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/603).
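A hedged sketch of what such a pluggable interface and a dark-launch wrapper could look like; `Scheduler`, `Endpoint`, and `shadowScheduler` are illustrative names, with the real contract being settled in PR #603:

```go
package scheduling

import "context"

// Endpoint is a candidate model-server pod with its latest scraped metrics.
type Endpoint struct {
	Address         string
	QueuedRequests  int
	KVCacheUsagePct float64
	ActiveLoRAs     []string
}

// Scheduler picks the optimal endpoint for one request given pool state.
type Scheduler interface {
	// Schedule returns the chosen endpoint or an error if none qualify.
	Schedule(ctx context.Context, targetModel string, candidates []Endpoint) (Endpoint, error)
}

// shadowScheduler illustrates dark-launching: it runs a candidate
// algorithm on live traffic but always returns the baseline's decision.
type shadowScheduler struct {
	baseline, candidate Scheduler
	observe             func(baseline, candidate Endpoint)
}

func (s *shadowScheduler) Schedule(ctx context.Context, model string, eps []Endpoint) (Endpoint, error) {
	chosen, err := s.baseline.Schedule(ctx, model, eps)
	if err != nil {
		return Endpoint{}, err
	}
	if shadow, shadowErr := s.candidate.Schedule(ctx, model, eps); shadowErr == nil && s.observe != nil {
		s.observe(chosen, shadow) // compare decisions without impacting traffic
	}
	return chosen, nil
}
```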
### `Non-extensible`

#### Ext-Proc Server

The Ext-Proc Server protocol is very well defined & specific; deviation could cause the EPP to become unusable or unstable. Extension is ill-advised.

docs/proposals/0683-epp-architecture-proposal/images/epp_arch.svg

Lines changed: 1 addition & 0 deletions

docs/proposals/README.md

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+# Proposals Best Practices
+
+
+## Naming
+The directory of the proposal should lead with a 4-digit PR number (we will move to 5, 6, ... digits should our PR count get that high), followed by a kebab-cased title. The PR number is not known until the PR is cut, so development can use a placeholder, e.g. XXXX-my-proposal. The PR number is used because it is unique & chronological, allowing the default ordering of proposals to follow the timeline of development.

go.mod

Lines changed: 3 additions & 4 deletions

@@ -9,8 +9,8 @@ require (
 	github.com/google/go-cmp v0.7.0
 	github.com/onsi/ginkgo/v2 v2.23.4
 	github.com/onsi/gomega v1.37.0
-	github.com/prometheus/client_golang v1.21.1
-	github.com/prometheus/client_model v0.6.1
+	github.com/prometheus/client_golang v1.22.0
+	github.com/prometheus/client_model v0.6.2
 	github.com/prometheus/common v0.63.0
 	github.com/stretchr/testify v1.10.0
 	go.uber.org/multierr v1.11.0
@@ -25,7 +25,7 @@ require (
 	k8s.io/component-base v0.32.3
 	k8s.io/utils v0.0.0-20241210054802-24370beab758
 	sigs.k8s.io/controller-runtime v0.20.4
-	sigs.k8s.io/structured-merge-diff/v4 v4.6.0
+	sigs.k8s.io/structured-merge-diff/v4 v4.7.0
 	sigs.k8s.io/yaml v1.4.0
 )
 
@@ -74,7 +74,6 @@ require (
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
-	github.com/klauspost/compress v1.17.11 // indirect
 	github.com/kylelemons/godebug v1.1.0 // indirect
 	github.com/leodido/go-urn v1.2.1 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
