config/manifests/inferencepool-resources.yaml

# Note: If you change this file, please also change the file used for e2e tests!
# 
# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  name: vllm-llama3-8b-instruct
spec:
  targetPortNumber: 8000
  selector:
    app: vllm-llama3-8b-instruct
  extensionRef:
    name: vllm-llama3-8b-instruct-epp
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
spec:
  selector:
    app: vllm-llama3-8b-instruct-epp
  ports:
    - protocol: TCP
      port: 9002
      targetPort: 9002
      appProtocol: http2
  type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
  labels:
    app: vllm-llama3-8b-instruct-epp
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct-epp
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct-epp
    spec:
      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
      terminationGracePeriodSeconds: 130
      containers:
      - name: epp
        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
        imagePullPolicy: Always
        args:
        - -poolName
        - "vllm-llama3-8b-instruct"
        - "-poolNamespace"
        - "default"
        - -v
        - "4"
        - --zap-encoder
        - "json"
        - -grpcPort
        - "9002"
        - -grpcHealthPort
        - "9003"
        ports:
        - containerPort: 9002
        - containerPort: 9003
        - name: metrics
          containerPort: 9090
        livenessProbe:
          grpc:
            port: 9003
            service: inference-extension
          initialDelaySeconds: 5
          periodSeconds: 10
        readinessProbe:
          grpc:
            port: 9003
            service: inference-extension
          initialDelaySeconds: 5
          periodSeconds: 10
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read
rules:
- apiGroups: ["inference.networking.x-k8s.io"]
  resources: ["inferencemodels"]
  verbs: ["get", "watch", "list"]
- apiGroups: [""]
  resources: ["pods"]
  verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
  resources: ["inferencepools"]
  verbs: ["get", "watch", "list"]
- apiGroups: ["discovery.k8s.io"]
  resources: ["endpointslices"]
  verbs: ["get", "watch", "list"]
- apiGroups:
  - authentication.k8s.io
  resources:
  - tokenreviews
  verbs:
  - create
- apiGroups:
  - authorization.k8s.io
  resources:
  - subjectaccessreviews
  verbs:
  - create
--- 
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read-binding
subjects:
- kind: ServiceAccount
  name: default
  namespace: default
roleRef:
  kind: ClusterRole
  name: pod-read