config/manifests/gateway/patch_policy.yaml

apiVersion: gateway.envoyproxy.io/v1alpha1
kind: EnvoyPatchPolicy
metadata:
  name: custom-response-patch-policy
  namespace: default
spec:
  targetRef:
    group: gateway.networking.k8s.io
    kind: Gateway
    name: inference-gateway
  type: JSONPatch
  jsonPatches:
    # Necessary to create a cluster of the type: ORIGINAL_DST to allow for 
    # direct pod scheduling. Which is heavily utilized in our scheduling.
    # Specifically the field `original_dst_lb_config` allows us to enable
    # `use_http_header` and `http_header_name`. 
    # Source: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/cluster.proto
    - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"
      name: original_destination_cluster
      operation:
        op: add
        path: ""
        value:
          name: original_destination_cluster
          type: ORIGINAL_DST
          original_dst_lb_config:
            use_http_header: true
            http_header_name: "x-gateway-destination-endpoint"
          connect_timeout: 1000s
          lb_policy: CLUSTER_PROVIDED
          dns_lookup_family: V4_ONLY
          circuit_breakers:
            thresholds:
            - max_connections: 40000
              max_pending_requests: 40000
              max_requests: 40000

    # This ensures that envoy accepts untrusted certificates. We tried to explicitly
    # set TrustChainVerification to ACCEPT_UNSTRUSTED, but that actually didn't work
    # and what worked is setting the common_tls_context to empty.
    - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"
      name: "envoyextensionpolicy/default/ext-proc-policy/extproc/0"
      operation:
        op: add
        path: "/transport_socket"
        value:
          name: "envoy.transport_sockets.tls"
          typed_config:
            "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext"
            common_tls_context: {}
    - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"
      name: default/inference-gateway/llm-gw
      operation:
        op: replace
        path: "/virtual_hosts/0/routes/0/route/cluster"
        value: original_destination_cluster
# Comment the below to disable full duplex streaming
# NOTE: As of https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/552
# FULL_DUPLEX_STREAMED is the primary supported protocol for ext-proc. The buffered variant is no longer
# being actively developed, may be missing features/fixes, and will soon be removed.
    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: add
        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_body_mode"
        value: FULL_DUPLEX_STREAMED
    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: add
        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_trailer_mode"
        value: SEND
    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: add
        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_body_mode"
        value: FULL_DUPLEX_STREAMED
    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: replace
        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_trailer_mode"
        value: SEND
    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: replace
        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_header_mode"
        value: SEND
---
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: EnvoyExtensionPolicy
metadata:
  name: ext-proc-policy
  namespace: default
spec:
  extProc:
    - backendRefs:
      - group: ""
        kind: Service
        name: vllm-llama2-7b-epp
        port: 9002
      processingMode:
        allowModeOverride: true
        request:
          body: Buffered
        response:
      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly. 
      messageTimeout: 1000s
      backendSettings:
        circuitBreaker:
          maxConnections: 40000
          maxPendingRequests: 40000
          maxParallelRequests: 40000
        timeout:
          tcp:
            connectTimeout: 24h
  targetRef:
    group: gateway.networking.k8s.io
    kind: HTTPRoute
    name: llm-route