generated from kubernetes/kubernetes-template-project
-
Notifications
You must be signed in to change notification settings - Fork 69
/
Copy pathpatch_policy.yaml
123 lines (122 loc) · 4.88 KB
/
patch_policy.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# EnvoyPatchPolicy applying JSON patches to the xDS resources generated for the
# `inference-gateway` Gateway: it adds an ORIGINAL_DST cluster, configures TLS
# towards the ext-proc cluster, re-points the route at the new cluster, and
# switches the ext-proc processing mode to full-duplex streaming.
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: EnvoyPatchPolicy
metadata:
  name: custom-response-patch-policy
  namespace: default
spec:
  targetRef:
    group: gateway.networking.k8s.io
    kind: Gateway
    name: inference-gateway
  type: JSONPatch
  jsonPatches:
    # Creates a cluster of type ORIGINAL_DST, which is required for direct pod
    # scheduling; our scheduling relies on it heavily. Specifically, the
    # `original_dst_lb_config` field lets us enable `use_http_header` and set
    # `http_header_name`.
    # Source: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/cluster.proto
    - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"
      name: original_destination_cluster
      operation:
        op: add
        # An empty path targets the whole resource, i.e. the cluster itself.
        path: ""
        value:
          name: original_destination_cluster
          type: ORIGINAL_DST
          original_dst_lb_config:
            use_http_header: true
            http_header_name: "x-gateway-destination-endpoint"
          connect_timeout: 1000s
          lb_policy: CLUSTER_PROVIDED
          dns_lookup_family: V4_ONLY
          circuit_breakers:
            thresholds:
              - max_connections: 40000
                max_pending_requests: 40000
                max_requests: 40000
    # This ensures that Envoy accepts untrusted certificates. We tried to
    # explicitly set TrustChainVerification to ACCEPT_UNTRUSTED, but that did
    # not work; what worked was setting `common_tls_context` to empty.
    - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster"
      name: "envoyextensionpolicy/default/ext-proc-policy/extproc/0"
      operation:
        op: add
        path: "/transport_socket"
        value:
          name: "envoy.transport_sockets.tls"
          typed_config:
            "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext"
            common_tls_context: {}
    # Points the first route of the first virtual host at the ORIGINAL_DST
    # cluster added above.
    - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: replace
        path: "/virtual_hosts/0/routes/0/route/cluster"
        value: original_destination_cluster
    # Comment out the patches below to disable full-duplex streaming.
    # NOTE: As of https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/552
    # FULL_DUPLEX_STREAMED is the primary supported protocol for ext-proc. The
    # buffered variant is no longer being actively developed, may be missing
    # features/fixes, and will soon be removed.
    # NOTE(review): these paths assume the ext-proc filter is `http_filters/0`
    # of the first filter in the default filter chain - re-check the indices if
    # other filters are added to this listener.
    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: add
        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_body_mode"
        value: FULL_DUPLEX_STREAMED
    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: add
        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_trailer_mode"
        value: SEND
    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: add
        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_body_mode"
        value: FULL_DUPLEX_STREAMED
    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: replace
        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_trailer_mode"
        value: SEND
    - type: "type.googleapis.com/envoy.config.listener.v3.Listener"
      name: "default/inference-gateway/llm-gw"
      operation:
        op: replace
        path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_header_mode"
        value: SEND
---
# EnvoyExtensionPolicy attaching an external processor (the
# `vllm-llama2-7b-epp` Service on port 9002) to the `llm-route` HTTPRoute.
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: EnvoyExtensionPolicy
metadata:
  name: ext-proc-policy
  namespace: default
spec:
  extProc:
    - backendRefs:
        - group: ""
          kind: Service
          name: vllm-llama2-7b-epp
          port: 9002
      processingMode:
        allowModeOverride: true
        request:
          body: Buffered
        # Explicit null: the original left this key without a value, which
        # parses to the same thing.
        # NOTE(review): if response processing was meant to be enabled here,
        # this probably should be `response: {}` - confirm against the Envoy
        # Gateway ExtProc API. The EnvoyPatchPolicy in this file separately
        # patches the response header/body/trailer modes on the listener.
        response: null
      # The timeouts are likely not needed here. We can experiment with
      # removing/tuning them slowly.
      # The connection limits are more important: if they are not configured
      # correctly, Envoy Gateway fails with the opaque error
      # `ext_proc_gRPC_error_14`.
      messageTimeout: 1000s
      backendSettings:
        circuitBreaker:
          maxConnections: 40000
          maxPendingRequests: 40000
          maxParallelRequests: 40000
        timeout:
          tcp:
            connectTimeout: 24h
  targetRef:
    group: gateway.networking.k8s.io
    kind: HTTPRoute
    name: llm-route