kubernetes-sigs · k8s-ci-robot · Mar 25, 2025 · Mar 20, 2025 · hzxuzhonghu · Mar 25, 2025
diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml
@@ -46,26 +46,93 @@ spec:
             - containerPort: 8000
               name: http
               protocol: TCP
+          lifecycle:
+            preStop:
+              # vLLM stops accepting connections when it receives SIGTERM, so we need to sleep
+              # to give upstream gateways a chance to take us out of rotation. The time we wait
+              # is dependent on the time it takes for all upstreams to completely remove us from
+              # rotation. Older or simpler load balancers might take upwards of 30s, but we expect
+              # our deployment to run behind a modern gateway like Envoy which is designed to 
+              # probe for readiness aggressively.
+              sleep:
+                # Upstream gateway probers for health should be set on a low period, such as 5s,
+                # and the shorter we can tighten that bound the faster that we release
+                # accelerators during controlled shutdowns. However, we should expect variance,
+                # as load balancers may have internal delays, and we don't want to drop requests
+                # normally, so we're often aiming to set this value to a p99 propagation latency
+                # of readiness -> load balancer taking backend out of rotation, not the average.
+                # 
+                # This value is generally stable and must often be experimentally determined on
+                # for a given load balancer and health check period. We set the value here to
+                # the highest value we observe on a supported load balancer, and we recommend
+                # tuning this value down and verifying no requests are dropped.
+                #
+                # If this value is updated, be sure to update terminationGracePeriodSeconds.
+                #
+                seconds: 30
+              #
+              # IMPORTANT: preStop.sleep is beta as of Kubernetes 1.30 - for older versions
+              # replace with this exec action.
+              #exec:
+              #  command:
+              #  - /usr/bin/sleep
+              #  - 30
           livenessProbe:
-            failureThreshold: 240
             httpGet:
               path: /health
               port: http
               scheme: HTTP
-            initialDelaySeconds: 5
-            periodSeconds: 5
+            # vLLM's health check is simple, so we can more aggressively probe it.  Liveness
+            # check endpoints should always be suitable for aggressive probing.
+            periodSeconds: 1
             successThreshold: 1
+            # vLLM has a very simple health implementation, which means that any failure is
+            # likely significant. However, any liveness triggered restart requires the very
+            # large core model to be reloaded, and so we should bias towards ensuring the
+            # server is definitely unhealthy vs immediately restarting. Use 5 attempts as
+            # evidence of a serious problem.
+            failureThreshold: 5
             timeoutSeconds: 1
           readinessProbe:
-            failureThreshold: 600
             httpGet:
               path: /health
               port: http
               scheme: HTTP
-            initialDelaySeconds: 5
-            periodSeconds: 5
+            # vLLM's health check is simple, so we can more aggressively probe it.  Readiness
+            # check endpoints should always be suitable for aggressive probing, but may be
+            # slightly more expensive than readiness probes.
+            periodSeconds: 1
             successThreshold: 1
+            # vLLM has a very simple health implementation, which means that any failure is
+            # likely significant,
+            failureThreshold: 1
             timeoutSeconds: 1
+          # We set a startup probe so that we don't begin directing traffic or checking
+          # liveness to this instance until the model is loaded.
+          startupProbe:
+            # Failure threshold is when we believe startup will not happen at all, and is set
+            # to the maximum possible time we believe loading a model will take. In our
+            # default configuration we are downloading a model from HuggingFace, which may
+            # take a long time, then the model must load into the accelerator. We choose
+            # 10 minutes as a reasonable maximum startup time before giving up and attempting
+            # to restart the pod.
+            #
+            # IMPORTANT: If the core model takes more than 10 minutes to load, pods will crash
+            # loop forever. Be sure to set this appropriately.
+            failureThreshold: 600
+            # Set delay to start low so that if the base model changes to something smaller
+            # or an optimization is deployed, we don't wait unneccesarily.
+            initialDelaySeconds: 2
+            # As a startup probe, this stops running and so we can more aggressively probe
+            # even a moderately complex startup - this is a very important workload.
+            periodSeconds: 1
+            httpGet:
+              # vLLM does not start the OpenAI server (and hence make /health available)
+              # until models are loaded. This may not be true for all model servers.
+              path: /health
+              port: http
+              scheme: HTTP
+
           resources:
             limits:
               nvidia.com/gpu: 1
@@ -92,8 +159,71 @@ spec:
           - name: config-volume
             mountPath:  /config
       restartPolicy: Always
-      schedulerName: default-scheduler
-      terminationGracePeriodSeconds: 30
+
+      # vLLM allows VLLM_PORT to be specified as an environment variable, but a user might
+      # create a 'vllm' service in their namespace. That auto-injects VLLM_PORT in docker
+      # compatible form as `tcp://<IP>:<PORT>` instead of the numeric value vLLM accepts
+      # causing CrashLoopBackoff. Set service environment injection off by default.
+      enableServiceLinks: false
+
+      # Generally, the termination grace period needs to last longer than the slowest request
+      # we expect to serve plus any extra time spent waiting for load balancers to take the
+      # model server out of rotation.
+      #
+      # An easy starting point is the p99 or max request latency measured for your workload,
+      # although LLM request latencies vary significantly if clients send longer inputs or
+      # trigger longer outputs. Since steady state p99 will be higher than the latency
+      # to drain a server, you may wish to slightly this value either experimentally or
+      # via the calculation below.
+      #
+      # For most models you can derive an upper bound for the maximum drain latency as
+      # follows:
+      # 
+      #   1. Identify the maximum context length the model was trained on, or the maximum
+      #      allowed length of output tokens configured on vLLM (llama2-7b was trained to
+      #      4k context length, while llama3-8b was trained to 128k).
+      #   2. Output tokens are the more compute intensive to calculate and the accelerator
+      #      will have a maximum concurrency (batch size) - the time per output token at
+      #      maximum batch with no prompt tokens being processed is the slowest an output
+      #      token can be generated (for this model it would be about 100ms TPOT at a max
+      #      batch size around 50)
+      #   3. Calculate the worst case request duration if a request starts immediately
+      #      before the server stops accepting new connections - generally when it receives
+      #      SIGTERM (for this model that is about 4096 / 10 ~ 40s)
+      #   4. If there are any requests generating prompt tokens that will delay when those
+      #      output tokens start, and prompt token generation is roughly 6x faster than
+      #      compute-bound output token generation, so add 20% to the time from above (40s + 
+      #      16s ~ 55s)
+      #
+      # Thus we think it will take us at worst about 55s to complete the longest possible
+      # request the model is likely to receive at maximum concurrency (highest latency)
+      # once requests stop being sent.
+      #
+      # NOTE: This number will be lower than steady state p99 latency since we stop receiving
+      #       new requests which require continuous prompt token computation.
+      # NOTE: The max timeout for backend connections from gateway to model servers should
+      #       be configured based on steady state p99 latency, not drain p99 latency
+      #
+      #   5. Add the time the pod takes in its preStop hook to allow the load balancers have
+      #      stopped sending us new requests (55s + 30s ~ 85s)
+      #
+      # Because termination grace period controls when the Kubelet forcibly terminates a
+      # stuck or hung process (a possibility due to a GPU crash), there is operational safety
+      # in keeping the value roughly proportional to the time to finish serving. There is also
+      # value in adding a bit of extra time to deal with unexpectedly long workloads.
+      #   
+      #   6. Add a 50% safety buffer to this time since the operational impact should be low
+      #      (85s * 1.5 ~ 130s)
+      #
+      # One additional source of drain latency is that some workloads may run close to
+      # saturation and have queued requests on each server. Since traffic in excess of the
+      # max sustainable QPS will result in timeouts as the queues grow, we assume that failure
+      # to drain in time due to excess queues at the time of shutdown is an expected failure
+      # mode of server overload. If your workload occasionally experiences high queue depths
+      # due to periodic traffic, consider increasing the safety margin above to account for
+      # time to drain queued requests.
+      terminationGracePeriodSeconds: 130
+
       volumes:
         - name: data
           emptyDir: {}