Skip to content
This repository was archived by the owner on May 23, 2024. It is now read-only.

Commit 1bd309b

Browse files
authored
Nginx timeouts (#221)
* Add support for changing nginx proxy_read_timeout. * Fix space in log message. * Fix flake8 issue with string construction. * Fix dockerfiles for build all script. * Fix dockerfiles for build all script. * Update log message to include prior Nginx timeout. * Trigger build. * Remove old log message. * Trigger Build again.
1 parent a58583d commit 1bd309b

File tree

6 files changed

+202
-39
lines changed

6 files changed

+202
-39
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,15 @@ how long a Gunicorn worker may be silent before it is killed and restarted.
629629
# Defaults to 30.
630630
SAGEMAKER_GUNICORN_TIMEOUT_SECONDS="60"
631631
```
632+
[Configures](http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_read_timeout)
633+
the timeout for reading a response from the proxied server.
634+
Note: If SAGEMAKER_GUNICORN_TIMEOUT_SECONDS is greater,
635+
SAGEMAKER_NGINX_PROXY_READ_TIMEOUT_SECONDS will be set to the
636+
value of SAGEMAKER_GUNICORN_TIMEOUT_SECONDS.
637+
```bash
638+
# Defaults to 60.
639+
SAGEMAKER_NGINX_PROXY_READ_TIMEOUT_SECONDS="120"
640+
```
632641

633642
## Deploying to Multi-Model Endpoint
634643

docker/1.15/Dockerfile.gpu

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ ENV LANG=C.UTF-8
1616
ENV NCCL_VERSION=2.4.7-1+cuda10.0
1717
ENV CUDNN_VERSION=7.5.1.10-1+cuda10.0
1818
ENV TF_TENSORRT_VERSION=5.0.2
19+
ENV TF_TENSORRT_LIB_VERSION=5.1.2
1920
ENV PYTHONDONTWRITEBYTECODE=1
2021
# Python won’t try to write .pyc or .pyo files on the import of source modules
2122
ENV PYTHONUNBUFFERED=1
@@ -27,6 +28,21 @@ ENV MODEL_NAME=model
2728
# Prevent docker build from getting stopped by request for user interaction
2829
ENV DEBIAN_FRONTEND=noninteractive
2930

31+
# https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771
32+
# Fix cuda repo's GPG key. Nvidia is no longer updating the machine-learning repo.
33+
# Need to manually pull and install necessary debs to continue using these versions.
34+
RUN rm /etc/apt/sources.list.d/cuda.list \
35+
&& rm /etc/apt/sources.list.d/nvidia-ml.list \
36+
&& apt-key del 7fa2af80 \
37+
&& apt-get update && apt-get install -y --no-install-recommends wget \
38+
&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
39+
&& dpkg -i cuda-keyring_1.0-1_all.deb \
40+
&& wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/libcudnn7_${CUDNN_VERSION}_amd64.deb \
41+
&& dpkg -i libcudnn7_${CUDNN_VERSION}_amd64.deb \
42+
&& wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/libnccl2_${NCCL_VERSION}_amd64.deb \
43+
&& dpkg -i libnccl2_${NCCL_VERSION}_amd64.deb \
44+
&& rm *.deb
45+
3046
RUN apt-get update \
3147
&& apt-get install -y --no-install-recommends \
3248
ca-certificates \
@@ -36,8 +52,6 @@ RUN apt-get update \
3652
cuda-curand-10-0 \
3753
cuda-cusolver-10-0 \
3854
cuda-cusparse-10-0 \
39-
libcudnn7=${CUDNN_VERSION} \
40-
libnccl2=${NCCL_VERSION} \
4155
libgomp1 \
4256
curl \
4357
git \
@@ -49,25 +63,6 @@ RUN apt-get update \
4963
&& apt-get clean \
5064
&& rm -rf /var/lib/apt/lists/*
5165

52-
# The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-4.0.1-ga-cuda10.0
53-
# adds a new list which contains libnvinfer library, so it needs another
54-
# 'apt-get update' to retrieve that list before it can actually install the
55-
# library.
56-
# We don't install libnvinfer-dev since we don't need to build against TensorRT,
57-
# and libnvinfer4 doesn't contain libnvinfer.a static library.
58-
# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0.
59-
RUN apt-get update \
60-
&& apt-get install -y --no-install-recommends \
61-
nvinfer-runtime-trt-repo-ubuntu1804-${TF_TENSORRT_VERSION}-ga-cuda10.0 \
62-
&& apt-get update \
63-
&& apt-get install -y --no-install-recommends \
64-
libnvinfer5=${TF_TENSORRT_VERSION}-1+cuda10.0 \
65-
&& apt-get clean \
66-
&& rm -rf /var/lib/apt/lists/* \
67-
&& rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
68-
&& rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
69-
&& rm /usr/lib/x86_64-linux-gnu/libnvparsers*
70-
7166
RUN ${PIP} --no-cache-dir install --upgrade \
7267
pip \
7368
setuptools
@@ -106,6 +101,19 @@ RUN ${PIP} install -U --no-cache-dir \
106101
&& ${PIP} install --no-dependencies --no-cache-dir \
107102
tensorflow-serving-api-gpu==1.15.0
108103

104+
# https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771
105+
# Fix cuda repo's GPG key. Nvidia is no longer updating the machine-learning repo.
106+
# Need to manually pull and install necessary debs to continue using these versions.
107+
RUN wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvinfer-runtime-trt-repo-ubuntu1804-${TF_TENSORRT_VERSION}-ga-cuda10.0_1-1_amd64.deb \
108+
&& dpkg -i nvinfer-runtime-trt-repo-ubuntu1804-${TF_TENSORRT_VERSION}-ga-cuda10.0_1-1_amd64.deb \
109+
&& wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/libnvinfer5_${TF_TENSORRT_LIB_VERSION}-1+cuda10.0_amd64.deb \
110+
&& dpkg -i libnvinfer5_${TF_TENSORRT_LIB_VERSION}-1+cuda10.0_amd64.deb \
111+
&& rm *.deb \
112+
&& rm -rf /var/lib/apt/lists/* \
113+
&& rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
114+
&& rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
115+
&& rm /usr/lib/x86_64-linux-gnu/libnvparsers*
116+
109117
COPY sagemaker /sagemaker
110118

111119
RUN curl ${TF_MODEL_SERVER_SOURCE} -o /usr/bin/tensorflow_model_server \

docker/2.1/Dockerfile.gpu

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ ARG TFS_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.1/Serving/GPU/te
1010

1111
ENV NCCL_VERSION=2.4.7-1+cuda10.1
1212
ENV CUDNN_VERSION=7.6.2.24-1+cuda10.1
13-
ENV TF_TENSORRT_VERSION=6.0.1
13+
ENV TF_TENSORRT_VERSION=5.0.2
14+
ENV TF_TENSORRT_LIB_VERSION=6.0.1
1415

1516
# See http://bugs.python.org/issue19846
1617
ENV LANG=C.UTF-8
@@ -25,6 +26,21 @@ ENV MODEL_NAME=model
2526
# Fix for the interactive mode during an install in step 21
2627
ENV DEBIAN_FRONTEND=noninteractive
2728

29+
# https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771
30+
# Fix cuda repo's GPG key. Nvidia is no longer updating the machine-learning repo.
31+
# Need to manually pull and install necessary debs to continue using these versions.
32+
RUN rm /etc/apt/sources.list.d/cuda.list \
33+
&& rm /etc/apt/sources.list.d/nvidia-ml.list \
34+
&& apt-key del 7fa2af80 \
35+
&& apt-get update && apt-get install -y --no-install-recommends wget \
36+
&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
37+
&& dpkg -i cuda-keyring_1.0-1_all.deb \
38+
&& wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/libcudnn7_${CUDNN_VERSION}_amd64.deb \
39+
&& dpkg -i libcudnn7_${CUDNN_VERSION}_amd64.deb \
40+
&& wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/libnccl2_${NCCL_VERSION}_amd64.deb \
41+
&& dpkg -i libnccl2_${NCCL_VERSION}_amd64.deb \
42+
&& rm *.deb
43+
2844
# allow unauthenticated and allow downgrades for special libcublas library
2945
RUN apt-get update \
3046
&& apt-get install -y --no-install-recommends --allow-unauthenticated --allow-downgrades\
@@ -37,8 +53,6 @@ RUN apt-get update \
3753
#cuda-cublas-dev not available with 10-1, install libcublas instead
3854
libcublas10=10.1.0.105-1 \
3955
libcublas-dev=10.1.0.105-1 \
40-
libcudnn7=${CUDNN_VERSION} \
41-
libnccl2=${NCCL_VERSION} \
4256
libgomp1 \
4357
curl \
4458
git \
@@ -52,21 +66,6 @@ RUN apt-get update \
5266
&& apt-get clean \
5367
&& rm -rf /var/lib/apt/lists/*
5468

55-
# The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-4.0.1-ga-cuda10.0
56-
# adds a new list which contains libnvinfer library, so it needs another
57-
# 'apt-get update' to retrieve that list before it can actually install the
58-
# library.
59-
# We don't install libnvinfer-dev since we don't need to build against TensorRT,
60-
# and libnvinfer4 doesn't contain libnvinfer.a static library.
61-
RUN apt-get update \
62-
# nvinfer-runtime-trt-repo doesn't have a 1804-cuda10.1 version yet. see:
63-
# https://developer.download.nvidia.cn/compute/machine-learning/repos/ubuntu1804/x86_64/
64-
&& apt-get install -y --no-install-recommends nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
65-
&& apt-get update \
66-
&& apt-get install -y --no-install-recommends libnvinfer6=${TF_TENSORRT_VERSION}-1+cuda10.1 \
67-
&& apt-get clean \
68-
&& rm -rf /var/lib/apt/lists/*
69-
7069
RUN ${PIP} --no-cache-dir install --upgrade \
7170
pip \
7271
setuptools
@@ -88,6 +87,17 @@ RUN apt-get update \
8887
&& apt-get clean \
8988
&& rm -rf /var/lib/apt/lists/*
9089

90+
# https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771
91+
# Nvidia is no longer updating the machine-learning repo.
92+
# Need to manually pull and install necessary debs to continue using these versions.
93+
# nvinfer-runtime-trt-repo doesn't have a 1804-cuda10.1 version.
94+
RUN wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvinfer-runtime-trt-repo-ubuntu1804-${TF_TENSORRT_VERSION}-ga-cuda10.0_1-1_amd64.deb \
95+
&& dpkg -i nvinfer-runtime-trt-repo-ubuntu1804-${TF_TENSORRT_VERSION}-ga-cuda10.0_1-1_amd64.deb \
96+
&& wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/libnvinfer6_${TF_TENSORRT_LIB_VERSION}-1+cuda10.1_amd64.deb \
97+
&& dpkg -i libnvinfer6_${TF_TENSORRT_LIB_VERSION}-1+cuda10.1_amd64.deb \
98+
&& rm *.deb \
99+
&& rm -rf /var/lib/apt/lists/*
100+
91101
# cython, falcon, gunicorn, grpc
92102
RUN ${PIP} install -U --no-cache-dir \
93103
boto3 \

docker/build_artifacts/sagemaker/nginx.conf.template

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ http {
1717
access_log /dev/stdout combined;
1818
js_import tensorflowServing.js;
1919

20+
proxy_read_timeout %PROXY_READ_TIMEOUT%;
21+
2022
upstream tfs_upstream {
2123
%TFS_UPSTREAM%;
2224
}

docker/build_artifacts/sagemaker/serve.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,20 @@ def __init__(self):
6767
self._gunicorn_timeout_seconds = int(
6868
os.environ.get("SAGEMAKER_GUNICORN_TIMEOUT_SECONDS", 30)
6969
)
70+
self._nginx_proxy_read_timeout_seconds = int(
71+
os.environ.get("SAGEMAKER_NGINX_PROXY_READ_TIMEOUT_SECONDS", 60))
72+
73+
# Nginx proxy read timeout should not be less than the GUnicorn timeout. If it is, this
74+
# can result in upstream time out errors.
75+
if self._gunicorn_timeout_seconds > self._nginx_proxy_read_timeout_seconds:
76+
log.info(
77+
"GUnicorn timeout was higher than Nginx proxy read timeout."
78+
" Setting Nginx proxy read timeout from {} seconds to {} seconds"
79+
" to match GUnicorn timeout.".format(
80+
self._nginx_proxy_read_timeout_seconds, self._gunicorn_timeout_seconds
81+
)
82+
)
83+
self._nginx_proxy_read_timeout_seconds = self._gunicorn_timeout_seconds
7084

7185
if os.environ.get("OMP_NUM_THREADS") is None:
7286
os.environ["OMP_NUM_THREADS"] = "1"
@@ -270,6 +284,7 @@ def _create_nginx_config(self):
270284
"FORWARD_INVOCATION_REQUESTS": GUNICORN_INVOCATIONS
271285
if self._use_gunicorn
272286
else JS_INVOCATIONS,
287+
"PROXY_READ_TIMEOUT": str(self._nginx_proxy_read_timeout_seconds),
273288
}
274289

275290
config = pattern.sub(lambda x: template_values[x.group(1)], template)
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Copyright 2019-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
14+
import os
15+
import subprocess
16+
17+
import pytest
18+
19+
20+
@pytest.fixture(scope="session", autouse=True)
21+
def volume():
22+
try:
23+
model_dir = os.path.abspath("test/resources/models")
24+
subprocess.check_call(
25+
"docker volume create --name nginx_model_volume --opt type=none "
26+
"--opt device={} --opt o=bind".format(model_dir).split()
27+
)
28+
yield model_dir
29+
finally:
30+
subprocess.check_call("docker volume rm nginx_model_volume".split())
31+
32+
33+
def test_run_nginx_with_default_parameters(docker_base_name, tag, runtime_config):
34+
try:
35+
command = (
36+
"docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080"
37+
" --mount type=volume,source=nginx_model_volume,target=/opt/ml/model,readonly"
38+
" {}:{} serve"
39+
).format(runtime_config, docker_base_name, tag)
40+
41+
proc = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
42+
43+
lines_seen = {
44+
"error_log /dev/stderr error;": 0,
45+
"proxy_read_timeout 60;": 0,
46+
}
47+
48+
for stdout_line in iter(proc.stdout.readline, ""):
49+
stdout_line = str(stdout_line)
50+
for line in lines_seen.keys():
51+
if line in stdout_line:
52+
lines_seen[line] += 1
53+
if "started nginx" in stdout_line:
54+
for value in lines_seen.values():
55+
assert value == 1
56+
break
57+
58+
finally:
59+
subprocess.check_call("docker rm -f sagemaker-tensorflow-serving-test".split())
60+
61+
62+
def test_run_nginx_with_env_var_parameters(docker_base_name, tag, runtime_config):
63+
try:
64+
command = (
65+
"docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080"
66+
" --mount type=volume,source=nginx_model_volume,target=/opt/ml/model,readonly"
67+
" -e SAGEMAKER_TFS_NGINX_LOGLEVEL=info"
68+
" -e SAGEMAKER_NGINX_PROXY_READ_TIMEOUT_SECONDS=63"
69+
" {}:{} serve"
70+
).format(runtime_config, docker_base_name, tag)
71+
72+
proc = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
73+
74+
lines_seen = {
75+
"error_log /dev/stderr info;": 0,
76+
"proxy_read_timeout 63;": 0,
77+
}
78+
79+
for stdout_line in iter(proc.stdout.readline, ""):
80+
stdout_line = str(stdout_line)
81+
for line in lines_seen.keys():
82+
if line in stdout_line:
83+
lines_seen[line] += 1
84+
if "started nginx" in stdout_line:
85+
for value in lines_seen.values():
86+
assert value == 1
87+
break
88+
89+
finally:
90+
subprocess.check_call("docker rm -f sagemaker-tensorflow-serving-test".split())
91+
92+
def test_run_nginx_with_higher_gunicorn_parameter(docker_base_name, tag, runtime_config):
93+
try:
94+
command = (
95+
"docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080"
96+
" --mount type=volume,source=nginx_model_volume,target=/opt/ml/model,readonly"
97+
" -e SAGEMAKER_NGINX_PROXY_READ_TIMEOUT_SECONDS=60"
98+
" -e SAGEMAKER_GUNICORN_TIMEOUT_SECONDS=120"
99+
" {}:{} serve"
100+
).format(runtime_config, docker_base_name, tag)
101+
102+
proc = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
103+
104+
lines_seen = {
105+
"proxy_read_timeout 120;": 0, # When GUnicorn is higher, set timeout to match.
106+
}
107+
108+
for stdout_line in iter(proc.stdout.readline, ""):
109+
stdout_line = str(stdout_line)
110+
for line in lines_seen.keys():
111+
if line in stdout_line:
112+
lines_seen[line] += 1
113+
if "started nginx" in stdout_line:
114+
for value in lines_seen.values():
115+
assert value == 1
116+
break
117+
118+
finally:
119+
subprocess.check_call("docker rm -f sagemaker-tensorflow-serving-test".split())

0 commit comments

Comments (0)