Skip to content

Commit c4adde5

Browse files
saimiduakhilmehraElizaaaaaowen-takartsky
authored andcommitted
update: Release TF 1.15.0 dockerfiles (#264)
* Add TF 1.15 dockerfiles and changes to entrypoint Co-authored-by: akhilmehra <[email protected]> Co-authored-by: ElizaZh <[email protected]> Co-authored-by: Owen Thomas <[email protected]> Co-authored-by: Kartik Kalamadi <[email protected]> Co-authored-by: Arjuna Keshavan <[email protected]> Co-authored-by: akhilmehra <[email protected]> Co-authored-by: ElizaZh <[email protected]> Co-authored-by: Owen Thomas <[email protected]> Co-authored-by: Kartik Kalamadi <[email protected]> Co-authored-by: Arjuna Keshavan <[email protected]>
1 parent 57aef72 commit c4adde5

File tree

10 files changed

+950
-3
lines changed

10 files changed

+950
-3
lines changed

docker/1.15.0/py2/Dockerfile.cpu

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
FROM ubuntu:18.04
2+
3+
LABEL maintainer="Amazon AI"
4+
5+
# Prevent the docker build from being stopped by prompts for user interaction
6+
ENV DEBIAN_FRONTEND=noninteractive
7+
ENV DEBCONF_NONINTERACTIVE_SEEN=true
8+
# Set environment variables for MKL
9+
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
10+
ENV KMP_AFFINITY=granularity=fine,compact,1,0
11+
ENV KMP_BLOCKTIME=1
12+
ENV KMP_SETTINGS=0
13+
# Python won’t try to write .pyc or .pyo files on the import of source modules
14+
ENV PYTHONDONTWRITEBYTECODE=1
15+
ENV PYTHONUNBUFFERED=1
16+
# See http://bugs.python.org/issue19846
17+
ENV PYTHONIOENCODING=UTF-8
18+
ENV LANG=C.UTF-8
19+
ENV LC_ALL=C.UTF-8
20+
# Specify the location of module that contains the training logic for SageMaker
21+
# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
22+
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
23+
24+
# Define framework-related package sources
25+
ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz
26+
ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl
27+
28+
RUN apt-get update \
29+
&& apt-get install -y --no-install-recommends \
30+
software-properties-common \
31+
build-essential \
32+
openssh-client \
33+
openssh-server \
34+
ca-certificates \
35+
curl \
36+
git \
37+
wget \
38+
vim \
39+
zlib1g-dev \
40+
&& rm -rf /var/lib/apt/lists/*
41+
42+
# Install Open MPI
43+
RUN mkdir /tmp/openmpi \
44+
&& cd /tmp/openmpi \
45+
&& curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
46+
&& tar zxf openmpi-4.0.1.tar.gz \
47+
&& cd openmpi-4.0.1 \
48+
&& ./configure --enable-orterun-prefix-by-default \
49+
&& make -j $(nproc) all \
50+
&& make install \
51+
&& ldconfig \
52+
&& rm -rf /tmp/openmpi
53+
54+
# Create a wrapper for OpenMPI to allow running as root by default
55+
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
56+
&& echo '#!/bin/bash' > /usr/local/bin/mpirun \
57+
&& echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
58+
&& chmod a+x /usr/local/bin/mpirun
59+
60+
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
61+
&& echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
62+
63+
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
64+
ENV PATH=/usr/local/openmpi/bin/:$PATH
65+
66+
# SSH login fix. Otherwise user is kicked off after login
67+
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
68+
69+
# Create SSH key.
70+
RUN mkdir -p /root/.ssh/ \
71+
&& mkdir -p /var/run/sshd \
72+
&& ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
73+
&& cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
74+
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
75+
76+
WORKDIR /
77+
78+
# Install the Python 2 interpreter and pip. The package index is refreshed and
# removed again inside this same layer so the apt lists are not stored in the image.
# NOTE(review): recommended packages are left enabled as in the original; confirm
# that nothing later in the build relies on them before adding --no-install-recommends.
RUN apt-get update \
 && apt-get install -y \
    python \
    python-pip \
 && rm -rf /var/lib/apt/lists/*
82+
83+
COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
84+
85+
RUN pip --no-cache-dir install --upgrade \
86+
pip \
87+
setuptools
88+
89+
# Some TF tools expect a "python" binary
90+
RUN ln -s $(which python) /usr/local/bin/python
91+
92+
RUN pip install --no-cache-dir -U \
93+
numpy==1.16.5 \
94+
scipy==1.2.2 \
95+
scikit-learn==0.20.3 \
96+
pandas==0.24.2 \
97+
Pillow==6.2.1 \
98+
h5py==2.9.0 \
99+
keras_applications==1.0.8 \
100+
keras_preprocessing==1.1.0 \
101+
requests==2.22.0 \
102+
keras==2.3.1 \
103+
# botocore requires python-dateutil<2.8.1
104+
"python-dateutil<2.8.1" \
105+
awscli==1.16.296 \
106+
mpi4py==3.0.2 \
107+
"cryptography>=2.3" \
108+
"sagemaker-tensorflow>=1.15,<1.16" \
109+
# Install TensorFlow separately at the end so that its version is not overwritten
110+
&& pip install --force-reinstall --no-cache-dir -U \
111+
${TF_URL} \
112+
&& pip install --no-cache-dir -U \
113+
$FRAMEWORK_SUPPORT_INSTALLABLE \
114+
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \
115+
&& pip install --no-cache-dir -U \
116+
# awscli requires PyYAML<5.2
117+
"PyYAML<5.2" \
118+
horovod==0.18.2
119+
120+
COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
121+
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
122+
123+
RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \
124+
&& chmod +x /usr/local/bin/deep_learning_container.py
125+
126+
# Download the license text into the image root. -f makes curl exit non-zero on an
# HTTP error so the build fails instead of saving an error page as /license.txt;
# -sS hides the progress meter but still prints errors; -L follows redirects.
RUN curl -fSsL https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
127+
128+
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
129+
# Default command handed to the entrypoint script. An absolute path is used so the
# shell is found regardless of the container's working directory.
CMD ["/bin/bash"]

docker/1.15.0/py2/Dockerfile.gpu

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0.
2+
# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/
3+
FROM nvidia/cuda:10.0-base-ubuntu18.04
4+
5+
LABEL maintainer="Amazon AI"
6+
7+
# Prevent the docker build from being stopped by prompts for user interaction
8+
ENV DEBIAN_FRONTEND=noninteractive
9+
ENV DEBCONF_NONINTERACTIVE_SEEN=true
10+
# Python won’t try to write .pyc or .pyo files on the import of source modules
11+
ENV PYTHONDONTWRITEBYTECODE=1
12+
ENV PYTHONUNBUFFERED=1
13+
# See http://bugs.python.org/issue19846
14+
ENV PYTHONIOENCODING=UTF-8
15+
ENV LANG=C.UTF-8
16+
ENV LC_ALL=C.UTF-8
17+
# Specify the location of module that contains the training logic for SageMaker
18+
# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
19+
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
20+
21+
# Define framework-related package sources
22+
ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz
23+
ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl
24+
25+
RUN apt-get update \
26+
&& apt-get install -y --no-install-recommends --allow-unauthenticated \
27+
ca-certificates \
28+
cuda-command-line-tools-10-0 \
29+
cuda-cublas-dev-10-0 \
30+
cuda-cudart-dev-10-0 \
31+
cuda-cufft-dev-10-0 \
32+
cuda-curand-dev-10-0 \
33+
cuda-cusolver-dev-10-0 \
34+
cuda-cusparse-dev-10-0 \
35+
curl \
36+
libcudnn7=7.5.1.10-1+cuda10.0 \
37+
# TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
38+
libnccl2=2.4.7-1+cuda10.0 \
39+
libgomp1 \
40+
libnccl-dev=2.4.7-1+cuda10.0 \
41+
libfreetype6-dev \
42+
libhdf5-serial-dev \
43+
libpng-dev \
44+
libzmq3-dev \
45+
git \
46+
wget \
47+
vim \
48+
build-essential \
49+
openssh-client \
50+
openssh-server \
51+
zlib1g-dev \
52+
# The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0
53+
# adds a new list which contains libnvinfer library, so it needs another
54+
# 'apt-get update' to retrieve that list before it can actually install the library.
55+
# We don't install libnvinfer-dev since we don't need to build against TensorRT,
56+
# and libnvinfer4 doesn't contain libnvinfer.a static library.
57+
&& apt-get update \
58+
&& apt-get install -y --no-install-recommends --allow-unauthenticated \
59+
nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
60+
&& apt-get update \
61+
&& apt-get install -y --no-install-recommends --allow-unauthenticated \
62+
libnvinfer5=5.0.2-1+cuda10.0 \
63+
&& rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
64+
&& rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
65+
&& rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
66+
&& rm -rf /var/lib/apt/lists/* \
67+
&& mkdir -p /var/run/sshd
68+
69+
# Install Open MPI
70+
RUN mkdir /tmp/openmpi \
71+
&& cd /tmp/openmpi \
72+
&& curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
73+
&& tar zxf openmpi-4.0.1.tar.gz \
74+
&& cd openmpi-4.0.1 \
75+
&& ./configure --enable-orterun-prefix-by-default \
76+
&& make -j $(nproc) all \
77+
&& make install \
78+
&& ldconfig \
79+
&& rm -rf /tmp/openmpi
80+
81+
# Install the Python 2 interpreter and pip. The package index is refreshed and
# removed again inside this same layer so the apt lists are not stored in the image.
# NOTE(review): recommended packages are left enabled as in the original; confirm
# that nothing later in the build relies on them before adding --no-install-recommends.
RUN apt-get update \
 && apt-get install -y \
    python \
    python-pip \
 && rm -rf /var/lib/apt/lists/*
85+
86+
# Create a wrapper for OpenMPI to allow running as root by default
87+
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
88+
&& echo '#!/bin/bash' > /usr/local/bin/mpirun \
89+
&& echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
90+
&& chmod a+x /usr/local/bin/mpirun
91+
92+
# Configure OpenMPI to run with good defaults:
93+
# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
94+
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
95+
&& echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
96+
97+
# Set default NCCL parameters
98+
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
99+
100+
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
101+
# Prepend the Open MPI bin directory to PATH. Written in key=value form, matching
# the other ENV instructions in this file (the space-separated form is legacy).
ENV PATH=/usr/local/openmpi/bin/:$PATH
102+
ENV PATH=/usr/local/nvidia/bin:$PATH
103+
104+
# SSH login fix. Otherwise user is kicked off after login
105+
RUN mkdir -p /var/run/sshd \
106+
&& sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
107+
108+
# Create SSH key.
109+
RUN mkdir -p /root/.ssh/ \
110+
&& ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
111+
&& cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
112+
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
113+
114+
WORKDIR /
115+
116+
RUN pip --no-cache-dir install --upgrade \
117+
pip \
118+
setuptools
119+
120+
# Some TF tools expect a "python" binary
121+
RUN ln -s $(which python) /usr/local/bin/python
122+
123+
COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
124+
125+
RUN pip install --no-cache-dir -U \
126+
numpy==1.16.5 \
127+
scipy==1.2.2 \
128+
scikit-learn==0.20.3 \
129+
pandas==0.24.2 \
130+
Pillow==6.2.1 \
131+
h5py==2.9.0 \
132+
keras_applications==1.0.8 \
133+
keras_preprocessing==1.1.0 \
134+
requests==2.22.0 \
135+
keras==2.3.1 \
136+
# botocore requires python-dateutil<2.8.1
137+
"python-dateutil<2.8.1" \
138+
awscli==1.16.296 \
139+
mpi4py==3.0.2 \
140+
"cryptography>=2.3" \
141+
"sagemaker-tensorflow>=1.15,<1.16" \
142+
# Install TensorFlow separately at the end so that its version is not overwritten
143+
&& pip install --force-reinstall --no-cache-dir -U \
144+
${TF_URL} \
145+
&& pip install --no-cache-dir -U \
146+
$FRAMEWORK_SUPPORT_INSTALLABLE \
147+
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
148+
149+
# Install Horovod, temporarily using CUDA stubs
150+
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
151+
&& HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \
152+
# awscli requires PyYAML<5.2
153+
"PyYAML<5.2" \
154+
horovod==0.18.2 \
155+
&& ldconfig
156+
157+
# Allow OpenSSH to talk to containers without asking for confirmation
158+
# Rebuild the client config: copy every line that does not mention
# StrictHostKeyChecking into a temporary file, append a single "no" setting,
# then move the temporary file over the original.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new \
 && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
 && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
161+
162+
COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
163+
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
164+
165+
RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \
166+
&& chmod +x /usr/local/bin/deep_learning_container.py
167+
168+
# Download the license text into the image root. -f makes curl exit non-zero on an
# HTTP error so the build fails instead of saving an error page as /license.txt;
# -sS hides the progress meter but still prints errors; -L follows redirects.
RUN curl -fSsL https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
169+
170+
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
171+
# Default command handed to the entrypoint script. An absolute path is used so the
# shell is found regardless of the container's working directory.
CMD ["/bin/bash"]
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
15+
import os.path
16+
import shlex
17+
import subprocess
18+
import sys
19+
20+
# When the SageMaker input config directory is absent, launch the helper script
# alongside the main command.
if not os.path.exists("/opt/ml/input/config"):
    # The argument list is executed without a shell, so shell tokens such as
    # '&>/dev/null' and '&' would be passed to the script as plain arguments.
    # Popen returns immediately (the helper runs in the background) and both
    # output streams are redirected to the null device explicitly.
    with open(os.devnull, 'w') as devnull:
        subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'],
                         stdout=devnull, stderr=devnull)

# Run the command given on the container command line; check_call raises
# CalledProcessError if it exits with a non-zero status.
subprocess.check_call(shlex.split(' '.join(sys.argv[1:])))

0 commit comments

Comments
 (0)