|
# NVIDIA does not publish a TensorRT runtime library for Ubuntu 18.04 with CUDA 10.1
# support, so this image stays on CUDA 10.0. The available packages are listed at:
# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/
FROM nvidia/cuda:10.0-base-ubuntu18.04

# Record the image maintainer as image metadata.
LABEL maintainer="Amazon AI"
| 6 | + |
# apt/debconf: keep package installation from stopping the build to ask for input.
ENV DEBIAN_FRONTEND=noninteractive \
    DEBCONF_NONINTERACTIVE_SEEN=true

# Python: do not write .pyc/.pyo files when importing source modules, keep the
# standard streams unbuffered, and force UTF-8 for stdio
# (see http://bugs.python.org/issue19846).
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8

# Locale: use the C locale with UTF-8 encoding.
ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# SageMaker: location of the module that contains the training logic.
# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
| 20 | + |
# Define framework-related package sources (both can be overridden with --build-arg).
# FRAMEWORK_SUPPORT_INSTALLABLE: glob matching the sagemaker_tensorflow_container
#   source archive in the build context; it is COPY'd into the image, installed
#   with pip and then removed further down.
# TF_URL: URL of the TensorFlow GPU 1.15.0 wheel (cp27) that pip installs.
ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz
ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl
| 24 | + |
# Install the CUDA 10.0 development libraries, cuDNN, NCCL, the TensorRT runtime
# and general build/SSH tooling in one layer, then clean the APT index in that
# same layer so it does not end up in the image.
# NOTE(review): --allow-unauthenticated disables APT signature verification. It is
# kept because the original build relies on it; confirm whether it is still needed.
RUN apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
    build-essential \
    ca-certificates \
    cuda-command-line-tools-10-0 \
    cuda-cublas-dev-10-0 \
    cuda-cudart-dev-10-0 \
    cuda-cufft-dev-10-0 \
    cuda-curand-dev-10-0 \
    cuda-cusolver-dev-10-0 \
    cuda-cusparse-dev-10-0 \
    curl \
    git \
    libcudnn7=7.5.1.10-1+cuda10.0 \
    libfreetype6-dev \
    libgomp1 \
    libhdf5-serial-dev \
    # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
    libnccl-dev=2.4.7-1+cuda10.0 \
    libnccl2=2.4.7-1+cuda10.0 \
    libpng-dev \
    libzmq3-dev \
    openssh-client \
    openssh-server \
    vim \
    wget \
    zlib1g-dev \
 # Installing nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 adds a new APT
 # list that contains the libnvinfer library, so one more 'apt-get update' is
 # needed afterwards before the library itself can be installed. No refresh is
 # needed before this install: the index fetched at the top of this RUN is current.
 # libnvinfer-dev is not installed since nothing here builds against TensorRT.
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
    nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
 && apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
    libnvinfer5=5.0.2-1+cuda10.0 \
 # Remove the TensorRT plugin and parser libraries that came with the package.
 && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
 && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
 && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
 && rm -rf /var/lib/apt/lists/* \
 # Runtime directory for the OpenSSH server installed above.
 && mkdir -p /var/run/sshd
| 68 | + |
# Build Open MPI 4.0.1 from the upstream source tarball and install it; the
# scratch directory is deleted in the same layer so the sources and build tree
# do not stay in the image.
RUN mkdir /tmp/openmpi \
 && curl -fSsL -o /tmp/openmpi/openmpi-4.0.1.tar.gz \
      https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
 && tar -xzf /tmp/openmpi/openmpi-4.0.1.tar.gz -C /tmp/openmpi \
 && cd /tmp/openmpi/openmpi-4.0.1 \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j "$(nproc)" all \
 && make install \
 # Refresh the dynamic linker cache so the newly installed libraries are found.
 && ldconfig \
 && rm -rf /tmp/openmpi
| 80 | + |
# Install the distribution's Python interpreter and pip, and drop the APT index
# in the same layer so it is not shipped in the image.
# NOTE(review): recommended packages are still installed on purpose; they
# presumably provide the Python headers/setuptools that later source builds
# (e.g. mpi4py, horovod) need. Confirm before adding --no-install-recommends.
RUN apt-get update \
 && apt-get install -y \
    python \
    python-pip \
 && rm -rf /var/lib/apt/lists/*
| 85 | + |
# Replace mpirun with a small bash wrapper so that it always runs with
# --allow-run-as-root: the real binary is renamed to mpirun.real and the wrapper
# forwards all of its arguments to it.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
 && printf '%s\n' \
      '#!/bin/bash' \
      'mpirun.real --allow-run-as-root "$@"' \
      > /usr/local/bin/mpirun \
 && chmod a+x /usr/local/bin/mpirun
| 91 | + |
# Give Open MPI sensible defaults through its MCA parameter file, matching the
# command-line flags:
#   --bind-to none --map-by slot
# Only these two policies are written here; no btl_tcp_if_exclude entry is set.
RUN printf '%s\n' \
      'hwloc_base_binding_policy = none' \
      'rmaps_base_mapping_policy = slot' \
      >> /usr/local/etc/openmpi-mca-params.conf
| 96 | + |
# Default NCCL parameters: append NCCL_DEBUG=INFO to the system-wide NCCL config.
RUN printf '%s\n' 'NCCL_DEBUG=INFO' >> /etc/nccl.conf
| 99 | + |
# Put the Open MPI and NVIDIA directories in front of the library and binary
# search paths. The key=value form is used throughout (the space-separated
# "ENV key value" form is deprecated); the resulting PATH is unchanged, with
# /usr/local/nvidia/bin first and /usr/local/openmpi/bin/ second.
# NOTE(review): Open MPI is configured above without an explicit --prefix, so
# /usr/local/openmpi may not exist in this image; confirm these entries are needed.
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
ENV PATH=/usr/local/nvidia/bin:/usr/local/openmpi/bin/:$PATH
| 103 | + |
# SSH login fix: without it the user is kicked off right after logging in.
# Make sure sshd's runtime directory exists and downgrade the pam_loginuid
# session module from "required" to "optional" in sshd's PAM configuration.
RUN mkdir -p /var/run/sshd \
 && sed -i \
      's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' \
      /etc/pam.d/sshd
| 107 | + |
# Create a passphrase-less RSA key pair for root, authorize its public half for
# logins as root, and turn off strict host key checking in root's SSH client
# configuration.
# NOTE(review): the key is generated at build time, so every container started
# from this image shares the same private key.
RUN mkdir -p /root/.ssh/ \
 && ssh-keygen -t rsa -N '' -q \
      -f /root/.ssh/id_rsa \
 && cp /root/.ssh/id_rsa.pub \
       /root/.ssh/authorized_keys \
 && printf "Host *\n StrictHostKeyChecking no\n" \
      >> /root/.ssh/config
| 113 | + |
# Work from the filesystem root: relative paths in the COPY/RUN instructions
# that follow resolve against /.
WORKDIR /

# Bring pip and setuptools up to date before installing any Python packages.
RUN pip install --no-cache-dir --upgrade pip setuptools
| 119 | + |
# Some TF tools expect a "python" binary: expose the interpreter found on PATH
# as /usr/local/bin/python. The command substitution is quoted so the resolved
# path is always passed to ln as a single argument.
RUN ln -s "$(which python)" /usr/local/bin/python
| 122 | + |
# Bring the SageMaker framework support archive from the build context into the
# current working directory; it is installed and deleted by the RUN below.
COPY $FRAMEWORK_SUPPORT_INSTALLABLE .

# Install the Python stack in three steps:
#   1. the pinned scientific / AWS dependencies,
#   2. TensorFlow itself, force-reinstalled last so the library versions it
#      brings in are not overwritten by step 1,
#   3. the SageMaker framework support archive, which is removed afterwards.
# The archive variable is left unquoted on purpose so the shell expands its glob.
RUN pip install --no-cache-dir --upgrade \
    numpy==1.16.5 \
    scipy==1.2.2 \
    scikit-learn==0.20.3 \
    pandas==0.24.2 \
    Pillow==6.2.1 \
    h5py==2.9.0 \
    keras_applications==1.0.8 \
    keras_preprocessing==1.1.0 \
    requests==2.22.0 \
    keras==2.3.1 \
    # botocore requires python-dateutil<2.8.1
    "python-dateutil<2.8.1" \
    awscli==1.16.296 \
    mpi4py==3.0.2 \
    "cryptography>=2.3" \
    "sagemaker-tensorflow>=1.15,<1.16" \
 && pip install --no-cache-dir --upgrade --force-reinstall \
    ${TF_URL} \
 && pip install --no-cache-dir --upgrade \
    $FRAMEWORK_SUPPORT_INSTALLABLE \
 && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
| 148 | + |
# Install Horovod with NCCL allreduce and TensorFlow support. The CUDA stub
# libraries are registered with the dynamic linker only for the duration of the
# install; the trailing plain ldconfig rebuilds the cache without them.
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
 && export HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 \
 && pip install --no-cache-dir \
    # awscli requires PyYAML<5.2
    "PyYAML<5.2" \
    horovod==0.18.2 \
 && ldconfig
| 156 | + |
# Let the OpenSSH client connect to other containers without asking for
# confirmation: rewrite the system-wide client config without any existing
# StrictHostKeyChecking lines, append "StrictHostKeyChecking no", and swap the
# new file into place.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new \
 && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
 && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
| 161 | + |
# Add the container entrypoint script and the deep_learning_container helper
# script to /usr/local/bin and mark both as executable.
COPY dockerd-entrypoint.py deep_learning_container.py /usr/local/bin/

RUN chmod +x \
    /usr/local/bin/dockerd-entrypoint.py \
    /usr/local/bin/deep_learning_container.py
| 167 | + |
# Download the license text into the image root. --fail makes the build stop on
# an HTTP error instead of silently saving the server's error page as
# /license.txt; the flags match the Open MPI download earlier in this file.
RUN curl -fSsL https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
| 169 | + |
# Every container command goes through the Python entrypoint script; CMD only
# supplies the default argument handed to it. The shell is given as an absolute
# path so it still resolves when the container's working directory is not /.
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
CMD ["/bin/bash"]
0 commit comments