@@ -2,11 +2,6 @@ FROM nvidia/cuda:10.0-base-ubuntu16.04
2
2
3
3
LABEL maintainer="Amazon AI"
4
4
5
- RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
6
- software-properties-common && \
7
- add-apt-repository ppa:deadsnakes/ppa -y && \
8
- rm -rf /var/lib/apt/lists/*
9
-
10
5
RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
11
6
ca-certificates \
12
7
cuda-command-line-tools-10-0 \
@@ -17,18 +12,22 @@ RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthe
17
12
cuda-cusolver-dev-10-0 \
18
13
cuda-cusparse-dev-10-0 \
19
14
curl \
20
- libcudnn7=7.4.1.5-1+cuda10.0 \
15
+ libcudnn7=7.5.1.10-1+cuda10.0 \
21
16
# TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
22
- libnccl2 \
23
- libnccl-dev \
17
+ libnccl2=2.4.7-1+cuda10.0 \
18
+ libgomp1 \
19
+ libnccl-dev=2.4.7-1+cuda10.0 \
24
20
libfreetype6-dev \
25
21
libhdf5-serial-dev \
26
22
libpng12-dev \
27
23
libzmq3-dev \
24
+ git \
28
25
wget \
26
+ vim \
27
+ build-essential \
29
28
openssh-client \
30
29
openssh-server \
31
- build-essential && \
30
+ zlib1g-dev && \
32
31
# The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0
33
32
# adds a new list which contains libnvinfer library, so it needs another
34
33
# 'apt-get update' to retrieve that list before it can actually install the
@@ -42,7 +41,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthe
42
41
rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* && \
43
42
rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* && \
44
43
rm /usr/lib/x86_64-linux-gnu/libnvparsers* && \
45
- rm -rf /var/lib/apt/lists/*
44
+ rm -rf /var/lib/apt/lists/* && \
45
+ mkdir -p /var/run/sshd
46
46
47
47
###########################################################################
48
48
# Horovod & its dependencies
@@ -60,14 +60,17 @@ RUN mkdir /tmp/openmpi && \
60
60
ldconfig && \
61
61
rm -rf /tmp/openmpi
62
62
63
- ARG py_version
64
- ARG framework_installable
65
- ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
63
+ ARG PYTHON=python3
64
+ ARG PYTHON_PIP=python3-pip
65
+ ARG PIP=pip3
66
+ ARG PYTHON_VERSION=3.6.6
66
67
67
- RUN if [ $py_version -eq 3 ]; then PYTHON_VERSION=python3.6; else PYTHON_VERSION=python2.7; fi && \
68
- apt-get update && apt-get install -y --no-install-recommends $PYTHON_VERSION-dev --allow-unauthenticated && \
69
- ln -s -f /usr/bin/$PYTHON_VERSION /usr/bin/python && \
70
- rm -rf /var/lib/apt/lists/*
68
+ RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \
69
+ tar -xvf Python-$PYTHON_VERSION.tgz && cd Python-$PYTHON_VERSION && \
70
+ ./configure && make && make install && \
71
+ apt-get update && apt-get install -y --no-install-recommends libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev && \
72
+ make && make install && rm -rf ../Python-$PYTHON_VERSION* && \
73
+ ln -s /usr/local/bin/pip3 /usr/bin/pip
71
74
72
75
# Create a wrapper for OpenMPI to allow running as root by default
73
76
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
@@ -100,33 +103,51 @@ RUN mkdir -p /root/.ssh/ && \
100
103
# Python won’t try to write .pyc or .pyo files on the import of source modules
101
104
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8
102
105
103
- RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
104
- python get-pip.py --disable-pip-version-check --no-cache-dir "pip==18.1" && \
105
- rm get-pip.py
106
-
107
106
WORKDIR /
108
107
109
- COPY $framework_installable tensorflow-1.13.1-py2.py3-none-any.whl
108
+ ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl"
109
+
110
+ RUN ${PIP} --no-cache-dir install --upgrade pip setuptools
111
+
112
+ # Some TF tools expect a "python" binary
113
+ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
114
+
115
+ ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
110
116
COPY $framework_support_installable .
111
117
112
- RUN pip install --no-cache-dir -U \
118
+ RUN ${PIP} install --no-cache-dir -U \
119
+ numpy==1.16.2 \
120
+ scipy==1.2.1 \
121
+ scikit-learn==0.20.3 \
122
+ pandas==0.24.2 \
123
+ Pillow==5.4.1 \
124
+ h5py==2.9.0 \
125
+ keras_applications==1.0.7 \
126
+ keras_preprocessing==1.0.9 \
127
+ requests==2.21.0 \
113
128
keras==2.2.4 \
129
+ awscli==1.16.130 \
114
130
mpi4py==3.0.1 \
115
- $framework_support_installable \
116
131
"sagemaker-tensorflow>=1.13,<1.14" \
117
132
# Let's install TensorFlow separately at the end to keep
118
133
# its library version from being overwritten
119
- && pip install --force-reinstall --no-cache-dir -U tensorflow-1.13.1-py2.py3-none-any.whl \
120
- \
121
- && rm -f tensorflow-1.13.1-py2.py3-none-any.whl \
122
- && rm -f $framework_support_installable \
123
- && pip uninstall -y --no-cache-dir \
134
+ && ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \
135
+ && ${PIP} install --no-cache-dir -U $framework_support_installable && \
136
+ rm -f $framework_support_installable \
137
+ && ${PIP} uninstall -y --no-cache-dir \
124
138
markdown \
125
139
tensorboard
126
140
127
141
# Install Horovod, temporarily using CUDA stubs
128
142
RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \
129
- HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod && \
143
+ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 ${PIP} install --no-cache-dir horovod==0.16.4 && \
130
144
ldconfig
131
145
132
- ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main
146
+ # Allow OpenSSH to talk to containers without asking for confirmation
147
+ RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
148
+ echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
149
+ mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
150
+
151
+ ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main
152
+
153
+ CMD ["bin/bash"]
0 commit comments