Skip to content

Commit 3fc3f0f

Browse files
[Tensorflow]|[build]|[test] Updating TF 2.4 cu110 py37 training binaries (#783)
1 parent 5ddd04c commit 3fc3f0f

File tree

6 files changed

+540
-96
lines changed

6 files changed

+540
-96
lines changed

tensorflow/buildspec.yml

Lines changed: 79 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,75 +1,82 @@
1-
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
2-
region: &REGION <set-$REGION-in-environment>
3-
framework: &FRAMEWORK tensorflow
4-
version: &VERSION 2.3.1
5-
short_version: &SHORT_VERSION 2.3
1+
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
2+
region: &REGION <set-$REGION-in-environment>
3+
framework: &FRAMEWORK tensorflow
4+
version: &VERSION 2.4.1
5+
short_version: &SHORT_VERSION 2.4
66

7-
repository_info:
8-
training_repository: &TRAINING_REPOSITORY
9-
image_type: &TRAINING_IMAGE_TYPE training
10-
root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
11-
repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
12-
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
13-
inference_repository: &INFERENCE_REPOSITORY
14-
image_type: &INFERENCE_IMAGE_TYPE inference
15-
root: !join [ *FRAMEWORK, "/", *INFERENCE_IMAGE_TYPE ]
16-
repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *INFERENCE_IMAGE_TYPE]
17-
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
7+
repository_info:
8+
training_repository: &TRAINING_REPOSITORY
9+
image_type: &TRAINING_IMAGE_TYPE training
10+
root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
11+
repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
12+
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/,
13+
*REPOSITORY_NAME ]
14+
inference_repository:
15+
image_type: &INFERENCE_IMAGE_TYPE inference
16+
root: !join [ *FRAMEWORK, "/", *INFERENCE_IMAGE_TYPE ]
17+
repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *INFERENCE_IMAGE_TYPE]
18+
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/,
19+
*REPOSITORY_NAME ]
1820

19-
context:
20-
training_context: &TRAINING_CONTEXT
21-
dockerd-entrypoint:
22-
source: docker/build_artifacts/dockerd-entrypoint.py
23-
target: dockerd-entrypoint.py
24-
inference_context: &INFERENCE_CONTEXT
25-
sagemaker_package_name:
26-
source: docker/build_artifacts/sagemaker
27-
target: sagemaker
28-
init:
29-
source: docker/build_artifacts/__init__.py
30-
target: __init__.py
31-
dockerd-entrypoint:
32-
source: docker/build_artifacts/dockerd-entrypoint.py
33-
target: dockerd-entrypoint.py
21+
context:
22+
training_context: &TRAINING_CONTEXT
23+
dockerd-entrypoint:
24+
source: docker/build_artifacts/dockerd-entrypoint.py
25+
target: dockerd-entrypoint.py
26+
inference_context:
27+
sagemaker_package_name:
28+
source: docker/build_artifacts/sagemaker
29+
target: sagemaker
30+
init:
31+
source: docker/build_artifacts/__init__.py
32+
target: __init__.py
33+
dockerd-entrypoint:
34+
source: docker/build_artifacts/dockerd-entrypoint.py
35+
target: dockerd-entrypoint.py
3436

35-
images:
36-
BuildTensorflowCPUTrainPy3DockerImage:
37-
<<: *TRAINING_REPOSITORY
38-
build: &TENSORFLOW_CPU_TRAINING_PY3 false
39-
image_size_baseline: 4899
40-
device_type: &DEVICE_TYPE cpu
41-
python_version: &DOCKER_PYTHON_VERSION py3
42-
tag_python_version: &TAG_PYTHON_VERSION py37
43-
os_version: &OS_VERSION ubuntu18.04
44-
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION ]
45-
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
46-
context:
47-
<<: *TRAINING_CONTEXT
48-
BuildTensorflowGPUTrainPy3DockerImage:
49-
<<: *TRAINING_REPOSITORY
50-
build: &TENSORFLOW_GPU_TRAINING_PY3 false
51-
image_size_baseline: 11200
52-
device_type: &DEVICE_TYPE gpu
53-
python_version: &DOCKER_PYTHON_VERSION py3
54-
tag_python_version: &TAG_PYTHON_VERSION py37
55-
cuda_version: &CUDA_VERSION cu102
56-
os_version: &OS_VERSION ubuntu18.04
57-
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
58-
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
59-
context:
60-
<<: *TRAINING_CONTEXT
61-
BuildTensorflowExampleGPUTrainPy3DockerImage:
62-
<<: *TRAINING_REPOSITORY
63-
build: &TENSORFLOW_GPU_TRAINING_PY3 false
64-
image_size_baseline: 11200
65-
base_image_name: BuildTensorflowGPUTrainPy3DockerImage
66-
device_type: &DEVICE_TYPE gpu
67-
python_version: &DOCKER_PYTHON_VERSION py3
68-
tag_python_version: &TAG_PYTHON_VERSION py37
69-
cuda_version: &CUDA_VERSION cu102
70-
os_version: &OS_VERSION ubuntu18.04
71-
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION,
72-
"-example" ]
73-
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example, /Dockerfile., *DEVICE_TYPE ]
74-
context:
75-
<<: *TRAINING_CONTEXT
37+
images:
38+
BuildTensorflowCpuPy37TrainingDockerImage:
39+
<<: *TRAINING_REPOSITORY
40+
build: &TENSORFLOW_CPU_TRAINING_PY3 false
41+
image_size_baseline: &IMAGE_SIZE_BASELINE 4489
42+
device_type: &DEVICE_TYPE cpu
43+
python_version: &DOCKER_PYTHON_VERSION py3
44+
tag_python_version: &TAG_PYTHON_VERSION py37
45+
os_version: &OS_VERSION ubuntu18.04
46+
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION
47+
]
48+
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.,
49+
*DEVICE_TYPE ]
50+
context:
51+
<<: *TRAINING_CONTEXT
52+
BuildTensorflowGpuPy37Cu110TrainingDockerImage:
53+
<<: *TRAINING_REPOSITORY
54+
build: &TENSORFLOW_GPU_TRAINING_PY3 false
55+
image_size_baseline: &IMAGE_SIZE_BASELINE 7738
56+
device_type: &DEVICE_TYPE gpu
57+
python_version: &DOCKER_PYTHON_VERSION py3
58+
tag_python_version: &TAG_PYTHON_VERSION py37
59+
cuda_version: &CUDA_VERSION cu110
60+
os_version: &OS_VERSION ubuntu18.04
61+
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
62+
"-", *OS_VERSION ]
63+
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION,
64+
/Dockerfile., *DEVICE_TYPE ]
65+
context:
66+
<<: *TRAINING_CONTEXT
67+
BuildTensorflowExampleGpuPy37Cu110TrainingDockerImage:
68+
<<: *TRAINING_REPOSITORY
69+
build: &TENSORFLOW_GPU_TRAINING_PY3 false
70+
image_size_baseline: &IMAGE_SIZE_BASELINE 7738
71+
base_image_name: BuildTensorflowGpuPy37Cu110TrainingDockerImage
72+
device_type: &DEVICE_TYPE gpu
73+
python_version: &DOCKER_PYTHON_VERSION py3
74+
tag_python_version: &TAG_PYTHON_VERSION py37
75+
cuda_version: &CUDA_VERSION cu110
76+
os_version: &OS_VERSION ubuntu18.04
77+
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
78+
"-", *OS_VERSION, "-example" ]
79+
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
80+
/Dockerfile., *DEVICE_TYPE ]
81+
context:
82+
<<: *TRAINING_CONTEXT
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
FROM ubuntu:18.04
2+
3+
LABEL maintainer="Amazon AI"
4+
LABEL dlc_major_version="1"
5+
6+
# prevent stopping by user interaction
7+
ENV DEBIAN_FRONTEND noninteractive
8+
ENV DEBCONF_NONINTERACTIVE_SEEN true
9+
ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main
10+
11+
# Set environment variables for MKL
12+
# For more about MKL with TensorFlow see:
13+
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
14+
15+
ENV KMP_AFFINITY=granularity=fine,compact,1,0
16+
ENV KMP_BLOCKTIME=1
17+
ENV KMP_SETTINGS=0
18+
19+
ENV PYTHONDONTWRITEBYTECODE=1
20+
ENV PYTHONUNBUFFERED=1
21+
ENV PYTHONIOENCODING=UTF-8
22+
ENV LANG=C.UTF-8
23+
ENV LC_ALL=C.UTF-8
24+
25+
ARG PYTHON=python3.7
26+
ARG PYTHON_PIP=python3-pip
27+
ARG PIP=pip3
28+
ARG PYTHON_VERSION=3.7.7
29+
ARG OPENSSL_VERSION=1.1.1g
30+
31+
ARG TF_URL=https://aws-tensorflow-binaries.s3-us-west-2.amazonaws.com/tensorflow/r2.4_aws/20210127-150238/cpu/py37/tensorflow_cpu-2.4.1-cp37-cp37m-manylinux2010_x86_64.whl
32+
33+
ARG ESTIMATOR_URL=https://aws-tensorflow-binaries.s3-us-west-2.amazonaws.com/estimator/r2.4_aws/20210127-150238/tensorflow_estimator-2.4.0-py2.py3-none-any.whl
34+
35+
# The smdebug pipeline relies on the following format to perform a string replace and trigger the DLC pipeline for validating
36+
# the nightly builds. Therefore, while updating the smdebug version, please ensure that the format is not disturbed.
37+
ARG SMDEBUG_VERSION=1.0.2
38+
39+
RUN apt-get update && apt-get install -y --no-install-recommends \
40+
build-essential \
41+
openssh-client \
42+
openssh-server \
43+
ca-certificates \
44+
curl \
45+
emacs \
46+
git \
47+
libtemplate-perl \
48+
wget \
49+
vim \
50+
zlib1g-dev \
51+
# Install dependent library for OpenCV
52+
libgtk2.0-dev \
53+
&& rm -rf /var/lib/apt/lists/*
54+
55+
# Install Open MPI
56+
RUN mkdir /tmp/openmpi && \
57+
cd /tmp/openmpi && \
58+
curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz \
59+
&& tar zxf openmpi-4.0.4.tar.gz \
60+
&& cd openmpi-4.0.4 \
61+
&& ./configure --enable-orterun-prefix-by-default \
62+
&& make -j $(nproc) all \
63+
&& make install \
64+
&& ldconfig \
65+
&& rm -rf /tmp/openmpi
66+
67+
# Create a wrapper for OpenMPI to allow running as root by default
68+
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
69+
&& echo '#!/bin/bash' > /usr/local/bin/mpirun \
70+
&& echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
71+
&& chmod a+x /usr/local/bin/mpirun
72+
73+
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
74+
&& echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
75+
76+
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
77+
ENV PATH /usr/local/openmpi/bin/:$PATH
78+
79+
# SSH login fix. Otherwise user is kicked off after login
80+
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
81+
82+
# Create SSH key.
83+
RUN mkdir -p /root/.ssh/ \
84+
&& mkdir -p /var/run/sshd \
85+
&& ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
86+
&& cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
87+
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
88+
89+
WORKDIR /
90+
91+
RUN apt-get update \
92+
&& apt-get install -y --no-install-recommends \
93+
libbz2-dev \
94+
libc6-dev \
95+
libffi-dev \
96+
libgdbm-dev \
97+
liblzma-dev \
98+
libncursesw5-dev \
99+
libreadline-gplv2-dev \
100+
libsqlite3-dev \
101+
libssl-dev \
102+
tk-dev \
103+
&& rm -rf /var/lib/apt/lists/* \
104+
&& apt-get clean
105+
106+
RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
107+
&& tar -xvf Python-$PYTHON_VERSION.tgz \
108+
&& cd Python-$PYTHON_VERSION \
109+
&& ./configure && make && make install \
110+
&& rm -rf ../Python-$PYTHON_VERSION*
111+
112+
RUN ${PIP} --no-cache-dir install --upgrade \
113+
pip \
114+
setuptools
115+
116+
RUN wget -c https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz \
117+
&& apt remove -y --purge openssl \
118+
&& rm -rf /usr/include/openssl \
119+
&& apt-get update \
120+
&& apt-get install -y \
121+
ca-certificates \
122+
&& tar -xzvf openssl-${OPENSSL_VERSION}.tar.gz \
123+
&& cd openssl-${OPENSSL_VERSION} \
124+
&& ./config && make && make test \
125+
&& make install \
126+
&& ldconfig \
127+
&& cd .. && rm -rf openssl-*
128+
129+
# When the previous openssl is removed, the ca-certificates package and its symlinks get deleted,
130+
# causing SSLCertVerificationError; the step below is there to fix that.
131+
RUN ln -s /etc/ssl/certs/*.* /usr/local/ssl/certs/
132+
133+
# Some TF tools expect a "python" binary
134+
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \
135+
&& ln -s $(which ${PIP}) /usr/bin/pip
136+
137+
RUN apt-get update && apt-get -y install cmake protobuf-compiler && rm -rf /var/lib/apt/lists/*
138+
139+
# Pin PyYAML (5.3.1 below) to avoid conflict with latest awscli
140+
# python-dateutil==2.8.1 to satisfy botocore associated with latest awscli
141+
RUN ${PIP} install --no-cache-dir -U \
142+
numpy==1.19.1 \
143+
scipy==1.5.2 \
144+
scikit-learn==0.23 \
145+
pandas==1.1 \
146+
Pillow==7.2.0 \
147+
python-dateutil==2.8.1 \
148+
pyYAML==5.3.1 \
149+
requests==2.24.0 \
150+
"awscli<2" \
151+
mpi4py==3.0.3 \
152+
opencv-python==4.3.0.36 \
153+
"sagemaker>=2,<3" \
154+
sagemaker-experiments==0.* \
155+
"sagemaker-tensorflow>=2.4,<2.5" \
156+
"sagemaker-tensorflow-training>=20" \
157+
158+
# Let's install TensorFlow separately in the end to avoid
159+
# the library version to be overwritten
160+
&& ${PIP} install --no-cache-dir -U \
161+
${TF_URL} \
162+
${ESTIMATOR_URL} \
163+
h5py==2.10.0 \
164+
"absl-py>=0.9,<0.11" \
165+
horovod==0.21.0 \
166+
werkzeug==1.0.1 \
167+
psutil==5.7.2 \
168+
smdebug==${SMDEBUG_VERSION} \
169+
smclarify
170+
171+
ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py
172+
173+
RUN chmod +x /usr/local/bin/deep_learning_container.py
174+
175+
RUN curl -fSsL https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.4/license.txt -o /license.txt
176+
177+
CMD ["/bin/bash"]

0 commit comments

Comments
 (0)