From c4adde5be86318973beebdfc52e58214dacc879b Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Date: Mon, 30 Dec 2019 15:58:07 -0800 Subject: [PATCH 01/10] update: Release TF 1.15.0 dockerfiles (#264) * Add TF 1.15 dockerfiles and changes to entrypoint Co-authored-by: akhilmehra Co-authored-by: ElizaZh Co-authored-by: Owen Thomas <31292660+owen-t@users.noreply.github.com> Co-authored-by: Kartik Kalamadi Co-authored-by: Arjuna Keshavan <33526713+arjkesh@users.noreply.github.com> Co-authored-by: akhilmehra Co-authored-by: ElizaZh Co-authored-by: Owen Thomas <31292660+owen-t@users.noreply.github.com> Co-authored-by: Kartik Kalamadi Co-authored-by: Arjuna Keshavan <33526713+arjkesh@users.noreply.github.com> --- docker/1.15.0/py2/Dockerfile.cpu | 129 +++++++++++++ docker/1.15.0/py2/Dockerfile.gpu | 171 +++++++++++++++++ docker/1.15.0/py2/dockerd-entrypoint.py | 23 +++ docker/1.15.0/py3/Dockerfile.cpu | 133 +++++++++++++ docker/1.15.0/py3/Dockerfile.gpu | 179 ++++++++++++++++++ docker/1.15.0/py3/dockerd-entrypoint.py | 23 +++ setup.py | 4 +- .../deep_learning_container.py | 112 +++++++++++ test/resources/mnist/mnist.py | 21 +- test/unit/test_deep_learning_containers.py | 158 ++++++++++++++++ 10 files changed, 950 insertions(+), 3 deletions(-) create mode 100644 docker/1.15.0/py2/Dockerfile.cpu create mode 100644 docker/1.15.0/py2/Dockerfile.gpu create mode 100644 docker/1.15.0/py2/dockerd-entrypoint.py create mode 100644 docker/1.15.0/py3/Dockerfile.cpu create mode 100644 docker/1.15.0/py3/Dockerfile.gpu create mode 100644 docker/1.15.0/py3/dockerd-entrypoint.py create mode 100644 src/sagemaker_tensorflow_container/deep_learning_container.py create mode 100644 test/unit/test_deep_learning_containers.py diff --git a/docker/1.15.0/py2/Dockerfile.cpu b/docker/1.15.0/py2/Dockerfile.cpu new file mode 100644 index 00000000..f9387aa0 --- /dev/null +++ b/docker/1.15.0/py2/Dockerfile.cpu @@ -0,0 +1,129 @@ +FROM ubuntu:18.04 + 
+LABEL maintainer="Amazon AI" + +# Prevent docker build get stopped by requesting user interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true +# Set environment variables for MKL +# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn +ENV KMP_AFFINITY=granularity=fine,compact,1,0 +ENV KMP_BLOCKTIME=1 +ENV KMP_SETTINGS=0 +# Python won’t try to write .pyc or .pyo files on the import of source modules +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +# See http://bugs.python.org/issue19846 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +# Specify the location of module that contains the training logic for SageMaker +# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main + +# Define framework-related package sources +ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz +ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + software-properties-common \ + build-essential \ + openssh-client \ + openssh-server \ + ca-certificates \ + curl \ + git \ + wget \ + vim \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real 
--allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:$PATH + +# SSH login fix. Otherwise user is kicked off after login +RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +# Create SSH key. +RUN mkdir -p /root/.ssh/ \ + && mkdir -p /var/run/sshd \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +WORKDIR / + +RUN apt-get update \ + && apt-get install -y \ + python \ + python-pip + +COPY $FRAMEWORK_SUPPORT_INSTALLABLE . + +RUN pip --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python) /usr/local/bin/python + +RUN pip install --no-cache-dir -U \ + numpy==1.16.5 \ + scipy==1.2.2 \ + scikit-learn==0.20.3 \ + pandas==0.24.2 \ + Pillow==6.2.1 \ + h5py==2.9.0 \ + keras_applications==1.0.8 \ + keras_preprocessing==1.1.0 \ + requests==2.22.0 \ + keras==2.3.1 \ + # botocore requires python-dateutil<2.8.1 + "python-dateutil<2.8.1" \ + awscli==1.16.296 \ + mpi4py==3.0.2 \ + "cryptography>=2.3" \ + "sagemaker-tensorflow>=1.15,<1.16" \ + # Let's install TensorFlow separately in the end to avoid the library version to be overwritten + && pip install --force-reinstall --no-cache-dir -U \ + ${TF_URL} \ + && pip install --no-cache-dir -U \ + $FRAMEWORK_SUPPORT_INSTALLABLE \ + && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \ + && pip install --no-cache-dir -U \ + # awscli requires PyYAML<5.2 + "PyYAML<5.2" \ + horovod==0.18.2 + +COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py +COPY deep_learning_container.py 
/usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ + && chmod +x /usr/local/bin/deep_learning_container.py + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt + +ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] +CMD ["bin/bash"] diff --git a/docker/1.15.0/py2/Dockerfile.gpu b/docker/1.15.0/py2/Dockerfile.gpu new file mode 100644 index 00000000..fa86d9dc --- /dev/null +++ b/docker/1.15.0/py2/Dockerfile.gpu @@ -0,0 +1,171 @@ +# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0. +# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/ +FROM nvidia/cuda:10.0-base-ubuntu18.04 + +LABEL maintainer="Amazon AI" + +# Prevent docker build get stopped by requesting user interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true +# Python won’t try to write .pyc or .pyo files on the import of source modules +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +# See http://bugs.python.org/issue19846 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +# Specify the location of module that contains the training logic for SageMaker +# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main + +# Define framework-related package sources +ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz +ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl + +RUN apt-get update \ + && apt-get install -y --no-install-recommends --allow-unauthenticated \ + ca-certificates \ + cuda-command-line-tools-10-0 \ + cuda-cublas-dev-10-0 \ + cuda-cudart-dev-10-0 \ + cuda-cufft-dev-10-0 \ + cuda-curand-dev-10-0 \ + 
cuda-cusolver-dev-10-0 \ + cuda-cusparse-dev-10-0 \ + curl \ + libcudnn7=7.5.1.10-1+cuda10.0 \ + # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it + libnccl2=2.4.7-1+cuda10.0 \ + libgomp1 \ + libnccl-dev=2.4.7-1+cuda10.0 \ + libfreetype6-dev \ + libhdf5-serial-dev \ + libpng-dev \ + libzmq3-dev \ + git \ + wget \ + vim \ + build-essential \ + openssh-client \ + openssh-server \ + zlib1g-dev \ + # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 + # adds a new list which contains libnvinfer library, so it needs another + # 'apt-get update' to retrieve that list before it can actually install the library. + # We don't install libnvinfer-dev since we don't need to build against TensorRT, + # and libnvinfer4 doesn't contain libnvinfer.a static library. + && apt-get update \ + && apt-get install -y --no-install-recommends --allow-unauthenticated \ + nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ + && apt-get update \ + && apt-get install -y --no-install-recommends --allow-unauthenticated \ + libnvinfer5=5.0.2-1+cuda10.0 \ + && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ + && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ + && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /var/run/sshd + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +RUN apt-get update \ + && apt-get install -y \ + python \ + python-pip + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> 
/usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +# Configure OpenMPI to run good defaults: +# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +# Set default NCCL parameters +RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH /usr/local/openmpi/bin/:$PATH +ENV PATH=/usr/local/nvidia/bin:$PATH + +# SSH login fix. Otherwise user is kicked off after login +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +# Create SSH key. +RUN mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +WORKDIR / + +RUN pip --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python) /usr/local/bin/python + +COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
+ +RUN pip install --no-cache-dir -U \ + numpy==1.16.5 \ + scipy==1.2.2 \ + scikit-learn==0.20.3 \ + pandas==0.24.2 \ + Pillow==6.2.1 \ + h5py==2.9.0 \ + keras_applications==1.0.8 \ + keras_preprocessing==1.1.0 \ + requests==2.22.0 \ + keras==2.3.1 \ + # botocore requires python-dateutil<2.8.1 + "python-dateutil<2.8.1" \ + awscli==1.16.296 \ + mpi4py==3.0.2 \ + "cryptography>=2.3" \ + "sagemaker-tensorflow>=1.15,<1.16" \ + # Let's install TensorFlow separately in the end to avoid the library version to be overwritten + && pip install --force-reinstall --no-cache-dir -U \ + ${TF_URL} \ + && pip install --no-cache-dir -U \ + $FRAMEWORK_SUPPORT_INSTALLABLE \ + && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE + +# Install Horovod, temporarily using CUDA stubs +RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ + && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ + # awscli requires PyYAML<5.2 + "PyYAML<5.2" \ + horovod==0.18.2 \ + && ldconfig + +# Allow OpenSSH to talk to containers without asking for confirmation +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ + && chmod +x /usr/local/bin/deep_learning_container.py + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt + +ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] +CMD ["bin/bash"] diff --git a/docker/1.15.0/py2/dockerd-entrypoint.py b/docker/1.15.0/py2/dockerd-entrypoint.py new file mode 100644 index 00000000..b9231abc --- /dev/null +++ b/docker/1.15.0/py2/dockerd-entrypoint.py @@ -0,0 +1,23 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os.path
+import shlex
+import subprocess
+import sys
+
+if not os.path.exists("/opt/ml/input/config"):  # best-effort ping, detached and silenced
+    subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'],
+                     stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
+subprocess.check_call(shlex.split(' '.join(sys.argv[1:])))
diff --git a/docker/1.15.0/py3/Dockerfile.cpu b/docker/1.15.0/py3/Dockerfile.cpu
new file mode 100644
index 00000000..b204769a
--- /dev/null
+++ b/docker/1.15.0/py3/Dockerfile.cpu
@@ -0,0 +1,133 @@
+FROM ubuntu:18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent docker build get stopped by requesting user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Set environment variables for MKL
+# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
+ENV KMP_AFFINITY=granularity=fine,compact,1,0
+ENV KMP_BLOCKTIME=1
+ENV KMP_SETTINGS=0
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG 
FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz +ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + python3-setuptools \ + software-properties-common \ + build-essential \ + openssh-client \ + openssh-server \ + ca-certificates \ + curl \ + git \ + wget \ + vim \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:$PATH + +# SSH login fix. Otherwise user is kicked off after login +RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +# Create SSH key. +RUN mkdir -p /root/.ssh/ \ + && mkdir -p /var/run/sshd \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +WORKDIR / + +COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
+ +RUN pip3 --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python3) /usr/local/bin/python \ + && ln -s $(which pip3) /usr/bin/pip + +# install PyYAML==5.1.2 to avoid conflict with latest awscli +# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli +RUN pip install --no-cache-dir -U \ + numpy==1.17.4 \ + scipy==1.2.2 \ + scikit-learn==0.20.3 \ + pandas==0.24.2 \ + Pillow==6.2.1 \ + h5py==2.9.0 \ + keras_applications==1.0.8 \ + keras_preprocessing==1.1.0 \ + keras==2.3.1 \ + # botocore requires python-dateutil<2.8.1 + "python-dateutil<2.8.1" \ + requests==2.22.0 \ + smdebug==0.4.14 \ + sagemaker-experiments==0.1.3 \ + awscli==1.16.296 \ + mpi4py==3.0.2 \ + "cryptography>=2.3" \ + "sagemaker-tensorflow>=1.15,<1.16" \ + # Let's install TensorFlow separately in the end to avoid + # the library version to be overwritten + && pip install --force-reinstall --no-cache-dir -U \ + ${TF_URL} \ + && pip install --force-reinstall --no-cache-dir -U \ + # awscli requires PyYAML<5.2 + "PyYAML<5.2" \ + horovod==0.18.2 \ + && pip install --no-cache-dir -U \ + $FRAMEWORK_SUPPORT_INSTALLABLE \ + && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE + +COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ + && chmod +x /usr/local/bin/deep_learning_container.py + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt + +ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] +CMD ["bin/bash"] diff --git a/docker/1.15.0/py3/Dockerfile.gpu b/docker/1.15.0/py3/Dockerfile.gpu new file mode 100644 index 00000000..38c86b14 --- /dev/null +++ b/docker/1.15.0/py3/Dockerfile.gpu @@ -0,0 +1,179 @@ +# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0. 
+# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/ +FROM nvidia/cuda:10.0-base-ubuntu18.04 + +LABEL maintainer="Amazon AI" + +# Prevent docker build get stopped by requesting user interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true +# Python won’t try to write .pyc or .pyo files on the import of source modules +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +# See http://bugs.python.org/issue19846 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +# Specify the location of module that contains the training logic for SageMaker +# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main + +# Define framework-related package sources +ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz +ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl + +RUN apt-get update \ + && apt-get install -y --no-install-recommends --allow-unauthenticated \ + python3-dev \ + python3-pip \ + python3-setuptools \ + python3-dev \ + ca-certificates \ + cuda-command-line-tools-10-0 \ + cuda-cublas-dev-10-0 \ + cuda-cudart-dev-10-0 \ + cuda-cufft-dev-10-0 \ + cuda-curand-dev-10-0 \ + cuda-cusolver-dev-10-0 \ + cuda-cusparse-dev-10-0 \ + curl \ + libcudnn7=7.5.1.10-1+cuda10.0 \ + # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it + libnccl2=2.4.7-1+cuda10.0 \ + libgomp1 \ + libnccl-dev=2.4.7-1+cuda10.0 \ + libfreetype6-dev \ + libhdf5-serial-dev \ + libpng-dev \ + libzmq3-dev \ + git \ + wget \ + vim \ + build-essential \ + openssh-client \ + openssh-server \ + zlib1g-dev \ + # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 + # adds a new list which contains libnvinfer library, so it needs another + # 'apt-get 
update' to retrieve that list before it can actually install the + # library. + # We don't install libnvinfer-dev since we don't need to build against TensorRT, + # and libnvinfer4 doesn't contain libnvinfer.a static library. + && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ + nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ + && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ + libnvinfer5=5.0.2-1+cuda10.0 \ + && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ + && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ + && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /var/run/sshd + +########################################################################### +# Horovod & its dependencies +########################################################################### + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +# Configure OpenMPI to run good defaults: +# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +# Set default NCCL parameters +RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 
+ENV PATH=/usr/local/openmpi/bin/:$PATH +ENV PATH=/usr/local/nvidia/bin:$PATH + +# SSH login fix. Otherwise user is kicked off after login +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +# Create SSH key. +RUN mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +WORKDIR / + +RUN pip3 --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python3) /usr/local/bin/python \ + && ln -s $(which pip3) /usr/bin/pip + +COPY $FRAMEWORK_SUPPORT_INSTALLABLE . + +# install PyYAML==5.1.2 to avoid conflict with latest awscli +# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli +RUN pip install --no-cache-dir -U \ + numpy==1.17.4 \ + scipy==1.2.2 \ + scikit-learn==0.20.3 \ + pandas==0.24.2 \ + Pillow==6.2.1 \ + h5py==2.9.0 \ + keras_applications==1.0.8 \ + keras_preprocessing==1.1.0 \ + requests==2.22.0 \ + keras==2.3.1 \ + # botocore requires python-dateutil<2.8.1 + "python-dateutil<2.8.1" \ + smdebug==0.4.14 \ + sagemaker-experiments==0.1.3 \ + awscli==1.16.296 \ + mpi4py==3.0.2 \ + "cryptography>=2.3" \ + "sagemaker-tensorflow>=1.15,<1.16" \ + # Let's install TensorFlow separately in the end to avoid + # the library version to be overwritten + && pip install --force-reinstall --no-cache-dir -U \ + ${TF_URL} \ + && pip install --no-cache-dir -U \ + $FRAMEWORK_SUPPORT_INSTALLABLE \ + && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE + +# Install Horovod, temporarily using CUDA stubs +RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \ + && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ + # awscli requires PyYAML<5.2 + "PyYAML<5.2" \ + horovod==0.18.2 \ + && ldconfig + +# Allow OpenSSH to talk to containers 
without asking for confirmation +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ + && chmod +x /usr/local/bin/deep_learning_container.py + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt + +ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] +CMD ["bin/bash"] diff --git a/docker/1.15.0/py3/dockerd-entrypoint.py b/docker/1.15.0/py3/dockerd-entrypoint.py new file mode 100644 index 00000000..b9231abc --- /dev/null +++ b/docker/1.15.0/py3/dockerd-entrypoint.py @@ -0,0 +1,23 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import
+
+import os.path
+import shlex
+import subprocess
+import sys
+
+if not os.path.exists("/opt/ml/input/config"):  # best-effort ping, detached and silenced
+    subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'],
+                     stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
+subprocess.check_call(shlex.split(' '.join(sys.argv[1:])))
diff --git a/setup.py b/setup.py
index 88e412b0..02c007c4 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,9 @@ def read_version():
                       'pandas', 'Pillow', 'h5py'],
     extras_require={
         'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock',
-                 'sagemaker==1.19.1', 'tensorflow<2.0', 'docker-compose', 'botocore>=1.12.140'],
+                 'sagemaker==1.19.1', 'tensorflow<2.0', 'docker-compose', 'boto3==1.10.32',
+                 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore==1.13.32',
+                 'requests-mock', 'awscli==1.16.296'],
         'benchmark': ['click']
     },
 )
diff --git a/src/sagemaker_tensorflow_container/deep_learning_container.py b/src/sagemaker_tensorflow_container/deep_learning_container.py
new file mode 100644
index 00000000..0776dfb3
--- /dev/null
+++ b/src/sagemaker_tensorflow_container/deep_learning_container.py
@@ -0,0 +1,112 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import json
+import logging
+import re
+
+import requests
+
+
+def _validate_instance_id(instance_id):
+    """
+    Return the leading 'i-' plus 17 non-whitespace characters of instance_id, or None if absent
+    """
+    instance_id_regex = r'^(i-\S{17})'
+    compiled_regex = re.compile(instance_id_regex)
+    match = compiled_regex.match(instance_id)
+
+    if not match:
+        return None
+
+    return match.group(1)
+
+
+def _retrieve_instance_id():
+    """
+    Retrieve instance ID from instance metadata service (None if unreachable or malformed)
+    """
+    instance_id = None
+    url = "http://169.254.169.254/latest/meta-data/instance-id"
+    response = requests_helper(url, timeout=0.1)
+
+    if response is not None:
+        instance_id = _validate_instance_id(response.text)
+
+    return instance_id
+
+
+def _retrieve_instance_region():
+    """
+    Retrieve instance region from instance metadata service (None if unknown or not allowed)
+    """
+    region = None
+    valid_regions = ['ap-northeast-1', 'ap-northeast-2', 'ap-southeast-1', 'ap-southeast-2',
+                     'ap-south-1', 'ca-central-1', 'eu-central-1', 'eu-north-1',
+                     'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1',
+                     'us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
+
+    url = "http://169.254.169.254/latest/dynamic/instance-identity/document"
+    response = requests_helper(url, timeout=0.1)
+
+    if response is not None:
+        try:
+            candidate = json.loads(response.text).get('region')
+        except (AttributeError, ValueError):  # body is not JSON, or not a JSON object
+            candidate = None
+        region = candidate if candidate in valid_regions else None
+    return region
+
+
+def query_bucket():
+    """
+    GET request on an empty object from an Amazon S3 bucket (skipped unless ID and region known)
+    """
+    response = None
+    instance_id = _retrieve_instance_id()
+    region = _retrieve_instance_region()
+
+    if instance_id is not None and region is not None:
+        url = ("https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com"
+               "/dlc-containers.txt?x-instance-id={1}".format(region, instance_id))
+        response = requests_helper(url, timeout=0.2)
+
+    logging.debug("Query bucket finished: {}".format(response))
+
+    return response
+
+
+def requests_helper(url, timeout):  # GET url; returns None on any requests-level failure
+    response = None
+    try:
+        response = requests.get(url, timeout=timeout)
+    except requests.exceptions.RequestException as e:
+        logging.error("Request exception: {}".format(e))
+
+    return response
+
+
+def main():
+    """
+    Invoke bucket query
+    """
+    # Logs are not necessary for normal run. Remove this line while debugging.
+    logging.getLogger().disabled = True
+
+    logging.basicConfig(level=logging.ERROR)
+    query_bucket()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/test/resources/mnist/mnist.py b/test/resources/mnist/mnist.py
index 47d2bcd0..e4349ce2 100644
--- a/test/resources/mnist/mnist.py
+++ b/test/resources/mnist/mnist.py
@@ -1,8 +1,11 @@
-import tensorflow as tf
 import argparse
+import json
 import os
+import sys
+
 import numpy as np
-import json
+import tensorflow as tf
+
 
 
 def _parse_args():
@@ -32,6 +35,18 @@ def _load_testing_data(base_dir):
     return x_test, y_test
 
 
+def assert_can_track_sagemaker_experiments():
+    in_sagemaker_training = 'TRAINING_JOB_ARN' in os.environ
+    in_python_three = sys.version_info[0] == 3
+
+    if in_sagemaker_training and in_python_three:
+        import smexperiments.tracker
+
+        with smexperiments.tracker.Tracker.load() as tracker:
+            tracker.log_parameter('param', 1)
+            tracker.log_metric('metric', 1.0)
+
+
 args, unknown = _parse_args()
 
 model = tf.keras.models.Sequential([
@@ -48,5 +63,7 @@ def _load_testing_data(base_dir):
 x_test, y_test = _load_testing_data(args.train)
 model.fit(x_train, y_train, epochs=args.epochs)
 model.evaluate(x_test, y_test)
+
 if args.current_host == args.hosts[0]:
     model.save(os.path.join('/opt/ml/model', 'my_model.h5'))
+    assert_can_track_sagemaker_experiments()
diff --git a/test/unit/test_deep_learning_containers.py b/test/unit/test_deep_learning_containers.py
new file mode 100644
index 00000000..2da6959c
--- /dev/null
+++ b/test/unit/test_deep_learning_containers.py
@@ -0,0 +1,158 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License'). 
You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the 'license' file accompanying this file. This file is +# distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import unittest + +import pytest +import requests + +from sagemaker_tensorflow_container import deep_learning_container as deep_learning_container_to_test + + +@pytest.fixture(name='fixture_valid_instance_id') +def fixture_valid_instance_id(requests_mock): + return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', + text='i-123t32e11s32t1231') + + +@pytest.fixture(name='fixture_invalid_instance_id') +def fixture_invalid_instance_id(requests_mock): + return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', text='i-123') + + +@pytest.fixture(name='fixture_none_instance_id') +def fixture_none_instance_id(requests_mock): + return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', text=None) + + +@pytest.fixture(name='fixture_invalid_region') +def fixture_invalid_region(requests_mock): + return requests_mock.get('http://169.254.169.254/latest/dynamic/instance-identity/document', + json={'region': 'test'}) + + +@pytest.fixture(name='fixture_valid_region') +def fixture_valid_region(requests_mock): + return requests_mock.get('http://169.254.169.254/latest/dynamic/instance-identity/document', + json={'region': 'us-east-1'}) + + +def test_retrieve_instance_id(fixture_valid_instance_id): + result = deep_learning_container_to_test._retrieve_instance_id() + assert 'i-123t32e11s32t1231' == result + + +def test_retrieve_none_instance_id(fixture_none_instance_id): + result = deep_learning_container_to_test._retrieve_instance_id() + assert result is 
None + + +def test_retrieve_invalid_instance_id(fixture_invalid_instance_id): + result = deep_learning_container_to_test._retrieve_instance_id() + assert result is None + + +def test_retrieve_invalid_region(fixture_invalid_region): + result = deep_learning_container_to_test._retrieve_instance_region() + assert result is None + + +def test_retrieve_valid_region(fixture_valid_region): + result = deep_learning_container_to_test._retrieve_instance_region() + assert 'us-east-1' == result + + +def test_query_bucket(requests_mock, fixture_valid_region, fixture_valid_instance_id): + fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' + fixture_valid_region.return_value = 'us-east-1' + requests_mock.get(('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' + '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231'), + text='Access Denied') + actual_response = deep_learning_container_to_test.query_bucket() + assert 'Access Denied' == actual_response.text + + +def test_query_bucket_region_none(fixture_invalid_region, fixture_valid_instance_id): + fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' + fixture_invalid_region.return_value = None + actual_response = deep_learning_container_to_test.query_bucket() + assert actual_response is None + + +def test_query_bucket_instance_id_none(requests_mock, fixture_valid_region, fixture_none_instance_id): + fixture_none_instance_id.return_value = None + fixture_valid_region.return_value = 'us-east-1' + actual_response = deep_learning_container_to_test.query_bucket() + assert actual_response is None + + +def test_query_bucket_instance_id_invalid(requests_mock, fixture_valid_region, fixture_invalid_instance_id): + fixture_invalid_instance_id.return_value = None + fixture_valid_region.return_value = 'us-east-1' + actual_response = deep_learning_container_to_test.query_bucket() + assert actual_response is None + + +def test_HTTP_error_on_S3(requests_mock, fixture_valid_region, 
fixture_valid_instance_id): + fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' + fixture_valid_region.return_value = 'us-east-1' + query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' + '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231') + + requests_mock.get( + query_s3_url, + exc=requests.exceptions.HTTPError) + requests_mock.side_effect = requests.exceptions.HTTPError + + with pytest.raises(requests.exceptions.HTTPError): + actual_response = requests.get(query_s3_url) + assert actual_response is None + + +def test_connection_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id): + fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' + fixture_valid_region.return_value = 'us-east-1' + query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' + '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231') + + requests_mock.get( + query_s3_url, + exc=requests.exceptions.ConnectionError) + + with pytest.raises(requests.exceptions.ConnectionError): + actual_response = requests.get( + query_s3_url) + + assert actual_response is None + + +def test_timeout_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id): + fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' + fixture_valid_region.return_value = 'us-east-1' + query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' + '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231') + + requests_mock.get( + query_s3_url, + exc=requests.Timeout) + + with pytest.raises(requests.exceptions.Timeout): + actual_response = requests.get( + query_s3_url) + + assert actual_response is None + + +if __name__ == '__main__': + unittest.main() From 90a7b8426f639fc65e1d994ecad5f6475a4c9717 Mon Sep 17 00:00:00 2001 From: Lauren Yu <6631887+laurenyu@users.noreply.github.com> Date: Tue, 7 Jan 2020 12:43:51 -0800 Subject: [PATCH 02/10] change: update copyright 
year in license header (#266) --- docker/1.15.0/py2/dockerd-entrypoint.py | 2 +- docker/1.15.0/py3/dockerd-entrypoint.py | 2 +- scripts/build_all.py | 2 +- scripts/publish_all.py | 2 +- setup.py | 2 +- src/sagemaker_tensorflow_container/deep_learning_container.py | 2 +- src/sagemaker_tensorflow_container/s3_utils.py | 2 +- src/sagemaker_tensorflow_container/training.py | 2 +- test/integration/local/test_horovod.py | 2 +- test/integration/local/test_keras.py | 2 +- test/integration/local/test_training.py | 2 +- test/integration/sagemaker/test_horovod.py | 2 +- test/integration/sagemaker/test_mnist.py | 2 +- test/integration/sagemaker/test_tuning_model_dir.py | 2 +- test/integration/sagemaker/timeout.py | 2 +- test/integration/utils.py | 2 +- test/resources/mnist/horovod_mnist.py | 2 +- test/resources/test_py_version/entry.py | 2 +- test/resources/tuning_model_dir/entry.py | 2 +- test/unit/test_deep_learning_containers.py | 2 +- test/unit/test_s3_utils.py | 2 +- test/unit/test_training.py | 2 +- 22 files changed, 22 insertions(+), 22 deletions(-) diff --git a/docker/1.15.0/py2/dockerd-entrypoint.py b/docker/1.15.0/py2/dockerd-entrypoint.py index b9231abc..cd222026 100644 --- a/docker/1.15.0/py2/dockerd-entrypoint.py +++ b/docker/1.15.0/py2/dockerd-entrypoint.py @@ -1,4 +1,4 @@ -# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/docker/1.15.0/py3/dockerd-entrypoint.py b/docker/1.15.0/py3/dockerd-entrypoint.py index b9231abc..cd222026 100644 --- a/docker/1.15.0/py3/dockerd-entrypoint.py +++ b/docker/1.15.0/py3/dockerd-entrypoint.py @@ -1,4 +1,4 @@ -# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/scripts/build_all.py b/scripts/build_all.py index de7913d3..9f340d5d 100644 --- a/scripts/build_all.py +++ b/scripts/build_all.py @@ -1,4 +1,4 @@ -# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/scripts/publish_all.py b/scripts/publish_all.py index 092ae113..2c78e8a7 100644 --- a/scripts/publish_all.py +++ b/scripts/publish_all.py @@ -1,4 +1,4 @@ -# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/setup.py b/setup.py index 02c007c4..11c8be66 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/src/sagemaker_tensorflow_container/deep_learning_container.py b/src/sagemaker_tensorflow_container/deep_learning_container.py index 0776dfb3..7e3967c7 100644 --- a/src/sagemaker_tensorflow_container/deep_learning_container.py +++ b/src/sagemaker_tensorflow_container/deep_learning_container.py @@ -1,4 +1,4 @@ -# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/src/sagemaker_tensorflow_container/s3_utils.py b/src/sagemaker_tensorflow_container/s3_utils.py index 22e2ef74..0137ef25 100644 --- a/src/sagemaker_tensorflow_container/s3_utils.py +++ b/src/sagemaker_tensorflow_container/s3_utils.py @@ -1,4 +1,4 @@ -# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/src/sagemaker_tensorflow_container/training.py b/src/sagemaker_tensorflow_container/training.py index 5b176a28..bce6a69c 100644 --- a/src/sagemaker_tensorflow_container/training.py +++ b/src/sagemaker_tensorflow_container/training.py @@ -1,4 +1,4 @@ -# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/local/test_horovod.py b/test/integration/local/test_horovod.py index 2d4e9ce3..f35ba03a 100644 --- a/test/integration/local/test_horovod.py +++ b/test/integration/local/test_horovod.py @@ -1,4 +1,4 @@ -# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. 
A copy of diff --git a/test/integration/local/test_keras.py b/test/integration/local/test_keras.py index 2e473bf9..1eca0c2a 100644 --- a/test/integration/local/test_keras.py +++ b/test/integration/local/test_keras.py @@ -1,4 +1,4 @@ -# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/local/test_training.py b/test/integration/local/test_training.py index 6a2bab25..bd1641b0 100644 --- a/test/integration/local/test_training.py +++ b/test/integration/local/test_training.py @@ -1,4 +1,4 @@ -# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/sagemaker/test_horovod.py b/test/integration/sagemaker/test_horovod.py index 08e41704..1d2bd8ac 100644 --- a/test/integration/sagemaker/test_horovod.py +++ b/test/integration/sagemaker/test_horovod.py @@ -1,4 +1,4 @@ -# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py index 15e51a99..25c8db3e 100644 --- a/test/integration/sagemaker/test_mnist.py +++ b/test/integration/sagemaker/test_mnist.py @@ -1,4 +1,4 @@ -# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2020 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/sagemaker/test_tuning_model_dir.py b/test/integration/sagemaker/test_tuning_model_dir.py index 604d4c93..e833c3a4 100644 --- a/test/integration/sagemaker/test_tuning_model_dir.py +++ b/test/integration/sagemaker/test_tuning_model_dir.py @@ -1,4 +1,4 @@ -# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/sagemaker/timeout.py b/test/integration/sagemaker/timeout.py index 4360987a..d4738d32 100644 --- a/test/integration/sagemaker/timeout.py +++ b/test/integration/sagemaker/timeout.py @@ -1,4 +1,4 @@ -# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. diff --git a/test/integration/utils.py b/test/integration/utils.py index 83271f67..4944eb20 100644 --- a/test/integration/utils.py +++ b/test/integration/utils.py @@ -1,4 +1,4 @@ -# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. 
A copy of diff --git a/test/resources/mnist/horovod_mnist.py b/test/resources/mnist/horovod_mnist.py index cb5f81c6..1014f2bb 100644 --- a/test/resources/mnist/horovod_mnist.py +++ b/test/resources/mnist/horovod_mnist.py @@ -1,4 +1,4 @@ -# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/resources/test_py_version/entry.py b/test/resources/test_py_version/entry.py index e844e07c..8f71a01b 100644 --- a/test/resources/test_py_version/entry.py +++ b/test/resources/test_py_version/entry.py @@ -1,4 +1,4 @@ -# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/resources/tuning_model_dir/entry.py b/test/resources/tuning_model_dir/entry.py index 2fae72fc..0bce7165 100644 --- a/test/resources/tuning_model_dir/entry.py +++ b/test/resources/tuning_model_dir/entry.py @@ -1,4 +1,4 @@ -# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/unit/test_deep_learning_containers.py b/test/unit/test_deep_learning_containers.py index 2da6959c..8d6fe08e 100644 --- a/test/unit/test_deep_learning_containers.py +++ b/test/unit/test_deep_learning_containers.py @@ -1,4 +1,4 @@ -# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'). You # may not use this file except in compliance with the License. A copy of diff --git a/test/unit/test_s3_utils.py b/test/unit/test_s3_utils.py index fa2cef6b..03de70a3 100644 --- a/test/unit/test_s3_utils.py +++ b/test/unit/test_s3_utils.py @@ -1,4 +1,4 @@ -# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/unit/test_training.py b/test/unit/test_training.py index f49d34ed..b69beed2 100644 --- a/test/unit/test_training.py +++ b/test/unit/test_training.py @@ -1,4 +1,4 @@ -# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. 
A copy of From 97be95230f160e4aef58387330f2077c8e25e58b Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Date: Thu, 9 Jan 2020 11:32:03 -0800 Subject: [PATCH 03/10] update: Update buildspec for TF 1.15.0 (#265) * Change path for entrypoint script * Change path for deep learning container script * Change build context to folder containing dockerfiles * Update buildspec * Update buildspec-release --- buildspec-release.yml | 22 +++--- buildspec.yml | 79 +++++++++++-------- docker/__init__.py | 0 docker/build_artifacts/__init__.py | 0 .../deep_learning_container.py | 0 .../dockerd-entrypoint.py | 0 ...ers.py => test_deep_learning_container.py} | 3 +- 7 files changed, 60 insertions(+), 44 deletions(-) create mode 100644 docker/__init__.py create mode 100644 docker/build_artifacts/__init__.py rename {src/sagemaker_tensorflow_container => docker/build_artifacts}/deep_learning_container.py (100%) rename docker/{1.15.0/py2 => build_artifacts}/dockerd-entrypoint.py (100%) rename test/unit/{test_deep_learning_containers.py => test_deep_learning_container.py} (98%) diff --git a/buildspec-release.yml b/buildspec-release.yml index a4ff55a5..2e5a9a86 100644 --- a/buildspec-release.yml +++ b/buildspec-release.yml @@ -2,7 +2,7 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '1.13.1' + FRAMEWORK_VERSION: '1.15.0' GPU_INSTANCE_TYPE: 'ml.p2.xlarge' SETUP_FILE: 'setup_cmds.sh' SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]' @@ -60,21 +60,21 @@ phases: echo '[{ "repository": "sagemaker-tensorflow-scriptmode", "tags": [{ - "source": "1.13.1-cpu-py2", - "dest": ["1.13.1-cpu-py2", "1.13-cpu-py2", "1.13.1-cpu-py2-'${CODEBUILD_BUILD_ID#*:}'"] + "source": "1.15.0-cpu-py2", + "dest": ["1.15.0-cpu-py2", "1.15-cpu-py2", "1.15.0-cpu-py2-'${CODEBUILD_BUILD_ID#*:}'"] },{ - "source": "1.13.1-cpu-py3", - "dest": ["1.13.1-cpu-py3", "1.13-cpu-py3", "1.13.1-cpu-py3-'${CODEBUILD_BUILD_ID#*:}'"] 
+ "source": "1.15.0-cpu-py3", + "dest": ["1.15.0-cpu-py3", "1.15-cpu-py3", "1.15.0-cpu-py3-'${CODEBUILD_BUILD_ID#*:}'"] },{ - "source": "1.13.1-gpu-py2", - "dest": ["1.13.1-gpu-py2", "1.13-gpu-py2", "1.13.1-gpu-py2-'${CODEBUILD_BUILD_ID#*:}'"] + "source": "1.15.0-gpu-py2", + "dest": ["1.15.0-gpu-py2", "1.15-gpu-py2", "1.15.0-gpu-py2-'${CODEBUILD_BUILD_ID#*:}'"] },{ - "source": "1.13.1-gpu-py3", - "dest": ["1.13.1-gpu-py3", "1.13-gpu-py3", "1.13.1-gpu-py3-'${CODEBUILD_BUILD_ID#*:}'"] + "source": "1.15.0-gpu-py3", + "dest": ["1.15.0-gpu-py3", "1.15-gpu-py3", "1.15.0-gpu-py3-'${CODEBUILD_BUILD_ID#*:}'"] }], "test": [ - "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --instance-type {cpu-instance-type} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.13.1 --processor cpu --py-version 2,3", - "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.13.1 --processor gpu --py-version 2,3" + "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --instance-type {cpu-instance-type} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.15.0 --processor cpu --py-version 2,3", + "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.15.0 --processor gpu --py-version 2,3" ] }]' > deployments.json diff --git a/buildspec.yml b/buildspec.yml index 214cdcca..d59393c3 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -2,11 +2,7 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '1.13.1' - CPU_FRAMEWORK_BINARY: 
'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl' - CPU_PY_VERSION: '3' - GPU_FRAMEWORK_BINARY: 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl' - GPU_PY_VERSION: '3' + FRAMEWORK_VERSION: '1.15.0' ECR_REPO: 'sagemaker-test' GITHUB_REPO: 'sagemaker-tensorflow-container' SETUP_FILE: 'setup_cmds.sh' @@ -34,42 +30,56 @@ phases: - tox -e py36,py27 test/unit # Create pip archive - - build_dir="docker/$FRAMEWORK_VERSION" + - root_dir=$(pwd) - build_id="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')" - python3 setup.py sdist - tar_name=$(ls dist) - - cp dist/$tar_name $build_dir - # build cpu image - - cpu_dockerfile="Dockerfile.cpu" + # Find build artifacts + - build_artifacts=$root_dir/docker/artifacts - # Download framework binary - - cpu_fw_binary=$(basename $CPU_FRAMEWORK_BINARY) - - wget -O $build_dir/$cpu_fw_binary $CPU_FRAMEWORK_BINARY - - - CPU_TAG="$FRAMEWORK_VERSION-cpu-py$CPU_PY_VERSION-$build_id" + # build py2 images + # prepare build context + - build_dir="$root_dir/docker/$FRAMEWORK_VERSION/py2" + - cp $root_dir/dist/$tar_name $build_dir + - cp $build_artifacts/* $build_dir/ - cd $build_dir - - docker build -f $cpu_dockerfile --build-arg framework_support_installable=$tar_name --build-arg py_version=$CPU_PY_VERSION --build-arg framework_installable=$cpu_fw_binary -t $PREPROD_IMAGE:$CPU_TAG . - - cd ../../ + + # build cpu image + - cpu_dockerfile="Dockerfile.cpu" + - CPU_TAG_PY2="$FRAMEWORK_VERSION-cpu-py2-$build_id" + - docker build -f $cpu_dockerfile -t $PREPROD_IMAGE:$CPU_TAG_PY2 . # build gpu image - gpu_dockerfile="Dockerfile.gpu" + - GPU_TAG_PY2="$FRAMEWORK_VERSION-gpu-py2-$build_id" + - docker build -f $gpu_dockerfile -t $PREPROD_IMAGE:$GPU_TAG_PY2 . 
- # Download framework binary - - gpu_fw_binary=$(basename $GPU_FRAMEWORK_BINARY) - - wget -O $build_dir/$gpu_fw_binary $GPU_FRAMEWORK_BINARY - - - GPU_TAG="$FRAMEWORK_VERSION-gpu-py$GPU_PY_VERSION-$build_id" + # build py3 images + # prepare build context + - build_dir="$root_dir/docker/$FRAMEWORK_VERSION/py3" + - cp $root_dir/dist/$tar_name $build_dir + - cp $build_artifacts/* $build_dir/ - cd $build_dir - - docker build -f $gpu_dockerfile --build-arg framework_support_installable=$tar_name --build-arg py_version=$GPU_PY_VERSION --build-arg framework_installable=$gpu_fw_binary -t $PREPROD_IMAGE:$GPU_TAG . - - cd ../../ + + # build cpu image + - cpu_dockerfile="Dockerfile.cpu" + - CPU_TAG_PY3="$FRAMEWORK_VERSION-cpu-py3-$build_id" + - docker build -f $cpu_dockerfile -t $PREPROD_IMAGE:$CPU_TAG_PY3 . + + # build gpu image + - gpu_dockerfile="Dockerfile.gpu" + - GPU_TAG_PY3="$FRAMEWORK_VERSION-gpu-py3-$build_id" + - docker build -f $gpu_dockerfile -t $PREPROD_IMAGE:$GPU_TAG_PY3 . # push images to ecr - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) - - docker push $PREPROD_IMAGE:$CPU_TAG - - docker push $PREPROD_IMAGE:$GPU_TAG + - docker push $PREPROD_IMAGE:$CPU_TAG_PY2 + - docker push $PREPROD_IMAGE:$GPU_TAG_PY2 + - docker push $PREPROD_IMAGE:$CPU_TAG_PY3 + - docker push $PREPROD_IMAGE:$GPU_TAG_PY3 # launch remote gpu instance - instance_type='p2.xlarge' @@ -79,7 +89,8 @@ phases: # run cpu integration tests - | if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then - pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu + pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor cpu + pytest test/integration/local --region 
$AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor cpu else echo "skipping cpu integration tests" fi @@ -88,7 +99,9 @@ phases: - | if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then printf "$SETUP_CMDS" > $SETUP_FILE - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu" + cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor gpu" + remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM" + cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor gpu" remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM" else echo "skipping gpu integration tests" @@ -97,8 +110,10 @@ phases: # run sagemaker tests - | if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then - pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG --py-version $CPU_PY_VERSION --processor cpu - pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG --py-version $GPU_PY_VERSION --processor gpu + pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY2 --py-version 2 --processor cpu + pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY2 --py-version 2 
--processor gpu + pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY3 --py-version 3 --processor cpu + pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY3 --py-version 3 --processor gpu else echo "skipping sagemaker tests" fi @@ -109,5 +124,7 @@ phases: - cleanup-key-pairs # remove ecr image - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG + - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG_PY2 + - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG_PY2 + - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG_PY3 + - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG_PY3 diff --git a/docker/__init__.py b/docker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docker/build_artifacts/__init__.py b/docker/build_artifacts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/sagemaker_tensorflow_container/deep_learning_container.py b/docker/build_artifacts/deep_learning_container.py similarity index 100% rename from src/sagemaker_tensorflow_container/deep_learning_container.py rename to docker/build_artifacts/deep_learning_container.py diff --git a/docker/1.15.0/py2/dockerd-entrypoint.py b/docker/build_artifacts/dockerd-entrypoint.py similarity index 100% rename from docker/1.15.0/py2/dockerd-entrypoint.py rename to docker/build_artifacts/dockerd-entrypoint.py diff --git a/test/unit/test_deep_learning_containers.py 
b/test/unit/test_deep_learning_container.py similarity index 98% rename from test/unit/test_deep_learning_containers.py rename to test/unit/test_deep_learning_container.py index 8d6fe08e..7d5d7d86 100644 --- a/test/unit/test_deep_learning_containers.py +++ b/test/unit/test_deep_learning_container.py @@ -14,11 +14,10 @@ import unittest +from docker.build_artifacts import deep_learning_container as deep_learning_container_to_test import pytest import requests -from sagemaker_tensorflow_container import deep_learning_container as deep_learning_container_to_test - @pytest.fixture(name='fixture_valid_instance_id') def fixture_valid_instance_id(requests_mock): From 2792fcbd3a9de288f3f8660e2ca9d186b6f23024 Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Date: Fri, 10 Jan 2020 19:45:11 -0800 Subject: [PATCH 04/10] update: Update awscli version and remove related pins (#267) * Update awscli, remove dependent pins * Update setup.py package versions to latest --- docker/1.15.0/py2/Dockerfile.cpu | 6 +----- docker/1.15.0/py2/Dockerfile.gpu | 6 +----- docker/1.15.0/py3/Dockerfile.cpu | 8 +------- docker/1.15.0/py3/Dockerfile.gpu | 8 +------- setup.py | 8 ++++---- 5 files changed, 8 insertions(+), 28 deletions(-) diff --git a/docker/1.15.0/py2/Dockerfile.cpu b/docker/1.15.0/py2/Dockerfile.cpu index f9387aa0..4fa24019 100644 --- a/docker/1.15.0/py2/Dockerfile.cpu +++ b/docker/1.15.0/py2/Dockerfile.cpu @@ -100,9 +100,6 @@ RUN pip install --no-cache-dir -U \ keras_preprocessing==1.1.0 \ requests==2.22.0 \ keras==2.3.1 \ - # botocore requires python-dateutil<2.8.1 - "python-dateutil<2.8.1" \ - awscli==1.16.296 \ mpi4py==3.0.2 \ "cryptography>=2.3" \ "sagemaker-tensorflow>=1.15,<1.16" \ @@ -111,10 +108,9 @@ RUN pip install --no-cache-dir -U \ ${TF_URL} \ && pip install --no-cache-dir -U \ $FRAMEWORK_SUPPORT_INSTALLABLE \ + awscli==1.16.314 \ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \ && pip install --no-cache-dir -U \ - # awscli 
requires PyYAML<5.2 - "PyYAML<5.2" \ horovod==0.18.2 COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py diff --git a/docker/1.15.0/py2/Dockerfile.gpu b/docker/1.15.0/py2/Dockerfile.gpu index fa86d9dc..534066e8 100644 --- a/docker/1.15.0/py2/Dockerfile.gpu +++ b/docker/1.15.0/py2/Dockerfile.gpu @@ -133,9 +133,6 @@ RUN pip install --no-cache-dir -U \ keras_preprocessing==1.1.0 \ requests==2.22.0 \ keras==2.3.1 \ - # botocore requires python-dateutil<2.8.1 - "python-dateutil<2.8.1" \ - awscli==1.16.296 \ mpi4py==3.0.2 \ "cryptography>=2.3" \ "sagemaker-tensorflow>=1.15,<1.16" \ @@ -144,13 +141,12 @@ RUN pip install --no-cache-dir -U \ ${TF_URL} \ && pip install --no-cache-dir -U \ $FRAMEWORK_SUPPORT_INSTALLABLE \ + awscli==1.16.314 \ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE # Install Horovod, temporarily using CUDA stubs RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ - # awscli requires PyYAML<5.2 - "PyYAML<5.2" \ horovod==0.18.2 \ && ldconfig diff --git a/docker/1.15.0/py3/Dockerfile.cpu b/docker/1.15.0/py3/Dockerfile.cpu index b204769a..7eaef285 100644 --- a/docker/1.15.0/py3/Dockerfile.cpu +++ b/docker/1.15.0/py3/Dockerfile.cpu @@ -88,8 +88,6 @@ RUN pip3 --no-cache-dir install --upgrade \ RUN ln -s $(which python3) /usr/local/bin/python \ && ln -s $(which pip3) /usr/bin/pip -# install PyYAML==5.1.2 to avoid conflict with latest awscli -# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli RUN pip install --no-cache-dir -U \ numpy==1.17.4 \ scipy==1.2.2 \ @@ -100,12 +98,9 @@ RUN pip install --no-cache-dir -U \ keras_applications==1.0.8 \ keras_preprocessing==1.1.0 \ keras==2.3.1 \ - # botocore requires python-dateutil<2.8.1 - "python-dateutil<2.8.1" \ requests==2.22.0 \ smdebug==0.4.14 \ sagemaker-experiments==0.1.3 \ - awscli==1.16.296 \ mpi4py==3.0.2 \ "cryptography>=2.3" \ "sagemaker-tensorflow>=1.15,<1.16" \ @@ -114,11 +109,10 @@ 
RUN pip install --no-cache-dir -U \ && pip install --force-reinstall --no-cache-dir -U \ ${TF_URL} \ && pip install --force-reinstall --no-cache-dir -U \ - # awscli requires PyYAML<5.2 - "PyYAML<5.2" \ horovod==0.18.2 \ && pip install --no-cache-dir -U \ $FRAMEWORK_SUPPORT_INSTALLABLE \ + awscli==1.16.314 \ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py diff --git a/docker/1.15.0/py3/Dockerfile.gpu b/docker/1.15.0/py3/Dockerfile.gpu index 38c86b14..d912f5ba 100644 --- a/docker/1.15.0/py3/Dockerfile.gpu +++ b/docker/1.15.0/py3/Dockerfile.gpu @@ -125,8 +125,6 @@ RUN ln -s $(which python3) /usr/local/bin/python \ COPY $FRAMEWORK_SUPPORT_INSTALLABLE . -# install PyYAML==5.1.2 to avoid conflict with latest awscli -# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli RUN pip install --no-cache-dir -U \ numpy==1.17.4 \ scipy==1.2.2 \ @@ -138,11 +136,8 @@ RUN pip install --no-cache-dir -U \ keras_preprocessing==1.1.0 \ requests==2.22.0 \ keras==2.3.1 \ - # botocore requires python-dateutil<2.8.1 - "python-dateutil<2.8.1" \ smdebug==0.4.14 \ sagemaker-experiments==0.1.3 \ - awscli==1.16.296 \ mpi4py==3.0.2 \ "cryptography>=2.3" \ "sagemaker-tensorflow>=1.15,<1.16" \ @@ -152,13 +147,12 @@ RUN pip install --no-cache-dir -U \ ${TF_URL} \ && pip install --no-cache-dir -U \ $FRAMEWORK_SUPPORT_INSTALLABLE \ + awscli==1.16.314 \ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE # Install Horovod, temporarily using CUDA stubs RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \ && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ - # awscli requires PyYAML<5.2 - "PyYAML<5.2" \ horovod==0.18.2 \ && ldconfig diff --git a/setup.py b/setup.py index 11c8be66..6b7537f9 100644 --- a/setup.py +++ b/setup.py @@ -53,13 +53,13 @@ def read_version(): 'Programming Language :: Python :: 3.6', ], - install_requires=['sagemaker-containers>=2.4.6', 'numpy', 'scipy', 'sklearn', + 
install_requires=['sagemaker-containers>=2.6.2', 'numpy', 'scipy', 'sklearn', 'pandas', 'Pillow', 'h5py'], extras_require={ 'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock', - 'sagemaker==1.19.1', 'tensorflow<2.0', 'docker-compose', 'boto3==1.10.32', - 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore==1.13.32', - 'requests-mock', 'awscli==1.16.296'], + 'sagemaker==1.50.1', 'tensorflow<2.0', 'docker-compose', 'boto3==1.10.50', + 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore==1.13.50', + 'requests-mock', 'awscli==1.16.314'], 'benchmark': ['click'] }, ) From f0a557c546ec7fe185594aa2e553dc2e8f8d217e Mon Sep 17 00:00:00 2001 From: Denis Davydenko Date: Fri, 10 Jan 2020 20:01:00 -0800 Subject: [PATCH 05/10] bump smdebug version to 0.5.0.post0 (#268) * bump smdebug to 0.5.0 * changed awscli to prevent botocore conflict --- docker/1.15.0/py3/Dockerfile.cpu | 2 +- docker/1.15.0/py3/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/1.15.0/py3/Dockerfile.cpu b/docker/1.15.0/py3/Dockerfile.cpu index 7eaef285..4b72f937 100644 --- a/docker/1.15.0/py3/Dockerfile.cpu +++ b/docker/1.15.0/py3/Dockerfile.cpu @@ -99,7 +99,7 @@ RUN pip install --no-cache-dir -U \ keras_preprocessing==1.1.0 \ keras==2.3.1 \ requests==2.22.0 \ - smdebug==0.4.14 \ + smdebug==0.5.0.post0 \ sagemaker-experiments==0.1.3 \ mpi4py==3.0.2 \ "cryptography>=2.3" \ diff --git a/docker/1.15.0/py3/Dockerfile.gpu b/docker/1.15.0/py3/Dockerfile.gpu index d912f5ba..3a409e73 100644 --- a/docker/1.15.0/py3/Dockerfile.gpu +++ b/docker/1.15.0/py3/Dockerfile.gpu @@ -136,7 +136,7 @@ RUN pip install --no-cache-dir -U \ keras_preprocessing==1.1.0 \ requests==2.22.0 \ keras==2.3.1 \ - smdebug==0.4.14 \ + smdebug==0.5.0.post0 \ sagemaker-experiments==0.1.3 \ mpi4py==3.0.2 \ "cryptography>=2.3" \ From ca1a008172f846f9d9ea52f255dcac032da3d15c Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Date: 
Mon, 13 Jan 2020 15:20:52 -0800 Subject: [PATCH 06/10] documentation: Add link to TF 2.0 branch (#269) * Add link to TF 2.0 branch * Add url for dockerfiles --- README.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.rst b/README.rst index 6e83d245..72418561 100644 --- a/README.rst +++ b/README.rst @@ -56,6 +56,10 @@ The Docker images are built from the Dockerfiles specified in The Docker files are grouped based on TensorFlow version and separated based on Python version and processor type. +The Docker files for TensorFlow 2.0 are available in the +`tf-2 `__ branch, in +`docker/2.0.0/ `__. + The Docker images, used to run training & inference jobs, are built from both corresponding "base" and "final" Dockerfiles. From 3c384adab5960e18651463a5850f8b22680c7c4e Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Date: Mon, 13 Jan 2020 16:40:18 -0800 Subject: [PATCH 07/10] Pin awscli to latest (#270) --- docker/1.15.0/py3/Dockerfile.cpu | 2 +- docker/1.15.0/py3/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/1.15.0/py3/Dockerfile.cpu b/docker/1.15.0/py3/Dockerfile.cpu index 4b72f937..7e60c4f4 100644 --- a/docker/1.15.0/py3/Dockerfile.cpu +++ b/docker/1.15.0/py3/Dockerfile.cpu @@ -112,7 +112,7 @@ RUN pip install --no-cache-dir -U \ horovod==0.18.2 \ && pip install --no-cache-dir -U \ $FRAMEWORK_SUPPORT_INSTALLABLE \ - awscli==1.16.314 \ + awscli==1.17.1 \ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py diff --git a/docker/1.15.0/py3/Dockerfile.gpu b/docker/1.15.0/py3/Dockerfile.gpu index 3a409e73..c16fb42d 100644 --- a/docker/1.15.0/py3/Dockerfile.gpu +++ b/docker/1.15.0/py3/Dockerfile.gpu @@ -147,7 +147,7 @@ RUN pip install --no-cache-dir -U \ ${TF_URL} \ && pip install --no-cache-dir -U \ $FRAMEWORK_SUPPORT_INSTALLABLE \ - awscli==1.16.314 \ + awscli==1.17.1 \ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE # 
Install Horovod, temporarily using CUDA stubs From c46b6f60848388487cef71705ca209ac539e3c72 Mon Sep 17 00:00:00 2001 From: Arjuna Keshavan <33526713+arjkesh@users.noreply.github.com> Date: Tue, 21 Jan 2020 15:56:16 -0800 Subject: [PATCH 08/10] pin awscli to latest version (#272) awscli==1.17.7 --- docker/1.15.0/py2/Dockerfile.cpu | 2 +- docker/1.15.0/py2/Dockerfile.gpu | 2 +- docker/1.15.0/py3/Dockerfile.cpu | 2 +- docker/1.15.0/py3/Dockerfile.gpu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/1.15.0/py2/Dockerfile.cpu b/docker/1.15.0/py2/Dockerfile.cpu index 4fa24019..5a161f80 100644 --- a/docker/1.15.0/py2/Dockerfile.cpu +++ b/docker/1.15.0/py2/Dockerfile.cpu @@ -108,7 +108,7 @@ RUN pip install --no-cache-dir -U \ ${TF_URL} \ && pip install --no-cache-dir -U \ $FRAMEWORK_SUPPORT_INSTALLABLE \ - awscli==1.16.314 \ + awscli==1.17.7 \ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \ && pip install --no-cache-dir -U \ horovod==0.18.2 diff --git a/docker/1.15.0/py2/Dockerfile.gpu b/docker/1.15.0/py2/Dockerfile.gpu index 534066e8..50b1484f 100644 --- a/docker/1.15.0/py2/Dockerfile.gpu +++ b/docker/1.15.0/py2/Dockerfile.gpu @@ -141,7 +141,7 @@ RUN pip install --no-cache-dir -U \ ${TF_URL} \ && pip install --no-cache-dir -U \ $FRAMEWORK_SUPPORT_INSTALLABLE \ - awscli==1.16.314 \ + awscli==1.17.7 \ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE # Install Horovod, temporarily using CUDA stubs diff --git a/docker/1.15.0/py3/Dockerfile.cpu b/docker/1.15.0/py3/Dockerfile.cpu index 7e60c4f4..d0fe3027 100644 --- a/docker/1.15.0/py3/Dockerfile.cpu +++ b/docker/1.15.0/py3/Dockerfile.cpu @@ -112,7 +112,7 @@ RUN pip install --no-cache-dir -U \ horovod==0.18.2 \ && pip install --no-cache-dir -U \ $FRAMEWORK_SUPPORT_INSTALLABLE \ - awscli==1.17.1 \ + awscli==1.17.7 \ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py diff --git a/docker/1.15.0/py3/Dockerfile.gpu b/docker/1.15.0/py3/Dockerfile.gpu index 
c16fb42d..68c68383 100644 --- a/docker/1.15.0/py3/Dockerfile.gpu +++ b/docker/1.15.0/py3/Dockerfile.gpu @@ -147,7 +147,7 @@ RUN pip install --no-cache-dir -U \ ${TF_URL} \ && pip install --no-cache-dir -U \ $FRAMEWORK_SUPPORT_INSTALLABLE \ - awscli==1.17.1 \ + awscli==1.17.7 \ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE # Install Horovod, temporarily using CUDA stubs From 8961aacea642219f0679b35b462671c96c00b461 Mon Sep 17 00:00:00 2001 From: Lauren Yu <6631887+laurenyu@users.noreply.github.com> Date: Fri, 24 Jan 2020 16:31:22 -0800 Subject: [PATCH 09/10] infra: properly fail build if has-matching-changes fails (#273) --- buildspec.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/buildspec.yml b/buildspec.yml index d59393c3..5deb4c15 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -86,9 +86,11 @@ phases: - create-key-pair - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu + - HAS_MATCHING_CHANGES_OUTPUT=$(has-matching-changes "test/" "tests/" "src/*.py" "setup.py" "docker/*" "buildspec.yml") + # run cpu integration tests - | - if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then + if [ "$HAS_MATCHING_CHANGES" = "Changes Found" ] ; then pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor cpu pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor cpu else @@ -97,7 +99,7 @@ phases: # run gpu integration tests - | - if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then + if [ "$HAS_MATCHING_CHANGES" = "Changes Found" ] ; then printf "$SETUP_CMDS" > $SETUP_FILE cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY2 --framework-version 
$FRAMEWORK_VERSION --py-version 2 --processor gpu" remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM" @@ -109,7 +111,7 @@ phases: # run sagemaker tests - | - if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then + if [ "$HAS_MATCHING_CHANGES" = "Changes Found" ] ; then pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY2 --py-version 2 --processor cpu pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY2 --py-version 2 --processor gpu pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY3 --py-version 3 --processor cpu From f00a3b12fe7b83378f8f65ad233b95cd1671e82c Mon Sep 17 00:00:00 2001 From: Lauren Yu <6631887+laurenyu@users.noreply.github.com> Date: Mon, 27 Jan 2020 17:57:55 -0800 Subject: [PATCH 10/10] infra: properly fail build if has-matching-changes fails (#274) --- buildspec.yml | 48 ++++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/buildspec.yml b/buildspec.yml index 5deb4c15..eece6ae1 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -86,39 +86,31 @@ phases: - create-key-pair - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu - - HAS_MATCHING_CHANGES_OUTPUT=$(has-matching-changes "test/" "tests/" "src/*.py" "setup.py" "docker/*" "buildspec.yml") - # run cpu integration tests - - | - if [ "$HAS_MATCHING_CHANGES" = "Changes Found" ] ; then - pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor cpu - pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY3 
--framework-version $FRAMEWORK_VERSION --py-version 3 --processor cpu - else - echo "skipping cpu integration tests" - fi + - py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor cpu" + - py2_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor cpu" + - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" + - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" # run gpu integration tests - - | - if [ "$HAS_MATCHING_CHANGES" = "Changes Found" ] ; then - printf "$SETUP_CMDS" > $SETUP_FILE - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor gpu" - remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM" - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor gpu" - remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM" - else - echo "skipping gpu integration tests" - fi + - printf "$SETUP_CMDS" > $SETUP_FILE + - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor gpu" + - py3_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\"" + - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" + + - cmd="pytest 
test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor gpu" + - py2_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\"" + - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" # run sagemaker tests - - | - if [ "$HAS_MATCHING_CHANGES" = "Changes Found" ] ; then - pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY2 --py-version 2 --processor cpu - pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY2 --py-version 2 --processor gpu - pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY3 --py-version 3 --processor cpu - pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY3 --py-version 3 --processor gpu - else - echo "skipping sagemaker tests" - fi + - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY2 --py-version 2 --processor cpu" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" + - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY2 --py-version 2 --processor gpu" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" + - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY3 --py-version 3 --processor 
cpu" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" + - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY3 --py-version 3 --processor gpu" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" finally: # shut down remote gpu instance