diff --git a/README.rst b/README.rst
index 6e83d245..72418561 100644
--- a/README.rst
+++ b/README.rst
@@ -56,6 +56,10 @@ The Docker images are built from the Dockerfiles specified in
 The Docker files are grouped based on TensorFlow version and separated
 based on Python version and processor type.
 
+The Docker files for TensorFlow 2.0 are available in the
+`tf-2 <https://github.com/aws/sagemaker-tensorflow-container/tree/tf-2>`__ branch, in
+`docker/2.0.0/ <https://github.com/aws/sagemaker-tensorflow-container/tree/tf-2/docker/2.0.0>`__.
+
 The Docker images, used to run training & inference jobs, are built from
 both corresponding "base" and "final" Dockerfiles.
 
diff --git a/buildspec-release.yml b/buildspec-release.yml
index a4ff55a5..2e5a9a86 100644
--- a/buildspec-release.yml
+++ b/buildspec-release.yml
@@ -2,7 +2,7 @@ version: 0.2
 
 env:
   variables:
-    FRAMEWORK_VERSION: '1.13.1'
+    FRAMEWORK_VERSION: '1.15.0'
     GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
     SETUP_FILE: 'setup_cmds.sh'
     SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]'
@@ -60,21 +60,21 @@ phases:
          echo '[{
            "repository": "sagemaker-tensorflow-scriptmode",
            "tags": [{
-             "source": "1.13.1-cpu-py2",
-             "dest": ["1.13.1-cpu-py2", "1.13-cpu-py2", "1.13.1-cpu-py2-'${CODEBUILD_BUILD_ID#*:}'"]
+             "source": "1.15.0-cpu-py2",
+             "dest": ["1.15.0-cpu-py2", "1.15-cpu-py2", "1.15.0-cpu-py2-'${CODEBUILD_BUILD_ID#*:}'"]
            },{
-             "source": "1.13.1-cpu-py3",
-             "dest": ["1.13.1-cpu-py3", "1.13-cpu-py3", "1.13.1-cpu-py3-'${CODEBUILD_BUILD_ID#*:}'"]
+             "source": "1.15.0-cpu-py3",
+             "dest": ["1.15.0-cpu-py3", "1.15-cpu-py3", "1.15.0-cpu-py3-'${CODEBUILD_BUILD_ID#*:}'"]
            },{
-             "source": "1.13.1-gpu-py2",
-             "dest": ["1.13.1-gpu-py2", "1.13-gpu-py2", "1.13.1-gpu-py2-'${CODEBUILD_BUILD_ID#*:}'"]
+             "source": "1.15.0-gpu-py2",
+             "dest": ["1.15.0-gpu-py2", "1.15-gpu-py2", "1.15.0-gpu-py2-'${CODEBUILD_BUILD_ID#*:}'"]
            },{
-             "source": "1.13.1-gpu-py3",
-             "dest": ["1.13.1-gpu-py3", "1.13-gpu-py3", "1.13.1-gpu-py3-'${CODEBUILD_BUILD_ID#*:}'"]
+             "source": "1.15.0-gpu-py3",
+             "dest": ["1.15.0-gpu-py3", "1.15-gpu-py3", "1.15.0-gpu-py3-'${CODEBUILD_BUILD_ID#*:}'"]
            }],
            "test": [
-             "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --instance-type {cpu-instance-type} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.13.1 --processor cpu --py-version 2,3",
-             "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.13.1 --processor gpu --py-version 2,3"
+             "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --instance-type {cpu-instance-type} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.15.0 --processor cpu --py-version 2,3",
+             "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.15.0 --processor gpu --py-version 2,3"
            ]
          }]' > deployments.json
diff --git a/buildspec.yml b/buildspec.yml
index 214cdcca..eece6ae1 100644
--- a/buildspec.yml
+++ b/buildspec.yml
@@ -2,11 +2,7 @@ version: 0.2
 
 env:
   variables:
-    FRAMEWORK_VERSION: '1.13.1'
-    CPU_FRAMEWORK_BINARY: 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl'
-    CPU_PY_VERSION: '3'
-    GPU_FRAMEWORK_BINARY: 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl'
-    GPU_PY_VERSION: '3'
+    FRAMEWORK_VERSION: '1.15.0'
     ECR_REPO: 'sagemaker-test'
     GITHUB_REPO: 'sagemaker-tensorflow-container'
     SETUP_FILE: 'setup_cmds.sh'
@@ -34,42 +30,56 @@ phases:
       - tox -e py36,py27 test/unit
 
       # Create pip archive
-      - build_dir="docker/$FRAMEWORK_VERSION"
+      - root_dir=$(pwd)
       - build_id="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
       - python3 setup.py sdist
       - tar_name=$(ls dist)
-      - cp dist/$tar_name $build_dir
 
-      # build cpu image
-      - cpu_dockerfile="Dockerfile.cpu"
+      # Find build artifacts
+      - build_artifacts=$root_dir/docker/artifacts
 
-      # Download framework binary
-      - cpu_fw_binary=$(basename $CPU_FRAMEWORK_BINARY)
-      - wget -O $build_dir/$cpu_fw_binary $CPU_FRAMEWORK_BINARY
-
-      - CPU_TAG="$FRAMEWORK_VERSION-cpu-py$CPU_PY_VERSION-$build_id"
+      # build py2 images
 
+      # prepare build context
+      - build_dir="$root_dir/docker/$FRAMEWORK_VERSION/py2"
+      - cp $root_dir/dist/$tar_name $build_dir
+      - cp $build_artifacts/* $build_dir/
       - cd $build_dir
-      - docker build -f $cpu_dockerfile --build-arg framework_support_installable=$tar_name --build-arg py_version=$CPU_PY_VERSION --build-arg framework_installable=$cpu_fw_binary -t $PREPROD_IMAGE:$CPU_TAG .
-      - cd ../../
+
+      # build cpu image
+      - cpu_dockerfile="Dockerfile.cpu"
+      - CPU_TAG_PY2="$FRAMEWORK_VERSION-cpu-py2-$build_id"
+      - docker build -f $cpu_dockerfile -t $PREPROD_IMAGE:$CPU_TAG_PY2 .
 
       # build gpu image
       - gpu_dockerfile="Dockerfile.gpu"
+      - GPU_TAG_PY2="$FRAMEWORK_VERSION-gpu-py2-$build_id"
+      - docker build -f $gpu_dockerfile -t $PREPROD_IMAGE:$GPU_TAG_PY2 .
 
-      # Download framework binary
-      - gpu_fw_binary=$(basename $GPU_FRAMEWORK_BINARY)
-      - wget -O $build_dir/$gpu_fw_binary $GPU_FRAMEWORK_BINARY
-
-      - GPU_TAG="$FRAMEWORK_VERSION-gpu-py$GPU_PY_VERSION-$build_id"
+      # build py3 images
 
+      # prepare build context
+      - build_dir="$root_dir/docker/$FRAMEWORK_VERSION/py3"
+      - cp $root_dir/dist/$tar_name $build_dir
+      - cp $build_artifacts/* $build_dir/
       - cd $build_dir
-      - docker build -f $gpu_dockerfile --build-arg framework_support_installable=$tar_name --build-arg py_version=$GPU_PY_VERSION --build-arg framework_installable=$gpu_fw_binary -t $PREPROD_IMAGE:$GPU_TAG .
-      - cd ../../
+
+      # build cpu image
+      - cpu_dockerfile="Dockerfile.cpu"
+      - CPU_TAG_PY3="$FRAMEWORK_VERSION-cpu-py3-$build_id"
+      - docker build -f $cpu_dockerfile -t $PREPROD_IMAGE:$CPU_TAG_PY3 .
+
+      # build gpu image
+      - gpu_dockerfile="Dockerfile.gpu"
+      - GPU_TAG_PY3="$FRAMEWORK_VERSION-gpu-py3-$build_id"
+      - docker build -f $gpu_dockerfile -t $PREPROD_IMAGE:$GPU_TAG_PY3 .
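For reference, the tag scheme above combines the framework version, processor type, Python version, and a sanitized CodeBuild build ID (colons are not valid in Docker tags, hence the `sed -e 's/:/-/g'`; the release buildspec instead strips the project prefix with `${CODEBUILD_BUILD_ID#*:}`). A minimal Python sketch of the same construction, using a hypothetical build ID value:

```python
# Sketch of the image tag scheme used in the buildspecs above.
# The CODEBUILD_BUILD_ID value here is hypothetical.
raw_build_id = 'sagemaker-tensorflow-container:8f28a712'

build_id = raw_build_id.replace(':', '-')      # mirrors sed -e 's/:/-/g'
short_id = raw_build_id.split(':', 1)[1]       # mirrors ${CODEBUILD_BUILD_ID#*:}

framework_version = '1.15.0'
for py in ('py2', 'py3'):
    for processor in ('cpu', 'gpu'):
        tag = '{}-{}-{}-{}'.format(framework_version, processor, py, build_id)
        print(tag)  # e.g. 1.15.0-cpu-py2-sagemaker-tensorflow-container-8f28a712
```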
       # push images to ecr
       - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
-      - docker push $PREPROD_IMAGE:$CPU_TAG
-      - docker push $PREPROD_IMAGE:$GPU_TAG
+      - docker push $PREPROD_IMAGE:$CPU_TAG_PY2
+      - docker push $PREPROD_IMAGE:$GPU_TAG_PY2
+      - docker push $PREPROD_IMAGE:$CPU_TAG_PY3
+      - docker push $PREPROD_IMAGE:$GPU_TAG_PY3
 
       # launch remote gpu instance
       - instance_type='p2.xlarge'
@@ -77,31 +87,30 @@ phases:
       - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu
 
       # run cpu integration tests
-      - |
-        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
-          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu
-        else
-          echo "skipping cpu integration tests"
-        fi
+      - py2_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor cpu"
+      - py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor cpu"
+      - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
+      - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
 
       # run gpu integration tests
-      - |
-        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
-          printf "$SETUP_CMDS" > $SETUP_FILE
-          cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu"
-          remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM"
-        else
-          echo "skipping gpu integration tests"
-        fi
+      - printf "$SETUP_CMDS" > $SETUP_FILE
+      - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor gpu"
+      - py2_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
+      - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
+
+      - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor gpu"
+      - py3_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
+      - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
 
       # run sagemaker tests
-      - |
-        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
-          pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG --py-version $CPU_PY_VERSION --processor cpu
-          pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG --py-version $GPU_PY_VERSION --processor gpu
-        else
-          echo "skipping sagemaker tests"
-        fi
+      - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY2 --py-version 2 --processor cpu"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
+      - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY2 --py-version 2 --processor gpu"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
+      - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY3 --py-version 3 --processor cpu"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
+      - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY3 --py-version 3 --processor gpu"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
 
     finally:
       # shut down remote gpu instance
@@ -109,5 +118,7 @@ phases:
       - cleanup-key-pairs
 
       # remove ecr image
-      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG
-      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG
+      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG_PY2
+      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG_PY2
+      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG_PY3
+      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG_PY3
diff --git a/docker/1.15.0/py2/Dockerfile.cpu b/docker/1.15.0/py2/Dockerfile.cpu
new file mode 100644
index 00000000..5a161f80
--- /dev/null
+++ b/docker/1.15.0/py2/Dockerfile.cpu
@@ -0,0 +1,125 @@
+FROM ubuntu:18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from being stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Set environment variables for MKL
+# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
+ENV KMP_AFFINITY=granularity=fine,compact,1,0
+ENV KMP_BLOCKTIME=1
+ENV KMP_SETTINGS=0
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of the module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+    software-properties-common \
+    build-essential \
+    openssh-client \
+    openssh-server \
+    ca-certificates \
+    curl \
+    git \
+    wget \
+    vim \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+    && cd /tmp/openmpi \
+    && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+    && tar zxf openmpi-4.0.1.tar.gz \
+    && cd openmpi-4.0.1 \
+    && ./configure --enable-orterun-prefix-by-default \
+    && make -j $(nproc) all \
+    && make install \
+    && ldconfig \
+    && rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+    && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+    && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+    && chmod a+x /usr/local/bin/mpirun
+
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+    && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+    && mkdir -p /var/run/sshd \
+    && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+    && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+    && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN apt-get update \
+    && apt-get install -y \
+    python \
+    python-pip
+
+COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
+
+RUN pip --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python) /usr/local/bin/python
+
+RUN pip install --no-cache-dir -U \
+    numpy==1.16.5 \
+    scipy==1.2.2 \
+    scikit-learn==0.20.3 \
+    pandas==0.24.2 \
+    Pillow==6.2.1 \
+    h5py==2.9.0 \
+    keras_applications==1.0.8 \
+    keras_preprocessing==1.1.0 \
+    requests==2.22.0 \
+    keras==2.3.1 \
+    mpi4py==3.0.2 \
+    "cryptography>=2.3" \
+    "sagemaker-tensorflow>=1.15,<1.16" \
+    # Install TensorFlow separately at the end so that its pinned version is not overwritten
+    && pip install --force-reinstall --no-cache-dir -U \
+    ${TF_URL} \
+    && pip install --no-cache-dir -U \
+    $FRAMEWORK_SUPPORT_INSTALLABLE \
+    awscli==1.17.7 \
+    && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \
+    && pip install --no-cache-dir -U \
+    horovod==0.18.2
+
+COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \
+    && chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
+CMD ["/bin/bash"]
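The `SAGEMAKER_TRAINING_MODULE` variable set in this Dockerfile is consumed by the sagemaker-containers entrypoint machinery: conceptually it names a `module:function` pair that is imported and invoked when a training job starts. A rough illustration of that resolution, under the assumption that it works like a plain dynamic import (the real logic lives inside the sagemaker-containers package):

```python
import importlib
import os

# Illustrative only: resolve SAGEMAKER_TRAINING_MODULE into a callable.
# For this image the value is 'sagemaker_tensorflow_container.training:main'.
module_name, func_name = os.environ['SAGEMAKER_TRAINING_MODULE'].split(':')
train = getattr(importlib.import_module(module_name), func_name)
train()  # starts the framework training logic
```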
diff --git a/docker/1.15.0/py2/Dockerfile.gpu b/docker/1.15.0/py2/Dockerfile.gpu
new file mode 100644
index 00000000..50b1484f
--- /dev/null
+++ b/docker/1.15.0/py2/Dockerfile.gpu
@@ -0,0 +1,167 @@
+# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with CUDA 10.1 support, so we stick with CUDA 10.0.
+# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/
+FROM nvidia/cuda:10.0-base-ubuntu18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from being stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of the module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends --allow-unauthenticated \
+    ca-certificates \
+    cuda-command-line-tools-10-0 \
+    cuda-cublas-dev-10-0 \
+    cuda-cudart-dev-10-0 \
+    cuda-cufft-dev-10-0 \
+    cuda-curand-dev-10-0 \
+    cuda-cusolver-dev-10-0 \
+    cuda-cusparse-dev-10-0 \
+    curl \
+    libcudnn7=7.5.1.10-1+cuda10.0 \
+    # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
+    libnccl2=2.4.7-1+cuda10.0 \
+    libgomp1 \
+    libnccl-dev=2.4.7-1+cuda10.0 \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libpng-dev \
+    libzmq3-dev \
+    git \
+    wget \
+    vim \
+    build-essential \
+    openssh-client \
+    openssh-server \
+    zlib1g-dev \
+    # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0
+    # adds a new list which contains the libnvinfer library, so it needs another
+    # 'apt-get update' to retrieve that list before it can actually install the library.
+    # We don't install libnvinfer-dev since we don't need to build against TensorRT,
+    # and libnvinfer4 doesn't contain the libnvinfer.a static library.
+    && apt-get update \
+    && apt-get install -y --no-install-recommends --allow-unauthenticated \
+    nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends --allow-unauthenticated \
+    libnvinfer5=5.0.2-1+cuda10.0 \
+    && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
+    && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
+    && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /var/run/sshd
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+    && cd /tmp/openmpi \
+    && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+    && tar zxf openmpi-4.0.1.tar.gz \
+    && cd openmpi-4.0.1 \
+    && ./configure --enable-orterun-prefix-by-default \
+    && make -j $(nproc) all \
+    && make install \
+    && ldconfig \
+    && rm -rf /tmp/openmpi
+
+RUN apt-get update \
+    && apt-get install -y \
+    python \
+    python-pip
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+    && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+    && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+    && chmod a+x /usr/local/bin/mpirun
+
+# Configure OpenMPI to run with good defaults:
+# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+    && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Set default NCCL parameters
+RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+ENV PATH=/usr/local/nvidia/bin:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN mkdir -p /var/run/sshd \
+    && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+    && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+    && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+    && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN pip --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python) /usr/local/bin/python
+
+COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
+
+RUN pip install --no-cache-dir -U \
+    numpy==1.16.5 \
+    scipy==1.2.2 \
+    scikit-learn==0.20.3 \
+    pandas==0.24.2 \
+    Pillow==6.2.1 \
+    h5py==2.9.0 \
+    keras_applications==1.0.8 \
+    keras_preprocessing==1.1.0 \
+    requests==2.22.0 \
+    keras==2.3.1 \
+    mpi4py==3.0.2 \
+    "cryptography>=2.3" \
+    "sagemaker-tensorflow>=1.15,<1.16" \
+    # Install TensorFlow separately at the end so that its pinned version is not overwritten
+    && pip install --force-reinstall --no-cache-dir -U \
+    ${TF_URL} \
+    && pip install --no-cache-dir -U \
+    $FRAMEWORK_SUPPORT_INSTALLABLE \
+    awscli==1.17.7 \
+    && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
+
+# Install Horovod, temporarily using CUDA stubs
+RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
+    && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \
+    horovod==0.18.2 \
+    && ldconfig
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \
+    && echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
+    && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \
+    && chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
+CMD ["/bin/bash"]
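Because Horovod is compiled here with `HOROVOD_GPU_ALLREDUCE=NCCL` against CUDA stub libraries (the stubs are unregistered from the linker cache again by the trailing `ldconfig`), a cheap way to confirm the build linked correctly is an import smoke test inside the finished image. A minimal sketch, assuming it is run in the container on a host with the NVIDIA runtime available:

```python
# Minimal Horovod/TensorFlow smoke test for the built image.
# hvd.init() and hvd.size() also work in a single-process run.
import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()
print('TensorFlow', tf.__version__, '- Horovod world size:', hvd.size())
```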
diff --git a/docker/1.15.0/py3/Dockerfile.cpu b/docker/1.15.0/py3/Dockerfile.cpu
new file mode 100644
index 00000000..d0fe3027
--- /dev/null
+++ b/docker/1.15.0/py3/Dockerfile.cpu
@@ -0,0 +1,127 @@
+FROM ubuntu:18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from being stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Set environment variables for MKL
+# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
+ENV KMP_AFFINITY=granularity=fine,compact,1,0
+ENV KMP_BLOCKTIME=1
+ENV KMP_SETTINGS=0
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of the module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+    python3-dev \
+    python3-pip \
+    python3-setuptools \
+    software-properties-common \
+    build-essential \
+    openssh-client \
+    openssh-server \
+    ca-certificates \
+    curl \
+    git \
+    wget \
+    vim \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+    && cd /tmp/openmpi \
+    && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+    && tar zxf openmpi-4.0.1.tar.gz \
+    && cd openmpi-4.0.1 \
+    && ./configure --enable-orterun-prefix-by-default \
+    && make -j $(nproc) all \
+    && make install \
+    && ldconfig \
+    && rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+    && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+    && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+    && chmod a+x /usr/local/bin/mpirun
+
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+    && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+    && mkdir -p /var/run/sshd \
+    && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+    && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+    && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
+
+RUN pip3 --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python3) /usr/local/bin/python \
+    && ln -s $(which pip3) /usr/bin/pip
+
+RUN pip install --no-cache-dir -U \
+    numpy==1.17.4 \
+    scipy==1.2.2 \
+    scikit-learn==0.20.3 \
+    pandas==0.24.2 \
+    Pillow==6.2.1 \
+    h5py==2.9.0 \
+    keras_applications==1.0.8 \
+    keras_preprocessing==1.1.0 \
+    keras==2.3.1 \
+    requests==2.22.0 \
+    smdebug==0.5.0.post0 \
+    sagemaker-experiments==0.1.3 \
+    mpi4py==3.0.2 \
+    "cryptography>=2.3" \
+    "sagemaker-tensorflow>=1.15,<1.16" \
+    # Install TensorFlow separately at the end so that
+    # its pinned version is not overwritten
+    && pip install --force-reinstall --no-cache-dir -U \
+    ${TF_URL} \
+    && pip install --force-reinstall --no-cache-dir -U \
+    horovod==0.18.2 \
+    && pip install --no-cache-dir -U \
+    $FRAMEWORK_SUPPORT_INSTALLABLE \
+    awscli==1.17.7 \
+    && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
+
+COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \
+    && chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
+CMD ["/bin/bash"]
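Every Python dependency in these images is pinned, and the TensorFlow wheel is force-reinstalled afterwards so that nothing downgrades it. An image test can therefore assert the final resolved versions. A small verification sketch (package list abridged; run inside the built container):

```python
# Assert that the pinned packages survived the layered pip installs.
import pkg_resources

expected = {'tensorflow': '1.15.0', 'numpy': '1.17.4', 'keras': '2.3.1'}
for name, version in expected.items():
    installed = pkg_resources.get_distribution(name).version
    assert installed == version, '{}: {} != {}'.format(name, installed, version)
print('all pins intact')
```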
diff --git a/docker/1.15.0/py3/Dockerfile.gpu b/docker/1.15.0/py3/Dockerfile.gpu
new file mode 100644
index 00000000..68c68383
--- /dev/null
+++ b/docker/1.15.0/py3/Dockerfile.gpu
@@ -0,0 +1,173 @@
+# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with CUDA 10.1 support, so we stick with CUDA 10.0.
+# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/
+FROM nvidia/cuda:10.0-base-ubuntu18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from being stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of the module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends --allow-unauthenticated \
+    python3-dev \
+    python3-pip \
+    python3-setuptools \
+    ca-certificates \
+    cuda-command-line-tools-10-0 \
+    cuda-cublas-dev-10-0 \
+    cuda-cudart-dev-10-0 \
+    cuda-cufft-dev-10-0 \
+    cuda-curand-dev-10-0 \
+    cuda-cusolver-dev-10-0 \
+    cuda-cusparse-dev-10-0 \
+    curl \
+    libcudnn7=7.5.1.10-1+cuda10.0 \
+    # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
+    libnccl2=2.4.7-1+cuda10.0 \
+    libgomp1 \
+    libnccl-dev=2.4.7-1+cuda10.0 \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libpng-dev \
+    libzmq3-dev \
+    git \
+    wget \
+    vim \
+    build-essential \
+    openssh-client \
+    openssh-server \
+    zlib1g-dev \
+    # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0
+    # adds a new list which contains the libnvinfer library, so it needs another
+    # 'apt-get update' to retrieve that list before it can actually install the
+    # library.
+    # We don't install libnvinfer-dev since we don't need to build against TensorRT,
+    # and libnvinfer4 doesn't contain the libnvinfer.a static library.
+    && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
+    nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
+    && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
+    libnvinfer5=5.0.2-1+cuda10.0 \
+    && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
+    && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
+    && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /var/run/sshd
+
+###########################################################################
+# Horovod & its dependencies
+###########################################################################
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+    && cd /tmp/openmpi \
+    && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+    && tar zxf openmpi-4.0.1.tar.gz \
+    && cd openmpi-4.0.1 \
+    && ./configure --enable-orterun-prefix-by-default \
+    && make -j $(nproc) all \
+    && make install \
+    && ldconfig \
+    && rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+    && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+    && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+    && chmod a+x /usr/local/bin/mpirun
+
+# Configure OpenMPI to run with good defaults:
+# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+    && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Set default NCCL parameters
+RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+ENV PATH=/usr/local/nvidia/bin:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN mkdir -p /var/run/sshd \
+    && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+    && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+    && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+    && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN pip3 --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python3) /usr/local/bin/python \
+    && ln -s $(which pip3) /usr/bin/pip
+
+COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
+
+RUN pip install --no-cache-dir -U \
+    numpy==1.17.4 \
+    scipy==1.2.2 \
+    scikit-learn==0.20.3 \
+    pandas==0.24.2 \
+    Pillow==6.2.1 \
+    h5py==2.9.0 \
+    keras_applications==1.0.8 \
+    keras_preprocessing==1.1.0 \
+    requests==2.22.0 \
+    keras==2.3.1 \
+    smdebug==0.5.0.post0 \
+    sagemaker-experiments==0.1.3 \
+    mpi4py==3.0.2 \
+    "cryptography>=2.3" \
+    "sagemaker-tensorflow>=1.15,<1.16" \
+    # Install TensorFlow separately at the end so that
+    # its pinned version is not overwritten
+    && pip install --force-reinstall --no-cache-dir -U \
+    ${TF_URL} \
+    && pip install --no-cache-dir -U \
+    $FRAMEWORK_SUPPORT_INSTALLABLE \
+    awscli==1.17.7 \
+    && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
+
+# Install Horovod, temporarily using CUDA stubs
+RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \
+    && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \
+    horovod==0.18.2 \
+    && ldconfig
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \
+    && echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
+    && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \
+    && chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
+CMD ["/bin/bash"]
diff --git a/docker/1.15.0/py3/dockerd-entrypoint.py b/docker/1.15.0/py3/dockerd-entrypoint.py
new file mode 100644
index 00000000..cd222026
--- /dev/null
+++ b/docker/1.15.0/py3/dockerd-entrypoint.py
@@ -0,0 +1,23 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os.path
+import shlex
+import subprocess
+import sys
+
+if not os.path.exists("/opt/ml/input/config"):
+    # Launch the telemetry script in the background and discard its output.
+    # Shell tokens such as '&>/dev/null' and '&' must not be passed in the
+    # argument list: without a shell they reach the script as literal arguments.
+    with open(os.devnull, 'wb') as devnull:
+        subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'],
+                         stdout=devnull, stderr=devnull)
+
+subprocess.check_call(shlex.split(' '.join(sys.argv[1:])))
diff --git a/docker/__init__.py b/docker/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/docker/build_artifacts/__init__.py b/docker/build_artifacts/__init__.py
new file mode 100644
index 00000000..e69de29b
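The entrypoint above simply re-joins the container's arguments and executes them, so whatever CMD or `docker run` arguments follow the entrypoint become the command that runs. A small illustration of that argv handling (the argument values are hypothetical):

```python
import shlex

# What dockerd-entrypoint.py does with its arguments: sys.argv[1:] is joined
# into one string, re-split with shlex, then run via subprocess.check_call.
argv = ['dockerd-entrypoint.py', 'train']                    # hypothetical invocation
print(shlex.split(' '.join(argv[1:])))                       # ['train']

argv = ['dockerd-entrypoint.py', 'python', '-m', 'pytest', '-x']
print(shlex.split(' '.join(argv[1:])))                       # ['python', '-m', 'pytest', '-x']
```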
diff --git a/docker/build_artifacts/deep_learning_container.py b/docker/build_artifacts/deep_learning_container.py
new file mode 100644
index 00000000..7e3967c7
--- /dev/null
+++ b/docker/build_artifacts/deep_learning_container.py
@@ -0,0 +1,112 @@
+# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import json
+import logging
+import re
+
+import requests
+
+
+def _validate_instance_id(instance_id):
+    """
+    Validate instance ID
+    """
+    instance_id_regex = r'^(i-\S{17})'
+    compiled_regex = re.compile(instance_id_regex)
+    match = compiled_regex.match(instance_id)
+
+    if not match:
+        return None
+
+    return match.group(1)
+
+
+def _retrieve_instance_id():
+    """
+    Retrieve instance ID from instance metadata service
+    """
+    instance_id = None
+    url = "http://169.254.169.254/latest/meta-data/instance-id"
+    response = requests_helper(url, timeout=0.1)
+
+    if response is not None:
+        instance_id = _validate_instance_id(response.text)
+
+    return instance_id
+
+
+def _retrieve_instance_region():
+    """
+    Retrieve instance region from instance metadata service
+    """
+    region = None
+    valid_regions = ['ap-northeast-1', 'ap-northeast-2', 'ap-southeast-1', 'ap-southeast-2',
+                     'ap-south-1', 'ca-central-1', 'eu-central-1', 'eu-north-1',
+                     'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1',
+                     'us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
+
+    url = "http://169.254.169.254/latest/dynamic/instance-identity/document"
+    response = requests_helper(url, timeout=0.1)
+
+    if response is not None:
+        response_json = json.loads(response.text)
+
+        if response_json['region'] in valid_regions:
+            region = response_json['region']
+
+    return region
+
+
+def query_bucket():
+    """
+    GET request on an empty object from an Amazon S3 bucket
+    """
+    response = None
+    instance_id = _retrieve_instance_id()
+    region = _retrieve_instance_region()
+
+    if instance_id is not None and region is not None:
+        url = ("https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com"
+               "/dlc-containers.txt?x-instance-id={1}".format(region, instance_id))
+        response = requests_helper(url, timeout=0.2)
+
+    logging.debug("Query bucket finished: {}".format(response))
+
+    return response
+
+
+def requests_helper(url, timeout):
+    response = None
+    try:
+        response = requests.get(url, timeout=timeout)
+    except requests.exceptions.RequestException as e:
+        logging.error("Request exception: {}".format(e))
+
+    return response
+
+
+def main():
+    """
+    Invoke bucket query
+    """
+    # Logs are not necessary for normal run. Remove this line while debugging.
+    logging.getLogger().disabled = True
+
+    logging.basicConfig(level=logging.ERROR)
+    query_bucket()
+
+
+if __name__ == '__main__':
+    main()
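The `_validate_instance_id` helper accepts only EC2-style IDs (`i-` followed by exactly 17 non-whitespace characters), which keeps garbage metadata responses out of the S3 query string. For example:

```python
from docker.build_artifacts import deep_learning_container as dlc

print(dlc._validate_instance_id('i-0123456789abcdef0'))  # 'i-0123456789abcdef0'
print(dlc._validate_instance_id('i-123'))                # None (too short)
print(dlc._validate_instance_id('not-an-id'))            # None (wrong prefix)
```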
diff --git a/docker/build_artifacts/dockerd-entrypoint.py b/docker/build_artifacts/dockerd-entrypoint.py
new file mode 100644
index 00000000..cd222026
--- /dev/null
+++ b/docker/build_artifacts/dockerd-entrypoint.py
@@ -0,0 +1,23 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os.path
+import shlex
+import subprocess
+import sys
+
+if not os.path.exists("/opt/ml/input/config"):
+    # Launch the telemetry script in the background and discard its output.
+    # Shell tokens such as '&>/dev/null' and '&' must not be passed in the
+    # argument list: without a shell they reach the script as literal arguments.
+    with open(os.devnull, 'wb') as devnull:
+        subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'],
+                         stdout=devnull, stderr=devnull)
+
+subprocess.check_call(shlex.split(' '.join(sys.argv[1:])))
diff --git a/scripts/build_all.py b/scripts/build_all.py
index de7913d3..9f340d5d 100644
--- a/scripts/build_all.py
+++ b/scripts/build_all.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/scripts/publish_all.py b/scripts/publish_all.py
index 092ae113..2c78e8a7 100644
--- a/scripts/publish_all.py
+++ b/scripts/publish_all.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/setup.py b/setup.py
index fa86dbd6..0af7e92c 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
@@ -53,13 +53,11 @@ def read_version():
         'Programming Language :: Python :: 3.6',
     ],
 
-    install_requires=['sagemaker-containers>=2.6.1', 'numpy', 'scipy', 'sklearn',
-                      'pandas', 'Pillow', 'h5py'],
     extras_require={
         'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock',
-                 'sagemaker==1.48.0', 'tensorflow', 'docker-compose', 'boto3>=1.10.41',
-                 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore>=1.13.41',
-                 'awscli>=1.16.305'],
+                 'sagemaker==1.50.1', 'tensorflow<2.0', 'docker-compose', 'boto3==1.10.50',
+                 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore==1.13.50',
+                 'requests-mock', 'awscli==1.16.314'],
         'benchmark': ['click']
     },
 )
diff --git a/src/sagemaker_tensorflow_container/s3_utils.py b/src/sagemaker_tensorflow_container/s3_utils.py
index 22e2ef74..0137ef25 100644
--- a/src/sagemaker_tensorflow_container/s3_utils.py
+++ b/src/sagemaker_tensorflow_container/s3_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/src/sagemaker_tensorflow_container/training.py b/src/sagemaker_tensorflow_container/training.py
index 29fbb47b..ba4fbe6a 100644
--- a/src/sagemaker_tensorflow_container/training.py
+++ b/src/sagemaker_tensorflow_container/training.py
@@ -1,4 +1,4 @@
-# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the 'License'). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/integration/local/test_horovod.py b/test/integration/local/test_horovod.py
index 2d4e9ce3..f35ba03a 100644
--- a/test/integration/local/test_horovod.py
+++ b/test/integration/local/test_horovod.py
@@ -1,4 +1,4 @@
-# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/integration/local/test_keras.py b/test/integration/local/test_keras.py
index 2e473bf9..1eca0c2a 100644
--- a/test/integration/local/test_keras.py
+++ b/test/integration/local/test_keras.py
@@ -1,4 +1,4 @@
-# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/integration/local/test_training.py b/test/integration/local/test_training.py
index 6a2bab25..bd1641b0 100644
--- a/test/integration/local/test_training.py
+++ b/test/integration/local/test_training.py
@@ -1,4 +1,4 @@
-# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/integration/sagemaker/test_horovod.py b/test/integration/sagemaker/test_horovod.py
index 08e41704..1d2bd8ac 100644
--- a/test/integration/sagemaker/test_horovod.py
+++ b/test/integration/sagemaker/test_horovod.py
@@ -1,4 +1,4 @@
-# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py
index a4381d40..ce685abc 100644
--- a/test/integration/sagemaker/test_mnist.py
+++ b/test/integration/sagemaker/test_mnist.py
@@ -1,4 +1,4 @@
-# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/integration/sagemaker/test_tuning_model_dir.py b/test/integration/sagemaker/test_tuning_model_dir.py
index 604d4c93..e833c3a4 100644
--- a/test/integration/sagemaker/test_tuning_model_dir.py
+++ b/test/integration/sagemaker/test_tuning_model_dir.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/integration/sagemaker/timeout.py b/test/integration/sagemaker/timeout.py
index 4360987a..d4738d32 100644
--- a/test/integration/sagemaker/timeout.py
+++ b/test/integration/sagemaker/timeout.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License").
 # You may not use this file except in compliance with the License.
diff --git a/test/integration/utils.py b/test/integration/utils.py
index 83271f67..4944eb20 100644
--- a/test/integration/utils.py
+++ b/test/integration/utils.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/resources/mnist/horovod_mnist.py b/test/resources/mnist/horovod_mnist.py
index 7d9e7940..c5c0b242 100644
--- a/test/resources/mnist/horovod_mnist.py
+++ b/test/resources/mnist/horovod_mnist.py
@@ -1,4 +1,4 @@
-# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/resources/mnist/mnist.py b/test/resources/mnist/mnist.py
index 47d2bcd0..e4349ce2 100644
--- a/test/resources/mnist/mnist.py
+++ b/test/resources/mnist/mnist.py
@@ -1,8 +1,11 @@
-import tensorflow as tf
 import argparse
+import json
 import os
+import sys
+
 import numpy as np
-import json
+import tensorflow as tf
 
 
 def _parse_args():
@@ -32,6 +35,18 @@ def _load_testing_data(base_dir):
     return x_test, y_test
 
 
+def assert_can_track_sagemaker_experiments():
+    in_sagemaker_training = 'TRAINING_JOB_ARN' in os.environ
+    in_python_three = sys.version_info[0] == 3
+
+    if in_sagemaker_training and in_python_three:
+        import smexperiments.tracker
+
+        with smexperiments.tracker.Tracker.load() as tracker:
+            tracker.log_parameter('param', 1)
+            tracker.log_metric('metric', 1.0)
+
+
 args, unknown = _parse_args()
 
 model = tf.keras.models.Sequential([
@@ -48,5 +63,7 @@ model = tf.keras.models.Sequential([
     x_test, y_test = _load_testing_data(args.train)
     model.fit(x_train, y_train, epochs=args.epochs)
     model.evaluate(x_test, y_test)
+
     if args.current_host == args.hosts[0]:
         model.save(os.path.join('/opt/ml/model', 'my_model.h5'))
+        assert_can_track_sagemaker_experiments()
diff --git a/test/resources/test_py_version/entry.py b/test/resources/test_py_version/entry.py
index e844e07c..8f71a01b 100644
--- a/test/resources/test_py_version/entry.py
+++ b/test/resources/test_py_version/entry.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/resources/tuning_model_dir/entry.py b/test/resources/tuning_model_dir/entry.py
index 2fae72fc..0bce7165 100644
--- a/test/resources/tuning_model_dir/entry.py
+++ b/test/resources/tuning_model_dir/entry.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
diff --git a/test/unit/test_deep_learning_container.py b/test/unit/test_deep_learning_container.py
new file mode 100644
index 00000000..7d5d7d86
--- /dev/null
+++ b/test/unit/test_deep_learning_container.py
@@ -0,0 +1,157 @@
+# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License'). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the 'license' file accompanying this file. This file is
+# distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import unittest
+
+from docker.build_artifacts import deep_learning_container as deep_learning_container_to_test
+import pytest
+import requests
+
+
+@pytest.fixture(name='fixture_valid_instance_id')
+def fixture_valid_instance_id(requests_mock):
+    return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id',
+                             text='i-123t32e11s32t1231')
+
+
+@pytest.fixture(name='fixture_invalid_instance_id')
+def fixture_invalid_instance_id(requests_mock):
+    return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', text='i-123')
+
+
+@pytest.fixture(name='fixture_none_instance_id')
+def fixture_none_instance_id(requests_mock):
+    return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', text=None)
+
+
+@pytest.fixture(name='fixture_invalid_region')
+def fixture_invalid_region(requests_mock):
+    return requests_mock.get('http://169.254.169.254/latest/dynamic/instance-identity/document',
+                             json={'region': 'test'})
+
+
+@pytest.fixture(name='fixture_valid_region')
+def fixture_valid_region(requests_mock):
+    return requests_mock.get('http://169.254.169.254/latest/dynamic/instance-identity/document',
+                             json={'region': 'us-east-1'})
+
+
+def test_retrieve_instance_id(fixture_valid_instance_id):
+    result = deep_learning_container_to_test._retrieve_instance_id()
+    assert 'i-123t32e11s32t1231' == result
+
+
+def test_retrieve_none_instance_id(fixture_none_instance_id):
+    result = deep_learning_container_to_test._retrieve_instance_id()
+    assert result is None
+
+
+def test_retrieve_invalid_instance_id(fixture_invalid_instance_id):
+    result = deep_learning_container_to_test._retrieve_instance_id()
+    assert result is None
+
+
+def test_retrieve_invalid_region(fixture_invalid_region):
+    result = deep_learning_container_to_test._retrieve_instance_region()
+    assert result is None
+
+
+def test_retrieve_valid_region(fixture_valid_region):
+    result = deep_learning_container_to_test._retrieve_instance_region()
+    assert 'us-east-1' == result
+
+
+def test_query_bucket(requests_mock, fixture_valid_region, fixture_valid_instance_id):
+    fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231'
+    fixture_valid_region.return_value = 'us-east-1'
+    requests_mock.get(('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com'
+                       '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231'),
+                      text='Access Denied')
+    actual_response = deep_learning_container_to_test.query_bucket()
+    assert 'Access Denied' == actual_response.text
+
+
+def test_query_bucket_region_none(fixture_invalid_region, fixture_valid_instance_id):
+    fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231'
+    fixture_invalid_region.return_value = None
+    actual_response = deep_learning_container_to_test.query_bucket()
+    assert actual_response is None
+
+
+def test_query_bucket_instance_id_none(requests_mock, fixture_valid_region, fixture_none_instance_id):
+    fixture_none_instance_id.return_value = None
+    fixture_valid_region.return_value = 'us-east-1'
+    actual_response = deep_learning_container_to_test.query_bucket()
+    assert actual_response is None
+
+
+def test_query_bucket_instance_id_invalid(requests_mock, fixture_valid_region, fixture_invalid_instance_id):
+    fixture_invalid_instance_id.return_value = None
+    fixture_valid_region.return_value = 'us-east-1'
+    actual_response = deep_learning_container_to_test.query_bucket()
+    assert actual_response is None
+
+
+def test_HTTP_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id):
+    fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231'
+    fixture_valid_region.return_value = 'us-east-1'
+    query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com'
+                    '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231')
+
+    requests_mock.get(query_s3_url, exc=requests.exceptions.HTTPError)
+
+    with pytest.raises(requests.exceptions.HTTPError):
+        requests.get(query_s3_url)
+
+
+def test_connection_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id):
+    fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231'
+    fixture_valid_region.return_value = 'us-east-1'
+    query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com'
+                    '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231')
+
+    requests_mock.get(query_s3_url, exc=requests.exceptions.ConnectionError)
+
+    with pytest.raises(requests.exceptions.ConnectionError):
+        requests.get(query_s3_url)
+
+
+def test_timeout_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id):
+    fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231'
+    fixture_valid_region.return_value = 'us-east-1'
+    query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com'
+                    '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231')
+
+    requests_mock.get(query_s3_url, exc=requests.Timeout)
+
+    with pytest.raises(requests.exceptions.Timeout):
+        requests.get(query_s3_url)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/unit/test_s3_utils.py b/test/unit/test_s3_utils.py
index fa2cef6b..03de70a3 100644
--- a/test/unit/test_s3_utils.py
+++ b/test/unit/test_s3_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
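These unit tests rely on the `requests-mock` pytest plugin added to the test extras in `setup.py`; its `requests_mock` fixture intercepts the instance-metadata and S3 calls so that no network access is needed. A stripped-down standalone example of the same pattern (illustrative, using the same stub endpoint as the tests above):

```python
# Standalone illustration of the requests-mock pattern used in the test file
# above (requires the requests-mock pytest plugin to be installed).
import requests


def test_imds_stub(requests_mock):
    url = 'http://169.254.169.254/latest/meta-data/instance-id'
    requests_mock.get(url, text='i-123t32e11s32t1231')

    # The stub answers instead of the real metadata service.
    assert requests.get(url, timeout=0.1).text == 'i-123t32e11s32t1231'
```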
diff --git a/test/unit/test_training.py b/test/unit/test_training.py
index 6bb41b7d..2962f5d3 100644
--- a/test/unit/test_training.py
+++ b/test/unit/test_training.py
@@ -1,4 +1,4 @@
-# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of