From e6d929d517d08583529436ec23951ab865de5cc2 Mon Sep 17 00:00:00 2001 From: Nadia Yakimakha <32335935+nadiaya@users.noreply.github.com> Date: Tue, 11 Feb 2020 14:59:03 -0800 Subject: [PATCH] Revert "Merge 'master' branch into 'tf-2' branch. (#279)" This reverts commit 555de39d8b57aa39bbb3bb7807cde06111476d5c. --- README.rst | 4 - buildspec-release.yml | 22 +-- buildspec.yml | 107 +++++------ docker/1.15.0/py2/Dockerfile.cpu | 125 ------------- docker/1.15.0/py2/Dockerfile.gpu | 167 ----------------- docker/1.15.0/py3/Dockerfile.cpu | 127 ------------- docker/1.15.0/py3/Dockerfile.gpu | 173 ------------------ docker/1.15.0/py3/dockerd-entrypoint.py | 23 --- docker/__init__.py | 0 docker/build_artifacts/__init__.py | 0 .../deep_learning_container.py | 112 ------------ docker/build_artifacts/dockerd-entrypoint.py | 23 --- scripts/build_all.py | 2 +- scripts/publish_all.py | 2 +- setup.py | 10 +- .../s3_utils.py | 2 +- .../training.py | 2 +- test/integration/local/test_horovod.py | 2 +- test/integration/local/test_keras.py | 2 +- test/integration/local/test_training.py | 2 +- test/integration/sagemaker/test_horovod.py | 2 +- test/integration/sagemaker/test_mnist.py | 2 +- .../sagemaker/test_tuning_model_dir.py | 2 +- test/integration/sagemaker/timeout.py | 2 +- test/integration/utils.py | 2 +- test/resources/mnist/horovod_mnist.py | 2 +- test/resources/mnist/mnist.py | 21 +-- test/resources/test_py_version/entry.py | 2 +- test/resources/tuning_model_dir/entry.py | 2 +- test/unit/test_deep_learning_container.py | 157 ---------------- test/unit/test_s3_utils.py | 2 +- test/unit/test_training.py | 2 +- 32 files changed, 84 insertions(+), 1021 deletions(-) delete mode 100644 docker/1.15.0/py2/Dockerfile.cpu delete mode 100644 docker/1.15.0/py2/Dockerfile.gpu delete mode 100644 docker/1.15.0/py3/Dockerfile.cpu delete mode 100644 docker/1.15.0/py3/Dockerfile.gpu delete mode 100644 docker/1.15.0/py3/dockerd-entrypoint.py delete mode 100644 docker/__init__.py delete mode 100644 docker/build_artifacts/__init__.py delete mode 100644 docker/build_artifacts/deep_learning_container.py delete mode 100644 docker/build_artifacts/dockerd-entrypoint.py delete mode 100644 test/unit/test_deep_learning_container.py diff --git a/README.rst b/README.rst index 72418561..6e83d245 100644 --- a/README.rst +++ b/README.rst @@ -56,10 +56,6 @@ The Docker images are built from the Dockerfiles specified in The Docker files are grouped based on TensorFlow version and separated based on Python version and processor type. -The Docker files for TensorFlow 2.0 are available in the -`tf-2 `__ branch, in -`docker/2.0.0/ `__. - The Docker images, used to run training & inference jobs, are built from both corresponding "base" and "final" Dockerfiles. 
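For reference, the buildspec.yml changes below restore the pre-tf-2, single-Dockerfile build flow for 1.13.1. Condensed into a standalone shell sketch, the CPU image build mirrors the restored buildspec commands roughly as follows — the image tag is illustrative (patterned on the buildspec-release.yml tags) and the wheel URL is the one pinned in the restored buildspec; this sketch is illustration only, not part of the patch:

    # Build the 1.13.1 CPU image by hand, mirroring the restored buildspec.yml
    FRAMEWORK_VERSION='1.13.1'
    build_dir="docker/$FRAMEWORK_VERSION"
    # Create the framework-support sdist and stage it in the build context
    python3 setup.py sdist
    tar_name=$(ls dist)
    cp "dist/$tar_name" "$build_dir"
    # Fetch the prebuilt AWS TensorFlow wheel pinned in the buildspec
    cpu_binary='https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl'
    cpu_fw_binary=$(basename "$cpu_binary")
    wget -O "$build_dir/$cpu_fw_binary" "$cpu_binary"
    # Build the image, passing the sdist and wheel in as build args
    cd "$build_dir"
    docker build -f Dockerfile.cpu \
        --build-arg framework_support_installable="$tar_name" \
        --build-arg py_version=3 \
        --build-arg framework_installable="$cpu_fw_binary" \
        -t sagemaker-tensorflow-scriptmode:1.13.1-cpu-py3 .

The GPU flow is identical apart from Dockerfile.gpu, GPU_PY_VERSION, and the GPU wheel URL, as the restored buildspec below shows.
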
diff --git a/buildspec-release.yml b/buildspec-release.yml index 2e5a9a86..a4ff55a5 100644 --- a/buildspec-release.yml +++ b/buildspec-release.yml @@ -2,7 +2,7 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '1.15.0' + FRAMEWORK_VERSION: '1.13.1' GPU_INSTANCE_TYPE: 'ml.p2.xlarge' SETUP_FILE: 'setup_cmds.sh' SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]' @@ -60,21 +60,21 @@ phases: echo '[{ "repository": "sagemaker-tensorflow-scriptmode", "tags": [{ - "source": "1.15.0-cpu-py2", - "dest": ["1.15.0-cpu-py2", "1.15-cpu-py2", "1.15.0-cpu-py2-'${CODEBUILD_BUILD_ID#*:}'"] + "source": "1.13.1-cpu-py2", + "dest": ["1.13.1-cpu-py2", "1.13-cpu-py2", "1.13.1-cpu-py2-'${CODEBUILD_BUILD_ID#*:}'"] },{ - "source": "1.15.0-cpu-py3", - "dest": ["1.15.0-cpu-py3", "1.15-cpu-py3", "1.15.0-cpu-py3-'${CODEBUILD_BUILD_ID#*:}'"] + "source": "1.13.1-cpu-py3", + "dest": ["1.13.1-cpu-py3", "1.13-cpu-py3", "1.13.1-cpu-py3-'${CODEBUILD_BUILD_ID#*:}'"] },{ - "source": "1.15.0-gpu-py2", - "dest": ["1.15.0-gpu-py2", "1.15-gpu-py2", "1.15.0-gpu-py2-'${CODEBUILD_BUILD_ID#*:}'"] + "source": "1.13.1-gpu-py2", + "dest": ["1.13.1-gpu-py2", "1.13-gpu-py2", "1.13.1-gpu-py2-'${CODEBUILD_BUILD_ID#*:}'"] },{ - "source": "1.15.0-gpu-py3", - "dest": ["1.15.0-gpu-py3", "1.15-gpu-py3", "1.15.0-gpu-py3-'${CODEBUILD_BUILD_ID#*:}'"] + "source": "1.13.1-gpu-py3", + "dest": ["1.13.1-gpu-py3", "1.13-gpu-py3", "1.13.1-gpu-py3-'${CODEBUILD_BUILD_ID#*:}'"] }], "test": [ - "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --instance-type {cpu-instance-type} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.15.0 --processor cpu --py-version 2,3", - "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.15.0 --processor gpu --py-version 2,3" + "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --instance-type {cpu-instance-type} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.13.1 --processor cpu --py-version 2,3", + "IGNORE_COVERAGE=- tox -e py36 -- -m deploy_test test/integration/sagemaker -n 4 --region {region} --account-id {aws-id} --docker-base-name sagemaker-tensorflow-scriptmode --framework-version 1.13.1 --processor gpu --py-version 2,3" ] }]' > deployments.json diff --git a/buildspec.yml b/buildspec.yml index eece6ae1..214cdcca 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -2,7 +2,11 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '1.15.0' + FRAMEWORK_VERSION: '1.13.1' + CPU_FRAMEWORK_BINARY: 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl' + CPU_PY_VERSION: '3' + GPU_FRAMEWORK_BINARY: 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl' + GPU_PY_VERSION: '3' ECR_REPO: 'sagemaker-test' GITHUB_REPO: 'sagemaker-tensorflow-container' SETUP_FILE: 'setup_cmds.sh' @@ -30,56 +34,42 @@ phases: - tox -e py36,py27 test/unit # Create pip archive - - root_dir=$(pwd) + - build_dir="docker/$FRAMEWORK_VERSION" - build_id="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')" - python3 setup.py sdist - tar_name=$(ls dist) - - # Find build artifacts - - 
build_artifacts=$root_dir/docker/artifacts - - # build py2 images - - # prepare build context - - build_dir="$root_dir/docker/$FRAMEWORK_VERSION/py2" - - cp $root_dir/dist/$tar_name $build_dir - - cp $build_artifacts/* $build_dir/ - - cd $build_dir + - cp dist/$tar_name $build_dir # build cpu image - cpu_dockerfile="Dockerfile.cpu" - - CPU_TAG_PY2="$FRAMEWORK_VERSION-cpu-py2-$build_id" - - docker build -f $cpu_dockerfile -t $PREPROD_IMAGE:$CPU_TAG_PY2 . - # build gpu image - - gpu_dockerfile="Dockerfile.gpu" - - GPU_TAG_PY2="$FRAMEWORK_VERSION-gpu-py2-$build_id" - - docker build -f $gpu_dockerfile -t $PREPROD_IMAGE:$GPU_TAG_PY2 . + # Download framework binary + - cpu_fw_binary=$(basename $CPU_FRAMEWORK_BINARY) + - wget -O $build_dir/$cpu_fw_binary $CPU_FRAMEWORK_BINARY - # build py3 images + - CPU_TAG="$FRAMEWORK_VERSION-cpu-py$CPU_PY_VERSION-$build_id" - # prepare build context - - build_dir="$root_dir/docker/$FRAMEWORK_VERSION/py3" - - cp $root_dir/dist/$tar_name $build_dir - - cp $build_artifacts/* $build_dir/ - cd $build_dir - - # build cpu image - - cpu_dockerfile="Dockerfile.cpu" - - CPU_TAG_PY3="$FRAMEWORK_VERSION-cpu-py3-$build_id" - - docker build -f $cpu_dockerfile -t $PREPROD_IMAGE:$CPU_TAG_PY3 . + - docker build -f $cpu_dockerfile --build-arg framework_support_installable=$tar_name --build-arg py_version=$CPU_PY_VERSION --build-arg framework_installable=$cpu_fw_binary -t $PREPROD_IMAGE:$CPU_TAG . + - cd ../../ # build gpu image - gpu_dockerfile="Dockerfile.gpu" - - GPU_TAG_PY3="$FRAMEWORK_VERSION-gpu-py3-$build_id" - - docker build -f $gpu_dockerfile -t $PREPROD_IMAGE:$GPU_TAG_PY3 . + + # Download framework binary + - gpu_fw_binary=$(basename $GPU_FRAMEWORK_BINARY) + - wget -O $build_dir/$gpu_fw_binary $GPU_FRAMEWORK_BINARY + + - GPU_TAG="$FRAMEWORK_VERSION-gpu-py$GPU_PY_VERSION-$build_id" + + - cd $build_dir + - docker build -f $gpu_dockerfile --build-arg framework_support_installable=$tar_name --build-arg py_version=$GPU_PY_VERSION --build-arg framework_installable=$gpu_fw_binary -t $PREPROD_IMAGE:$GPU_TAG . 
+ - cd ../../ # push images to ecr - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) - - docker push $PREPROD_IMAGE:$CPU_TAG_PY2 - - docker push $PREPROD_IMAGE:$GPU_TAG_PY2 - - docker push $PREPROD_IMAGE:$CPU_TAG_PY3 - - docker push $PREPROD_IMAGE:$GPU_TAG_PY3 + - docker push $PREPROD_IMAGE:$CPU_TAG + - docker push $PREPROD_IMAGE:$GPU_TAG # launch remote gpu instance - instance_type='p2.xlarge' @@ -87,30 +77,31 @@ phases: - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu # run cpu integration tests - - py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor cpu" - - py2_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor cpu" - - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" + - | + if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then + pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu + else + echo "skipping cpu integration tests" + fi # run gpu integration tests - - printf "$SETUP_CMDS" > $SETUP_FILE - - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor gpu" - - py3_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\"" - - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor gpu" - - py2_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\"" - - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" + - | + if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then + printf "$SETUP_CMDS" > $SETUP_FILE + cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu" + remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM" + else + echo "skipping gpu integration tests" + fi # run sagemaker tests - - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY2 --py-version 2 --processor cpu" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY2 --py-version 2 --processor gpu" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" 
"buildspec.yml" - - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY3 --py-version 3 --processor cpu" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY3 --py-version 3 --processor gpu" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" + - | + if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then + pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG --py-version $CPU_PY_VERSION --processor cpu + pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG --py-version $GPU_PY_VERSION --processor gpu + else + echo "skipping sagemaker tests" + fi finally: # shut down remote gpu instance @@ -118,7 +109,5 @@ phases: - cleanup-key-pairs # remove ecr image - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG_PY2 - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG_PY2 - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG_PY3 - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG_PY3 + - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG + - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG diff --git a/docker/1.15.0/py2/Dockerfile.cpu b/docker/1.15.0/py2/Dockerfile.cpu deleted file mode 100644 index 5a161f80..00000000 --- a/docker/1.15.0/py2/Dockerfile.cpu +++ /dev/null @@ -1,125 +0,0 @@ -FROM ubuntu:18.04 - -LABEL maintainer="Amazon AI" - -# Prevent docker build get stopped by requesting user interaction -ENV DEBIAN_FRONTEND=noninteractive -ENV DEBCONF_NONINTERACTIVE_SEEN=true -# Set environment variables for MKL -# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn -ENV KMP_AFFINITY=granularity=fine,compact,1,0 -ENV KMP_BLOCKTIME=1 -ENV KMP_SETTINGS=0 -# Python won’t try to write .pyc or .pyo files on the import of source modules -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -# See http://bugs.python.org/issue19846 -ENV PYTHONIOENCODING=UTF-8 -ENV LANG=C.UTF-8 -ENV LC_ALL=C.UTF-8 -# Specify the location of module that contains the training logic for SageMaker -# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html -ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main - -# Define framework-related package sources -ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz -ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - software-properties-common \ - build-essential \ - openssh-client \ - openssh-server \ - ca-certificates \ - curl \ - git \ - wget \ - vim \ - 
zlib1g-dev \ - && rm -rf /var/lib/apt/lists/* - -# Install Open MPI -RUN mkdir /tmp/openmpi \ - && cd /tmp/openmpi \ - && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ - && tar zxf openmpi-4.0.1.tar.gz \ - && cd openmpi-4.0.1 \ - && ./configure --enable-orterun-prefix-by-default \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && rm -rf /tmp/openmpi - -# Create a wrapper for OpenMPI to allow running as root by default -RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ - && echo '#!/bin/bash' > /usr/local/bin/mpirun \ - && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ - && chmod a+x /usr/local/bin/mpirun - -RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ - && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf - -ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH -ENV PATH=/usr/local/openmpi/bin/:$PATH - -# SSH login fix. Otherwise user is kicked off after login -RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd - -# Create SSH key. -RUN mkdir -p /root/.ssh/ \ - && mkdir -p /var/run/sshd \ - && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ - && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ - && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config - -WORKDIR / - -RUN apt-get update \ - && apt-get install -y \ - python \ - python-pip - -COPY $FRAMEWORK_SUPPORT_INSTALLABLE . - -RUN pip --no-cache-dir install --upgrade \ - pip \ - setuptools - -# Some TF tools expect a "python" binary -RUN ln -s $(which python) /usr/local/bin/python - -RUN pip install --no-cache-dir -U \ - numpy==1.16.5 \ - scipy==1.2.2 \ - scikit-learn==0.20.3 \ - pandas==0.24.2 \ - Pillow==6.2.1 \ - h5py==2.9.0 \ - keras_applications==1.0.8 \ - keras_preprocessing==1.1.0 \ - requests==2.22.0 \ - keras==2.3.1 \ - mpi4py==3.0.2 \ - "cryptography>=2.3" \ - "sagemaker-tensorflow>=1.15,<1.16" \ - # Let's install TensorFlow separately in the end to avoid the library version to be overwritten - && pip install --force-reinstall --no-cache-dir -U \ - ${TF_URL} \ - && pip install --no-cache-dir -U \ - $FRAMEWORK_SUPPORT_INSTALLABLE \ - awscli==1.17.7 \ - && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \ - && pip install --no-cache-dir -U \ - horovod==0.18.2 - -COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py -COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py - -RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ - && chmod +x /usr/local/bin/deep_learning_container.py - -RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt - -ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] -CMD ["bin/bash"] diff --git a/docker/1.15.0/py2/Dockerfile.gpu b/docker/1.15.0/py2/Dockerfile.gpu deleted file mode 100644 index 50b1484f..00000000 --- a/docker/1.15.0/py2/Dockerfile.gpu +++ /dev/null @@ -1,167 +0,0 @@ -# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0. 
-# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/ -FROM nvidia/cuda:10.0-base-ubuntu18.04 - -LABEL maintainer="Amazon AI" - -# Prevent docker build get stopped by requesting user interaction -ENV DEBIAN_FRONTEND=noninteractive -ENV DEBCONF_NONINTERACTIVE_SEEN=true -# Python won’t try to write .pyc or .pyo files on the import of source modules -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -# See http://bugs.python.org/issue19846 -ENV PYTHONIOENCODING=UTF-8 -ENV LANG=C.UTF-8 -ENV LC_ALL=C.UTF-8 -# Specify the location of module that contains the training logic for SageMaker -# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html -ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main - -# Define framework-related package sources -ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz -ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl - -RUN apt-get update \ - && apt-get install -y --no-install-recommends --allow-unauthenticated \ - ca-certificates \ - cuda-command-line-tools-10-0 \ - cuda-cublas-dev-10-0 \ - cuda-cudart-dev-10-0 \ - cuda-cufft-dev-10-0 \ - cuda-curand-dev-10-0 \ - cuda-cusolver-dev-10-0 \ - cuda-cusparse-dev-10-0 \ - curl \ - libcudnn7=7.5.1.10-1+cuda10.0 \ - # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it - libnccl2=2.4.7-1+cuda10.0 \ - libgomp1 \ - libnccl-dev=2.4.7-1+cuda10.0 \ - libfreetype6-dev \ - libhdf5-serial-dev \ - libpng-dev \ - libzmq3-dev \ - git \ - wget \ - vim \ - build-essential \ - openssh-client \ - openssh-server \ - zlib1g-dev \ - # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 - # adds a new list which contains libnvinfer library, so it needs another - # 'apt-get update' to retrieve that list before it can actually install the library. - # We don't install libnvinfer-dev since we don't need to build against TensorRT, - # and libnvinfer4 doesn't contain libnvinfer.a static library. 
- && apt-get update \ - && apt-get install -y --no-install-recommends --allow-unauthenticated \ - nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ - && apt-get update \ - && apt-get install -y --no-install-recommends --allow-unauthenticated \ - libnvinfer5=5.0.2-1+cuda10.0 \ - && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ - && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ - && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ - && rm -rf /var/lib/apt/lists/* \ - && mkdir -p /var/run/sshd - -# Install Open MPI -RUN mkdir /tmp/openmpi \ - && cd /tmp/openmpi \ - && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ - && tar zxf openmpi-4.0.1.tar.gz \ - && cd openmpi-4.0.1 \ - && ./configure --enable-orterun-prefix-by-default \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && rm -rf /tmp/openmpi - -RUN apt-get update \ - && apt-get install -y \ - python \ - python-pip - -# Create a wrapper for OpenMPI to allow running as root by default -RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ - && echo '#!/bin/bash' > /usr/local/bin/mpirun \ - && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ - && chmod a+x /usr/local/bin/mpirun - -# Configure OpenMPI to run good defaults: -# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 -RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ - && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf - -# Set default NCCL parameters -RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf - -ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH -ENV PATH /usr/local/openmpi/bin/:$PATH -ENV PATH=/usr/local/nvidia/bin:$PATH - -# SSH login fix. Otherwise user is kicked off after login -RUN mkdir -p /var/run/sshd \ - && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd - -# Create SSH key. -RUN mkdir -p /root/.ssh/ \ - && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ - && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ - && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config - -WORKDIR / - -RUN pip --no-cache-dir install --upgrade \ - pip \ - setuptools - -# Some TF tools expect a "python" binary -RUN ln -s $(which python) /usr/local/bin/python - -COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
- -RUN pip install --no-cache-dir -U \ - numpy==1.16.5 \ - scipy==1.2.2 \ - scikit-learn==0.20.3 \ - pandas==0.24.2 \ - Pillow==6.2.1 \ - h5py==2.9.0 \ - keras_applications==1.0.8 \ - keras_preprocessing==1.1.0 \ - requests==2.22.0 \ - keras==2.3.1 \ - mpi4py==3.0.2 \ - "cryptography>=2.3" \ - "sagemaker-tensorflow>=1.15,<1.16" \ - # Let's install TensorFlow separately in the end to avoid the library version to be overwritten - && pip install --force-reinstall --no-cache-dir -U \ - ${TF_URL} \ - && pip install --no-cache-dir -U \ - $FRAMEWORK_SUPPORT_INSTALLABLE \ - awscli==1.17.7 \ - && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE - -# Install Horovod, temporarily using CUDA stubs -RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ - && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ - horovod==0.18.2 \ - && ldconfig - -# Allow OpenSSH to talk to containers without asking for confirmation -RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ - && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ - && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config - -COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py -COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py - -RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ - && chmod +x /usr/local/bin/deep_learning_container.py - -RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt - -ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] -CMD ["bin/bash"] diff --git a/docker/1.15.0/py3/Dockerfile.cpu b/docker/1.15.0/py3/Dockerfile.cpu deleted file mode 100644 index d0fe3027..00000000 --- a/docker/1.15.0/py3/Dockerfile.cpu +++ /dev/null @@ -1,127 +0,0 @@ -FROM ubuntu:18.04 - -LABEL maintainer="Amazon AI" - -# Prevent docker build get stopped by requesting user interaction -ENV DEBIAN_FRONTEND=noninteractive -ENV DEBCONF_NONINTERACTIVE_SEEN=true -# Set environment variables for MKL -# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn -ENV KMP_AFFINITY=granularity=fine,compact,1,0 -ENV KMP_BLOCKTIME=1 -ENV KMP_SETTINGS=0 -# Python won’t try to write .pyc or .pyo files on the import of source modules -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -# See http://bugs.python.org/issue19846 -ENV PYTHONIOENCODING=UTF-8 -ENV LANG=C.UTF-8 -ENV LC_ALL=C.UTF-8 -# Specify the location of module that contains the training logic for SageMaker -# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html -ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main - -# Define framework-related package sources -ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz -ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - python3-dev \ - python3-pip \ - python3-setuptools \ - software-properties-common \ - build-essential \ - openssh-client \ - openssh-server \ - ca-certificates \ - curl \ - git \ - wget \ - vim \ - zlib1g-dev \ - && rm -rf /var/lib/apt/lists/* - -# Install Open MPI -RUN mkdir /tmp/openmpi \ - && cd /tmp/openmpi \ - && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ - && tar zxf openmpi-4.0.1.tar.gz \ - && cd openmpi-4.0.1 \ - && ./configure 
--enable-orterun-prefix-by-default \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && rm -rf /tmp/openmpi - -# Create a wrapper for OpenMPI to allow running as root by default -RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ - && echo '#!/bin/bash' > /usr/local/bin/mpirun \ - && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ - && chmod a+x /usr/local/bin/mpirun - -RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ - && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf - -ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH -ENV PATH=/usr/local/openmpi/bin/:$PATH - -# SSH login fix. Otherwise user is kicked off after login -RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd - -# Create SSH key. -RUN mkdir -p /root/.ssh/ \ - && mkdir -p /var/run/sshd \ - && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ - && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ - && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config - -WORKDIR / - -COPY $FRAMEWORK_SUPPORT_INSTALLABLE . - -RUN pip3 --no-cache-dir install --upgrade \ - pip \ - setuptools - -# Some TF tools expect a "python" binary -RUN ln -s $(which python3) /usr/local/bin/python \ - && ln -s $(which pip3) /usr/bin/pip - -RUN pip install --no-cache-dir -U \ - numpy==1.17.4 \ - scipy==1.2.2 \ - scikit-learn==0.20.3 \ - pandas==0.24.2 \ - Pillow==6.2.1 \ - h5py==2.9.0 \ - keras_applications==1.0.8 \ - keras_preprocessing==1.1.0 \ - keras==2.3.1 \ - requests==2.22.0 \ - smdebug==0.5.0.post0 \ - sagemaker-experiments==0.1.3 \ - mpi4py==3.0.2 \ - "cryptography>=2.3" \ - "sagemaker-tensorflow>=1.15,<1.16" \ - # Let's install TensorFlow separately in the end to avoid - # the library version to be overwritten - && pip install --force-reinstall --no-cache-dir -U \ - ${TF_URL} \ - && pip install --force-reinstall --no-cache-dir -U \ - horovod==0.18.2 \ - && pip install --no-cache-dir -U \ - $FRAMEWORK_SUPPORT_INSTALLABLE \ - awscli==1.17.7 \ - && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE - -COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py -COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py - -RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ - && chmod +x /usr/local/bin/deep_learning_container.py - -RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt - -ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] -CMD ["bin/bash"] diff --git a/docker/1.15.0/py3/Dockerfile.gpu b/docker/1.15.0/py3/Dockerfile.gpu deleted file mode 100644 index 68c68383..00000000 --- a/docker/1.15.0/py3/Dockerfile.gpu +++ /dev/null @@ -1,173 +0,0 @@ -# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0. 
-# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/ -FROM nvidia/cuda:10.0-base-ubuntu18.04 - -LABEL maintainer="Amazon AI" - -# Prevent docker build get stopped by requesting user interaction -ENV DEBIAN_FRONTEND=noninteractive -ENV DEBCONF_NONINTERACTIVE_SEEN=true -# Python won’t try to write .pyc or .pyo files on the import of source modules -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -# See http://bugs.python.org/issue19846 -ENV PYTHONIOENCODING=UTF-8 -ENV LANG=C.UTF-8 -ENV LC_ALL=C.UTF-8 -# Specify the location of module that contains the training logic for SageMaker -# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html -ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main - -# Define framework-related package sources -ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz -ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl - -RUN apt-get update \ - && apt-get install -y --no-install-recommends --allow-unauthenticated \ - python3-dev \ - python3-pip \ - python3-setuptools \ - python3-dev \ - ca-certificates \ - cuda-command-line-tools-10-0 \ - cuda-cublas-dev-10-0 \ - cuda-cudart-dev-10-0 \ - cuda-cufft-dev-10-0 \ - cuda-curand-dev-10-0 \ - cuda-cusolver-dev-10-0 \ - cuda-cusparse-dev-10-0 \ - curl \ - libcudnn7=7.5.1.10-1+cuda10.0 \ - # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it - libnccl2=2.4.7-1+cuda10.0 \ - libgomp1 \ - libnccl-dev=2.4.7-1+cuda10.0 \ - libfreetype6-dev \ - libhdf5-serial-dev \ - libpng-dev \ - libzmq3-dev \ - git \ - wget \ - vim \ - build-essential \ - openssh-client \ - openssh-server \ - zlib1g-dev \ - # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 - # adds a new list which contains libnvinfer library, so it needs another - # 'apt-get update' to retrieve that list before it can actually install the - # library. - # We don't install libnvinfer-dev since we don't need to build against TensorRT, - # and libnvinfer4 doesn't contain libnvinfer.a static library. 
- && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ - nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ - && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ - libnvinfer5=5.0.2-1+cuda10.0 \ - && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ - && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ - && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ - && rm -rf /var/lib/apt/lists/* \ - && mkdir -p /var/run/sshd - -########################################################################### -# Horovod & its dependencies -########################################################################### - -# Install Open MPI -RUN mkdir /tmp/openmpi \ - && cd /tmp/openmpi \ - && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ - && tar zxf openmpi-4.0.1.tar.gz \ - && cd openmpi-4.0.1 \ - && ./configure --enable-orterun-prefix-by-default \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && rm -rf /tmp/openmpi - -# Create a wrapper for OpenMPI to allow running as root by default -RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ - && echo '#!/bin/bash' > /usr/local/bin/mpirun \ - && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ - && chmod a+x /usr/local/bin/mpirun - -# Configure OpenMPI to run good defaults: -# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 -RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ - && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf - -# Set default NCCL parameters -RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf - -ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH -ENV PATH=/usr/local/openmpi/bin/:$PATH -ENV PATH=/usr/local/nvidia/bin:$PATH - -# SSH login fix. Otherwise user is kicked off after login -RUN mkdir -p /var/run/sshd \ - && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd - -# Create SSH key. -RUN mkdir -p /root/.ssh/ \ - && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ - && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ - && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config - -WORKDIR / - -RUN pip3 --no-cache-dir install --upgrade \ - pip \ - setuptools - -# Some TF tools expect a "python" binary -RUN ln -s $(which python3) /usr/local/bin/python \ - && ln -s $(which pip3) /usr/bin/pip - -COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
- -RUN pip install --no-cache-dir -U \ - numpy==1.17.4 \ - scipy==1.2.2 \ - scikit-learn==0.20.3 \ - pandas==0.24.2 \ - Pillow==6.2.1 \ - h5py==2.9.0 \ - keras_applications==1.0.8 \ - keras_preprocessing==1.1.0 \ - requests==2.22.0 \ - keras==2.3.1 \ - smdebug==0.5.0.post0 \ - sagemaker-experiments==0.1.3 \ - mpi4py==3.0.2 \ - "cryptography>=2.3" \ - "sagemaker-tensorflow>=1.15,<1.16" \ - # Let's install TensorFlow separately in the end to avoid - # the library version to be overwritten - && pip install --force-reinstall --no-cache-dir -U \ - ${TF_URL} \ - && pip install --no-cache-dir -U \ - $FRAMEWORK_SUPPORT_INSTALLABLE \ - awscli==1.17.7 \ - && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE - -# Install Horovod, temporarily using CUDA stubs -RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \ - && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ - horovod==0.18.2 \ - && ldconfig - -# Allow OpenSSH to talk to containers without asking for confirmation -RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ - && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ - && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config - -COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py -COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py - -RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ - && chmod +x /usr/local/bin/deep_learning_container.py - -RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt - -ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] -CMD ["bin/bash"] diff --git a/docker/1.15.0/py3/dockerd-entrypoint.py b/docker/1.15.0/py3/dockerd-entrypoint.py deleted file mode 100644 index cd222026..00000000 --- a/docker/1.15.0/py3/dockerd-entrypoint.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import os.path -import shlex -import subprocess -import sys - -if not os.path.exists("/opt/ml/input/config"): - subprocess.call(['python', '/usr/local/bin/deep_learning_container.py', '&>/dev/null', '&']) - -subprocess.check_call(shlex.split(' '.join(sys.argv[1:]))) diff --git a/docker/__init__.py b/docker/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docker/build_artifacts/__init__.py b/docker/build_artifacts/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docker/build_artifacts/deep_learning_container.py b/docker/build_artifacts/deep_learning_container.py deleted file mode 100644 index 7e3967c7..00000000 --- a/docker/build_artifacts/deep_learning_container.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. 
A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import json -import logging -import re - -import requests - - -def _validate_instance_id(instance_id): - """ - Validate instance ID - """ - instance_id_regex = r'^(i-\S{17})' - compiled_regex = re.compile(instance_id_regex) - match = compiled_regex.match(instance_id) - - if not match: - return None - - return match.group(1) - - -def _retrieve_instance_id(): - """ - Retrieve instance ID from instance metadata service - """ - instance_id = None - url = "http://169.254.169.254/latest/meta-data/instance-id" - response = requests_helper(url, timeout=0.1) - - if response is not None: - instance_id = _validate_instance_id(response.text) - - return instance_id - - -def _retrieve_instance_region(): - """ - Retrieve instance region from instance metadata service - """ - region = None - valid_regions = ['ap-northeast-1', 'ap-northeast-2', 'ap-southeast-1', 'ap-southeast-2', - 'ap-south-1', 'ca-central-1', 'eu-central-1', 'eu-north-1', - 'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', - 'us-east-1', 'us-east-2', 'us-west-1', 'us-west-2'] - - url = "http://169.254.169.254/latest/dynamic/instance-identity/document" - response = requests_helper(url, timeout=0.1) - - if response is not None: - response_json = json.loads(response.text) - - if response_json['region'] in valid_regions: - region = response_json['region'] - - return region - - -def query_bucket(): - """ - GET request on an empty object from an Amazon S3 bucket - """ - response = None - instance_id = _retrieve_instance_id() - region = _retrieve_instance_region() - - if instance_id is not None and region is not None: - url = ("https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com" - "/dlc-containers.txt?x-instance-id={1}".format(region, instance_id)) - response = requests_helper(url, timeout=0.2) - - logging.debug("Query bucket finished: {}".format(response)) - - return response - - -def requests_helper(url, timeout): - response = None - try: - response = requests.get(url, timeout=timeout) - except requests.exceptions.RequestException as e: - logging.error("Request exception: {}".format(e)) - - return response - - -def main(): - """ - Invoke bucket query - """ - # Logs are not necessary for normal run. Remove this line while debugging. - logging.getLogger().disabled = True - - logging.basicConfig(level=logging.ERROR) - query_bucket() - - -if __name__ == '__main__': - main() diff --git a/docker/build_artifacts/dockerd-entrypoint.py b/docker/build_artifacts/dockerd-entrypoint.py deleted file mode 100644 index cd222026..00000000 --- a/docker/build_artifacts/dockerd-entrypoint.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. 
See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import os.path -import shlex -import subprocess -import sys - -if not os.path.exists("/opt/ml/input/config"): - subprocess.call(['python', '/usr/local/bin/deep_learning_container.py', '&>/dev/null', '&']) - -subprocess.check_call(shlex.split(' '.join(sys.argv[1:]))) diff --git a/scripts/build_all.py b/scripts/build_all.py index 9f340d5d..de7913d3 100644 --- a/scripts/build_all.py +++ b/scripts/build_all.py @@ -1,4 +1,4 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/scripts/publish_all.py b/scripts/publish_all.py index 2c78e8a7..092ae113 100644 --- a/scripts/publish_all.py +++ b/scripts/publish_all.py @@ -1,4 +1,4 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/setup.py b/setup.py index 0af7e92c..fa86dbd6 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of @@ -53,11 +53,13 @@ def read_version(): 'Programming Language :: Python :: 3.6', ], + install_requires=['sagemaker-containers>=2.6.1', 'numpy', 'scipy', 'sklearn', + 'pandas', 'Pillow', 'h5py'], extras_require={ 'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock', - 'sagemaker==1.50.1', 'tensorflow<2.0', 'docker-compose', 'boto3==1.10.50', - 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore==1.13.50', - 'requests-mock', 'awscli==1.16.314'], + 'sagemaker==1.48.0', 'tensorflow', 'docker-compose', 'boto3>=1.10.41', + 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore>=1.13.41', + 'awscli>=1.16.305'], 'benchmark': ['click'] }, ) diff --git a/src/sagemaker_tensorflow_container/s3_utils.py b/src/sagemaker_tensorflow_container/s3_utils.py index 0137ef25..22e2ef74 100644 --- a/src/sagemaker_tensorflow_container/s3_utils.py +++ b/src/sagemaker_tensorflow_container/s3_utils.py @@ -1,4 +1,4 @@ -# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/src/sagemaker_tensorflow_container/training.py b/src/sagemaker_tensorflow_container/training.py index ba4fbe6a..29fbb47b 100644 --- a/src/sagemaker_tensorflow_container/training.py +++ b/src/sagemaker_tensorflow_container/training.py @@ -1,4 +1,4 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'). You # may not use this file except in compliance with the License. 
A copy of diff --git a/test/integration/local/test_horovod.py b/test/integration/local/test_horovod.py index f35ba03a..2d4e9ce3 100644 --- a/test/integration/local/test_horovod.py +++ b/test/integration/local/test_horovod.py @@ -1,4 +1,4 @@ -# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/local/test_keras.py b/test/integration/local/test_keras.py index 1eca0c2a..2e473bf9 100644 --- a/test/integration/local/test_keras.py +++ b/test/integration/local/test_keras.py @@ -1,4 +1,4 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/local/test_training.py b/test/integration/local/test_training.py index bd1641b0..6a2bab25 100644 --- a/test/integration/local/test_training.py +++ b/test/integration/local/test_training.py @@ -1,4 +1,4 @@ -# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/sagemaker/test_horovod.py b/test/integration/sagemaker/test_horovod.py index 1d2bd8ac..08e41704 100644 --- a/test/integration/sagemaker/test_horovod.py +++ b/test/integration/sagemaker/test_horovod.py @@ -1,4 +1,4 @@ -# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py index ce685abc..a4381d40 100644 --- a/test/integration/sagemaker/test_mnist.py +++ b/test/integration/sagemaker/test_mnist.py @@ -1,4 +1,4 @@ -# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/sagemaker/test_tuning_model_dir.py b/test/integration/sagemaker/test_tuning_model_dir.py index e833c3a4..604d4c93 100644 --- a/test/integration/sagemaker/test_tuning_model_dir.py +++ b/test/integration/sagemaker/test_tuning_model_dir.py @@ -1,4 +1,4 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/integration/sagemaker/timeout.py b/test/integration/sagemaker/timeout.py index d4738d32..4360987a 100644 --- a/test/integration/sagemaker/timeout.py +++ b/test/integration/sagemaker/timeout.py @@ -1,4 +1,4 @@ -# Copyright 2019-2020 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. diff --git a/test/integration/utils.py b/test/integration/utils.py index 4944eb20..83271f67 100644 --- a/test/integration/utils.py +++ b/test/integration/utils.py @@ -1,4 +1,4 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/resources/mnist/horovod_mnist.py b/test/resources/mnist/horovod_mnist.py index c5c0b242..7d9e7940 100644 --- a/test/resources/mnist/horovod_mnist.py +++ b/test/resources/mnist/horovod_mnist.py @@ -1,4 +1,4 @@ -# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/resources/mnist/mnist.py b/test/resources/mnist/mnist.py index e4349ce2..47d2bcd0 100644 --- a/test/resources/mnist/mnist.py +++ b/test/resources/mnist/mnist.py @@ -1,11 +1,8 @@ +import tensorflow as tf import argparse -import json import os -import sys - import numpy as np -import tensorflow as tf - +import json def _parse_args(): @@ -35,18 +32,6 @@ def _load_testing_data(base_dir): return x_test, y_test -def assert_can_track_sagemaker_experiments(): - in_sagemaker_training = 'TRAINING_JOB_ARN' in os.environ - in_python_three = sys.version_info[0] == 3 - - if in_sagemaker_training and in_python_three: - import smexperiments.tracker - - with smexperiments.tracker.Tracker.load() as tracker: - tracker.log_parameter('param', 1) - tracker.log_metric('metric', 1.0) - - args, unknown = _parse_args() model = tf.keras.models.Sequential([ @@ -63,7 +48,5 @@ def assert_can_track_sagemaker_experiments(): x_test, y_test = _load_testing_data(args.train) model.fit(x_train, y_train, epochs=args.epochs) model.evaluate(x_test, y_test) - if args.current_host == args.hosts[0]: model.save(os.path.join('/opt/ml/model', 'my_model.h5')) - assert_can_track_sagemaker_experiments() diff --git a/test/resources/test_py_version/entry.py b/test/resources/test_py_version/entry.py index 8f71a01b..e844e07c 100644 --- a/test/resources/test_py_version/entry.py +++ b/test/resources/test_py_version/entry.py @@ -1,4 +1,4 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/resources/tuning_model_dir/entry.py b/test/resources/tuning_model_dir/entry.py index 0bce7165..2fae72fc 100644 --- a/test/resources/tuning_model_dir/entry.py +++ b/test/resources/tuning_model_dir/entry.py @@ -1,4 +1,4 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. 
A copy of diff --git a/test/unit/test_deep_learning_container.py b/test/unit/test_deep_learning_container.py deleted file mode 100644 index 7d5d7d86..00000000 --- a/test/unit/test_deep_learning_container.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the 'License'). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the 'license' file accompanying this file. This file is -# distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import unittest - -from docker.build_artifacts import deep_learning_container as deep_learning_container_to_test -import pytest -import requests - - -@pytest.fixture(name='fixture_valid_instance_id') -def fixture_valid_instance_id(requests_mock): - return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', - text='i-123t32e11s32t1231') - - -@pytest.fixture(name='fixture_invalid_instance_id') -def fixture_invalid_instance_id(requests_mock): - return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', text='i-123') - - -@pytest.fixture(name='fixture_none_instance_id') -def fixture_none_instance_id(requests_mock): - return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', text=None) - - -@pytest.fixture(name='fixture_invalid_region') -def fixture_invalid_region(requests_mock): - return requests_mock.get('http://169.254.169.254/latest/dynamic/instance-identity/document', - json={'region': 'test'}) - - -@pytest.fixture(name='fixture_valid_region') -def fixture_valid_region(requests_mock): - return requests_mock.get('http://169.254.169.254/latest/dynamic/instance-identity/document', - json={'region': 'us-east-1'}) - - -def test_retrieve_instance_id(fixture_valid_instance_id): - result = deep_learning_container_to_test._retrieve_instance_id() - assert 'i-123t32e11s32t1231' == result - - -def test_retrieve_none_instance_id(fixture_none_instance_id): - result = deep_learning_container_to_test._retrieve_instance_id() - assert result is None - - -def test_retrieve_invalid_instance_id(fixture_invalid_instance_id): - result = deep_learning_container_to_test._retrieve_instance_id() - assert result is None - - -def test_retrieve_invalid_region(fixture_invalid_region): - result = deep_learning_container_to_test._retrieve_instance_region() - assert result is None - - -def test_retrieve_valid_region(fixture_valid_region): - result = deep_learning_container_to_test._retrieve_instance_region() - assert 'us-east-1' == result - - -def test_query_bucket(requests_mock, fixture_valid_region, fixture_valid_instance_id): - fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' - fixture_valid_region.return_value = 'us-east-1' - requests_mock.get(('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' - '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231'), - text='Access Denied') - actual_response = deep_learning_container_to_test.query_bucket() - assert 'Access Denied' == actual_response.text - - -def test_query_bucket_region_none(fixture_invalid_region, fixture_valid_instance_id): - fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' - 
fixture_invalid_region.return_value = None - actual_response = deep_learning_container_to_test.query_bucket() - assert actual_response is None - - -def test_query_bucket_instance_id_none(requests_mock, fixture_valid_region, fixture_none_instance_id): - fixture_none_instance_id.return_value = None - fixture_valid_region.return_value = 'us-east-1' - actual_response = deep_learning_container_to_test.query_bucket() - assert actual_response is None - - -def test_query_bucket_instance_id_invalid(requests_mock, fixture_valid_region, fixture_invalid_instance_id): - fixture_invalid_instance_id.return_value = None - fixture_valid_region.return_value = 'us-east-1' - actual_response = deep_learning_container_to_test.query_bucket() - assert actual_response is None - - -def test_HTTP_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id): - fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' - fixture_valid_region.return_value = 'us-east-1' - query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' - '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231') - - requests_mock.get( - query_s3_url, - exc=requests.exceptions.HTTPError) - requests_mock.side_effect = requests.exceptions.HTTPError - - with pytest.raises(requests.exceptions.HTTPError): - actual_response = requests.get(query_s3_url) - assert actual_response is None - - -def test_connection_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id): - fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' - fixture_valid_region.return_value = 'us-east-1' - query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' - '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231') - - requests_mock.get( - query_s3_url, - exc=requests.exceptions.ConnectionError) - - with pytest.raises(requests.exceptions.ConnectionError): - actual_response = requests.get( - query_s3_url) - - assert actual_response is None - - -def test_timeout_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id): - fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' - fixture_valid_region.return_value = 'us-east-1' - query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' - '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231') - - requests_mock.get( - query_s3_url, - exc=requests.Timeout) - - with pytest.raises(requests.exceptions.Timeout): - actual_response = requests.get( - query_s3_url) - - assert actual_response is None - - -if __name__ == '__main__': - unittest.main() diff --git a/test/unit/test_s3_utils.py b/test/unit/test_s3_utils.py index 03de70a3..fa2cef6b 100644 --- a/test/unit/test_s3_utils.py +++ b/test/unit/test_s3_utils.py @@ -1,4 +1,4 @@ -# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of diff --git a/test/unit/test_training.py b/test/unit/test_training.py index 2962f5d3..6bb41b7d 100644 --- a/test/unit/test_training.py +++ b/test/unit/test_training.py @@ -1,4 +1,4 @@ -# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). 
You # may not use this file except in compliance with the License. A copy of