diff --git a/buildspec-container-pr.yml b/buildspec-container-pr.yml
new file mode 100644
index 00000000..c43cb34f
--- /dev/null
+++ b/buildspec-container-pr.yml
@@ -0,0 +1,13 @@
+version: 0.2
+
+phases:
+  pre_build:
+    commands:
+      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
+      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'
+
+  build:
+    commands:
+
+      - error_cmd="echo 'In order to make changes to the docker files, please, use https://github.com/aws/deep-learning-containers repository.' && exit 1"
+      - execute-command-if-has-matching-changes "$error_cmd" "docker/"
diff --git a/buildspec-toolkit.yml b/buildspec-toolkit.yml
deleted file mode 100644
index a7046e0b..00000000
--- a/buildspec-toolkit.yml
+++ /dev/null
@@ -1,94 +0,0 @@
-version: 0.2
-
-env:
-  variables:
-    FRAMEWORK_VERSION: '1.4.0'
-    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
-    GPU_INSTANCE_TYPE: 'ml.p2.8xlarge'
-    ECR_REPO: 'sagemaker-test'
-    GITHUB_REPO: 'sagemaker-pytorch-container'
-    DLC_ACCOUNT: '763104351884'
-    SETUP_FILE: 'setup_cmds.sh'
-    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]'
-
-phases:
-  pre_build:
-    commands:
-      - start-dockerd
-      - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
-      - PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
-      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
-      - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
-      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'
-
-  build:
-    commands:
-      - TOX_PARALLEL_NO_SPINNER=1
-      - PY_COLORS=0
-
-      # install
-      - pip3 install -U -e .[test]
-
-      # run linters
-      - tox -e flake8,twine
-
-      # run unit tests
-      - tox -e py27,py36,py37 test/unit
-
-      # define tags
-      - GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID"
-      - DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
-      - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"
-
-      # run local CPU integration tests (build and push the image to ECR repo)
-      - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
-      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
-      - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
-      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
-
-      # launch remote GPU instance
-      - prefix='ml.'
-      - instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
-      - create-key-pair
-      - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest
-
-      # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
-      - python3 setup.py sdist
-      - build_dir="test/container/$FRAMEWORK_VERSION"
-      - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
-      - docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
-      # push DLC GPU image to ECR
-      - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
-      - docker push $PREPROD_IMAGE:$DLC_GPU_TAG
-
-      # run GPU local integration tests
-      - printf "$SETUP_CMDS" > $SETUP_FILE
-      # no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
-      - generic_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
-      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
-      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
-      - dlc_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
-      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
-      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
-
-      # run CPU sagemaker integration tests
-      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
-      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
-      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
-      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
-
-      # run GPU sagemaker integration tests
-      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
-      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
-      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
-      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
-
-    finally:
-      # shut down remote GPU instance
-      - cleanup-gpu-instances
-      - cleanup-key-pairs
-
-      # remove ECR image
-      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
-      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
-      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG
diff --git a/buildspec.yml b/buildspec.yml
index c43cb34f..b9a6d55e 100644
--- a/buildspec.yml
+++ b/buildspec.yml
@@ -1,13 +1,94 @@
 version: 0.2
 
+env:
+  variables:
+    FRAMEWORK_VERSION: '1.4.0'
+    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
+    GPU_INSTANCE_TYPE: 'ml.p2.8xlarge'
+    ECR_REPO: 'sagemaker-test'
+    GITHUB_REPO: 'sagemaker-pytorch-container'
+    DLC_ACCOUNT: '763104351884'
+    SETUP_FILE: 'setup_cmds.sh'
+    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]'
+
 phases:
   pre_build:
     commands:
+      - start-dockerd
+      - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
+      - PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
       - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
+      - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
       - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'
 
   build:
     commands:
+      - TOX_PARALLEL_NO_SPINNER=1
+      - PY_COLORS=0
+
+      # install
+      - pip3 install -U -e .[test]
+
+      # run linters
+      - tox -e flake8,twine
+
+      # run unit tests
+      - tox -e py27,py36,py37 test/unit
+
+      # define tags
+      - GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID"
+      - DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
+      - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"
+
+      # run local CPU integration tests (build and push the image to ECR repo)
+      - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
+      - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
+
+      # launch remote GPU instance
+      - prefix='ml.'
+      - instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
+      - create-key-pair
+      - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest
+
+      # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
+      - python3 setup.py sdist
+      - build_dir="test/container/$FRAMEWORK_VERSION"
+      - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
+      - docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
+      # push DLC GPU image to ECR
+      - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
+      - docker push $PREPROD_IMAGE:$DLC_GPU_TAG
+
+      # run GPU local integration tests
+      - printf "$SETUP_CMDS" > $SETUP_FILE
+      # no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
+      - generic_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
+      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
+      - dlc_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
+      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
+
+      # run CPU sagemaker integration tests
+      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
+      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
+
+      # run GPU sagemaker integration tests
+      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
+      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
+      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
+
+    finally:
+      # shut down remote GPU instance
+      - cleanup-gpu-instances
+      - cleanup-key-pairs
 
-      - error_cmd="echo 'In order to make changes to the docker files, please, use https://github.com/aws/deep-learning-containers repository.' && exit 1"
-      - execute-command-if-has-matching-changes "$error_cmd" "docker/"
+      # remove ECR image
+      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
+      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
+      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG
diff --git a/lib/changehostname.c b/lib/changehostname.c
new file mode 100644
index 00000000..1ae5ee1a
--- /dev/null
+++ b/lib/changehostname.c
@@ -0,0 +1,18 @@
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Modifies gethostname to return algo-1, algo-2, etc. when running on SageMaker.
+ *
+ * Without this gethostname() on SageMaker returns 'aws', leading NCCL/MPI to think there is only one host,
+ * not realizing that it needs to use NET/Socket.
+ *
+ * When docker container starts we read 'current_host' value from /opt/ml/input/config/resourceconfig.json
+ * and replace PLACEHOLDER_HOSTNAME with it before compiling this code into a shared library.
+ */
+int gethostname(char *name, size_t len)
+{
+  const char *val = PLACEHOLDER_HOSTNAME;
+  strncpy(name, val, len);
+  return 0;
+}
diff --git a/lib/start_with_right_hostname.sh b/lib/start_with_right_hostname.sh
new file mode 100644
index 00000000..652f3c47
--- /dev/null
+++ b/lib/start_with_right_hostname.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+if [[ "$1" = "train" ]]; then
+    CURRENT_HOST=$(jq .current_host /opt/ml/input/config/resourceconfig.json)
+    sed -ie "s/PLACEHOLDER_HOSTNAME/$CURRENT_HOST/g" changehostname.c
+    gcc -o changehostname.o -c -fPIC -Wall changehostname.c
+    gcc -o libchangehostname.so -shared -export-dynamic changehostname.o -ldl
+    LD_PRELOAD=/libchangehostname.so train
+else
+    eval "$@"
+fi
diff --git a/test/container/1.4.0/Dockerfile.dlc.cpu b/test/container/1.4.0/Dockerfile.dlc.cpu
index 113422bc..b86f14c6 100644
--- a/test/container/1.4.0/Dockerfile.dlc.cpu
+++ b/test/container/1.4.0/Dockerfile.dlc.cpu
@@ -1,8 +1,8 @@
 ARG region
 from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.4.0-cpu-py2
 
-COPY docker/build_artifacts/changehostname.c /
-COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
+COPY lib/changehostname.c /
+COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
 RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
 
 COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz
diff --git a/test/container/1.4.0/Dockerfile.dlc.gpu b/test/container/1.4.0/Dockerfile.dlc.gpu
index 15eddd76..de8c23fc 100644
--- a/test/container/1.4.0/Dockerfile.dlc.gpu
+++ b/test/container/1.4.0/Dockerfile.dlc.gpu
@@ -1,8 +1,8 @@
 ARG region
 from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.4.0-gpu-py3
 
-COPY docker/build_artifacts/changehostname.c /
-COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
+COPY lib/changehostname.c /
+COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
 RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
 
 COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz
diff --git a/test/container/1.4.0/Dockerfile.pytorch b/test/container/1.4.0/Dockerfile.pytorch
index b20fce7a..9849c68b 100644
--- a/test/container/1.4.0/Dockerfile.pytorch
+++ b/test/container/1.4.0/Dockerfile.pytorch
@@ -4,8 +4,8 @@ RUN apt-get update \
  && apt-get install -y --no-install-recommends jq \
  && rm -rf /var/lib/apt/lists/*
 
-COPY docker/build_artifacts/changehostname.c /
-COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
+COPY lib/changehostname.c /
+COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
 RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
 
 COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz
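
Note on the hostname override moved into lib/ above: start_with_right_hostname.sh relies on jq (without -r) printing current_host as a quoted JSON string, so the sed substitution turns PLACEHOLDER_HOSTNAME into a valid C string literal before the file is compiled and preloaded. A minimal sketch of the same steps outside SageMaker, assuming changehostname.c sits in the working directory and hard-coding "algo-1" in place of the value normally read from /opt/ml/input/config/resourceconfig.json:

    # Illustrative only; "algo-1" (quotes included) stands in for $(jq .current_host resourceconfig.json)
    sed -ie 's/PLACEHOLDER_HOSTNAME/"algo-1"/g' changehostname.c
    gcc -o changehostname.o -c -fPIC -Wall changehostname.c
    gcc -o libchangehostname.so -shared -export-dynamic changehostname.o -ldl
    LD_PRELOAD=$PWD/libchangehostname.so hostname   # should print algo-1

Because LD_PRELOAD loads libchangehostname.so ahead of libc, any dynamically linked program started this way (the hostname utility here, the train entrypoint in the container) resolves gethostname() to the override and sees algo-1 rather than the container's real hostname.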