Skip to content

fix: Rename buildspec files. #211

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions buildspec-container-pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
version: 0.2

phases:
pre_build:
commands:
- PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
- echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'

build:
commands:

- error_cmd="echo 'In order to make changes to the docker files, please, use https://github.com/aws/deep-learning-containers repository.' && exit 1"
- execute-command-if-has-matching-changes "$error_cmd" "docker/"
94 changes: 0 additions & 94 deletions buildspec-toolkit.yml

This file was deleted.

85 changes: 83 additions & 2 deletions buildspec.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,94 @@
version: 0.2

env:
variables:
FRAMEWORK_VERSION: '1.4.0'
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
GPU_INSTANCE_TYPE: 'ml.p2.8xlarge'
ECR_REPO: 'sagemaker-test'
GITHUB_REPO: 'sagemaker-pytorch-container'
DLC_ACCOUNT: '763104351884'
SETUP_FILE: 'setup_cmds.sh'
SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]'

phases:
pre_build:
commands:
- start-dockerd
- ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
- PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
- PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
- BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
- echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'

build:
commands:
- TOX_PARALLEL_NO_SPINNER=1
- PY_COLORS=0

# install
- pip3 install -U -e .[test]

# run linters
- tox -e flake8,twine

# run unit tests
- tox -e py27,py36,py37 test/unit

# define tags
- GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID"
- DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
- DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"

# run local CPU integration tests (build and push the image to ECR repo)
- test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
- test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"

# launch remote GPU instance
- prefix='ml.'
- instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
- create-key-pair
- launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest

# build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
- python3 setup.py sdist
- build_dir="test/container/$FRAMEWORK_VERSION"
- $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
- docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
# push DLC GPU image to ECR
- $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
- docker push $PREPROD_IMAGE:$DLC_GPU_TAG

# run GPU local integration tests
- printf "$SETUP_CMDS" > $SETUP_FILE
# no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
- generic_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
- dlc_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"

# run CPU sagemaker integration tests
- test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
- test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"

# run GPU sagemaker integration tests
- test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
- test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"

finally:
# shut down remote GPU instance
- cleanup-gpu-instances
- cleanup-key-pairs

- error_cmd="echo 'In order to make changes to the docker files, please, use https://github.com/aws/deep-learning-containers repository.' && exit 1"
- execute-command-if-has-matching-changes "$error_cmd" "docker/"
# remove ECR image
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG
18 changes: 18 additions & 0 deletions lib/changehostname.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#include <stdio.h>
#include <string.h>

/*
* Modifies gethostname to return algo-1, algo-2, etc. when running on SageMaker.
*
* Without this gethostname() on SageMaker returns 'aws', leading NCCL/MPI to think there is only one host,
* not realizing that it needs to use NET/Socket.
*
* When docker container starts we read 'current_host' value from /opt/ml/input/config/resourceconfig.json
* and replace PLACEHOLDER_HOSTNAME with it before compiling this code into a shared library.
*/
int gethostname(char *name, size_t len)
{
const char *val = PLACEHOLDER_HOSTNAME;
strncpy(name, val, len);
return 0;
}
11 changes: 11 additions & 0 deletions lib/start_with_right_hostname.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

if [[ "$1" = "train" ]]; then
CURRENT_HOST=$(jq .current_host /opt/ml/input/config/resourceconfig.json)
sed -ie "s/PLACEHOLDER_HOSTNAME/$CURRENT_HOST/g" changehostname.c
gcc -o changehostname.o -c -fPIC -Wall changehostname.c
gcc -o libchangehostname.so -shared -export-dynamic changehostname.o -ldl
LD_PRELOAD=/libchangehostname.so train
else
eval "$@"
fi
4 changes: 2 additions & 2 deletions test/container/1.4.0/Dockerfile.dlc.cpu
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
ARG region
from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.4.0-cpu-py2

COPY docker/build_artifacts/changehostname.c /
COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
COPY lib/changehostname.c /
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh

COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz
Expand Down
4 changes: 2 additions & 2 deletions test/container/1.4.0/Dockerfile.dlc.gpu
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
ARG region
from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.4.0-gpu-py3

COPY docker/build_artifacts/changehostname.c /
COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
COPY lib/changehostname.c /
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh

COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz
Expand Down
4 changes: 2 additions & 2 deletions test/container/1.4.0/Dockerfile.pytorch
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ RUN apt-get update \
&& apt-get install -y --no-install-recommends jq \
&& rm -rf /var/lib/apt/lists/*

COPY docker/build_artifacts/changehostname.c /
COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
COPY lib/changehostname.c /
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh

COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz
Expand Down