Skip to content

Commit 6afc3ef

Browse files
authored
infra: Rename buildspec files. (#211)
1 parent fc2d908 commit 6afc3ef

8 files changed

+131
-102
lines changed

buildspec-container-pr.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# CodeBuild spec for pull-request builds of this repository.
# Its only job is to reject changes that touch the docker/ directory.
version: 0.2

phases:
  pre_build:
    commands:
      # Extract the digits from the CodeBuild source version to get the PR number.
      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'

  build:
    commands:
      # error_cmd prints a pointer to the deep-learning-containers repo and exits non-zero.
      - error_cmd="echo 'In order to make changes to the docker files, please, use https://github.com/aws/deep-learning-containers repository.' && exit 1"
      # NOTE(review): the helper is defined outside this file; presumably it runs
      # the command only when the change set matches "docker/" - confirm.
      - execute-command-if-has-matching-changes "$error_cmd" "docker/"

buildspec-toolkit.yml

Lines changed: 0 additions & 94 deletions
This file was deleted.

buildspec.yml

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,94 @@
11
# CodeBuild spec: lint, unit-test, build images, and run local and SageMaker
# integration tests on CPU and GPU, then clean up the resources it created.
version: 0.2

env:
  variables:
    FRAMEWORK_VERSION: '1.4.0'
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    GPU_INSTANCE_TYPE: 'ml.p2.8xlarge'
    ECR_REPO: 'sagemaker-test'
    GITHUB_REPO: 'sagemaker-pytorch-container'
    DLC_ACCOUNT: '763104351884'
    SETUP_FILE: 'setup_cmds.sh'
    # Single-quoted so the \n sequences stay literal; printf expands them later.
    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]'

phases:
  pre_build:
    commands:
      - start-dockerd
      - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
      - PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      # Build IDs contain ':', which is replaced so the ID can be used in image tags.
      - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'

  build:
    commands:
      # Exported so that child processes (tox, pytest) actually see these
      # settings; a bare VAR=value only sets a shell-local variable.
      - export TOX_PARALLEL_NO_SPINNER=1
      - export PY_COLORS=0

      # install
      - pip3 install -U -e .[test]

      # run linters
      - tox -e flake8,twine

      # run unit tests
      - tox -e py27,py36,py37 test/unit

      # define tags
      - GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID"
      - DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
      - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"

      # run local CPU integration tests (build and push the image to ECR repo)
      - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
      - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"

      # launch remote GPU instance
      # (the 'ml.' prefix is stripped from the SageMaker instance type)
      - prefix='ml.'
      - instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
      - create-key-pair
      - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest

      # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
      - python3 setup.py sdist
      - build_dir="test/container/$FRAMEWORK_VERSION"
      # log in to the DLC account registry to pull the base image
      - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
      - docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
      # push DLC GPU image to ECR
      # (log in again, this time to the build account's own registry)
      - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
      - docker push $PREPROD_IMAGE:$DLC_GPU_TAG

      # run GPU local integration tests
      # printf expands the \n escapes in SETUP_CMDS into a multi-line script
      - printf "$SETUP_CMDS" > $SETUP_FILE
      # no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
      - generic_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
      - dlc_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"

      # run CPU sagemaker integration tests
      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"

      # run GPU sagemaker integration tests
      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
      - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"

    finally:
      # shut down remote GPU instance
      - cleanup-gpu-instances
      - cleanup-key-pairs

      # remove ECR image
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG

lib/changehostname.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#include <stdio.h>
2+
#include <string.h>
3+
4+
/*
5+
* Modifies gethostname to return algo-1, algo-2, etc. when running on SageMaker.
6+
*
7+
* Without this gethostname() on SageMaker returns 'aws', leading NCCL/MPI to think there is only one host,
8+
* not realizing that it needs to use NET/Socket.
9+
*
10+
* When docker container starts we read 'current_host' value from /opt/ml/input/config/resourceconfig.json
11+
* and replace PLACEHOLDER_HOSTNAME with it before compiling this code into a shared library.
12+
*/
13+
int gethostname(char *name, size_t len)
14+
{
15+
const char *val = PLACEHOLDER_HOSTNAME;
16+
strncpy(name, val, len);
17+
return 0;
18+
}

lib/start_with_right_hostname.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/usr/bin/env bash
#
# Container entry-point wrapper.
#
# For "train": reads the current host name from the SageMaker resource config,
# bakes it into /changehostname.c, compiles that file into a shared library and
# runs `train` with the library preloaded so gethostname() reports that name.
# For anything else: evaluates the given arguments as a command.

if [[ "$1" = "train" ]]; then
    # jq is used without -r on purpose: the JSON-quoted output ("algo-1")
    # becomes the C string literal that replaces the placeholder.
    CURRENT_HOST=$(jq .current_host /opt/ml/input/config/resourceconfig.json)

    # Absolute paths: the source is copied to / in the image and the library is
    # preloaded from /, so the build must not depend on the working directory.
    # "-i -e" (not "-ie") avoids GNU sed treating "e" as a backup-file suffix.
    sed -i -e "s/PLACEHOLDER_HOSTNAME/$CURRENT_HOST/g" /changehostname.c
    gcc -o /changehostname.o -c -fPIC -Wall /changehostname.c
    gcc -o /libchangehostname.so -shared -export-dynamic /changehostname.o -ldl
    LD_PRELOAD=/libchangehostname.so train
else
    eval "$@"
fi

test/container/1.4.0/Dockerfile.dlc.cpu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
ARG region
22
from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.4.0-cpu-py2
33

4-
COPY docker/build_artifacts/changehostname.c /
5-
COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
4+
COPY lib/changehostname.c /
5+
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
66
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
77

88
COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz

test/container/1.4.0/Dockerfile.dlc.gpu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
ARG region
22
from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.4.0-gpu-py3
33

4-
COPY docker/build_artifacts/changehostname.c /
5-
COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
4+
COPY lib/changehostname.c /
5+
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
66
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
77

88
COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz

test/container/1.4.0/Dockerfile.pytorch

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ RUN apt-get update \
44
&& apt-get install -y --no-install-recommends jq \
55
&& rm -rf /var/lib/apt/lists/*
66

7-
COPY docker/build_artifacts/changehostname.c /
8-
COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
7+
COPY lib/changehostname.c /
8+
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
99
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
1010

1111
COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz

0 commit comments

Comments
 (0)