Skip to content

Commit f13ff09

Browse files
committed
Move lib files out of the docker folder.
1 parent 3c5302b commit f13ff09

File tree

6 files changed

+43
-14
lines changed

6 files changed

+43
-14
lines changed

buildspec.yml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ phases:
4242

4343
# run local CPU integration tests (build and push the image to ECR repo)
4444
- test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
45-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
45+
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
4646
- test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
47-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
47+
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
4848

4949
# launch remote GPU instance
5050
- prefix='ml.'
@@ -66,22 +66,22 @@ phases:
6666
# no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
6767
- generic_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
6868
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
69-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
69+
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
7070
- dlc_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
7171
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
72-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
72+
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
7373

7474
# run CPU sagemaker integration tests
7575
- test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
76-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
76+
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
7777
- test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
78-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
78+
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
7979

8080
# run GPU sagemaker integration tests
8181
- test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
82-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
82+
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
8383
- test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
84-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
84+
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*"
8585

8686
finally:
8787
# shut down remote GPU instance

lib/changehostname.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#include <stdio.h>
2+
#include <string.h>
3+
4+
/*
5+
* Modifies gethostname to return algo-1, algo-2, etc. when running on SageMaker.
6+
*
7+
* Without this, gethostname() on SageMaker returns 'aws', leading NCCL/MPI to think there is only one host,
8+
* not realizing that it needs to use NET/Socket.
9+
*
10+
* When the docker container starts, we read the 'current_host' value from /opt/ml/input/config/resourceconfig.json
11+
* and replace PLACEHOLDER_HOSTNAME with it before compiling this code into a shared library.
12+
*/
13+
int gethostname(char *name, size_t len)
14+
{
15+
const char *val = PLACEHOLDER_HOSTNAME;
16+
strncpy(name, val, len);
17+
return 0;
18+
}

lib/start_with_right_hostname.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/usr/bin/env bash
#
# Container entrypoint wrapper.
#
# When invoked as "train": reads current_host from the SageMaker resource
# config, bakes it into /changehostname.c, builds a shared library from it,
# and runs `train` with that library preloaded so gethostname() reports the
# SageMaker host name. Any other invocation is evaluated as given.

if [[ "$1" = "train" ]]; then
    # jq prints the JSON string including its surrounding double quotes, so the
    # substituted value lands in the C source as a ready-made string literal.
    CURRENT_HOST=$(jq .current_host /opt/ml/input/config/resourceconfig.json)

    # Use "-i -e", not "-ie": GNU sed parses "-ie" as in-place editing with
    # backup suffix "e" and leaves a stray changehostname.ce behind.
    # Absolute paths are used throughout because the source is copied to / and
    # LD_PRELOAD below expects the library at /, independent of the working
    # directory the container starts in.
    sed -i -e "s/PLACEHOLDER_HOSTNAME/$CURRENT_HOST/g" /changehostname.c
    gcc -o /changehostname.o -c -fPIC -Wall /changehostname.c
    gcc -o /libchangehostname.so -shared -export-dynamic /changehostname.o -ldl
    LD_PRELOAD=/libchangehostname.so train
else
    # Pass-through for any non-training command line.
    eval "$@"
fi

test/container/1.4.0/Dockerfile.dlc.cpu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
ARG region
22
from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.4.0-cpu-py2
33

4-
COPY docker/build_artifacts/changehostname.c /
5-
COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
4+
COPY lib/changehostname.c /
5+
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
66
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
77

88
COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz

test/container/1.4.0/Dockerfile.dlc.gpu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
ARG region
22
from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.4.0-gpu-py3
33

4-
COPY docker/build_artifacts/changehostname.c /
5-
COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
4+
COPY lib/changehostname.c /
5+
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
66
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
77

88
COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz

test/container/1.4.0/Dockerfile.pytorch

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ RUN apt-get update \
44
&& apt-get install -y --no-install-recommends jq \
55
&& rm -rf /var/lib/apt/lists/*
66

7-
COPY docker/build_artifacts/changehostname.c /
8-
COPY docker/build_artifacts/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
7+
COPY lib/changehostname.c /
8+
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
99
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
1010

1111
COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz

0 commit comments

Comments
 (0)