Skip to content

Commit f5944f3

Browse files
Dan ChoiChoiByungWook
Dan Choi
authored and committed
fix: remove TODOs in 1.6.0 dlc gpu dockerfile and reduce parameters for data parallel integ test
1 parent 104c11a commit f5944f3

File tree

2 files changed

+1
-23
lines changed

2 files changed

+1
-23
lines changed
Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,6 @@
 ARG region
 from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.6.0-gpu-py36-cu110-ubuntu16.04
 
-# TODO: Remove once the 1.6.0-gpu-py3 DLC image installs mpi4py
-RUN pip3 install mpi4py==3.0.3
-
-# TODO: Remove once the 1.6.0-gpu-py3 DLC image fixes OpenSSH config
-# Configure OpenSSH so that nodes can communicate with each other
-RUN mkdir -p /var/run/sshd && \
-sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
-
-RUN rm -rf /root/.ssh/ && \
-mkdir -p /root/.ssh/ && \
-ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
-cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
-&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
-
-# TODO: Remove once the 1.6.0-gpu-py3 DLC image fixes MPI config
-# Comment line in MPI config to prevent mutually exclusive MCA settings
-RUN sed -i '62,62 s/^/#/' /home/.openmpi/etc/openmpi-mca-params.conf
-
-COPY lib/changehostname.c /
-COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
-RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
-
 COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz
 RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_training.tar.gz && \
 rm /sagemaker_pytorch_training.tar.gz

test/integration/sagemaker/test_smdataparallel.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
 @pytest.mark.skip_generic
 @pytest.mark.parametrize(
 "instances, train_instance_type",
-[(1, "ml.p3.16xlarge"), (2, "ml.p3.16xlarge"), (1, "ml.p3dn.24xlarge"), (2, "ml.p3dn.24xlarge")],
+[(2, "ml.p3.16xlarge")],
 )
 def test_smdataparallel_training(
 instances, train_instance_type, sagemaker_session, image_uri, framework_version, tmpdir

0 commit comments

Comments
 (0)