File tree 2 files changed +5
-4
lines changed
src/sagemaker_pytorch_container
2 files changed +5
-4
lines changed Original file line number Diff line number Diff line change @@ -47,7 +47,7 @@ def train(training_environment):
47
47
48
48
_set_nccl_environment (training_environment .network_interface_name )
49
49
50
- _set_distributed_environment (training_environment . hosts )
50
+ _set_distributed_environment (training_environment )
51
51
52
52
mpi_enabled = training_environment .additional_framework_parameters .get ('sagemaker_mpi_enabled' )
53
53
@@ -88,15 +88,15 @@ def _dns_lookup(host):
88
88
return socket .gethostbyname (host )
89
89
90
90
91
- def _set_distributed_environment (hosts ):
91
+ def _set_distributed_environment (training_env ):
92
92
"""Set environment variable for distributed training.
93
93
94
94
Args:
95
95
training_env: training environment object; its master_hostname is used as MASTER_ADDR.
96
96
"""
97
97
# According to https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html
98
98
# hosts are sorted lexicographically.
99
- os .environ ['MASTER_ADDR' ] = hosts [ 0 ]
99
+ os .environ ['MASTER_ADDR' ] = training_env . master_hostname
100
100
os .environ ['MASTER_PORT' ] = MASTER_PORT
101
101
102
102
Original file line number Diff line number Diff line change @@ -31,6 +31,7 @@ def fixture_training_env():
31
31
env = MagicMock ()
32
32
env .current_host = 'algo-1'
33
33
env .hosts = ['algo-1' ]
34
+ env .master_hostname = 'algo-1'
34
35
env .network_interface_name = 'eth0'
35
36
tmp = tempfile .mkdtemp ()
36
37
os .makedirs (os .path .join (tmp , 'model' ))
@@ -96,7 +97,7 @@ def test_environment(training_env):
96
97
97
98
# distributed training specific environment
98
99
assert MASTER_PORT == os .environ ['MASTER_PORT' ]
99
- assert training_env .hosts [ 0 ] == os .environ ['MASTER_ADDR' ]
100
+ assert training_env .master_hostname == os .environ ['MASTER_ADDR' ]
100
101
101
102
# nccl specific environment
102
103
assert training_env .network_interface_name == os .environ ['NCCL_SOCKET_IFNAME' ]
You can’t perform that action at this time.
0 commit comments