Skip to content

Commit f232b2a

Browse files
ChoiByungWook authored and jesterhazy committed
increase docker-compose timeout to 120 and ping interval to 5 (#525)
1 parent 65f1f15 commit f232b2a

File tree

3 files changed

+10
-3
lines changed

3 files changed

+10
-3
lines changed

CHANGELOG.rst

+1
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ CHANGELOG
66
======
77

88
* doc-fix: Change ``distribution`` to ``distributions``
9+
* bug-fix: Increase docker-compose http timeout and health check timeout to 120.
910

1011
1.16.1.post1
1112
============

src/sagemaker/local/entities.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@
2929
logger.setLevel(logging.WARNING)
3030

3131
_UNUSED_ARN = 'local:arn-does-not-matter'
32-
HEALTH_CHECK_TIMEOUT_LIMIT = 30
32+
HEALTH_CHECK_TIMEOUT_LIMIT = 120
3333

3434

3535
class _LocalTrainingJob(object):
@@ -405,7 +405,7 @@ def _wait_for_serving_container(serving_port):
405405

406406
endpoint_url = 'http://localhost:%s/ping' % serving_port
407407
while True:
408-
i += 1
408+
i += 5
409409
if i >= HEALTH_CHECK_TIMEOUT_LIMIT:
410410
raise RuntimeError('Giving up, endpoint didn\'t launch correctly')
411411

@@ -416,7 +416,7 @@ def _wait_for_serving_container(serving_port):
416416
else:
417417
return
418418

419-
time.sleep(1)
419+
time.sleep(5)
420420

421421

422422
def _perform_request(endpoint_url, pool_manager=None):

src/sagemaker/local/image.py

+6
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,9 @@
3939

4040
CONTAINER_PREFIX = 'algo'
4141
DOCKER_COMPOSE_FILENAME = 'docker-compose.yaml'
42+
DOCKER_COMPOSE_HTTP_TIMEOUT_ENV = 'COMPOSE_HTTP_TIMEOUT'
43+
DOCKER_COMPOSE_HTTP_TIMEOUT = '120'
44+
4245

4346
# Environment variables to be set during training
4447
REGION_ENV_NAME = 'AWS_REGION'
@@ -359,6 +362,9 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
359362
additional_env_var_list = ['{}={}'.format(k, v) for k, v in additional_env_vars.items()]
360363
environment.extend(additional_env_var_list)
361364

365+
if os.environ.get(DOCKER_COMPOSE_HTTP_TIMEOUT_ENV) is None:
366+
os.environ[DOCKER_COMPOSE_HTTP_TIMEOUT_ENV] = DOCKER_COMPOSE_HTTP_TIMEOUT
367+
362368
if command == 'train':
363369
optml_dirs = {'output', 'output/data', 'input'}
364370

0 commit comments

Comments
 (0)