diff --git a/CHANGELOG.rst b/CHANGELOG.rst index cd4098a81d..62a1a1b9cc 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,7 @@ CHANGELOG ====== * doc-fix: Change ``distribution`` to ``distributions`` +* bug-fix: Increase docker-compose http timeout and health check timeout to 120. 1.16.1.post1 ============ diff --git a/src/sagemaker/local/entities.py b/src/sagemaker/local/entities.py index d54aa8e4f5..91034ca5c9 100644 --- a/src/sagemaker/local/entities.py +++ b/src/sagemaker/local/entities.py @@ -29,7 +29,7 @@ logger.setLevel(logging.WARNING) _UNUSED_ARN = 'local:arn-does-not-matter' -HEALTH_CHECK_TIMEOUT_LIMIT = 30 +HEALTH_CHECK_TIMEOUT_LIMIT = 120 class _LocalTrainingJob(object): @@ -405,7 +405,7 @@ def _wait_for_serving_container(serving_port): endpoint_url = 'http://localhost:%s/ping' % serving_port while True: - i += 1 + i += 5 if i >= HEALTH_CHECK_TIMEOUT_LIMIT: raise RuntimeError('Giving up, endpoint didn\'t launch correctly') @@ -416,7 +416,7 @@ def _wait_for_serving_container(serving_port): else: return - time.sleep(1) + time.sleep(5) def _perform_request(endpoint_url, pool_manager=None): diff --git a/src/sagemaker/local/image.py b/src/sagemaker/local/image.py index 8416e47048..77ecffc1e5 100644 --- a/src/sagemaker/local/image.py +++ b/src/sagemaker/local/image.py @@ -39,6 +39,9 @@ CONTAINER_PREFIX = 'algo' DOCKER_COMPOSE_FILENAME = 'docker-compose.yaml' +DOCKER_COMPOSE_HTTP_TIMEOUT_ENV = 'COMPOSE_HTTP_TIMEOUT' +DOCKER_COMPOSE_HTTP_TIMEOUT = '120' + # Environment variables to be set during training REGION_ENV_NAME = 'AWS_REGION' @@ -359,6 +362,9 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en additional_env_var_list = ['{}={}'.format(k, v) for k, v in additional_env_vars.items()] environment.extend(additional_env_var_list) + if os.environ.get(DOCKER_COMPOSE_HTTP_TIMEOUT_ENV) is None: + os.environ[DOCKER_COMPOSE_HTTP_TIMEOUT_ENV] = DOCKER_COMPOSE_HTTP_TIMEOUT + if command == 'train': optml_dirs = {'output', 'output/data', 'input'}