From 6c9d3c5fd768c3a2459d3e6da413822aa1572eef Mon Sep 17 00:00:00 2001 From: Dan Choi Date: Wed, 5 Dec 2018 15:02:32 -0800 Subject: [PATCH 1/3] increase timeout to 120 and ping interval to 5 --- src/sagemaker/local/entities.py | 6 +++--- src/sagemaker/local/image.py | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/local/entities.py b/src/sagemaker/local/entities.py index d54aa8e4f5..91034ca5c9 100644 --- a/src/sagemaker/local/entities.py +++ b/src/sagemaker/local/entities.py @@ -29,7 +29,7 @@ logger.setLevel(logging.WARNING) _UNUSED_ARN = 'local:arn-does-not-matter' -HEALTH_CHECK_TIMEOUT_LIMIT = 30 +HEALTH_CHECK_TIMEOUT_LIMIT = 120 class _LocalTrainingJob(object): @@ -405,7 +405,7 @@ def _wait_for_serving_container(serving_port): endpoint_url = 'http://localhost:%s/ping' % serving_port while True: - i += 1 + i += 5 if i >= HEALTH_CHECK_TIMEOUT_LIMIT: raise RuntimeError('Giving up, endpoint didn\'t launch correctly') @@ -416,7 +416,7 @@ def _wait_for_serving_container(serving_port): else: return - time.sleep(1) + time.sleep(5) def _perform_request(endpoint_url, pool_manager=None): diff --git a/src/sagemaker/local/image.py b/src/sagemaker/local/image.py index 8416e47048..9b1c9cb5c5 100644 --- a/src/sagemaker/local/image.py +++ b/src/sagemaker/local/image.py @@ -36,10 +36,12 @@ import sagemaker.local.data import sagemaker.local.utils import sagemaker.utils +from sagemaker.local.entities import HEALTH_CHECK_TIMEOUT_LIMIT CONTAINER_PREFIX = 'algo' DOCKER_COMPOSE_FILENAME = 'docker-compose.yaml' + # Environment variables to be set during training REGION_ENV_NAME = 'AWS_REGION' TRAINING_JOB_NAME_ENV_NAME = 'TRAINING_JOB_NAME' @@ -359,6 +361,9 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en additional_env_var_list = ['{}={}'.format(k, v) for k, v in additional_env_vars.items()] environment.extend(additional_env_var_list) + if os.environ.get('COMPOSE_HTTP_TIMEOUT') is None: + 
os.environ['COMPOSE_HTTP_TIMEOUT'] = str(HEALTH_CHECK_TIMEOUT_LIMIT) + if command == 'train': optml_dirs = {'output', 'output/data', 'input'} From 744e9f7ec11b0f6738fa11d579c904a16c287e24 Mon Sep 17 00:00:00 2001 From: Dan Choi Date: Wed, 5 Dec 2018 15:14:55 -0800 Subject: [PATCH 2/3] remove circular dependency --- src/sagemaker/local/image.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/local/image.py b/src/sagemaker/local/image.py index 9b1c9cb5c5..77ecffc1e5 100644 --- a/src/sagemaker/local/image.py +++ b/src/sagemaker/local/image.py @@ -36,10 +36,11 @@ import sagemaker.local.data import sagemaker.local.utils import sagemaker.utils -from sagemaker.local.entities import HEALTH_CHECK_TIMEOUT_LIMIT CONTAINER_PREFIX = 'algo' DOCKER_COMPOSE_FILENAME = 'docker-compose.yaml' +DOCKER_COMPOSE_HTTP_TIMEOUT_ENV = 'COMPOSE_HTTP_TIMEOUT' +DOCKER_COMPOSE_HTTP_TIMEOUT = '120' # Environment variables to be set during training @@ -361,8 +362,8 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en additional_env_var_list = ['{}={}'.format(k, v) for k, v in additional_env_vars.items()] environment.extend(additional_env_var_list) - if os.environ.get('COMPOSE_HTTP_TIMEOUT') is None: - os.environ['COMPOSE_HTTP_TIMEOUT'] = str(HEALTH_CHECK_TIMEOUT_LIMIT) + if os.environ.get(DOCKER_COMPOSE_HTTP_TIMEOUT_ENV) is None: + os.environ[DOCKER_COMPOSE_HTTP_TIMEOUT_ENV] = DOCKER_COMPOSE_HTTP_TIMEOUT if command == 'train': optml_dirs = {'output', 'output/data', 'input'} From 8997b04baa863346c6f32051e4bd531871fa4e26 Mon Sep 17 00:00:00 2001 From: Dan Choi Date: Wed, 5 Dec 2018 15:39:28 -0800 Subject: [PATCH 3/3] update changelog --- CHANGELOG.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d21d3f80ee..449d01d69a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,11 @@ CHANGELOG ========= +1.16.2dev +====== + +* bug-fix: Increase docker-compose http timeout and health check 
timeout to 120 seconds. + 1.16.1.post1 ============