From c4ce8ddffb9cbe9681a53aaeef1cbdb5de1c744a Mon Sep 17 00:00:00 2001 From: Marcio Dos Santos Date: Mon, 16 Jul 2018 16:59:29 -0700 Subject: [PATCH 1/3] Deleting endpoint for PyTorch test, retrying endpoint deletion 3 times --- tests/integ/test_pytorch_train.py | 2 +- tests/integ/timeout.py | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/tests/integ/test_pytorch_train.py b/tests/integ/test_pytorch_train.py index 9c325eb5e9..feaef2d083 100644 --- a/tests/integ/test_pytorch_train.py +++ b/tests/integ/test_pytorch_train.py @@ -41,7 +41,7 @@ def fixture_training_job(sagemaker_session, pytorch_full_version): def test_sync_fit_deploy(pytorch_training_job, sagemaker_session): # TODO: add tests against local mode when it's ready to be used endpoint_name = 'test-pytorch-sync-fit-attach-deploy{}'.format(sagemaker_timestamp()) - with timeout(minutes=20): + with timeout_and_delete_endpoint_by_name(minutes=20): estimator = PyTorch.attach(pytorch_training_job, sagemaker_session=sagemaker_session) predictor = estimator.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name) data = numpy.zeros(shape=(1, 1, 28, 28), dtype=numpy.float32) diff --git a/tests/integ/timeout.py b/tests/integ/timeout.py index 5915fcdf25..9290735df8 100644 --- a/tests/integ/timeout.py +++ b/tests/integ/timeout.py @@ -15,6 +15,8 @@ import signal from contextlib import contextmanager import logging +from time import sleep + from awslogs.core import AWSLogs from botocore.exceptions import ClientError @@ -65,17 +67,24 @@ def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, second yield [t] no_errors = True finally: - try: - sagemaker_session.delete_endpoint(endpoint_name) - LOGGER.info('deleted endpoint {}'.format(endpoint_name)) - - _show_endpoint_logs(endpoint_name, sagemaker_session) - if no_errors: - _cleanup_endpoint_logs(endpoint_name, sagemaker_session) - except ClientError as ce: - if ce.response['Error']['Code'] == 'ValidationException': - # avoids the inner exception to be overwritten - pass + attempts = 3 + + while attempts > 0: + attempts -= 1 + try: + sagemaker_session.delete_endpoint(endpoint_name) + LOGGER.info('deleted endpoint {}'.format(endpoint_name)) + + _show_endpoint_logs(endpoint_name, sagemaker_session) + if no_errors: + _cleanup_endpoint_logs(endpoint_name, sagemaker_session) + return + except ClientError as ce: + if ce.response['Error']['Code'] == 'ValidationException': + # avoids the inner exception to be overwritten + pass + # trying to delte the resource again in 10 seconds + sleep(10) def _show_endpoint_logs(endpoint_name, sagemaker_session): From 61c567d4a71fb2d3a5a4467eb9d3d246e7761b1c Mon Sep 17 00:00:00 2001 From: Marcio Vinicius dos Santos Date: Mon, 16 Jul 2018 21:10:54 -0700 Subject: [PATCH 2/3] Fixed typo --- tests/integ/timeout.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/timeout.py b/tests/integ/timeout.py index 9290735df8..7c5d43e357 100644 --- a/tests/integ/timeout.py +++ b/tests/integ/timeout.py @@ -78,12 +78,12 @@ def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, second _show_endpoint_logs(endpoint_name, sagemaker_session) if no_errors: _cleanup_endpoint_logs(endpoint_name, sagemaker_session) - return + return except ClientError as ce: if ce.response['Error']['Code'] == 'ValidationException': # avoids the inner exception to be overwritten pass - # trying to delte the resource again in 10 seconds + # trying to delete the resource again in 10 seconds sleep(10) From 925a4bafbda5431248df5c73525db6f8fd97bd8d Mon Sep 17 00:00:00 2001 From: Marcio Dos Santos Date: Tue, 24 Jul 2018 07:11:50 -0700 Subject: [PATCH 3/3] Fix integ test --- tests/integ/test_pytorch_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integ/test_pytorch_train.py b/tests/integ/test_pytorch_train.py index feaef2d083..cb46fc8fd0 100644 --- a/tests/integ/test_pytorch_train.py +++ b/tests/integ/test_pytorch_train.py @@ -41,7 +41,7 @@ def fixture_training_job(sagemaker_session, pytorch_full_version): def test_sync_fit_deploy(pytorch_training_job, sagemaker_session): # TODO: add tests against local mode when it's ready to be used endpoint_name = 'test-pytorch-sync-fit-attach-deploy{}'.format(sagemaker_timestamp()) - with timeout_and_delete_endpoint_by_name(minutes=20): + with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20): estimator = PyTorch.attach(pytorch_training_job, sagemaker_session=sagemaker_session) predictor = estimator.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name) data = numpy.zeros(shape=(1, 1, 28, 28), dtype=numpy.float32)