Merge pull request #9 from aws/mvs-timeout-deletes-endpoints

mvsusp · web-flow · commit f8b90f2ffadd · 2017-12-06T16:49:31.000-08:00
Improving endpoint deletion
diff --git a/tests/integ/__init__.py b/tests/integ/__init__.py
@@ -10,6 +10,10 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
+import logging
 import os
 DATA_DIR = os.path.join(os.path.dirname(__file__), '..', 'data')
 REGION = 'us-west-2'
+
+logging.getLogger('boto3').setLevel(logging.INFO)
+logging.getLogger('botocore').setLevel(logging.INFO)
diff --git a/tests/integ/test_kmeans.py b/tests/integ/test_kmeans.py
@@ -11,16 +11,17 @@
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
 import gzip
-import os
 import pickle
 import sys
-import pytest  # noqa
+
 import boto3
+import os
+
 import sagemaker
 from sagemaker import KMeans, KMeansModel
-
+from sagemaker.utils import name_from_base
 from tests.integ import DATA_DIR, REGION
-from tests.integ.timeout import timeout
+from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
 def test_kmeans():
@@ -49,15 +50,13 @@ def test_kmeans():
 
         kmeans.fit(kmeans.record_set(train_set[0][:100]))
 
-    with timeout(minutes=15):
+    endpoint_name = name_from_base('kmeans')
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=15):
         model = KMeansModel(kmeans.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
-        predictor = model.deploy(1, 'ml.c4.xlarge')
+        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
         result = predictor.predict(train_set[0][:10])
 
-        try:
-            assert len(result) == 10
-            for record in result:
-                assert record.label["closest_cluster"] is not None
-                assert record.label["distance_to_cluster"] is not None
-        finally:
-            sagemaker_session.delete_endpoint(predictor.endpoint)
+        assert len(result) == 10
+        for record in result:
+            assert record.label["closest_cluster"] is not None
+            assert record.label["distance_to_cluster"] is not None
diff --git a/tests/integ/test_linear_learner.py b/tests/integ/test_linear_learner.py
@@ -19,9 +19,10 @@
 import numpy as np
 import sagemaker
 from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel
+from sagemaker.utils import name_from_base
 
 from tests.integ import DATA_DIR, REGION
-from tests.integ.timeout import timeout
+from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
 def test_linear_learner():
@@ -72,15 +73,14 @@ def test_linear_learner():
         ll.num_point_for_scala = 10000
         ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]))
 
-    with timeout(minutes=15):
+    endpoint_name = name_from_base('linear-learner')
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=15):
+
         model = LinearLearnerModel(ll.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
-        predictor = model.deploy(1, 'ml.c4.xlarge')
+        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
 
-        try:
-            result = predictor.predict(train_set[0][0:100])
-            assert len(result) == 100
-            for record in result:
-                assert record.label["predicted_label"] is not None
-                assert record.label["score"] is not None
-        finally:
-            sagemaker_session.delete_endpoint(predictor.endpoint)
+        result = predictor.predict(train_set[0][0:100])
+        assert len(result) == 100
+        for record in result:
+            assert record.label["predicted_label"] is not None
+            assert record.label["score"] is not None
diff --git a/tests/integ/test_mxnet_train.py b/tests/integ/test_mxnet_train.py
@@ -21,7 +21,7 @@
 from sagemaker.mxnet.model import MXNetModel
 
 from tests.integ import DATA_DIR, REGION
-from tests.integ.timeout import timeout
+from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
 @pytest.fixture(scope='module')
@@ -49,26 +49,24 @@ def mxnet_training_job(sagemaker_session):
 
 
 def test_attach_deploy(mxnet_training_job, sagemaker_session):
-    with timeout(minutes=15):
+    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(int(time.time()))
+
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=15):
         estimator = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)
-        predictor = estimator.deploy(1, 'ml.m4.xlarge',
-                                     endpoint_name='test-mxnet-attach-deploy-{}'.format(int(time.time())))
-        try:
-            data = numpy.zeros(shape=(1, 1, 28, 28))
-            predictor.predict(data)
-        finally:
-            sagemaker_session.delete_endpoint(predictor.endpoint)
+        predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
+        data = numpy.zeros(shape=(1, 1, 28, 28))
+        predictor.predict(data)
 
 
 def test_deploy_model(mxnet_training_job, sagemaker_session):
-    with timeout(minutes=15):
+    endpoint_name = 'test-mxnet-deploy-model-{}'.format(int(time.time()))
+
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=15):
         desc = sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=mxnet_training_job)
         model_data = desc['ModelArtifacts']['S3ModelArtifacts']
         script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
         model = MXNetModel(model_data, 'SageMakerRole', entry_point=script_path, sagemaker_session=sagemaker_session)
-        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name='test-mxnet-deploy-model-{}'.format(int(time.time())))
-        try:
-            data = numpy.zeros(shape=(1, 1, 28, 28))
-            predictor.predict(data)
-        finally:
-            sagemaker_session.delete_endpoint(predictor.endpoint)
+        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
+
+        data = numpy.zeros(shape=(1, 1, 28, 28))
+        predictor.predict(data)
diff --git a/tests/integ/test_pca.py b/tests/integ/test_pca.py
@@ -18,9 +18,10 @@
 import boto3
 import sagemaker
 import sagemaker.amazon.pca
+from sagemaker.utils import name_from_base
 
 from tests.integ import DATA_DIR, REGION
-from tests.integ.timeout import timeout
+from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
 def test_pca():
@@ -42,16 +43,15 @@ def test_pca():
         pca.extra_components = 5
         pca.fit(pca.record_set(train_set[0][:100]))
 
-    with timeout(minutes=15):
+    endpoint_name = name_from_base('pca')
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=15):
         pca_model = sagemaker.amazon.pca.PCAModel(model_data=pca.model_data, role='SageMakerRole',
                                                   sagemaker_session=sagemaker_session)
-        predictor = pca_model.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge")
+        predictor = pca_model.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge",
+                                     endpoint_name=endpoint_name)
 
-        try:
-            result = predictor.predict(train_set[0][:5])
+        result = predictor.predict(train_set[0][:5])
 
-            assert len(result) == 5
-            for record in result:
-                assert record.label["projection"] is not None
-        finally:
-            sagemaker_session.delete_endpoint(predictor.endpoint)
+        assert len(result) == 5
+        for record in result:
+            assert record.label["projection"] is not None
diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py
@@ -10,17 +10,14 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
-import os
-
 import boto3
+import os
 import pytest
+
 from sagemaker import Session
 from sagemaker.tensorflow import TensorFlow
-
 from tests.integ import DATA_DIR, REGION
-from tests.integ.timeout import timeout
-
-DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
+from tests.integ.timeout import timeout_and_delete_endpoint, timeout
 
 
 @pytest.fixture(scope='module')
@@ -47,40 +44,8 @@ def test_tf(sagemaker_session):
         estimator.fit(inputs)
         print('job succeeded: {}'.format(estimator.latest_training_job.name))
 
-    try:
-        with timeout(minutes=15):
-            json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
-
-            result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
-            print('predict result: {}'.format(result))
-    finally:
-        try:
-            estimator.delete_endpoint()
-        except Exception:
-            pass
-
-
-def test_cifar(sagemaker_session):
-    with timeout(minutes=15):
-        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
-
-        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')
-
-        estimator = TensorFlow(entry_point='resnet_cifar_10.py', source_dir=script_path, role='SageMakerRole',
-                               training_steps=20, evaluation_steps=5,
-                               train_instance_count=2, train_instance_type='ml.p2.xlarge',
-                               sagemaker_session=sagemaker_session,
-                               base_job_name='test-cifar')
-
-        inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')
-        estimator.fit(inputs)
-        print('job succeeded: {}'.format(estimator.latest_training_job.name))
+    with timeout_and_delete_endpoint(estimator=estimator, minutes=15):
+        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
 
-    try:
-        with timeout(minutes=15):
-            estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
-    finally:
-        try:
-            estimator.delete_endpoint()
-        except Exception:
-            pass
+        result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
+        print('predict result: {}'.format(result))
diff --git a/tests/integ/test_tf_cifar.py b/tests/integ/test_tf_cifar.py
@@ -0,0 +1,45 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+import boto3
+import os
+import pytest
+
+from sagemaker import Session
+from sagemaker.tensorflow import TensorFlow
+from tests.integ import DATA_DIR, REGION
+from tests.integ.timeout import timeout_and_delete_endpoint, timeout
+
+
+@pytest.fixture(scope='module')
+def sagemaker_session():
+    return Session(boto_session=boto3.Session(region_name=REGION))
+
+
+def test_cifar(sagemaker_session):
+    """Train the CIFAR-10 ResNet sample on SageMaker, then deploy the trained estimator."""
+    # Phase 1: upload the dataset and run the training job within a 15-minute budget.
+    with timeout(minutes=15):
+        cifar_dir = os.path.join(DATA_DIR, 'cifar_10')
+        source_dir = os.path.join(cifar_dir, 'source')
+        data_dir = os.path.join(cifar_dir, 'data')
+        estimator = TensorFlow(entry_point='resnet_cifar_10.py', source_dir=source_dir, role='SageMakerRole',
+                               training_steps=20, evaluation_steps=5,
+                               train_instance_count=2, train_instance_type='ml.p2.xlarge',
+                               sagemaker_session=sagemaker_session, base_job_name='test-cifar')
+        s3_inputs = estimator.sagemaker_session.upload_data(path=data_dir, key_prefix='data/cifar10')
+        estimator.fit(s3_inputs)
+        print('job succeeded: {}'.format(estimator.latest_training_job.name))
+
+    # Phase 2: deploy; the context manager deletes the endpoint when the block exits.
+    with timeout_and_delete_endpoint(estimator=estimator, minutes=15):
+        estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
diff --git a/tests/integ/timeout.py b/tests/integ/timeout.py
@@ -12,6 +12,9 @@
 # language governing permissions and limitations under the License.
 import signal
 from contextlib import contextmanager
+import logging
+
+LOGGER = logging.getLogger('timeout')
 
 
 class TimeoutError(Exception):
@@ -48,3 +51,72 @@ def handler(signum, frame):
         yield
     finally:
         signal.alarm(0)
+
+
+@contextmanager
+def _timeout_then_cleanup(cleanup, success_message, seconds, minutes, hours):
+    """Run the managed block under ``timeout``, then always call ``cleanup``.
+
+    ``cleanup`` runs only after the ``timeout`` block has exited, so the time budget
+    covers the managed block and not the endpoint deletion. If the managed block
+    raised, an error from ``cleanup`` is logged and suppressed so that the block's own
+    exception is the one that propagates; otherwise a ``cleanup`` error is raised.
+
+    Args:
+        cleanup: zero-argument callable that deletes the endpoint.
+        success_message (str): logged at INFO level once ``cleanup`` has returned.
+        seconds, minutes, hours: time budget, forwarded unchanged to ``timeout``.
+
+    Yields:
+        list: one-element list holding whatever ``timeout`` yields.
+    """
+    body_failed = True
+    try:
+        with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
+            yield [t]
+        body_failed = False
+    finally:
+        try:
+            cleanup()
+            LOGGER.info(success_message)
+        except Exception:
+            if not body_failed:
+                raise
+            # The block already failed (the endpoint may never have been created);
+            # log the cleanup error instead of letting it replace the real failure.
+            LOGGER.exception('failed to delete endpoint after an error in the managed block')
+
+
+@contextmanager
+def timeout_and_delete_endpoint(estimator, seconds=0, minutes=0, hours=0):
+    """Time-box the enclosed block, then delete the estimator's endpoint.
+
+    Args:
+        estimator: object whose ``delete_endpoint()`` method is called on exit.
+        seconds, minutes, hours: time budget, forwarded to ``timeout``.
+
+    Yields:
+        list: one-element list holding whatever ``timeout`` yields.
+    """
+    with _timeout_then_cleanup(estimator.delete_endpoint, 'deleted endpoint', seconds, minutes, hours) as handle:
+        yield handle
+
+
+@contextmanager
+def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0):
+    """Time-box the enclosed block, then delete the endpoint called ``endpoint_name``.
+
+    Args:
+        endpoint_name (str): name passed to ``sagemaker_session.delete_endpoint`` on exit.
+        sagemaker_session: session object used to delete the endpoint.
+        seconds, minutes, hours: time budget, forwarded to ``timeout``.
+
+    Yields:
+        list: one-element list holding whatever ``timeout`` yields.
+    """
+    def delete_by_name():
+        sagemaker_session.delete_endpoint(endpoint_name)
+
+    with _timeout_then_cleanup(delete_by_name, 'deleted endpoint {}'.format(endpoint_name),
+                               seconds, minutes, hours) as handle:
+        yield handle
diff --git a/tox.ini b/tox.ini
@@ -20,6 +20,9 @@ exclude =
 max-complexity = 10
 
 [testenv]
+# The TEAMCITY_VERSION environment variable is set during builds on TeamCity. teamcity-messages uses it to enable
+# reporting to TeamCity.
+passenv = TEAMCITY_VERSION
 # {posargs} can be passed in by additional arguments specified when invoking tox.
 # Can be used to specify which tests to run, e.g.: tox -- -s
 commands =
@@ -31,6 +34,7 @@ deps =
     tensorflow
     mock
     contextlib2
+    teamcity-messages
 
 [testenv:flake8]
 basepython = python