Skip to content

Commit f8b90f2

Browse files
authored
Merge pull request #9 from aws/mvs-timeout-deletes-endpoints
Improving endpoint deletion
2 parents 6a6d4c6 + c3bd56a commit f8b90f2

9 files changed

+130
-92
lines changed

tests/integ/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13+
import logging
1314
import os
1415
DATA_DIR = os.path.join(os.path.dirname(__file__), '..', 'data')
1516
REGION = 'us-west-2'
17+
18+
logging.getLogger('boto3').setLevel(logging.INFO)
19+
logging.getLogger('botocore').setLevel(logging.INFO)

tests/integ/test_kmeans.py

+12-13
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,17 @@
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
1313
import gzip
14-
import os
1514
import pickle
1615
import sys
17-
import pytest # noqa
16+
1817
import boto3
18+
import os
19+
1920
import sagemaker
2021
from sagemaker import KMeans, KMeansModel
21-
22+
from sagemaker.utils import name_from_base
2223
from tests.integ import DATA_DIR, REGION
23-
from tests.integ.timeout import timeout
24+
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2425

2526

2627
def test_kmeans():
@@ -49,15 +50,13 @@ def test_kmeans():
4950

5051
kmeans.fit(kmeans.record_set(train_set[0][:100]))
5152

52-
with timeout(minutes=15):
53+
endpoint_name = name_from_base('kmeans')
54+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=15):
5355
model = KMeansModel(kmeans.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
54-
predictor = model.deploy(1, 'ml.c4.xlarge')
56+
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
5557
result = predictor.predict(train_set[0][:10])
5658

57-
try:
58-
assert len(result) == 10
59-
for record in result:
60-
assert record.label["closest_cluster"] is not None
61-
assert record.label["distance_to_cluster"] is not None
62-
finally:
63-
sagemaker_session.delete_endpoint(predictor.endpoint)
59+
assert len(result) == 10
60+
for record in result:
61+
assert record.label["closest_cluster"] is not None
62+
assert record.label["distance_to_cluster"] is not None

tests/integ/test_linear_learner.py

+11-11
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,10 @@
1919
import numpy as np
2020
import sagemaker
2121
from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel
22+
from sagemaker.utils import name_from_base
2223

2324
from tests.integ import DATA_DIR, REGION
24-
from tests.integ.timeout import timeout
25+
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2526

2627

2728
def test_linear_learner():
@@ -72,15 +73,14 @@ def test_linear_learner():
7273
ll.num_point_for_scala = 10000
7374
ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]))
7475

75-
with timeout(minutes=15):
76+
endpoint_name = name_from_base('linear-learner')
77+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=15):
78+
7679
model = LinearLearnerModel(ll.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
77-
predictor = model.deploy(1, 'ml.c4.xlarge')
80+
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
7881

79-
try:
80-
result = predictor.predict(train_set[0][0:100])
81-
assert len(result) == 100
82-
for record in result:
83-
assert record.label["predicted_label"] is not None
84-
assert record.label["score"] is not None
85-
finally:
86-
sagemaker_session.delete_endpoint(predictor.endpoint)
82+
result = predictor.predict(train_set[0][0:100])
83+
assert len(result) == 100
84+
for record in result:
85+
assert record.label["predicted_label"] is not None
86+
assert record.label["score"] is not None

tests/integ/test_mxnet_train.py

+14-16
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from sagemaker.mxnet.model import MXNetModel
2222

2323
from tests.integ import DATA_DIR, REGION
24-
from tests.integ.timeout import timeout
24+
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2525

2626

2727
@pytest.fixture(scope='module')
@@ -49,26 +49,24 @@ def mxnet_training_job(sagemaker_session):
4949

5050

5151
def test_attach_deploy(mxnet_training_job, sagemaker_session):
52-
with timeout(minutes=15):
52+
endpoint_name = 'test-mxnet-attach-deploy-{}'.format(int(time.time()))
53+
54+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=15):
5355
estimator = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session)
54-
predictor = estimator.deploy(1, 'ml.m4.xlarge',
55-
endpoint_name='test-mxnet-attach-deploy-{}'.format(int(time.time())))
56-
try:
57-
data = numpy.zeros(shape=(1, 1, 28, 28))
58-
predictor.predict(data)
59-
finally:
60-
sagemaker_session.delete_endpoint(predictor.endpoint)
56+
predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
57+
data = numpy.zeros(shape=(1, 1, 28, 28))
58+
predictor.predict(data)
6159

6260

6361
def test_deploy_model(mxnet_training_job, sagemaker_session):
64-
with timeout(minutes=15):
62+
endpoint_name = 'test-mxnet-deploy-model-{}'.format(int(time.time()))
63+
64+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=15):
6565
desc = sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=mxnet_training_job)
6666
model_data = desc['ModelArtifacts']['S3ModelArtifacts']
6767
script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
6868
model = MXNetModel(model_data, 'SageMakerRole', entry_point=script_path, sagemaker_session=sagemaker_session)
69-
predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name='test-mxnet-deploy-model-{}'.format(int(time.time())))
70-
try:
71-
data = numpy.zeros(shape=(1, 1, 28, 28))
72-
predictor.predict(data)
73-
finally:
74-
sagemaker_session.delete_endpoint(predictor.endpoint)
69+
predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
70+
71+
data = numpy.zeros(shape=(1, 1, 28, 28))
72+
predictor.predict(data)

tests/integ/test_pca.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@
1818
import boto3
1919
import sagemaker
2020
import sagemaker.amazon.pca
21+
from sagemaker.utils import name_from_base
2122

2223
from tests.integ import DATA_DIR, REGION
23-
from tests.integ.timeout import timeout
24+
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2425

2526

2627
def test_pca():
@@ -42,16 +43,15 @@ def test_pca():
4243
pca.extra_components = 5
4344
pca.fit(pca.record_set(train_set[0][:100]))
4445

45-
with timeout(minutes=15):
46+
endpoint_name = name_from_base('pca')
47+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=15):
4648
pca_model = sagemaker.amazon.pca.PCAModel(model_data=pca.model_data, role='SageMakerRole',
4749
sagemaker_session=sagemaker_session)
48-
predictor = pca_model.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge")
50+
predictor = pca_model.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge",
51+
endpoint_name=endpoint_name)
4952

50-
try:
51-
result = predictor.predict(train_set[0][:5])
53+
result = predictor.predict(train_set[0][:5])
5254

53-
assert len(result) == 5
54-
for record in result:
55-
assert record.label["projection"] is not None
56-
finally:
57-
sagemaker_session.delete_endpoint(predictor.endpoint)
55+
assert len(result) == 5
56+
for record in result:
57+
assert record.label["projection"] is not None

tests/integ/test_tf.py

+7-42
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,14 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13-
import os
14-
1513
import boto3
14+
import os
1615
import pytest
16+
1717
from sagemaker import Session
1818
from sagemaker.tensorflow import TensorFlow
19-
2019
from tests.integ import DATA_DIR, REGION
21-
from tests.integ.timeout import timeout
22-
23-
DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
20+
from tests.integ.timeout import timeout_and_delete_endpoint, timeout
2421

2522

2623
@pytest.fixture(scope='module')
@@ -47,40 +44,8 @@ def test_tf(sagemaker_session):
4744
estimator.fit(inputs)
4845
print('job succeeded: {}'.format(estimator.latest_training_job.name))
4946

50-
try:
51-
with timeout(minutes=15):
52-
json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
53-
54-
result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
55-
print('predict result: {}'.format(result))
56-
finally:
57-
try:
58-
estimator.delete_endpoint()
59-
except Exception:
60-
pass
61-
62-
63-
def test_cifar(sagemaker_session):
64-
with timeout(minutes=15):
65-
script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
66-
67-
dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')
68-
69-
estimator = TensorFlow(entry_point='resnet_cifar_10.py', source_dir=script_path, role='SageMakerRole',
70-
training_steps=20, evaluation_steps=5,
71-
train_instance_count=2, train_instance_type='ml.p2.xlarge',
72-
sagemaker_session=sagemaker_session,
73-
base_job_name='test-cifar')
74-
75-
inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')
76-
estimator.fit(inputs)
77-
print('job succeeded: {}'.format(estimator.latest_training_job.name))
47+
with timeout_and_delete_endpoint(estimator=estimator, minutes=15):
48+
json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
7849

79-
try:
80-
with timeout(minutes=15):
81-
estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
82-
finally:
83-
try:
84-
estimator.delete_endpoint()
85-
except Exception:
86-
pass
50+
result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
51+
print('predict result: {}'.format(result))

tests/integ/test_tf_cifar.py

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
import boto3
14+
import os
15+
import pytest
16+
17+
from sagemaker import Session
18+
from sagemaker.tensorflow import TensorFlow
19+
from tests.integ import DATA_DIR, REGION
20+
from tests.integ.timeout import timeout_and_delete_endpoint, timeout
21+
22+
23+
@pytest.fixture(scope='module')
def sagemaker_session():
    """Build the SageMaker ``Session`` shared by the tests in this module.

    The session wraps a boto3 session created for ``REGION`` (imported from
    ``tests.integ``). The fixture is module-scoped, so it is created once
    per test module.
    """
    boto_session = boto3.Session(region_name=REGION)
    return Session(boto_session=boto_session)
26+
27+
28+
def test_cifar(sagemaker_session):
    """Train the CIFAR-10 ResNet script with TensorFlow, then deploy it.

    Training (data upload plus ``fit``) runs under a 15-minute ``timeout``.
    The deployment runs under ``timeout_and_delete_endpoint`` with its own
    15-minute limit, which deletes the estimator's endpoint on exit.
    """
    with timeout(minutes=15):
        cifar_source_dir = os.path.join(DATA_DIR, 'cifar_10', 'source')
        cifar_data_dir = os.path.join(DATA_DIR, 'cifar_10', 'data')

        estimator_args = dict(
            entry_point='resnet_cifar_10.py',
            source_dir=cifar_source_dir,
            role='SageMakerRole',
            training_steps=20,
            evaluation_steps=5,
            train_instance_count=2,
            train_instance_type='ml.p2.xlarge',
            sagemaker_session=sagemaker_session,
            base_job_name='test-cifar',
        )
        estimator = TensorFlow(**estimator_args)

        uploaded_inputs = estimator.sagemaker_session.upload_data(path=cifar_data_dir,
                                                                  key_prefix='data/cifar10')
        estimator.fit(uploaded_inputs)
        job_name = estimator.latest_training_job.name
        print('job succeeded: {}'.format(job_name))

    with timeout_and_delete_endpoint(estimator=estimator, minutes=15):
        estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')

tests/integ/timeout.py

+23
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
# language governing permissions and limitations under the License.
1313
import signal
1414
from contextlib import contextmanager
15+
import logging
16+
17+
LOGGER = logging.getLogger('timeout')
1518

1619

1720
class TimeoutError(Exception):
@@ -48,3 +51,23 @@ def handler(signum, frame):
4851
yield
4952
finally:
5053
signal.alarm(0)
54+
55+
56+
@contextmanager
def timeout_and_delete_endpoint(estimator, seconds=0, minutes=0, hours=0):
    """Run the ``with`` body under ``timeout`` and always delete the endpoint.

    Args:
        estimator: Object whose ``delete_endpoint()`` method is called when
            the body exits, whether it succeeded or raised.
        seconds, minutes, hours: Forwarded unchanged to ``timeout``.

    Yields:
        A one-element list wrapping the value produced by ``timeout``.

    Raises:
        Whatever the body raises. If the body succeeded but deleting the
        endpoint failed, the deletion error is raised instead.
    """
    with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
        body_succeeded = False
        try:
            yield [t]
            body_succeeded = True
        finally:
            try:
                estimator.delete_endpoint()
                LOGGER.info('deleted endpoint')
            except Exception:
                # A cleanup failure must not hide the real test failure
                # (e.g. deploy failed before the endpoint ever existed).
                # Only surface it when the body itself completed cleanly.
                if body_succeeded:
                    raise
                LOGGER.exception('failed to delete endpoint')
64+
65+
66+
@contextmanager
def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0):
    """Run the ``with`` body under ``timeout`` and always delete a named endpoint.

    Args:
        endpoint_name: Name passed to ``sagemaker_session.delete_endpoint``
            when the body exits, whether it succeeded or raised.
        sagemaker_session: Object providing ``delete_endpoint(name)``.
        seconds, minutes, hours: Forwarded unchanged to ``timeout``.

    Yields:
        A one-element list wrapping the value produced by ``timeout``.

    Raises:
        Whatever the body raises. If the body succeeded but deleting the
        endpoint failed, the deletion error is raised instead.
    """
    with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
        body_succeeded = False
        try:
            yield [t]
            body_succeeded = True
        finally:
            try:
                sagemaker_session.delete_endpoint(endpoint_name)
                LOGGER.info('deleted endpoint {}'.format(endpoint_name))
            except Exception:
                # A cleanup failure must not hide the real test failure
                # (e.g. deploy failed before the endpoint ever existed).
                # Only surface it when the body itself completed cleanly.
                if body_succeeded:
                    raise
                LOGGER.exception('failed to delete endpoint %s', endpoint_name)

tox.ini

+4
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ exclude =
2020
max-complexity = 10
2121

2222
[testenv]
23+
# TEAMCITY_VERSION environment variable exists during build on TeamCity. teamcity-messages uses it in order to enable
24+
# reporting to TeamCity.
25+
passenv = TEAMCITY_VERSION
2326
# {posargs} can be passed in by additional arguments specified when invoking tox.
2427
# Can be used to specify which tests to run, e.g.: tox -- -s
2528
commands =
@@ -31,6 +34,7 @@ deps =
3134
tensorflow
3235
mock
3336
contextlib2
37+
teamcity-messages
3438

3539
[testenv:flake8]
3640
basepython = python

0 commit comments

Comments
 (0)