Skip to content

change: skip p2/p3 tests in eu-central-1 #769

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions tests/integ/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
TRANSFORM_DEFAULT_TIMEOUT_MINUTES = 20
PYTHON_VERSION = 'py' + str(sys.version_info.major)

# 'eu-central-1' has some p2, but no enough for continuous testing
HOSTING_NO_P2_REGIONS = ['ca-central-1', 'eu-west-2', 'us-west-1', 'eu-central-1']
# these regions have some p2 and p3 instances, but not enough for continuous testing
HOSTING_NO_P2_REGIONS = ['ca-central-1', 'eu-central-1', 'eu-west-2', 'us-west-1']
HOSTING_NO_P3_REGIONS = ['ap-southeast-1', 'ap-southeast-2', 'ap-south-1', 'ca-central-1',
'eu-west-2', 'us-west-1']
'eu-central-1', 'eu-west-2', 'us-west-1']

# EI is currently only supported in the following regions
# regions were derived from https://aws.amazon.com/machine-learning/elastic-inference/pricing/
EI_SUPPORTED_REGIONS = ['us-east-1', 'us-east-2', 'us-west-2', 'eu-west-1', 'ap-northeast-1', 'ap-northeast-2']
Expand Down
63 changes: 36 additions & 27 deletions tests/integ/test_tf_script_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@
from sagemaker.tensorflow import TensorFlow
from six.moves.urllib.parse import urlparse
from sagemaker.utils import unique_name_from_base
import tests.integ as integ
from tests.integ import kms_utils
import tests.integ.timeout as timeout

import tests.integ
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. Another way I like to do this is:

from tests import integ

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That works too. I'm long-winded, so I generally prefer fully qualified names.


ROLE = 'SageMakerRole'

Expand All @@ -35,14 +34,18 @@
TAGS = [{'Key': 'some-key', 'Value': 'some-value'}]


@pytest.fixture(scope='session', params=['ml.c5.xlarge', 'ml.p2.xlarge'])
@pytest.fixture(scope='session', params=[
'ml.c5.xlarge',
pytest.param('ml.p2.xlarge',
marks=pytest.mark.skipif(
tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS,
reason='no ml.p2 instances in this region'))])
def instance_type(request):
return request.param


@pytest.mark.skipif(integ.test_region() in integ.HOSTING_NO_P2_REGIONS,
reason='no ml.p2 instances in these regions')
@pytest.mark.skipif(integ.PYTHON_VERSION != 'py3', reason="Script Mode tests are only configured to run with Python 3")
@pytest.mark.skipif(tests.integ.PYTHON_VERSION != 'py3',
reason="Script Mode tests are only configured to run with Python 3")
def test_mnist(sagemaker_session, instance_type):
estimator = TensorFlow(entry_point=SCRIPT,
role='SageMakerRole',
Expand All @@ -51,26 +54,26 @@ def test_mnist(sagemaker_session, instance_type):
sagemaker_session=sagemaker_session,
py_version='py3',
framework_version=TensorFlow.LATEST_VERSION,
metric_definitions=[{'Name': 'train:global_steps', 'Regex': r'global_step\/sec:\s(.*)'}])
metric_definitions=[
{'Name': 'train:global_steps', 'Regex': r'global_step\/sec:\s(.*)'}])
inputs = estimator.sagemaker_session.upload_data(
path=os.path.join(RESOURCE_PATH, 'data'),
key_prefix='scriptmode/mnist')

with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
estimator.fit(inputs=inputs, job_name=unique_name_from_base('test-tf-sm-mnist'))
_assert_s3_files_exist(estimator.model_dir,
['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'])
df = estimator.training_job_analytics.dataframe()
print(df)
assert df.size > 0


def test_server_side_encryption(sagemaker_session):

boto_session = sagemaker_session.boto_session
with kms_utils.bucket_with_encryption(boto_session, ROLE) as (bucket_with_kms, kms_key):

output_path = os.path.join(bucket_with_kms, 'test-server-side-encryption', time.strftime('%y%m%d-%H%M'))
with tests.integ.kms_utils.bucket_with_encryption(boto_session, ROLE) as (
bucket_with_kms, kms_key):
output_path = os.path.join(bucket_with_kms, 'test-server-side-encryption',
time.strftime('%y%m%d-%H%M'))

estimator = TensorFlow(entry_point=SCRIPT,
role=ROLE,
Expand All @@ -88,28 +91,29 @@ def test_server_side_encryption(sagemaker_session):
path=os.path.join(RESOURCE_PATH, 'data'),
key_prefix='scriptmode/mnist')

with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
estimator.fit(inputs=inputs, job_name=unique_name_from_base('test-server-side-encryption'))
with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
estimator.fit(inputs=inputs,
job_name=unique_name_from_base('test-server-side-encryption'))


@pytest.mark.canary_quick
@pytest.mark.skipif(integ.PYTHON_VERSION != 'py3', reason="Script Mode tests are only configured to run with Python 3")
@pytest.mark.skipif(tests.integ.PYTHON_VERSION != 'py3',
reason="Script Mode tests are only configured to run with Python 3")
def test_mnist_distributed(sagemaker_session, instance_type):
estimator = TensorFlow(entry_point=SCRIPT,
role=ROLE,
train_instance_count=2,
# TODO: change train_instance_type to instance_type once the test is passing consistently
train_instance_type='ml.c5.xlarge',
train_instance_type=instance_type,
sagemaker_session=sagemaker_session,
py_version=integ.PYTHON_VERSION,
py_version=tests.integ.PYTHON_VERSION,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
distributions=PARAMETER_SERVER_DISTRIBUTION)
inputs = estimator.sagemaker_session.upload_data(
path=os.path.join(RESOURCE_PATH, 'data'),
key_prefix='scriptmode/distributed_mnist')

with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
estimator.fit(inputs=inputs, job_name=unique_name_from_base('test-tf-sm-distributed'))
_assert_s3_files_exist(estimator.model_dir,
['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'])
Expand All @@ -131,22 +135,26 @@ def test_mnist_async(sagemaker_session):
training_job_name = estimator.latest_training_job.name
time.sleep(20)
endpoint_name = training_job_name
_assert_training_job_tags_match(sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS)
with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
_assert_training_job_tags_match(sagemaker_session.sagemaker_client,
estimator.latest_training_job.name, TAGS)
with tests.integ.timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
estimator = TensorFlow.attach(training_job_name=training_job_name,
sagemaker_session=sagemaker_session)
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
endpoint_name=endpoint_name)

result = predictor.predict(np.zeros(784))
print('predict result: {}'.format(result))
_assert_endpoint_tags_match(sagemaker_session.sagemaker_client, predictor.endpoint, TAGS)
_assert_model_tags_match(sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS)
_assert_model_tags_match(sagemaker_session.sagemaker_client,
estimator.latest_training_job.name, TAGS)


def _assert_s3_files_exist(s3_url, files):
parsed_url = urlparse(s3_url)
s3 = boto3.client('s3')
contents = s3.list_objects_v2(Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip('/'))["Contents"]
contents = s3.list_objects_v2(Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip('/'))[
"Contents"]
for f in files:
found = [x['Key'] for x in contents if x['Key'].endswith(f)]
if not found:
Expand All @@ -169,5 +177,6 @@ def _assert_endpoint_tags_match(sagemaker_client, endpoint_name, tags):


def _assert_training_job_tags_match(sagemaker_client, training_job_name, tags):
training_job_description = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)
training_job_description = sagemaker_client.describe_training_job(
TrainingJobName=training_job_name)
_assert_tags_match(sagemaker_client, training_job_description['TrainingJobArn'], tags)
8 changes: 6 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -60,21 +60,25 @@ commands =
deps = .[test]

[testenv:flake8]
basepython = python
basepython = python3
skipdist = true
skip_install = true
deps =
flake8
flake8-future-import
commands = flake8

[testenv:pylint]
basepython = python3
skipdist = true
skip_install = true
deps =
pylint==2.3.1
commands =
python -m pylint --rcfile=.pylintrc -j 0 src/sagemaker

[testenv:twine]
basepython = python
basepython = python3
# twine check was added starting in 1.12.0
# https://github.com/pypa/twine/blob/master/docs/changelog.rst
deps =
Expand Down