diff --git a/.readthedocs.yml b/.readthedocs.yaml similarity index 88% rename from .readthedocs.yml rename to .readthedocs.yaml index ceac6c46a4..0a6e3928b5 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yaml @@ -4,13 +4,19 @@ version: 2 +build: + os: ubuntu-20.04 + tools: + python: "3.9" + + python: - version: 3.9 install: - method: pip path: . - requirements: doc/requirements.txt + sphinx: configuration: doc/conf.py fail_on_warning: true # http://www.sphinx-doc.org/en/master/man/sphinx-build.html#id6 diff --git a/tests/conftest.py b/tests/conftest.py index b43e3fc3ec..1b6daa9b01 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -405,6 +405,15 @@ def gpu_instance_type(sagemaker_session, request): return "ml.p3.2xlarge" +@pytest.fixture(scope="session") +def gpu_instance_type_list(sagemaker_session, request): + region = sagemaker_session.boto_session.region_name + if region in NO_P3_REGIONS: + return ["ml.p2.xlarge"] + else: + return ["ml.p3.2xlarge", "ml.p2.xlarge"] + + @pytest.fixture(scope="session") def inf_instance_type(sagemaker_session, request): return "ml.inf1.xlarge" diff --git a/tests/integ/test_huggingface.py b/tests/integ/test_huggingface.py index f93e70a472..52d5da4fbf 100644 --- a/tests/integ/test_huggingface.py +++ b/tests/integ/test_huggingface.py @@ -15,6 +15,7 @@ import os import pytest +import logging from sagemaker.huggingface import HuggingFace, HuggingFaceProcessor from sagemaker.huggingface.model import HuggingFaceModel, HuggingFacePredictor @@ -22,6 +23,7 @@ from tests import integ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name +from sagemaker.exceptions import UnexpectedStatusException ROLE = "SageMakerRole" @@ -34,32 +36,41 @@ ) def test_framework_processing_job_with_deps( sagemaker_session, - gpu_instance_type, + gpu_instance_type_list, huggingface_training_latest_version, huggingface_training_pytorch_latest_version, huggingface_pytorch_latest_training_py_version, ): - with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): - code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs") - entry_point = "main_script.py" - - processor = HuggingFaceProcessor( - transformers_version=huggingface_training_latest_version, - pytorch_version=huggingface_training_pytorch_latest_version, - py_version=huggingface_pytorch_latest_training_py_version, - role=ROLE, - instance_count=1, - instance_type=gpu_instance_type, - sagemaker_session=sagemaker_session, - base_job_name="test-huggingface", - ) - - processor.run( - code=entry_point, - source_dir=code_path, - inputs=[], - wait=True, - ) + for i_type in gpu_instance_type_list: + logging.info("Using the instance type: {}".format(i_type)) + with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): + code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs") + entry_point = "main_script.py" + + processor = HuggingFaceProcessor( + transformers_version=huggingface_training_latest_version, + pytorch_version=huggingface_training_pytorch_latest_version, + py_version=huggingface_pytorch_latest_training_py_version, + role=ROLE, + instance_count=1, + instance_type=i_type, + sagemaker_session=sagemaker_session, + base_job_name="test-huggingface", + ) + try: + processor.run( + code=entry_point, + source_dir=code_path, + inputs=[], + wait=True, + ) + except UnexpectedStatusException as e: + if "CapacityError" in str(e) and i_type != gpu_instance_type_list[-1]: + logging.warning("Failure using instance type: {}. {}".format(i_type, str(e))) + continue + else: + raise + break @pytest.mark.release diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py index 25e463161c..ffb16949c8 100644 --- a/tests/integ/test_tf.py +++ b/tests/integ/test_tf.py @@ -15,6 +15,7 @@ import numpy as np import os import time +import logging import pytest @@ -25,6 +26,8 @@ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, kms_utils, timeout from tests.integ.retry import retries from tests.integ.s3_utils import assert_s3_file_patterns_exist +from sagemaker.exceptions import UnexpectedStatusException + ROLE = "SageMakerRole" @@ -42,30 +45,39 @@ @pytest.mark.release def test_framework_processing_job_with_deps( sagemaker_session, - instance_type, + gpu_instance_type_list, tensorflow_training_latest_version, tensorflow_training_latest_py_version, ): - with timeout.timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): - code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs") - entry_point = "main_script.py" - - processor = TensorFlowProcessor( - framework_version=tensorflow_training_latest_version, - py_version=tensorflow_training_latest_py_version, - role=ROLE, - instance_count=1, - instance_type=instance_type, - sagemaker_session=sagemaker_session, - base_job_name="test-tensorflow", - ) - - processor.run( - code=entry_point, - source_dir=code_path, - inputs=[], - wait=True, - ) + for i_type in gpu_instance_type_list: + logging.info("Using the instance type: {}".format(i_type)) + with timeout.timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): + code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs") + entry_point = "main_script.py" + + processor = TensorFlowProcessor( + framework_version=tensorflow_training_latest_version, + py_version=tensorflow_training_latest_py_version, + role=ROLE, + instance_count=1, + instance_type=i_type, + sagemaker_session=sagemaker_session, + base_job_name="test-tensorflow", + ) + try: + processor.run( + code=entry_point, + source_dir=code_path, + inputs=[], + wait=True, + ) + except UnexpectedStatusException as e: + if "CapacityError" in str(e) and i_type != gpu_instance_type_list[-1]: + logging.warning("Failure using instance type: {}. {}".format(i_type, str(e))) + continue + else: + raise + break def test_mnist_with_checkpoint_config(