|
15 | 15 | import os
|
16 | 16 |
|
17 | 17 | import pytest
|
| 18 | +import logging |
18 | 19 |
|
19 | 20 | from sagemaker.huggingface import HuggingFace, HuggingFaceProcessor
|
20 | 21 | from sagemaker.huggingface.model import HuggingFaceModel, HuggingFacePredictor
|
21 | 22 | from sagemaker.utils import unique_name_from_base
|
22 | 23 | from tests import integ
|
23 | 24 | from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
|
24 | 25 | from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
|
| 26 | +from sagemaker.exceptions import UnexpectedStatusException |
25 | 27 |
|
26 | 28 | ROLE = "SageMakerRole"
|
27 | 29 |
|
|
34 | 36 | )
|
def test_framework_processing_job_with_deps(
    sagemaker_session,
    gpu_instance_type_list,
    huggingface_training_latest_version,
    huggingface_training_pytorch_latest_version,
    huggingface_pytorch_latest_training_py_version,
):
    """Run a HuggingFace processing job, falling back across GPU instance types.

    Each instance type in ``gpu_instance_type_list`` is tried in order. If the
    job fails with an ``UnexpectedStatusException`` whose message contains
    ``"CapacityError"`` and further candidates remain, the failure is logged
    and the next instance type is tried. The loop stops after the first
    ``processor.run`` call that returns without raising.

    Args:
        sagemaker_session: Session passed through to ``HuggingFaceProcessor``.
        gpu_instance_type_list: Candidate instance types, tried in order.
        huggingface_training_latest_version: Passed as ``transformers_version``.
        huggingface_training_pytorch_latest_version: Passed as ``pytorch_version``.
        huggingface_pytorch_latest_training_py_version: Passed as ``py_version``.

    Raises:
        UnexpectedStatusException: Re-raised when the failure is not a capacity
            error, or when the last candidate instance type also fails.
    """
    instance_types = list(gpu_instance_type_list)
    # Without at least one candidate the loop body never runs and the test
    # would pass without launching any job.
    assert instance_types, "gpu_instance_type_list must contain at least one instance type"

    # Loop-invariant inputs: identical for every attempted instance type.
    code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
    entry_point = "main_script.py"
    last_index = len(instance_types) - 1

    for index, i_type in enumerate(instance_types):
        logging.info("Using the instance type: {}".format(i_type))
        with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
            processor = HuggingFaceProcessor(
                transformers_version=huggingface_training_latest_version,
                pytorch_version=huggingface_training_pytorch_latest_version,
                py_version=huggingface_pytorch_latest_training_py_version,
                role=ROLE,
                instance_count=1,
                instance_type=i_type,
                sagemaker_session=sagemaker_session,
                base_job_name="test-huggingface",
            )
            try:
                processor.run(
                    code=entry_point,
                    source_dir=code_path,
                    inputs=[],
                    wait=True,
                )
            except UnexpectedStatusException as e:
                # Compare by position rather than by value, so a duplicate of
                # the final entry appearing earlier in the list still falls
                # back to the remaining candidates.
                if "CapacityError" in str(e) and index != last_index:
                    logging.warning("Failure using instance type: {}. {}".format(i_type, str(e)))
                    continue
                raise
            # The job ran without raising; no further instance types needed.
            break
63 | 74 |
|
64 | 75 |
|
65 | 76 | @pytest.mark.release
|
|
0 commit comments