diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 4cb03c466e..62b702fc73 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -100,6 +100,7 @@ class EstimatorBase(with_metaclass(ABCMeta, object)): # pylint: disable=too-man instance. """ + LAUNCH_PT_XLA_ENV_NAME = "sagemaker_pytorch_xla_multi_worker_enabled" LAUNCH_PS_ENV_NAME = "sagemaker_parameter_server_enabled" LAUNCH_MPI_ENV_NAME = "sagemaker_mpi_enabled" LAUNCH_SM_DDP_ENV_NAME = "sagemaker_distributed_dataparallel_enabled" @@ -3316,6 +3317,10 @@ def _distribution_configuration(self, distribution): "instance_groups" ] + if "pytorchxla" in distribution: + pt_xla_enabled = distribution.get("pytorchxla").get("enabled", False) + distribution_config[self.LAUNCH_PT_XLA_ENV_NAME] = pt_xla_enabled + if "parameter_server" in distribution: ps_enabled = distribution.get("parameter_server").get("enabled", False) distribution_config[self.LAUNCH_PS_ENV_NAME] = ps_enabled diff --git a/src/sagemaker/huggingface/estimator.py b/src/sagemaker/huggingface/estimator.py index 628c14dc8e..ad756da3b0 100644 --- a/src/sagemaker/huggingface/estimator.py +++ b/src/sagemaker/huggingface/estimator.py @@ -141,6 +141,28 @@ def __init__( } } } + + To enable distributed training with + `SageMaker Training Compiler <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_ + for Hugging Face Transformers with PyTorch: + + .. code:: python + + { + "pytorchxla": { + "enabled": True + } + } + + To learn more, see `SageMaker Training Compiler + <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_ + in the *Amazon SageMaker Developer Guide*. + + .. note:: + + When you use this PyTorch XLA option as the distributed training strategy, + you must add the ``compiler_config`` parameter and activate SageMaker + Training Compiler. compiler_config (:class:`~sagemaker.huggingface.TrainingCompilerConfig`): Configures SageMaker Training Compiler to accelerate training.
@@ -204,6 +226,13 @@ def __init__( raise ValueError(error_string) if compiler_config: compiler_config.validate(self) + elif distribution is not None and "pytorchxla" in distribution: + raise ValueError( + "Distributed training through PyTorch XLA is currently only supported " + "when SageMaker Training Compiler is enabled. To learn more, " + "see Enable SageMaker Training Compiler at " + "https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler-enable.html." + ) self.compiler_config = compiler_config def _validate_args(self, image_uri): diff --git a/src/sagemaker/huggingface/training_compiler/config.py b/src/sagemaker/huggingface/training_compiler/config.py index b19fb2be2b..9f0288115a 100644 --- a/src/sagemaker/huggingface/training_compiler/config.py +++ b/src/sagemaker/huggingface/training_compiler/config.py @@ -14,6 +14,8 @@ from __future__ import absolute_import import logging from typing import Union +from packaging.specifiers import SpecifierSet +from packaging.version import Version from sagemaker.training_compiler.config import TrainingCompilerConfig as BaseConfig from sagemaker.workflow.entities import PipelineVariable @@ -24,7 +26,14 @@ class TrainingCompilerConfig(BaseConfig): """The SageMaker Training Compiler configuration class.""" - SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4"] + SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4d", "g5"] + SUPPORTED_INSTANCE_TYPES_WITH_EFA = [ + "ml.g4dn.8xlarge", + "ml.g4dn.12xlarge", + "ml.g5.48xlarge", + "ml.p3dn.24xlarge", + "ml.p4d.24xlarge", + ] def __init__( self, @@ -85,7 +94,7 @@ def validate( """Checks if SageMaker Training Compiler is configured correctly. Args: - estimator (str): A estimator object + estimator (:class:`sagemaker.huggingface.HuggingFace`): An estimator object. If SageMaker Training Compiler is enabled, it will validate whether the estimator is configured to be compatible with Training Compiler. 
@@ -105,3 +114,46 @@ def validate( "transformer_version, tensorflow_version or pytorch_version, and compiler_config." ) raise ValueError(error_helper_string) + + if estimator.distribution: + pt_xla_present = "pytorchxla" in estimator.distribution + pt_xla_enabled = estimator.distribution.get("pytorchxla", {}).get("enabled", False) + if pt_xla_enabled: + if estimator.tensorflow_version: + error_helper_string = ( + "Distribution mechanism 'pytorchxla' is currently only supported for " + "PyTorch >= 1.11 when SageMaker Training Compiler is enabled. Received " + "tensorflow_version={} which is unsupported." + ) + raise ValueError(error_helper_string.format(estimator.tensorflow_version)) + if estimator.pytorch_version: + if Version(estimator.pytorch_version) in SpecifierSet("< 1.11"): + error_helper_string = ( + "Distribution mechanism 'pytorchxla' is currently only supported for " + "PyTorch >= 1.11 when SageMaker Training Compiler is enabled." + " Received pytorch_version={} which is unsupported." + ) + raise ValueError(error_helper_string.format(estimator.pytorch_version)) + if estimator.instance_type not in cls.SUPPORTED_INSTANCE_TYPES_WITH_EFA: + logger.warning( + "Consider using instances with EFA support when " + "training with PyTorch >= 1.11 and SageMaker Training Compiler " + "enabled. SageMaker Training Compiler leverages EFA to provide better " + "performance for distributed training." + ) + if not pt_xla_present: + if estimator.pytorch_version: + if Version(estimator.pytorch_version) in SpecifierSet(">= 1.11"): + error_helper_string = ( + "'pytorchxla' is the only distribution mechanism currently supported " + "for PyTorch >= 1.11 when SageMaker Training Compiler is enabled." + " Received distribution={} which is unsupported." 
+ ) + raise ValueError(error_helper_string.format(estimator.distribution)) + elif estimator.instance_count and estimator.instance_count > 1: + if estimator.pytorch_version: + if Version(estimator.pytorch_version) in SpecifierSet(">= 1.11"): + logger.warning( + "Consider setting 'distribution' to 'pytorchxla' for distributed " + "training with PyTorch >= 1.11 and SageMaker Training Compiler enabled." + ) diff --git a/src/sagemaker/image_uri_config/huggingface-training-compiler.json b/src/sagemaker/image_uri_config/huggingface-training-compiler.json index 1b4c6e3e71..e771e2a548 100644 --- a/src/sagemaker/image_uri_config/huggingface-training-compiler.json +++ b/src/sagemaker/image_uri_config/huggingface-training-compiler.json @@ -3,7 +3,8 @@ "processors": ["gpu"], "version_aliases": { "4.11": "4.11.0", - "4.17": "4.17.0" + "4.17": "4.17.0", + "4.21": "4.21.1" }, "versions": { "4.11.0": { @@ -97,6 +98,40 @@ "repository": "huggingface-tensorflow-trcomp-training", "container_version": {"gpu":"cu112-ubuntu20.04"} } + }, + "4.21.1": { + "version_aliases": { + "pytorch1.11": "pytorch1.11.0" + }, + "pytorch1.11.0": { + "py_versions": ["py38"], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ca-central-1": "763104351884", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-south-1": "692866216735", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "me-south-1": "217643126080", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "repository": "huggingface-pytorch-trcomp-training", + "container_version": 
{"gpu":"cu113-ubuntu20.04"} + } } } } diff --git a/src/sagemaker/training_compiler/config.py b/src/sagemaker/training_compiler/config.py index cd1fbd5957..85bd983ae8 100644 --- a/src/sagemaker/training_compiler/config.py +++ b/src/sagemaker/training_compiler/config.py @@ -21,7 +21,7 @@ class TrainingCompilerConfig(object): """The SageMaker Training Compiler configuration class.""" DEBUG_PATH = "/opt/ml/output/data/compiler/" - SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4"] + SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4d", "g5"] HP_ENABLE_COMPILER = "sagemaker_training_compiler_enabled" HP_ENABLE_DEBUG = "sagemaker_training_compiler_debug_mode" @@ -123,7 +123,7 @@ def validate( """Checks if SageMaker Training Compiler is configured correctly. Args: - estimator (str): A estimator object + estimator (:class:`sagemaker.estimator.Estimator`): An estimator object. When SageMaker Training Compiler is enabled, it validates if the estimator is configured to be compatible with Training Compiler. @@ -132,31 +132,34 @@ def validate( ValueError: Raised if the requested configuration is not compatible with SageMaker Training Compiler. """ - - if "local" not in estimator.instance_type: - requested_instance_class = estimator.instance_type.split(".")[ - 1 - ] # Expecting ml.class.size - if not any( - [ - requested_instance_class.startswith(i) - for i in cls.SUPPORTED_INSTANCE_CLASS_PREFIXES - ] - ): + if estimator.instance_type: + if "local" not in estimator.instance_type: + requested_instance_class = estimator.instance_type.split(".")[ + 1 + ] # Expecting ml.class.size + if not any( + [ + requested_instance_class.startswith(i) + for i in cls.SUPPORTED_INSTANCE_CLASS_PREFIXES + ] + ): + error_helper_string = ( + "Unsupported Instance class {}." 
+ " SageMaker Training Compiler only supports {}" + ) + error_helper_string = error_helper_string.format( + requested_instance_class, cls.SUPPORTED_INSTANCE_CLASS_PREFIXES + ) + raise ValueError(error_helper_string) + elif estimator.instance_type == "local": error_helper_string = ( - "Unsupported Instance class {}. SageMaker Training Compiler only supports {}" + "SageMaker Training Compiler doesn't support local mode." + " It only supports the following GPU instances: {}" ) error_helper_string = error_helper_string.format( - requested_instance_class, cls.SUPPORTED_INSTANCE_CLASS_PREFIXES + cls.SUPPORTED_INSTANCE_CLASS_PREFIXES ) raise ValueError(error_helper_string) - elif estimator.instance_type == "local": - error_helper_string = ( - "The local mode is not supported by SageMaker Training Compiler." - "It only supports the following GPU instances: {}" - ) - error_helper_string = error_helper_string.format(cls.SUPPORTED_INSTANCE_CLASS_PREFIXES) - raise ValueError(error_helper_string) if estimator.distribution and "smdistributed" in estimator.distribution: raise ValueError( @@ -180,3 +183,12 @@ def validate( estimator.debugger_hook_config, estimator.disable_profiler ) logger.warning(helper_string) + + if estimator.instance_groups: + raise ValueError( + "SageMaker Training Compiler currently only supports homogeneous clusters of " + "the following GPU instance families: {}.
Please use the 'instance_type' " + "and 'instance_count' parameters instead of 'instance_groups'".format( + cls.SUPPORTED_INSTANCE_CLASS_PREFIXES + ) + ) diff --git a/tests/conftest.py b/tests/conftest.py index 011937f027..59397ec9af 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -252,20 +252,34 @@ def huggingface_pytorch_training_py_version(huggingface_pytorch_training_version @pytest.fixture(scope="module") def huggingface_training_compiler_pytorch_version(huggingface_training_compiler_version): - return _huggingface_base_fm_version( + versions = _huggingface_base_fm_version( huggingface_training_compiler_version, "pytorch", "huggingface_training_compiler" - )[0] + ) + if not versions: + pytest.skip( + f"Hugging Face Training Compiler version {huggingface_training_compiler_version} does " + f"not have a PyTorch release." + ) + return versions[0] @pytest.fixture(scope="module") def huggingface_training_compiler_tensorflow_version(huggingface_training_compiler_version): - return _huggingface_base_fm_version( + versions = _huggingface_base_fm_version( huggingface_training_compiler_version, "tensorflow", "huggingface_training_compiler" - )[0] + ) + if not versions: + pytest.skip( + f"Hugging Face Training Compiler version {huggingface_training_compiler_version} " + f"does not have a TensorFlow release." 
+ ) + return versions[0] @pytest.fixture(scope="module") -def huggingface_training_compiler_py_version(huggingface_training_compiler_tensorflow_version): +def huggingface_training_compiler_tensorflow_py_version( + huggingface_training_compiler_tensorflow_version, +): return ( "py37" if Version(huggingface_training_compiler_tensorflow_version) < Version("2.6") @@ -273,6 +287,11 @@ def huggingface_training_compiler_py_version(huggingface_training_compiler_tenso ) +@pytest.fixture(scope="module") +def huggingface_training_compiler_pytorch_py_version(huggingface_training_compiler_pytorch_version): + return "py38" + + @pytest.fixture(scope="module") def huggingface_pytorch_latest_training_py_version(huggingface_training_pytorch_latest_version): return ( diff --git a/tests/integ/test_training_compiler.py b/tests/integ/test_training_compiler.py index f76894b6a6..3d5fad9ea9 100644 --- a/tests/integ/test_training_compiler.py +++ b/tests/integ/test_training_compiler.py @@ -26,11 +26,14 @@ from tests.integ.timeout import timeout -@pytest.fixture(scope="module") def gpu_instance_type(request): return "ml.p3.2xlarge" +def instance_count(request): + return 1 + + @pytest.fixture(scope="module") def imagenet_val_set(request, sagemaker_session, tmpdir_factory): """ @@ -62,21 +65,32 @@ def huggingface_dummy_dataset(request, sagemaker_session): return train_input -@pytest.fixture(scope="module", autouse=True) -def skip_if_incompatible(request): +@pytest.fixture(autouse=True) +def skip_if_incompatible(gpu_instance_type, request): """ These tests are for training compiler enabled images/estimators only. 
""" - if integ.test_region() not in integ.TRAINING_COMPILER_SUPPORTED_REGIONS: + region = integ.test_region() + if region not in integ.TRAINING_COMPILER_SUPPORTED_REGIONS: pytest.skip("SageMaker Training Compiler is not supported in this region") - if integ.test_region() in integ.TRAINING_NO_P3_REGIONS: + if gpu_instance_type == "ml.p3.16xlarge" and region not in integ.DATA_PARALLEL_TESTING_REGIONS: + pytest.skip("Data parallel testing is not allowed in this region") + if gpu_instance_type == "ml.p3.2xlarge" and region in integ.TRAINING_NO_P3_REGIONS: pytest.skip("no ml.p3 instances in this region") @pytest.mark.release +@pytest.mark.parametrize( + "gpu_instance_type,instance_count", + [ + ("ml.p3.2xlarge", 1), + ("ml.p3.16xlarge", 2), + ], +) def test_huggingface_pytorch( sagemaker_session, gpu_instance_type, + instance_count, huggingface_training_compiler_latest_version, huggingface_training_compiler_pytorch_latest_version, huggingface_dummy_dataset, @@ -93,7 +107,7 @@ def test_huggingface_pytorch( role="SageMakerRole", transformers_version=huggingface_training_compiler_latest_version, pytorch_version=huggingface_training_compiler_pytorch_latest_version, - instance_count=1, + instance_count=instance_count, instance_type=gpu_instance_type, hyperparameters={ "model_name_or_path": "distilbert-base-cased", @@ -105,10 +119,10 @@ def test_huggingface_pytorch( "per_device_train_batch_size": 128, "output_dir": "/opt/ml/model", }, - environment={"GPU_NUM_DEVICES": "1"}, sagemaker_session=sagemaker_session, disable_profiler=True, compiler_config=HFTrainingCompilerConfig(), + distribution={"pytorchxla": {"enabled": True}} if instance_count > 1 else None, ) hf.fit(huggingface_dummy_dataset) diff --git a/tests/unit/sagemaker/training_compiler/__init__.py b/tests/unit/sagemaker/training_compiler/__init__.py index 751253995d..335671fb69 100644 --- a/tests/unit/sagemaker/training_compiler/__init__.py +++ b/tests/unit/sagemaker/training_compiler/__init__.py @@ -12,4 +12,4 @@ # 
language governing permissions and limitations under the License. from __future__ import absolute_import -EC2_GPU_INSTANCE_CLASSES = {"p2", "g4dn", "g4ad", "p3", "p3dn", "p4dn"} +EC2_GPU_INSTANCE_CLASSES = {"p2", "g4dn", "g4ad", "p3", "p3dn", "p4d", "g5"} diff --git a/tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py b/tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py index 9bcf0559c5..af46cf4360 100644 --- a/tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py +++ b/tests/unit/sagemaker/training_compiler/test_huggingface_pytorch_compiler.py @@ -18,10 +18,12 @@ import pytest from mock import MagicMock, Mock, patch, ANY +from packaging.version import Version from sagemaker import image_uris from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig from sagemaker.huggingface.model import HuggingFaceModel +from sagemaker.instance_group import InstanceGroup from tests.unit.sagemaker.training_compiler import EC2_GPU_INSTANCE_CLASSES @@ -41,7 +43,7 @@ ROLE = "Dummy" REGION = "us-east-1" GPU = "ml.p3.2xlarge" -SUPPORTED_GPU_INSTANCE_CLASSES = {"p3", "p3dn", "g4dn", "p4dn"} +SUPPORTED_GPU_INSTANCE_CLASSES = {"p3", "p3dn", "g4dn", "p4d", "g5"} UNSUPPORTED_GPU_INSTANCE_CLASSES = EC2_GPU_INSTANCE_CLASSES - SUPPORTED_GPU_INSTANCE_CLASSES LIST_TAGS_RESULT = {"Tags": [{"Key": "TagtestKey", "Value": "TagtestValue"}]} @@ -95,7 +97,9 @@ def _get_full_gpu_image_uri( ) -def _create_train_job(version, base_framework_version, instance_type, training_compiler_config): +def _create_train_job( + version, base_framework_version, instance_type, training_compiler_config, instance_count=1 +): return { "image_uri": _get_full_gpu_image_uri( version, base_framework_version, instance_type, training_compiler_config @@ -117,7 +121,7 @@ def _create_train_job(version, base_framework_version, instance_type, training_c "output_config": {"S3OutputPath": "s3://{}/".format(BUCKET_NAME)}, "resource_config": { 
"InstanceType": instance_type, - "InstanceCount": 1, + "InstanceCount": instance_count, "VolumeSizeInGB": 30, }, "hyperparameters": { @@ -271,6 +275,141 @@ def test_unsupported_python_2( ).fit() +def test_unsupported_instance_group( + huggingface_training_compiler_version, + huggingface_training_compiler_pytorch_version, +): + if Version(huggingface_training_compiler_pytorch_version) < Version("1.11"): + pytest.skip("This test is intended for PyTorch 1.11 and above") + with pytest.raises(ValueError): + HuggingFace( + py_version="py38", + entry_point=SCRIPT_PATH, + role=ROLE, + instance_groups=[ + InstanceGroup("ml.p3dn.24xlarge", "ml.p3dn.24xlarge", 16), + InstanceGroup("ml.p4d.24xlarge", "ml.p4d.24xlarge", 16), + ], + transformers_version=huggingface_training_compiler_version, + pytorch_version=huggingface_training_compiler_pytorch_version, + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(), + ).fit() + + +def test_unsupported_distribution( + huggingface_training_compiler_version, + huggingface_training_compiler_pytorch_version, +): + if Version(huggingface_training_compiler_pytorch_version) < Version("1.11"): + pytest.skip("This test is intended for PyTorch 1.11 and above") + with pytest.raises(ValueError): + HuggingFace( + py_version="py38", + entry_point=SCRIPT_PATH, + role=ROLE, + instance_count=2, + instance_type=INSTANCE_TYPE, + transformers_version=huggingface_training_compiler_version, + pytorch_version=huggingface_training_compiler_pytorch_version, + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(), + distribution={"smdistributed": {"dataparallel": {"enabled": True}}}, + ).fit() + + with pytest.raises(ValueError): + HuggingFace( + py_version="py38", + entry_point=SCRIPT_PATH, + role=ROLE, + instance_count=2, + instance_type=INSTANCE_TYPE, + transformers_version="4.17", + pytorch_version="1.10", + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(), + distribution={"pytorchxla": 
{"enabled": True}}, + ).fit() + + with pytest.raises(ValueError): + HuggingFace( + py_version="py38", + entry_point=SCRIPT_PATH, + role=ROLE, + instance_count=2, + instance_type=INSTANCE_TYPE, + transformers_version=huggingface_training_compiler_version, + pytorch_version=huggingface_training_compiler_pytorch_version, + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(), + distribution={"mpi": {"enabled": True}}, + ).fit() + + +@patch("sagemaker.utils.repack_model", MagicMock()) +@patch("sagemaker.utils.create_tar_file", MagicMock()) +@patch("sagemaker.estimator.name_from_base", return_value=JOB_NAME) +@patch("time.time", return_value=TIME) +@pytest.mark.parametrize("instance_class", SUPPORTED_GPU_INSTANCE_CLASSES) +def test_pytorchxla_distribution( + time, + name_from_base, + sagemaker_session, + huggingface_training_compiler_version, + huggingface_training_compiler_pytorch_version, + instance_class, +): + if Version(huggingface_training_compiler_pytorch_version) < Version("1.11"): + pytest.skip("This test is intended for PyTorch 1.11 and above") + compiler_config = TrainingCompilerConfig() + instance_type = f"ml.{instance_class}.xlarge" + + hf = HuggingFace( + py_version="py38", + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_count=2, + instance_type=instance_type, + transformers_version=huggingface_training_compiler_version, + pytorch_version=huggingface_training_compiler_pytorch_version, + enable_sagemaker_metrics=False, + compiler_config=compiler_config, + distribution={"pytorchxla": {"enabled": True}}, + ) + + inputs = "s3://mybucket/train" + + hf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG) + + sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls] + assert sagemaker_call_names == ["train", "logs_for_job"] + boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls] + assert boto_call_names == ["resource"] + + expected_train_args =
_create_train_job( + huggingface_training_compiler_version, + f"pytorch{huggingface_training_compiler_pytorch_version}", + instance_type, + compiler_config, + instance_count=2, + ) + expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs + expected_train_args["enable_sagemaker_metrics"] = False + expected_train_args["hyperparameters"][TrainingCompilerConfig.HP_ENABLE_COMPILER] = json.dumps( + True + ) + expected_train_args["hyperparameters"][HuggingFace.LAUNCH_PT_XLA_ENV_NAME] = json.dumps(True) + expected_train_args["hyperparameters"][TrainingCompilerConfig.HP_ENABLE_DEBUG] = json.dumps( + False + ) + + actual_train_args = sagemaker_session.method_calls[0][2] + assert ( + actual_train_args == expected_train_args + ), f"{json.dumps(actual_train_args, indent=2)} != {json.dumps(expected_train_args, indent=2)}" + + @patch("sagemaker.utils.repack_model", MagicMock()) @patch("sagemaker.utils.create_tar_file", MagicMock()) @patch("sagemaker.estimator.name_from_base", return_value=JOB_NAME) @@ -513,7 +652,7 @@ def test_register_hf_pytorch_model_auto_infer_framework( sagemaker_session, huggingface_training_compiler_version, huggingface_training_compiler_pytorch_version, - huggingface_training_compiler_py_version, + huggingface_training_compiler_pytorch_py_version, ): model_package_group_name = "test-hf-tfs-register-model" @@ -528,7 +667,7 @@ def test_register_hf_pytorch_model_auto_infer_framework( role=ROLE, transformers_version=huggingface_training_compiler_version, pytorch_version=huggingface_training_compiler_pytorch_version, - py_version=huggingface_training_compiler_py_version, + py_version=huggingface_training_compiler_pytorch_py_version, sagemaker_session=sagemaker_session, ) diff --git a/tests/unit/sagemaker/training_compiler/test_huggingface_tensorflow_compiler.py b/tests/unit/sagemaker/training_compiler/test_huggingface_tensorflow_compiler.py index 32dc3c5634..5aef9316da 100644 --- 
a/tests/unit/sagemaker/training_compiler/test_huggingface_tensorflow_compiler.py +++ b/tests/unit/sagemaker/training_compiler/test_huggingface_tensorflow_compiler.py @@ -41,7 +41,7 @@ ROLE = "Dummy" REGION = "us-east-1" GPU = "ml.p3.2xlarge" -SUPPORTED_GPU_INSTANCE_CLASSES = {"p3", "p3dn", "g4dn", "p4dn"} +SUPPORTED_GPU_INSTANCE_CLASSES = {"p3", "p3dn", "g4dn", "p4d", "g5"} UNSUPPORTED_GPU_INSTANCE_CLASSES = EC2_GPU_INSTANCE_CLASSES - SUPPORTED_GPU_INSTANCE_CLASSES LIST_TAGS_RESULT = {"Tags": [{"Key": "TagtestKey", "Value": "TagtestValue"}]} @@ -158,18 +158,18 @@ def _create_train_job( def test_unsupported_BYOC( huggingface_training_compiler_version, huggingface_training_compiler_tensorflow_version, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ): byoc = ( f"1.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-trcomp-training:" f"2.6.3-" f"transformers4.17.0-gpu-" - f"{huggingface_training_compiler_py_version}-cu112-ubuntu20.04" + f"{huggingface_training_compiler_tensorflow_py_version}-cu112-ubuntu20.04" ) with pytest.raises(ValueError): HuggingFace( image_uri=byoc, - py_version=huggingface_training_compiler_py_version, + py_version=huggingface_training_compiler_tensorflow_py_version, entry_point=SCRIPT_PATH, role=ROLE, instance_count=INSTANCE_COUNT, @@ -185,11 +185,11 @@ def test_unsupported_cpu_instance( cpu_instance_type, huggingface_training_compiler_version, huggingface_training_compiler_tensorflow_version, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ): with pytest.raises(ValueError): HuggingFace( - py_version=huggingface_training_compiler_py_version, + py_version=huggingface_training_compiler_tensorflow_py_version, entry_point=SCRIPT_PATH, role=ROLE, instance_count=INSTANCE_COUNT, @@ -206,11 +206,11 @@ def test_unsupported_gpu_instance( unsupported_gpu_instance_class, huggingface_training_compiler_version, 
huggingface_training_compiler_tensorflow_version, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ): with pytest.raises(ValueError): HuggingFace( - py_version=huggingface_training_compiler_py_version, + py_version=huggingface_training_compiler_tensorflow_py_version, entry_point=SCRIPT_PATH, role=ROLE, instance_count=INSTANCE_COUNT, @@ -224,11 +224,11 @@ def test_unsupported_gpu_instance( def test_unsupported_framework_version( huggingface_training_compiler_version, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ): with pytest.raises(ValueError): HuggingFace( - py_version=huggingface_training_compiler_py_version, + py_version=huggingface_training_compiler_tensorflow_py_version, entry_point=SCRIPT_PATH, role=ROLE, instance_count=INSTANCE_COUNT, @@ -244,11 +244,11 @@ def test_unsupported_framework_version( def test_unsupported_framework_mxnet( huggingface_training_compiler_version, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ): with pytest.raises(ValueError): HuggingFace( - py_version=huggingface_training_compiler_py_version, + py_version=huggingface_training_compiler_tensorflow_py_version, entry_point=SCRIPT_PATH, role=ROLE, instance_count=INSTANCE_COUNT, @@ -278,6 +278,39 @@ def test_unsupported_python_2( ).fit() +def test_unsupported_distribution( + huggingface_training_compiler_version, + huggingface_training_compiler_tensorflow_version, +): + with pytest.raises(ValueError): + HuggingFace( + py_version="py38", + entry_point=SCRIPT_PATH, + role=ROLE, + instance_count=2, + instance_type=INSTANCE_TYPE, + transformers_version=huggingface_training_compiler_version, + tensorflow_version=huggingface_training_compiler_tensorflow_version, + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(), + distribution={"smdistributed": {"dataparallel": {"enabled": True}}}, + ).fit() + + with 
pytest.raises(ValueError): + HuggingFace( + py_version="py38", + entry_point=SCRIPT_PATH, + role=ROLE, + instance_count=2, + instance_type=INSTANCE_TYPE, + transformers_version=huggingface_training_compiler_version, + tensorflow_version=huggingface_training_compiler_tensorflow_version, + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(), + distribution={"pytorchxla": {"enabled": True}}, + ).fit() + + @patch("sagemaker.utils.repack_model", MagicMock()) @patch("sagemaker.utils.create_tar_file", MagicMock()) @patch("sagemaker.estimator.name_from_base", return_value=JOB_NAME) @@ -290,13 +323,13 @@ def test_default_compiler_config( huggingface_training_compiler_version, huggingface_training_compiler_tensorflow_version, instance_class, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ): compiler_config = TrainingCompilerConfig() instance_type = f"ml.{instance_class}.xlarge" hf = HuggingFace( - py_version=huggingface_training_compiler_py_version, + py_version=huggingface_training_compiler_tensorflow_py_version, entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, @@ -322,7 +355,7 @@ def test_default_compiler_config( f"tensorflow{huggingface_training_compiler_tensorflow_version}", instance_type, compiler_config, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ) expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs expected_train_args["enable_sagemaker_metrics"] = False @@ -349,12 +382,12 @@ def test_debug_compiler_config( sagemaker_session, huggingface_training_compiler_version, huggingface_training_compiler_tensorflow_version, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ): compiler_config = TrainingCompilerConfig(debug=True) hf = HuggingFace( - py_version=huggingface_training_compiler_py_version, + 
py_version=huggingface_training_compiler_tensorflow_py_version, entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, @@ -380,7 +413,7 @@ def test_debug_compiler_config( f"tensorflow{huggingface_training_compiler_tensorflow_version}", INSTANCE_TYPE, compiler_config, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ) expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs expected_train_args["enable_sagemaker_metrics"] = False @@ -407,12 +440,12 @@ def test_disable_compiler_config( sagemaker_session, huggingface_training_compiler_version, huggingface_training_compiler_tensorflow_version, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ): compiler_config = TrainingCompilerConfig(enabled=False) hf = HuggingFace( - py_version=huggingface_training_compiler_py_version, + py_version=huggingface_training_compiler_tensorflow_py_version, entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, @@ -438,7 +471,7 @@ def test_disable_compiler_config( f"tensorflow{huggingface_training_compiler_tensorflow_version}", INSTANCE_TYPE, compiler_config, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ) expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs expected_train_args["enable_sagemaker_metrics"] = False @@ -459,13 +492,16 @@ def test_disable_compiler_config( ["compiler_enabled", "debug_enabled"], [(True, False), (True, True), (False, False)] ) def test_attach( - sagemaker_session, compiler_enabled, debug_enabled, huggingface_training_compiler_py_version + sagemaker_session, + compiler_enabled, + debug_enabled, + huggingface_training_compiler_tensorflow_py_version, ): training_image = ( f"1.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-trcomp-training:" f"2.6.3-" f"transformers4.17.0-gpu-" - 
f"{huggingface_training_compiler_py_version}-cu112-ubuntu20.04" + f"{huggingface_training_compiler_tensorflow_py_version}-cu112-ubuntu20.04" ) returned_job_description = { "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, @@ -499,7 +535,7 @@ def test_attach( estimator = HuggingFace.attach(training_job_name="trcomp", sagemaker_session=sagemaker_session) assert estimator.latest_training_job.job_name == "trcomp" - assert estimator.py_version == huggingface_training_compiler_py_version + assert estimator.py_version == huggingface_training_compiler_tensorflow_py_version assert estimator.framework_version == "4.17.0" assert estimator.tensorflow_version == "2.6.3" assert estimator.role == "arn:aws:iam::366:role/SageMakerRole" @@ -524,7 +560,7 @@ def test_register_hf_tfs_model_auto_infer_framework( sagemaker_session, huggingface_training_compiler_version, huggingface_training_compiler_tensorflow_version, - huggingface_training_compiler_py_version, + huggingface_training_compiler_tensorflow_py_version, ): model_package_group_name = "test-hf-tfs-register-model" @@ -539,7 +575,7 @@ def test_register_hf_tfs_model_auto_infer_framework( role=ROLE, transformers_version=huggingface_training_compiler_version, tensorflow_version=huggingface_training_compiler_tensorflow_version, - py_version=huggingface_training_compiler_py_version, + py_version=huggingface_training_compiler_tensorflow_py_version, sagemaker_session=sagemaker_session, ) diff --git a/tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py b/tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py index 4618786483..7517f3a641 100644 --- a/tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py +++ b/tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py @@ -41,7 +41,7 @@ ROLE = "Dummy" REGION = "us-east-1" GPU = "ml.p3.2xlarge" -SUPPORTED_GPU_INSTANCE_CLASSES = {"p3", "p3dn", "g4dn", "p4dn", "g5"} +SUPPORTED_GPU_INSTANCE_CLASSES = {"p3", 
"p3dn", "g4dn", "p4d", "g5"} UNSUPPORTED_GPU_INSTANCE_CLASSES = EC2_GPU_INSTANCE_CLASSES - SUPPORTED_GPU_INSTANCE_CLASSES LIST_TAGS_RESULT = {"Tags": [{"Key": "TagtestKey", "Value": "TagtestValue"}]}