diff --git a/doc/frameworks/tensorflow/sagemaker.tensorflow.rst b/doc/frameworks/tensorflow/sagemaker.tensorflow.rst index c9187ffa04..c61823fcfc 100644 --- a/doc/frameworks/tensorflow/sagemaker.tensorflow.rst +++ b/doc/frameworks/tensorflow/sagemaker.tensorflow.rst @@ -10,6 +10,14 @@ TensorFlow Estimator :undoc-members: :show-inheritance: +TensorFlow Training Compiler Configuration +------------------------------------------ + +.. autoclass:: sagemaker.tensorflow.TrainingCompilerConfig + :members: + :undoc-members: + :show-inheritance: + TensorFlow Serving Model ------------------------ diff --git a/src/sagemaker/huggingface/__init__.py b/src/sagemaker/huggingface/__init__.py index 1d7ca57d29..8c70f49631 100644 --- a/src/sagemaker/huggingface/__init__.py +++ b/src/sagemaker/huggingface/__init__.py @@ -17,4 +17,4 @@ from sagemaker.huggingface.model import HuggingFaceModel, HuggingFacePredictor # noqa: F401 from sagemaker.huggingface.processing import HuggingFaceProcessor # noqa:F401 -from sagemaker.training_compiler.config import TrainingCompilerConfig # noqa: F401 +from sagemaker.huggingface.training_compiler.config import TrainingCompilerConfig # noqa: F401 diff --git a/src/sagemaker/huggingface/estimator.py b/src/sagemaker/huggingface/estimator.py index 81b24b5aa3..bb43890ce4 100644 --- a/src/sagemaker/huggingface/estimator.py +++ b/src/sagemaker/huggingface/estimator.py @@ -26,7 +26,7 @@ from sagemaker.huggingface.model import HuggingFaceModel from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT -from sagemaker.training_compiler.config import TrainingCompilerConfig +from sagemaker.huggingface.training_compiler.config import TrainingCompilerConfig logger = logging.getLogger("sagemaker") @@ -190,6 +190,8 @@ def __init__( entry_point, source_dir, hyperparameters, image_uri=image_uri, **kwargs ) + self.distribution = distribution or {} + if compiler_config is not None: if not isinstance(compiler_config, TrainingCompilerConfig): error_string = ( @@ -199,13 +201,7 @@ 
def __init__( ) raise ValueError(error_string) if compiler_config: - compiler_config.validate( - image_uri=image_uri, - instance_type=instance_type, - distribution=distribution, - ) - - self.distribution = distribution or {} + compiler_config.validate(self) self.compiler_config = compiler_config def _validate_args(self, image_uri): diff --git a/src/sagemaker/huggingface/training_compiler/__init__.py b/src/sagemaker/huggingface/training_compiler/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/sagemaker/huggingface/training_compiler/config.py b/src/sagemaker/huggingface/training_compiler/config.py new file mode 100644 index 0000000000..07a3bcf9b7 --- /dev/null +++ b/src/sagemaker/huggingface/training_compiler/config.py @@ -0,0 +1,105 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +"""Configuration for the SageMaker Training Compiler.""" +from __future__ import absolute_import +import logging + +from sagemaker.training_compiler.config import TrainingCompilerConfig as BaseConfig + +logger = logging.getLogger(__name__) + + +class TrainingCompilerConfig(BaseConfig): + """The SageMaker Training Compiler configuration class.""" + + SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4"] + + def __init__( + self, + enabled=True, + debug=False, + ): + """This class initializes a ``TrainingCompilerConfig`` instance. 
+ + `Amazon SageMaker Training Compiler + <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_ + is a feature of SageMaker Training + and speeds up training jobs by optimizing model execution graphs. + + You can compile Hugging Face models + by passing the object of this configuration class to the ``compiler_config`` + parameter of the :class:`~sagemaker.huggingface.HuggingFace` + estimator. + + Args: + enabled (bool): Optional. Switch to enable SageMaker Training Compiler. + The default is ``True``. + debug (bool): Optional. Whether to dump detailed logs for debugging. + This comes with a potential performance slowdown. + The default is ``False``. + + **Example**: The following code shows the basic usage of the + :class:`sagemaker.huggingface.TrainingCompilerConfig()` class + to run a HuggingFace training job with the compiler. + + .. code-block:: python + + from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig + + huggingface_estimator=HuggingFace( + ... + compiler_config=TrainingCompilerConfig() + ) + + .. seealso:: + + For more information about how to enable SageMaker Training Compiler + for various training settings such as using TensorFlow-based models, + PyTorch-based models, and distributed training, + see `Enable SageMaker Training Compiler + <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler-enable.html>`_ + in the `Amazon SageMaker Training Compiler developer guide + <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_. + + """ + + super(TrainingCompilerConfig, self).__init__(enabled=enabled, debug=debug) + + @classmethod + def validate( + cls, + estimator, + ): + """Checks if SageMaker Training Compiler is configured correctly. + + Args: + estimator (:class:`~sagemaker.huggingface.HuggingFace`): An estimator object. + If SageMaker Training Compiler is enabled, it will validate whether + the estimator is configured to be compatible with Training Compiler. + + Raises: + ValueError: Raised if the requested configuration is not compatible + with SageMaker Training Compiler.
+ """ + + super(TrainingCompilerConfig, cls).validate(estimator) + + if estimator.image_uri: + error_helper_string = ( + "Overriding the image URI is currently not supported " + "for SageMaker Training Compiler." + "Specify the following parameters to run the Hugging Face training job " + "with SageMaker Training Compiler enabled: " + "transformer_version, tensorflow_version or pytorch_version, and compiler_config." + ) + raise ValueError(error_helper_string) diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index a161557216..c0d616969c 100644 --- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -134,21 +134,18 @@ def retrieve( tolerate_vulnerable_model, tolerate_deprecated_model, ) - if training_compiler_config is None: + + if training_compiler_config and (framework == HUGGING_FACE_FRAMEWORK): + config = _config_for_framework_and_scope( + framework + "-training-compiler", image_scope, accelerator_type + ) + else: _framework = framework if framework == HUGGING_FACE_FRAMEWORK: inference_tool = _get_inference_tool(inference_tool, instance_type) if inference_tool == "neuron": _framework = f"{framework}-{inference_tool}" config = _config_for_framework_and_scope(_framework, image_scope, accelerator_type) - elif framework == HUGGING_FACE_FRAMEWORK: - config = _config_for_framework_and_scope( - framework + "-training-compiler", image_scope, accelerator_type - ) - else: - raise ValueError( - "Unsupported Configuration: Training Compiler is only supported with HuggingFace" - ) original_version = version version = _validate_version_and_set_if_needed(version, config, framework) diff --git a/src/sagemaker/tensorflow/__init__.py b/src/sagemaker/tensorflow/__init__.py index e2e8ebee8b..f7fee4a50c 100644 --- a/src/sagemaker/tensorflow/__init__.py +++ b/src/sagemaker/tensorflow/__init__.py @@ -16,3 +16,5 @@ from sagemaker.tensorflow.estimator import TensorFlow # noqa: F401 (imported but unused) from sagemaker.tensorflow.model import 
TensorFlowModel, TensorFlowPredictor # noqa: F401 from sagemaker.tensorflow.processing import TensorFlowProcessor # noqa: F401 + +from sagemaker.tensorflow.training_compiler.config import TrainingCompilerConfig # noqa: F401 diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index 9e39bd7213..d97e39d313 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -26,6 +26,7 @@ from sagemaker.transformer import Transformer from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT from sagemaker.workflow import is_pipeline_variable +from sagemaker.tensorflow.training_compiler.config import TrainingCompilerConfig logger = logging.getLogger("sagemaker") @@ -45,7 +46,8 @@ def __init__( model_dir=None, image_uri=None, distribution=None, - **kwargs + compiler_config=None, + **kwargs, ): """Initialize a ``TensorFlow`` estimator. @@ -157,6 +159,8 @@ def __init__( To learn more, see `Training with parameter servers `_. + compiler_config (:class:`~sagemaker.tensorflow.TrainingCompilerConfig`): + Configures SageMaker Training Compiler to accelerate training. **kwargs: Additional kwargs passed to the Framework constructor. @@ -202,6 +206,17 @@ def __init__( self.distribution = distribution or {} self._validate_args(py_version=py_version) + if compiler_config is not None: + if not isinstance(compiler_config, TrainingCompilerConfig): + error_string = ( + f"Expected instance of type {TrainingCompilerConfig}" + f"for argument compiler_config. " + f"Instead got {type(compiler_config)}" + ) + raise ValueError(error_string) + if compiler_config: + compiler_config.validate(self) + self.compiler_config = compiler_config def _validate_args(self, py_version): """Placeholder docstring""" @@ -301,7 +316,7 @@ def create_model( entry_point=None, source_dir=None, dependencies=None, - **kwargs + **kwargs, ): """Creates ``TensorFlowModel`` object to be used for creating SageMaker model entities. 
@@ -352,7 +367,7 @@ def create_model( entry_point=entry_point, source_dir=source_dir, dependencies=dependencies, - **kwargs + **kwargs, ) def hyperparameters(self): @@ -369,6 +384,13 @@ def hyperparameters(self): hyperparameters.update( EstimatorBase._json_encode_hyperparameters(additional_hyperparameters) ) + + if self.compiler_config: + training_compiler_hyperparameters = self.compiler_config._to_hyperparameter_dict() + hyperparameters.update( + EstimatorBase._json_encode_hyperparameters(training_compiler_hyperparameters) + ) + return hyperparameters def _default_s3_path(self, directory, mpi=False): diff --git a/src/sagemaker/tensorflow/training_compiler/__init__.py b/src/sagemaker/tensorflow/training_compiler/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/sagemaker/tensorflow/training_compiler/config.py b/src/sagemaker/tensorflow/training_compiler/config.py new file mode 100644 index 0000000000..d14cc3359b --- /dev/null +++ b/src/sagemaker/tensorflow/training_compiler/config.py @@ -0,0 +1,111 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+"""Configuration for the SageMaker Training Compiler.""" +from __future__ import absolute_import +import logging +from packaging.specifiers import SpecifierSet +from packaging.version import Version + +from sagemaker.training_compiler.config import TrainingCompilerConfig as BaseConfig + +logger = logging.getLogger(__name__) + + +class TrainingCompilerConfig(BaseConfig): + """The SageMaker Training Compiler configuration class.""" + + SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4", "g5"] + MIN_SUPPORTED_VERSION = "2.9" + + def __init__( + self, + enabled=True, + debug=False, + ): + """This class initializes a ``TrainingCompilerConfig`` instance. + + `Amazon SageMaker Training Compiler + <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_ + is a feature of SageMaker Training + and speeds up training jobs by optimizing model execution graphs. + + You can compile TensorFlow models + by passing the object of this configuration class to the ``compiler_config`` + parameter of the :class:`~sagemaker.tensorflow.TensorFlow` + estimator. + + Args: + enabled (bool): Optional. Switch to enable SageMaker Training Compiler. + The default is ``True``. + debug (bool): Optional. Whether to dump detailed logs for debugging. + This comes with a potential performance slowdown. + The default is ``False``. + + **Example**: The following code shows the basic usage of the + :class:`sagemaker.tensorflow.TrainingCompilerConfig()` class + to run a TensorFlow training job with the compiler. + + .. code-block:: python + + from sagemaker.tensorflow import TensorFlow, TrainingCompilerConfig + + tensorflow_estimator=TensorFlow( + ... + compiler_config=TrainingCompilerConfig() + ) + + .. seealso:: + + For more information about how to enable SageMaker Training Compiler + for various training settings such as using TensorFlow-based models, + PyTorch-based models, and distributed training, + see `Enable SageMaker Training Compiler + <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler-enable.html>`_ + in the `Amazon SageMaker Training Compiler developer guide + <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_.
+ + """ + + super(TrainingCompilerConfig, self).__init__(enabled=enabled, debug=debug) + + @classmethod + def validate( + cls, + estimator, + ): + """Checks if SageMaker Training Compiler is configured correctly. + + Args: + estimator (:class:`~sagemaker.tensorflow.TensorFlow`): An estimator object. + If SageMaker Training Compiler is enabled, it will validate whether + the estimator is configured to be compatible with Training Compiler. + + Raises: + ValueError: Raised if the requested configuration is not compatible + with SageMaker Training Compiler. + """ + + super(TrainingCompilerConfig, cls).validate(estimator) + + if estimator.framework_version: + if Version(estimator.framework_version) in SpecifierSet( + f"< {cls.MIN_SUPPORTED_VERSION}" + ): + error_helper_string = ( + "SageMaker Training Compiler only supports TensorFlow version " + ">= {} but received {}" + ) + error_helper_string = error_helper_string.format( + cls.MIN_SUPPORTED_VERSION, estimator.framework_version + ) + raise ValueError(error_helper_string) diff --git a/src/sagemaker/training_compiler/config.py b/src/sagemaker/training_compiler/config.py index c45fa4cdaf..cd1fbd5957 100644 --- a/src/sagemaker/training_compiler/config.py +++ b/src/sagemaker/training_compiler/config.py @@ -118,36 +118,25 @@ def _to_hyperparameter_dict(self): @classmethod def validate( cls, - image_uri, - instance_type, - distribution, + estimator, ): """Checks if SageMaker Training Compiler is configured correctly. Args: - image_uri (str): A string of a Docker image URI that's specified - to :class:`~sagemaker.huggingface.HuggingFace`. - If SageMaker Training Compiler is enabled, the HuggingFace estimator - automatically chooses the right image URI. You cannot specify and override - the image URI. - instance_type (str): A string of the training instance type that's specified - to :class:`~sagemaker.huggingface.HuggingFace`.
- The `validate` classmethod raises error - if an instance type not in the ``SUPPORTED_INSTANCE_CLASS_PREFIXES`` list - or ``local`` is passed to the `instance_type` parameter. - distribution (dict): A dictionary of the distributed training option that's specified - to :class:`~sagemaker.huggingface.HuggingFace`. - SageMaker's distributed data parallel and model parallel libraries - are currently not compatible - with SageMaker Training Compiler. + estimator (object): An estimator object. + When SageMaker Training Compiler is enabled, it validates whether + the estimator is configured to be compatible with Training Compiler. + Raises: ValueError: Raised if the requested configuration is not compatible with SageMaker Training Compiler. """ - if "local" not in instance_type: - requested_instance_class = instance_type.split(".")[1] # Expecting ml.class.size + if "local" not in estimator.instance_type: + requested_instance_class = estimator.instance_type.split(".")[ + 1 + ] # Expecting ml.class.size if not any( [ requested_instance_class.startswith(i) @@ -161,25 +150,33 @@ def validate( requested_instance_class, cls.SUPPORTED_INSTANCE_CLASS_PREFIXES ) raise ValueError(error_helper_string) - elif instance_type == "local": + elif estimator.instance_type == "local": error_helper_string = ( "The local mode is not supported by SageMaker Training Compiler." - "It only supports the following GPU instances: p3, g4dn, and p4." - ) - raise ValueError(error_helper_string) - - if image_uri: - error_helper_string = ( - "Overriding the image URI is currently not supported " - "for SageMaker Training Compiler." - "Specify the following parameters to run the Hugging Face training job " - "with SageMaker Training Compiler enabled: " - "transformer_version, tensorflow_version or pytorch_version, and compiler_config."
+ "It only supports the following GPU instances: {}" ) + error_helper_string = error_helper_string.format(cls.SUPPORTED_INSTANCE_CLASS_PREFIXES) raise ValueError(error_helper_string) - if distribution and "smdistributed" in distribution: + if estimator.distribution and "smdistributed" in estimator.distribution: raise ValueError( "SageMaker distributed training configuration is currently not compatible with " "SageMaker Training Compiler." ) + + if estimator.debugger_hook_config or (not estimator.disable_profiler): + helper_string = ( + "Using Debugger and/or Profiler with SageMaker Training Compiler " + "might add recompilation overhead and degrade" + "performance. Found debugger_hook_config={} " + "disable_profiler={}. Please set " + "debugger_hook_config=None and disable_profiler=True for optimal " + "performance. For more information, see Training Compiler " + "Performance Considerations " + "(https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler-tips-pitfalls.html" + "#training-compiler-tips-pitfalls-considerations)." 
+ ) + helper_string = helper_string.format( + estimator.debugger_hook_config, estimator.disable_profiler + ) + logger.warning(helper_string) diff --git a/tests/integ/test_training_compiler.py b/tests/integ/test_training_compiler.py index 0d9653ea93..92d4dbbffb 100644 --- a/tests/integ/test_training_compiler.py +++ b/tests/integ/test_training_compiler.py @@ -13,10 +13,14 @@ from __future__ import absolute_import import os - +from packaging import version import pytest -from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig +from sagemaker.huggingface import HuggingFace +from sagemaker.huggingface import TrainingCompilerConfig as HFTrainingCompilerConfig +from sagemaker.tensorflow import TensorFlow +from sagemaker.tensorflow import TrainingCompilerConfig as TFTrainingCompilerConfig + from tests import integ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES from tests.integ.timeout import timeout @@ -27,15 +31,15 @@ def gpu_instance_type(request): return "ml.p3.2xlarge" +@pytest.fixture(scope="module", autouse=True) +def skip_if_incompatible(request): + if integ.test_region() not in integ.TRAINING_COMPILER_SUPPORTED_REGIONS: + pytest.skip("SageMaker Training Compiler is not supported in this region") + if integ.test_region() in integ.TRAINING_NO_P3_REGIONS: + pytest.skip("no ml.p3 instances in this region") + + @pytest.mark.release -@pytest.mark.skipif( - integ.test_region() not in integ.TRAINING_COMPILER_SUPPORTED_REGIONS, - reason="SageMaker Training Compiler is not supported in this region", -) -@pytest.mark.skipif( - integ.test_region() in integ.TRAINING_NO_P3_REGIONS, - reason="no ml.p3 instances in this region", -) def test_huggingface_pytorch( sagemaker_session, gpu_instance_type, @@ -66,7 +70,7 @@ def test_huggingface_pytorch( environment={"GPU_NUM_DEVICES": "1"}, sagemaker_session=sagemaker_session, disable_profiler=True, - compiler_config=TrainingCompilerConfig(), + compiler_config=HFTrainingCompilerConfig(), ) train_input = 
hf.sagemaker_session.upload_data( @@ -78,14 +82,6 @@ def test_huggingface_pytorch( @pytest.mark.release -@pytest.mark.skipif( - integ.test_region() not in integ.TRAINING_COMPILER_SUPPORTED_REGIONS, - reason="SageMaker Training Compiler is not supported in this region", -) -@pytest.mark.skipif( - integ.test_region() in integ.TRAINING_NO_P3_REGIONS, - reason="no ml.p3 instances in this region", -) def test_huggingface_tensorflow( sagemaker_session, gpu_instance_type, @@ -113,7 +109,7 @@ def test_huggingface_tensorflow( }, sagemaker_session=sagemaker_session, disable_profiler=True, - compiler_config=TrainingCompilerConfig(), + compiler_config=HFTrainingCompilerConfig(), ) train_input = hf.sagemaker_session.upload_data( @@ -121,3 +117,63 @@ def test_huggingface_tensorflow( ) hf.fit(train_input) + + +@pytest.mark.release +def test_tensorflow( + sagemaker_session, + gpu_instance_type, + tensorflow_training_latest_version, +): + if version.parse(tensorflow_training_latest_version) < version.parse("2.9"): + pytest.skip("Training Compiler only supports TF >= 2.9") + with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): + epochs = 10 + batch = 256 + train_steps = int(10240 * epochs / batch) + steps_per_loop = train_steps // 10 + overrides = ( + f"runtime.enable_xla=True," + f"runtime.num_gpus=1," + f"runtime.distribution_strategy=one_device," + f"runtime.mixed_precision_dtype=float16," + f"task.train_data.global_batch_size={batch}," + f"task.train_data.input_path=/opt/ml/input/data/training/validation*," + f"task.train_data.cache=False," + f"trainer.train_steps={train_steps}," + f"trainer.steps_per_loop={steps_per_loop}," + f"trainer.summary_interval={steps_per_loop}," + f"trainer.checkpoint_interval={train_steps}," + f"task.model.backbone.type=resnet," + f"task.model.backbone.resnet.model_id=50" + ) + tf = TensorFlow( + py_version="py39", + git_config={ + "repo": "https://github.com/tensorflow/models.git", + "branch": "v2.9.2", + }, + source_dir=".", + 
entry_point="official/vision/train.py", + model_dir=False, + role="SageMakerRole", + framework_version=tensorflow_training_latest_version, + instance_count=1, + instance_type=gpu_instance_type, + hyperparameters={ + "experiment": "resnet_imagenet", + "config_file": "official/vision/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml", + "mode": "train", + "model_dir": "/opt/ml/model", + "params_override": overrides, + }, + sagemaker_session=sagemaker_session, + disable_profiler=True, + compiler_config=TFTrainingCompilerConfig(), + ) + + tf.fit( + inputs="s3://collection-of-ml-datasets/Imagenet/TFRecords/validation", + logs=True, + wait=True, + ) diff --git a/tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py b/tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py new file mode 100644 index 0000000000..4618786483 --- /dev/null +++ b/tests/unit/sagemaker/training_compiler/test_tensorflow_compiler.py @@ -0,0 +1,511 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import +import logging + +import json +import os +from packaging import version + +import pytest +from mock import MagicMock, Mock, patch + +from sagemaker import image_uris +from sagemaker.tensorflow import TensorFlow, TrainingCompilerConfig + +from tests.unit.sagemaker.training_compiler import EC2_GPU_INSTANCE_CLASSES + + +DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data") +SCRIPT_PATH = os.path.join(DATA_DIR, "dummy_script.py") +SERVING_SCRIPT_FILE = "another_dummy_script.py" +MODEL_DATA = "s3://some/data.tar.gz" +ENV = {"DUMMY_ENV_VAR": "dummy_value"} +TIMESTAMP = "2017-11-06-14:14:15.672" +TIME = 1510006209.073025 +BUCKET_NAME = "mybucket" +INSTANCE_COUNT = 1 +INSTANCE_TYPE = "ml.p3.2xlarge" +IMAGE_URI = "tensorflow" +JOB_NAME = "{}-{}".format(IMAGE_URI, TIMESTAMP) +ROLE = "Dummy" +REGION = "us-east-1" +GPU = "ml.p3.2xlarge" +SUPPORTED_GPU_INSTANCE_CLASSES = {"p3", "p3dn", "g4dn", "p4dn", "g5"} +UNSUPPORTED_GPU_INSTANCE_CLASSES = EC2_GPU_INSTANCE_CLASSES - SUPPORTED_GPU_INSTANCE_CLASSES + +LIST_TAGS_RESULT = {"Tags": [{"Key": "TagtestKey", "Value": "TagtestValue"}]} + +EXPERIMENT_CONFIG = { + "ExperimentName": "exp", + "TrialName": "trial", + "TrialComponentDisplayName": "tc", +} + + +@pytest.fixture(scope="module", autouse=True) +def skip_if_incompatible(tensorflow_training_version, request): + if version.parse(tensorflow_training_version) < version.parse("2.9"): + pytest.skip("Training Compiler only supports TF >= 2.9") + + +@pytest.fixture(scope="module") +def cpu_instance_type(): + return "ml.m5.xlarge" + + +@pytest.fixture(name="sagemaker_session", scope="function") +def fixture_sagemaker_session(): + boto_mock = Mock(name="boto_session", region_name=REGION) + session = Mock( + name="sagemaker_session", + boto_session=boto_mock, + boto_region_name=REGION, + config=None, + local_mode=False, + s3_resource=None, + s3_client=None, + ) + + describe = {"ModelArtifacts": {"S3ModelArtifacts": "s3://m/m.tar.gz"}} + 
session.sagemaker_client.describe_training_job = Mock(return_value=describe) + session.sagemaker_client.list_tags = Mock(return_value=LIST_TAGS_RESULT) + session.default_bucket = Mock(name="default_bucket", return_value=BUCKET_NAME) + session.expand_role = Mock(name="expand_role", return_value=ROLE) + return session + + +def _get_full_gpu_image_uri(framework_version, instance_type, training_compiler_config, py_version): + return image_uris.retrieve( + "tensorflow", + REGION, + version=framework_version, + py_version=py_version, + instance_type=instance_type, + image_scope="training", + container_version=None, + training_compiler_config=training_compiler_config, + ) + + +def _create_train_job(framework_version, instance_type, training_compiler_config, py_version): + return { + "image_uri": _get_full_gpu_image_uri( + framework_version, instance_type, training_compiler_config, py_version + ), + "input_mode": "File", + "input_config": [ + { + "ChannelName": "training", + "DataSource": { + "S3DataSource": { + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + } + }, + } + ], + "role": ROLE, + "job_name": JOB_NAME, + "output_config": {"S3OutputPath": "s3://{}/".format(BUCKET_NAME)}, + "resource_config": { + "InstanceType": instance_type, + "InstanceCount": 1, + "VolumeSizeInGB": 30, + }, + "hyperparameters": { + "sagemaker_program": json.dumps("dummy_script.py"), + "sagemaker_container_log_level": str(logging.INFO), + "sagemaker_job_name": json.dumps(JOB_NAME), + "sagemaker_submit_directory": json.dumps( + "s3://{}/{}/source/sourcedir.tar.gz".format(BUCKET_NAME, JOB_NAME) + ), + "sagemaker_region": '"us-east-1"', + "model_dir": json.dumps("s3://{}/{}/model".format(BUCKET_NAME, JOB_NAME)), + }, + "stop_condition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, + "tags": None, + "vpc_config": None, + "metric_definitions": None, + "environment": None, + "retry_strategy": None, + "experiment_config": EXPERIMENT_CONFIG, + "debugger_hook_config": { + 
"CollectionConfigurations": [], + "S3OutputPath": "s3://{}/".format(BUCKET_NAME), + }, + "profiler_rule_configs": [ + { + "RuleConfigurationName": "ProfilerReport-1510006209", + "RuleEvaluatorImage": "503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest", + "RuleParameters": {"rule_to_invoke": "ProfilerReport"}, + } + ], + "profiler_config": { + "S3OutputPath": "s3://{}/".format(BUCKET_NAME), + }, + } + + +class TestUnsupportedConfig: + def test_cpu_instance( + self, + cpu_instance_type, + tensorflow_training_version, + tensorflow_training_py_version, + ): + with pytest.raises(ValueError): + TensorFlow( + py_version=tensorflow_training_py_version, + entry_point=SCRIPT_PATH, + role=ROLE, + instance_count=INSTANCE_COUNT, + instance_type=cpu_instance_type, + framework_version=tensorflow_training_version, + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(), + ).fit() + + @pytest.mark.parametrize("unsupported_gpu_instance_class", UNSUPPORTED_GPU_INSTANCE_CLASSES) + def test_gpu_instance( + self, + unsupported_gpu_instance_class, + tensorflow_training_version, + tensorflow_training_py_version, + ): + with pytest.raises(ValueError): + TensorFlow( + py_version=tensorflow_training_py_version, + entry_point=SCRIPT_PATH, + role=ROLE, + instance_count=INSTANCE_COUNT, + instance_type=f"ml.{unsupported_gpu_instance_class}.2xlarge", + framework_version=tensorflow_training_version, + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(), + ).fit() + + def test_framework_version( + self, + tensorflow_training_py_version, + ): + with pytest.raises(ValueError): + TensorFlow( + py_version=tensorflow_training_py_version, + entry_point=SCRIPT_PATH, + role=ROLE, + instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + framework_version="2.8", + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(), + ).fit() + + def test_python_2( + self, + tensorflow_training_version, + ): + with 
pytest.raises(ValueError): + TensorFlow( + py_version="py27", + entry_point=SCRIPT_PATH, + role=ROLE, + instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + framework_version=tensorflow_training_version, + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(), + ).fit() + + +@patch("sagemaker.utils.repack_model", MagicMock()) +@patch("sagemaker.utils.create_tar_file", MagicMock()) +@patch("sagemaker.estimator.name_from_base", return_value=JOB_NAME) +@patch("time.time", return_value=TIME) +class TestTrainingCompilerConfig: + @pytest.mark.parametrize("instance_class", SUPPORTED_GPU_INSTANCE_CLASSES) + def test_default( + self, + time, + name_from_base, + sagemaker_session, + tensorflow_training_version, + tensorflow_training_py_version, + instance_class, + ): + compiler_config = TrainingCompilerConfig() + instance_type = f"ml.{instance_class}.2xlarge" + + tf = TensorFlow( + py_version=tensorflow_training_py_version, + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_count=INSTANCE_COUNT, + instance_type=instance_type, + framework_version=tensorflow_training_version, + enable_sagemaker_metrics=False, + compiler_config=compiler_config, + ) + + inputs = "s3://mybucket/train" + + tf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG) + + sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls] + assert sagemaker_call_names == ["train", "logs_for_job"] + boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls] + assert boto_call_names == ["resource"] + + expected_train_args = _create_train_job( + tensorflow_training_version, + instance_type, + compiler_config, + tensorflow_training_py_version, + ) + expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs + expected_train_args["enable_sagemaker_metrics"] = False + expected_train_args["hyperparameters"][ + TrainingCompilerConfig.HP_ENABLE_COMPILER + ] = json.dumps(True) + 
expected_train_args["hyperparameters"][TrainingCompilerConfig.HP_ENABLE_DEBUG] = json.dumps( + False + ) + + actual_train_args = sagemaker_session.method_calls[0][2] + assert ( + actual_train_args == expected_train_args + ), f"{json.dumps(actual_train_args, indent=2)} != {json.dumps(expected_train_args, indent=2)}" + + @pytest.mark.parametrize("instance_class", SUPPORTED_GPU_INSTANCE_CLASSES) + def test_byoc( + self, + time, + name_from_base, + sagemaker_session, + tensorflow_training_version, + tensorflow_training_py_version, + instance_class, + ): + compiler_config = TrainingCompilerConfig() + instance_type = f"ml.{instance_class}.2xlarge" + + tf = TensorFlow( + py_version=tensorflow_training_py_version, + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_count=INSTANCE_COUNT, + instance_type=instance_type, + image_uri=_get_full_gpu_image_uri( + tensorflow_training_version, + instance_type, + compiler_config, + tensorflow_training_py_version, + ), + enable_sagemaker_metrics=False, + compiler_config=compiler_config, + ) + + inputs = "s3://mybucket/train" + + tf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG) + + sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls] + assert sagemaker_call_names == ["train", "logs_for_job"] + boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls] + assert boto_call_names == ["resource"] + + expected_train_args = _create_train_job( + tensorflow_training_version, + instance_type, + compiler_config, + tensorflow_training_py_version, + ) + expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs + expected_train_args["enable_sagemaker_metrics"] = False + expected_train_args["hyperparameters"][ + TrainingCompilerConfig.HP_ENABLE_COMPILER + ] = json.dumps(True) + expected_train_args["hyperparameters"][TrainingCompilerConfig.HP_ENABLE_DEBUG] = json.dumps( + False + ) + + actual_train_args = 
sagemaker_session.method_calls[0][2] + assert ( + actual_train_args == expected_train_args + ), f"{json.dumps(actual_train_args, indent=2)} != {json.dumps(expected_train_args, indent=2)}" + + def test_debug_compiler_config( + self, + time, + name_from_base, + sagemaker_session, + tensorflow_training_version, + tensorflow_training_py_version, + ): + compiler_config = TrainingCompilerConfig(debug=True) + + tf = TensorFlow( + py_version=tensorflow_training_py_version, + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + framework_version=tensorflow_training_version, + enable_sagemaker_metrics=False, + compiler_config=compiler_config, + ) + + inputs = "s3://mybucket/train" + + tf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG) + + sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls] + assert sagemaker_call_names == ["train", "logs_for_job"] + boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls] + assert boto_call_names == ["resource"] + + expected_train_args = _create_train_job( + tensorflow_training_version, + INSTANCE_TYPE, + compiler_config, + tensorflow_training_py_version, + ) + expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs + expected_train_args["enable_sagemaker_metrics"] = False + expected_train_args["hyperparameters"][ + TrainingCompilerConfig.HP_ENABLE_COMPILER + ] = json.dumps(True) + expected_train_args["hyperparameters"][TrainingCompilerConfig.HP_ENABLE_DEBUG] = json.dumps( + True + ) + + actual_train_args = sagemaker_session.method_calls[0][2] + assert ( + actual_train_args == expected_train_args + ), f"{json.dumps(actual_train_args, indent=2)} != {json.dumps(expected_train_args, indent=2)}" + + def test_disable_compiler_config( + self, + time, + name_from_base, + sagemaker_session, + tensorflow_training_version, + tensorflow_training_py_version, + ): + compiler_config = 
TrainingCompilerConfig(enabled=False) + + tf = TensorFlow( + py_version=tensorflow_training_py_version, + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + framework_version=tensorflow_training_version, + enable_sagemaker_metrics=False, + compiler_config=TrainingCompilerConfig(enabled=False), + ) + + inputs = "s3://mybucket/train" + + tf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG) + + sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls] + assert sagemaker_call_names == ["train", "logs_for_job"] + boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls] + assert boto_call_names == ["resource"] + + expected_train_args = _create_train_job( + tensorflow_training_version, + INSTANCE_TYPE, + compiler_config, + tensorflow_training_py_version, + ) + expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs + expected_train_args["enable_sagemaker_metrics"] = False + expected_train_args["hyperparameters"][ + TrainingCompilerConfig.HP_ENABLE_COMPILER + ] = json.dumps(False) + expected_train_args["hyperparameters"][TrainingCompilerConfig.HP_ENABLE_DEBUG] = json.dumps( + False + ) + + actual_train_args = sagemaker_session.method_calls[0][2] + assert ( + actual_train_args == expected_train_args + ), f"{json.dumps(actual_train_args, indent=2)} != {json.dumps(expected_train_args, indent=2)}" + + +@pytest.mark.parametrize( + ["compiler_enabled", "debug_enabled"], [(True, False), (True, True), (False, False)] +) +def test_attach(sagemaker_session, compiler_enabled, debug_enabled, tensorflow_training_py_version): + training_image = ( + f"1.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:" + f"2.9.1-" + f"gpu-" + f"{tensorflow_training_py_version}-cu112-ubuntu20.04" + ) + returned_job_description = { + "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, + "HyperParameters": 
{ + "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', + "sagemaker_program": '"iris-dnn-classifier.py"', + "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"', + "sagemaker_container_log_level": '"logging.INFO"', + "sagemaker_job_name": '"trcomp"', + "training_steps": "100", + "sagemaker_region": '"us-east-1"', + TrainingCompilerConfig.HP_ENABLE_COMPILER: json.dumps(compiler_enabled), + TrainingCompilerConfig.HP_ENABLE_DEBUG: json.dumps(debug_enabled), + }, + "RoleArn": "arn:aws:iam::366:role/SageMakerRole", + "ResourceConfig": { + "VolumeSizeInGB": 30, + "InstanceCount": 1, + "InstanceType": "ml.p3.2xlarge", + }, + "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, + "TrainingJobName": "trcomp", + "TrainingJobStatus": "Completed", + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/trcomp", + "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/trcomp"}, + "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, + } + sagemaker_session.sagemaker_client.describe_training_job = Mock( + name="describe_training_job", return_value=returned_job_description + ) + + estimator = TensorFlow.attach(training_job_name="trcomp", sagemaker_session=sagemaker_session) + assert estimator.latest_training_job.job_name == "trcomp" + assert estimator.py_version == tensorflow_training_py_version + assert estimator.framework_version == "2.9.1" + assert estimator.role == "arn:aws:iam::366:role/SageMakerRole" + assert estimator.instance_count == 1 + assert estimator.max_run == 24 * 60 * 60 + assert estimator.input_mode == "File" + assert estimator.base_job_name == "trcomp" + assert estimator.output_path == "s3://place/output/trcomp" + assert estimator.output_kms_key == "" + assert estimator.hyperparameters()["training_steps"] == "100" + assert estimator.hyperparameters()[TrainingCompilerConfig.HP_ENABLE_COMPILER] == json.dumps( + compiler_enabled + ) + assert 
estimator.hyperparameters()[TrainingCompilerConfig.HP_ENABLE_DEBUG] == json.dumps( + debug_enabled + ) + assert estimator.source_dir == "s3://some/sourcedir.tar.gz" + assert estimator.entry_point == "iris-dnn-classifier.py"